import numpy as np
from numpy.testing import assert_array_equal, assert_raises
from sklearn.preprocessing import normalize


def test_normalize():
    """Test normalize function"""
    # Only tests functionality not used by the tests for Normalizer.
    X = np.random.RandomState(37).randn(3, 2)
    # Row-wise normalization of X equals column-wise normalization of X.T.
    assert_array_equal(normalize(X, copy=False),
                       normalize(X.T, axis=0, copy=False).T)
    # An invalid axis and an unknown norm are both rejected.
    assert_raises(ValueError, normalize, [[0]], axis=2)
    assert_raises(ValueError, normalize, [[0]], norm='l3')
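# A minimal sketch (not part of the test suite above) of the axis semantics
# the test asserts: sklearn.preprocessing.normalize scales rows to unit norm
# by default (axis=1, norm='l2'), and axis=0 scales columns instead.
import numpy as np
from sklearn.preprocessing import normalize

X = np.array([[3.0, 4.0],
              [1.0, 0.0]])
print(normalize(X))              # rows have unit L2 norm: [[0.6, 0.8], [1.0, 0.0]]
print(normalize(X.T, axis=0).T)  # column-normalizing the transpose is equivalent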
def test_normalize(): """Test normalize function""" # Only tests functionality not used by the tests for Normalizer. X = np.random.RandomState(37).randn(3, 2) assert_array_equal(normalize(X, copy=False), normalize(X.T, axis=0, copy=False).T) assert_raises(ValueError, normalize, [[0]], axis=2) assert_raises(ValueError, normalize, [[0]], norm='l3')
def build_inputs(files_list, accel_labels, file_label_dict):
    X_seq = []
    y_seq = []
    labels = []
    # Reuse cached features if they were pickled on a previous run.
    if os.path.isfile(rootFolder + "experim.file"):
        with open(rootFolder + "experim.file", "rb") as f:
            dump = pickle.load(f)
        return dump[0], dump[1], dump[2]
    else:
        for path in files_list:
            raw_data, target, target_label = get_row_data(path, accel_labels,
                                                          file_label_dict)
            raw_data, indx = get_features(raw_data, path)
            # Scale each feature column by its maximum absolute value.
            tmp = pd.DataFrame(normalize(raw_data, axis=0, norm='max'))
            tmp.columns = raw_data.columns
            tmp = tmp[['mean', 'skew', 'standard deviation']]
            processedFeatures = np.array(tmp)
            for inputs in range(len(processedFeatures)):
                X_seq.append(processedFeatures[inputs])
                y_seq.append(list(target))
                labels.append(target_label)
        X_ = pd.DataFrame(X_seq)
        y_ = pd.DataFrame(y_seq)
        labels = pd.DataFrame(labels)
        with open(rootFolder + "experim.file", "wb") as f:
            pickle.dump([X_, y_, labels], f, pickle.HIGHEST_PROTOCOL)
        return X_, y_, labels
def build_inputs(files_list, accel_labels, file_label_dict):
    X_seq = []
    y_seq = []
    labels = []
    # Reuse cached features if they were pickled on a previous run.
    if os.path.isfile(rootFolder + "experim.file"):
        with open(rootFolder + "experim.file", "rb") as f:
            dump = pickle.load(f)
        return dump[0], dump[1], dump[2]
    else:
        for path in files_list:
            raw_data, target, target_label = get_row_data(
                path, accel_labels, file_label_dict)
            raw_data, indx = get_features(raw_data, path)
            tmp = pd.DataFrame(normalize(raw_data, axis=0, norm='max'))
            tmp.columns = raw_data.columns
            # Optionally dump the normalized features for inspection:
            # tmp.to_csv(path + "Normalized.csv")
            tmp = tmp[['mean', 'skew', 'standard deviation']]
            processedFeatures = vectorize(tmp)
            for inputs in range(len(processedFeatures)):
                X_seq.append(np.array(processedFeatures[inputs]))
                y_seq.append(list(target))
                labels.append(target_label)
        X_ = np.array(X_seq)
        y_ = np.array(y_seq)
        labels = np.array(labels)
        with open(rootFolder + "experim.file", "wb") as f:
            pickle.dump([X_, y_, labels], f, pickle.HIGHEST_PROTOCOL)
        return X_, y_, labels
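# A minimal sketch (separate from the two build_inputs variants above) of
# what normalize(..., axis=0, norm='max') does: each column is divided by
# its maximum absolute value, so every feature lands in [-1, 1].
import numpy as np
from sklearn.preprocessing import normalize

features = np.array([[1.0, -2.0],
                     [4.0,  8.0]])
print(normalize(features, axis=0, norm='max'))
# [[ 0.25 -0.25]
#  [ 1.    1.  ]]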
def query_tf_idf(self, query, do_idf=True, force=False, smooth=False,
                 proba=False, base=10, norm=None, type_tf=1):
    '''
    Converts a query to tf.idf values.
    do_idf: if False, convert to tf only
    type_idf:
        IDF: default (inverted frequency)
        SIDF: smooth inverted frequency
        PIDF: probabilistic inverted frequency
    '''
    N = len(self.documents)
    M = len(self.index)
    matrix = np.zeros((1, M))
    if len(query.query_vector) == 0:
        raise ValueError("Query has not been preprocessed")
    if len(query.query_score) == 0:
        terms = nltk.FreqDist(query.query_vector)
        do_idf = True
        type_tf = 1
    else:
        terms = dict(zip(query.query_vector, query.query_score))
    print(dict(terms))  # debug output
    for term, tf in terms.items():
        j = self.get_feature_id(term)
        if j:
            if do_idf:
                ni = len(self.index[term])
                tf = self.tf(tf, base=base, type=type_tf)
                idf = self.idf(N, ni, unary=not do_idf, smooth=smooth,
                               proba=proba, base=base)
                matrix[0][j] = tf * idf
            else:
                matrix[0][j] = tf
    if norm:
        matrix = normalize(matrix, norm=norm, copy=False)
    matrix = csr_matrix(matrix)
    return matrix
def tf_idf(self, do_idf=True, force=False, smooth=False, proba=False,
           base=10, norm='l2', type_tf=1):
    '''
    Converts the index to tf.idf values.
    do_idf: if False, convert to tf only
    type_idf:
        IDF: default (inverted frequency)
        SIDF: smooth inverted frequency
        PIDF: probabilistic inverted frequency
    '''
    if self.matrix is None or force:
        N = len(self.documents)
        M = len(self.index)
        matrix = np.zeros((N, M))
        for j, term in self.feature_names.items():
            docs = self.index[term]
            ni = len(docs)
            for doc in docs:
                i = doc.uid
                tf = self.tf(doc.tf, base=base, type=type_tf)
                idf = self.idf(N, ni, unary=not do_idf, smooth=smooth,
                               proba=proba, base=base)
                matrix[i][j] = tf * idf
        # L2-normalize each document row so dot products give cosine scores.
        if norm and len(matrix) > 0:
            matrix = normalize(matrix, norm=norm, copy=False)
        self.matrix = csr_matrix(matrix)
    return self.matrix
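# A minimal standalone sketch of the same pipeline, assuming a dense
# document-term count matrix `counts` (the tf/idf variants above are richer):
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.preprocessing import normalize

counts = np.array([[2, 0, 1],
                   [0, 3, 1]], dtype=float)
tf = np.log10(1 + counts)             # log-scaled term frequency
df = (counts > 0).sum(axis=0)         # document frequency of each term
idf = np.log10(counts.shape[0] / df)  # inverse document frequency
tfidf = csr_matrix(normalize(tf * idf, norm='l2'))
print((tfidf @ tfidf.T).toarray())    # pairwise cosine similarities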
trainNewsItems = SenetimentNewsItemCollection("data/train-json.txt")
trainingData = numpy.asarray([doc2vecModel.infer_vector(newsItem.getWords())
                              for newsItem in trainNewsItems.get_news_items()])
trainingLabels = [newsItem.getSentiment()
                  for newsItem in trainNewsItems.get_news_items()]

multilabelbinarizer = MultiLabelBinarizer()
multilabelbinarizer.fit([['positive', 'negative']])

testData = numpy.asarray([doc2vecModel.infer_vector(newsItem.getWords())
                          for newsItem in testNewsItems.get_news_items()])
testLabels = [newsItem.getSentiment()
              for newsItem in testNewsItems.get_news_items()]

# L2-normalize the inferred doc2vec vectors before fitting the linear model.
trainingData = normalize(trainingData)
testData = normalize(testData)

lr = SGDClassifier(loss='log', penalty='l1')
lr.fit(trainingData, trainingLabels)
lr.predict(testData)

with open('model/sentiment-classifier', 'wb') as fid:
    pickle.dump(lr, fid)

print('Test Accuracy: %.2f' % lr.score(testData, testLabels))
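# A minimal sketch with synthetic vectors standing in for the doc2vec output
# above (names and shapes are illustrative, not from the original project):
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import normalize

rng = np.random.RandomState(0)
vectors = rng.randn(20, 8)                         # stand-in for infer_vector()
labels = rng.choice(['positive', 'negative'], 20)  # stand-in for getSentiment()

clf = SGDClassifier(loss='log_loss', penalty='l1')  # spelled 'log' in older scikit-learn
clf.fit(normalize(vectors), labels)  # unit-length rows keep SGD step sizes comparable
print(clf.score(normalize(vectors), labels))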
csrtrain = parsetrain2sparse(train, dictlist)
print('parsetrain2sparse finish %s' % datetime.now())
csrtest = parsetest2sparse(test, dictlist)
print('parsetest2sparse finish %s' % datetime.now())
dump('csrtrain', csrtrain)
dump('csrtest', csrtest)

''' ------------------------------ sparse checkpoint ----------------------------------- '''
# Use the code here instead of the code above to load the sparse matrices directly.
# with open('csrtrain_pickle', 'rb') as f:  # float16
#     csrtrain = pickle.load(f)
# with open('csrtest_pickle', 'rb') as f:
#     csrtest = pickle.load(f)
# csrtrain = csrtrain.astype(np.float64)
''' ------------------------------ sparse checkpoint ----------------------------------- '''

# Column-wise L2 normalization, so the dot product below yields cosine scores.
csctrainnor = normalize(csrtrain, norm='l2', axis=0)
csrtrain = csctrainnor.tocsr()
# csrtrain = csrtrain.astype(np.float32)
# csrtest = csrtest.astype(np.float32)
csrcosine = csrtest.dot(csrtrain)  # sparse matrix multiplication
print('dot finish %s' % datetime.now())
# sim_mat = [list(row) for row in csrcosine.toarray()]  # uses far too much memory; froze the machine
# sim_mat = similar_mat(test.userlist, train.userlist)  # compute the similarity matrix
# with open(os.path.join(BasePath, 'temp/sim_mat'), 'wb') as file:  # persist the similarity matrix; it is expensive to recompute
#     pickle.dump(sim_mat, file)
print('start knn')
knn_mat = similar2knn(csrcosine, 50)  # pick the best-matching users from the similarity matrix
step4 = datetime.now()
print(step4)
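# A minimal sketch of why L2 normalization turns a sparse dot product into
# cosine similarity (the code above normalizes axis=0 because its users sit
# on the columns of csrtrain; this row-oriented version shows the same idea):
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.preprocessing import normalize

a = csr_matrix(np.array([[1.0, 2.0, 0.0],
                         [0.0, 1.0, 1.0]]))
b = csr_matrix(np.array([[2.0, 4.0, 0.0]]))
cos = normalize(b, norm='l2').dot(normalize(a, norm='l2').T)
print(cos.toarray())  # [[1.0, ~0.63]]: cosine of b against each row of a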
def fit(self, k=250, shrink=100, alpha=None, beta=None, gamma=None, omega=None):
    self.k = k
    self.shrink = shrink

    # Check the parameters for the tuning scenario
    if alpha is not None:
        self.alpha = alpha
    if beta is not None:
        self.beta = beta
    if gamma is not None:
        self.gamma = gamma
    if omega is not None:
        self.omega = omega

    print("Sequential Random Hybrid Recommender mark 2: Model fitting begins")

    # Calculate all the similarity matrices one by one

    # URM tfidf --> 50446 x 50446
    self.sim_URM_tfidf = Similarity_old(self.URM_train_tfidf.T, shrink=0,
                                        verbose=self.verbose,
                                        neighbourhood=200,
                                        mode=self.similarity_mode,
                                        normalize=self.normalize)
    # ICM tfidf --> 20635 x 20635
    self.sim_ICM_tfidf = Similarity_old(self.ICM.T, shrink=0,
                                        verbose=self.verbose,
                                        neighbourhood=25,
                                        mode=self.similarity_mode,
                                        normalize=self.normalize)
    # URM.T tfidf --> 20635 x 20635
    self.sim_URM_T_tfidf = Similarity_old(self.URM_train_tfidf, shrink=10,
                                          verbose=self.verbose,
                                          neighbourhood=350,
                                          mode=self.similarity_mode,
                                          normalize=self.normalize)
    # Slim --> 20635 x 20635
    self.sim_Slim_item = Slim_BPR_Recommender_Cython(self.URM_train)
    self.sim_Slim_user = Slim_BPR_Recommender_Cython(self.URM_train.T)

    if self.sparse_weights:
        # Row-normalize every similarity matrix so the hybrid weights
        # combine matrices on a comparable scale.
        # URM
        self.W_sparse_URM = normalize(self.sim_URM_tfidf.compute_similarity(),
                                      axis=1, norm="l2")
        # ICM
        self.W_sparse_ICM = normalize(self.sim_ICM_tfidf.compute_similarity(),
                                      axis=1, norm="l2")
        # URM_T
        self.W_sparse_URM_T = normalize(self.sim_URM_T_tfidf.compute_similarity(),
                                        axis=1, norm="l2")
        # Slim
        self.W_sparse_Slim_item = normalize(
            self.sim_Slim_item.fit(lambda_i=0.37142857, lambda_j=0.97857143,
                                   learning_rate=0.001, epochs=30),
            axis=1, norm="l2")
        # Slim_T
        self.W_sparse_Slim_user = normalize(
            self.sim_Slim_user.fit(lambda_i=1, lambda_j=1,
                                   learning_rate=0.001, epochs=30),
            axis=1, norm="l2")

    # Record the parameters for logging
    self.parameters = ("sparse_weights= {0}, verbose= {1}, similarity= {2}, "
                       "shrink= {3}, neighbourhood={4}, normalize= {5}, "
                       "alpha= {6}, beta={7}, gamma={8}, omega={9}").format(
        self.sparse_weights, self.verbose, self.similarity_mode, self.shrink,
        self.k, self.normalize, self.alpha, self.beta, self.gamma, self.omega)
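# A minimal sketch of the normalize-then-blend step above, using random
# sparse matrices in place of the computed similarity matrices (W1, W2,
# alpha, beta are illustrative names, not from the recommender):
from scipy.sparse import random as sparse_random
from sklearn.preprocessing import normalize

W1 = sparse_random(5, 5, density=0.4, format='csr', random_state=0)
W2 = sparse_random(5, 5, density=0.4, format='csr', random_state=1)

alpha, beta = 0.6, 0.4
# Row-wise L2 normalization puts both matrices on a comparable scale
# before blending them with scalar weights.
W_hybrid = (alpha * normalize(W1, axis=1, norm='l2')
            + beta * normalize(W2, axis=1, norm='l2'))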
def Normalize(matrix: csr_matrix):
    # sklearn's default norm is 'l2', so this scales each row to unit length.
    return normalize(matrix, axis=1)
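# Quick usage check of the wrapper above (values are illustrative):
from scipy.sparse import csr_matrix

m = csr_matrix([[3.0, 4.0],
                [0.0, 5.0]])
print(Normalize(m).toarray())  # [[0.6, 0.8], [0.0, 1.0]]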
# gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
vals = img.mean(axis=1).flatten()
hist = np.histogram(vals, range(40, 121))
dataset_features[idx, :] = hist[0]
outputs[idx] = class_label
idx += 1
class_label += 1

TRAIN_SIZE = 0.8  # Split the data into training and test parts in an 80/20 ratio

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize

y = outputs
X = dataset_features
X = normalize(X)
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    train_size=TRAIN_SIZE,
                                                    random_state=0,
                                                    shuffle=True)

HIDDEN_NEURONS_NUM = 100  # Number of neurons in the network's hidden layer
MAX_EPOCHS = 100  # Maximum number of iterations of the network's parameter optimizer

np.random.seed(0)
# Convert the data into a ClassificationDataSet structure
# Training part
ds_train = ClassificationDataSet(np.shape(X)[1],
                                 nb_classes=len(np.unique(y_train)))
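# A minimal sketch of why normalize() is safe to apply before the split:
# it scales each sample (row) independently, so no statistics leak from the
# test rows into the training rows, unlike fit-based scalers such as
# StandardScaler.
import numpy as np
from sklearn.preprocessing import normalize

hists = np.array([[10.0, 30.0],
                  [ 2.0,  2.0]])
print(normalize(hists))  # each histogram row scaled to unit length on its own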
topic_sim2 = base_net(sim2_in)
topic_sim3 = base_net(sim3_in)
topic_nonsim1 = base_net(nonsim1_in)

# Euclidean distances between the main topic embedding and each of the
# similar / non-similar document embeddings.
dist1 = Lambda(euclidean_distance,
               output_shape=eucl_dist_output_shape)([topic_main, topic_sim1])
dist2 = Lambda(euclidean_distance,
               output_shape=eucl_dist_output_shape)([topic_main, topic_sim2])
dist3 = Lambda(euclidean_distance,
               output_shape=eucl_dist_output_shape)([topic_main, topic_sim3])
dist_non = Lambda(euclidean_distance,
                  output_shape=eucl_dist_output_shape)([topic_main, topic_nonsim1])

decoder = get_decoder_net(hidden_dim, doc_dim)
reconstruction = decoder(topic_main)

model = Model(inputs=[main_in, sim1_in, sim2_in, sim3_in, nonsim1_in],
              outputs=[reconstruction, dist1, dist2, dist3, dist_non])
model.compile('sgd', loss=['mse'] + [contrastive_loss] * 4)

# Row-normalize the tf-idf matrix and add a small constant offset.
tf_idf = tf_idf.toarray()
tf_idf = normalize(tf_idf, copy=False) + 1e-5

x_train = [tf_idf[main_keys], tf_idf[sim1_data], tf_idf[sim2_data],
           tf_idf[sim3_data], tf_idf[non_sim_data]]
z = np.zeros_like(main_keys)
output = [tf_idf[main_keys], z, z, z, np.ones_like(main_keys)]
model.fit(x_train, output)
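# The snippet assumes euclidean_distance, eucl_dist_output_shape, and
# contrastive_loss helpers. A common Keras-backend definition (as in the
# classic siamese-network example; the margin value is illustrative) that
# matches the 0 = similar / 1 = dissimilar targets used above:
from keras import backend as K

def euclidean_distance(vects):
    x, y = vects
    return K.sqrt(K.maximum(K.sum(K.square(x - y), axis=1, keepdims=True),
                            K.epsilon()))

def eucl_dist_output_shape(shapes):
    shape1, shape2 = shapes
    return (shape1[0], 1)

def contrastive_loss(y_true, y_pred, margin=1.0):
    # Similar pairs (y_true == 0) are pulled together; dissimilar pairs
    # (y_true == 1) are pushed apart up to the margin.
    return K.mean((1 - y_true) * K.square(y_pred) +
                  y_true * K.square(K.maximum(margin - y_pred, 0)))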