def nmf_old(mut_final, mut_diff, mut_mean_qn, mut_median_qn, n_components,
            init='nndsvdar', random_state=0):
    # fit followed by transform, repeated for each input matrix
    # (the original TODO "en boucle" -- French for "in a loop" -- is done here)
    model = NMF(n_components=n_components, init=init, random_state=random_state)
    results = []
    for mat in (mut_final, mut_diff, mut_mean_qn, mut_median_qn):
        model.fit(mat)
        results.append(model.components_.copy())                         # gene components
        results.append(np.argmax(model.transform(mat), axis=1).copy())   # patient strata
    # (gene_comp, patient_strat, gene_comp_diff, patient_strat_diff,
    #  gene_comp_mean_qn, patient_strat_mean_qn,
    #  gene_comp_median_qn, patient_strat_median_qn)
    return tuple(results)
class TopicEmbeddingModel():
    '''
    Wrapper class for different topic models
    '''

    def __init__(self, folder='model', modeltype='kpca', topics=10):
        # the classifier, which also contains the trained BoW transformer
        self.bow = Vectorizer(folder=folder, steps=['hashing', 'tfidf'])
        self.folder = folder
        self.modeltype = modeltype
        self.topics = topics

        # identity checks ('is') on strings are unreliable; compare with ==
        if self.modeltype == 'kpca':
            from sklearn.decomposition import KernelPCA
            self.model = KernelPCA(kernel='rbf', gamma=1., n_components=topics)
        if self.modeltype == 'nmf':
            from sklearn.decomposition import NMF
            self.model = NMF(n_components=topics)

    def fit(self, X):
        '''
        fits a topic model

        INPUT
        X   list of strings
        '''
        # transform list of strings into sparse BoW matrix
        X = self.bow.transform(X)
        # depending on the model, train
        if self.modeltype in ('kpca', 'nmf'):
            self.model.fit(X)

    def predict(self, X):
        '''
        predicts cluster assignment from list of strings

        INPUT
        X   list of strings
        '''
        # the original 'X is not list' was always true for actual lists
        if not isinstance(X, list):
            X = [X]
        X = self.bow.transform(X)
        if self.modeltype in ('kpca', 'nmf'):
            return self.model.transform(X)
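# A minimal usage sketch for the wrapper above, on a hypothetical corpus. It
# assumes the Vectorizer class referenced in __init__ is importable and that
# its transform() accepts raw strings; neither is shown in this collection.
docs = ["the cat sat on the mat", "dogs chase cats", "stocks fell sharply"]
tm = TopicEmbeddingModel(modeltype='nmf', topics=2)
tm.fit(docs)
print(tm.predict("the dog and the cat"))   # topic loadings for a single string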
def test_nmf_transform_custom_init():
    # Smoke test that checks if NMF.transform works with custom initialization
    A = np.abs(random_state.randn(6, 5))
    n_components = 4
    avg = np.sqrt(A.mean() / n_components)
    H_init = np.abs(avg * random_state.randn(n_components, 5))
    W_init = np.abs(avg * random_state.randn(6, n_components))

    m = NMF(solver="cd", n_components=n_components, init="custom",
            random_state=0)
    m.fit_transform(A, W=W_init, H=H_init)
    m.transform(A)
class MatrixFactorization:

    def __init__(self):
        self.nmf = NMF()

    def fit(self, X):
        self.nmf.fit(X)
        u = self.nmf.transform(X)
        return u.dot(self.nmf.components_)

    def predict(self, X):
        u = self.nmf.transform(X)
        return u.dot(self.nmf.components_)
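# A minimal usage sketch for the wrapper above, on hypothetical random data
# (NMF requires non-negative input):
import numpy as np
from sklearn.decomposition import NMF  # also needed by the class above

X = np.abs(np.random.RandomState(0).randn(20, 8))
mf = MatrixFactorization()
X_hat = mf.fit(X)                         # low-rank reconstruction of X
print(np.linalg.norm(X - X_hat))          # Frobenius reconstruction error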
def applyNMF(self, number_of_clusters, country_specific_tweets):
    train, feature_names = self.extractFeatures(country_specific_tweets, False)
    name = "nmf"

    # Fit the NMF model
    if self.results:
        print("Fitting the NMF model", end=" - ")
    t0 = time()
    nmf = NMF(n_components=number_of_clusters, random_state=1,
              alpha=.1, l1_ratio=.5).fit(train)
    if self.results:
        print("done in %0.3fs." % (time() - t0))
        print("\nNMF:")
    parameters = nmf.get_params()
    if self.results:
        print("Parameter: " + str(parameters))

    topics = nmf.components_
    doc_topic = nmf.transform(train)
    top10, labels = self.printTopicCluster(topics, doc_topic, feature_names)
    labels = numpy.asarray(labels)

    if self.results:
        print("Silhouette Coefficient {0}: {1}".format(
            name, metrics.silhouette_score(train, labels)))

    return name, parameters, top10, labels
def tfidf_nmf(release_texts, n_components=10, max_features=None):
    '''
    Creates and fits tfidf and NMF models.

    INPUT:
    - n_components: number of latent features for the NMF model to find
    - max_features: max number of features (vocabulary size) for the tfidf
      model to consider

    OUTPUT:
    - tfidf_vectorizer: tfidf model object
    - tfidf_sparse: tfidf sparse matrix
    - nmf: NMF model object
    - W: feature matrix output from NMF factorization into W and H matrices
    '''
    # tfidf model
    custom_stop_words = make_stop_words()
    tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                       stop_words=custom_stop_words,
                                       max_features=max_features)
    tfidf_sparse = tfidf_vectorizer.fit_transform(release_texts)

    # normalize row-wise so each row sums to one
    tfidf_sparse = normalize(tfidf_sparse, axis=1, norm='l1')

    # nmf model
    nmf = NMF(n_components=n_components, random_state=1)
    nmf.fit(tfidf_sparse)
    W = nmf.transform(tfidf_sparse)
    return tfidf_vectorizer, tfidf_sparse, nmf, W
def plot_nmf_illustration():
    rnd = np.random.RandomState(5)
    X_ = rnd.normal(size=(300, 2))
    # Add 8 to make sure every point lies in the positive part of the space
    X_blob = np.dot(X_, rnd.normal(size=(2, 2))) + rnd.normal(size=2) + 8

    nmf = NMF(random_state=0)
    nmf.fit(X_blob)
    X_nmf = nmf.transform(X_blob)

    fig, axes = plt.subplots(1, 2, figsize=(15, 5))

    axes[0].scatter(X_blob[:, 0], X_blob[:, 1], c=X_nmf[:, 0], linewidths=0,
                    s=60, cmap='viridis')
    axes[0].set_xlabel("feature 1")
    axes[0].set_ylabel("feature 2")
    axes[0].arrow(0, 0, nmf.components_[0, 0], nmf.components_[0, 1],
                  width=.1, head_width=.3, color='k')
    axes[0].arrow(0, 0, nmf.components_[1, 0], nmf.components_[1, 1],
                  width=.1, head_width=.3, color='k')
    axes[0].set_aspect('equal')
    axes[0].set_title("NMF with two components")

    # second plot
    nmf = NMF(random_state=0, n_components=1)
    nmf.fit(X_blob)

    axes[1].scatter(X_blob[:, 0], X_blob[:, 1], c=X_nmf[:, 0], linewidths=0,
                    s=60, cmap='viridis')
    axes[1].set_xlabel("feature 1")
    axes[1].set_ylabel("feature 2")
    axes[1].arrow(0, 0, nmf.components_[0, 0], nmf.components_[0, 1],
                  width=.1, head_width=.3, color='k')
    axes[1].set_aspect('equal')
    axes[1].set_title("NMF with one component")
def test_nmf_transform():
    # Test that NMF.transform returns close values
    A = np.abs(random_state.randn(6, 5))
    m = NMF(n_components=4, init="nndsvd", random_state=0)
    ft = m.fit_transform(A)
    t = m.transform(A)
    assert_array_almost_equal(ft, t, decimal=2)
class NMFReducer():

    def __init__(self, dataset, dataset_name, num_components=10):
        self.dataset = dataset
        self.dataset_name = dataset_name
        self.labels = dataset.target
        self.scaler = MinMaxScaler()
        self.data = self.scaler.fit_transform(dataset.data)
        self.n_samples, self.n_features = self.data.shape

        self.reducer = NMF(n_components=num_components, max_iter=5000)

    def reduce(self):
        self.reducer.fit(self.data)
        self.reduced = self.scaler.fit_transform(
            self.reducer.transform(self.data))
        return self.reduced

    def benchmark(self, estimator, name, data):
        t0 = time()
        sample_size = 300
        labels = self.labels
        estimator.fit(data)
        print('% 9s   %.2fs   %i   %.3f   %.3f   %.3f   %.3f   %.3f   %.3f'
              % (name, (time() - t0), estimator.inertia_,
                 metrics.homogeneity_score(labels, estimator.labels_),
                 metrics.completeness_score(labels, estimator.labels_),
                 metrics.v_measure_score(labels, estimator.labels_),
                 metrics.adjusted_rand_score(labels, estimator.labels_),
                 metrics.adjusted_mutual_info_score(labels, estimator.labels_),
                 metrics.silhouette_score(data, estimator.labels_,
                                          metric='euclidean',
                                          sample_size=sample_size)))

    def display_reduced_digits(self):
        sys.stdout = open('out/NMFReduceDigitsOutput.txt', 'w')
        print("NMF Reduction of %s:\n" % self.dataset_name)
        print(40 * '-')
        print(self.reduced)
        print("\nLength of 1 input vector before reduction: %d \n"
              % len(self.data.tolist()[0]))
        print("Length of 1 input vector after reduction: %d \n"
              % len(self.reduced.tolist()[0]))
        print(40 * '-')
        print(self.reducer.reconstruction_err_)

    def display_reduced_iris(self):
        sys.stdout = open('out/NMFReduceIrisOutput.txt', 'w')
        print("NMF Reduction of %s:\n" % self.dataset_name)
        print(40 * '-')
        print(self.reduced)
        print("\nLength of 1 input vector before reduction: %d \n"
              % len(self.data.tolist()[0]))
        print("Length of 1 input vector after reduction: %d \n"
              % len(self.reduced.tolist()[0]))
        print(40 * '-')
        print(self.reducer.reconstruction_err_)

    def reduce_crossvalidation_set(self, X_train, X_test):
        self.reducer.fit(X_train)
        # project with the fitted reducer (the original mistakenly applied
        # the MinMax scaler here instead of the NMF reducer)
        reduced_X_train = self.reducer.transform(X_train)
        reduced_X_test = self.reducer.transform(X_test)
        return reduced_X_train, reduced_X_test
def test_nmf_transform():
    # Test that NMF.transform returns close values
    A = np.abs(random_state.randn(6, 5))
    for solver in ('pg', 'cd'):
        m = NMF(solver=solver, n_components=4, init='nndsvd', random_state=0)
        ft = m.fit_transform(A)
        t = m.transform(A)
        assert_array_almost_equal(ft, t, decimal=2)
def test(cls, csv, K=3, dr='PCA'):
    '''
    csv - A csv file without header.
    '''
    from sklearn.decomposition import PCA, NMF
    from sklearn.random_projection import GaussianRandomProjection
    from sklearn.manifold import MDS, TSNE
    from sklearn.cluster import KMeans
    from sklearn.preprocessing import OneHotEncoder

    X = pd.read_csv(csv, header=None).values
    Z = None
    Xr = None

    if dr == 'PCA':
        pca = PCA(n_components=K)  # keep the first K components
        pca.fit(X)
        Z = pca.transform(X)
        Xr = pca.inverse_transform(Z)
    elif dr == 'NMF':
        # make sure X is non-negative
        Xmin = np.min(X)
        if Xmin < 0:
            X = X - Xmin
        nmf = NMF(n_components=K)  # keep the first K components
        nmf.fit(X)
        Z = nmf.transform(X)
        Xr = nmf.inverse_transform(Z)
        if Xmin < 0:
            Xr = Xr + Xmin
    elif dr == 'RP':
        grp = GaussianRandomProjection(n_components=K)  # keep the first K components
        Z = grp.fit_transform(X)
    elif dr == 'VQ':
        kmeans = KMeans(n_clusters=K).fit(X)
        Xvq = kmeans.predict(X)
        H = kmeans.cluster_centers_
        ohe = OneHotEncoder()
        Z = ohe.fit_transform(Xvq.reshape(-1, 1)).A
        Xr = Z @ H
    elif dr == 'MDS':
        mds = MDS(n_components=K)  # keep the first K components
        Z = mds.fit_transform(X)
    elif dr == 'TSNE':
        tsne = TSNE(n_components=K)  # fixed: the original instantiated MDS here
        Z = tsne.fit_transform(X)
    elif dr == 'IDENTITY':
        # for this case, K is not used.
        Z = X
        Xr = X
    else:
        raise Exception("Invalid DR name")
    return cls(X, Z, Xr)
def MF(filename, K):
    data_matrix = np.loadtxt(str(filename), delimiter=" ")
    num_row, num_col = data_matrix.shape
    f_matrix = data_matrix[0:num_row, 0:num_col - 2]
    class_label = np.array(data_matrix[0:num_row, num_col - 2:num_col - 1])
    label_list = list(map(int, class_label.T.tolist()[0]))  # list() for Python 3
    year_infor = np.array(data_matrix[0:num_row, num_col - 1:num_col])
    year_list = list(map(int, year_infor.T.tolist()[0]))

    # do NMF on f_matrix
    model = NMF(n_components=K, random_state=None)
    model.fit(f_matrix)
    W_matrix = model.transform(f_matrix)
    transformed_matrix = np.hstack((W_matrix, class_label)).tolist()

    # group the papers based on temporal information
    year_datalistlist_dict = {}
    for index in range(len(year_list)):
        year_datalistlist_dict.setdefault(year_list[index], []).append(
            transformed_matrix[index])

    dimen = K
    sorted_year_list = sorted(year_datalistlist_dict.keys())
    train_set_list = []
    train_label_list = []
    test_set_list = []
    test_label_list = []
    test_year_list = sorted_year_list[len(sorted_year_list) - 4:]

    # training/test partition based on temporal information; more recent
    # papers go into the test set
    for k, v in year_datalistlist_dict.items():
        if int(k) not in test_year_list:
            # put all the corresponding papers into the training set
            for train in range(len(v)):
                train_set_list.append(v[train][0:dimen])
                train_label_list.append(int(v[train][-1]))
        else:
            # put all the corresponding papers into the test set
            for test in range(len(v)):
                test_set_list.append(v[test][0:dimen])
                test_label_list.append(int(v[test][-1]))
    num_cluster_test = len(set(test_label_list))
    return [train_set_list, train_label_list, test_set_list, test_label_list,
            num_cluster_test]
def get_first_nmf_component(X):
    corr_matrix = np.dot(X.T, X) / (X.shape[0] - 1)
    nmf = NMF(n_components=1)
    nmf.fit(corr_matrix)
    print("\t\tReconstruction error: {:.2f}".format(nmf.reconstruction_err_))
    print("\t\tNumber of iterations: {}".format(nmf.n_iter_))
    return nmf.transform(corr_matrix)
def test_nmf_transform(solver):
    # Test that NMF.transform returns close values
    rng = np.random.mtrand.RandomState(42)
    A = np.abs(rng.randn(6, 5))
    m = NMF(solver=solver, n_components=3, init='random',
            random_state=0, tol=1e-5)
    ft = m.fit_transform(A)
    t = m.transform(A)
    assert_array_almost_equal(ft, t, decimal=2)
def NMF_train(X_train, X_test, n):
    nmf_model = NMF(n_components=n)
    # fit_transform both fits the model and returns W; the original called
    # fit() and then fit_transform(), factorizing X_train twice
    W = nmf_model.fit_transform(X_train)
    H = nmf_model.components_
    W_test = nmf_model.transform(X_test)
    return H, W, W_test
def get_factorization(V, num_roles):
    """
    Obtains a nonnegative matrix factorization of the matrix V
    with num_roles intermediate roles.
    """
    model = NMF(n_components=num_roles, init='random', random_state=0)
    model.fit(V)
    node_roles = model.transform(V)
    role_features = model.components_
    return np.matrix(node_roles), np.matrix(role_features)
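# A minimal sketch exercising get_factorization above on a toy node-feature
# matrix (hypothetical data: rows are nodes, columns are structural features):
import numpy as np

V = np.abs(np.random.RandomState(0).rand(6, 4))   # 6 nodes, 4 features
node_roles, role_features = get_factorization(V, num_roles=2)
print(node_roles.shape, role_features.shape)       # (6, 2) and (2, 4)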
def performNMF(M, fragmentsLookupTable, fragmentsCount):
    if args.verbose:
        print("- %s START : calculating NMF" % (timeStamp()))
    t0 = time()
    # note: 'beta' and 'sparseness' are parameters of older scikit-learn
    # NMF versions (removed in later releases)
    model = NMF(n_components=args.components, init='nndsvd', beta=10000.0,
                max_iter=1000, tol=5e-3, sparseness='components')
    model.fit(M)
    train_time = (time() - t0)
    components_ = model.components_
    N = model.transform(M)
    if args.verbose:
        print("- %s FINISH : calculating NMF" % (timeStamp()))
        print("- %s START : mapping components" % (timeStamp()))

    # convert components into locations
    for i in range(args.components):
        # 'wt' (text mode) so the formatted strings can be written directly
        output = gzip.open(
            args.outdir + "/NMF_component_" + str(i) + ".txt.gz", 'wt')
        if args.verbose:
            print("- : processing component %d" % (i))
        try:
            for j in range(model.components_[i].shape[0]):
                fragment1 = j // fragmentsCount   # integer division
                fragment2 = j % fragmentsCount
                (chr1, midpoint1) = fragmentsLookupTable[fragment1]
                (chr2, midpoint2) = fragmentsLookupTable[fragment2]
                output.write("%s\t%i\t%s\t%i\t%f\n"
                             % (chr1, midpoint1, chr2, midpoint2,
                                model.components_[i][j]))
        finally:
            output.close()

    if args.verbose:
        print("- %s FINISH : mapping components" % (timeStamp()))
    return (N, model)
def nmf_model(corpus_trigrams, num_topics):
    processed_corpus_str = [' '.join(word) for word in corpus_trigrams]
    tfidf_vectorizer = TfidfVectorizer(max_df=0.90, min_df=5,
                                       ngram_range=(1, 3),
                                       stop_words='english')
    tfidf = tfidf_vectorizer.fit_transform(processed_corpus_str)
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()
    nmf = NMF(n_components=num_topics, random_state=1, alpha=.1,
              l1_ratio=.5, init='nndsvd').fit(tfidf)
    nmf_W = nmf.transform(tfidf)
    nmf_H = nmf.components_
    return (nmf_H, nmf_W, tfidf_feature_names)
def get_factorization(V, num_roles):
    """
    Obtains a nonnegative matrix factorization of the matrix V
    with num_roles intermediate roles.
    """
    model = NMF(n_components=num_roles, init='random', random_state=0)
    model.fit(V)
    node_roles = model.transform(V)
    role_features = model.components_
    return torch.from_numpy(node_roles), torch.from_numpy(role_features)
def reduce_dim_NMF(tfidf_train, tfidf_test, k):
    model = NMF(n_components=k, init='random')
    W_train = model.fit_transform(tfidf_train)
    H_train = model.components_
    tfidf_train_hat = W_train.dot(H_train)
    distance_train = np.linalg.norm(tfidf_train - tfidf_train_hat, 'fro') ** 2
    W_test = model.transform(tfidf_test)
    return W_train, W_test, distance_train
def test_nmf_inverse_transform():
    # Test that NMF.inverse_transform returns close values
    random_state = np.random.RandomState(0)
    A = np.abs(random_state.randn(6, 4))
    m = NMF(n_components=4, init="random", random_state=0)
    m.fit_transform(A)
    t = m.transform(A)
    A_new = m.inverse_transform(t)
    assert_array_almost_equal(A, A_new, decimal=2)
def cluster_score_store():
    print("start new run clustering, scoring and storing result")
    conn = psycopg2.connect(database='hedda', user='******')
    c = conn.cursor()

    # get the last day's tweets from the db
    c.execute('''SELECT id, text, url, (favcount + retweetcount), date_time
                 FROM tweets
                 WHERE date_time::date > current_date - interval '1' day;''')
    texts = pd.DataFrame(c.fetchall())
    texts.columns = ['id', 'text', 'url', 'sumfavrtcount', 'date_time']
    texts['date_time'] = texts['date_time'].apply(pd.to_datetime)
    texts['hrs'] = texts['date_time'].apply(
        lambda x: (datetime.now() - x).total_seconds() / 60 / 60)
    texts['favrt_hour'] = texts['sumfavrtcount'] / texts['hrs']

    # prepare vectorized text
    tfidfvect = TfidfVectorizer(max_features=100, max_df=0.7, min_df=.01,
                                tokenizer=clean_tokenized_text)
    tfidfvect.fit(texts['text'].values)
    X = tfidfvect.transform(texts['text'].values)
    feature_names = tfidfvect.get_feature_names()

    nmf = NMF(n_components=25, max_iter=5000).fit(X)
    topic_labels = []
    for topic_idx, topic in enumerate(nmf.components_):
        topic_labels.append(" ".join(
            [feature_names[i] for i in topic.argsort()[:-3 - 1:-1]]))

    y_hat = nmf.transform(X)
    y_norm = [y / y.sum() for y in y_hat]
    df_y_hat = pd.DataFrame(y_norm, columns=topic_labels,
                            index=texts.index.values)
    df_y_hat = df_y_hat.fillna(0)
    df_nmf = texts.join(df_y_hat)

    # pick the winning cluster for every tweet
    df_nmf['HighScore'] = df_nmf[topic_labels].max(axis=1)
    df_nmf['Cluster'] = df_nmf[topic_labels].idxmax(axis=1)
    # keep only high scores above 0.85
    df_nmf = df_nmf[df_nmf['HighScore'] >= 0.85]

    # create a cluster dataset with the highest scoring tweets per cluster,
    # based on the amount of retweets and favorites per hour
    idx = df_nmf.groupby(['Cluster'])['favrt_hour'].transform(max) == df_nmf['favrt_hour']
    df_clus_favrt = pd.DataFrame(df_nmf[idx][['Cluster', 'id', 'date_time',
                                              'text', 'url', 'HighScore',
                                              'sumfavrtcount', 'favrt_hour']])
    df_temp = df_nmf.groupby('Cluster').sum()['sumfavrtcount'].reset_index()
    df_temp.columns = ['Cluster', 'total-rt-fav']
    df_clus_favrt = pd.merge(df_clus_favrt, df_temp, on='Cluster')
    df_clus_favrt.columns = ['descr', 'id', 'date_time', 'tweettext', 'url',
                             'score', 'sum-rt-fav-toptweet', 'favrt_hour',
                             'total-rt-fav']

    # export tweet URL, tweet text, tweet date and any additional useful
    # information; the best cluster is the one with the highest total
    # retweets and favorites (sort_values replaces the removed DataFrame.sort)
    export = df_clus_favrt.sort_values('total-rt-fav', ascending=False).head(5)[
        ['descr', 'url', 'tweettext', 'date_time']]
    export.to_csv('resultnewstweets-%s.csv' % datetime.now(), index=False)
    print("new file created at: %s" % datetime.now())
def run():
    '''
    Standard topic analysis copied from the scikit-learn example at
    http://scikit-learn.org/stable/auto_examples/applications/topics_extraction_with_nmf.html
    '''
    t0 = time()
    n_topics = 100
    n_top_posts = 20
    n_top_words = 50
    ngram = 1
    dataFname = '../DSSG_unleashfootball/word_splits_stopwords'
    originalTexts = '../DSSG_unleashfootball/Original_posts'
    dat = cPickle.load(open(dataFname))   # cPickle is 'pickle' on Python 3
    orig = cPickle.load(open(originalTexts))

    vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, ngram_range=(1, ngram))
    tfidf = vectorizer.fit_transform([' '.join(x) for x in dat])
    nmf = NMF(n_components=n_topics, random_state=1).fit(tfidf)
    print("Done fitting NMF in %0.3fs." % (time() - t0))

    topic_assignments = nmf.transform(tfidf).argmax(axis=1)
    feature_names = vectorizer.get_feature_names()
    sentimentWords = json.loads(open('sentiWords.json').read())  # load_sentiment()
    sentimentTopics = get_sentiments(nmf.components_, vectorizer, sentimentWords)
    sentiments = get_sentiments(tfidf, vectorizer, sentimentWords)

    topics = []
    keywords2Topic = {}
    for topic_idx, topic in enumerate(nmf.components_):
        topicDict = {}
        topicDict['sentiment'] = sentimentTopics[topic_idx]
        topicDict['keywords'] = [
            {'keyword': feature_names[i],
             'weight': nmf.components_[topic_idx, i]}
            for i in topic.argsort()[:-n_top_words - 2:-1]]
        topicDict['label'] = topicDict['keywords'][0]['keyword']
        # count number of posts in this topic
        topicDict['postCount'] = (topic_assignments == topic_idx).sum()
        # get some representative posts
        ranking = tfidf.dot(nmf.components_[topic_idx, :])
        ranks = ranking.argsort()[-n_top_posts:][::-1]
        topicDict['posts'] = []
        for item in ranks:
            topicDict['posts'].append({'post': orig[item],
                                       'relevance': ranking[item],
                                       'sentiment': sentiments[item]})
        # dict.items() returns a view in Python 3, so the original
        # 'dict(keywords2Topic.items() + [...])' is replaced with update()
        keywords2Topic.update({word['keyword']: topic_idx
                               for word in topicDict['keywords']})
        print("Topic #%d (Sentiment %f):" % (topic_idx, sentimentTopics[topic_idx]))
        print(" | ".join([feature_names[i]
                          for i in topic.argsort()[:-n_top_words - 1:-1]]))
        topics.append(topicDict)

    open('topics.json', 'w').write(json.dumps(topics))
    open('keywords2Topic.json', 'w').write(json.dumps(keywords2Topic))
def make_NMF_300_feature(row_body_path, row_stance_path, head_tfidf_pkl,
                         body_tfidf_pkl, label_path, save_nmf_model_path,
                         save_head_path, save_body_path, cos_dist=False):
    if not os.path.exists(head_tfidf_pkl) or not os.path.exists(body_tfidf_pkl) \
            or not os.path.exists(label_path):
        make_tfidf_feature_5000(row_body_path, row_stance_path, head_tfidf_pkl,
                                body_tfidf_pkl, label_path, model_save=True)

    X_tfidf_body = load_model(body_tfidf_pkl)
    X_tfidf_head = load_model(head_tfidf_pkl)

    if not os.path.exists(save_nmf_model_path):
        X_all = np.concatenate(
            (X_tfidf_head.toarray(), X_tfidf_body.toarray()), axis=0)
        print('fit NMF topic model')
        t0 = time()
        nmf = NMF(n_components=300, random_state=1, alpha=.1)
        nmf.fit(X_all)
        print('done in {}'.format(time() - t0))
        save_model(save_nmf_model_path, nmf)

    nmf = load_model(save_nmf_model_path)

    if not os.path.exists(save_head_path) or not os.path.exists(save_body_path):
        nmf_head_matrix = nmf.transform(X_tfidf_head)
        nmf_body_matrix = nmf.transform(X_tfidf_body)
        save_model(save_head_path, nmf_head_matrix)
        print('saved model {}'.format(save_head_path))
        save_model(save_body_path, nmf_body_matrix)
        print('saved model {}'.format(save_body_path))

    nmf_head_matrix = load_model(save_head_path)
    nmf_body_matrix = load_model(save_body_path)

    if not cos_dist:
        return np.concatenate((nmf_head_matrix, nmf_body_matrix), axis=1)
    else:
        X = []
        for i in range(len(nmf_head_matrix)):
            X_head = np.array(nmf_head_matrix[i]).reshape((1, -1))
            X_body = np.array(nmf_body_matrix[i]).reshape((1, -1))
            cos = cosine_distances(X_head, X_body).flatten()
            X.append(cos.tolist())
        X = np.array(X)
        X_train = np.concatenate((nmf_head_matrix, nmf_body_matrix), axis=1)
        X = np.concatenate((X_train, X), axis=1)
        return X
def calc_nmf(self, matrix, vocab, providers, components=None,
             hardclustering=True):
    if components is None:
        components = self.n_topics + self.soft_offset
    print("NMF: Calculating ", components, " components (topics)...")
    nmf = NMF(n_components=components, random_state=1, alpha=.1,
              l1_ratio=.5).fit(matrix)
    print("NMF: reconstruction error:", nmf.reconstruction_err_)

    # soft clustering
    cluster_assignments = nmf.transform(matrix)  # samples x components

    # derive topics
    topics = {}
    for c_idx, component in enumerate(nmf.components_):
        # determine top terms
        top = component.argsort()[::-1][:self.top_n]
        top = top[component[top] > 0]
        # store
        topics[c_idx] = {"terms": vocab[top], "weights": component[top]}

    self.__postprocess__(clusters=cluster_assignments, topics=topics,
                         raw_data=matrix, path=self.path + "nmf/",
                         prefix=self.prefix, soft_clustering=True,
                         providers=providers)
    # cluster_assignments = self.__removeInvalid__(
    #     cluster_assignments=cluster_assignments, topics=topics)

    if hardclustering:
        print("NMF: KMeans: Calculating ", self.n_topics,
              " clusters (topics)...")
        cluster_assignments, topics = self.__applyKMeans__(
            raw_data=matrix, vocab=vocab, soft_clustering=cluster_assignments)
        print("NMF: KMeans: ", len(cluster_assignments),
              " cluster assignments")
        self.__postprocess__(clusters=cluster_assignments, topics=topics,
                             raw_data=matrix, path=self.path + "nmf/kmeans/",
                             prefix=self.prefix, soft_clustering=False,
                             providers=providers)
    return cluster_assignments, topics
def nmf_topic_modeling(word_matrix, vocab, n=5):
    nmf = NMF(n_components=n, max_iter=1000)
    nmf.fit(word_matrix)
    topic_matrix = pd.DataFrame(nmf.transform(word_matrix)).add_prefix("topic_")
    word_matrix = pd.DataFrame(nmf.components_,
                               columns=vocab).T.add_prefix('topic_')
    return nmf, nmf.reconstruction_err_, topic_matrix, word_matrix
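# A minimal usage sketch for nmf_topic_modeling above (hypothetical corpus;
# uses CountVectorizer.get_feature_names(), the older scikit-learn API that
# the snippets in this collection assume):
from sklearn.feature_extraction.text import CountVectorizer

docs = ["apples and oranges are fruit",
        "oranges ripen in the sun",
        "cars and trucks share the road"]
cv = CountVectorizer()
counts = cv.fit_transform(docs)
model, err, doc_topics, topic_words = nmf_topic_modeling(
    counts, cv.get_feature_names(), n=2)
print(err)   # reconstruction error of the 2-topic factorization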
def plot_nmf(self, X, cell, outdir):
    corr_matrix = np.dot(X.T, X) / (X.shape[0] - 1)
    nmf = NMF(n_components=2)
    nmf.fit(corr_matrix)
    projections = pd.DataFrame(nmf.transform(corr_matrix))
    projections.columns = ["LF1", "LF2"]
    info = {"X": ("LF1", ""), "Y": ("LF2", "")}
    self.plot(projections, info, cell, outdir)
def matrix_factorization(co_metrix):
    # note: 'eta' is a parameter of older scikit-learn NMF versions
    mf = NMF(n_components=500, init='random', random_state=0, max_iter=100,
             alpha=0.75, eta=0.001)
    mf.fit(co_metrix)
    word_vector = mf.transform(co_metrix)
    return word_vector
def fit(self):
    nmf = NMF(**self.fit_parameters)
    nmf.fit(self.input_data)
    self.output_data = nmf.transform(self.input_data)
    self.mapper_data = nmf.components_
    self.model_attributes = {"n_topics": nmf.n_components}
    self._log_model_results()
    return self
def test_nmf_inverse_transform():
    # Test that NMF.inverse_transform returns close values
    random_state = np.random.RandomState(0)
    A = np.abs(random_state.randn(6, 4))
    for solver in ('pg', 'cd'):
        m = NMF(solver=solver, n_components=4, init='random', random_state=0)
        m.fit_transform(A)
        t = m.transform(A)
        A_new = m.inverse_transform(t)
        assert_array_almost_equal(A, A_new, decimal=2)
def run_nmf(X, asins, features, n_components=20):
    nmf = NMF(n_components)
    nmf.fit(X)
    W = nmf.transform(X)
    H = nmf.components_
    # make interpretable: wrap the factor matrices in labeled DataFrames
    # (the original also had a no-op 'W, H = (x for x in (W, H))' here)
    W_df = pd.DataFrame(W, index=asins)
    H_df = pd.DataFrame(H, columns=features)
    return (W, H, W_df, H_df)
def test_parameter_checking():
    A = np.ones((2, 2))
    name = 'spam'
    # FIXME : should be removed in 1.1
    init = 'nndsvda'
    msg = "Invalid solver parameter: got 'spam' instead of one of"
    with pytest.raises(ValueError, match=msg):
        NMF(solver=name, init=init).fit(A)
    msg = "Invalid init parameter: got 'spam' instead of one of"
    with pytest.raises(ValueError, match=msg):
        NMF(init=name).fit(A)
    msg = "Invalid regularization parameter: got 'spam' instead of one of"
    with pytest.raises(ValueError, match=msg):
        NMF(regularization=name, init=init).fit(A)
    msg = "Invalid beta_loss parameter: got 'spam' instead of one"
    with pytest.raises(ValueError, match=msg):
        NMF(solver='mu', init=init, beta_loss=name).fit(A)
    msg = (
        "Invalid beta_loss parameter: solver 'cd' does not handle "
        "beta_loss = 1.0"
    )
    with pytest.raises(ValueError, match=msg):
        NMF(solver='cd', init=init, beta_loss=1.0).fit(A)

    msg = "Negative values in data passed to"
    with pytest.raises(ValueError, match=msg):
        NMF(init=init).fit(-A)
    with pytest.raises(ValueError, match=msg):
        nmf._initialize_nmf(-A, 2, 'nndsvd')
    clf = NMF(2, tol=0.1, init=init).fit(A)
    with pytest.raises(ValueError, match=msg):
        clf.transform(-A)

    for init in ['nndsvd', 'nndsvda', 'nndsvdar']:
        msg = re.escape(
            "init = '{}' can only be used when "
            "n_components <= min(n_samples, n_features)"
            .format(init)
        )
        with pytest.raises(ValueError, match=msg):
            NMF(3, init=init).fit(A)
        with pytest.raises(ValueError, match=msg):
            nmf._initialize_nmf(A, 3, init)
def nmf_applied_to_wikipedia_articles(articles):
    # Create an NMF instance: model
    model = NMF(n_components=6)
    # Fit the model to articles
    model.fit(articles)
    # Transform the articles: nmf_features
    nmf_features = model.transform(articles)
    # Print the NMF features
    print(nmf_features.round(2))
    return nmf_features
def test_nmf_inverse_transform():
    # Test that NMF.inverse_transform returns close values
    random_state = np.random.RandomState(0)
    A = np.abs(random_state.randn(6, 4))
    for solver in ('pg', 'cd'):
        m = NMF(solver=solver, n_components=4, init='random', random_state=0)
        ft = m.fit_transform(A)
        t = m.transform(A)
        A_new = m.inverse_transform(t)
        assert_array_almost_equal(A, A_new, decimal=2)
def topic_dummies(df):
    # CLEAN HTML FUNCTION
    def get_text(cell):
        return BeautifulSoup(cell, 'html.parser').get_text()

    # Parse descriptions using the html function above:
    df['description'] = df['description'].apply(get_text)
    df['org_desc'] = df['org_desc'].apply(get_text)
    clean = df['description']

    # All the parameters for the topic modeling.
    n_samples = len(clean)
    n_features = 500
    n_topics = 9
    n_top_words = 30
    my_additional_stopwords = ["la", "et", "en", "le", "les", "des",
                               'january', 'february', 'march', 'april',
                               'may', 'june', 'july', 'august', 'september',
                               'october', 'november', 'december',
                               'friday', 'thursday', 'saturday']
    stop_words = text.ENGLISH_STOP_WORDS.union(my_additional_stopwords)

    # Use tf-idf features for NMF.
    tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                       max_features=n_features,
                                       stop_words=stop_words)
    tfidf = tfidf_vectorizer.fit_transform(clean)

    # Fit the NMF model
    nmf = NMF(n_components=n_topics, random_state=1, alpha=.1,
              l1_ratio=.5).fit(tfidf)

    # Leave this turned off unless you want to print.
    # tfidf_feature_names = tfidf_vectorizer.get_feature_names()
    # print_top_words(nmf, tfidf_feature_names, n_top_words)

    '''
    # Assign topics to descriptions:
    # These are from the full data. Do NOT use these descriptions on any
    # subset, as they will not match.
    topic_dict = {0: 'dinner_party', 1: 'educational', 2: 'social_networks',
                  3: 'logistics', 4: 'business', 5: 'university',
                  6: 'club_logistics', 7: 'workshop', 8: 'club_content'}
    '''
    topic_dict = {0: 'topic1', 1: 'topic2', 2: 'topic3', 3: 'topic4',
                  4: 'topic5', 5: 'topic6', 6: 'topic7', 7: 'topic8',
                  8: 'topic9'}
    W = nmf.transform(tfidf)
    df['topic_index'] = np.argmax(W, axis=1)
    df['topic_index'] = df['topic_index'].replace(topic_dict)

    # Create dummy variables to insert into the model
    topic_dummies = pd.get_dummies(df['topic_index']).rename(
        columns=lambda x: 'topic_' + str(x))
    df = pd.concat([df, topic_dummies], axis=1)
    return df
def test_parameter_checking():
    A = np.ones((2, 2))
    name = "spam"
    # FIXME : should be removed in 1.1
    init = "nndsvda"
    msg = "Invalid solver parameter: got 'spam' instead of one of"
    with pytest.raises(ValueError, match=msg):
        NMF(solver=name, init=init).fit(A)
    msg = "Invalid init parameter: got 'spam' instead of one of"
    with pytest.raises(ValueError, match=msg):
        NMF(init=name).fit(A)
    with ignore_warnings(category=FutureWarning):
        # TODO remove in 1.2
        msg = "Invalid regularization parameter: got 'spam' instead of one of"
        with pytest.raises(ValueError, match=msg):
            NMF(regularization=name, init=init).fit(A)
    msg = "Invalid beta_loss parameter: got 'spam' instead of one"
    with pytest.raises(ValueError, match=msg):
        NMF(solver="mu", init=init, beta_loss=name).fit(A)
    msg = "Invalid beta_loss parameter: solver 'cd' does not handle beta_loss = 1.0"
    with pytest.raises(ValueError, match=msg):
        NMF(solver="cd", init=init, beta_loss=1.0).fit(A)

    msg = "Negative values in data passed to"
    with pytest.raises(ValueError, match=msg):
        NMF(init=init).fit(-A)
    with pytest.raises(ValueError, match=msg):
        nmf._initialize_nmf(-A, 2, "nndsvd")
    clf = NMF(2, tol=0.1, init=init).fit(A)
    with pytest.raises(ValueError, match=msg):
        clf.transform(-A)

    for init in ["nndsvd", "nndsvda", "nndsvdar"]:
        msg = re.escape(
            "init = '{}' can only be used when "
            "n_components <= min(n_samples, n_features)".format(init))
        with pytest.raises(ValueError, match=msg):
            NMF(3, init=init).fit(A)
        with pytest.raises(ValueError, match=msg):
            nmf._initialize_nmf(A, 3, init)
def main():
    newsgroups_train = fetch_20newsgroups(subset='train')
    newsgroups_test = fetch_20newsgroups(subset='test')
    X_train = newsgroups_train.data
    X_test = newsgroups_test.data
    y_train = newsgroups_train.target
    y_test = newsgroups_test.target
    X_train, X_test = TFIDF(X_train, X_test)

    NMF_ = NMF(n_components=2000)
    # fit on the training split, then project both splits (the original
    # assigned the fitted estimator itself to X_train_new before overwriting it)
    NMF_.fit(X_train)
    X_train_new = NMF_.transform(X_train)
    X_test_new = NMF_.transform(X_test)

    print("train with old features: ", np.array(X_train).shape)
    print("train with new features:", np.array(X_train_new).shape)
    # the original printed the train shapes twice; the test shapes were intended
    print("test with old features: ", np.array(X_test).shape)
    print("test with new features:", np.array(X_test_new).shape)
def JLL(X_train, y_train=None, X_test=None, n=100, init='random'):
    mod = NMF(n_components=n, init=init, random_state=0)
    # fit() returns the estimator, not the embedding, so transform explicitly;
    # the original also referenced undefined names ('train', 'pca') and an
    # unused johnson_lindenstrauss_min_dim import
    mod.fit(X_train, y_train)
    train = mod.transform(X_train)
    if X_test is None:
        out = train
    else:
        test = mod.transform(X_test)
        out = train, test
    return out
def Nmf(self):
    data_set = pd.read_csv(self.data_set_name, header=None, index_col=None)
    data_set = data_set.T
    nmf = NMF(n_components=self.components)
    nmf.fit(data_set)
    data_set = nmf.transform(data_set)
    print("Generate Dre_data.csv.")
    # print("The interpretability of each component:")
    data_set = pd.DataFrame(data_set)
    data_set.to_csv(self.Dred_data, header=False, index=False)
    return 0
def get_latent_vector(X):
    # for language: n_components=150
    # for repo: n_components=?
    model = NMF(n_components=150, init='nndsvd', max_iter=1000,
                random_state=1126)
    print('NMF', model)
    model.fit(X)
    W = model.transform(X)
    H = model.components_
    normalized_matrix = normalize(W, axis=1, norm='l2')
    return normalized_matrix
def test_sparse_transform():
    # Test that transform works on sparse data.  Issue #2124
    A = np.abs(random_state.randn(3, 2))
    A[A > 1.0] = 0
    A = csc_matrix(A)

    model = NMF(random_state=0, tol=1e-4, n_components=2)
    A_fit_tr = model.fit_transform(A)
    A_tr = model.transform(A)
    assert_array_almost_equal(A_fit_tr, A_tr, decimal=1)
def reduce_dimensions(total_mat, n_topics):
    """
    Calculates and returns the NMF document-topic matrix.
    Input is a data matrix, shape (n_samples, n_features).
    Returns the W array, shape (n_samples, n_components).
    """
    nmf = NMF(n_components=n_topics, random_state=42, alpha=.2, l1_ratio=0.5)
    nmf.fit(total_mat)
    W = nmf.transform(total_mat)   # document-topic matrix
    H = nmf.components_            # topic-term matrix (unused here)
    # return W as the docstring promises (the original returned the model)
    return W
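# A minimal usage sketch for reduce_dimensions above (hypothetical data):
# assign each sample to its strongest topic from the returned W matrix.
import numpy as np

total_mat = np.abs(np.random.RandomState(42).rand(30, 12))
W = reduce_dimensions(total_mat, n_topics=4)
topic_of_each_sample = W.argmax(axis=1)
print(topic_of_each_sample[:10])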
def nmf(df, week, questionNb, nbTopic, n, plt):
    """
    df:         dataframe containing documents/sentences
    week:       which week do you want?
    questionNb: which question in this week?
    nbTopic:    how many topics do you think these documents have?
    n:          find the top n sentences contributing to each topic
    """
    print('Welcome to NMF algorithm.')
    print('Begin find topics for all answers of question number {i} of week {w}'
          .format(i=questionNb, w=week))
    df = df[df['week'] == week]
    df['relevant'] = df['processed_responses'].apply(
        lambda x: x[questionNb - 1] if len(x) > questionNb - 1 else '')
    df = df.reset_index().drop('index', 1)

    # Non-negative Matrix Factorization is able to use tf-idf
    tfidf_vectorizer = TfidfVectorizer(min_df=7, max_df=18)  # (max_features=vocab_size)
    tfidf = tfidf_vectorizer.fit_transform(df['relevant'])
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()
    print('Shape of tfidf matrix:', tfidf.shape)

    # NMF
    nmf = NMF(n_components=nbTopic, random_state=1, alpha=.1, l1_ratio=.5,
              init='nndsvd').fit(tfidf)
    print('Topic words distribution shape:', nmf.components_.shape)
    plot_topics(nmf, tfidf_feature_names, nbTopic, plt)
    sim_matrix = concepts_responses_similarity(nmf.components_, tfidf.toarray())

    # Retrieve the top n responses for each topic
    d = {}
    for i in range(nbTopic):
        response_scores = sim_matrix[:, i]
        top_indexes = response_scores.argsort()[-n:][::-1]
        top_responses = []
        for index in top_indexes:
            top_responses.append(df.iloc[index]['standardized_responses'])
        d['Topic #' + str(i)] = top_responses
    return df, pd.DataFrame.from_dict(d), nmf.transform(tfidf)
def initialize(self):
    # TfIdf vectors
    tfidf_vectors, tfidf_vectorizer = self.vectorize_as_tfidf(self.naked_docs)
    nmf = NMF(n_components=self.num_topics, random_state=1, alpha=.1,
              l1_ratio=.5).fit(tfidf_vectors)
    self.vectors = tfidf_vectors
    self.vectorizer = tfidf_vectorizer
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()
    self.topics = self.get_topics(nmf, tfidf_feature_names,
                                  self.num_topic_words)
    self.doc_topic_distrib = nmf.transform(tfidf_vectors)
    self.model = nmf
def NMF_(X_train_tfidf, X_test_tfidf):
    model = NMF(n_components=50, init='random', random_state=0)
    W_train_r = model.fit_transform(X_train_tfidf)
    W_test_r = model.transform(X_test_tfidf)
    H = model.components_
    # squared Frobenius norm of the training reconstruction residual
    Err_NMF = np.sum(np.array(X_train_tfidf - W_train_r.dot(H)) ** 2)
    return W_train_r, W_test_r, H, Err_NMF
def test_sparse_transform():
    # Test that transform works on sparse data.  Issue #2124
    A = np.abs(random_state.randn(3, 2))
    A[A > 1.0] = 0
    A = csc_matrix(A)
    for solver in ('pg', 'cd'):
        model = NMF(solver=solver, random_state=0, tol=1e-4, n_components=2)
        A_fit_tr = model.fit_transform(A)
        A_tr = model.transform(A)
        assert_array_almost_equal(A_fit_tr, A_tr, decimal=1)
def main():
    mat = np.zeros(shape=(0, 16 * 16 * 10))
    weightsArr = []
    arr = []
    f = open('../data/coarse.pkl', 'rb')
    while True:
        try:
            scan = pickle.load(f)
        except EOFError:   # was a bare except; stop at the end of the pickle stream
            break
        d = scan.data
        arr.append(d.shape[0])
        weightsArr.append(Weights(scan))
        mat = np.append(
            mat,
            np.reshape(d, (d.shape[0], d.shape[1] * d.shape[2] * d.shape[3])),
            axis=0)
        if len(arr) % 20 == 0:
            print(len(arr))
    print(mat.shape)
    np.save("../data/mat.npy", mat)

    doPCA = True
    doNMF = False

    if doPCA:
        pca = PCA(n_components=700, whiten=True)
        pca.fit(mat)
        print("PCA Fitted")
        curr = 0
        for i, x in enumerate(arr):
            weights = pca.transform(mat[curr:(curr + x), :])
            curr += x
            weightsArr[i].setWeights(weights)
        f = open('../data/pca.pkl', 'wb')
        pickle.dump(weightsArr, f, pickle.HIGHEST_PROTOCOL)
        pickle.dump(pca.components_, f, pickle.HIGHEST_PROTOCOL)

    if doNMF:
        nmf = NMF(n_components=20)
        nmf.fit(mat)
        print("NMF Fitted")
        curr = 0
        for i, x in enumerate(arr):
            weights = nmf.transform(mat[curr:(curr + x), :])
            curr += x
            weightsArr[i].setWeights(weights)
        f = open('../data/nmf.pkl', 'wb')
        pickle.dump(weightsArr, f, pickle.HIGHEST_PROTOCOL)
        pickle.dump(nmf.components_, f, pickle.HIGHEST_PROTOCOL)
def model_ratings_NMF(ratings, movies_ind, n_components):
    R = pd.DataFrame(ratings)
    # model assumes R ~ PQ'
    model = NMF(n_components=n_components, init='random', random_state=10)
    model.fit(R)
    P = model.components_    # movie features
    Q = model.transform(R)   # user features

    # user_ratings is expected to be defined in the enclosing scope
    query = user_ratings.reshape(1, -1)
    t = model.transform(query)

    # predicted movie ratings for the input user
    outcome = np.dot(t, P)
    outcome = pd.DataFrame(outcome).transpose()
    outcome['movieId'] = movies_ind['movieId']
    outcome = outcome.rename(columns={0: 'rating'})
    # top 150 ratings from the predictions list
    top = outcome.sort_values(by='rating', ascending=False).head(150)
    return top
def get_features(head_and_body):
    filename = "NMF_topics" + str(n_topics) + "topics"

    if include_holdout == True:
        filename += "_holdout"

    if include_unlbled_test == True:
        filename += "unlbled_test"

    if not os.path.exists(features_dir + "/" + filename + ".pkl"):
        X_all, vocab = get_all_data(head_and_body, filename)

        # Calculates the n most important topics of the bodies. Each topic
        # contains all words, ordered by importance. The more important topic
        # words a body contains for a certain topic, the higher its value
        # for that topic.
        nfm = NMF(n_components=n_topics, random_state=1, alpha=.1)

        print("NMF_topics: fit and transform body")
        t0 = time()
        nfm.fit_transform(X_all)
        print("done in %0.3fs." % (time() - t0))

        with open(features_dir + "/" + filename + ".pkl", 'wb') as handle:
            joblib.dump(nfm, handle, protocol=pickle.HIGHEST_PROTOCOL)
    else:
        vocab = get_vocab(head_and_body, filename)
        with open(features_dir + "/" + filename + ".pkl", 'rb') as handle:
            nfm = joblib.load(handle)

    vectorizer_head = TfidfVectorizer(vocabulary=vocab, norm='l2')
    X_train_head = vectorizer_head.fit_transform(headlines)

    vectorizer_body = TfidfVectorizer(vocabulary=vocab, norm='l2')
    X_train_body = vectorizer_body.fit_transform(bodies)

    print("NMF_topics: transform head and body")
    # Apply the NMF model trained on body topics to the headlines: if the
    # headlines and bodies share topics, their vectors should be similar.
    nfm_head_matrix = nfm.transform(X_train_head)
    nfm_body_matrix = nfm.transform(X_train_body)

    if cosinus_dist == False:
        return np.concatenate([nfm_head_matrix, nfm_body_matrix], axis=1)
    else:
        # calculate the cosine distance between body and head
        X = []
        for i in range(len(nfm_head_matrix)):
            X_head_vector = np.array(nfm_head_matrix[i]).reshape((1, -1))  # 1d array is deprecated
            X_body_vector = np.array(nfm_body_matrix[i]).reshape((1, -1))
            cos_dist = cosine_distances(X_head_vector, X_body_vector).flatten()
            X.append(cos_dist.tolist())
        return X
def test_nmf_sparse_transform():
    # Test that transform works on sparse data.  Issue #2124
    rng = np.random.mtrand.RandomState(42)
    A = np.abs(rng.randn(3, 2))
    A[1, 1] = 0
    A = csc_matrix(A)

    for solver in ('cd', 'mu'):
        model = NMF(solver=solver, random_state=0, n_components=2,
                    max_iter=400)
        A_fit_tr = model.fit_transform(A)
        A_tr = model.transform(A)
        assert_array_almost_equal(A_fit_tr, A_tr, decimal=1)
class Factorizer(MultiScalePatches):

    def fit(self, data, k=5):
        super(Factorizer, self).fit(data)
        ftr = self.compute(data)
        self.nnmf = NMF(n_components=k).fit(ftr)
        return self

    def transform(self, data):
        ftr = self.compute(data)
        return self.nnmf.transform(ftr)

    def fit_transform(self, data, k=5):
        return self.fit(data, k=k).transform(data)
def nmf_articles(df, n_topics, n_features=5000, n_top_words=20,
                 random_state=None, max_df=1, min_df=1):
    tfidf, feature_names, reverse_lookup = create_document_vector(
        df, max_features=n_features, max_df=max_df, min_df=min_df)
    nmf = NMF(n_components=n_topics, random_state=random_state, alpha=.1,
              l1_ratio=0.25).fit(tfidf)
    W = nmf.transform(tfidf)
    # The attribution for each row in W is not a percentage, but we want to
    # assign each document to every topic it can be at least 10% attributed to.
    sums = np.sum(W, axis=1)
    W_percent = W / sums[:, None]
    # For efficient slicing we return a sparse boolean array
    labels = W_percent >= 0.1
    words = top_words(nmf.components_, feature_names, n_top_words)
    return nmf, tfidf, W, W_percent, labels, words, feature_names, reverse_lookup
def test_non_negative_factorization_consistency():
    # Test that the function is called in the same way, either directly
    # or through the NMF class
    A = np.abs(random_state.randn(10, 10))
    A[:, 2 * np.arange(5)] = 0

    W_nmf, H, _ = non_negative_factorization(A, random_state=1, tol=1e-2)
    W_nmf_2, _, _ = non_negative_factorization(A, H=H, update_H=False,
                                               random_state=1, tol=1e-2)

    model_class = NMF(random_state=1, tol=1e-2)
    W_cls = model_class.fit_transform(A)
    W_cls_2 = model_class.transform(A)
    assert_array_almost_equal(W_nmf, W_cls, decimal=10)
    assert_array_almost_equal(W_nmf_2, W_cls_2, decimal=10)
def nmf_faces(X_train, X_test):
    # Build NMF models with 10, 50, 100 and 500 components;
    # this list will hold the back-transformed test data
    reduced_images = []
    for n_components in [10, 50, 100, 500]:
        # build the NMF model
        nmf = NMF(n_components=n_components, random_state=0)
        nmf.fit(X_train)
        # transform the test data (afterwards it has n_components dimensions)
        X_test_nmf = nmf.transform(X_test)
        # back-transform the transformed test data
        # (afterwards it is in the original space again)
        X_test_back = np.dot(X_test_nmf, nmf.components_)
        reduced_images.append(X_test_back)
    return reduced_images
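# A minimal usage sketch for nmf_faces above. The LFW dataset and the split
# are assumptions for illustration, not part of the original snippet:
from sklearn.datasets import fetch_lfw_people
from sklearn.model_selection import train_test_split

people = fetch_lfw_people(min_faces_per_person=20, resize=0.7)
X = people.data / 255.0   # scale pixel values to [0, 1] for NMF
X_train, X_test = train_test_split(X, random_state=0)
reduced = nmf_faces(X_train, X_test)
print([r.shape for r in reduced])   # each entry keeps the original pixel dimension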
def load_religion(self, path="data/religion.DTA", k=5):
    """Return NMF features from a STATA file."""
    # http://www.thearda.com/Archive/Files/Downloads/RCMSCY10_DL2.asp
    df = pd.read_stata(path)
    id_df = df[["stcode", "cntycode"]].copy()
    id_df.columns = ["st_num", "county_num"]
    cols = [x for x in df.columns if "rate" in x]  # only take percentage cols
    nmf_data = df[cols].fillna(0)
    model = NMF(n_components=k).fit(nmf_data)
    features = model.transform(nmf_data)
    nmf_feats = pd.DataFrame(features)
    # Name columns for interpretability
    nmf_feats.columns = ["relig_nmf_feat_" + str(x)
                         for x in list(nmf_feats.columns)]
    # Join the NMF features. k = number of topics / columns to add
    output = id_df.join(nmf_feats)
    return output
def get_topics(n_components=10, n_top_words=15, print_output=True):
    custom_stop_words = make_stop_words(new_stop_words)
    tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                       stop_words=custom_stop_words)
    tfidf = tfidf_vectorizer.fit_transform(release_texts)
    tfidf = row_normalize_tfidf(tfidf)

    nmf = NMF(n_components=n_components, random_state=1)
    # nmf = NMF(n_components=n_topics, random_state=1, alpha=.1, l1_ratio=.5)
    nmf.fit(tfidf)
    W = nmf.transform(tfidf)

    if print_output:
        print("\nTopics in NMF model:")
        tfidf_feature_names = tfidf_vectorizer.get_feature_names()
        print_top_words(nmf, tfidf_feature_names, n_top_words)
    return tfidf, nmf, W