def plot_perplexity_iter(A_tfidf, num_topics):
    print "computing perplexity vs iter..."
    max_iter = 5
    perplexity = []
    em_iter = []
    for sweep in range(1, max_iter+1):
        lda = LatentDirichletAllocation(n_topics=num_topics, max_iter=sweep,
                                        learning_method='online', batch_size=512,
                                        random_state=0, n_jobs=-1)
        tic = time()
        lda.fit(A_tfidf)  # online VB
        toc = time()
        print "sweep %d, elapsed time: %.4f sec" % (sweep, toc - tic)
        perplexity.append(lda.perplexity(A_tfidf))
        em_iter.append(lda.n_batch_iter_)
    #end
    np.save('./data/perplexity_iter.npy', perplexity)

    f = plt.figure()
    plt.plot(em_iter, perplexity, color='b', marker='o', lw=2.0, label='perplexity')
    plt.title('Perplexity (LDA, online VB)')
    plt.xlabel('EM iter')
    plt.ylabel('Perplexity')
    plt.grid(True)
    plt.legend()
    plt.show()
    f.savefig('./figures/perplexity_iter.png')
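Note that many of these snippets target older scikit-learn releases: the `n_topics` constructor argument of `LatentDirichletAllocation` was renamed to `n_components` (the old name was deprecated and later removed), and the vectorizers' `get_feature_names()` has since been superseded by `get_feature_names_out()`. A minimal compatibility sketch for a recent scikit-learn, reusing the variable names from the function above:

# Sketch assuming a current scikit-learn release: use n_components
# instead of the removed n_topics argument.
from sklearn.decomposition import LatentDirichletAllocation

lda = LatentDirichletAllocation(n_components=num_topics,   # formerly n_topics
                                max_iter=5,
                                learning_method='online',
                                batch_size=512,
                                random_state=0,
                                n_jobs=-1)
lda.fit(A_tfidf)  # A_tfidf: the document-term matrix used above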
def plot_perplexity_batch(A_tfidf, num_docs): print "computing perplexity vs batch size..." max_iter = 5 num_topics = 10 batch_size = np.logspace(6, 10, 5, base=2).astype(int) perplexity = np.zeros((len(batch_size),max_iter)) em_iter = np.zeros((len(batch_size),max_iter)) for ii, mini_batch in enumerate(batch_size): for jj, sweep in enumerate(range(1,max_iter+1)): lda = LatentDirichletAllocation(n_topics = num_topics, max_iter=sweep, learning_method='online', batch_size = mini_batch, random_state=0, n_jobs=-1) tic = time() lda.fit(A_tfidf) #online VB toc = time() print "sweep %d, elapsed time: %.4f sec" %(sweep, toc - tic) perplexity[ii,jj] = lda.perplexity(A_tfidf) em_iter[ii,jj] = lda.n_batch_iter_ #end #end np.save('./data/perplexity.npy', perplexity) np.save('./data/em_iter.npy', em_iter) f = plt.figure() for mb in range(len(batch_size)): plt.plot(em_iter[mb,:], perplexity[mb,:], color=np.random.rand(3,), marker='o', lw=2.0, label='mini_batch: '+str(batch_size[mb])) plt.title('Perplexity (LDA, online VB)') plt.xlabel('EM iter') plt.ylabel('Perplexity') plt.grid(True) plt.legend() plt.show() f.savefig('./figures/perplexity_batch.png')
def plot_perplexity_topics(A_tfidf): print "computing perplexity vs K..." max_iter = 5 #based on plot_perplexity_iter() #num_topics = np.linspace(2,20,5).astype(np.int) num_topics = np.logspace(1,2,5).astype(np.int) perplexity = [] em_iter = [] for k in num_topics: lda = LatentDirichletAllocation(n_topics = k, max_iter=max_iter, learning_method='online', batch_size = 512, random_state=0, n_jobs=-1) tic = time() lda.fit(A_tfidf) #online VB toc = time() print "K= %d, elapsed time: %.4f sec" %(k, toc - tic) perplexity.append(lda.perplexity(A_tfidf)) em_iter.append(lda.n_batch_iter_) #end np.save('./data/perplexity_topics.npy', perplexity) np.save('./data/perplexity_topics2.npy', num_topics) f = plt.figure() plt.plot(num_topics, perplexity, color='b', marker='o', lw=2.0, label='perplexity') plt.title('Perplexity (LDA, online VB)') plt.xlabel('Number of Topics, K') plt.ylabel('Perplexity') plt.grid(True) plt.legend() plt.show() f.savefig('./figures/perplexity_topics.png')
def extractTopicLDA(func_message_dic, store_column):
    if len(func_message_dic) == 0:
        print "func_message_dic is null"
        return False
    try:
        conn = MySQLdb.connect(host='192.168.162.122', user='******', passwd='123456', port=3306)
        cur = conn.cursor()
        cur.execute('set names utf8mb4')
        conn.select_db('codeAnalysis')
        for function in func_message_dic:
            message = func_message_dic[function]
            np_extractor = nlp.semantics_extraction.NPExtractor(message)
            text = np_extractor.extract()
            if len(text) == 0:
                continue
            tf_vectorizer = CountVectorizer(max_df=1.0, min_df=1,
                                            max_features=n_features,
                                            stop_words='english')
            tf = tf_vectorizer.fit_transform(text)
            print("Fitting LDA models with tf features, n_samples=%d and n_features=%d..."
                  % (n_samples, n_features))
            lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                            learning_method='online',
                                            learning_offset=50., random_state=0)
            lda.fit(tf)
            tf_feature_names = tf_vectorizer.get_feature_names()
            separator = " "
            for topic_idx, topic in enumerate(lda.components_):
                keywords = separator.join([tf_feature_names[i]
                                           for i in topic.argsort()[:-n_top_words - 1:-1]])
                sql = "update func_semantic set " + store_column + " = '" + keywords + "' where func_name = '" + function + "'"
                print sql
                cur.execute(sql)
        conn.commit()
        cur.close()
        conn.close()
        return True
    except MySQLdb.Error as e:
        print e
        raise
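Building the UPDATE statement by string concatenation is fragile (a quote inside `keywords` or `function` breaks the statement) and open to SQL injection. A safer sketch, assuming the same MySQLdb cursor and the same table and column names as above, passes the values as query parameters:

# Sketch: let the driver escape the values instead of concatenating them.
# The column name still has to be interpolated (identifiers cannot be bound),
# so store_column should come from a trusted whitelist, not user input.
sql = "UPDATE func_semantic SET " + store_column + " = %s WHERE func_name = %s"
cur.execute(sql, (keywords, function))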
def lda_tuner(ingroup_otu, best_models): best_score = -1*np.inf dtp_series = [0.0001, 0.001, 0.01, 0.1, 0.2] twp_series = [0.0001, 0.001, 0.01, 0.1, 0.2] topic_series = [3] X = ingroup_otu.values eval_counter = 0 for topics in topic_series: for dtp in dtp_series: for twp in twp_series: eval_counter +=1 X_train, X_test = train_test_split(X, test_size=0.5) lda = LatentDirichletAllocation(n_topics=topics, doc_topic_prior=dtp, topic_word_prior=twp, learning_method='batch', random_state=42, max_iter=20) lda.fit(X_train) this_score = lda.score(X_test) this_perplexity = lda.perplexity(X_test) if this_score > best_score: best_score = this_score print "New Max Likelihood: {}".format(best_score) print "#{}: n:{}, dtp:{}, twp:{}, score:{}, perp:{}".format(eval_counter, topics, dtp, twp, this_score, this_perplexity) best_models.append({'n': topics, 'dtp': dtp, 'twp': twp, 'score': this_score, 'perp': this_perplexity}) if (dtp == dtp_series[-1]) and (twp == twp_series[-1]): eval_counter +=1 X_train, X_test = train_test_split(X, test_size=0.5) lda = LatentDirichletAllocation(n_topics=topics, doc_topic_prior=1./topics, topic_word_prior=1./topics, learning_method='batch', random_state=42, max_iter=20) lda.fit(X_train) this_score = lda.score(X_test) this_perplexity = lda.perplexity(X_test) if this_score > best_score: best_score = this_score print "New Max Likelihood: {}".format(best_score) print "#{}: n:{}, dtp:{}, twp:{}, score:{} perp: {}".format(eval_counter, topics, (1./topics), (1./topics), this_score, this_perplexity) best_models.append({'n': topics, 'dtp': (1./topics), 'twp': (1./topics), 'score': this_score, 'perp': this_perplexity}) return best_models
def fit_lda(tf):
    '''takes in a tf sparse vector and finds the top topics'''
    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=0)
    lda.fit(tf)
    tf_feature_names = tf_vectorizer.get_feature_names()
    lda_topic_dict = print_top_words(lda, tf_feature_names, n_top_words)
    return lda, lda_topic_dict
def LDA(tf, word):
    lda = LatentDirichletAllocation(n_topics=30, max_iter=5,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=0)
    lda.fit(tf)
    print_top_words(lda, word, 20)
def applyLDA2(self, number_of_clusters, country_specific_tweets): train, feature_names = self.extractFeatures(country_specific_tweets,False) name = "lda" if self.results: print("Fitting LDA model with tfidf", end= " - ") t0 = time() lda = LatentDirichletAllocation(n_topics=number_of_clusters, max_iter=5, learning_method='online', learning_offset=50., random_state=0) lda.fit(train) if self.results: print("done in %0.3fs." % (time() - t0)) parameters = lda.get_params() topics = lda.components_ doc_topic = lda.transform(train) top10, labels = self.printTopicCluster(topics, doc_topic, feature_names) labels = numpy.asarray(labels) if self.results: print("Silhouette Coefficient {0}: {1}".format(name, metrics.silhouette_score(train, labels))) return name, parameters, top10, labels
def topicmodel(comments):
    _texts = []
    texts = []
    for c in comments:
        c = c['text']
        _texts.append(c)
        texts.append(c)

    tf_vectorizer = CountVectorizer(max_df=.20, min_df=10, stop_words=stopwords)
    texts = tf_vectorizer.fit_transform(texts)

    ## test topic counts between 2 and 9
    topics = {}
    for k in range(2, 10):
        print "Testing", k
        model = LatentDirichletAllocation(n_topics=k, max_iter=5,
                                          learning_method='batch',
                                          learning_offset=50.,
                                          random_state=0)
        model.fit(texts)
        ll = model.score(texts)
        topics[ll] = model
    topic = max(topics.keys())

    ret = collections.defaultdict(list)
    ## ugly, rewrite some day
    model = topics[topic]

    ## for debugging, print the chosen model's topics
    feature_names = tf_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print "Topic #%d:" % topic_idx
        print " ".join([feature_names[i].encode('utf8')
                        for i in topic.argsort()[:-5 - 1:-1]])
        print

    for i, topic in enumerate(model.transform(texts)):
        topic = numpy.argmax(topic)
        text = _texts[i].encode('utf8')
        ret[topic].append(text)

    return ret
class LDATopics: # Constructor def __init__(self, filename): # Member variables self.email_data = [] self.lda = None self.feature_names = None self.num_topics = NUM_TOPICS self.num_words_per_topic = NUM_WORDS_PER_TOPIC self.num_features = NUM_FEATURES # Load emails from full path to file emails = EmailLoader(filename).get_email_dict_array() # Process emails into a list of email body contents for email_rec in emails: if email_rec['body']: # Clean the text and add to list cleaner = TextCleaner(email_rec['body']) self.email_data.append(" ".join(cleaner.tokenize_str())) ## Public methods ## def process(self, topics=None, features=None): # Check if default numbers should be used if topics is None: topics = self.num_topics if features is None: features = self.num_features # Calculate term frequency for LDA tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=features, stop_words='english') tf = tf_vectorizer.fit_transform(self.email_data) # Fit the LDA model to data samples self.lda = LatentDirichletAllocation(n_topics=topics, max_iter=5, learning_method='online', learning_offset=50., random_state=0) self.lda.fit(tf) # Set the feature name (words) self.feature_names = tf_vectorizer.get_feature_names() def print_topics(self, words_per_topic=None): # Check if default number of words per topics should be used if words_per_topic is None: words_per_topic = self.num_words_per_topic self._print_topics(self.lda, self.feature_names, words_per_topic) ## Private methods ## def _print_topics(self, model, feature_names, words_per_topic): for topic_idx, topic in enumerate(model.components_): print("Topic #%d:" % topic_idx) print(" ".join([feature_names[i] for i in topic.argsort()[:-words_per_topic - 1:-1]])) print()
def perform_analysis(self, stocks, szTimeAxis, n_ahead): # load Snowball comment data from agares.datasource.snowball_cmt_loader import SnowballCmtLoader SBLoader = SnowballCmtLoader() date = self.dt_start.date() df_cmt_list = [] while date <= self.dt_end.date(): df_cmt_list.append(SBLoader.load(str(date))) date += timedelta(days=1) df_cmt = pd.concat(df_cmt_list, ignore_index=True) # Chinese text segmentation self.set_jieba() df_cmt['RawComment'] = df_cmt['RawComment'].map(jieba.cut) # drop stopwords self.stopwords = [line.strip() for line in open('stopwords').readlines()] self.stopwords.append(' ') df_cmt['RawComment'] = df_cmt['RawComment'].map(self.drop_useless_word) cmt = df_cmt['RawComment'].tolist() # construct tfidf matrix tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_df=0.95, min_df=0.05) tfidf = tfidf_vectorizer.fit_transform(cmt) # Fit the NMF model n_topics = 5 n_top_words = 20 print("Fitting the NMF model with tf-idf features..") t0 = time() nmf = NMF(n_components=n_topics, random_state=1, alpha=.1, l1_ratio=.5).fit(tfidf) print("done in %0.3fs." % (time() - t0)) print("\nTopics in NMF model:") tfidf_feature_names = tfidf_vectorizer.get_feature_names() self.print_top_words(nmf, tfidf_feature_names, n_top_words) # Fit the LDA model print("Fitting LDA models with tf-idf features..") lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=10, learning_method='online', learning_offset=50., random_state=0) t0 = time() lda.fit(tfidf) print("done in %0.3fs." % (time() - t0)) print("\nTopics in LDA model:") self.print_top_words(lda, tfidf_feature_names, n_top_words) # load sz daily candlestick data sz = next(iter(stocks)) cst_Day = stocks[sz].cst['1Day'] # print close price within the timescope date = self.dt_start print() print("The ShangHai stock Index (close index) within the timescope") while date <= self.dt_end: ts = pd.to_datetime(date) try: print("Date: {0:s}, Index: {1:.2f}".format(str(date.date()), cst_Day.at[ts, 'close'])) except KeyError: # sz candlestick data does not exist at this datetime print("Date: {0:s}, Index: (market closed)".format(str(date.date()))) date += timedelta(days=1)
def LDA(matrix, preserve, n_topics=100):
    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=10,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=randint(1, 100))
    lda.fit(matrix[preserve])
    topic_model = lda.transform(matrix)
    return topic_model
def get_lda():
    lda = LatentDirichletAllocation(n_topics=K, max_iter=5,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=0)
    lda.fit(X)
    tf_feature_names = VECTORIZER.get_feature_names()
    print_top_words(lda, tf_feature_names, 10)
    return lda
def calculate_lda(self, tfidf):
    print("Fitting LDA models with tf features...")
    lda = LatentDirichletAllocation(n_topics=self.num_topics, max_iter=5,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=0)
    t0 = time()
    lda.fit(tfidf)
    print("Topics in LDA model:")
    print_top_words(lda, self.tfidf_feature_names, self.num_words)
    print("done in %0.3fs." % (time() - t0))
def test_perplexity_input_format():
    # Test LDA perplexity for sparse and dense input
    # score should be the same for both dense and sparse input
    n_components, X = _build_sparse_mtx()
    lda = LatentDirichletAllocation(n_components=n_components, max_iter=1,
                                    learning_method='batch',
                                    total_samples=100, random_state=0)
    lda.fit(X)
    perp_1 = lda.perplexity(X)
    perp_2 = lda.perplexity(X.toarray())
    assert_almost_equal(perp_1, perp_2)
def test_lda_score_perplexity():
    # Test the relationship between LDA score and perplexity
    n_components, X = _build_sparse_mtx()
    lda = LatentDirichletAllocation(n_components=n_components, max_iter=10,
                                    random_state=0)
    lda.fit(X)
    perplexity_1 = lda.perplexity(X, sub_sampling=False)
    score = lda.score(X)
    perplexity_2 = np.exp(-1. * (score / np.sum(X.data)))
    assert_almost_equal(perplexity_1, perplexity_2)
def test_lda_dense_input(): # Test LDA with dense input. rng = np.random.RandomState(0) n_components, X = _build_sparse_mtx() lda = LatentDirichletAllocation(n_components=n_components, learning_method='batch', random_state=rng) lda.fit(X.toarray()) correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)] for component in lda.components_: # Find top 3 words in each LDA component top_idx = set(component.argsort()[-3:][::-1]) assert_true(tuple(sorted(top_idx)) in correct_idx_grps)
def test_lda_multi_jobs(): # Test LDA batch training with multi CPU for method in ('online', 'batch'): rng = np.random.RandomState(0) n_topics, X = _build_sparse_mtx() lda = LatentDirichletAllocation(n_topics=n_topics, n_jobs=3, learning_method=method, random_state=rng) lda.fit(X) correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)] for c in lda.components_: top_idx = set(c.argsort()[-3:][::-1]) assert_true(tuple(sorted(top_idx)) in correct_idx_grps)
def test_lda_fit_batch(): # Test LDA batch learning_offset (`fit` method with 'batch' learning) rng = np.random.RandomState(0) n_topics, X = _build_sparse_mtx() lda = LatentDirichletAllocation(n_topics=n_topics, evaluate_every=1, learning_method='batch', random_state=rng) lda.fit(X) correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)] for component in lda.components_: # Find top 3 words in each LDA component top_idx = set(component.argsort()[-3:][::-1]) assert_true(tuple(sorted(top_idx)) in correct_idx_grps)
def LDA_feature_extraction(text_lst, n_samples, n_features, n_topics, n_top_words): print "Extracting tf features for LDA..." tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=n_features, stop_words='english') tf = tf_vectorizer.fit_transform(text_lst) print "Fitting LDA models with tf features, n_samples=%d and n_features=%d..." % (n_samples, n_features) lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5, learning_method='online', learning_offset=50., random_state=0) lda.fit(tf) print "\nTopics in LDA model:" tf_feature_names = tf_vectorizer.get_feature_names() print_top_words(lda, tf_feature_names, n_top_words) print "*************end LDA****************"
def test_lda_multi_jobs(method): n_components, X = _build_sparse_mtx() # Test LDA batch training with multi CPU rng = np.random.RandomState(0) lda = LatentDirichletAllocation(n_components=n_components, n_jobs=2, learning_method=method, evaluate_every=1, random_state=rng) lda.fit(X) correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)] for c in lda.components_: top_idx = set(c.argsort()[-3:][::-1]) assert_true(tuple(sorted(top_idx)) in correct_idx_grps)
def main(): with codecs.open("wallsDB.txt","r",encoding='utf-8') as f: walls = f.read().split("\t\n\t") vectorizer = CountVectorizer(max_df=0.95, min_df=2) F = vectorizer.fit_transform(walls) vocab = vectorizer.vocabulary_ lda = LatentDirichletAllocation(n_topics=1000, max_iter=10, learning_method='online', learning_offset=30., random_state=777) lda.fit(F) save_obj(lda, "Phi") save_obj(vocab, "vocab")
def lauch_lda(featured, n_topics=10, n_top_words=20): """ Latent Dirichlet Allocation with online variational Bayes algorithm """ # Use tf (raw term count) features for LDA. print "extracting tf features for LDA..." tf_vectorizer = CountVectorizer(preprocessor=custom_preprocessor, max_df=0.95, min_df=2) # max_features=n_features, stop_words='english') tf = tf_vectorizer.fit_transform(featured) lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5, learning_method='online', learning_offset=50., random_state=0) lda.fit(tf) tf_feature_names = tf_vectorizer.get_feature_names() print_top_words(lda, tf_feature_names) return load_top_words(lda, tf_feature_names, n_top_words)
def test_lda_preplexity_mismatch():
    # test dimension mismatch in `perplexity` method
    rng = np.random.RandomState(0)
    n_topics = rng.randint(3, 6)
    n_samples = rng.randint(6, 10)
    X = np.random.randint(4, size=(n_samples, 10))
    lda = LatentDirichletAllocation(n_topics=n_topics, learning_offset=5.,
                                    total_samples=20, random_state=rng)
    lda.fit(X)
    # invalid samples
    invalid_n_samples = rng.randint(4, size=(n_samples + 1, n_topics))
    assert_raises_regexp(ValueError, r'Number of samples',
                         lda.perplexity, X, invalid_n_samples)
    # invalid topic number
    invalid_n_topics = rng.randint(4, size=(n_samples, n_topics + 1))
    assert_raises_regexp(ValueError, r'Number of topics',
                         lda.perplexity, X, invalid_n_topics)
class LDA(): def __init__(self, args=None, from_file=None): # Initialize LDA model from either arguments or a file. If both are # provided, file will be used. assert args or from_file, 'Improper initialization of LDA model' if from_file is not None: with open(from_file, 'rb') as f: self.model, self.vectorizer = pickle.load(f, encoding='latin1') else: # training for the first time self.vectorizer = TfidfVectorizer(lowercase=False, token_pattern=u'[^;]+') self.alpha = args.alpha self.beta = args.beta self.ntopics = args.ntopics self.model = None def top_words(self, n): features = self.vectorizer.get_feature_names() words = [OrderedDict([(features[i], topic[i]) for i in topic.argsort()[:-n - 1:-1]]) for topic in self.model.components_] return words def train(self, docs): data = [';'.join(bow) for bow in docs] vect = self.vectorizer.fit_transform(data) self.alpha = self.alpha if self.alpha is not None else 50./self.ntopics self.beta = self.beta if self.beta is not None else 200./len(self.vectorizer.vocabulary_) print('{} words in vocabulary'.format(len(self.vectorizer.vocabulary_))) print('Training LDA with {} topics, {} alpha, {} beta'.format(self.ntopics, self.alpha, self.beta)) self.model = LatentDirichletAllocation(self.ntopics, doc_topic_prior=self.alpha, topic_word_prior=self.beta, learning_method='batch', max_iter=100, verbose=1, evaluate_every=1, max_doc_update_iter=100, mean_change_tol=1e-5) self.model.fit(vect) # normalizing does not change subsequent inference, provided no further training is done self.model.components_ /= self.model.components_.sum(axis=1)[:, np.newaxis] def infer(self, docs): data = [';'.join(bow) for bow in docs] vect = self.vectorizer.transform(data) dist = self.model.transform(vect) assert vect.shape[0] == dist.shape[0] # NOTE: if a document is empty, this method returns a zero topic-dist vector samples = [list(doc_topic_dist) if m.nnz > 0 else ([0.] * self.model.n_components) for m, doc_topic_dist in zip(vect, dist)] return samples
def test_lda_fit_perplexity(): # Test that the perplexity computed during fit is consistent with what is # returned by the perplexity method n_components, X = _build_sparse_mtx() lda = LatentDirichletAllocation(n_components=n_components, max_iter=1, learning_method='batch', random_state=0, evaluate_every=1) lda.fit(X) # Perplexity computed at end of fit method perplexity1 = lda.bound_ # Result of perplexity method on the train set perplexity2 = lda.perplexity(X) assert_almost_equal(perplexity1, perplexity2)
def run(self): # Use tf-idf features for NMF. with self.input().open('r') as f: data = json.loads(f.read()) data_samples = data['data'] print("Extracting tf-idf features for NMF...") tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, #max_features=n_features, stop_words='english') t0 = time() tfidf = tfidf_vectorizer.fit_transform(data_samples) print("done in %0.3fs." % (time() - t0)) # Use tf (raw term count) features for LDA. print("Extracting tf features for LDA...") tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=n_features, stop_words='english') t0 = time() tf = tf_vectorizer.fit_transform(data_samples) print("done in %0.3fs." % (time() - t0)) # Fit the NMF model print("Fitting the NMF model with tf-idf features," "n_samples=%d and n_features=%d..." % (n_samples, n_features)) t0 = time() nmf = NMF(n_components=n_topics, random_state=1, alpha=.1, l1_ratio=.5).fit(tfidf) #exit() print("done in %0.3fs." % (time() - t0)) print("\nTopics in NMF model:") tfidf_feature_names = tfidf_vectorizer.get_feature_names() tw = get_top_words(nmf, tfidf_feature_names, n_top_words) print(tw) print("Fitting LDA models with tf features, n_samples=%d and n_features=%d..." % (n_samples, n_features)) lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5, learning_method='online', learning_offset=50., random_state=0) t0 = time() lda.fit(tf) print("done in %0.3fs." % (time() - t0)) print("\nTopics in LDA model:") tf_feature_names = tf_vectorizer.get_feature_names() tw = get_top_words(lda, tf_feature_names, n_top_words) with self.output().open('w') as out_f: out_f.write(json.dumps(tw))
def topicExtractionLDA(): output = open("../../result/topic_extraction", "wr") conn= MySQLdb.connect(host='localhost', port = 3306, user='******', passwd='wangyu', db ='vccfinder') cur = conn.cursor() sql = "select cluster from commit_cluster_600 group by cluster" cur.execute(sql) result = cur.fetchall() clusterids = [] if None != result: for item in result: clusterids.append(item[0]) print("finish get cluster ids...") for clusterid in clusterids: text = [] sql = "select message from commits, commit_cluster_600 where commits.id = commit_cluster_600.original_id and cluster = " + str(clusterid) cur.execute(sql) result = cur.fetchall() print("finish get messages...") if None != result: output.writelines("\n====================start " + str(clusterid) + "====================") for message in result: setence = message[0].replace("\n", "").replace("_", " ").replace("---", " ") filtered_setence = "" words = setence.split() for word in words: word = filter(str.isalnum, str(word)) if word != "": filtered_setence += word + " " filtered_setence = filtered_setence.rstrip() filtered_setence += "." #print(filtered_setence) text.append(filtered_setence) print("finish build text array... then extracting tf features for LDA...") tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=n_features, stop_words='english') tf = tf_vectorizer.fit_transform(text) print("Fitting LDA models with tf features, n_samples=%d and n_features=%d..." % (n_samples, n_features)) lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5, learning_method='online', learning_offset=50., random_state=0) lda.fit(tf) tf_feature_names = tf_vectorizer.get_feature_names() print_top_words(lda, tf_feature_names, n_top_words, output) output.close() cur.close() conn.commit() conn.close()
def calculate_lda_for_chinese_restaurants(): print 'Calculating LDA...' n_features = 1000 n_topics = 10 n_top_words = 5 t0 = time() chinese_reviews = get_chinese_restaurants_reviews(get_chinese_restaurants()) print("done in %0.3fs." % (time() - t0)) # Use tf-idf features for Non-negative matrix factorization. print("Extracting tf-idf features for NMF...") tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=n_features, stop_words='english') t0 = time() tfidf = tfidf_vectorizer.fit_transform(chinese_reviews) print("done in %0.3fs." % (time() - t0)) # Use tf (raw term count) features for LDA. print("Extracting tf features for LDA...") tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=n_features, stop_words='english') t0 = time() tf = tf_vectorizer.fit_transform(chinese_reviews) print("done in %0.3fs." % (time() - t0)) # Fit LDA model for tf features print("Fitting LDA models with tf features, " "n_features=%d..." % n_features) lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5, learning_method='online', learning_offset=50., random_state=0) t0 = time() lda.fit(tf) print("done in %0.3fs." % (time() - t0)) print("\nTopics in LDA model:") tf_feature_names = tf_vectorizer.get_feature_names() print_top_words(lda, tf_feature_names, n_top_words)
def find_topics(df_train, df_test, n_topics): #http://scikit-learn.org/stable/auto_examples/applications/topics_extraction_with_nmf_lda.html from sklearn.feature_extraction.text import CountVectorizer from sklearn.decomposition import LatentDirichletAllocation # Use tf (raw term count) features for LDA. print("Extracting character frequency features for topic modeling...") #Need to create a dtm with combined (train/test) vocabulary in columns n_train = df_train.shape[0] df_combined = df_train.copy(deep = True).append(df_test.copy(deep = True)) vectorizer = CountVectorizer(decode_error = 'strict', analyzer = 'char') corpus_combined = df_combined.loc[:,'text_read'] dtm_combined = vectorizer.fit_transform(corpus_combined) #split the train and test data again to ensure we only use test set for #supervised cross-validated learning dtm_train = dtm_combined[:n_train,:] dtm_test = dtm_combined[n_train:,:] print("Fitting LDA models with character frequency features...") #This requires sklearn.__version__ to be 0.17.X or greater lda = LatentDirichletAllocation(n_topics=n_topics, learning_method='batch', random_state=0) #fit to the training document term matrix lda.fit(dtm_train) #create topic 'names' and columns in dataframe topic_names = [] for i in range(0, n_topics): name = 't' + str(i+1) topic_names.append(name) df_train.loc[:, name] = 0.0 df_test.loc[:, name] = 0.0 df_train.loc[:, topic_names] = lda.transform(dtm_train) df_test.loc[:, topic_names] = lda.transform(dtm_test) #normalize these topic features df_train = normalize_features(df_train, topic_names) df_test = normalize_features(df_test, topic_names) return df_train
def extract(infile, outfile, dict_keys, stem=False, lemma=False, element="narrative", arg_rebalance=""): train = False narratives = [] keywords = [] # Get the xml from file root = etree.parse(infile).getroot() if dict_keys == None: train = True # Set up the keys for the feature vector dict_keys = ["MG_ID", labelname] if checklist in featurenames: dict_keys = dict_keys + ["CL_DeathAge", "CL_ageunit", "CL_DeceasedSex", "CL_Occupation", "CL_Marital", "CL_Hypertension", "CL_Heart", "CL_Stroke", "CL_Diabetes", "CL_TB", "CL_HIV", "CL_Cancer", "CL_Asthma","CL_InjuryHistory", "CL_SmokeD", "CL_AlcoholD", "CL_ApplytobaccoD"] elif dem in featurenames: dict_keys = dict_keys + ["CL_DeathAge", "CL_DeceasedSex"] print "dict_keys: " + str(dict_keys) #keywords = set([]) #narrwords = set([]) print "train: " + str(train) print "stem: " + str(stem) print "lemma: " + str(lemma) # Extract features matrix = [] for child in root: features = {} if rec_type in featurenames: features["CL_" + rec_type] = child.tag # CHECKLIST features for key in dict_keys: if key[0:3] == "CL_": key = key[3:] item = child.find(key) value = "0" if item != None: value = item.text if key == "AlcoholD" or key == "ApplytobaccoD": if value == 'N': value = 9 features[key] = value #print "-- value: " + value #if key == "MG_ID": # print "extracting features from: " + value # KEYWORD features if kw_features: keyword_string = get_keywords(child) # Remove punctuation and trailing spaces from keywords words = [s.strip().translate(string.maketrans("",""), string.punctuation) for s in keyword_string.split(',')] # Split keyword phrases into individual words for word in words: w = word.split(' ') words.remove(word) for wx in w: words.append(wx.strip().strip('–')) keywords.append(" ".join(words)) # NARRATIVE features if narr_features or ((not train) and (symp_train in featurenames)): narr_string = "" item = child.find(element) if item != None: if item.text != None: narr_string = item.text.encode("utf-8") else: print "warning: empty narrative" narr_words = [w.strip() for w in narr_string.lower().translate(string.maketrans("",""), string.punctuation).split(' ')] text = " ".join(narr_words) if stem: narr_string = preprocessing.stem(text) elif lemma: narr_string = preprocessing.lemmatize(text) narratives.append(narr_string.strip().lower()) #print "Adding narr: " + narr_string.lower() # SYMPTOM features elif train and (symp_train in featurenames): narr_string = "" item = child.find("narrative_symptoms") if item != None: item_text = item.text if item_text != None and len(item_text) > 0: narr_string = item.text.encode("utf-8") #narr_words = [w.strip() for w in narr_string.lower().translate(string.maketrans("",""), string.punctuation).split(' ')] narratives.append(narr_string.lower()) print "Adding symp_narr: " + narr_string.lower() # Save features matrix.append(features) # Construct the feature matrix # COUNT or TFIDF features if narr_count in featurenames or kw_count in featurenames or narr_tfidf in featurenames or kw_tfidf in featurenames or lda in featurenames or symp_train in featurenames: documents = [] if narr_count in featurenames or narr_tfidf in featurenames or lda in featurenames or symp_train in featurenames: documents = narratives print "narratives: " + str(len(narratives)) elif kw_count in featurenames or kw_tfidf in featurenames: documents = keywords print "keywords: " + str(len(keywords)) # Create count matrix global count_vectorizer if train: print "training count_vectorizer" count_vectorizer = 
sklearn.feature_extraction.text.CountVectorizer(ngram_range=(min_ngram,max_ngram),stop_words=stopwords) count_vectorizer.fit(documents) dict_keys = dict_keys + count_vectorizer.get_feature_names() print "transforming data with count_vectorizer" count_matrix = count_vectorizer.transform(documents) matrix_keys = count_vectorizer.get_feature_names() print "writing count matrix to file" out_matrix = open(infile + ".countmatrix", "w") out_matrix.write(str(count_matrix)) out_matrix.close() # Add count features to the dictionary for x in range(len(matrix)): feat = matrix[x] for i in range(len(matrix_keys)): key = matrix_keys[i] val = count_matrix[x,i] feat[key] = val # Convert counts to TFIDF if (narr_tfidf in featurenames) or (kw_tfidf in featurenames): print "converting to tfidf..." print "matrix_keys: " + str(len(matrix_keys)) # Use the training count matrix for fitting if train: global tfidfTransformer tfidfTransformer = sklearn.feature_extraction.text.TfidfTransformer() tfidfTransformer.fit(count_matrix) # Convert matrix to tfidf tfidf_matrix = tfidfTransformer.transform(count_matrix) print "count_matrix: " + str(count_matrix.shape) print "tfidf_matrix: " + str(tfidf_matrix.shape) # Replace features in matrix with tfidf for x in range(len(matrix)): feat = matrix[x] #values = tfidf_matrix[x,0:] #print "values: " + str(values.shape[0]) for i in range(len(matrix_keys)): key = matrix_keys[i] val = tfidf_matrix[x,i] feat[key] = val # LDA topic modeling features if lda in featurenames: global ldaModel if train: ldaModel = LatentDirichletAllocation(n_topics=num_topics) ldaModel.fit(count_matrix) lda_matrix = ldaModel.transform(count_matrix) for t in range(0,num_topics): dict_keys.append("lda_topic_" + str(t)) for x in range(len(matrix)): for y in range(len(lda_matrix[x])): val = lda_matrix[x][y] matrix[x]["lda_topic_" + str(y)] = val # TODO: Print LDA topics # WORD2VEC features elif narr_vec in featurenames: print "Warning: using word2vec features, ignoring all other features" # Create word2vec mapping word2vec, dim = load_word2vec(vecfile) # Convert words to vectors and add to matrix dict_keys.append(narr_vec) global max_seq_len max_seq_len = 200 #if train: #max_seq_len = 0 print "word2vec dim: " + str(dim) print "initial max_seq_len: " + str(max_seq_len) zero_vec = [] for z in range(0, dim): zero_vec.append(0) for x in range(len(matrix)): narr = narratives[x] #print "narr: " + narr vectors = [] vec = zero_vec for word in narr.split(' '): if len(word) > 0: #if word == "didnt": # word = "didn't" if word in word2vec: vec = word2vec[word] vectors.append(vec) length = len(vectors) if length > max_seq_len: #if train: # max_seq_len = length vectors = vectors[(-1*max_seq_len):] (matrix[x])[narr_vec] = vectors # Pad the narr_vecs with 0 vectors print "padding vectors to reach maxlen " + str(max_seq_len) for x in range(len(matrix)): length = len(matrix[x][narr_vec]) matrix[x]['max_seq_len'] = max_seq_len if length < max_seq_len: for k in range(0, max_seq_len-length): matrix[x][narr_vec].insert(0,zero_vec) # use insert for pre-padding # narr_seq for RNN elif narr_seq in featurenames: global vocab_size, max_seq_len if train: dict_keys.append(narr_seq) dict_keys.append('vocab_size') dict_keys.append('max_seq_len') vocab = set() for narr in narratives: words = narr.split(' ') for word in words: vocab.add(word) vocab_size = len(vocab) max_seq_len = 0 sequences = [] # Convert text into integer sequences for x in range(len(matrix)): narr = narratives[x] seq = hashing_trick(narr, vocab_size, hash_function='md5', 
filters='\t\n', lower=True, split=' ') if len(seq) > max_seq_len: max_seq_len = len(seq) sequences.append(seq) # Pad the sequences sequences = pad_sequences(sequences, maxlen=max_seq_len, dtype='int32', padding='pre') for x in range(len(matrix)): matrix[x]['narr_seq'] = sequences[x] matrix[x]['vocab_size'] = vocab_size matrix[x]['max_seq_len'] = max_seq_len #if arg_rebalance != "": # matrix_re = rebalance_data(matrix, dict_keys, arg_rebalance) # write_to_file(matrix_re, dict_keys, outfile) #else: data_util.write_to_file(matrix, dict_keys, outfile)
tf_truthful = tf_vectorizer_truthful.fit_transform(pos_truthful + neg_truthful) tf_vectorizer_deceptive = CountVectorizer(max_df=0.95, min_df=2, stop_words='english') tf_deceptive = tf_vectorizer_deceptive.fit_transform(pos_deceptive + neg_deceptive) print("Fitting LDA models with tf features, " "n_samples=%d and n_features=%d..." % (n_samples, n_features)) lda_neg_review = LatentDirichletAllocation(n_components=n_components, max_iter=20, learning_method='online', learning_offset=50., random_state=0) lda_neg_review.fit(tf_pos_review) # print("\nTopics in LDA model of negative deceptive:") tf_feature_names = tf_vectorizer_pos_review.get_feature_names() top_word_list = print_top_words(lda_neg_review, tf_feature_names, n_top_words) ''' Start ''' lda_neg_deceptive = LatentDirichletAllocation(n_components=n_components, max_iter=20, learning_method='online', learning_offset=50., random_state=0) lda_neg_deceptive.fit(tf_pos_deceptive) print("\nTopics in LDA model of negative deceptive:") tf_feature_names = tf_vectorizer_pos_deceptive.get_feature_names()
# Latent Dirichlet Allocation (LDA) models each document as a mixture of topics and
# each topic as a distribution over words; fitting the model means inferring the
# topic mixtures that best explain the observed word counts.
# Intuitively, LDA arrives at each topic (a bag of words) through three steps:
# Step 1: Initialize k topics and randomly assign each word in each document to one of the k topics.
# Step 2: Re-assign a word to a new topic based on (a) how prevalent that topic already is
#         in the document, and (b) how prevalent that word is in the topic across all documents.
# Step 3: Repeat step 2 until coherent topics result.
from sklearn.decomposition import LatentDirichletAllocation

# continuing with the 20 newsgroups dataset and 3 topics
total_topics = 3
lda = LatentDirichletAllocation(n_topics=total_topics, max_iter=100,
                                learning_method='online',
                                learning_offset=50., random_state=2017)
lda.fit(X)
feature_names = np.array(vectorizer.get_feature_names())
for topic_idx, topic in enumerate(lda.components_):
    print("Topic #%d:" % topic_idx)
    print(" ".join([feature_names[i] for i in topic.argsort()[:-20 - 1:-1]]))

# Non-negative Matrix Factorization:
# NMF is a decomposition method for multivariate data that approximates the
# document-term matrix V as the product of two matrices, V ~= WH, where W holds the
# document-to-feature (topic) weights and H is the coefficient matrix with each row
# being a feature. None of the three matrices has negative elements.
from sklearn.decomposition import NMF

nmf = NMF(n_components=total_topics, random_state=2017, alpha=.1, l1_ratio=.5)
nmf.fit(X)
for topic_idx, topic in enumerate(nmf.components_):
    print("Topic #%d:" % topic_idx)
    print(" ".join([feature_names[i] for i in topic.argsort()[:-20 - 1:-1]]))
# Remove special characters, stopwords, twitter IDs, and hashtags. cleanedTweets = [clean_text(tweet) for tweet in tweets] # Train a topic (LDA) model. lda = LatentDirichletAllocation(n_topics=5, max_iter=5, learning_method='online', learning_offset=50., random_state=0) vectorizer = TfidfVectorizer() tf = vectorizer.fit_transform(cleanedTweets) feature_names = vectorizer.get_feature_names() lda.fit(tf) topic_words = [] for topic in lda.components_: word_idx = np.argsort(topic)[::-1][0:1] topic_words.append([feature_names[i] for i in word_idx][0]) print topic_words # Construct topic groups. cameron = [tweet for tweet in cleanedTweets if tweet.find('cameron')>-1] farage = [tweet for tweet in cleanedTweets if tweet.find('farage')>-1] claim = [tweet for tweet in cleanedTweets if tweet.find('claim')>-1] ukip = [tweet for tweet in cleanedTweets if tweet.find('ukip')>-1]
train_content, train_tag, train_raw, test_content, test_tag, test_raw = divideData(
    rawdialogue, content, tag, 0.2)

# build the word-document co-occurrence matrix
vectorizer = CountVectorizer(encoding='unicode', stop_words='english',
                             max_features=N_FEATURES)
train_data = vectorizer.fit_transform(train_content)
train_tag = np.array(train_tag)
# use transform (not fit_transform) so the test set is encoded with the
# vocabulary learned on the training set
test_data = vectorizer.transform(test_content)  # [n_samples, n_features]

model = LDA(n_topics=N_TOPICS, max_iter=5, batch_size=128)
model.fit(train_data)
train_data_distr = model.transform(train_data)
pred_tag = train_data_distr.argmax(axis=1)

# majority vote: map each topic to the most common training tag among its documents
id2class = dict()
for idx in range(N_TOPICS):
    idxs = np.where(pred_tag == idx)[0]
    # print Counter(train_tag[idxs])
    id2class[idx] = Counter(train_tag[idxs]).most_common(1)[0][0]
print id2class

doc_topic_distr = model.transform(test_data)  # [n_samples, n_topics]
class_id = doc_topic_distr.argmax(axis=1)
pred = [id2class[each] for each in class_id]
pred = np.array(pred)
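The snippet stops after building `pred`; a small, hypothetical follow-up for checking how well the topic-to-tag mapping works is to compare it against the held-out tags:

# Sketch (assumed continuation): accuracy of the topic-vote classifier
test_tag = np.array(test_tag)
accuracy = np.mean(pred == test_tag)
print "accuracy: %.4f" % accuracy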
#fit LDA topic model based on tf-idf of term-document matrix num_features = dictionary_size num_topics = 8 #fixed for LDA #fit LDA model print "Fitting LDA model..." lda_vb = LatentDirichletAllocation(n_topics=num_topics, max_iter=10, learning_method='online', batch_size=512, random_state=0, n_jobs=1) tic = time() lda_vb.fit(A.T) #online VB toc = time() print "elapsed time: %.4f sec" % (toc - tic) print "LDA params" print lda_vb.get_params() print "number of EM iter: %d" % lda_vb.n_batch_iter_ print "number of dataset sweeps: %d" % lda_vb.n_iter_ #topic matrix W: K x V #components[i,j]: topic i, word j #note: here topics correspond to label clusters topics = lda_vb.components_ f = plt.figure() plt.matshow(topics, cmap='gray')
doc = doc.lower() doc_cleaned = ' '.join( lemmatizer.lemmatize(word) for word in doc.split() if word.isalpha() and word not in all_names) data_cleaned.append(doc_cleaned) from sklearn.feature_extraction.text import CountVectorizer count_vector = CountVectorizer(stop_words="english", max_features=None, max_df=0.5, min_df=2) data = count_vector.fit_transform(data_cleaned) from sklearn.decomposition import LatentDirichletAllocation t = 20 lda = LatentDirichletAllocation(n_components=t, learning_method='batch', random_state=42) lda.fit(data) print(lda.components_) terms = count_vector.get_feature_names() for topic_idx, topic in enumerate(lda.components_): print("Topic {}:".format(topic_idx)) print(" ".join([terms[i] for i in topic.argsort()[-10:]]))
analyzer='char', stop_words=None, max_df=0.999) cv = count_vectorizer.fit_transform(docs) k = cv.todense() # lda.fit(cv) # change alpha y = list() x = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100] for i in range(10): lda = LatentDirichletAllocation(n_components=num_topics, doc_topic_prior=alpha, topic_word_prior=float((i + 1) * 5), learning_method='online') lda.fit(cv) word_dist, topics = print_top_words(lda, word_types, 20) word_sampling = 0 for j in range(3): arr = word_dist[j] word_sampling = word_sampling + entropy(arr) print word_sampling / 3.0 y.append(word_sampling / 3.0) # y.append(entropy(topics)) plt.plot(x, y) plt.xlabel("Alpha") plt.ylabel("Entropy") plt.title("Entropy of Topic Distribution") plt.show() # true topic distribution
def build_dimensionality_reduction_model(data, model_type, cross_validate=False, num_iters=10): ''' This function fits a dimensionality reduction model (of the type model_type) to the given features input: training_set: the set of features of the data from which to build our model model_type: the scikit-learn model of choice given by user input parameter output: trained model fit to the features of the data ''' # create a model variable model = None if model_type == 'latent dirichlet allocation': # instantiate model with default hyperparameter settings model = LatentDirichletAllocation(n_topics=5) if cross_validate: # create parameter distributions param_distro = {} # create random grid search object model = RandomizedSearchCV(estimator=model, param_distributions=param_distro, n_iter=num_iters, n_jobs=-1, verbose=True) print '\n', '... performing cross-validation', '\n' # cross-validate the model model.fit(data) else: # fit the vanilla model to the data model.fit(data) elif model_type == 'non-negative matrix factorization': # instantiate model with default hyperparameter settings model = NMF(n_components=5) if cross_validate: # create parameter distributions param_distro = {} # create random grid search object model = RandomizedSearchCV(estimator=model, param_distributions=param_distro, n_iter=num_iters, n_jobs=-1, verbose=True) print '\n', '... performing cross-validation', '\n' # cross-validate the model model.fit(data) else: # fit the vanilla model to the data model.fit(data) else: raise NotImplementedError # return the fitted / cross-validated model return model
others.append("al") my_stop_words = text.ENGLISH_STOP_WORDS.union(others) #print(others) #print(my_stop_words) count_vectorizer = CountVectorizer(stop_words=my_stop_words)# Fit and transform the processed titles count_data = count_vectorizer.fit_transform(texts)# Visualise the 10 most common words #count_data = crossRef(count_data, men) #plot_10_most_common_words(count_data, count_vectorizer) # Tweak the two parameters below number_topics = 11 number_words = 10# Create and fit the LDA model lda = LDA(n_components=number_topics, n_jobs=-1) lda.fit(count_data)# Print the topics found by the LDA model print("Topics found via LDA:") print_topics(lda, count_vectorizer, number_words) LDAvis_data_filepath = os.path.join('./ldavis_prepared_'+str(number_topics)) # # this is a bit time consuming - make the if statement True # # if you want to execute visualization prep yourself # # if you want to execute visualization prep yourself if 1 == 1: LDAvis_prepared = sklearn_lda.prepare(lda, count_data, count_vectorizer) with open(LDAvis_data_filepath, 'wb') as f: pickle.dump(LDAvis_prepared, f) f.close() # load the pre-prepared pyLDAvis data from disk
'Topic_ID': topic_id_list, 'Topics': feature_names_list }) return topic_df for i in range(5, 11): comments_file = 'data/reddit_tldr/tldr_comments_cleaned_{0}.txt'.format(i) model_file = 'data/reddit_tldr/topic_models/tldr_{0}_topics.csv'.format(i) documents = open(comments_file, 'r') no_features = 1000 no_topics = 10 no_top_words = 10 tf_vectorizer = CountVectorizer(max_df=0.95, max_features=no_features, stop_words='english', min_df=2) tf = tf_vectorizer.fit_transform(documents) tf_feature_names = tf_vectorizer.get_feature_names() lda = LatentDirichletAllocation(n_components=no_topics, max_iter=5, learning_method='online', learning_offset=50., random_state=1) lda_new = lda.fit(tf) lda_topic_df = display_relevant_topics(lda_new, tf_feature_names, no_top_words) lda_topic_df.to_csv(model_file)
for index, row in df1.iterrows():
    # build_WordCloud(row['reviews'],row['brand'],row['asin'])
    diction = calculate_word_frequency(row['reviews'])
    s = row['brand'] + row['asin']
    freq_list = {s: diction}
    fp.write(json.dumps(freq_list) + "\n")

print("Execution Time: ", time.clock() - start)


def do_lda(reviews_string):
    vectorizer = CountVectorizer(stop_words='english', lowercase=True,
                                 token_pattern=r'\s\w+\s', max_df=0.8)
    vectorized_data = vectorizer.fit_transform(reviews_string)
    lda = LatentDirichletAllocation(n_components=4, max_iter=15)
    lda.fit(vectorized_data)
    components = lda.components_.T
    features = vectorizer.get_feature_names()
    labels = {0: [], 1: [], 2: [], 3: []}
    stop_words = set(stopwords.words('english'))
    for i in range(len(features)):
        label = np.argmax(components[i])
        word = features[i].lower().strip()
        if word not in stop_words:
            labels[label].append(word)
    return labels


def get_frequency_table(documents):
    giant_document = " ".join(documents)
    all_words = giant_document.split()
variety_dict = Counter(wine_df['variety']) most_common = [t[0] for t in variety_dict.most_common(20)] # vectrize vect = CountVectorizer(stop_words='english', lowercase=True, min_df=10) #vect = CountVectorizer(tokenizer = my_tokenizer) counter = vect.fit_transform(wine_df['description']) transf = TfidfTransformer(norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False) # TfidfTransformer takes the CountVectorizer output and computes the tf-idf tf_idf = transf.fit_transform(counter) lda = LatentDirichletAllocation(n_components=20, random_state=0) lda.fit(counter) lda.transform(counter) tf_feature_name = vect.get_feature_names() def print_top_words(model, feature_names, n_top_words): for topic_idx, topic in enumerate(model.components_): message = "Topic #%d: " % topic_idx message += " ".join( [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]) print(message) print() print_top_words(lda, tf_feature_name, 10)
class LDAModel(BenchmarkModel): def __init__(self, n_components, max_features, max_df, min_df, learning_method="batch", learning_decay=0.7, cores=1, epochs=10): super().__init__() self.n_components = n_components self.cores = cores self.epochs = epochs self.max_features = max_features self.max_df = max_df self.min_df = min_df self.learning_method = learning_method self.learning_decay = learning_decay def build_model(self): super().build_model() self.model = LatentDirichletAllocation( n_components=self.n_components, learning_method=self.learning_method, learning_decay=self.learning_decay, n_jobs=self.cores, max_iter=self.epochs) self.count_vectorizer = CountVectorizer(max_features=self.max_features, max_df=self.max_df, min_df=self.min_df, stop_words='english') def train(self, x, y=None): logging.info("Building vocabulary on " + self.__class__.__name__) t0 = time.time() processed_dataset = process_dataset(x) processed_dataset = processed_dataset.map( lambda x: ' '.join(word for word in x)) doc_term_matrix = self.count_vectorizer.fit_transform( processed_dataset.values.astype('U')) self.model.fit(doc_term_matrix) elapsed = (time.time() - t0) logging.info("Done in %.3fsec" % elapsed) def preprocess_data(self, dataset, y_dataset): logging.info("Transform data on " + self.__class__.__name__) processed_dataset = process_dataset(dataset) processed_dataset = processed_dataset.map( lambda x: ' '.join(word for word in x)) doc_term_matrix = self.count_vectorizer.transform( processed_dataset.values.astype('U')) return self.model.transform(doc_term_matrix) def save(self, path): logging.info("Saving " + self.__class__.__name__) combined_path = os.path.join(path, self.__class__.__name__) pickle.dump(self.clf, open(combined_path + "_clf.pickle", 'wb')) pickle.dump(self.model, open(combined_path + "_model.pickle", 'wb')) pickle.dump(self.count_vectorizer.vocabulary_, open(combined_path + "_vec.pickle", 'wb')) def load(self, path): logging.info("Loading " + self.__class__.__name__) combined_path = os.path.join(path, self.__class__.__name__) self.clf = pickle.load(open(combined_path + "_clf.pickle", 'rb')) self.model = pickle.load(open(combined_path + "_model.pickle", 'rb')) self.count_vectorizer = CountVectorizer( vocabulary=pickle.load(open(combined_path + "_vec.pickle", 'rb'))) def can_load(self, path): combined_path = os.path.join(path, self.__class__.__name__) return os.path.isfile(combined_path + "_clf.pickle") and \ os.path.isfile(combined_path + "_model.pickle") and \ os.path.isfile(combined_path + "_vec.pickle")
#print model.components_.shape #print model.components_ #doc_topic_prior=[0.001, 0.01, 0.05, 0.1, 0.2,0.5] #topic_word_prior=[0.001, 0.01, 0.05, 0.1, 0.2,0.5] #topics=[50,100,500,1000] #iters=[50,100,500,1000] #plex=[] lda = LatentDirichletAllocation(n_components=100, max_iter=100, learning_method='batch', doc_topic_prior=0.5, topic_word_prior=0.2) lda_begin_time = time.time() lda.fit(X_tfidf_train) lda_end_time = time.time() print "LDA training time:%fs" % (lda_end_time - lda_begin_time) X_tfidf_train = lda.transform(X_tfidf_train) X_tfidf_test = lda.transform(X_tfidf_test) X_train = np.concatenate((X_dcr_train, X_tfidf_train), axis=1) #km=KMeans(n_clusters=30) #km_begin_time=time.time() #km.fit(X_train) #km_end_time=time.time() #print "KMeans training time:%fs" % (km_end_time-km_begin_time) #print calinski_harabaz_score(X_train,km.labels_) #print km.labels_ #ms=MeanShift() #ms_begin_time=time.time() #ms.fit(X_train)
def compute_lda_model(input_dir, output_file, n_topics=500, format="corenlp", extension="xml", use_lemmas=False, stemmer="porter", language="english"): """ Compute a LDA model from a collection of documents. Latent Dirichlet Allocation is computed using sklearn module. Args: input_dir (str): the input directory. output_file (str): the output file. n_topics (int): number of topics for the LDA model, defaults to 500. format (str): the input files format, defaults to corenlp. extension (str): file extension for input documents, defaults to xml. use_lemmas (bool): whether lemmas from stanford corenlp are used instead of stems (computed by nltk), defaults to False. stemmer (str): the stemmer in nltk to used (if used), defaults to porter. language (str): the language of the documents, used for stop_words in sklearn CountVectorizer, defaults to 'english'. """ # texts container texts = [] # loop throught the documents for input_file in glob.glob(input_dir + '/*.' + extension): # initialize load file object doc = LoadFile(input_file) # read the input file doc.read_document(format=format, use_lemmas=use_lemmas, stemmer=stemmer, sep='/') # container for current document text = [] # loop through sentences for sentence in doc.sentences: # get the tokens (stems) from the sentence if they are not # punctuation marks text.extend([ sentence.stems[i] for i in range(sentence.length) \ if not re.search('[^A-Z$]', sentence.pos[i]) ]) # add the document to the texts container texts.append(' '.join(text)) # vectorize dataset # get the stoplist from nltk because CountVectorizer only contains english # stopwords atm tf_vectorizer = CountVectorizer(stop_words=stopwords.words(language)) tf = tf_vectorizer.fit_transform(texts) # extract vocabulary vocabulary = tf_vectorizer.get_feature_names() # create LDA model and train lda_model = LatentDirichletAllocation(n_components=n_topics, random_state=0, learning_method='batch') lda_model.fit(tf) # save all data necessary for later prediction saved_model = (vocabulary, lda_model.components_, lda_model.exp_dirichlet_component_, lda_model.doc_topic_prior_) # Dump the df container logging.info('writing LDA model to ' + output_file) with gzip.open(output_file, 'wb') as fp: pickle.dump(saved_model, fp)
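For completeness, a minimal sketch of reading that saved model back (simply the reverse of the gzip/pickle dump above, assuming the same tuple layout):

import gzip
import pickle

# reload the (vocabulary, components, exp_dirichlet_component, doc_topic_prior)
# tuple written by compute_lda_model
with gzip.open(output_file, 'rb') as fp:
    vocabulary, components, exp_dirichlet_component, doc_topic_prior = pickle.load(fp)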
from sklearn.decomposition import LatentDirichletAllocation from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.decomposition import NMF from sklearn.model_selection import train_test_split from sklearn.ensemble import RandomForestClassifier from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, plot_confusion_matrix #create the document term matrix vectorizer = CountVectorizer(max_df=0.8, min_df=4, stop_words='english') doc_term_matrix = vectorizer.fit_transform( tweets_data['tweettext'].values.astype('U')) #Generate the LDA with the top 4 topics in the argument. Use random seed 35. LDA = LatentDirichletAllocation(n_components=4, random_state=35) LDA.fit(doc_term_matrix) #Retrieve words in the first topic, sort the indexes according to probability using argsort() first_topic = LDA.components_[0] top_topic_words = first_topic.argsort()[-10:] for i in top_topic_words: print(vectorizer.get_feature_names()[i]) #top 10 words for each topic for i, topic in enumerate(LDA.components_): print(f'Top 10 words for topic #{i}:') print([vectorizer.get_feature_names()[i] for i in topic.argsort()[-10:]]) print('\n') #1. Add a new column to the dataframe containing the LDA topic number topic_values = LDA.transform(doc_term_matrix) topic_values.shape
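The comment above announces adding the LDA topic number to the dataframe, but the snippet ends at `topic_values.shape`. One plausible continuation (a sketch, assuming `tweets_data` is the dataframe in use and a hypothetical column name) stores the index of the highest-probability topic per tweet:

# each row of topic_values is a distribution over the 4 topics;
# take the argmax as that tweet's dominant topic
tweets_data['topic'] = topic_values.argmax(axis=1)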
configs.append(config) # ** running for cfg in configs: for itr in range(1,5): t1 = time.time() ldax = LatentDirichletAllocation(n_jobs = 3, n_components=cfg['n_components'], max_iter = cfg['max_iter'], doc_topic_prior = cfg['doc_topic_prior'], topic_word_prior = cfg['topic_word_prior'] ) ldax.fit(mat_dict[cfg['mat']]) t2 = time.time() print(cfg, t2-t1) flnm = cfg['mat'] + "_iter_" + str(cfg['max_iter']) + "_alpha_" + str(cfg['doc_topic_prior']) + "_beta_" + str(cfg['topic_word_prior']) + '_ncomp' + str(cfg['n_components']) + '_it_' + str(itr) dump(ldax, diag_dir + flnm) # mat_edge_smpl: one iterations takes 1.22 secs # mat_song_smpl: one iteration takes 3.6 secs # mat_cutofs: 3.9 secs # on average 2.9 secs # on average 35 iterations # total of 135*35 = 4725 iterations # 4725*2.9 = 13702.5 secs = 3.8 hours
else: categories[document["category"]] = k docToLabel[str(document['_id'])] = k k = k + 1 labels = np.array(list(docToLabel.values())) #instantiate CountVectorizer() cv = CountVectorizer(stop_words='english') # this steps generates word counts for the words in your docs word_count_vector = cv.fit_transform(docs) # Create and fit the LDA model lda = LDA(n_components=6, n_jobs=-1) lda.fit(word_count_vector) # Print the topics found by the LDA model # print("Topics found via LDA:") # print_topics(lda, cv, 10) documentTopicDistr = lda.transform(word_count_vector) documentTopicDistr = np.array(documentTopicDistr) lda_labels = np.argmax(documentTopicDistr, axis=1) print("Metrici LDA " + stemmer) print(metrics.homogeneity_score(labels, lda_labels)) print(metrics.completeness_score(labels, lda_labels)) print(metrics.v_measure_score(labels, lda_labels)) print(metrics.adjusted_rand_score(labels, lda_labels)) print(metrics.adjusted_mutual_info_score(labels, lda_labels))
print("preprocessing data") df = utils.preprocess_data(df, analyzer, tt) df.to_csv("data/tesi_US_preprocessed.csv", index=None) else: print("loading preprocessed data") df = pd.read_csv("data/tesi_US_preprocessed.csv") print("training vectorizer") TDmat = cv.fit_transform(df['preprocessed']) joblib.dump(cv, "models/cv_{}.pkl".format(n_features)) if isinstance(n_topics, list): topic_numbers = n_topics else: topic_numbers = [n_topics] for num in topic_numbers: lda = LatentDirichletAllocation(n_components=num, max_iter=12, learning_method='online', learning_offset=30., random_state=0, n_jobs=6) print("training lda with {} topics".format(num)) lda.fit(cv.transform(df['preprocessed'])) utils.print_top_words(lda, cv.get_feature_names(), n_top_words) joblib.dump(lda, "models/lda_{}_{}.pkl".format(num, n_features)) utils.visualize_lda(lda, TDmat, cv, True, "html/lda_{}_{}.html".format(num, n_features))
out = out + xy #print(out) return out cntVect = CountVectorizer(stop_words=stop_word_list) cntTf = cntVect.fit_transform(word_cut) list_numb_topics = [10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30] list_perplexity = [] for topic_numb in list_numb_topics: #topic_numb = 18 lda = LatentDirichletAllocation(n_components=topic_numb, max_iter=1000, learning_method='batch') lda.fit(cntTf) dic = {} out = "" n_top_words = 20 tf_features_names = cntVect.get_feature_names() out = foo(lda, tf_features_names, n_top_words, out) # print(out) wo = r'C:\Users\admin\Desktop\file\n_topic_numb= %d.txt' % topic_numb file = open(wo, 'w') file.write(out) # print(dic) doc_topic_dist = lda.transform(cntTf) doc_topic_dist = pd.DataFrame( doc_topic_dist, columns=['topic_#%d' % i for i in range(topic_numb)])
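`list_perplexity` is initialized above but never filled inside the loop; if the intent is to compare topic counts, a hedged sketch of that bookkeeping (the append belongs inside the `for topic_numb ...` loop in the original code) might be:

# Sketch: record the model perplexity for each topic count so that the
# list_perplexity container declared earlier is actually used
list_perplexity.append(lda.perplexity(cntTf))

# after the loop, inspect the curve to pick a topic number
for numb, perp in zip(list_numb_topics, list_perplexity):
    print('n_components=%d  perplexity=%.2f' % (numb, perp))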
    listOfCoords = []
    aux = l.split(',')
    for dim in range(len(aux)):
        listOfCoords.append(float(aux[dim]))
    # normalize values
    normalizedListOfCoords = [
        (x - min(listOfCoords)) / (max(listOfCoords) - min(listOfCoords))
        for x in listOfCoords
    ]
    dataset.append(normalizedListOfCoords)

X = np.array(dataset)

# Create and fit the LDA model
lda = LDA(n_components=6, n_jobs=-1)
lda.fit(X)

# Print the topics found by the LDA model
# print("Topics found via LDA:")
# print_topics(lda, cv, 10)

documentTopicDistr = lda.transform(X)
documentTopicDistr = np.array(documentTopicDistr)
lda_labels = np.argmax(documentTopicDistr, axis=1)

print("LDA metrics " + file)
print(metrics.homogeneity_score(labels, lda_labels))
print(metrics.completeness_score(labels, lda_labels))
print(metrics.v_measure_score(labels, lda_labels))
print(metrics.adjusted_rand_score(labels, lda_labels))
print(metrics.adjusted_mutual_info_score(labels, lda_labels))
# Perform LDA. from sklearn.decomposition import LatentDirichletAllocation, NMF # In[28]: # n_components - number of topics returned. LDA = LatentDirichletAllocation(n_components=3, random_state=42) # In[39]: nmf = NMF(n_components=8, random_state=42) # In[29]: # Fit LDA to document term matrix. LDA.fit(dtm) # In[48]: nmf.fit(dtm_ifidf) # In[30]: # Grab the vocabulary of words. import random random_word_id = random.randint(0, 6924) cv.get_feature_names()[random_word_id] # In[31]:
df = pd.read_csv('articles.csv', parse_dates=['post_published']) text = df['processed_text'].values.tolist() max_features = 5000 tf_vectorizer = CountVectorizer(max_df=0.95, min_df=3, max_features=max_features, stop_words='english') tf = tf_vectorizer.fit_transform(text) print("ready") n_topics = 18 lda_model = LatentDirichletAllocation(n_components=n_topics, max_iter=5, learning_method='online', learning_offset=50., random_state=0) lda_model.fit(tf) # pyLDAvis.enable_notebook() # pyLDAvis.sklearn.prepare(lda_model,tf, tf_vectorizer, R=20, mds='tsne') ## get the token to topic matrix word_topic = np.zeros((max_features,n_topics),) print(n_topics) lda_model.components_ for topic_idx, topic in enumerate(lda_model.components_): word_topic[:,topic_idx] = topic print("token-topic matrix",word_topic.shape) ## create a matrix of the top words used to define each topic top_words = 20 tf_feature_names = np.array(tf_vectorizer.get_feature_names())
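# A sketch completing the step announced in the last comment above: build an
# (n_topics x top_words) array holding the highest-weight tokens per topic.
# Uses only names defined in the preceding block.
topic_top_words = np.empty((n_topics, top_words), dtype=object)
for topic_idx, topic in enumerate(lda_model.components_):
    topic_top_words[topic_idx, :] = tf_feature_names[topic.argsort()[::-1][:top_words]]
print("top-word matrix", topic_top_words.shape)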
def run_lda(n_samples, n_features, n_components, n_top_words): texts = [] res = elastic_utils.iterate_search( index_name=cfg.twitter_credentials['topic']) for i in res: texts.append(i['_source']['text']) # Use tf-idf features for NMF. print("Extracting tf-idf features for NMF...") tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=n_features, stop_words='english') t0 = time() tfidf = tfidf_vectorizer.fit_transform(texts) print("done in %0.3fs." % (time() - t0)) # Use tf (raw term count) features for LDA. print("Extracting tf features for LDA...") tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=n_features, stop_words='english') t0 = time() tf = tf_vectorizer.fit_transform(texts) print("done in %0.3fs." % (time() - t0)) print() # Fit the NMF model print("Fitting the NMF model (Frobenius norm) with tf-idf features, " "n_samples=%d and n_features=%d..." % (n_samples, n_features)) t0 = time() nmf = NMF(n_components=n_components, random_state=1, alpha=.1, l1_ratio=.5).fit(tfidf) print("done in %0.3fs." % (time() - t0)) print("\nTopics in NMF model (Frobenius norm):") tfidf_feature_names = tfidf_vectorizer.get_feature_names() print_top_words(nmf, tfidf_feature_names, n_top_words) # Fit the NMF model print( "Fitting the NMF model (generalized Kullback-Leibler divergence) with " "tf-idf features, n_samples=%d and n_features=%d..." % (n_samples, n_features)) t0 = time() nmf = NMF(n_components=n_components, random_state=1, beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1, l1_ratio=.5).fit(tfidf) print("done in %0.3fs." % (time() - t0)) print("\nTopics in NMF model (generalized Kullback-Leibler divergence):") tfidf_feature_names = tfidf_vectorizer.get_feature_names() print_top_words(nmf, tfidf_feature_names, n_top_words) print("Fitting LDA models with tf features, " "n_samples=%d and n_features=%d..." % (n_samples, n_features)) lda = LatentDirichletAllocation(n_components=n_components, max_iter=5, learning_method='online', learning_offset=50., random_state=0) t0 = time() lda.fit(tf) print("done in %0.3fs." % (time() - t0)) print("\nTopics in LDA model:") tf_feature_names = tf_vectorizer.get_feature_names() categories = print_top_words(lda, tf_feature_names, n_top_words) predict = lda.transform(tf) result = {"predictions": predict, "text": texts, "categories": categories} return result
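# `print_top_words` is called above but not defined in this snippet. A minimal
# sketch of such a helper, printing each topic and returning the per-topic
# keyword strings that run_lda stores as `categories`; the exact original
# implementation is an assumption.
def print_top_words(model, feature_names, n_top_words):
    topics = []
    for topic_idx, topic in enumerate(model.components_):
        words = " ".join(feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1])
        print("Topic #%d: %s" % (topic_idx, words))
        topics.append(words)
    return topics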
                 total_topics=2,
                 num_terms=5,
                 display_weights=True)

from sklearn.decomposition import LatentDirichletAllocation

norm_corpus = normalize_corpus(toy_corpus)
vectorizer, tfidf_matrix = build_feature_matrix(norm_corpus, feature_type='tfidf')
total_topics = 2
# LatentDirichletAllocation takes n_components (the old n_topics argument was removed in scikit-learn 0.21)
lda = LatentDirichletAllocation(n_components=total_topics, max_iter=1000,
                                learning_method='online', learning_offset=50.,
                                random_state=42)
lda.fit(tfidf_matrix)
feature_names = vectorizer.get_feature_names()
weights = lda.components_

topics = get_topics_terms_weights(weights, feature_names)
print_topics_udf(topics=topics,
                 total_topics=total_topics,
                 num_terms=8,
                 display_weights=True)

from sklearn.decomposition import NMF

norm_corpus = normalize_corpus(toy_corpus)
vectorizer, tfidf_matrix = build_feature_matrix(norm_corpus, feature_type='tfidf')
class TextTopics():
    """ Text classifier. """

    def __init__(self, df: pd.DataFrame, number_topics=50, instance_path=instance_path(), **kwargs):
        self._instance_path = instance_path

        self.number_topics = number_topics
        self.stop_words: List = get_stop_words("fi")

        self._count_vector: CountVectorizer = None
        self._lda: LDA = None
        self.token_cache = {}
        self._tokenizer = None
        self.min_sentence_length = 17

        # `kk` is used in association with time periods.
        self.stop_words += ["kk"]

        self.init(df, **kwargs)

    def init(self, df: pd.DataFrame, generate_visualization=False, lang="fi"):
        """
        :param df: :class:`~pandas.DataFrame` containing text columns
        :param generate_visualization: Generate visualization of LDA results.
                                       Slows down generation notably.
        :param lang: Language for :class:`~Voikko`
        """
        if self._count_vector and self._lda:
            return True

        file_words = self.instance_path() / "word.dat"
        file_lda = self.instance_path() / "lda.dat"
        file_ldavis = self.instance_path() / "ldavis.html"

        try:
            # Try loading saved lda files.
            self._count_vector = joblib.load(file_words)
            self._lda = joblib.load(file_lda)
        except FileNotFoundError as e:
            logger.exception(e)

            texts = [x for x in df.to_numpy().flatten() if x is not np.NaN]

            # Setup word count vector
            self._count_vector = CountVectorizer(
                tokenizer=self.text_tokenize,
                stop_words=self.stop_words
            )
            count_data = self._count_vector.fit_transform(texts)

            self._lda = LDA(n_components=self.number_topics, n_jobs=-1)
            self._lda.fit(count_data)

            if generate_visualization:
                logger.debug("Generating LDA visualization. This might take a while")
                from pyLDAvis import sklearn as sklearn_lda
                import pyLDAvis

                LDAvis_prepared = sklearn_lda.prepare(self._lda, count_data, self._count_vector)
                pyLDAvis.save_html(LDAvis_prepared, str(file_ldavis))

            joblib.dump(self._count_vector, file_words)
            joblib.dump(self._lda, file_lda)

    def instance_path(self):
        path = self._instance_path / "lda" / str(self.number_topics)
        path.mkdir(exist_ok=True, parents=True)
        return path

    def tokenizer(self):
        if not self._tokenizer:
            self._tokenizer = VoikkoTokenizer("fi")
        return self._tokenizer

    @cached(LRUCache(maxsize=1024))
    def text_tokenize(self, text):
        """ Cached wrapper for `VoikkoTokenizer.tokenize()` """
        return self.tokenizer().tokenize(text)

    def find_talkingpoint(self, candidate: pd.Series) -> str:
        """ Find most suitable sentence from text """
        texts = tuple(candidate.dropna())
        if len(texts) == 0:
            return None
        x = self._get_topics(texts)
        return self.nearest_sentence(x[1], texts)

    def nearest_sentence(self, topics: List[float], texts: List[str]) -> str:
        """
        Find sentence closest to topic.

        TODO: When joining multiple sentences, it should be checked that they
        are from same paragraph.
        """
        @cached(LFUCache(maxsize=128))
        def lda(sentences):
            count_data = self._count_vector.transform(sentences)
            _lda = self._lda.transform(count_data)
            return _lda

        # Tokenize into sentences.
        sentences = chain(*[re.findall(r"\s*(.+?[\.!?])+", b, re.MULTILINE + re.DOTALL)
                            for b in texts if b.strip() != ""])
        # cleanup sentences.
        sentences = tuple(set(filter(lambda x: len(x) > self.min_sentence_length,
                                     map(str.strip, sentences))))

        if len(sentences) == 0:
            return None

        # Find most topical sentence.
        tl_dr = []
        distance = 1.
        prev_sentence = ""
        for current_sentence, m in zip(sentences, lda(sentences)):
            _distance = np.abs(np.mean(topics - m))
            if _distance < distance:
                # Previous sentence is to provide context to most suitable sentence.
                tl_dr, distance = ([prev_sentence, current_sentence], _distance)
            prev_sentence = current_sentence

        return " ".join(filter(None, tl_dr))

    def compare_series(self, source: pd.Series, target: pd.Series):
        """
        Compare two text sets.

        First tuple contains topic word not found in :param:`target`, and
        second tuple contains word not found in :param:`source`.

        Note: This result will not be cached. Use :method:`compare_rows()` if possible.
        """
        # Convert them into tuples, so they can be cached.
        _source = tuple(source.dropna())
        _target = tuple(target.dropna())

        return self.compare_count_data(
            *self._get_topics(_source),
            *self._get_topics(_target)
        )

    def compare_rows(self, df: pd.DataFrame, i, l):
        x = self.row_topics(df, i)
        y = self.row_topics(df, l)
        if not x or not y:
            return None
        r = self.compare_count_data(*x, *y)
        return r

    def row_topics(self, df: pd.DataFrame, idx):
        """ Return suitable topics from dataset `df` row :param:`idx` """
        x = tuple(df.loc[idx].dropna())
        if len(x) == 0:
            return None
        return self._get_topics(x)

    @cached(LRUCache(maxsize=512))
    def _get_topics(self, source: List) -> Tuple:
        count_data = self._count_vector.transform(source)
        return (count_data, self._lda.transform(count_data).mean(axis=0))

    def compare_count_data(self, counts_data_source, topics_source,
                           counts_data_target, topics_target) -> Tuple[Tuple[str, int], Tuple[str, int]]:
        diffs = topics_source - topics_target

        topic_max = np.argmax(diffs)
        topic_min = np.argmin(diffs)

        source_words = self.suggest_topic_word(counts_data_source, counts_data_target, topic_max)
        target_words = self.suggest_topic_word(counts_data_target, counts_data_source, topic_min)

        word_for_source = self.suitable_topic_word(source_words) if len(source_words) else None
        word_for_target = self.suitable_topic_word(target_words) if len(target_words) else None

        return TopicComparision(
            source=Topic(id=topic_max, term=word_for_source),
            target=Topic(id=topic_min, term=word_for_target)
        )

    def suggest_topic_word(self, A, B, topic_id: int) -> List[Tuple[int, float]]:
        """
        Find relevant word for topic.

        Compares :param:`A` and :param:`B` words, and topic words, to find a
        suitable word with enough difference between `A` and `B`.

        :param A: :class:`csr_matrix` Target to find word for.
        :param B: :class:`csr_matrix` Comparative target for `A`
        :param topic_id: lda topic id number.
        :return: List of tuples in prominence order. First item in each tuple is
                 the word vector feature number, and second is the prominence value.
        """
        # Generate sum of used words
        a_sum = A.toarray().sum(0)
        b_sum = B.toarray().sum(0)

        # Topic word, preferring unique ones.
        λ = self._lda.components_[topic_id] / self._lda.components_.sum(0)

        # Remove words from A that B has used too.
        # Note: Doesn't actually remove.
        complement = a_sum - b_sum

        # Use logarithm, so topic words are preferred.
        prominence = np.log(complement) * λ

        # Generate list of words, ordered by prominence; skip zeros and the
        # non-finite values the logarithm produces when B used a word at least as much as A.
        r = sorted([(i, prominence[i]) for i in prominence.argsort()
                    if prominence[i] != 0 and np.isfinite(prominence[i])],
                   key=lambda x: x[1], reverse=True)
        return r

    # sequence list is too volatile to be cached.
    def suitable_topic_word(self, seq: List[Tuple[int, float]]) -> str:
        """
        Find first suitable word from :param:`seq` list.

        :param: 1d matrix of word feature indexes. Only the first column in each
                row is interpreted as a feature number.
""" vector_words = self.vector_words() """ Find first suitable word from word list """ for r in seq: word = vector_words[r[0]] if self._suitable_topic_word(word): return word return None @cached(LFUCache(maxsize=512)) def _suitable_topic_word(self, word) -> bool: """ Check if word can be used as topic word Accepted word classes: :nimi: Names; Words like `Linux` and `Microsoft`, `Kokoomus` :nimisana: Substantives; like `ihminen`, `maahanmuutto`, `koulutus`, `Kokoomus` :laatusana: Adjectives; words like `maksuton` :nimisana_laatusana: Adjectives, that are not "real", like `rohkea` or `liberaali` :lyhenne: Abbrevations; Words like `EU` :paikannimi:Geographical locations, like `Helsinki` :sukunimi: Last names, like `Kekkonen` """ for morph in self.tokenizer().analyze(word): _class = morph.get("CLASS") if _class in ["nimi", "nimisana", "nimisana_laatusana", "lyhenne", "paikannimi", "sukunimi"]: return True else: logger.debug("Unsuitable word class %s for word %s", _class, word) return False def vector_words(self) -> List: """ Feature names in CountVector """ return self._count_vector.get_feature_names()
class LDA(Model): def __init__(self, reader, dataset='', topics=50, max_iter=20): self.lda = LatentDirichletAllocation(n_components=topics, max_iter=max_iter, learning_method='online', learning_offset=50., random_state=0) self.reader = reader self.dataset = dataset self.n_topics = topics self.train = np.array([x["doc_tm"] for x in self.reader.train]) self.valid = np.array([x["doc_tm"] for x in self.reader.valid]) self.test = np.array([x["doc_tm"] for x in self.reader.test]) self.lda.fit(self.train) def show_topics(self, model, feature_names, n_top_words): for topic_idx, topic in enumerate(model.components_): message = "Topic #%d: " % topic_idx message += " ".join([ feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1] ]) print(message) print() def save_topic_distribution(self, save_path, n_top_words): model = self.lda feature_names = self.reader.idx2word str = "" for topic_idx, topic in enumerate(model.components_): message = "Topic #%d: " % topic_idx message += " ".join([ feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1] ]) str += message + "\n" str += "\n" print(str) with open(save_path, "a") as f: f.write(str) print("saved to", save_path) def LDA_recall(self, testset, n_samples=1000, n_recall=3, print_output=True): x = testset[:n_samples] t = self.lda.transform(x) x_hat = np.matmul(t, self.lda.components_) recall_tot = [] for i in range(np.shape(x)[0]): x_temp = x[i, :] x_hat_temp = x_hat[i, :] recall_tot.append(self.recall(x_temp, x_hat_temp, n_recall)) output = np.sum(recall_tot) / len(recall_tot) if print_output: print("recall", n_recall, "over", n_samples, ":", output) return output def get_topic_distribution(self, x): return self.lda.transform(x) def perplexity(self, testset, n_samples=100, print_errors=False): # Topic distribution x word distribution over topics (normalization over components is required) n_samples_real = n_samples x_hats = np.matmul(self.lda.transform(testset[:n_samples]), (self.lda.components_ / self.lda.components_.sum(axis=1)[:, np.newaxis])) perplexities = [] for i in range(n_samples): idxs = np.where(testset[i] > 0) x_hat = x_hats[i] probs = np.log(np.take(x_hat, idxs)) if len(probs[0]) == 0: n_samples_real -= 1 if print_errors: print("datapoint", i, "has no length, perplexity is now based on", n_samples_real, "samples.") continue perplexities.append(sum(probs[0]) / len(probs[0])) total_perplexity = np.exp( -sum(perplexities) / n_samples_real) #np.exp(- sum(perplexities) / len(perplexities)) print("LDA perplexity on test_set", total_perplexity) def experiments(self, save_location="topics.txt"): self.show_topics(self.lda, self.reader.idx2word, 10) feature_names = self.reader.idx2word n_top_words = 10 self.LDA_recall(self.test, print_output=True) self.perplexity(self.test) self.save_topic_distribution(save_location, 10)
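# `recall` is called by LDA_recall above but not defined in this snippet; it
# presumably lives on the Model base class. The sketch below is one plausible
# interpretation, offered purely as an assumption: the fraction of word ids
# present in the original document that also appear among the n_recall
# highest-probability word ids of the reconstruction.
def recall(self, x, x_hat, n_recall):
    actual = set(np.where(x > 0)[0])                 # word ids present in the document
    predicted = set(np.argsort(x_hat)[-n_recall:])   # top-n_recall reconstructed word ids
    if not actual:
        return 0.0
    return len(actual & predicted) / len(actual)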
df = pd.read_json(path) df.head() mask = ~df.loc[:, 'story'].isnull() & (df.loc[:, 'story'] != '') df = df.loc[mask, :] ############################################################################## # First model ############################################################################## x = df.loc[:, 'story'] cv = CountVectorizer(max_df=0.9, min_df=2, stop_words=full_stopwords) dtm = cv.fit_transform(x) lda = LatentDirichletAllocation(n_components=10, random_state=42) lda.fit(dtm) topic_results = lda.transform(dtm) df.loc[:, 'topic_id'] = topic_results.argmax(axis=1) + 1 args = [ lda, cv, df, ] create_df_topic_word_lists(*args, verbose=True) ############################################################################## # Remove other languages ############################################################################## df.loc[:, 'language'] = df.loc[:, 'story'].apply(lambda x: detect(str(x)))
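# A sketch of the filtering step the section header above announces: keep only
# the rows detected as English before refitting the model. The 'en' language
# code and the decision to keep only English are assumptions of this sketch.
mask_en = df.loc[:, 'language'] == 'en'
df = df.loc[mask_en, :]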
def process(db_msg):
    logger, log_stream = slog.set_logging('topic_identification', loglevel=api.config.debug_mode)
    logger.info("Process started")
    time_monitor = tp.progress()

    columns = [c['name'] for c in db_msg.attributes['table']['columns']]
    df = pd.DataFrame(db_msg.body, columns=columns)

    # Language filter
    language_filter = tfp.read_list(api.config.language_filter)
    if language_filter:
        df = df.loc[df["LANGUAGE"].isin(language_filter)]
    else:
        language_filter = list(df['LANGUAGE'].unique())
    logger.info('Languages : {}'.format(language_filter))

    # Word type filter
    word_type_filter = tfp.read_value(api.config.word_type_filter)
    if word_type_filter:
        types = [c for c in word_type_filter]
        df = df.loc[df["TYPE"].isin(types)]
        logger.info('Words restricted to types : {}'.format(word_type_filter))

    # groupby and concatenate words
    gdf = df.groupby(by=['HASH_TEXT', 'LANGUAGE'])['WORD'].apply(
        lambda x: ' '.join(x)).reset_index()

    logger.info('Topic identification: ')
    for lang in language_filter:
        logger.info('Language: {} #Documents: {} #Words: {}'.format(
            lang,
            gdf.loc[gdf['LANGUAGE'] == lang].shape[0],
            df.loc[df['LANGUAGE'] == lang].shape[0]))
    api.send(outports[0]['name'], log_stream.getvalue())
    log_stream.seek(0)

    # create document-term matrix - no tokenization or text prep is needed
    tf_vectorizer = CountVectorizer(analyzer='word',
                                    min_df=1,
                                    lowercase=False,
                                    tokenizer=str.split)

    # tf means term-frequency in a document for each language
    date_today = str(date.today())

    # 2-d list with TOPIC, LANGUAGE, TYPE, DATE, EXPIRY_DATE, ATTRIBUTE, KEYWORD_i (num of topics)
    topic_list = list()
    for lang in language_filter:
        logger.info('Process all texts for language: {}'.format(lang))
        lang_gdf = gdf.loc[gdf['LANGUAGE'] == lang]
        dtm_tf = tf_vectorizer.fit_transform(lang_gdf['WORD'])  # for tf dtm
        lda_tf = LatentDirichletAllocation(n_components=api.config.num_topics,
                                           learning_method='online',
                                           evaluate_every=-1,
                                           n_jobs=-1)
        lda_tf.fit(dtm_tf)
        feature_names = tf_vectorizer.get_feature_names()
        for i, topic in enumerate(lda_tf.components_):
            topic_words = [
                feature_names[f]
                for f in topic.argsort()[:-api.config.topic_num_words - 1:-1]
            ]
            logger.debug('Len: {} topic_words:{}'.format(len(topic_words), topic_words))
            row = [
                date_today + "-" + str(i), lang, 'ALGO', date_today, None, None
            ] + topic_words
            topic_list.append(row)

    attributes = {
        "table": {
            "columns": [{
                "class": "string",
                "name": "TOPIC",
                "nullable": False,
                "size": 80,
                "type": {"hana": "NVARCHAR"}
            }, {
                "class": "string",
                "name": "LANGUAGE",
                "nullable": False,
                "size": 2,
                "type": {"hana": "NVARCHAR"}
            }, {
                "class": "string",
                "name": "TYPE",
                "nullable": False,
                "size": 10,
                "type": {"hana": "NVARCHAR"}
            }, {
                "class": "string",
                "name": "DATE",
                "nullable": True,
                "type": {"hana": "DATE"}
            }, {
                "class": "string",
                "name": "EXPIRY_DATE",
                "nullable": True,
                "type": {"hana": "DATE"}
            }, {
                "class": "string",
                "name": "ATTRIBUTE",
                "nullable": True,
                "size": 25,
                "type": {"hana": "NVARCHAR"}
            }],
            "name": "DIPROJECTS.WORD_INDEX",
            "version": 1
        }
    }
    for i in range(1, api.config.topic_num_words + 1):
        attributes['table']['columns'].append({
            "class": "string",
            "name": "KEYWORD_" + str(i),
            "nullable": True,
            "size": 80,
            "type": {"hana": "NVARCHAR"}
        })

    msg = api.Message(attributes=attributes, body=topic_list)
    logger.debug('Process ended, topics processed {}'.format(time_monitor.elapsed_time()))
    api.send(outports[0]['name'], log_stream.getvalue())
    api.send(outports[1]['name'], msg)