def applyLDA2(self, number_of_clusters, country_specific_tweets):
    train, feature_names = self.extractFeatures(country_specific_tweets, False)
    name = "lda"
    if self.results:
        print("Fitting LDA model with tfidf", end=" - ")
    t0 = time()
    lda = LatentDirichletAllocation(n_topics=number_of_clusters, max_iter=5,
                                    learning_method='online', learning_offset=50., random_state=0)
    lda.fit(train)
    if self.results:
        print("done in %0.3fs." % (time() - t0))
    parameters = lda.get_params()
    topics = lda.components_
    doc_topic = lda.transform(train)
    top10, labels = self.printTopicCluster(topics, doc_topic, feature_names)
    labels = numpy.asarray(labels)
    if self.results:
        print("Silhouette Coefficient {0}: {1}".format(name, metrics.silhouette_score(train, labels)))
    return name, parameters, top10, labels
def score_lda(src, dst):
    # read sentence pairs into two lists
    b1 = []
    b2 = []
    lines = 0
    with open(src) as p:
        for i, line in enumerate(p):
            s = line.split('\t')
            b1.append(s[0])
            b2.append(s[1][:-1])  # remove \n
            lines = i + 1
    vectorizer = CountVectorizer()
    vectors = vectorizer.fit_transform(b1 + b2)
    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                    learning_method='online', learning_offset=50., random_state=0)
    X = lda.fit_transform(vectors)
    print(X.shape)
    b1_v = vectorizer.transform(b1)
    b2_v = vectorizer.transform(b2)
    b1_vecs = lda.transform(b1_v)
    b2_vecs = lda.transform(b2_v)
    res = [round(5 * (1 - spatial.distance.cosine(b1_vecs[i], b2_vecs[i])), 2) for i in range(lines)]
    with open(dst, 'w') as thefile:
        thefile.write("\n".join(str(i) for i in res))
def fit_lda(tf):
    '''Takes in a tf sparse vector and finds the top topics.'''
    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                    learning_method='online', learning_offset=50., random_state=0)
    lda.fit(tf)
    tf_feature_names = tf_vectorizer.get_feature_names()
    lda_topic_dict = print_top_words(lda, tf_feature_names, n_top_words)
    return lda, lda_topic_dict
def LDA(tf, word):
    lda = LatentDirichletAllocation(n_topics=30, max_iter=5,
                                    learning_method='online', learning_offset=50., random_state=0)
    lda.fit(tf)
    print_top_words(lda, word, 20)
def basic_lda(df, n_topics=200, max_df=0.5, min_df=5):
    '''
    Basic LDA model for album recommendations

    Args:
        df: dataframe with Pitchfork reviews
        n_topics: number of lda topics
        max_df: max_df in CountVectorizer
        min_df: min_df in CountVectorizer
    Returns:
        cv: sklearn fitted CountVectorizer
        cv_trans: sparse matrix with count-transformed data
        lda: sklearn fitted LatentDirichletAllocation
        lda_trans: dense array with lda transformed data
    '''
    X = df['review']
    cv = CountVectorizer(stop_words='english', min_df=min_df, max_df=max_df)
    cv_trans = cv.fit_transform(X)
    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=7)
    lda_trans = lda.fit_transform(cv_trans)
    return cv, cv_trans, lda, lda_trans
def get_features(vocab):
    vectorizer_head = TfidfVectorizer(vocabulary=vocab, use_idf=False, norm='l2')
    X_train_head = vectorizer_head.fit_transform(headlines)

    vectorizer_body = TfidfVectorizer(vocabulary=vocab, use_idf=False, norm='l2')
    X_train_body = vectorizer_body.fit_transform(bodies)

    # Calculates the n most important topics of the bodies. Each topic contains all words,
    # ordered by importance. The more important topic words a body contains for a certain
    # topic, the higher its value for this topic.
    lda_body = LatentDirichletAllocation(n_topics=n_topics, learning_method='online', random_state=0, n_jobs=3)

    print("latent_dirichlet_allocation_cos: fit and transform body")
    t0 = time()
    lda_body_matrix = lda_body.fit_transform(X_train_body)
    print("done in %0.3fs." % (time() - t0))

    print("latent_dirichlet_allocation_cos: transform head")
    # Use the LDA trained on body topics on the headlines => if the headlines and bodies
    # share topics, their vectors should be similar.
    lda_head_matrix = lda_body.transform(X_train_head)

    # print_top_words(lda_body, vectorizer_body.get_feature_names(), 100)

    print('latent_dirichlet_allocation_cos: calculating cosine distance between head and body')
    # Calculate cosine distance between the body and head
    X = []
    for i in range(len(lda_head_matrix)):
        X_head_vector = np.array(lda_head_matrix[i]).reshape((1, -1))  # 1d array is deprecated
        X_body_vector = np.array(lda_body_matrix[i]).reshape((1, -1))
        cos_dist = cosine_distances(X_head_vector, X_body_vector).flatten()
        X.append(cos_dist.tolist())
    return X
def plot_perplexity_iter(A_tfidf, num_topics):
    print("computing perplexity vs iter...")
    max_iter = 5
    perplexity = []
    em_iter = []
    for sweep in range(1, max_iter + 1):
        lda = LatentDirichletAllocation(n_topics=num_topics, max_iter=sweep, learning_method='online',
                                        batch_size=512, random_state=0, n_jobs=-1)
        tic = time()
        lda.fit(A_tfidf)  # online VB
        toc = time()
        print("sweep %d, elapsed time: %.4f sec" % (sweep, toc - tic))
        perplexity.append(lda.perplexity(A_tfidf))
        em_iter.append(lda.n_batch_iter_)
    # end
    np.save('./data/perplexity_iter.npy', perplexity)

    f = plt.figure()
    plt.plot(em_iter, perplexity, color='b', marker='o', lw=2.0, label='perplexity')
    plt.title('Perplexity (LDA, online VB)')
    plt.xlabel('EM iter')
    plt.ylabel('Perplexity')
    plt.grid(True)
    plt.legend()
    plt.show()
    f.savefig('./figures/perplexity_iter.png')
def plot_perplexity_batch(A_tfidf, num_docs):
    print("computing perplexity vs batch size...")
    max_iter = 5
    num_topics = 10
    batch_size = np.logspace(6, 10, 5, base=2).astype(int)
    perplexity = np.zeros((len(batch_size), max_iter))
    em_iter = np.zeros((len(batch_size), max_iter))
    for ii, mini_batch in enumerate(batch_size):
        for jj, sweep in enumerate(range(1, max_iter + 1)):
            lda = LatentDirichletAllocation(n_topics=num_topics, max_iter=sweep, learning_method='online',
                                            batch_size=mini_batch, random_state=0, n_jobs=-1)
            tic = time()
            lda.fit(A_tfidf)  # online VB
            toc = time()
            print("sweep %d, elapsed time: %.4f sec" % (sweep, toc - tic))
            perplexity[ii, jj] = lda.perplexity(A_tfidf)
            em_iter[ii, jj] = lda.n_batch_iter_
        # end
    # end
    np.save('./data/perplexity.npy', perplexity)
    np.save('./data/em_iter.npy', em_iter)

    f = plt.figure()
    for mb in range(len(batch_size)):
        plt.plot(em_iter[mb, :], perplexity[mb, :], color=np.random.rand(3,), marker='o', lw=2.0,
                 label='mini_batch: ' + str(batch_size[mb]))
    plt.title('Perplexity (LDA, online VB)')
    plt.xlabel('EM iter')
    plt.ylabel('Perplexity')
    plt.grid(True)
    plt.legend()
    plt.show()
    f.savefig('./figures/perplexity_batch.png')
def plot_perplexity_topics(A_tfidf):
    print("computing perplexity vs K...")
    max_iter = 5  # based on plot_perplexity_iter()
    # num_topics = np.linspace(2, 20, 5).astype(np.int)
    num_topics = np.logspace(1, 2, 5).astype(np.int)
    perplexity = []
    em_iter = []
    for k in num_topics:
        lda = LatentDirichletAllocation(n_topics=k, max_iter=max_iter, learning_method='online',
                                        batch_size=512, random_state=0, n_jobs=-1)
        tic = time()
        lda.fit(A_tfidf)  # online VB
        toc = time()
        print("K = %d, elapsed time: %.4f sec" % (k, toc - tic))
        perplexity.append(lda.perplexity(A_tfidf))
        em_iter.append(lda.n_batch_iter_)
    # end
    np.save('./data/perplexity_topics.npy', perplexity)
    np.save('./data/perplexity_topics2.npy', num_topics)

    f = plt.figure()
    plt.plot(num_topics, perplexity, color='b', marker='o', lw=2.0, label='perplexity')
    plt.title('Perplexity (LDA, online VB)')
    plt.xlabel('Number of Topics, K')
    plt.ylabel('Perplexity')
    plt.grid(True)
    plt.legend()
    plt.show()
    f.savefig('./figures/perplexity_topics.png')
def lda_tuner(ingroup_otu, best_models):
    best_score = -1 * np.inf
    dtp_series = [0.0001, 0.001, 0.01, 0.1, 0.2]
    twp_series = [0.0001, 0.001, 0.01, 0.1, 0.2]
    topic_series = [3]
    X = ingroup_otu.values
    eval_counter = 0
    for topics in topic_series:
        for dtp in dtp_series:
            for twp in twp_series:
                eval_counter += 1
                X_train, X_test = train_test_split(X, test_size=0.5)
                lda = LatentDirichletAllocation(n_topics=topics, doc_topic_prior=dtp, topic_word_prior=twp,
                                                learning_method='batch', random_state=42, max_iter=20)
                lda.fit(X_train)
                this_score = lda.score(X_test)
                this_perplexity = lda.perplexity(X_test)
                if this_score > best_score:
                    best_score = this_score
                    print("New Max Likelihood: {}".format(best_score))
                print("#{}: n:{}, dtp:{}, twp:{}, score:{}, perp:{}".format(
                    eval_counter, topics, dtp, twp, this_score, this_perplexity))
                best_models.append({'n': topics, 'dtp': dtp, 'twp': twp,
                                    'score': this_score, 'perp': this_perplexity})
                if (dtp == dtp_series[-1]) and (twp == twp_series[-1]):
                    # extra evaluation with the default-style symmetric prior 1/topics
                    eval_counter += 1
                    X_train, X_test = train_test_split(X, test_size=0.5)
                    lda = LatentDirichletAllocation(n_topics=topics, doc_topic_prior=1. / topics,
                                                    topic_word_prior=1. / topics,
                                                    learning_method='batch', random_state=42, max_iter=20)
                    lda.fit(X_train)
                    this_score = lda.score(X_test)
                    this_perplexity = lda.perplexity(X_test)
                    if this_score > best_score:
                        best_score = this_score
                        print("New Max Likelihood: {}".format(best_score))
                    print("#{}: n:{}, dtp:{}, twp:{}, score:{} perp: {}".format(
                        eval_counter, topics, (1. / topics), (1. / topics), this_score, this_perplexity))
                    best_models.append({'n': topics, 'dtp': (1. / topics), 'twp': (1. / topics),
                                        'score': this_score, 'perp': this_perplexity})
    return best_models
def extractTopicLDA(func_message_dic, store_cloumn):
    if len(func_message_dic) == 0:
        print("func_message_dic is null")
        return False
    try:
        conn = MySQLdb.connect(host='192.168.162.122', user='******', passwd='123456', port=3306)
        cur = conn.cursor()
        cur.execute('set names utf8mb4')
        conn.select_db('codeAnalysis')
        for function in func_message_dic:
            message = func_message_dic[function]
            np_extractor = nlp.semantics_extraction.NPExtractor(message)
            text = np_extractor.extract()
            if len(text) == 0:
                continue
            tf_vectorizer = CountVectorizer(max_df=1.0, min_df=1, max_features=n_features, stop_words='english')
            tf = tf_vectorizer.fit_transform(text)
            print("Fitting LDA models with tf features, n_samples=%d and n_features=%d..." % (n_samples, n_features))
            lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                            learning_method='online', learning_offset=50., random_state=0)
            lda.fit(tf)
            tf_feature_names = tf_vectorizer.get_feature_names()
            seprator = " "
            for topic_idx, topic in enumerate(lda.components_):
                keywords = seprator.join([tf_feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]])
                sql = "update func_semantic set " + store_cloumn + " = '" + keywords + "' where func_name = '" + function + "'"
                print(sql)
                cur.execute(sql)
                conn.commit()
        cur.close()
        conn.close()
        return True
    except MySQLdb.Error as e:
        print(e)
        raise
def _get_model_LDA(self, corpus):
    # lda = models.LdaModel(corpus, id2word=self.corpus.dictionary, num_topics=5, alpha='auto', eval_every=50)
    lda = LatentDirichletAllocation(n_topics=self.num_of_clusters, max_iter=20,
                                    learning_method='online', learning_offset=50., random_state=1)
    return lda.fit_transform(corpus)
def produceLDATopics():
    '''
    Takes the description of each game and uses sklearn's latent dirichlet allocation
    and count vectorizer to extract topics.
    :return: pandas data frame with topic weights for each game (rows) and topic (columns)
    '''
    data_samples, gameNames = create_game_profile_df(game_path)
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=n_features, stop_words='english')
    tf = tf_vectorizer.fit_transform(data_samples)
    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                    learning_method='online', learning_offset=50., random_state=0)
    topics = lda.fit_transform(tf)
    # for i in range(50):
    #     gameTopics = []
    #     for j in range(len(topics[0])):
    #         if topics[i, j] > 1.0 / float(n_topics):
    #             gameTopics.append(j)
    #     print(gameNames[i], gameTopics)
    topicsByGame = pandas.DataFrame(topics)
    topicsByGame.index = gameNames
    print(topicsByGame)
    tf_feature_names = tf_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(lda.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([tf_feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
    return topicsByGame
def topicmodel(comments):
    _texts = []
    texts = []
    for c in comments:
        c = c['text']
        _texts.append(c)
        texts.append(c)

    tf_vectorizer = CountVectorizer(max_df=.20, min_df=10, stop_words=stopwords)
    texts = tf_vectorizer.fit_transform(texts)

    # test a range of topic counts and keep the model with the best log-likelihood
    topics = {}
    for k in range(2, 10):
        print("Testing", k)
        model = LatentDirichletAllocation(n_topics=k, max_iter=5,
                                          learning_method='batch', learning_offset=50., random_state=0)
        model.fit(texts)
        ll = model.score(texts)
        topics[ll] = model
    topic = max(topics.keys())

    ret = collections.defaultdict(list)
    # ugly, rewrite some day
    model = topics[topic]

    # for debugging, print the chosen model's topic words
    feature_names = tf_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i] for i in topic.argsort()[:-5 - 1:-1]]))
        print()

    for i, topic in enumerate(model.transform(texts)):
        topic = numpy.argmax(topic)
        text = _texts[i].encode('utf8')
        ret[topic].append(text)

    return ret
def latdirall(content):
    lda = LatentDirichletAllocation(n_topics=10)
    tf_vectorizer = TfidfVectorizer(max_df=0.99, min_df=1, stop_words='english')
    tf = tf_vectorizer.fit_transform(content)
    lolz = lda.fit_transform(tf)
    tfidf_feature_names = tf_vectorizer.get_feature_names()
    return top_topics(lda, tfidf_feature_names, 10)
class LDATopics:
    # Constructor
    def __init__(self, filename):
        # Member variables
        self.email_data = []
        self.lda = None
        self.feature_names = None
        self.num_topics = NUM_TOPICS
        self.num_words_per_topic = NUM_WORDS_PER_TOPIC
        self.num_features = NUM_FEATURES

        # Load emails from full path to file
        emails = EmailLoader(filename).get_email_dict_array()

        # Process emails into a list of email body contents
        for email_rec in emails:
            if email_rec['body']:
                # Clean the text and add to list
                cleaner = TextCleaner(email_rec['body'])
                self.email_data.append(" ".join(cleaner.tokenize_str()))

    ## Public methods ##

    def process(self, topics=None, features=None):
        # Check if default numbers should be used
        if topics is None:
            topics = self.num_topics
        if features is None:
            features = self.num_features

        # Calculate term frequency for LDA
        tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=features, stop_words='english')
        tf = tf_vectorizer.fit_transform(self.email_data)

        # Fit the LDA model to data samples
        self.lda = LatentDirichletAllocation(n_topics=topics, max_iter=5,
                                             learning_method='online', learning_offset=50., random_state=0)
        self.lda.fit(tf)

        # Set the feature names (words)
        self.feature_names = tf_vectorizer.get_feature_names()

    def print_topics(self, words_per_topic=None):
        # Check if default number of words per topic should be used
        if words_per_topic is None:
            words_per_topic = self.num_words_per_topic

        self._print_topics(self.lda, self.feature_names, words_per_topic)

    ## Private methods ##

    def _print_topics(self, model, feature_names, words_per_topic):
        for topic_idx, topic in enumerate(model.components_):
            print("Topic #%d:" % topic_idx)
            print(" ".join([feature_names[i] for i in topic.argsort()[:-words_per_topic - 1:-1]]))
            print()
def perform_analysis(self, stocks, szTimeAxis, n_ahead):
    # load Snowball comment data
    from agares.datasource.snowball_cmt_loader import SnowballCmtLoader
    SBLoader = SnowballCmtLoader()
    date = self.dt_start.date()
    df_cmt_list = []
    while date <= self.dt_end.date():
        df_cmt_list.append(SBLoader.load(str(date)))
        date += timedelta(days=1)
    df_cmt = pd.concat(df_cmt_list, ignore_index=True)

    # Chinese text segmentation
    self.set_jieba()
    df_cmt['RawComment'] = df_cmt['RawComment'].map(jieba.cut)

    # drop stopwords
    self.stopwords = [line.strip() for line in open('stopwords').readlines()]
    self.stopwords.append(' ')
    df_cmt['RawComment'] = df_cmt['RawComment'].map(self.drop_useless_word)
    cmt = df_cmt['RawComment'].tolist()

    # construct tfidf matrix
    tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_df=0.95, min_df=0.05)
    tfidf = tfidf_vectorizer.fit_transform(cmt)

    # Fit the NMF model
    n_topics = 5
    n_top_words = 20
    print("Fitting the NMF model with tf-idf features..")
    t0 = time()
    nmf = NMF(n_components=n_topics, random_state=1, alpha=.1, l1_ratio=.5).fit(tfidf)
    print("done in %0.3fs." % (time() - t0))
    print("\nTopics in NMF model:")
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()
    self.print_top_words(nmf, tfidf_feature_names, n_top_words)

    # Fit the LDA model
    print("Fitting LDA models with tf-idf features..")
    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=10,
                                    learning_method='online', learning_offset=50., random_state=0)
    t0 = time()
    lda.fit(tfidf)
    print("done in %0.3fs." % (time() - t0))
    print("\nTopics in LDA model:")
    self.print_top_words(lda, tfidf_feature_names, n_top_words)

    # load sz daily candlestick data
    sz = next(iter(stocks))
    cst_Day = stocks[sz].cst['1Day']
    # print close price within the timescope
    date = self.dt_start
    print()
    print("The ShangHai stock Index (close index) within the timescope")
    while date <= self.dt_end:
        ts = pd.to_datetime(date)
        try:
            print("Date: {0:s}, Index: {1:.2f}".format(str(date.date()), cst_Day.at[ts, 'close']))
        except KeyError:
            # sz candlestick data does not exist at this datetime
            print("Date: {0:s}, Index: (market closed)".format(str(date.date())))
        date += timedelta(days=1)
def LDA(matrix, preserve, n_topics=100):
    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=10,
                                    learning_method='online', learning_offset=50.,
                                    random_state=randint(1, 100))
    lda.fit(matrix[preserve])
    topic_model = lda.transform(matrix)
    return topic_model
def test_lda_transform():
    # Test LDA transform.
    # Transform result cannot be negative
    rng = np.random.RandomState(0)
    X = rng.randint(5, size=(20, 10))
    n_topics = 3
    lda = LatentDirichletAllocation(n_topics=n_topics, random_state=rng)
    X_trans = lda.fit_transform(X)
    assert_true((X_trans > 0.0).any())
def test_lda_transform():
    # Test LDA transform.
    # Transform result cannot be negative and should be normalized
    rng = np.random.RandomState(0)
    X = rng.randint(5, size=(20, 10))
    n_topics = 3
    lda = LatentDirichletAllocation(n_topics=n_topics, random_state=rng)
    X_trans = lda.fit_transform(X)
    assert_true((X_trans > 0.0).any())
    assert_array_almost_equal(np.sum(X_trans, axis=1), np.ones(X_trans.shape[0]))
def test_lda_fit_transform(method):
    # Test LDA fit_transform & transform
    # fit_transform and transform result should be the same
    rng = np.random.RandomState(0)
    X = rng.randint(10, size=(50, 20))
    lda = LatentDirichletAllocation(n_components=5, learning_method=method, random_state=rng)
    X_fit = lda.fit_transform(X)
    X_trans = lda.transform(X)
    assert_array_almost_equal(X_fit, X_trans, 4)
def test_lda_transform_mismatch():
    # test `n_features` mismatch in partial_fit and transform
    rng = np.random.RandomState(0)
    X = rng.randint(4, size=(20, 10))
    X_2 = rng.randint(4, size=(10, 8))

    n_topics = rng.randint(3, 6)
    lda = LatentDirichletAllocation(n_topics=n_topics, random_state=rng)
    lda.partial_fit(X)
    assert_raises_regexp(ValueError, r"^The provided data has", lda.partial_fit, X_2)
def test_lda_score_perplexity():
    # Test the relationship between LDA score and perplexity
    n_components, X = _build_sparse_mtx()
    lda = LatentDirichletAllocation(n_components=n_components, max_iter=10, random_state=0)
    lda.fit(X)
    perplexity_1 = lda.perplexity(X, sub_sampling=False)

    score = lda.score(X)
    perplexity_2 = np.exp(-1. * (score / np.sum(X.data)))
    assert_almost_equal(perplexity_1, perplexity_2)
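# The test above exercises the relationship between score and perplexity in scikit-learn:
# perplexity(X) == exp(-score(X) / N), where N is the total word count of X. Below is a
# minimal standalone sketch (not from any of the snippets above; toy data and names chosen
# purely for illustration) showing the same identity on a dense toy matrix.
import numpy as np
from sklearn.decomposition import LatentDirichletAllocation

rng = np.random.RandomState(0)
X = rng.randint(5, size=(20, 10))  # toy document-term counts
lda = LatentDirichletAllocation(n_components=3, max_iter=10, random_state=0).fit(X)

n_words = X.sum()  # total token count in the corpus
print(lda.perplexity(X))                 # should match the value below
print(np.exp(-lda.score(X) / n_words))   # exp(-score / total word count)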
def test_lda_partial_fit_dim_mismatch():
    # test `n_features` mismatch in `partial_fit`
    rng = np.random.RandomState(0)
    n_topics = rng.randint(3, 6)
    n_col = rng.randint(6, 10)
    X_1 = np.random.randint(4, size=(10, n_col))
    X_2 = np.random.randint(4, size=(10, n_col + 1))
    lda = LatentDirichletAllocation(n_topics=n_topics, learning_offset=5., total_samples=20, random_state=rng)
    lda.partial_fit(X_1)
    assert_raises_regexp(ValueError, r"^The provided data has", lda.partial_fit, X_2)
def test_perplexity_input_format():
    # Test LDA perplexity for sparse and dense input
    # score should be the same for both dense and sparse input
    n_components, X = _build_sparse_mtx()
    lda = LatentDirichletAllocation(n_components=n_components, max_iter=1,
                                    learning_method='batch', total_samples=100, random_state=0)
    lda.fit(X)
    perp_1 = lda.perplexity(X)
    perp_2 = lda.perplexity(X.toarray())
    assert_almost_equal(perp_1, perp_2)
def get_lda():
    lda = LatentDirichletAllocation(n_topics=K, max_iter=5,
                                    learning_method='online', learning_offset=50., random_state=0)
    lda.fit(X)
    tf_feature_names = VECTORIZER.get_feature_names()
    print_top_words(lda, tf_feature_names, 10)
    return lda
def calculate_lda(self, tfidf):
    print("Fitting LDA models with tf features...")
    lda = LatentDirichletAllocation(n_topics=self.num_topics, max_iter=5,
                                    learning_method='online', learning_offset=50., random_state=0)
    t0 = time()
    lda.fit(tfidf)
    print("Topics in LDA model:")
    print_top_words(lda, self.tfidf_feature_names, self.num_words)
    print("done in %0.3fs." % (time() - t0))
def test_doc_topic_distr_deprecation():
    # Test that the appropriate warning message is displayed when a user
    # attempts to pass the doc_topic_distr argument to the perplexity method
    n_components, X = _build_sparse_mtx()
    lda = LatentDirichletAllocation(n_components=n_components, max_iter=1,
                                    learning_method='batch', total_samples=100, random_state=0)
    distr1 = lda.fit_transform(X)
    distr2 = None
    assert_warns(DeprecationWarning, lda.perplexity, X, distr1)
    assert_warns(DeprecationWarning, lda.perplexity, X, distr2)
def test_lda_default_prior_params():
    # default prior parameter should be `1 / topics`
    # and verbose params should not affect result
    n_topics, X = _build_sparse_mtx()
    prior = 1. / n_topics
    lda_1 = LatentDirichletAllocation(n_topics=n_topics, doc_topic_prior=prior,
                                      topic_word_prior=prior, random_state=0)
    lda_2 = LatentDirichletAllocation(n_topics=n_topics, random_state=0)

    topic_distr_1 = lda_1.fit_transform(X)
    topic_distr_2 = lda_2.fit_transform(X)
    assert_almost_equal(topic_distr_1, topic_distr_2)
def latdirall(content):
    lda = LatentDirichletAllocation(n_topics=5)
    tf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
    tf = tf_vectorizer.fit_transform(content)
    lolz = lda.fit_transform(tf)
    tfidf_feature_names = tf_vectorizer.get_feature_names()
    tops = top_topics(lda, tfidf_feature_names, 10)
    wordlist = []
    for topic in tops:
        wordlist += topic
    return wordlist
transformer = TfidfTransformer()
cntTf = vectorizer.fit_transform(apps)
tfidf = transformer.fit_transform(cntTf)
word = vectorizer.get_feature_names()
weight = tfidf.toarray()
df_weight = pd.DataFrame(weight)
feature = df_weight.columns
df_weight['sum'] = 0
for f in tqdm(feature):
    df_weight['sum'] += df_weight[f]
deviceid_packages['tfidf_sum'] = df_weight['sum']

# In[10]:

lda = LatentDirichletAllocation(n_topics=5, learning_offset=50., random_state=666)
docres = lda.fit_transform(cntTf)

# In[11]:

deviceid_packages = pd.concat([deviceid_packages, pd.DataFrame(docres)], axis=1)

# In[12]:

temp = deviceid_packages.drop('apps', axis=1)
deviceid_train = pd.merge(deviceid_train, temp, on='device_id', how='left')

# In[13]:
############################################# Dimensionality reduction ######################################################
# user_action_ = user_action[user_action['time'] < '2017-04-01']
user_action_ = user_action[user_action['a_date'] < '2017-04-01']
mapping = {}
for sample in user_action_[['user_id', 'sku_id']].values:
    mapping.setdefault(sample[0], []).append(str(sample[1]))
cate1s = list(mapping.keys())
print(len(cate1s))
cate2_as_sentence = [' '.join(mapping[cate_]) for cate_ in cate1s]
cate2_as_matrix = CountVectorizer(token_pattern='(?u)\\b\\w+\\b', min_df=2).fit_transform(cate2_as_sentence)

lda = LDA(n_components=5, learning_method='online', batch_size=1000, n_jobs=40, random_state=520)
topics_of_cate1 = lda.fit_transform(cate2_as_matrix)
topics_of_cate1 = pd.DataFrame(topics_of_cate1,
                               columns=["%s_%s_lda_action" % ('user_sku', i) for i in range(5)]).astype('float32')
topics_of_cate1['user_id'] = cate1s
topics_of_cate1.to_hdf(cache_path + 'p1.hdf', 'w')

nmf = NMF(n_components=5, random_state=520, beta_loss='kullback-leibler', solver='mu',
          max_iter=1000, alpha=.1, l1_ratio=.5)
def main(file, userselection, n_pois, output, desc):
    df_reviews = pd.read_csv(file).drop_duplicates(subset=['Tripadvisor'], keep='first')
    # Lowercase all words
    df_reviews['description'] = df_reviews['description'].str.lower()
    # Remove numbers
    df_reviews['description'] = df_reviews['description'].apply(lambda x: re.sub(r'\d+', '', x))
    # Unescape html entities
    df_reviews['description'] = df_reviews['description'].apply(lambda x: (html.unescape(x)))
    # Remove punctuation
    df_reviews['description'] = df_reviews['description'].apply(
        lambda x: x.translate(str.maketrans('', '', string.punctuation)))
    # Remove accents
    df_reviews['description'] = df_reviews['description'].apply(lambda x: unidecode.unidecode(x))
    # Remove specific characters and words
    df_reviews['description'] = df_reviews['description'].apply(lambda x: re.sub("description", '', x))
    df_reviews['description'] = df_reviews['description'].apply(lambda x: re.sub("wikipedia", '', x))
    df_reviews['description'] = df_reviews['description'].apply(lambda x: re.sub("'s", '', x))
    # Stop words and lemmatization
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    df_reviews['tokens'] = df_reviews['description'].apply(lambda x: [
        lemmatizer.lemmatize(word) for word in word_tokenize(x)
        if not word in stop_words and detect_language(word) == 'English'
    ])
    # Get corpus
    corpus = get_corpus(df_reviews)
    seg_list = split_to_words(corpus)
    vectorizer_model = CountVectorizer(stop_words=stop_words, analyzer='word', max_features=2000)
    vec_docs = vectorizer_model.fit_transform(seg_list)
    tf_feature_names = vectorizer_model.get_feature_names()
    no_topics = 10
    no_top_words = 5
    lda = LatentDirichletAllocation(n_components=no_topics, max_iter=5,
                                    learning_method='online', learning_offset=40., random_state=0).fit(vec_docs)
    display_topics(lda, tf_feature_names, no_top_words)
    lda_results = lda.fit_transform(vec_docs)
    df_reviews = get_topics(lda_results, df_reviews)
    topic_dict = display_topics(lda, tf_feature_names, no_top_words)
    h = pd.DataFrame.from_dict(topic_dict, orient='index').transpose().melt()
    df_reviews = df_reviews.merge(h, left_on='topics', right_on='variable', how='left')
    df_reviews = df_reviews.drop(columns=['topics', 'variable', 'tokens'])
    df_reviews = pd.get_dummies(df_reviews, prefix=['keyword'], columns=['value']).drop_duplicates()
    cols = [col for col in df_reviews.columns if 'keyword' not in col]
    df_reviews = df_reviews.groupby(cols).sum().reset_index()
    df_reviews = review_rate(df_reviews)
    selection = user_selection(userselection)
    results = best_results(dict_user(df_reviews, selection), df_reviews, n_pois, relevant=userselection)
    if output != "default":
        results.to_csv("{output}_{sufix}.csv".format(output=output, sufix=selection), index=False)
    desc_ = pd.read_csv(desc, sep="|")
    results = results.merge(desc_, left_on='Tripadvisor', right_on='name', how='inner')
    return results
# #TODO I wanted to show which token combinations the vectorizers used as features.

# ## 6. Topic Modelling
# One way to organize those feature vectors is to search for unsupervised patterns inside the data
# to form topics and then use those topics to classify.

# ### 6.1. Generating topics

# In[42]:

num_topics = 30
number_words = 10

'''
Creating and fitting the LDA model using the count_vectors generated before
'''
lda = LDA(n_components=num_topics, max_iter=20, n_jobs=-1)
topics_vectors = lda.fit_transform(count_vectors)

'''
Printing the topics found by the LDA model
'''
print("Topics found via LDA:")
words = count_vectorizer.get_feature_names()
for topic_idx, topic in enumerate(lda.components_):
    print("\nTopic #%d:" % topic_idx)
    print(" ".join([words[i] for i in topic.argsort()[:-number_words - 1:-1]]))

# Let's see how the topics found are related to each other

# In[49]:
def summarize(self, text, num=100, topic_min=3, judge_topic=None):
    """
    :param text: str
    :param num: int
    :return: list
    """
    # Split into sentences
    if type(text) == str:
        self.sentences = cut_sentence(text)
    elif type(text) == list:
        self.sentences = text
    else:
        raise RuntimeError("text type must be list or str")
    len_sentences_cut = len(self.sentences)
    # Tokenize
    sentences_cut = [[word for word in list(jieba.cut(sentence)) if word.strip()]
                     for sentence in self.sentences]
    # sentences_cut = [[word for word in jieba_cut(extract_chinese(sentence)) if word.strip()] for sentence in self.sentences]
    # Remove stopwords etc.
    self.sentences_cut = [list(filter(lambda x: x not in self.stop_words, sc)) for sc in sentences_cut]
    self.sentences_cut = [" ".join(sc) for sc in self.sentences_cut]
    print(sentences_cut)
    # Compute the term frequencies of each sentence
    vector_c = CountVectorizer(ngram_range=(1, 2), stop_words=self.stop_words)
    tf_ngram = vector_c.fit_transform(self.sentences_cut)
    # Number of topics, chosen heuristically; the minimum number of topics is set to 3
    topic_num = min(topic_min, int(len(sentences_cut) / 2))
    print('topic_num', topic_num)
    lda = LatentDirichletAllocation(n_components=topic_num, max_iter=32,
                                    learning_method='online', learning_offset=50., random_state=2019)
    res_lda_u = lda.fit_transform(tf_ngram.T)
    res_lda_v = lda.components_
    print('res_lda_v', res_lda_v)  # probability mass each topic assigns to each document

    if judge_topic:
        ### Option 1: take the k sentences of the dominant topic
        ##################################################################################
        topic_t_score = np.sum(res_lda_v, axis=-1)
        print('topic_t_score', topic_t_score)
        # For each column (one sentence across topic_num topics), sort the scores; index 0 is the largest
        res_nmf_h_soft = res_lda_v.argsort(axis=0)[-topic_num:][::-1]
        # Count how many sentences each topic dominates
        exist = (res_nmf_h_soft <= 0) * 1.0
        factor = np.ones(res_nmf_h_soft.shape[1])
        topic_t_count = np.dot(exist, factor)
        # Normalize
        topic_t_count /= np.sum(topic_t_count, axis=-1)
        topic_t_score /= np.sum(topic_t_score, axis=-1)
        # Pick the topic that maximizes the combined share of dominated sentences and total score
        topic_t_tc = topic_t_count + topic_t_score
        topic_t_tc_argmax = np.argmax(topic_t_tc)
        # The final scores are those of this dominant topic
        res_nmf_h_soft_argmax = res_lda_v[topic_t_tc_argmax].tolist()
        res_combine = {}
        for l in range(len_sentences_cut):
            res_combine[self.sentences[l]] = res_nmf_h_soft_argmax[l]
        score_sen = [(rc[1], rc[0]) for rc in sorted(res_combine.items(), key=lambda d: d[1], reverse=True)]
        #####################################################################################
    else:
        ### Option 2: take the sentences with the highest topic probability, regardless of topic
        res_combine = {}
        for i in range(len_sentences_cut):
            res_row_i = res_lda_v[:, i]
            res_row_i_argmax = np.argmax(res_row_i)
            res_combine[self.sentences[i]] = res_row_i[res_row_i_argmax]
        score_sen = [(rc[1], rc[0]) for rc in sorted(res_combine.items(), key=lambda d: d[1], reverse=True)]
    num_min = min(num, len(self.sentences))
    return score_sen[0:num_min]
# In[35]:

feedback = []
for y, santens in enumerate(train_headlines_sentens):
    # Now, we obtain a counts design matrix, for which we use sklearn's CountVectorizer module.
    # The transformation will return a matrix of size (Documents x Features), where the value of
    # a cell is the number of times the feature (word) appears in that document.
    # To reduce the size of the matrix and speed up computation, we set the maximum feature
    # size to 5000, which keeps the top 5000 features that can contribute to our model.
    vectorizer = CountVectorizer(analyzer='word', ngram_range=(2, 3), max_features=5000)
    x_coints = vectorizer.fit_transform(santens)

    transformer = TfidfTransformer(smooth_idf=False)
    x_tfidf = transformer.fit_transform(x_coints)
    xtfidf_norm = normalize(x_tfidf, norm='l1', axis=1)

    model = LatentDirichletAllocation(n_components=num_topics, learning_method='online')
    model.fit(xtfidf_norm)

    toPik = get_lda_topics(model, 7)  # inspect the topics of the model fitted above
    print('tanggal', data_tgl[y])
    print('\n', 'konsep matrix \n', toPik)
    feedback.append(toPik)

# In[36]:

model.fit(x_coints)

# In[37]:

x_coints.shape

# In[38]:
def _lda3(table, input_col, topic_name='topic', num_voca=1000, num_topic=3, num_topic_word=3,
          max_iter=20, learning_method='online', learning_offset=10., random_state=None):
    corpus = np.array(table[input_col])
    if isinstance(corpus[0], np.ndarray):
        tf_vectorizer = CountVectorizer(preprocessor=' '.join, stop_words='english',
                                        max_df=0.95, min_df=2, max_features=num_voca)
    else:
        tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=num_voca, stop_words='english')
    term_count = tf_vectorizer.fit_transform(corpus)
    tf_feature_names = tf_vectorizer.get_feature_names()

    if learning_method == 'online':
        lda_model = LatentDirichletAllocation(n_components=num_topic, max_iter=max_iter,
                                              learning_method=learning_method,
                                              learning_offset=learning_offset,
                                              random_state=random_state).fit(term_count)
    elif learning_method == 'batch':
        lda_model = LatentDirichletAllocation(n_components=num_topic, max_iter=max_iter,
                                              learning_method=learning_method,
                                              random_state=random_state).fit(term_count)
    else:
        raise_runtime_error("Please check 'learning_method'.")

    voca_weights_list = []
    for weights in lda_model.components_:
        pairs = []
        for term_idx, value in enumerate(weights):
            pairs.append((abs(value), tf_feature_names[term_idx]))
        pairs.sort(key=lambda x: x[0], reverse=True)
        voca_weights = []
        for pair in pairs[:num_topic_word]:
            voca_weights.append("{}: {}".format(pair[1], pair[0]))
        voca_weights_list.append(voca_weights)

    doc_topic = lda_model.transform(term_count)

    out_table = pd.DataFrame.copy(table, deep=True)
    if topic_name in table.columns:
        raise BrighticsFunctionException.from_errors([{
            '0100': "Existing table contains Topic Column Name. Please choose again."
        }])
    out_table[topic_name] = [doc_topic[i].argmax() for i in range(len(corpus))]
    weight_list = []
    for ind in out_table[topic_name]:
        weight_list.append(voca_weights_list[ind])
    out_table['topic_vocabularies'] = weight_list
    return {'out_table': out_table}
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

with open("../data/preprocessed_data.pk", 'rb') as fp:
    result = pickle.load(fp)

tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=1000, stop_words='english')
result = list(set(result))
tf = tf_vectorizer.fit_transform(result)
tf_feature_names = tf_vectorizer.get_feature_names()

# tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, tokenizer=word_tokenize, stop_words='english')
# result = list(set(result))
# tfidf = tfidf_vectorizer.fit_transform(result)
# tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# Run LDA
no_topics = 3
# nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)
lda = LatentDirichletAllocation(n_components=no_topics, max_iter=10,
                                learning_method='online', learning_offset=50., random_state=0).fit(tf)

with open("../data/lda_data.pk", 'wb') as fp:
    pickle.dump([tf_vectorizer, tf, lda], fp)
print('LDA models secured')
bow = bow_vectorizer.fit_transform(clean_text)
word_counts = bow.toarray()

tfidf_transformer = TfidfTransformer()
tfidf = tfidf_transformer.fit_transform(word_counts)

# dimension reduction
from sklearn.decomposition import TruncatedSVD
TSVD = TruncatedSVD(n_components=200, algorithm="randomized", n_iter=5)
TSVD_fit = TSVD.fit(tfidf)
TSVD_reduced = TSVD.fit_transform(tfidf)

# Latent Dirichlet Allocation
from sklearn.decomposition import LatentDirichletAllocation
lda_ = LatentDirichletAllocation(n_components=50, max_iter=500, learning_method='online',
                                 learning_offset=50., total_samples=len(clean_text), random_state=0)
lda_tx = lda_.fit_transform(word_counts)  # fit transform

# save models - especially important for LDA taking so long to run
import pickle
from sklearn.externals import joblib
# joblib.dump(lda_, 'filename.pkl')
# joblib.dump(lda_tx, 'lda_tx.pkl')
# pickle.dump(lda_, open('lda_output.txt', 'wb'))

m_list1 = [' '.join(el) for el in mesh_list]  # comes in as list of lists

# MeSH term TF IDF
Mesh_bow_vectorizer = CountVectorizer()
with open(my_stop_words_path, errors='ignore') as fr:
    for line in fr.readlines():
        stop_words_dict.append(line.strip())
print('Number of stop words = {}'.format(len(stop_words_dict)))

# Guichuideng (鬼吹灯) text mining, part 4: extracting document topics with an LDA model,
# using sklearn LatentDirichletAllocation and gensim LdaModel.
# Note: the preparation of tfidf_mat is covered in part 3 of this series.
import pickle
tfidf_mat = pickle.load(open('tfidf_mat.txt', 'rb'))

# 1. Fit an LDA model with sklearn and extract document topics
# (1) The parameter n_topics is the number of topics and max_iter the number of iterations.
# (2) Each row of lda_model.components_ represents one topic; each element in a row is the
#     score of the corresponding word under that topic.
from sklearn.decomposition import LatentDirichletAllocation

n_topics = 8  # user-defined number of topics
# DeprecationWarning: n_topics has been renamed to n_components in version 0.19 and will be removed in 0.21
lda_model = LatentDirichletAllocation(n_topics=n_topics, max_iter=10)
# Fit the LDA model on the TF-IDF matrix
lda_model.fit(tfidf_mat)

# Inspect the fitted model
print(lda_model.components_.shape)
print(lda_model.components_[:2])
# (8, 1654)
# Out[105]:
# array([[0.30237038, 0.29720752, 0.31504618, ..., 0.33985295, 0.2906448 , 0.3043558 ],
#        [0.29870912, 0.30435234, 0.31793515, ..., 0.3215601 , 0.32073196, 0.31859002]])

# (3) argsort() returns the indices of the elements, placing the index of the smallest element
#     first and ordering the rest by increasing value.
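# A minimal follow-up sketch (not part of the snippet above): as its deprecation note says,
# n_topics was renamed to n_components in scikit-learn 0.19 and removed in 0.21, so on newer
# versions the same fit is written with n_components. This assumes the tfidf_mat loaded above.
from sklearn.decomposition import LatentDirichletAllocation

lda_model = LatentDirichletAllocation(n_components=8, max_iter=10)  # n_components replaces n_topics
lda_model.fit(tfidf_mat)
print(lda_model.components_.shape)  # (n_components, vocabulary size)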
def print_topics(model, count_vectorizer, n_top_words):
    words = count_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print(" ".join([words[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))


tokens = df["query"].apply(nlp)
tokens = map(lambda text: map(lambda x: x.lemma_, text), tokens)
query_list = [
    " ".join(map(lambda x: str(x) if not nlp.vocab[str(x)].is_stop else "", text))
    for text in tokens
]

lda = LDA(n_components=5)
count_vectorizer = CountVectorizer()
count_data = count_vectorizer.fit_transform(query_list)
output = lda.fit(count_data)
print_topics(lda, count_vectorizer, 1)

all_t_lemma_stop = [
    " ".join(map(lambda x: str(x) if not nlp.vocab[str(x)].is_stop else "", text))
    for text in tokens
]
all_t = ' '.join(map(str, all_t_lemma_stop))
filtered_words = [word for word in str(all_t).split()]
counted_words = collections.Counter(filtered_words)
                                min_df=0.002)  # drop words whose document frequency is too high or too low
tf = tf_vectorizer.fit_transform(corpus)
print(tf.shape)
print(tf)

# ------------------------- Step 3: LDA analysis ------------------------
from sklearn.decomposition import LatentDirichletAllocation

# set the number of topics
n_topics = 2
lda = LatentDirichletAllocation(n_components=n_topics, max_iter=100,
                                learning_method='online', learning_offset=50, random_state=0)
lda.fit(tf)

# show the topic-word matrix (cf. model.topic_word_ in other LDA packages)
print(lda.components_)
# one row per topic, one column per keyword
print(lda.components_.shape)

# compute the perplexity
print(u'Perplexity:')
print(lda.perplexity(tf, sub_sampling=False))

# topic-keyword distribution
def print_top_words(model, tf_feature_names, n_top_words):
# count tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2)
tfidf = tfidf_vectorizer.fit_transform(tweets)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

print('shape *******')
print(tf.shape)

dist = 1 - cosine_similarity(tfidf)
print()

no_topics = 2

# Start Clustering
# lda = LatentDirichletAllocation(n_topics=no_topics, max_iter=100, learning_method='online', learning_offset=50., random_state=0).fit(tf)
nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

# print top tf-idf words
# def display_topics(H, W, feature_names, documents, no_top_words, no_top_documents):
for topic_idx, topic in enumerate(H):
    print("Cluster %d: " % (topic_idx))
    print("".join([
# #### Count Vectorizer

from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer(max_df=0.8, min_df=2, stop_words='english')
doc_term_matrix = count_vect.fit_transform(reviews_datasets['Text'].values.astype('U'))
doc_term_matrix

# #### Use LDA

from sklearn.decomposition import LatentDirichletAllocation

LDA = LatentDirichletAllocation(n_components=5, random_state=42)
LDA.fit(doc_term_matrix)

"""
Randomly fetches 10 words from our vocabulary
"""
import random
for i in range(10):
    random_id = random.randint(0, len(count_vect.get_feature_names()))
    print(count_vect.get_feature_names()[random_id])

first_topic = LDA.components_[0]
top_topic_words = first_topic.argsort()[-10:]
for i in top_topic_words:
print(df.head(3))
print(df)

df = df.head(260000)

from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english', max_df=.1, max_features=5000)
X = count.fit_transform(df['review'].values)

# In[3]:

from sklearn.decomposition import LatentDirichletAllocation

lda = LatentDirichletAllocation(n_components=14, random_state=123, learning_method='batch')
X_topics = lda.fit_transform(X)

# In[4]:

lda.components_.shape

# In[5]:

n_top_words = 15
feature_names = count.get_feature_names()
for topic_idx, topic in enumerate(lda.components_):
    print("Topic %d:" % (topic_idx + 1))
    print(" ".join([feature_names[i]
corpus = [
    'bread bread bread bread bread bread bread bread bread bread',
    'milk milk milk milk milk milk milk milk milk milk',
    'pet pet pet pet pet pet pet pet pet pet',
    'bread bread bread bread bread bread bread bread bread bread milk milk milk milk milk milk milk milk milk milk',
]

from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer()
matrix_X = vec.fit_transform(corpus)

from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(n_components=2, topic_word_prior=0.1, doc_topic_prior=0.1)
lda.fit(matrix_X)

for topic in lda.components_:
    print([topic[t] for t in topic.argsort()[::-1]])
print(lda.transform(matrix_X))
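# A short follow-up sketch (not part of the original toy example above): components_ holds
# unnormalized pseudo-counts, so each row can be normalized into a per-topic word distribution
# and paired with the vectorizer's vocabulary, as some of the larger snippets below also do.
# It reuses the vec and lda objects fitted above.
import numpy as np

feature_names = vec.get_feature_names()
topic_word = lda.components_ / lda.components_.sum(axis=1)[:, np.newaxis]
for idx, dist in enumerate(topic_word):
    top = dist.argsort()[::-1][:3]  # three most probable words per topic
    print("Topic %d:" % idx, [(feature_names[i], round(dist[i], 3)) for i in top])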
plt.xticks(x_pos, words, rotation=90)
plt.xlabel('words')
plt.ylabel('counts')
plt.show()

# Initialise the count vectorizer with the English stop words
count_vectorizer = CountVectorizer(stop_words='english')
# Fit and transform the processed titles
count_data = count_vectorizer.fit_transform(padfpers['text'])
# Visualise the 10 most common words
plot_10_most_common_words(count_data, count_vectorizer)

warnings.simplefilter("ignore", DeprecationWarning)

# Load the LDA model from sk-learn
top_words = list()

# Helper function
def print_topics(model, count_vectorizer, n_top_words):
    words = count_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        top_words.extend(([words[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))

# Tweak the two parameters below
number_topics = 50
number_words = 100

# Create and fit the LDA model
res = LDA(n_components=number_topics, n_jobs=-1)
res.fit(count_data)

# Print the topics found by the LDA model
print_topics(res, count_vectorizer, number_words)
doc_id = read_doc_list()
for query in doc_list:
    print(len(doc_list[query]))
    data_samples = build_matrix(doc_list[query])
    tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                       # max_features=n_features,
                                       stop_words='english')
    tfidf = tfidf_vectorizer.fit_transform(data_samples)
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()
    lda = LatentDirichletAllocation(n_topics=20, max_iter=5,
                                    learning_method='online', learning_offset=50., random_state=0)
    lda.fit(tfidf)
    of = open("./../data/query_result.txt", "a")
    of.write("Query " + str(query) + "\n")
    print_top_words(lda, tfidf_feature_names, of)
    of.write("\n")
    doc_topic = lda.transform(tfidf)
    of2 = open("./../data/doc_result.txt", "a")
    of2.write("Query " + str(query) + "\n")
    print_doc_topic(doc_topic, doc_id, query, of2)
    of2.write("\n")
# --- Test set
test_set = []
y_test = []
u_test = []
i = -1
for text in entity:
    i += 1
    if sampling[i] != 3:
        continue
    test_set.append(text)
    y_test.append(Score[i])
    u_test.append(UserId[i])

X_new_counts = count_vect.transform(test_set)

model_lda = LatentDirichletAllocation(n_topics=100)
X_train_lda = model_lda.fit_transform(X_train_counts)
X_test_lda = model_lda.transform(X_new_counts)

from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn import linear_model

ans_simple = OneVsRestClassifier(LinearSVC(random_state=0)).fit(X_train_lda, y_train).predict(X_test_lda)

rmse_simple = 0
mae_simple = 0
for i, ans in zip(range(0, len(y_test)), y_test):
    mae_simple += abs(ans_simple[i] - ans)
    rmse_simple += (ans_simple[i] - ans) ** 2
print('mae_SVM', mae_simple * 1.0 / (len(y_test)))
print('rmse_SVM', (rmse_simple * 1.0 / (len(y_test))) ** (0.5))

for name2, algo in zip(['UBR-1', 'UBR-2'], [1, 2]):
def _lda(table, input_col, num_voca=1000, num_topic=3, num_topic_word=3, max_iter=20,
         learning_method='online', learning_offset=10., random_state=None):
    corpus = table[input_col]
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=num_voca, stop_words='english')
    term_count = tf_vectorizer.fit_transform(corpus)
    tf_feature_names = tf_vectorizer.get_feature_names()

    if learning_method == 'online':
        lda_model = LatentDirichletAllocation(n_components=num_topic, max_iter=max_iter,
                                              learning_method=learning_method,
                                              learning_offset=learning_offset,
                                              random_state=random_state).fit(term_count)
    elif learning_method == 'batch':
        lda_model = LatentDirichletAllocation(n_components=num_topic, max_iter=max_iter,
                                              learning_method=learning_method,
                                              random_state=random_state).fit(term_count)
    else:
        raise_runtime_error("Please check 'learning_method'.")

    topic_model = pd.DataFrame([])
    topic_idx_list = []
    voca_weights_list = []
    for topic_idx, weights in enumerate(lda_model.components_):
        topic_idx_list.append("Topic {}".format(topic_idx))
        pairs = []
        for term_idx, value in enumerate(weights):
            pairs.append((abs(value), tf_feature_names[term_idx]))
        pairs.sort(key=lambda x: x[0], reverse=True)
        voca_weights = []
        for pair in pairs[:num_topic_word]:
            voca_weights.append("{}: {}".format(pair[1], pair[0]))
        voca_weights_list.append(voca_weights)
    topic_model['topic idx'] = topic_idx_list
    topic_model['topic vocabularies'] = voca_weights_list

    doc_topic = lda_model.transform(term_count)

    doc_classification = pd.DataFrame()
    doc_classification['documents'] = [doc for doc in corpus]
    doc_classification['top topic'] = ["Topic {}".format(doc_topic[i].argmax()) for i in range(len(corpus))]

    params = {
        'Input Column': input_col,
        'Number of Vocabularies': num_voca,
        'Number of Topics': num_topic,
        'Number of Terminologies': num_topic_word,
        'Iterations': max_iter,
        'Learning Method': learning_method,
    }

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""# Latent Dirichlet Allocation Result"""))
    rb.addMD(strip_margin("""
    |
    |### Parameters
    |
    | {display_params}
    |
    |### Topic Model
    |
    |{topic_model}
    |
    |### Documents Classification
    |
    |{doc_classification}
    |
    """.format(display_params=dict2MD(params),
               topic_model=pandasDF2MD(topic_model, num_rows=num_topic + 1),
               doc_classification=pandasDF2MD(doc_classification, num_rows=len(corpus) + 1))))

    model = _model_dict('lda')
    model['parameter'] = params
    model['topic_model'] = topic_model
    model['documents_classification'] = doc_classification
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
def __call__(self, doc):
    return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]


# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.90, min_df=2, max_features=N_FEATURES,
                                stop_words='english', tokenizer=LemmaTokenizer())
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))

print("Fitting LDA models with tf features, "
      "N_SAMPLES=%d and N_FEATURES=%d..." % (N_SAMPLES, N_FEATURES))
lda = LatentDirichletAllocation(n_topics=N_TOPICS, max_iter=20,
                                learning_method='batch', learning_offset=50., random_state=0)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, N_TOP_WORDS)
# load in the samples:
data_samples = np.load('./books.npy')

# use tf (raw count) features for LDA:
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=n_features, stop_words='english')
tf = tf_vectorizer.fit_transform(data_samples)

for i in range(20):
    # fit the lda model:
    lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
                                    learning_method='online', learning_offset=50., random_state=0)
    lda.fit(tf)
    # print("\nTopics in LDA model:")
    # tf_feature_names = tf_vectorizer.get_feature_names()
    # print_top_words(lda, tf_feature_names, n_top_words)
    score = lda.score(tf)
    theRecord.append(score)
    print("Log likelihood: ", score, "with ", n_components, "topics")  # we'd like to maximise this
    print("-->Perplexity: ", lda.perplexity(tf))  # we'd like to minimise this
    n_components += 1

best = np.argmax(theRecord) + 1
print("The best number of topics to use is ", best)

print("\nTopics in the best LDA model:")
lda = LatentDirichletAllocation(n_components=best, max_iter=5,
                                learning_method='online', learning_offset=50., random_state=0)
lda.fit(tf)
mrHeader = next(csvreader)

# extracting each data row one by one
for row in csvreader:
    data.append(row)

# sentiments = [["name", "content", "number of positive words",
#                "number of negative words", "positive sentiment", "Negative sentiment",
#                "comment", "like", "retweet", "url"]]
for row in data:
    content.append((row[1]))
    numPosWords.append(float(row[2]))
    numNegWords.append(float(row[3]))
    senti_polarity_pos.append(float(row[4]))
    senti_polarity_neg.append(float(row[5]))
    comment.append(float(row[6]))
    likes.append(float(row[7]))
    retweet.append(float(row[8]))

count_vect = CountVectorizer(max_df=0.8, min_df=2, stop_words="english")
doc_term_matrix = count_vect.fit_transform(content)

LDA = LatentDirichletAllocation(n_components=5, random_state=42)
LDA.fit(doc_term_matrix)

# pyLDAvis.enable_notebook()
# pyLDAvis.sklearn.prepare(LDA, doc_term_matrix, count_vect, mds='tsne')

for i, topic in enumerate(LDA.components_):
    print("Top 10 words for topic#", i)
    print([count_vect.get_feature_names()[i] for i in topic.argsort()[-10:]])
    print("\n")
CV = lda_cv.fit_transform(file_list)
lda_columns = lda_cv.get_feature_names()
df_corpus_lda = pd.DataFrame(CV.toarray(), columns=lda_columns)
df_corpus_lda.head()

# In[7]:

no_topics = 5
max_iterations = 10
learn_off = 50
random = 0

lda_model = LatentDirichletAllocation(n_components=no_topics, max_iter=max_iterations,
                                      learning_method='online', learning_offset=learn_off,
                                      random_state=random)
# lda_model = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10, learning_method='online')
LDA_DH_Model = lda_model.fit_transform(CV)

# In[8]:

print(LDA_DH_Model.shape)  # docs | topics
print(LDA_DH_Model[0])

# In[9]:

print_topics(lda_model, lda_cv)

# In[10]:
import numpy as np
from time import time

# Data
from preprocessing.read_ap import sparse_docs as W_tr
from preprocessing.dictionary import dictionary as dic, \
    inverse_dictionary as inv_dic, terms


def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()


# Model
from sklearn.decomposition import LatentDirichletAllocation

lda = LatentDirichletAllocation(n_components=100, max_iter=20,
                                learning_method='online', learning_offset=50., random_state=0)

t0 = time()
lda.fit(W_tr)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in LDA model:")
print_top_words(lda, terms, 20)
def extract_bonus(text):
    """This method extracts the LDA and LSA features, saving them to a file. Then, the method
    returns the LDA features for use in classification, since LDA is empirically shown to beat
    LSA in general.

    :param text: text to be featurized using LDA or LSA (full comment)
    :param infile: infile to write to (as string) for learnings from this bonus
    :return: all LDA features as matrix
    """
    # we keep words with their tag to see if a differently tagged word has a different topic etc.
    for use_LDA in [False, True]:
        if use_LDA:
            featurizer = CountVectorizer(stop_words='english')
        else:
            featurizer = TfidfVectorizer(sublinear_tf=True, stop_words='english')

        data, labels = zip(*[(c['body'], c['cat']) for c in text])
        new_data = []
        for comment in data:
            row = " ".join([word[:word.rfind('/')] for word in comment.split(' ')])
            # sentence is now just lemmatized, tokenized words separated by spaces,
            # as required by sklearn's CountVectorizer.
            new_data.append(row)
        data = new_data
        labels = [files[lbl][1][0, -1] for lbl in labels]  # transform to integer
        data = featurizer.fit_transform(data)

        n_components = 100
        if use_LDA:
            topic_modeller = LatentDirichletAllocation(n_components=n_components, batch_size=100, random_state=2)
        else:
            topic_modeller = TruncatedSVD(n_components=n_components, n_iter=1, random_state=2)

        data = topic_modeller.fit_transform(data)
        labels = np.array(labels)[:, np.newaxis]
        data = np.concatenate([data, labels], axis=1)

        with open('a1_bonus_lda.txt', 'w' if not use_LDA else 'a') as outf:
            if use_LDA:
                topic_distribution = topic_modeller.components_ / topic_modeller.components_.sum(axis=1)[:, np.newaxis]
                for i in range(n_components):
                    top_10_indices = np.argpartition(topic_distribution[i], -10)[-10:]
                    top_10_words = np.array(featurizer.get_feature_names())[top_10_indices]
                    top_10_probs = topic_distribution[i, top_10_indices]
                    outf.write(f'topic {i} is best described by the 10 words: {top_10_words} '
                               f'with probabilities: {top_10_probs}\n')
            else:
                outf.write(f"explained variance from total variance is: "
                           f"{topic_modeller.explained_variance_ratio_.sum()}\n")

    with open('a1_bonus_lda.txt', 'a') as outf:
        outf.write(
            'We see that topic 0 is likely positive adverbs, similar to what we saw with our KBest feature selection. '
            'Topic 1 is difficult to describe, but could be characterised as how pro-life or thoughtful the text is. '
            'Topic 2 makes a lot of sense, and is related to religion and country of origin. '
            'Since I printed all 100 topics, I will show a select few more. '
            'Topics 97 and 99 are both related to censorship, which makes sense as they relate generally to right-ist views. '
            'Topic 98 relates to compassion and sharing, which could be tied to left-ist views. '
            'As we can see, the topics selected by the LDA generally do correspond with the ideas associated with the views of '
            'different political texts. These topics enable us to reduce the number of dimensions while keeping the important information. '
            'We could use these topics to perform unsupervised learning on the different clusters of text to understand whether there are any class imbalances '
            'in our data, which might result from mis-labelled datapoints (given that our dataset was poorly labelled; texts in a Left channel might have rightist views and just be arguing there).'
        )
    return data
tfidf_vectorizer = TfidfVectorizer(use_idf=True)

# just send in all your docs here
tfidf_vectorizer_vectors = tfidf_vectorizer.fit_transform(df_u_clean)

# get the first vector out (for the first document)
first_vector_tfidfvectorizer = tfidf_vectorizer_vectors[0]

# place tf-idf values in a pandas data frame
df = pd.DataFrame(first_vector_tfidfvectorizer.T.todense(),
                  index=tfidf_vectorizer.get_feature_names(), columns=["tfidf"])
df.sort_values(by=["tfidf"], ascending=False)

lda_model = LatentDirichletAllocation(n_components=20,       # Number of topics
                                      learning_method='online',
                                      random_state=0,
                                      n_jobs=-1)              # Use all available CPUs
lda_output = lda_model.fit_transform(data_vectorized)

topic_keywords = show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=20)
# print(topic_keywords[1])

# learn tfidf using TfidfVectorizer from sklearn
tfidf_vectorizer = TfidfVectorizer(use_idf=True,
                                   ngram_range=(1, 5),                # ngram_range=(1,6)
                                   stop_words='english',
                                   analyzer='word',
                                   min_df=3,                          # minimum required occurrences of a word
                                   lowercase=True,                    # convert all words to lowercase
                                   token_pattern='[a-zA-Z0-9]{3,}',   # num chars > 3
type = "tf" n_features = 10 for i in range(0, 10): if type == "tfidf": vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english') else: vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english') vecs = vectorizer.fit_transform(data["abstract"].tolist()) lda = LatentDirichletAllocation(learning_method="batch").fit(vecs) vectorizers.append(vectorizer) ldas.append(lda) [ print_top_words(lda, vectorizer.get_feature_names(), n_features) for lda, vectorizer in zip(ldas, vectorizers) ] words = [[ set(d) for d in get_top_words(lda, vectorizer.get_feature_names(), n_features) ] for lda, vectorizer in zip(ldas, vectorizers)] distances = np.eye(len(words)) intersections = np.eye(len(words), dtype=object)
con.row_factory = dict_factory
cur = con.cursor()
cur.execute("select * from headlines")
results = cur.fetchall()

# tf-idf the articles
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 3))
X = vectorizer.fit_transform([*map(lambda x: x['text'], results)])
for item in X[0]:
    print(item)
# print(vectorizer.get_feature_names())

svd = TruncatedSVD(n_components=100, n_iter=100)
lda = LatentDirichletAllocation(n_components=10)
L = lda.fit(X)
S = svd.fit(X)

# normalizer = Normalizer(copy=False)
# lsa = make_pipeline(svd, normalizer)
# X = lsa.fit_transform(X)

terms = vectorizer.get_feature_names()
for i, comp in enumerate(S.components_):
    termsInComp = zip(terms, comp)
    sortedTerms = sorted(termsInComp, key=lambda x: x[1], reverse=True)[:20]
    print("Concept %d:" % i)
    for term in sortedTerms:
        print(term[0])
    print(" ")
        tree_repr = unidecode(line.strip())
        if len(tree_repr) == 0:
            continue
        while tree_repr[-1] == ' ':
            tree_repr = tree_repr[:-1]
        test.append(tree_repr)
        pass

vectorizer = CountVectorizer(stop_words=stop_words)
data = vectorizer.fit_transform(test)

start_time = time.time()
lda = LatentDirichletAllocation(n_components=nb_topics, random_state=0)
lda.fit(data)
print('\nSklearn LDA exec time: ' + str(time.time() - start_time) + 's')

# Print the nb_words_topic main words of each topic, for the Sklearn LDA implementation
print("\nTopics found via Sklearn LDA: ")
words = vectorizer.get_feature_names()
for topic_idx, topic in enumerate(lda.components_ / lda.components_.sum(axis=1)[:, np.newaxis]):
    print("\nTopic " + str(topic_idx + 1) + ': ' + str(nb_words_topic) + ' most important words, with p(w|z):')
    topic_sorted = topic.argsort()[:-nb_words_topic - 1:-1]
    probas = np.sort(np.array(topic))[::-1]
    for i in range(len(topic_sorted)):
        print(words[topic_sorted[i]] + ', ' +