from gensim import corpora, models, similarities, utils

# Build the dictionary
dictionary = corpora.Dictionary(train_set)
# Filter out extremely low-frequency noise words
dictionary.filter_extremes(no_below=1, no_above=1, keep_n=None)
# Save the dictionary and the corpus (converted to bag-of-words form) for later use
dictionary.save(output + "all.dic")
corpus = [dictionary.doc2bow(text) for text in train_set]
saveObject(output + "all.cps", corpus)
# Save the original document info
saveObject(output + "all.info", docinfos)

# TF-IDF model
# Train the TF-IDF model on the raw bag-of-words corpus
tfidfModel = models.TfidfModel(corpus)
# Generate TF-IDF vectors with the model
tfidfVectors = tfidfModel[corpus]
# Persist the TF-IDF model and its similarity index
tfidfModel.save(output + "allTFIDF.mdl")
indexTfidf = similarities.MatrixSimilarity(tfidfVectors)
indexTfidf.save(output + "allTFIDF.idx")

# LDA model
lda = models.LdaModel(tfidfVectors, id2word=dictionary, num_topics=30)
lda.save(output + "allLDA50Topic.mdl")
corpus_lda = lda[tfidfVectors]
indexLDA = similarities.MatrixSimilarity(corpus_lda)
indexLDA.save(output + "allLDA50Topic.idx")
#pprint(texts)
dictionary = corpora.Dictionary(texts)
#dictionary.save('/tmp/deerwester.dict')  # store the dictionary, for future reference
#print(dictionary)
corpus = [dictionary.doc2bow(text) for text in texts]
print(corpus)
'''
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
for doc in corpus_tfidf:
    print(doc)
'''
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=50)
corpus_lda = lda[corpus]
lda.print_topics(50)
for doc in corpus_lda:
    print("********************DOCUMENTS*****************", doc)

# Visualize the topics with pyLDAvis (the gensim adapter takes the model, the bow corpus and the dictionary)
prepared = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
pyLDAvis.display(prepared)

#for i in range(0, lda.num_topics-1):
#    print(lda.print_topic(i))
#for doc in corpus_lda:  # the bow->lda transformation is actually executed here, on the fly
#    print(doc)
'''
def train_Various_LDA_Models(corpus_NounAdj, id2word_nounAdj, listOfTwitsAndUniqueWords, runRegardless,
                             baseSavePath="/content/drive/MyDrive/Harvard HW/Course 2 - Final Project/omertest/",
                             overrideTrainSettings=True):
    if type(overrideTrainSettings) == dict:
        # If a dict is passed, use only its values; otherwise fall back to the defaults below
        print('overrideTrainSettings provided, will use ONLY its numTopics / numOfPasses values')
        numTopics = overrideTrainSettings['numTopics']
        numOfPasses = overrideTrainSettings['numOfPasses']
    else:
        numTopics = [8]
        numOfPasses = [2]
    print('Will test on %s topics, %s passes' % (numTopics, numOfPasses))

    outputSavePath = os.path.join(baseSavePath, "LDA_Topic_Model_Output.csv")
    runRegardless = True
    if not os.path.exists(outputSavePath) or runRegardless:
        if not os.path.exists(baseSavePath):
            os.makedirs(baseSavePath)
        ldaResultOutput = {}
        runCount = 0
        for top in numTopics:
            for passN in numOfPasses:
                runCount += 1
                ldaModelTitle = '\nLDA_%s_Topics_%s_Passes - RunCount: %s' % (top, passN, runCount)
                print(ldaModelTitle)
                # Start the clock:
                start_time = time()
                ldaResultOutput[ldaModelTitle] = {'TopicNum': top, 'PassNum': passN}
                lda_nounAdj = gensin_models.LdaModel(corpus=corpus_NounAdj, num_topics=top, passes=passN,
                                                     id2word=id2word_nounAdj, iterations=100)
                print("Created gensin_models.LdaModel")
                # perplexity
                Perplexity = lda_nounAdj.log_perplexity(corpus_NounAdj)
                # coherence score; processes must be 1 or else freeze issues
                coherence_model = CoherenceModel(model=lda_nounAdj, texts=listOfTwitsAndUniqueWords.values(),
                                                 dictionary=id2word_nounAdj, coherence='c_v', processes=1)
                print("Created CoherenceModel")
                try:
                    coherence = coherence_model.get_coherence()
                except Exception as e:
                    print("Exception when running coherence_model.get_coherence():")
                    print(e)
                    coherence = 0
                ldaResultOutput[ldaModelTitle]['TopicNum'] = top
                ldaResultOutput[ldaModelTitle]['PassNum'] = passN
                ldaResultOutput[ldaModelTitle]['Perplexity'] = Perplexity
                ldaResultOutput[ldaModelTitle]['Coherence'] = coherence
                print('Num of topics: %s | Num of passes: %s | Perplexity: %s | Coherence Score: %s' % (
                    top, passN, Perplexity, coherence))
                timeInSeconds = time() - start_time
                print('Finished training %s in %s seconds.\n' % (ldaModelTitle, int(timeInSeconds)))
        ldaResultOutput_df = pd.DataFrame(ldaResultOutput).T.sort_values(by=['Perplexity'], ascending=True)
        ldaResultOutput_df.to_csv(outputSavePath)
        return ldaResultOutput_df
def get_lda(self, num_topics=100):
    docs_corpus = [self.docs_dict.doc2bow(doc) for doc in self.docs]
    model_lda = models.LdaModel(docs_corpus, num_topics, id2word=self.docs_dict)
    docs_lda = model_lda[docs_corpus]
    docs_vecs = np.vstack([sparse2full(c, len(self.docs_dict)) for c in docs_lda])
    return docs_vecs
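# Usage sketch (not from the original source): assumes `pipeline` is an instance of the
# surrounding class with .docs (tokenized documents) and .docs_dict (gensim Dictionary)
# already populated. get_lda() returns a dense NumPy matrix, one row per document, so it
# can be fed straight into scikit-learn, e.g. for clustering.
from sklearn.cluster import KMeans

docs_vecs = pipeline.get_lda(num_topics=100)
kmeans = KMeans(n_clusters=10, random_state=0).fit(docs_vecs)
print(kmeans.labels_[:10])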
tokenized_text = [tokenize_only(text) for text in preprocess]

# remove stop words
texts = [[word for word in text if word not in stopwords] for text in tokenized_text]

dictionary = corpora.Dictionary(texts)
# remove extremes (similar to the min/max df step used when creating the tf-idf matrix)
dictionary.filter_extremes(no_below=1, no_above=0.8)
# convert the dictionary to a bag of words corpus for reference
corpus = [dictionary.doc2bow(text) for text in texts]

lda = models.LdaModel(corpus, num_topics=num_clusters, id2word=dictionary,
                      update_every=5, chunksize=10000, passes=100)

topics_matrix = lda.show_topics(formatted=False, num_words=20)
for entry in topics_matrix:
    index = entry[0]
    words = entry[1]
    words.sort(key=lambda x: x[1], reverse=True)
    word = [x[0] for x in words]
    print(index, word[:5])
# Example output:
# 0 ['research', 'studi', 'water', 'use', 'found']
# 1 ['wast', 'energi', 'use', 'recycl', 'solar']
'''
Set a custom separator (sep):
csv uses the half-width comma as its default separator, but that character happens to
appear in the stop-word list, which breaks the read. The workaround is to manually set
a separator that never occurs in the data.
'''
stop = [' ', ''] + list(stop[0])

baddata[1] = baddata[0].apply(lambda s: s.split(" "))
baddata[2] = baddata[1].apply(lambda x: [i for i in x if i not in stop])
gooddata[1] = gooddata[0].apply(lambda s: s.split(" "))
gooddata[2] = gooddata[1].apply(lambda x: [i for i in x if i not in stop])

'''
Topic analysis of the negative comments
'''
bad_dict = corpora.Dictionary(baddata[2])
bad_corpus = [bad_dict.doc2bow(i) for i in baddata[2]]
bad_lda = models.LdaModel(bad_corpus, num_topics=3, id2word=bad_dict)
for i in range(3):
    print(bad_lda.print_topic(i))
print("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")

'''
Topic analysis of the positive comments
'''
good_dict = corpora.Dictionary(gooddata[2])
good_corpus = [good_dict.doc2bow(i) for i in gooddata[2]]
good_lda = models.LdaModel(good_corpus, num_topics=3, id2word=good_dict)
for i in range(3):
    print(good_lda.print_topic(i))
    stem_text
]

def preprocessing(corpus):
    for document in corpus:
        doc = strip_numeric(document)
        doc = remove_stopwords(doc)
        doc = strip_short(doc, 3)
        #doc = stem_text(doc)
        doc = strip_punctuation(doc)
        doc = strip_tags(doc)
        yield gensim.utils.tokenize(doc, lower=True)

texts = preprocessing(corpus)
dictionary = corpora.Dictionary(texts)
dictionary.filter_extremes(no_below=1, keep_n=25000)

doc_term_matrix = [dictionary.doc2bow(tokens) for tokens in preprocessing(corpus)]

tfidf = models.TfidfModel(doc_term_matrix)
corpus_tfidf = tfidf[doc_term_matrix]

lda = models.LdaModel(corpus_tfidf, num_topics=10, id2word=dictionary)
topics = lda.print_topics(num_words=25)
for i in topics:
    print(i[0])
    print(i[1])
from gensim import corpora, models, similarities
from db2text import get_tweets
from db2text import clean_txt

tweets = get_tweets()
clean_tweets = []
for tweet in tweets:
    clean_tweet = clean_txt(tweet)
    clean_tweets.append(clean_tweet)

# Build a gensim dictionary from the texts
dictionary = corpora.Dictionary(clean_tweets)
dic_keys = list(dictionary.keys())
print(dic_keys[0])

# Convert the texts to a bag-of-words corpus for reference
corpus = [dictionary.doc2bow(tweet) for tweet in clean_tweets]

lda = models.LdaModel(corpus, num_topics=10, id2word=dictionary,
                      update_every=5, chunksize=10000, passes=100)
lda.show_topics()

# Print the top words of each topic
topics_matrix = lda.show_topics(formatted=False, num_words=20)
for topic_id, words in topics_matrix:
    print([str(word) for word, prob in words])
def train_lda_model_gensim(corpus, total_topics=5):
    # normalize
    norm_tokenized_corpus = normalize_corpus(corpus, lemmatize=False, tokenize=True)
    # create a dictionary for your corpus
    dictionary = corpora.Dictionary(norm_tokenized_corpus)
    # create bag of words
    corpus_bow = [dictionary.doc2bow(text) for text in norm_tokenized_corpus]
    # define model
    lda = models.LdaModel(corpus_bow, id2word=dictionary, iterations=1000, num_topics=total_topics)
    return lda
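# Usage sketch (not from the original source): `toy_corpus` is made-up data in whatever
# raw-text form normalize_corpus() expects; the returned model's topics are then printed.
toy_corpus = [
    "the economy is working better than ever",
    "a new study shows water quality is improving",
]
lda_model = train_lda_model_gensim(toy_corpus, total_topics=2)
for topic_id, topic in lda_model.print_topics(num_topics=2, num_words=5):
    print(topic_id, topic)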
time_taken = time2 - time1
print(time_taken)
df_doc_term.tail()

#%%
# TOPIC MODELLING - COUNT VECTORIZER
# 1. LDA, gensim
time1 = time.time()

# Convert sparse matrix of counts to a gensim corpus
corpus = matutils.Sparse2Corpus(sparse_matrix)
print(corpus)

# Map matrix rows to words (tokens)
id2word = dict((v, k) for k, v in word_vectorizer.vocabulary_.items())
len(id2word)

# Create lda model (equivalent to "fit" in sklearn)
lda = models.LdaModel(corpus=corpus, num_topics=3, id2word=id2word, passes=5)

# 10 most important words for each of the 3 topics
lda.print_topics()

# Transform the docs from word space to topic space
lda_corpus = lda[corpus]

# Store the doc topic vectors in a list for review
lda_docs = [doc for doc in lda_corpus]

# Find the document vectors in the topic space for the first 10 documents
lda_docs[0:10]

time2 = time.time()
time_taken = time2 - time1
print(time_taken)

#%%
lda.print_topics()
RANDOM_STATE = 1

# Database and other resources
DATABASE_PATH = config['paths']['database']
LDA_PATH = config['paths']['lda']
DICTIONARY_PATH = config['paths']['dictionary']
CORPUS_PATH = config['paths']['corpus']

# Execution
content = Content(DATABASE_PATH)

dictionary = corpora.Dictionary(content)
# Remove words that appear in fewer than 5 documents or in more than 80% of the documents
dictionary.filter_extremes(no_below=5, no_above=0.8)
corpus = [dictionary.doc2bow(text) for text in content]

# LDA Model
lda = models.LdaModel(corpus, id2word=dictionary, random_state=RANDOM_STATE,
                      num_topics=NUM_TOPICS, passes=NUM_PASSES)

# Save resources
lda.save(LDA_PATH)
with open(DICTIONARY_PATH, 'wb') as fp:
    pickle.dump(dictionary, fp)
with open(CORPUS_PATH, 'wb') as fp:
    pickle.dump(corpus, fp)
def main():
    corpus = {}
    with open('corpus_data/preprocessedf_corpus.json') as corpus:
        corpus = json.loads(corpus.read().encode('utf-8'))

    corpus_2 = defaultdict(str)
    for artist, songlist in corpus.items():
        for song in songlist:
            lyrics = song['lyrics'].strip('\\')
            corpus_2[artist] += lyrics

    features = {}
    with open('corpus_data/artist_features.json') as features:
        features = json.loads(features.read())

    finalcorpus = []
    for artist, lyrics in corpus_2.items():
        d = {}
        d['artist'] = artist
        d['lyrics'] = lyrics
        d['pos'] = features[artist]['pos_counts']
        finalcorpus.append(d)

    df = pd.DataFrame(finalcorpus)

    # nltk.download('wordnet')
    from nltk.corpus import wordnet as wn

    def get_lemma(word):
        lemma = wn.morphy(word)
        if lemma is None:
            return word
        else:
            return lemma

    """TOPIC MODELING HOPEFULLY"""
    import re
    from nltk import word_tokenize
    from nltk.corpus import stopwords

    STOPWORDS = stopwords.words('english')
    PROFANITY = set()
    with open('corpus_data/rapsw.txt') as infile:
        infile = infile.read()
        infile = infile.split()
        for el in infile:
            PROFANITY.add(el)

    def clean_text(text, ar):
        tokenized_text = word_tokenize(text.lower())
        tokenized_text = [token for token in tokenized_text if len(token) > 5]
        cleaned_text = [t for t in tokenized_text if re.match('[a-zA-Z\-][a-zA-Z\-]{2,}', t)]
        if ar == 'sw':
            cleaned_text = [t for t in cleaned_text if t not in STOPWORDS]
        if ar == 'lm':
            cleaned_text = [get_lemma(token) for token in cleaned_text]
        if ar == 'rw':
            cleaned_text = [token for token in cleaned_text if token not in PROFANITY]
        return cleaned_text

    for index, row in df.iterrows():
        row['lyrics'] = clean_text(row['lyrics'], sys.argv[1])

    from gensim import models, corpora
    from gensim.corpora.dictionary import Dictionary
    from gensim.test.utils import common_corpus, common_texts, get_tmpfile

    all_lyrics = []
    all_artists = []
    for index, row in df.iterrows():
        all_lyrics.append(row['lyrics'])
        all_artists.append(row['artist'])

    #common_dictionary = Dictionary(common_texts)
    #common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]
    #lda_model = models.LdaModel(common_corpus, num_topics=10)

    dictionary = corpora.Dictionary(all_lyrics)
    corpus = [dictionary.doc2bow(text) for text in all_lyrics]

    NUM_TOPICS = 25
    lda_model = models.LdaModel(corpus=corpus, num_topics=25, id2word=dictionary, passes=20)
    topics = lda_model.print_topics(num_words=4)
    print('LDA Topics')
    for topic in topics:
        print(topic)

    lsi_model = models.LsiModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary)
    topics = lsi_model.print_topics(num_words=4)
    print('LSI TOPICS')
    for topic in topics:
        print(topic)

    from gensim import similarities

    text = ""
    with open(sys.argv[2]) as inf:
        inf = inf.read()
        text = inf

    bow = dictionary.doc2bow(clean_text(text, sys.argv[1]))

    lda_index = similarities.MatrixSimilarity(lda_model[corpus])
    lsi_index = similarities.MatrixSimilarity(lsi_model[corpus])

    # Let's perform some queries
    similarities = lda_index[lda_model[bow]]
    # Sort the similarities
    similarities = sorted(enumerate(similarities), key=lambda item: -item[1])

    similaritiesLSI = lsi_index[lsi_model[bow]]
    similaritiesLSI = sorted(enumerate(similaritiesLSI), key=lambda item: -item[1])

    # Top most similar documents:
    #print(similarities[:10])
    # [(104, 0.87591344), (178, 0.86124849), (31, 0.8604598), (77, 0.84932965), (85, 0.84843522),
    #  (135, 0.84421808), (215, 0.84184396), (353, 0.84038532), (254, 0.83498049), (13, 0.82832891)]

    # Let's see what's the most similar document
    document_id, similarity = similarities[0]
    document_id2, similarityLSI = similaritiesLSI[0]
    print(all_lyrics[document_id][:1000])

    print("LDA : TOP 5 Similar ARTISTS")
    for el in similarities[:5]:
        print(all_artists[el[0]])
    print('')
    print('LSI : Top 5 Similar Artists')
    for el in similaritiesLSI[:5]:
        print(all_artists[el[0]])
import jieba, os
from gensim import corpora, models, similarities

train_set = []

walk = os.walk('C:\\Users\\Sun Yutian\\Desktop\\test')
for root, dirs, files in walk:
    for name in files:
        f = open(os.path.join(root, name), 'r')
        raw = f.read()
        word_list = list(jieba.cut(raw, cut_all=False, HMM=True))
        train_set.append(word_list)

dic = corpora.Dictionary(train_set)
corpus = [dic.doc2bow(text) for text in train_set]
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

lda = models.LdaModel(corpus_tfidf, id2word=dic, num_topics=10)
corpus_lda = lda[corpus_tfidf]

f = open('dat', 'r')
raw = f.read()
word_list = list(jieba.cut(raw, cut_all=False, HMM=True))
vec_bow = dic.doc2bow(word_list)
vec_lda = lda[vec_bow]

index = similarities.MatrixSimilarity(lda[corpus])
sims = index[vec_lda]
print(list(enumerate(sims)))
def LDA_post(infile, outfile, topic=14):
    docs = []
    # f = open(infile, 'r')
    # line = f.readline()
    # while line:
    #     docs.append(line.lower().split('\t')[1])
    #     line = f.readline()
    # f.close()
    with open(infile, 'r') as csvfile:
        spamreader = csv.reader(csvfile, delimiter=',', quotechar='"')
        header = next(spamreader)
        for row in spamreader:
            docs.append(row[1])

    texts = []
    widgets = [FormatLabel('Processed: %(value)d records (in: %(elapsed)s)')]
    pbar = ProgressBar(widgets=widgets)
    for doc in pbar((doc for doc in docs)):
        texts.append([word for word in wordProcBase.tokenize_tweet(doc)
                      if word not in stopwords.words('english')])
        # doc = wordProcBase.tokenize5(doc.decode('utf-8'))
        # texts.append([word for word in doc if word not in stopwords.words('english')])
    pbar.finish()

    pprint.pprint(texts)
    return

    # create a Gensim dictionary from the texts
    dictionary = corpora.Dictionary(texts)
    # remove extremes
    dictionary.filter_extremes(no_below=1, no_above=0.85)
    # convert the dictionary to a bag of words corpus for reference
    corpus = [dictionary.doc2bow(text) for text in texts]

    print('Applying LDA...')
    lda = models.LdaModel(corpus, num_topics=topic, id2word=dictionary, update_every=1,
                          chunksize=10000, passes=100, minimum_probability=0.001)
    topics = lda.show_topics(num_topics=topic, num_words=5)
    # pprint.pprint(lda.print_topics(num_topics=topic))
    # pprint.pprint(topics)

    print('Writing results into file...')
    # write the results to file
    with open(outfile, 'w') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=',', quotechar='"')
        top_prob = lda.get_document_topics(corpus)  # a list of (topic_id, topic_probability) 2-tuples
        index = 1
        for prob in top_prob:
            string = [0 for i in range(topic)]
            prob = sorted(prob, key=operator.itemgetter(0), reverse=False)
            for i, p in prob:
                string[i] = p
            spamwriter.writerow(string)
            index += 1
    return

    '''
    # reading unseen data
    '''
    print('Reading unseen data...')
    unseen = _MAIN_DIR_ + "/Data/VA_Proc/emtion_tweets/survey/google_survey_data.csv"
    docs = []
    with open(unseen, 'r') as csvfile:
        spamreader = csv.reader(csvfile, delimiter=',', quotechar='"')
        for row in spamreader:
            docs.append(row[1])
    texts = []
    for doc in docs:
        texts.append([word for word in wordProcBase.tokenize3(doc.decode('utf-8'))
                      if word not in stopwords.words('english')])
    dictionary = corpora.Dictionary(texts)
    dictionary.filter_extremes(no_below=1, no_above=0.85)
    corpus = [dictionary.doc2bow(text) for text in texts]
    with open(outfile, 'w') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=',', quotechar='"')
        top_prob = lda.get_document_topics(corpus)
        index = 1
        for prob in top_prob:
            string = [index]
            for i in xrange(0, len(prob)):
                string.append(prob[i][1])
            spamwriter.writerow(string)
            index += 1
caselist, idlist = getDataFromMongo(col1)
print("out mongo")
print("caselist length: %d" % len(caselist))

print('build dictionary')
dictionary = corpora.Dictionary(caselist)
#dictionary.save('lda.dct')
dict_len = len(dictionary)

# transform the whole texts to sparse vectors
corpus = [dictionary.doc2bow(case) for case in caselist]
print(len(corpus))

print('build lda')
num_topics = 6
# create a transformation from the bow corpus to an lda model
lda = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, alpha=0.01, eta=0.01,
                      minimum_probability=0.001, update_every=1, chunksize=100, passes=1)
print('out lda')
#lda.save('lda.model')

doc_topics = lda.get_document_topics(corpus)
dislist = getDis(doc_topics, num_topics)
print(len(idlist), len(dislist))
write2mongo(col2, idlist, dislist)

# num_show_term = 10  # how many words to show per topic
# for topic_id in range(num_topics):
#     logging.info('Words and probabilities for topic %d:\t' % topic_id)
#     term_distribute_all = lda.get_topic_terms(topicid=topic_id)
def construct_lda_sim_graph(corpus, args):
    """
    compute lda vector similarity between paragraphs
    :param corpus:
    :param args:
    :return:
    """
    sim_graph = []
    raw_corpus = [' '.join(para) for para in corpus]

    # create English stop words list
    stoplist = set(stopwords.words('english'))

    # Create p_stemmer of class PorterStemmer
    p_stemmer = PorterStemmer()

    # Lowercase each document, split it by white space and filter out stopwords
    texts = [[word for word in para.lower().split() if word not in stoplist]
             for para in raw_corpus]

    # Count word frequencies
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1

    # stem each word
    processed_corpus = [[p_stemmer.stem(token) for token in text] for text in texts]

    dictionary = corpora.Dictionary(processed_corpus)
    if len(dictionary) < args.num_topics:
        return None

    bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]

    # train the model
    if args.find_opt_num:
        lda = get_optimal_ldamodel_by_coherence_values(corpus=bow_corpus, texts=processed_corpus,
                                                       dictionary=dictionary)
    else:
        lda = models.LdaModel(corpus=bow_corpus, num_topics=args.num_topics, id2word=dictionary,
                              alpha='auto', eta='auto', eval_every=None, minimum_probability=0.0)
        # LdaMulticore(bow_corpus, id2word=dictionary, num_topics=args.num_topics, eta='auto',
        #              eval_every=None, minimum_probability=0.0)

    corpus_lda = lda[bow_corpus]  # create a double wrapper over the original corpus: bow->lda
    index = similarities.MatrixSimilarity(corpus_lda, num_features=len(dictionary))
    print("corpus_lda[0]: %s" % str(corpus_lda[0]))

    total = 0.
    count_large = 0.
    for i in range(len(corpus_lda)):
        sim = index[corpus_lda[i]]
        assert len(sim) == len(corpus_lda), "the lda sim is not correct!"
        sim_graph.append(sim)
        for s in sim:
            total += 1
            if s > args.sim_threshold:
                count_large += 1
    print("sim_graph[0]: %s" % str(sim_graph[0]))
    return sim_graph, count_large, total
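# Usage sketch (not from the original source): the paragraphs below are toy data, and
# `args` is faked with argparse.Namespace using only the attributes the function reads
# (num_topics, find_opt_num, sim_threshold).
from argparse import Namespace

paras = [
    ['the', 'economy', 'is', 'growing', 'quickly', 'this', 'year'],
    ['solar', 'energy', 'and', 'recycling', 'reduce', 'waste'],
    ['a', 'study', 'measured', 'water', 'quality', 'in', 'rivers'],
]
args = Namespace(num_topics=2, find_opt_num=False, sim_threshold=0.5)
result = construct_lda_sim_graph(paras, args)
if result is not None:
    sim_graph, count_large, total = result
    print(len(sim_graph), count_large, total)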
print(sentences)
############################################################################################################################
print('***********************')

################ Print the topic words of this text (the list of all sentences) ################
film_dict = corpora.Dictionary(sentences)
i = 0
for w in film_dict.values():
    i += 1
    print(i, w)

film_corpus = [film_dict.doc2bow(i) for i in sentences]
print(film_corpus)

film_lda = models.LdaModel(film_corpus, num_topics=3, id2word=film_dict)
for i in range(3):
    print(film_lda.print_topic(i))  # print each topic
########################################################################################

############ Use a Word2Vec model to compute word vectors and word relatedness ############
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
#sentences = word2vec.Text8Corpus(u"E:\word.txt", encoding='utf-8')

w = []
for i in range(100):
    model = gensim.models.Word2Vec(sentences, size=200)  # train the skip-gram model; default window=5
    print(model.similarity(u'美国', u'经济'))  # similarity/relatedness of the two words
def lda(export_perplexity=False):
    np.set_printoptions(linewidth=300)
    data = pd.read_csv('QQ_chat_result.csv', header=0, encoding='utf-8')
    texts = []
    for info in data['Info']:
        texts.append(info.split(' '))
    M = len(texts)
    print('Number of documents: %d' % M)
    # pprint(texts)

    print('Building the dictionary --')
    dictionary = corpora.Dictionary(texts)
    V = len(dictionary)
    print('Computing the document vectors --')
    corpus = [dictionary.doc2bow(text) for text in texts]
    print('Computing document TF-IDF --')
    t_start = time.time()
    corpus_tfidf = models.TfidfModel(corpus)[corpus]
    print('Document TF-IDF finished, took %.3f seconds' % (time.time() - t_start))

    print('Fitting the LDA model --')
    num_topics = 20
    t_start = time.time()
    lda = models.LdaModel(corpus_tfidf, num_topics=num_topics, id2word=dictionary,
                          alpha=0.001, eta=0.02, minimum_probability=0,
                          update_every=1, chunksize=1000, passes=20)
    print('LDA model finished, training took\t%.3f seconds' % (time.time() - t_start))
    if export_perplexity:
        export_perplexity1(corpus_tfidf, dictionary, corpus)
        # export_perplexity2(corpus_tfidf, dictionary, corpus)
    # # topics of all documents
    # doc_topic = [a for a in lda[corpus_tfidf]]
    # print('Document-Topic:\n')
    # pprint(doc_topic)

    num_show_term = 7  # how many words to show per topic
    print('Word distribution of each topic:')
    for topic_id in range(num_topics):
        print('Topic #%d:\t' % topic_id, end=' ')
        term_distribute_all = lda.get_topic_terms(topicid=topic_id)
        term_distribute = term_distribute_all[:num_show_term]
        term_distribute = np.array(term_distribute)
        term_id = term_distribute[:, 0].astype(np.int)
        for t in term_id:
            print(dictionary.id2token[t], end=' ')
        print('\nProbabilities:\t', term_distribute[:, 1])

    # print the topics of 10 random documents
    np.set_printoptions(linewidth=200, suppress=True)
    num_show_topic = 10  # how many top topics to show per document
    print('Topic distribution of 10 users:')
    doc_topics = lda.get_document_topics(corpus_tfidf)  # topic distribution of all documents
    idx = np.arange(M)
    np.random.shuffle(idx)
    idx = idx[:10]
    for i in idx:
        topic = np.array(doc_topics[i])
        topic_distribute = np.array(topic[:, 1])
        # print(topic_distribute)
        topic_idx = topic_distribute.argsort()[:-num_show_topic - 1:-1]
        print('User %d, top %d topics:' % (i, num_show_topic), topic_idx)
        print(topic_distribute[topic_idx])

    # plot the topic distribution of these 10 documents
    mpl.rcParams['font.sans-serif'] = ['SimHei']
    mpl.rcParams['axes.unicode_minus'] = False
    plt.figure(figsize=(8, 7), facecolor='w')
    for i, k in enumerate(idx):
        ax = plt.subplot(5, 2, i + 1)
        topic = np.array(doc_topics[i])
        topic_distribute = np.array(topic[:, 1])
        ax.stem(topic_distribute, linefmt='g-', markerfmt='ro')
        ax.set_xlim(-1, num_topics + 1)
        ax.set_ylim(0, 1)
        ax.set_ylabel("Probability")
        ax.set_title("User {}".format(k))
    plt.grid(b=True, axis='both', ls=':', color='#606060')
    plt.xlabel("Topic", fontsize=13)
    plt.suptitle('Topic distribution per user', fontsize=15)
    plt.tight_layout(1, rect=(0, 0, 1, 0.95))
    plt.show()

    # compute the strength of each topic
    print('\nStrength of each topic:\n')
    topic_all = np.zeros(num_topics)
    doc_topics = lda.get_document_topics(corpus_tfidf)  # topic distribution of all documents
    for i in np.arange(M):  # iterate over all documents
        topic = np.array(doc_topics[i])
        topic_distribute = np.array(topic[:, 1])
        topic_all += topic_distribute
    topic_all /= M  # average
    idx = topic_all.argsort()
    topic_sort = topic_all[idx]
    print(topic_sort)
    plt.figure(facecolor='w')
    plt.stem(topic_sort, linefmt='g-', markerfmt='ro')
    plt.xticks(np.arange(idx.size), idx)
    plt.xlabel("Topic", fontsize=13)
    plt.ylabel("Topic probability", fontsize=13)
    plt.title('Topic strength', fontsize=15)
    plt.grid(b=True, axis='both', ls=':', color='#606060')
    plt.show()
def cal_sim_by_lda(tf_idf, dictionary, corpus):
    print('Computing document similarities with the LDA model......')
    lda = models.LdaModel(tf_idf, id2word=dictionary, num_topics=5)
    corpus_lda = lda[corpus]
    index = similarities.MatrixSimilarity(corpus_lda)
    return index
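# Usage sketch (not from the original source): assumes `tf_idf`, `dictionary` and `corpus`
# come from the same preprocessing pipeline as the caller. Iterating over the returned
# MatrixSimilarity index yields, for each indexed document, its similarity to every
# indexed document.
index = cal_sim_by_lda(tf_idf, dictionary, corpus)
for doc_id, sims in enumerate(index):
    top3 = sorted(enumerate(sims), key=lambda item: -item[1])[:3]
    print(doc_id, top3)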
seg = seg_file(contents[0], stopwords)

# build the dictionary
dictionary = corpora.Dictionary(seg)
V = len(dictionary)
print(V)

# build the document-term frequency matrix
text = [dictionary.doc2bow(text, allow_update=True) for text in seg]
#print(text[0])  # sparse vector

# compute the TF-IDF matrix
text_tfidf = models.TfidfModel(text)[text]

# build the LDA model and print the first ten topics
lda = models.LdaModel(text_tfidf, id2word=dictionary, num_topics=200, iterations=100)

# show the topics
for k, v in lda.print_topics(num_topics=10):
    print(k, v)

# topics of all documents
doc_topic = lda.get_document_topics(text_tfidf)
print(len(doc_topic))
for dt in doc_topic:
    print(dt)
    d = dict(dt)
    ret = sorted(d.items(), key=lambda x: x[1], reverse=True)[0]
    print(ret[0])
    for k, v in lda.print_topics(num_topics=200):
        if k == ret[0]:
import numpy as np
import os
import xlwt

file = open('C:\\Users\\Administrator\\Desktop\\original_lda\\ddata.txt', 'r')
#courses = []
#for line in file:
#    courses.append(line.strip().split(' '))  # split each line on spaces into a list of tokens
courses = [line.strip().split() for line in file]

dic = corpora.Dictionary(courses)  # assign an id to each word
corpus = [dic.doc2bow(text) for text in courses]  # turn each document into a sparse vector
corpus_tfidf = models.TfidfModel(corpus)[corpus]

numoftopics = 10
lda = models.LdaModel(corpus_tfidf, id2word=dic, alpha=0.01, eta=0.05, iterations=2000,
                      num_topics=numoftopics, minimum_probability=0.0001)

# file j holds the topic-word distributions
f = open('C:\\Users\\Administrator\\Desktop\\original_lda\\ml_period5topic25a0.01j.txt', 'w')
for topic_id in range(numoftopics):
    f.write('*Topic:' + str(topic_id))
    f.write(str(lda.show_topic(topic_id)))
    f.write('\n')

# file f holds the distribution of each document over the topics
fenbu = open('C:\\Users\\Administrator\\Desktop\\original_lda\\ml_period4qtopic20a0.01f.txt', 'w')
from gensim import corpora, models

d = [['想', '买辆', '汽车'], ['我', '买辆', '汽车', '汽车', '喜欢']]

dictionary = corpora.Dictionary(d)
dictionary.save('./gensim.dict')
corpora.Dictionary.load('./gensim.dict')
print(dictionary)

corpus = [dictionary.doc2bow(text) for text in d]  # bag of words
print(corpus)

lda = models.LdaModel(corpus=corpus, num_topics=2, id2word=dictionary)
print(lda.print_topics(2))
print(lda[corpus[0]])
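# Follow-up sketch (not from the original source): infer the topic mixture of an unseen
# document with the dictionary and LDA model built above; the token list is made up.
unseen_doc = ['我', '喜欢', '汽车']
unseen_bow = dictionary.doc2bow(unseen_doc)
print(lda.get_document_topics(unseen_bow))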
def gensim_preprocess(train, test, model_type='lsi', num_topics=500, use_own_tfidf=False,
                      force_compute=False, report_progress=False, data_dir='data/', **tfidf_params):
    """Use topic modeling to create a dense matrix representation of the input text.

    Notes
    -----
    In many cases I create artifacts (corpora, tfidf representations etc.) for:
        1. Training set
        2. Test set
        3. Their concatenation
    The concatenation is used to create the models, i.e. compute embeddings, since the labels
    are not needed in this unsupervised stage. The first two are used for the training and
    evaluation/submission stages accordingly.

    Parameters
    ----------
    :param train: The training set as a pd.DataFrame including the free text column "comment_text".
    :param test: The test set as a pd.DataFrame including the free text column "comment_text".
    :param model_type: Dimensionality reduction model to be used, can be 'lsi' or 'lda' for now
        (more might be added later).
    :param num_topics: Number of columns (features) in the output matrices.
    :param use_own_tfidf: If True, our own version of tfidf will be used with **tfidf_params passed to it.
    :param force_compute: If True we will not even try to load but instead compute everything.
        Set it if you want to try different parameters.
    :param report_progress: If True, progress will be reported when each computationally
        expensive step is starting.
    :param data_dir: Path to the base data directory. Used to call this method from anywhere.
        For example a notebook would provide `data_dir='../data'`.
    :param tfidf_params: Key-value parameters passed to our own `tf_idf` implementation.
        Only used if `use_own_tfidf` is set to True.

    Returns
    -------
    :return: (train, test) datasets as 2D np.ndarrays of shape (num_comments, `num_topics`)
    """
    # Folder where gensim models and data will be saved to and loaded from.
    gensim_dir = data_dir + 'gensim/'

    def progress(msg):
        """Helper to conditionally print progress messages to std:out."""
        if report_progress:
            print(msg)

    if force_compute:
        progress("This is gonna take a while mate, grab a coffee/beer. "
                 "Actually you might wanna take a walk as well. Or a nap :D")

    train_text = train["comment_text"].tolist()
    test_text = test["comment_text"].tolist()

    # Tokenize
    def safe_tokenize(comment):
        """Wrap `nltk.word_tokenize` but also handle corrupted input."""
        try:
            return nltk.word_tokenize(comment)
        except TypeError:
            return ["UNKNOWN"]

    progress("Tokenizing text, this will take a while...")
    train_texts = [safe_tokenize(comment) for comment in train_text]
    test_texts = [safe_tokenize(comment) for comment in test_text]
    dictionary = corpora.Dictionary(train_texts + test_texts)

    # Let's create the TF-IDF representation needed for the dimensionality reduction models.
    if use_own_tfidf:
        # Untested yet but I hope it works. I mean, why wouldn't it right?
        progress("Using our own version of TF-IDF, this will take a while...")
        train_tfidf, test_tfidf, whole_tfidf = tf_idf(train, test, **tfidf_params)
    else:
        # Use gensim's TFIDF model - Tested while under the influence of 10 beers.
        # I code well when drunk though so no worries.
        # Read or create the corpus
        try:
            # Hack to redirect to the exception handler - yes I know its bad but I like it mmmkay?
            if force_compute:
                raise FileNotFoundError
            train_corpus = corpora.MmCorpus(gensim_dir + 'training_corpus.mm')
            test_corpus = corpora.MmCorpus(gensim_dir + 'test_corpus.mm')
            whole_corpus = corpora.MmCorpus(gensim_dir + 'whole_corpus')
        except FileNotFoundError:
            progress("Creating the gensim corpora, this will take a while...")
            train_corpus = [dictionary.doc2bow(comment) for comment in train_texts]
            test_corpus = [dictionary.doc2bow(comment) for comment in test_texts]
            whole_corpus = [dictionary.doc2bow(comment) for comment in train_texts + test_texts]
            corpora.MmCorpus.serialize(gensim_dir + 'training_corpus.mm', train_corpus)
            corpora.MmCorpus.serialize(gensim_dir + 'test_corpus.mm', test_corpus)
            corpora.MmCorpus.serialize(gensim_dir + 'whole_corpus.mm', whole_corpus)

        progress("Using gensim's implementation of TF-IDF, this will take a while...")
        tfidf_model = models.TfidfModel(whole_corpus)
        train_tfidf = tfidf_model[train_corpus]
        test_tfidf = tfidf_model[test_corpus]
        whole_tfidf = tfidf_model[train_corpus + test_corpus]

    # Feed the TF-IDF representation to the dimensionality reduction model - this is slow so try to load it first.
    if model_type == 'lsi':
        try:
            # Hack to redirect to the exception handler - yes I know its bad but I like it mmmkay?
            if force_compute:
                raise FileNotFoundError
            model = models.LsiModel.load(gensim_dir + 'lsi.model')
        except FileNotFoundError:
            progress("Creating the LSI model, this will take a while...")
            model = models.LsiModel(whole_tfidf, id2word=dictionary, num_topics=num_topics)
            model.save(gensim_dir + 'lsi.model')
    elif model_type == 'lda':
        try:
            # Hack to redirect to the exception handler - yes I know its bad but I like it mmmkay?
            if force_compute:
                raise FileNotFoundError
            model = models.LdaModel.load('data/lda.model')
        except FileNotFoundError:
            progress("Creating the LDA model, this will take a while...")
            model = models.LdaModel(whole_tfidf, id2word=dictionary, num_topics=num_topics)
            model.save(gensim_dir + 'lda.model')
    else:
        raise ValueError("Only 'lda' and 'lsi' models are supported, you passed {}".format(model_type))

    train = model[train_tfidf]
    test = model[test_tfidf]

    # Transform into a 2D array format.
    print("Reformatting output to a 2D array, this will take a while...")
    values = np.vectorize(lambda x: x[1])
    return values(np.array(train)), values(np.array(test))
hclda = AgglomerativeClustering(n_clusters=5, affinity='euclidean', linkage='ward')
y_hclda = hclda.fit_predict(Xlda_cluster)

'''pyLDAvis'''
texts = [[word for word in document.lower().split() if word not in stop_words]
         for document in corpus]
all_tokens = sum(texts, [])
tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) == 1)
texts = [[word for word in text if word not in tokens_once] for text in texts]

dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=10, passes=10)

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
vis

'''PCA plot'''
# kmeans bow PCA
from sklearn.decomposition import PCA

bow_pca = PCA(n_components=2)
BowComponents = bow_pca.fit_transform(X_bow)
BowDf = pd.DataFrame(data=BowComponents, columns=['bow component 1', 'bow component 2'])
bow_centers = bow_pca.transform(modelkmeansbow.cluster_centers_)
def train_lda(self):
    lda = models.LdaModel(self.corpus_tfidf, id2word=self.dictionary, num_topics=self.num_topics)
    return lda
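# Usage sketch (not from the original source): assumes `builder` is an instance of the
# surrounding class with corpus_tfidf, dictionary and num_topics already set up.
lda = builder.train_lda()
for topic_id, topic in lda.print_topics(num_topics=-1, num_words=10):
    print(topic_id, topic)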
# # tfidf = models.TfidfModel(corpus, id2word=dictionary, dictionary=dictionary, normalize=True)
# tfidf.save(settings.TF_IDF_MODEL)

# query = 'oil and gas'
# from src.engine.preprocess import preprocess_body_lda
# query = preprocess_body_lda(query)
# corpus_query = [dictionary.doc2bow(query.split(" "))]
# transformed = tfidf[corpus_query]
#
# logentropy = models.LogEntropyModel(tfidf[corpus], id2word=dictionary, normalize=True)
# logentropy.save(settings.LOGENTROPY_MODEL)
# logentropy_query = logentropy[transformed]

lda = models.LdaModel(corpus, id2word=dictionary, num_topics=30, passes=3, alpha='auto', chunksize=4000)
lda.save(settings.LDA_MODEL)

lda = models.LdaModel.load(settings.LDA_MODEL)

from gensim.similarities import MatrixSimilarity
similarity_matrix = MatrixSimilarity(lda[corpus], num_features=100)
similarity_matrix.save(settings.SIMILARITY_MATRIX)

# similarities = similarity_matrix.get_similarities(lda[logentropy_query])
print('\nLSI Model:')
lsi = models.LsiModel(corpus_tfidf, num_topics=2, id2word=dictionary)
topic_result = [a for a in lsi[corpus_tfidf]]
pprint(topic_result)
print('LSI Topics:')
pprint(lsi.print_topics(num_topics=2, num_words=5))

similarity = similarities.MatrixSimilarity(lsi[corpus_tfidf])  # similarities.Similarity()
print('Similarity:')
pprint(list(similarity))

print('\nLDA Model:')
num_topics = 2
lda = models.LdaModel(corpus_tfidf, num_topics=num_topics, id2word=dictionary,
                      alpha='auto', eta='auto', minimum_probability=0.001)
doc_topic = [doc_t for doc_t in lda[corpus_tfidf]]
print('Document-Topic:\n')
pprint(doc_topic)
for doc_topic in lda.get_document_topics(corpus_tfidf):
    print(doc_topic)
for topic_id in range(num_topics):
    print('Topic', topic_id)
    # pprint(lda.get_topic_terms(topicid=topic_id))
    pprint(lda.show_topic(topic_id))

similarity = similarities.MatrixSimilarity(lda[corpus_tfidf])
print('Similarity:')
pprint(list(similarity))
tokens = vanilla_tokenize(texts)
chunks100 = vanilla_chunk(tokens, 100)
prune98 = vanilla_prune(chunks100, 98, 0)
lemmanoun = vanilla_lemmatizer(prune98)

## train LDA model
from gensim import corpora, models

# bag-of-words
dictionary = corpora.Dictionary(lemmanoun)
corpus = [dictionary.doc2bow(chunk) for chunk in lemmanoun]

# for reproducibility
fixed_seed = 1234
np.random.seed(fixed_seed)

# train model on k topics
k = 20
mdl = models.LdaModel(corpus, id2word=dictionary, num_topics=k, chunksize=3125, passes=25,
                      update_every=0, alpha=None, eta=None, decay=0.5, distributed=False)

# print topics
for i in range(0, k):
    print('Topic', i + 1)
    print(mdl.show_topic(i))
    print('-----')
from gensim import corpora, models

np.set_printoptions(threshold=np.nan)

numSemantics = 4
genre = input("Enter Genre: ")
mat = svd.genSVDMatrix(genre)
if (len(mat) < numSemantics or len(mat[0]) < numSemantics):
    print("cant report top semantics")
else:
    svdSem = svd.svdCalc(mat, numSemantics)
    pcaSem = svd.svdCalc(np.matmul(np.transpose(mat), mat), numSemantics)
    allTags = db.getAllTags()
    print("\n\nSVD Decomposed top semantics:")
    for sem in svdSem:
        print("\n\n", utils.rankSem(sem, allTags))
    print("\n\nPCA Decomposed top semantics:")
    for sem in pcaSem:
        print("\n\n", utils.rankSem(sem, allTags))

    X = lda1.ldaInputTags(genre)
    dictionary = corpora.Dictionary(X)
    #print(dictionary)
    #print(dictionary.token2id)
    corpus = [dictionary.doc2bow(x) for x in X]
    #print(corpus)
    ldamodel = models.LdaModel(corpus, id2word=dictionary, num_topics=numSemantics)
    ldaSems = ldamodel.print_topics(num_topics=-1, num_words=len(dictionary))
    print("\n\nLDA Decomposed top semantics:")
    for sem in ldaSems:
        print("\n\n", sem)
    print("\nThe above ids are Tag IDs")
for text in data:
    tokenized_data.append(clean_text(text))

# Build a Dictionary - association word to numeric id
dictionary = corpora.Dictionary(tokenized_data)

# Transform the collection of texts to a numerical form
corpus = [dictionary.doc2bow(text) for text in tokenized_data]

# Have a look at how the 20th document looks like: [(word_id, count), ...]
print(corpus[20])
# [(12, 3), (14, 1), (21, 1), (25, 5), (30, 2), (31, 5), (33, 1), (42, 1), (43, 2), ...

# Build the LDA model
lda_model = models.LdaModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary)

# Build the LSI model
lsi_model = models.LsiModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary)

print("LDA Model:")
for idx in range(NUM_TOPICS):
    # Print the 10 most representative words of each topic
    print("Topic #%s:" % idx, lda_model.print_topic(idx, 10))
print("=" * 20)