def create_gensim_lsa_model(doc_clean, number_of_topics, words):
    dictionary, doc_term_matrix = prepare_corpus(doc_clean)
    lsamodel = LsiModel(doc_term_matrix, num_topics=number_of_topics, id2word=dictionary)
    return lsamodel.print_topics(num_topics=number_of_topics, num_words=words)
def lsi(clean_docs, model_name, topics):
    from gensim import corpora

    # turn all data into a dictionary mapping of normalized words and their integer ids
    dictionary = corpora.Dictionary(clean_docs)

    # convert each document into a bag-of-words representation (list of (token_id, token_count) tuples),
    # i.e. count how often each word occurs in each doc and store that in the corpus
    corpus = [dictionary.doc2bow(doc) for doc in clean_docs]

    # serialize: save dictionary and corpus for future use
    from gensim.corpora import MmCorpus
    MmCorpus.serialize('corpus_' + model_name + '.mm', corpus)
    dictionary.save('dictionary_' + model_name + '.gensim')

    # Train LSI model
    from gensim.models import LsiModel
    num_topics = topics  # find this number of topics in the data
    lsimodel = LsiModel(corpus, num_topics=num_topics, id2word=dictionary)
    lsimodel.save('lsi_model_' + model_name + '.gensim')

    topics = lsimodel.print_topics(num_words=5)
    for topic in topics:
        print(topic)
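# A minimal follow-on sketch (hypothetical model_name; the file names mirror those
# written by lsi() above) showing how the serialized dictionary, corpus and model
# can be loaded back in a later session.
from gensim import corpora
from gensim.corpora import MmCorpus
from gensim.models import LsiModel

model_name = 'example'  # assumption: whatever name was passed to lsi() above
dictionary = corpora.Dictionary.load('dictionary_' + model_name + '.gensim')
corpus = MmCorpus('corpus_' + model_name + '.mm')
lsimodel = LsiModel.load('lsi_model_' + model_name + '.gensim')
print(lsimodel[corpus[0]])  # project the first stored document into the LSI topic space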
def create_gensim_lsa_model(doc_clean, number_of_topics, words):
    prepare_corpus = project2.initialize_terms_and_postings()
    dictionary, doc_term_matrix = prepare_corpus(doc_clean)
    lsamodel = LsiModel(doc_term_matrix, num_topics=number_of_topics, id2word=dictionary)
    print(lsamodel.print_topics(num_topics=number_of_topics, num_words=words))
    return lsamodel
def run_on_time_period(start, stop):
    # create lists to hold data
    start_date = start
    stop_date = stop
    date_list = []
    raw_docs = []

    # run the data getter
    while start_date <= stop_date:
        timestamp = start_date.replace(tzinfo=timezone.utc).timestamp()
        doc = get_data(timestamp)
        raw_docs.append(doc)
        real_date = start_date - timedelta(days=1)
        date_list.append(real_date.date())
        start_date += timedelta(days=1)

    # strip nicknames out of the raw documents
    for i in range(len(raw_docs)):
        for name in nicknames:
            if name in raw_docs[i]:
                raw_docs[i] = raw_docs[i].replace(name, '')

    final_docs = preprocess(raw_docs)
    dictionary, doc_term_matrix = create_corpus(final_docs)  # renamed from `dict` to avoid shadowing the builtin
    # lsi_models, coherence_values = get_coherence_values(dictionary, doc_term_matrix, final_docs, 10, 1, 2)
    lsi_model = LsiModel(doc_term_matrix, num_topics=10, id2word=dictionary)
    print(lsi_model.print_topics(num_topics=5, num_words=5))
def create_lsi(num_topic, dictionary):
    corpus, dic = generate_corpus(dictionary)
    print("__________________________Create LSI_________________________")
    # use the requested topic count (was hardcoded to 10, which ignored num_topic)
    lsimodel = LsiModel(corpus=corpus, num_topics=num_topic, id2word=dic)
    topics = lsimodel.print_topics(num_topic)
    # show the list of topics
    for topic in topics:
        print(topic)
    return lsimodel
def lsi(all_tokens_lists):
    dictionary = corpora.Dictionary(all_tokens_lists)
    corpus = [dictionary.doc2bow(text) for text in all_tokens_lists]
    tfidf = models.TfidfModel(corpus, smartirs='ntc')
    corpus_tfidf = tfidf[corpus]  # TF-IDF-weighted corpus (renamed from tfidf_model: it is a transformed corpus, not a model)
    lsi_model = LsiModel(corpus=corpus_tfidf, id2word=dictionary, num_topics=7, decay=0.5)
    pprint(lsi_model.print_topics(-1, 10))
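# A minimal, self-contained sketch of projecting documents into the LSI space built
# above: applying a trained LsiModel to a (TF-IDF-weighted) corpus yields, for each
# document, a list of (topic_id, weight) pairs. The toy documents are illustrative
# assumptions, not data from this collection.
from gensim import corpora, models
from gensim.models import LsiModel

toy_docs = [['cat', 'dog', 'pet'], ['dog', 'bone', 'pet'], ['stock', 'market', 'trade']]
toy_dictionary = corpora.Dictionary(toy_docs)
toy_corpus = [toy_dictionary.doc2bow(doc) for doc in toy_docs]
toy_tfidf = models.TfidfModel(toy_corpus, smartirs='ntc')
toy_lsi = LsiModel(corpus=toy_tfidf[toy_corpus], id2word=toy_dictionary, num_topics=2)
for doc_vector in toy_lsi[toy_tfidf[toy_corpus]]:
    print(doc_vector)  # e.g. [(0, 0.71), (1, -0.05)]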
def create_gensim_lsa_model(self, doc_clean, number_of_topics, words):
    """
    Input  : clean document, number of topics and number of words associated with each topic
    Purpose: create LSA model using gensim
    Output : return LSA model
    """
    dictionary, doc_term_matrix = self.prepare_corpus(doc_clean)
    # generate LSA model
    lsamodel = LsiModel(doc_term_matrix, num_topics=number_of_topics, id2word=dictionary)  # train model
    print(lsamodel.print_topics(num_topics=number_of_topics, num_words=words))
    return lsamodel
def lsi_model(dictionary, corpus, corpus_tfidf, cluster_keyword_lsi):
    # use the LSI model to obtain the topic distribution
    lsi = LsiModel(corpus=corpus_tfidf, id2word=dictionary, num_topics=20)
    with open(cluster_keyword_lsi, 'w+', encoding='utf-8') as f_keyword:
        for topic in lsi.print_topics(20, 20):
            print(topic[0])
            words = []
            for word in topic[1].split('+'):
                # each term looks like '0.123*"word"'; strip the weight, quotes and spaces
                word = word.split('*')[1].replace(' ', '').replace('"', '')
                words.append(word)
            f_keyword.write(str(topic[0]) + '\t' + ','.join(words) + '\n')
    return lsi
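# Note: parsing the strings returned by print_topics() is fragile (term format and
# quoting can vary across gensim versions). A sketch of a more robust variant using
# show_topics(formatted=False), which yields (word, weight) pairs directly; `lsi`
# is a trained LsiModel such as the one returned by lsi_model() above.
def write_topic_keywords(lsi, out_path, num_topics=20, num_words=20):
    with open(out_path, 'w+', encoding='utf-8') as f_keyword:
        for topic_id, word_weights in lsi.show_topics(num_topics=num_topics,
                                                      num_words=num_words,
                                                      formatted=False):
            words = [word for word, weight in word_weights]
            f_keyword.write(str(topic_id) + '\t' + ','.join(words) + '\n')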
def run(self, input_file_path: str, output_file_path: str, num_topics: int) -> bool:
    """execute."""
    # input
    input_df = pd.read_csv(input_file_path, header=None, dtype=np.float64).fillna(0.0)

    # get label column
    label_column_name = input_df.columns[0]
    label_df = pd.DataFrame(input_df.iloc[:, 0], columns=[label_column_name], dtype=np.int64)
    input_df = input_df.drop(label_column_name, axis=1)

    # convert to dictionary
    input_df_column_names = input_df.columns.T.tolist()
    dict_values = input_df.T.to_dict().values()

    # make corpus
    corpus = []
    for row in dict_values:
        tmp = []
        for word, score in row.items():
            tmp.append((input_df_column_names.index(word), score))
        corpus.append(tmp)

    # lsi
    lsi_model = LsiModel(corpus, num_topics=num_topics)
    all_topics = lsi_model.print_topics(num_topics)

    # convert to list
    corpus_lsi = lsi_model[corpus]
    data = []
    for doc in corpus_lsi:
        tmp = []
        for i in range(len(all_topics)):
            try:
                tmp.append(doc[i][1])
            except IndexError:
                tmp.append(0.0)
        data.append(tmp)

    # convert to dataframe
    lsi_df = pd.DataFrame(data, dtype=np.float64)

    # join the label column and the LSI (truncated SVD) features
    output_df = pd.concat([label_df, lsi_df], axis=1)

    # save
    output_df.to_csv(output_file_path, index=False, header=False)
    return True
def __create_lsa_model(self, clean_doc, num_topics, words):
    dictionary, corpus_tfidf = self.__prepare_corpus(clean_doc)
    # generate LSA model
    lsamodel = LsiModel(corpus_tfidf, num_topics=num_topics, id2word=dictionary)
    print("Topics: ", lsamodel.print_topics(num_topics=num_topics, num_words=words))
    return lsamodel
def generate_lsa_model(documents, n_topics):
    dictionary, document_term_matrix = prepare_corpus(documents)
    print('Generating the LSA model...')
    lsaModel = LsiModel(document_term_matrix, id2word=dictionary, num_topics=n_topics)
    print(lsaModel.print_topics(num_topics=n_topics, num_words=10))
    print('Done!')
    return lsaModel
def lsi_model(sentence_dict, dictionary, corpus, corpus_tfidf, cluster_keyword_lsi, target_lt, num_cluster=11):
    '''Use the LSI model to obtain the topic distribution.'''
    lsi = LsiModel(corpus=corpus_tfidf, id2word=dictionary, num_topics=num_cluster)
    with open(cluster_keyword_lsi, 'w+') as f_keyword:
        for topic in lsi.print_topics(num_cluster, 50):
            words = []
            for word in topic[1].split('+'):
                # strip the weight, quotes and spaces from terms like '0.123*"word"'
                word = word.split('*')[1].replace(' ', '').replace('"', '')
                words.append(word)
            f_keyword.write(str(topic[0]) + '\t' + ','.join(words) + '\n')
    corpus_lsi = lsi[corpus_tfidf]
    write_results("./results_lsi.txt", corpus_lsi, target_lt)
    return lsi
def lsi_model(sentence_dict, dictionary, corpus, corpus_tfidf, cluster_keyword_lsi):
    '''
    Obtain the topic distribution by using the LSI model
    :param sentence_dict: sentence dictionary
    :param dictionary: corpus dictionary
    :param corpus: corpus as a document-term matrix
    :param corpus_tfidf: TF-IDF model of the corpus
    :param cluster_keyword_lsi: output file for the LSI topic keywords
    :return: None
    '''
    lsi = LsiModel(corpus=corpus_tfidf, id2word=dictionary, num_topics=10)
    with open(cluster_keyword_lsi, 'w+') as f_keyword:
        for topic in lsi.print_topics(num_topics=10, num_words=50):
            words = []
            for word in topic[1].split('+'):
                # strip the weight, quotes and spaces from terms like '0.123*"word"'
                word = word.split('*')[1].replace(' ', '').replace('"', '')
                words.append(word)
            f_keyword.write(str(topic[0]) + '\t' + ','.join(words) + '\n')
def LSAmodel(words, num_topics=5, num_words=5):
    dictionary = corpora.Dictionary(words)
    # Term Document Frequency
    corpus = [dictionary.doc2bow(word) for word in words]
    # save it!
    pickle.dump(corpus, open('corpus.pkl', 'wb'))
    dictionary.save('dictionary.gensim')
    # Train model
    lsimodel = LsiModel(corpus=corpus, num_topics=num_topics, id2word=dictionary)
    topics = lsimodel.print_topics(num_topics=num_topics, num_words=num_words)
    # Validation
    # Note: LsiModel has no log_perplexity() (that is an LdaModel method),
    # so perplexity is undefined for this model.
    val_perplexity = None
    # coherence score: a measure of how good the model is (higher is better for c_v)
    coherence_lsimodel = CoherenceModel(model=lsimodel, texts=words, dictionary=dictionary, coherence='c_v')
    val_coherence = coherence_lsimodel.get_coherence()
    return topics, val_perplexity, val_coherence
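# A minimal usage sketch for LSAmodel() above (the toy tokenized documents are
# illustrative assumptions; relies on the same imports as the function): compare
# c_v coherence across a few topic counts and keep the best-scoring size.
toy_texts = [['cat', 'dog', 'pet'], ['dog', 'bone', 'pet'], ['stock', 'market', 'trade']]
scores = {}
for k in (2, 3):
    _topics, _perplexity, coherence = LSAmodel(toy_texts, num_topics=k, num_words=3)
    scores[k] = coherence
best_k = max(scores, key=scores.get)
print('best num_topics by c_v coherence:', best_k)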
def create_gensim_lsa_model(doc_clean, number_of_topics, words): """ Input : clean document, number of topics and number of words associated with each topic Purpose: create LSA model using gensim Output : return LSA model """ dictionary, doc_term_matrix = prepare_corpus(doc_clean) # generate LSA model lsamodel = LsiModel(doc_term_matrix, num_topics=number_of_topics, id2word=dictionary) # train model print(lsamodel.print_topics(num_topics=number_of_topics, num_words=words)) # temp coherencemodel = CoherenceModel(model=lsamodel, texts=doc_clean, dictionary=dictionary, coherence='c_v') print(coherencemodel.get_coherence()) return lsamodel
def getLSATopics(self, doc, number_of_topics, chunk=2000, gram=(1, 2), option='c'):
    dictionary, doc_term_matrix = self.prepare_corpus(doc, gram, option)
    # generate LSA model
    lsa = LsiModel(doc_term_matrix, num_topics=number_of_topics, id2word=dictionary, chunksize=chunk)  # train model
    display(lsa.print_topics())
    # Let's take a look at which topics each transcript contains
    corpus_transformed = lsa[doc_term_matrix]
    # transform the result into a numpy array to get the score for each title
    all_topics_csr = matutils.corpus2csc(corpus_transformed)
    all_topics_numpy = all_topics_csr.T.toarray()
    # Lsa_Topic = pd.DataFrame(all_topics_numpy)
    Lsa_Topic = pd.DataFrame(all_topics_numpy, doc)
    display(Lsa_Topic.head(5))
    print('shape ', Lsa_Topic.shape)
    return Lsa_Topic
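# A short follow-on sketch (assuming the Lsa_Topic DataFrame returned above, with one
# row per document and one column per topic): since LSI scores can be negative, the
# dominant topic per document is the column with the largest absolute score.
def dominant_topics(lsa_topic_df):
    return lsa_topic_df.abs().idxmax(axis=1)  # Series: document -> dominant topic id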
num_topics = 2 + int(len(corpus) / 250)
if num_topics >= 20:
    num_topics = 10
num_words = (num_topics - 2) * 2 + 10
print('This department has %d articles in total; splitting into %d topics with %d keywords......' % (len(corpus), num_topics, num_words))
# ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=50)
# result = ldamodel.print_topics(num_topics=num_topics, num_words=num_words)
# doc_lda = ldamodel[corpus]
model = LsiModel(
    corpus,
    id2word=dictionary,
    num_topics=num_topics,
)
doc_lda = model[corpus]
result = model.print_topics(num_topics=num_topics, num_words=num_words)
time2 = time.time()
print('Model training time:', time2 - time1)
print('LSI model training complete. Inserting into the database......')  # the model here is LSI, not LDA
for n in range(len(doc_lda)):
    Topic = doc_lda[n]
    if len(Topic) == 0:
        # documents with no topic assignment get the catch-all label ("其他" = "other")
        params = (institution_paper_list[n][0], institution + "其他",
                  json.dumps({}, ensure_ascii=False), json.dumps({}, ensure_ascii=False))
        sql = 'insert into lda2 values(%s,%s,%s,%s)'
        rows = dbs.exe_sql(sql, params)  # renamed from `list` to avoid shadowing the builtin
        continue
    c1 = sorted(Topic, key=lambda x: x[1], reverse=True)
def LSI(doc_term_matrix):
    # run and train an LSI model on the document-term matrix
    # (relies on a module-level `dictionary`)
    lsimodel = LsiModel(doc_term_matrix, num_topics=25, id2word=dictionary, decay=0.5)
    lsimodel.save("lsimodel")
    pprint(lsimodel.print_topics(-1))
#corpus_lsi = lsi[corpus_tfidf]
lsi = LsiModel(corpus, id2word=dictionary, num_topics=5)  # initialize an LSI transformation
corpus_lsi = lsi[corpus]
#affection_lsi = tfidf[corpus[0]]
#affection_lsi = lsi[tfidf[corpus[0]]]
numb_lsi = lsi[corpus[3]]
#print(corpus_tfidf[0])
#print(corpus_lsi[0])
#print(affection_lsi)
index = similarities.MatrixSimilarity(lsi[corpus])
sims = index[numb_lsi]
sims = sorted(enumerate(sims), key=lambda item: -item[1])
sims = sims[:5]
for doc_id, sim in sims:
    print(doc_id, document_song_mapping[doc_id], sim)  # Python 3 print (was Python 2 syntax)
lsi.print_topics(5)
#print(affection_lsi)
#print(lsi[corpus[15]])
#print(sims)
#
#print(index)
#sims = index[tfidf[corpus[0]]]
#print(list(enumerate(sims)))
#for word_id, weight in affection:
#    print(inverted_dict[word_id], weight)
start, stop, step = 1, 12, 1  # start at 1: a model needs at least one topic
model_list = []
coherence_values = []
for num_topics in range(start, stop, step):
    # generate LSA model with the current topic count
    # (was hardcoded to num_topics=12, which made the sweep meaningless)
    model = LsiModel(doc_term_matrix, num_topics=num_topics, id2word=dictionary)  # train model
    model_list.append(model)
    coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
    coherence_values.append(coherencemodel.get_coherence())

x = range(start, stop, step)
plt.plot(x, coherence_values)
plt.xlabel("Number of Topics")
plt.ylabel("Coherence score")
plt.legend(["coherence_values"], loc='best')  # legend takes a sequence of labels, not a bare string
plt.show()

number_of_topics = 3
words = 15
model = LsiModel(doc_term_matrix, num_topics=number_of_topics, id2word=dictionary)
topics = model.print_topics(num_topics=number_of_topics, num_words=words)
print(topics)
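# A small follow-on sketch: pick the topic count with the highest c_v coherence from
# the sweep above (reuses model_list, coherence_values, start and step defined there).
best_index = max(range(len(coherence_values)), key=lambda i: coherence_values[i])
best_num_topics = start + best_index * step
best_model = model_list[best_index]
print('best num_topics:', best_num_topics, 'coherence:', coherence_values[best_index])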
def prepare_corpus(doc_clean):
    """
    Purpose: create the term dictionary of our corpus and convert the list of documents (corpus) into a Document-Term Matrix
    Output : term dictionary and Document-Term Matrix
    """
    # Creating the term dictionary of our corpus, where every unique term is assigned an index.
    dictionary = corpora.Dictionary(doc_clean)
    # Converting the list of documents (corpus) into a Document-Term Matrix using the dictionary prepared above.
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
    return dictionary, doc_term_matrix

#LSA - Topic Modelling
##Applying the model to the LONGIT corpus
number_of_topics = 1
words = 100
document_list, titles = load_data("", './corpus_files/prod_all_txt/corpus_longit.csv')
clean_text = preprocess_data(document_list)
dictionary, doc_term_matrix = prepare_corpus(clean_text)
lsamodel = LsiModel(doc_term_matrix, num_topics=number_of_topics, id2word=dictionary)  # train model
print(lsamodel.print_topics(num_topics=number_of_topics, num_words=words))
output_file = open('./corpus_files/tm_csv/topic_modelling.csv', mode='w', encoding='utf8')
output_file.write("Topic modelling of the LONGIT corpus: " + str(lsamodel.print_topics(num_topics=number_of_topics, num_words=words)))
output_file.close()
data_lemmatized = make_bigrams(data_words)
id2word = corpora.Dictionary(data_lemmatized)
texts = data_lemmatized
corpus = [id2word.doc2bow(text) for text in texts]

lda_model = LdaModel.load('lda_model_full2')
for c in lda_model[corpus[5:8]]:
    print("Document Topics      : ", c[0])
    print("Word id, Topics      : ", c[1][:3])
    print("Phi Values (word id) : ", c[2][:2])
    # use id2word (the dictionary built above; the original referenced an undefined `dct`)
    print("Word, Topics         : ", [(id2word[wd], topic) for wd, topic in c[1][:2]])
    print("Phi Values (word)    : ", [(id2word[wd], topic) for wd, topic in c[2][:2]])
    print("------------------------------------------------------\n")

lsi_model = LsiModel(corpus=corpus, id2word=id2word, num_topics=7, decay=0.5)
pprint(lsi_model.print_topics(-1))

def format_topics_sentences(ldamodel=None, corpus=corpus, texts=data):
    sent_topics_df = pd.DataFrame()
    for i, row_list in enumerate(ldamodel[corpus]):
        row = row_list[0] if ldamodel.per_word_topics else row_list
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                # note: DataFrame.append was removed in pandas 2.0; use pd.concat on newer pandas
                sent_topics_df = sent_topics_df.append(
                    pd.Series([int(topic_num), round(prop_topic, 4), topic_keywords]),
                    ignore_index=True)
            else:
                break
bow_corpus = [dictionary.doc2bow(text) for text in texts]
word_counts = [[(dictionary[id], count) for id, count in line] for line in bow_corpus]
tfidf = models.TfidfModel(bow_corpus)
doc = tfidf[bow_corpus[2]]
corpus_tfidf = tfidf[bow_corpus]

lsi_model = LsiModel(corpus_tfidf, id2word=dictionary, num_topics=250, decay=0.5)  # initialize an LSI transformation
corpus_lsi = lsi_model[corpus_tfidf]
topics = lsi_model.print_topics(5)
for topic in topics:
    print(topic)

def compute_coherence_values(dictionary, doc_term_matrix, doc_clean, stop, start=2, step=3):
    coherence_values = []
    model_list = []
    for num_topics in range(start, stop, step):
        # generate LSA model for the current topic count
        # (the original snippet was truncated here; the body below follows the same
        #  pattern as the coherence sweep used elsewhere in this collection)
        model = LsiModel(doc_term_matrix, num_topics=num_topics, id2word=dictionary)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=doc_clean, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
    return model_list, coherence_values
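# A minimal usage sketch for compute_coherence_values() above (dictionary, bow_corpus
# and texts come from the snippet above; the topic-count range is an illustrative
# assumption).
model_list, coherence_values = compute_coherence_values(dictionary, bow_corpus, texts,
                                                        stop=12, start=2, step=3)
for num_topics, coherence in zip(range(2, 12, 3), coherence_values):
    print(num_topics, 'topics -> c_v coherence', coherence)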
# Dict.txt - file with cleaned review text for further processing
mydict = corpora.Dictionary(
    simple_preprocess(line, deacc=True) for line in open('Dict.txt', encoding='utf-8'))
#print(mydict)
corpus = [
    mydict.doc2bow(simple_preprocess(line))
    for line in open('Dict.txt', encoding='utf-8')
]
#print(corpus)
lsamodel = LsiModel(corpus, num_topics=number_of_topics, id2word=mydict)  # train model
# print(lsamodel.print_topics(num_topics=number_of_topics, num_words=words))
tx = lsamodel.print_topics()
# Topic.txt - file with the main themes (parameters) from reviews that have a 5.0 rating
pprint(tx, open("Topic.txt", "w"))

# ------------------------- create vectors with word indices and TF-IDF weighting -------------------------
with open("Dict.txt", "r") as file:
    documents = file.read().splitlines()
# print(documents)
from sklearn.feature_extraction.text import CountVectorizer
count_vectorizer = CountVectorizer()
bag_of_words = count_vectorizer.fit_transform(documents)
feature_names = count_vectorizer.get_feature_names()
pprint(pd.DataFrame(bag_of_words.toarray(), columns=feature_names), open("matrix.txt", "w"))
from utils import generate_timestamp

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

timestamp = generate_timestamp()

parser = argparse.ArgumentParser()
parser.add_argument("-d", "--dictionary", help="path to wiki_en_wordids.txt")
parser.add_argument("-c", "--corpus", help="path to wiki_en_tfidf.mm")
parser.add_argument("-m", "--model", help="path to model output")
args = parser.parse_args()

# load id->word mapping (the dictionary)
id2word = Dictionary.load_from_text(bz2.BZ2File(args.dictionary))
# load corpus iterator
mm = MmCorpus(args.corpus)
print(mm)  # MmCorpus(3933461 documents, 100000 features, 612118814 non-zero entries)

# extract num_topics LSI topics; use the default one-pass algorithm
num_topics = 400
model = LsiModel(corpus=mm, id2word=id2word, num_topics=num_topics)

# print the most contributing words (both positively and negatively) for each of the first ten topics
model.print_topics(10)

model.save("%s/%s.model" % (args.model, timestamp))
def lsi2(corpus_tfidf, dictionary):
    lsi_model = LsiModel(corpus=corpus_tfidf, id2word=dictionary, num_topics=30, decay=0.5)
    for topic in lsi_model.print_topics(5, 10):
        print(topic)
LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s" DATE_FORMAT = "%m/%d/%Y %H:%M:%S %p" logging.basicConfig(level=logging.INFO, format=LOG_FORMAT, datefmt=DATE_FORMAT) # 读取字典和词袋 dictionary_en = corpora.Dictionary.load(r"Data\Intermediate\dict_en.dict") corpus_en = corpora.BleiCorpus(r"Data\Intermediate\corpus_en.blei") # tfidf模型 tfidf = TfidfModel(corpus_en) tfidf.save(r"Data\Output\tfidf_model_en") # 用tfidf模型计算tfidf的语料 tfidf_corpus = tfidf[corpus_en] corpora.BleiCorpus.serialize(r"Data\Intermediate\tfidf_corpus_en.blei", tfidf_corpus) # lsi lsi = LsiModel(corpus=tfidf_corpus, id2word=dictionary_en, num_topics=20) lsi_corpus = lsi[tfidf_corpus] lsi.save(r"Data\Output\lsi_model_en") corpora.BleiCorpus.serialize(r"Data\Intermediate\lsi_corpus_en.blei", lsi_corpus) lsi.print_topics() # lda lda = LdaModel(corpus=tfidf_corpus, id2word=dictionary_en, num_topics=20) lda_corpus = lda[tfidf_corpus] lda.save(r"Data\Output\lda_model_en") corpora.BleiCorpus.serialize(r"Data\Intermediate\lda_corpus_en.blei", lda_corpus) lda.print_topics()
print("TOPIC (LSI) " + str(topic_id) + " : ", topic) print('#' * 50) print(lsi_model.num_topics) for i in range(0, lsi_model.num_topics - 1): if lsi_model.print_topic(i): print(lsi_model.print_topic(i)) corpus_tfidf = tfidf_model[corpus] corpus_lsi = lsi_model[corpus] lsi_model_2 = LsiModel(corpus_tfidf, id2word=corpus.dictionary, num_topics=300) corpus_lsi_2 = lsi_model_2[corpus] print('完成创建模型') print('*' * 10, lsi_model_2.print_topics(5)) topic_id = 0 for topic in lsi_model_2.show_topics(): print("TOPIC (LSI2) ", str(topic_id), " : ", topic) group_topic = [doc for doc in corpus_lsi_2 if doc[topic_id][1] > 0.5] print(str(group_topic)) topic_id += 1 print("文档加工 " + str(lsi_model_2.docs_processed)) for doc in corpus_lsi_2: # 无论 bow->tfidf 还是 tfidf->lsi 实际上是在此运行 print("Doc " + str(doc)) #模型的保存 #corpus.dictionary.save("dictionary.dump")
dictionary = gensim.corpora.Dictionary(processed_docs)
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

document_num = 30
bow_doc_x = bow_corpus[document_num]
print(bow_corpus[10])
# for i in range(len(bow_doc_x)):
#     print("Word {} (\"{}\") appears {} time.".format(bow_doc_x[i][0],
#                                                      dictionary[bow_doc_x[i][0]],
#                                                      bow_doc_x[i][1]))

lsamodel = LsiModel(bow_corpus, num_topics=7, id2word=dictionary)  # train model
print(lsamodel.print_topics(num_topics=7, num_words=10))

for idx, topic in lsamodel.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic))
    print("\n")

from gensim.test.utils import datapath

# Save model to disk.
temp_file = datapath("lsa_model_optimized")
lsamodel.save(temp_file)

# Load a potentially pretrained model from disk.
df_test_jokes = pd.read_csv("JokeText.csv")
if False:
    lsamodel = gensim.models.LsiModel.load(temp_file)