def lda_model(self, num_topics: [int, None] = 10, passes: [int, None] = 1, seed: [int, None] = None):
    """
    Train one LDA topic model per year of the corpus.

    Dictionaries and corpora are built first if either is missing. A model
    is fitted for every entry of ``self.year_list`` except the last one.
    When ``seed`` is given, a single ``RandomState`` created from it is
    shared by all of the models; otherwise no random state is passed.

    Returns a ``TopicResults`` wrapping the per-year models.
    """
    if self.word_to_id is None or self.corpora is None:
        self.build_dictionaries_and_corpora()

    results = num_dict(self.year_list)

    # Only forward `random_state` when a seed was requested, so the unseeded
    # path keeps LdaModel's own default.
    optional_args = {}
    if seed is not None:
        optional_args['random_state'] = RandomState(seed)

    for year in self.year_list[:-1]:
        results[year] = LdaModel(
            corpus=self.corpora[year],
            id2word=self.word_to_id[year],
            num_topics=num_topics,
            passes=passes,
            **optional_args
        )

    return TopicResults(results, self.num_docs, self.name)
def trainModel(self):
    """
    Fit the LDA model and build the similarity index.

    When ``self.toweight`` is truthy the corpus is first passed through
    ``self.tfidf``; otherwise the raw corpus is used. The same corpus
    feeds both the model and the ``MatrixSimilarity`` index.
    """
    if self.toweight:
        training_corpus = self.tfidf[self.corpus]
    else:
        training_corpus = self.corpus

    self.model = LdaModel(training_corpus, num_topics=self.num_topics)
    self.index = MatrixSimilarity(self.model[training_corpus])
def create_models(df):
    '''
    creates/saves two LDA models (one genre, one subgenre) in a folder
    called lda_models

    Both models are trained on the same bag-of-words corpus built from the
    ``genres`` column and differ only in the number of topics (50 for
    subgenres, 10 for main genres). The output folder is created if it
    does not exist yet.
    '''
    import os

    # NOTE(review): the `df` argument is overwritten here, so whatever the
    # caller passes is ignored. Kept as-is to avoid changing behaviour for
    # existing callers -- confirm whether the parameter should be honoured.
    df = get_all_genres()

    id2word = corpora.Dictionary(df.genres)
    corpus = [id2word.doc2bow(genres) for genres in df.genres]

    # Settings shared by both models; only num_topics differs.
    shared_args = dict(corpus=corpus,
                       id2word=id2word,
                       random_state=100,
                       update_every=1,
                       passes=5,
                       alpha='auto',
                       per_word_topics=True)

    # captures subgenres with 50 categories
    subgenre_model = LdaModel(num_topics=50, **shared_args)
    # capture main genres with 10 categories
    genre_model = LdaModel(num_topics=10, **shared_args)

    # Saving fails if the target folder is missing, so make sure it exists.
    os.makedirs('lda_models', exist_ok=True)
    subgenre_model.save('lda_models/subgenre.model')
    genre_model.save('lda_models/genre.model')
def __trainingModel(self):
    """
    Train the LDA model on the TF-IDF corpus and store it in ``self.ldaModel``.

    When ``self.seed`` is set, a fresh ``numpy.random.RandomState`` built
    from it is passed as ``random_state``; otherwise no random state is
    passed to ``LdaModel``.
    """
    optional_args = {}
    # Identity check: a seed of 0 is a valid seed and must not be skipped.
    if self.seed is not None:
        optional_args['random_state'] = np.random.RandomState(self.seed)

    self.ldaModel = LdaModel(corpus=self.corpora.TfidfPair,
                             id2word=self.corpora.Dictionary,
                             num_topics=self.numTopics,
                             **optional_args)
def getCoherency(d, corp, topics=10, coherence='u-mass', varyTopics=False):
    """
    Return the u_mass coherence of an LDA model trained on ``corp``.

    Parameters
    ----------
    d : id-to-word mapping passed to ``LdaModel`` as ``id2word``.
    corp : bag-of-words corpus used for training and for scoring.
    topics : number of topics when ``varyTopics`` is False.
    coherence : accepted for backward compatibility but not used; the
        measure is always ``'u_mass'``.
    varyTopics : when True, models with 5 to 15 topics are trained and the
        best (maximum) coherence among them is returned.
    """
    def _u_mass(num_topics):
        # Train one model and score it against the training corpus.
        model = LdaModel(corp, num_topics, d)
        scorer = CoherenceModel(model=model, corpus=corp, coherence='u_mass')
        return scorer.get_coherence()

    if varyTopics:
        # Only the swept models are trained here; the single `topics`-sized
        # model is not needed on this path and is no longer built.
        return np.max([_u_mass(num_topics) for num_topics in range(5, 16)])

    return _u_mass(topics)
def getLdaFeature(documents, topicNum):
    '''
    Function:
        generate lda features by training an lda model
    Input:
        documents: list of preprocessed sentences (space-separated tokens)
        topicNum: output vector dimension
    Output:
        lda features (DataFrame format), one row per document and one
        column per topic
    '''
    # Build the bag-of-words corpus.
    tokenized = [document.split(' ') for document in documents]
    vocabulary = corpora.Dictionary(tokenized)
    bow_corpus = [vocabulary.doc2bow(tokens) for tokens in tokenized]

    # Train the LDA model on the TF-IDF weighted corpus.
    weighted_corpus = TfidfModel(bow_corpus)[bow_corpus]
    ldaModel = LdaModel(weighted_corpus,
                        num_topics=topicNum,
                        chunksize=8000,
                        passes=10,
                        random_state=12)

    # Fill the feature matrix; topics not returned for a document stay 0.
    LogInfo(' Generate LDA features...')
    feature_matrix = np.zeros((len(tokenized), topicNum))
    for row, doc in enumerate(weighted_corpus):
        doc_topics = ldaModel.get_document_topics(doc, minimum_probability=0.01)
        for entry in doc_topics:
            feature_matrix[row, entry[0]] = round(entry[1], 5)

    column_names = getColName(topicNum, "qlda")
    return pd.DataFrame(feature_matrix, columns=column_names)
def coherence_values(self, limit, start=2, step=2, random_state=24, passes=20):
    """
    Train LDA models over a range of topic counts and score each with c_v.

    One model is trained for every ``num_topics`` in
    ``range(start, limit, step)``. Progress is printed on a single,
    carriage-return-terminated line.

    Returns a tuple ``(model_list, coherence_values)`` whose entries are in
    the same order as the topic counts.
    """
    model_list = []
    scores = []
    for num_topics in range(start, limit, step):
        model = LdaModel(self.corpus,
                         num_topics=num_topics,
                         id2word=self.dictionary,
                         random_state=random_state,
                         passes=passes)
        model_list.append(model)

        coherencemodel = CoherenceModel(model=model,
                                        texts=self.doc_list,
                                        dictionary=self.dictionary,
                                        coherence="c_v")
        # Compute the score once and reuse it for both the result list and
        # the progress line instead of evaluating it twice.
        score = coherencemodel.get_coherence()
        scores.append(score)
        print("Model with # of topics", num_topics, "has coherence:", score,
              end="\r", flush=True)

    return model_list, scores
def ldamodel(self, num_topics, random_state=24, passes=20):
    """
    Train and return an ``LdaModel`` on ``self.corpus``.

    ``self.dictionary`` is used as the id-to-word mapping; the remaining
    arguments are forwarded to ``LdaModel`` unchanged.
    """
    trained = LdaModel(
        self.corpus,
        num_topics=num_topics,
        id2word=self.dictionary,
        random_state=random_state,
        passes=passes,
    )
    return trained
def find_topic(self,condition=None,n_topics=10,n_words=10,topic_model='lda',vec_model='tf',show=True,**kwargs):
    '''Topic modelling over the segmented texts.

    parameter
    ---------
    condition: boolean selector applied to the corpus, e.g. to decompose
        only positive or only negative reviews into topics
    n_topics: number of topics
    n_words: number of words reported per topic
    topic_model: topic model to use; only 'lda' / 'LDA' is supported
    vec_model: vectorisation method, 'tf' by default; 'idf' or 'tfidf'
        applies a TF-IDF transformation before training
    show: when True, print one line per topic with its keywords
    kwargs: accepted but currently unused

    return
    ------
    The list returned by ``LdaModel.show_topics(..., formatted=False)``.

    raises
    ------
    ValueError: if ``topic_model`` is not a supported model name.
    '''
    # Fail early with a clear message; previously an unsupported model name
    # fell through to an unbound `topics_keywords`.
    if topic_model not in ('lda', 'LDA'):
        raise ValueError('unsupported topic_model: {!r}'.format(topic_model))

    if condition is not None:
        texts = self.texts_seg[condition]
    else:
        texts = self.texts_seg

    # Tokenise once and reuse for both the dictionary and the corpus.
    tokenized = [doc.split(' ') for doc in texts]
    dictionary = corpora.Dictionary(tokenized)
    corpus = [dictionary.doc2bow(tokens) for tokens in tokenized]
    if vec_model in ('idf', 'tfidf'):
        tfidf = models.TfidfModel(corpus)
        corpus = tfidf[corpus]

    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=n_topics)
    topics_keywords = lda.show_topics(num_topics=n_topics, num_words=n_words, formatted=False)

    if show:
        print('\n'.join('主题 {}: {}'.format(i, ' | '.join(k[0] for k in topic[1]))
                        for i, topic in enumerate(topics_keywords)))
    return topics_keywords
def build_model(dictionary, corpus, n_topics, lemmatized_notes):
    """
    Train a gensim LDA model and a Mallet LDA model for each topic count.

    Parameters
    ----------
    dictionary : id-to-word mapping used by both model types.
    corpus : bag-of-words corpus to train on.
    n_topics : iterable of topic counts; one pair of models is trained per
        entry.
    lemmatized_notes : tokenised texts used to compute c_v coherence.

    Returns
    -------
    ``(model_mallet, coh_val_lda_mallet, model_lda, coh_val_lda)`` -- four
    lists aligned with the order of ``n_topics``.
    """
    mallet_path = 'mallet/bin/mallet'

    coh_val_lda = []
    coh_val_lda_mallet = []
    model_lda = []
    model_mallet = []

    for topic in n_topics:
        # Build LDA model
        lda_model = LdaModel(corpus=corpus,
                             id2word=dictionary,
                             num_topics=topic,
                             random_state=100,
                             update_every=1,
                             chunksize=100,
                             passes=10,
                             alpha='auto',
                             per_word_topics=True)
        # Score the model just trained (not the accumulating result list).
        coh_lda_model = CoherenceModel(model=lda_model,
                                       texts=lemmatized_notes,
                                       dictionary=dictionary,
                                       coherence='c_v')
        coh_val_lda.append(coh_lda_model.get_coherence())
        model_lda.append(lda_model)

        # Build LDA Mallet model with the current topic count, so the Mallet
        # results line up one-to-one with the gensim results.
        lda_mallet = LdaMallet(mallet_path,
                               corpus=corpus,
                               num_topics=topic,
                               id2word=dictionary)
        coh_mallet_model = CoherenceModel(model=lda_mallet,
                                          texts=lemmatized_notes,
                                          dictionary=dictionary,
                                          coherence='c_v')
        model_mallet.append(lda_mallet)
        coh_val_lda_mallet.append(coh_mallet_model.get_coherence())

    return model_mallet, coh_val_lda_mallet, model_lda, coh_val_lda
def fit_model(self, data, params, return_data=False):
    """
    Fit a gensim LDA model to `data` using parameter set `params`.

    `data` may be a matrix-like object (anything exposing ``dtype``,
    ``shape`` and ``transpose``), a ``(dictionary, corpus)`` 2-tuple, or a
    gensim corpus. A ``'dictionary'`` entry in `params` is used as the
    id-to-word mapping unless the tuple form supplies one.

    Note that the ``'dictionary'`` key is removed from `params` in place.

    Returns the model, or ``(model, (corpus, dtm))`` when `return_data`
    is true.
    """
    from gensim.models.ldamodel import LdaModel

    # pop() mutates the caller's dict; everything left is forwarded to LdaModel.
    dictionary = params.pop('dictionary', None)

    is_matrix_like = all(hasattr(data, attr) for attr in ('dtype', 'shape', 'transpose'))
    if is_matrix_like:
        dtm = data
        corpus = dtm_to_gensim_corpus(data)
    else:
        if isinstance(data, tuple) and len(data) == 2:
            dictionary, corpus = data
        else:
            corpus = data
        dtm = gensim_corpus_to_dtm(corpus)

    model = LdaModel(corpus, id2word=dictionary, **params)

    if not return_data:
        return model
    return model, (corpus, dtm)
def train_lda(n_topics, id2word_dictionary=None, documents=None, corpus=None):
    """
    Training method for LDA.

    documents is a list of lists of words/tokens. It is used to build the
    dictionary (when none is passed) and the corpus (when none is passed)
    from which the LDA topics are inferred.

    Returns ``(lda_model, id2word_dictionary, word2idx_dictionary, topics)``
    where ``topics`` holds the ``show_topic`` output for every topic index.
    """
    # Construct dictionary of words if it's not passed
    if not id2word_dictionary:
        id2word_dictionary = corpora.Dictionary(documents)

    # Reverse lookup: word -> id
    word2idx_dictionary = {
        word: index for index, word in id2word_dictionary.items()
    }

    # Construct corpus for model
    if documents and not corpus:
        corpus = [id2word_dictionary.doc2bow(tokens) for tokens in documents]

    # Cluster the documents into n_topics topics using LDA.
    lda_model = LdaModel(corpus=corpus,
                         id2word=id2word_dictionary,
                         num_topics=n_topics,
                         update_every=1,
                         chunksize=10000,
                         passes=1)

    # Default value for topn (number of top words to show by probability)
    # is 10. A high enough value should return the words covering most or
    # all of the probability mass.
    topics = []
    for topic_index in range(n_topics):
        topics.append(lda_model.show_topic(topic_index, topn=50000))

    return lda_model, id2word_dictionary, word2idx_dictionary, topics
def makeLDA(path, num_topics, num_words, passes,
            unseen_path='../../../data/LDA_data/lkmlSingleNewEmail.txt'):
    """
    Train an LDA model on a text file and print its topics.

    Each line of ``path`` is treated as one document. After training, the
    topics are pretty-printed, then the document at ``unseen_path`` is
    converted with the same dictionary and its topic distribution is
    printed.

    :param path: UTF-8 text file with one document per line
    :param num_topics: number of topics the model should look for
    :param num_words: how many words to show for each topic
    :param passes: number of passes over the data during training
    :param unseen_path: UTF-8 text file holding one new document to score
    """
    def _tokens(text):
        # Lower-cased alphanumeric words that are not stop words.
        return [word for word in text.lower().split()
                if word not in STOPWORDS and word.isalnum()]

    # Read the training documents from the `path` argument.
    with open(path, encoding='utf-8') as f:
        texts = [_tokens(document) for document in f]

    # Build a dictionary and a corpus from the word lists.
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    lda = LdaModel(corpus,
                   id2word=dictionary,
                   num_topics=num_topics,
                   passes=passes)

    pp = pprint.PrettyPrinter(indent=4)
    pp.pprint(lda.print_topics(num_words=num_words))

    with open(unseen_path, encoding='utf-8') as unseen_file:
        new_doc = unseen_file.read()
    new_bow = dictionary.doc2bow(_tokens(new_doc))

    # Feed the new document into the existing LDA model.
    pp.pprint(lda[new_bow])
def train(self, number_passes):
    """
    Train an LDA model and store it in ``self.model``.

    ``self.numerical_corpus`` is unpacked into the id-to-word mapping and
    the corpus; ``number_passes`` is forwarded as ``passes``.
    """
    id2word, bow_corpus = self.numerical_corpus
    self.model = LdaModel(
        bow_corpus,
        num_topics=self.number_topics,
        id2word=id2word,
        passes=number_passes,
    )
def trainModel():
    """
    Train a model, or load one, depending on ``args.mode``.

    * ``'Random'``  -- nothing is trained; returns ``(args.topics, 0)``.
    * ``'LDA'``     -- every regular file in ``args.input`` is read as one
      whitespace-tokenised document, an LDA model is trained and both the
      model and the dictionary are saved under ``./models``. Returns
      ``(lda, dct)``.
    * ``'loadLDA'`` -- returns the previously saved ``(lda, dct)``.

    Any other mode returns ``None``.

    Raises ``ValueError`` in ``'LDA'`` mode when ``args.input`` contains
    no files.
    """
    if args.mode == 'Random':
        return args.topics, 0

    if args.mode == 'LDA':
        # need to train on dump
        files = [
            f"{args.input}/{f}" for f in os.listdir(args.input)
            if os.path.isfile(os.path.join(args.input, f))
        ]
        if not files:
            raise ValueError(f"no input files found in {args.input!r}")

        def _read_tokens(filename):
            # One file is one document; tokens are whitespace-separated.
            with open(filename, "r", encoding='utf-8') as f:
                return f.read().split()

        # Files are streamed twice (dictionary, then corpus) so that the raw
        # token lists never have to be held in memory all at once.
        dct = Dictionary(_read_tokens(filename) for filename in files)
        corpus = [dct.doc2bow(_read_tokens(filename)) for filename in files]

        lda = LdaModel(corpus, num_topics=args.topics)

        # Make sure the target folder exists before saving.
        os.makedirs("./models", exist_ok=True)
        lda.save("./models/LDAdump.model")
        dct.save("./models/LDAdump.dct")
        return lda, dct

    if args.mode == 'loadLDA':
        return LdaModel.load("./models/LDAdump.model"), Dictionary.load(
            "./models/LDAdump.dct")

    return None
def create_LDA(comment_dict, num_topics=20, chunk_size=50, max_iter=20, from_db=True, get_data_func=None):
    """
    Train an LDA model incrementally over the preprocessed text stream.

    Non-empty documents from ``data_preprocessor`` are converted to
    bag-of-words with ``comment_dict`` and collected into chunks of
    ``chunk_size``. The first chunk creates the model; later chunks update
    it. A final, smaller chunk is trained on as well, so no documents are
    dropped.

    Returns the trained ``LdaModel``, or ``None`` if the stream produced no
    non-empty document.
    """
    def _train_on(chunk, model):
        # Create the model from the first chunk, update it afterwards.
        if model is None:
            return LdaModel(corpus=chunk,
                            num_topics=num_topics,
                            id2word=comment_dict,
                            per_word_topics=1,
                            passes=10)
        model.update(corpus=chunk)
        return model

    lda = None
    text_gen = data_preprocessor(max_iter=max_iter,
                                 from_db=from_db,
                                 get_data_func=get_data_func)
    chunk = []
    for _, stemmed_text, _ in text_gen:
        if len(stemmed_text) == 0:
            continue
        chunk.append(comment_dict.doc2bow(stemmed_text))
        if len(chunk) == chunk_size:
            lda = _train_on(chunk, lda)
            chunk = []

    # Flush the trailing partial chunk; previously these documents were
    # silently discarded.
    if chunk:
        lda = _train_on(chunk, lda)

    return lda
def train_lda(recipe_file,num_topics,output_file):
    """
    Train an LDA model on a recipe corpus and save it.

    The corpus is also serialised in Matrix Market format next to the
    model, at ``output_file + '.corpus.mm'``. ``num_topics`` is converted
    with ``int()`` before use. Returns the trained model.
    """
    recipes = RecipeCorpus(recipe_file)

    serialized_path = output_file+'.corpus.mm'
    corpora.MmCorpus.serialize(serialized_path, recipes)

    model = LdaModel(
        recipes,
        id2word=recipes.dictionary,
        num_topics=int(num_topics),
        distributed=False,
    )
    model.save(output_file)
    return model
def generate_docs_lda(self, dictionary_file_path, tfidf_file_path, lda_file_path, num_topics=100):
    """
    Build the LDA topic model file for the document library.

    The dictionary and the TF-IDF corpus are loaded from disk, a model is
    trained on them, and the model is pickled to ``lda_file_path``.
    Failures are logged rather than raised.

    :param dictionary_file_path: path of the saved dictionary
    :param tfidf_file_path: path of the TF-IDF corpus (loaded via ``MmCorpus``)
    :param lda_file_path: destination path of the pickled LDA model
    :param num_topics: number of topics to train
    :return: None
    """
    try:
        dictionary = corpora.Dictionary.load(dictionary_file_path)
        tfidf_corpus = corpora.MmCorpus(tfidf_file_path)
        # Honour the caller's topic count instead of a fixed value.
        lda = LdaModel(corpus=tfidf_corpus,
                       id2word=dictionary,
                       num_topics=num_topics,
                       update_every=0,
                       passes=20)
        with open(lda_file_path, 'wb') as f:
            pickle.dump(lda, f)
        logger.info('lda model file building finished')
    except Exception as e:
        # Deliberate best-effort: report the failure and carry on.
        logger.error('generate documents library lda file failed for %s' % str(e))
def compute_coherence_values(dictionary, corpus, texts, limit=40, start=2, step=6):
    """
    Compute c_v coherence for various numbers of topics.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Upper bound (exclusive) on the number of topics
    start : First number of topics to try
    step : Increment between successive numbers of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model
        with respective number of topics
    """
    models_so_far, scores_so_far = [], []

    for topic_count in range(start, limit, step):
        # The bare prints are progress markers between the slow steps.
        print(topic_count)
        lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=topic_count)
        print(topic_count)
        models_so_far.append(lda)
        print(topic_count)
        scorer = CoherenceModel(model=lda,
                                texts=texts,
                                dictionary=dictionary,
                                coherence='c_v')
        print(scorer)
        scores_so_far.append(scorer.get_coherence())
        print(topic_count)

    return models_so_far, scores_so_far
def malletmodel2ldamodel(mallet_model, gamma_threshold=0.001, iterations=50):
    """Convert :class:`~gensim.models.wrappers.ldamallet.LdaMallet` to
    :class:`~gensim.models.ldamodel.LdaModel`.

    An untrained gensim model is created with the Mallet model's
    ``id2word``, ``num_topics`` and ``alpha``, and the Mallet
    ``wordtopics`` matrix is then copied into its ``expElogbeta``.

    Parameters
    ----------
    mallet_model : :class:`~gensim.models.wrappers.ldamallet.LdaMallet`
        Trained Mallet model
    gamma_threshold : float, optional
        To be used for inference in the new LdaModel.
    iterations : int, optional
        Number of iterations to be used for inference in the new LdaModel.

    Returns
    -------
    :class:`~gensim.models.ldamodel.LdaModel`
        Gensim native LDA.

    """
    settings = dict(
        id2word=mallet_model.id2word,
        num_topics=mallet_model.num_topics,
        alpha=mallet_model.alpha,
        iterations=iterations,
        gamma_threshold=gamma_threshold,
        # float64 so no precision is lost when converting from MALLET
        dtype=numpy.float64,
    )
    converted = LdaModel(**settings)
    converted.expElogbeta[:] = mallet_model.wordtopics
    return converted
def setUp(self):
    """Prepare reference topics and the models used by the tests.

    The Mallet and Vowpal Wabbit models are only created when the
    ``MALLET_HOME`` / ``VOWPAL_WABBIT_PATH`` environment variables are set.
    """
    # Suppose given below are the topics which two different LdaModels come
    # up with. `topics1` is clearly better as it has a clear distinction
    # between system-human interaction and graphs. Hence both the coherence
    # measures for `topics1` should be greater.
    self.topics1 = [
        ['human', 'computer', 'system', 'interface'],
        ['graph', 'minors', 'trees', 'eps'],
    ]
    self.topics2 = [
        ['user', 'graph', 'minors', 'system'],
        ['time', 'graph', 'survey', 'minors'],
    ]
    self.ldamodel = LdaModel(corpus=corpus, id2word=dictionary, num_topics=2,
                             passes=0, iterations=0)

    mallet_home = os.environ.get('MALLET_HOME', None)
    if mallet_home:
        self.mallet_path = os.path.join(mallet_home, 'bin', 'mallet')
        self.malletmodel = LdaMallet(mallet_path=self.mallet_path,
                                     corpus=corpus,
                                     id2word=dictionary,
                                     num_topics=2,
                                     iterations=0)
    else:
        self.mallet_path = None

    vw_path = os.environ.get('VOWPAL_WABBIT_PATH', None)
    if vw_path:
        self.vw_path = vw_path
        self.vwmodel = LdaVowpalWabbit(self.vw_path,
                                       corpus=corpus,
                                       id2word=dictionary,
                                       num_topics=2,
                                       passes=0)
    else:
        logging.info(
            "Environment variable 'VOWPAL_WABBIT_PATH' not specified, skipping sanity checks for LDA Model"
        )
        self.vw_path = None
def vwmodel2ldamodel(vw_model, iterations=50):
    """Convert :class:`~gensim.models.wrappers.ldavowpalwabbit.LdaVowpalWabbit`
    to :class:`~gensim.models.ldamodel.LdaModel`.

    The hyper-parameters of the Vowpal Wabbit model are copied into a new,
    untrained gensim model, whose ``expElogbeta`` is then overwritten with
    the topics returned by ``vw_model._get_topics()``.

    Parameters
    ----------
    vw_model : :class:`~gensim.models.wrappers.ldavowpalwabbit.LdaVowpalWabbit`
        Trained Vowpal Wabbit model.
    iterations : int
        Number of iterations to be used for inference of the new
        :class:`~gensim.models.ldamodel.LdaModel`.

    Returns
    -------
    :class:`~gensim.models.ldamodel.LdaModel`.
        Gensim native LDA.

    """
    copied_settings = dict(
        num_topics=vw_model.num_topics,
        id2word=vw_model.id2word,
        chunksize=vw_model.chunksize,
        passes=vw_model.passes,
        alpha=vw_model.alpha,
        eta=vw_model.eta,
        decay=vw_model.decay,
        offset=vw_model.offset,
        iterations=iterations,
        gamma_threshold=vw_model.gamma_threshold,
        dtype=numpy.float32,
    )
    converted = LdaModel(**copied_settings)
    converted.expElogbeta[:] = vw_model._get_topics()
    return converted
def create_model(self, doc_matrix, term_dictionary, model_path, save_model=True, language='language_na'):
    """
    Creates an LDA model based on a set of documents.

    The model is stored in ``self.ldamodel`` and, unless disabled, saved
    below ``model_path`` in ``models/<language>/``. The elapsed time
    (including saving) is logged.

    :param doc_matrix: corpus passed to ``LdaModel``
    :param term_dictionary: id-to-word mapping passed to ``LdaModel``
    :param model_path: base directory for the saved model
    :param save_model: whether to save the trained model
    :param language: language tag, stored in ``self.language`` and used in
        the saved file's path
    :return LDA model:
    """
    self.language = language
    started_at = time()

    self.ldamodel = LdaModel(doc_matrix,
                             num_topics=self.num_categories,
                             id2word=term_dictionary,
                             passes=50)

    if save_model:
        file_name = '%s_%s_category_lda.model' % (language, str(self.num_categories))
        target = os.path.join(model_path, 'models', self.language, file_name)
        self.save_model(model_path=target)

    elapsed = time() - started_at
    logging.info('Training lasted: {:.2f}s'.format(elapsed))
    return self.ldamodel
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3,
                             output_dir="/home/norpheo/Documents/thesis"):
    """
    Train LDA models over a range of topic counts and score them with u_mass.

    One model is trained for every value in ``range(start, limit, step)``.
    After each model, the pair ``(topic_counts_so_far, coherence_values)``
    is pickled to ``coherence_pair_umass.pickle`` inside ``output_dir``,
    overwriting the previous file, so partial results survive an
    interrupted run.

    Parameters
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : list of tokenised input texts
    limit : upper bound (exclusive) on the number of topics
    start : first number of topics to try
    step : increment between successive numbers of topics
    output_dir : directory that receives the pickle file; defaults to the
        location that used to be hard-coded

    Returns
    -------
    (model_list, coherence_values), aligned with the topic counts.
    """
    coherence_values = []
    model_list = []
    pickle_path = os.path.join(output_dir, "coherence_pair_umass.pickle")

    for num_topics in range(start, limit, step):
        print(f"Train {num_topics}")
        model = LdaModel(corpus=corpus,
                         id2word=dictionary,
                         num_topics=num_topics)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model,
                                        texts=texts,
                                        dictionary=dictionary,
                                        coherence='u_mass')
        coherence_values.append(coherencemodel.get_coherence())

        # Checkpoint inside the loop: `num_topics` is always bound here, so
        # an empty range simply writes nothing instead of failing.
        x = range(start, num_topics + step, step)
        coherence_pairs = (x, coherence_values)
        with open(pickle_path, "wb") as handle:
            pickle.dump(coherence_pairs, handle)

    return model_list, coherence_values
def fit_lda(X, vocab, num_topics=5, passes=20):
    """
    Fit LDA from a scipy CSR matrix (X).

    ``X`` is transposed before being wrapped in ``Sparse2Corpus``, and
    ``vocab`` is enumerated to build the id-to-word mapping. Returns the
    trained ``LdaModel``.
    """
    print('fitting lda...')
    gensim_corpus = matutils.Sparse2Corpus(X.T)
    id_to_word = dict(enumerate(vocab))
    return LdaModel(gensim_corpus,
                    num_topics=num_topics,
                    passes=passes,
                    id2word=id_to_word)
def fit_LdaModel(gensim_df, id2word, num_topics, alpha, passes=15, iterations=10000, update_every=1000, chunksize=1000, minimum_topic_probability=0.05, forget_weight=0.5, distributed=True):
    """
    Train and return an ``LdaModel`` with the given hyper-parameters.

    Most arguments are forwarded under the same name. Two are renamed:
    ``minimum_topic_probability`` is passed as ``minimum_probability`` and
    ``forget_weight`` is passed as ``decay``. ``per_word_topics`` is always
    enabled.
    """
    training_options = dict(
        corpus=gensim_df,
        id2word=id2word,
        num_topics=num_topics,
        alpha=alpha,
        passes=passes,  # epochs
        iterations=iterations,
        # NOTE(review): the original comments label both of the next two
        # settings "batch size" -- confirm against the LdaModel docs.
        update_every=update_every,
        chunksize=chunksize,
        minimum_probability=minimum_topic_probability,
        decay=forget_weight,
        per_word_topics=True,
        distributed=distributed,
    )
    return LdaModel(**training_options)
def lda_extractor(corpus, dictionary, num_topics=1):
    """
    Train an ``LdaModel`` on ``corpus`` and return it.

    ``dictionary`` is used as the id-to-word mapping; all other model
    settings are left at their defaults.
    """
    return LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics)
def topic_model(df_train, df_test, topic_count=10):
    """
    Clean the tweets, train LDA on the training set and fill in topic results.

    The ``'tweet'`` column of both frames is overwritten in place with the
    cleaned text. The dictionary and the model are built from the training
    tweets only; both frames are then passed through ``fill_lda_result``.

    Returns ``(df_train, df_test)``.
    """
    ## general text clean-up first, then stop-word removal
    for cleaning_step in (general_text_processing, remove_stop_words):
        df_train['tweet'] = df_train['tweet'].map(cleaning_step)
        df_test['tweet'] = df_test['tweet'].map(cleaning_step)

    ## gensim lda
    from gensim.corpora.dictionary import Dictionary
    from gensim.models.ldamodel import LdaModel

    # Vocabulary comes from the training tweets only, added one at a time.
    dictionary = Dictionary()
    for tweet in df_train.tweet.values.tolist():
        dictionary.add_documents([tweet.split()])

    train_bow = []
    for tweet in df_train['tweet'].values.tolist():
        train_bow.append(dictionary.doc2bow(tweet.split()))

    lda_model = LdaModel(train_bow, num_topics=topic_count)

    # fill topics (test frame first, then train frame)
    df_test = fill_lda_result(df_test, lda_model, dictionary, topic_count)
    df_train = fill_lda_result(df_train, lda_model, dictionary, topic_count)

    return df_train, df_test
def malletmodel2ldamodel(mallet_model, gamma_threshold=0.001, iterations=50):
    """
    Function to convert mallet model to gensim LdaModel.

    A new gensim model is created from the mallet model's ``id2word``,
    ``num_topics`` and ``alpha``; the mallet ``wordtopics`` matrix is then
    copied into the new model's ``expElogbeta``.

    Args:
        mallet_model : Trained mallet model
        gamma_threshold : To be used for inference in the new LdaModel.
        iterations : number of iterations to be used for inference in the
            new LdaModel.

    Returns:
        model_gensim : LdaModel instance; copied gensim LdaModel
    """
    # float64 keeps full precision when converting from MALLET
    precision = numpy.float64

    gensim_copy = LdaModel(
        id2word=mallet_model.id2word,
        num_topics=mallet_model.num_topics,
        alpha=mallet_model.alpha,
        iterations=iterations,
        gamma_threshold=gamma_threshold,
        dtype=precision,
    )
    gensim_copy.expElogbeta[:] = mallet_model.wordtopics
    return gensim_copy
def vwmodel2ldamodel(vw_model, iterations=50):
    """
    Function to convert vowpal wabbit model to gensim LdaModel.

    The hyper-parameters of the vowpal wabbit model are copied into a new
    gensim model, whose ``expElogbeta`` is then overwritten with the
    result of ``vw_model._get_topics()``.

    Args:
        vw_model : Trained vowpal wabbit model.
        iterations : Number of iterations to be used for inference of the
            new LdaModel.

    Returns:
        model_gensim : LdaModel instance; copied gensim LdaModel.
    """
    hyper_parameters = {
        'num_topics': vw_model.num_topics,
        'id2word': vw_model.id2word,
        'chunksize': vw_model.chunksize,
        'passes': vw_model.passes,
        'alpha': vw_model.alpha,
        'eta': vw_model.eta,
        'decay': vw_model.decay,
        'offset': vw_model.offset,
        'iterations': iterations,
        'gamma_threshold': vw_model.gamma_threshold,
        'dtype': numpy.float32,
    }
    gensim_copy = LdaModel(**hyper_parameters)
    gensim_copy.expElogbeta[:] = vw_model._get_topics()
    return gensim_copy