def partial_fit(self, X):
    """Train model over a potentially incomplete set of documents.

    Uses the parameters set in the constructor.
    This method can be used in two ways:
    * On an unfitted model in which case the model is initialized and trained on `X`.
    * On an already fitted model in which case the model is **updated** by `X`.

    Parameters
    ----------
    X : {iterable of list of (int, number), scipy.sparse matrix}
        A collection of documents in BOW format used for training the model.

    Returns
    -------
    :class:`~gensim.sklearn_api.hdp.HdpTransformer`
        The trained model.

    """
    if sparse.issparse(X):
        X = matutils.Sparse2Corpus(sparse=X, documents_columns=False)

    if self.gensim_model is None:
        self.gensim_model = models.HdpModel(
            id2word=self.id2word, max_chunks=self.max_chunks,
            max_time=self.max_time, chunksize=self.chunksize, kappa=self.kappa,
            tau=self.tau, K=self.K, T=self.T, alpha=self.alpha,
            gamma=self.gamma, eta=self.eta, scale=self.scale,
            var_converge=self.var_converge, outputdir=self.outputdir,
            random_state=self.random_state
        )

    self.gensim_model.update(corpus=X)
    return self
def fit(self, X, y=None):
    """Fit the model according to the given training data.

    Delegates to :class:`gensim.models.HdpModel`.
    """
    if sparse.issparse(X):
        # Rows of a scikit-learn style sparse matrix are documents, so tell
        # Sparse2Corpus not to treat columns as documents.
        corpus = matutils.Sparse2Corpus(sparse=X, documents_columns=False)
    else:
        corpus = X

    self.gensim_model = models.HdpModel(
        corpus=corpus, id2word=self.id2word, max_chunks=self.max_chunks,
        max_time=self.max_time, chunksize=self.chunksize, kappa=self.kappa,
        tau=self.tau, K=self.K, T=self.T, alpha=self.alpha, gamma=self.gamma,
        eta=self.eta, scale=self.scale, var_converge=self.var_converge,
        outputdir=self.outputdir, random_state=self.random_state)
    return self
def main_new_dataset():
    newData = pd.read_csv(
        '../xsense_data/global_dataset_abs_speed_diff_yaw.txt', sep=';')
    # .ix is deprecated; .loc with a column list is the modern equivalent.
    newDataToWord = newData.loc[:, [
        'Acc_X', 'Acc_Y', 'Speed_X', 'Speed_Y', 'Diff_Yaw'
    ]]
    worder = WordData(newDataToWord)
    words = worder.create_words(worder.dataset)
    colWords = pd.Series(words, name='Word')
    wordDataset = pd.concat([newData, colWords], axis=1)
    #wordDataset.to_csv('../xsense_data/word_global_dataset.txt',sep=';')

    docs = worder.create_text_corpus(wordDataset)
    texts = [doc.lower().split() for doc in docs]
    dictionary = corpora.Dictionary(texts)
    dictionary.save('data_topic_modeling/new_dataset/doc_dictionary.dict')
    # corpus = corpora.TextCorpus(docs)
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize('data_topic_modeling/new_dataset/documents.mm',
                               corpus)

    hdp = models.HdpModel(corpus, dictionary, T=50, K=10)
    print(hdp.show_topics(num_topics=20, num_words=5))
    topicDocs = hdp[corpus]
    for x in topicDocs:
        print(x)
def query_similarity(queries, corpus, method='tfidf', n_neighbors=2):
    dictionary, corpusdic = build_corpusdic(corpus)
    if method == 'lsi':
        mdl = models.LsiModel(corpusdic, id2word=dictionary, num_topics=100)
    elif method == 'tfidf':
        mdl = models.TfidfModel(corpusdic)
    elif method == 'rp':
        mdl = models.RpModel(corpusdic, num_topics=100)
    elif method == 'hdp':
        mdl = models.HdpModel(corpusdic, id2word=dictionary)
    elif method == 'lda':
        mdl = models.LdaModel(corpusdic, id2word=dictionary, num_topics=100)
    elif method == 'lem':
        mdl = models.LogEntropyModel(corpusdic)
    elif method == 'norm':
        # Use the BOW corpus like the other branches do; the raw `corpus`
        # argument is not in BOW format.
        mdl = models.NormModel(corpusdic, norm='l2')
    else:
        raise ValueError("There is an invalid model method in the input!")
    index = similarities.MatrixSimilarity(mdl[corpusdic])
    indx_list = []
    sim_list = []
    for query in queries:
        vec_bow = dictionary.doc2bow(query.lower().split())
        vec_model = mdl[vec_bow]  # project the query into the chosen model's space
        sims = index[vec_model]
        sims = sorted(enumerate(sims), key=lambda item: -item[1])
        sims = sims[:n_neighbors]
        indx_, sim_ = np.array(sims).transpose()
        indx_list.append(indx_)
        sim_list.append(sim_)
    return indx_list, sim_list
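# Hypothetical invocation of query_similarity above. It assumes build_corpusdic
# accepts a list of raw text strings and returns a gensim Dictionary plus a BOW
# corpus; the names and data here are illustrative, not from the original source.
docs = ["human interface computer", "graph of trees", "graph minors survey"]
queries = ["graph minors", "human computer"]
idx_list, sim_list = query_similarity(queries, docs, method='hdp', n_neighbors=2)
for query, idx, sim in zip(queries, idx_list, sim_list):
    print(query, idx, sim)  # nearest document indices and their similarities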
def fit(self, X, y=None):
    """Fit the model according to the given training data.

    Parameters
    ----------
    X : {iterable of list of (int, number), scipy.sparse matrix}
        A collection of documents in BOW format used for training the model.

    Returns
    -------
    :class:`~gensim.sklearn_api.hdp.HdpTransformer`
        The trained model.

    """
    if sparse.issparse(X):
        corpus = matutils.Sparse2Corpus(sparse=X, documents_columns=False)
    else:
        corpus = X

    self.gensim_model = models.HdpModel(
        corpus=corpus, id2word=self.id2word, max_chunks=self.max_chunks,
        max_time=self.max_time, chunksize=self.chunksize, kappa=self.kappa,
        tau=self.tau, K=self.K, T=self.T, alpha=self.alpha, gamma=self.gamma,
        eta=self.eta, scale=self.scale, var_converge=self.var_converge,
        outputdir=self.outputdir, random_state=self.random_state
    )
    return self
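# Usage sketch for the fit/partial_fit wrappers above, assuming they belong to
# gensim's sklearn-style HdpTransformer (as the docstrings indicate; the class
# lived in gensim.sklearn_api in gensim 3.x). The toy corpus is illustrative.
from gensim.corpora import Dictionary
from gensim.sklearn_api import HdpTransformer

texts = [["graph", "minors", "survey"], ["graph", "trees"], ["human", "interface"]]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(t) for t in texts]

transformer = HdpTransformer(id2word=dictionary)
transformer.fit(corpus)          # train from scratch on the full corpus
transformer.partial_fit(corpus)  # later: update the same model with a new batch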
def HDP(self, print_params=True, save_model=True, save_dir='saved_models',
        filename='', **kwargs):
    '''
    Estimate a 'good' number of topics to set, based on the data.
    '''
    hdp_model = models.HdpModel(self.bow, id2word=self.gensim_dict, **kwargs)
    print('Inferring number of topics with Hierarchical Dirichlet Process...\n')
    if print_params:
        print('Parameters used in model:')
        print('TFIDF transformation: {}\n'.format(self.tfidf))
    if save_model:
        if len(filename) == 0:
            filename = 'HDP_Params_TFIDF{}_'.format(self.tfidf)
        full_path = save_folder_file(save_dir, filename, ext='.model',
                                     optional_folder='HDP')
        hdp_model.save(full_path)
        print('Saving HDP model to: \n{}\n'.format(full_path))
    return hdp_model
def create_model(corpus_path, output_path, num_topics=500, tfidf=False,
                 lda=False, lsi=False, hdp=False):
    """Creates the model(s) specified by the parameters and saves them to the
    output directory.

    Parameters:
        corpus_path: the path to the corpus directory (os.path)
        output_path: the directory path where model(s) will be saved (os.path)
        num_topics=500: number of topics for the LDA and LSI models (int)
        tfidf=False: True if a tfidf model should be created (boolean)
        lda=False: True if an lda model should be created (boolean)
        lsi=False: True if an lsi model should be created (boolean)
        hdp=False: True if an hdp model should be created (boolean)
    """
    mc = MathCorpus(corpus_path)
    mc.save_dictionary(os.path.join(output_path, "corpus.dict"))
    corpora.MmCorpus.serialize(os.path.join(output_path, "corpus.mm"), mc)
    tfidf_model = models.TfidfModel(mc)
    if tfidf:
        tfidf_model.save(os.path.join(output_path, "model.tfidf"))
    if lda:
        lda_model = models.LdaModel(mc, id2word=mc.dictionary,
                                    num_topics=num_topics)
        lda_model.save(os.path.join(output_path, "model.lda"))
    if lsi:
        lsi_model = models.LsiModel(tfidf_model[mc], id2word=mc.dictionary,
                                    num_topics=num_topics)
        lsi_model.save(os.path.join(output_path, "model.lsi"))
    if hdp:
        hdp_model = models.HdpModel(mc, id2word=mc.dictionary)
        hdp_model.save(os.path.join(output_path, "model.hdp"))
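# Example call for create_model above. MathCorpus and the directory layout come
# from the surrounding project; the paths here are placeholders.
create_model("path/to/corpus_dir", "path/to/output_dir",
             num_topics=300, tfidf=True, lda=True, hdp=True)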
def partial_fit(self, X):
    """Train the model over X, initializing it on the first call."""
    if sparse.issparse(X):
        # Rows are documents in a scikit-learn style matrix, so don't treat
        # columns as documents.
        X = matutils.Sparse2Corpus(sparse=X, documents_columns=False)

    if self.gensim_model is None:
        self.gensim_model = models.HdpModel(
            id2word=self.id2word, max_chunks=self.max_chunks,
            max_time=self.max_time, chunksize=self.chunksize,
            kappa=self.kappa, tau=self.tau, K=self.K, T=self.T,
            alpha=self.alpha, gamma=self.gamma, eta=self.eta,
            scale=self.scale, var_converge=self.var_converge,
            outputdir=self.outputdir, random_state=self.random_state)

    self.gensim_model.update(corpus=X)
    return self
def get_hdp(self):
    docs_corpus = [self.docs_dict.doc2bow(doc) for doc in self.docs]
    model_hdp = models.HdpModel(docs_corpus, id2word=self.docs_dict)
    docs_hdp = model_hdp[docs_corpus]
    docs_vecs = np.vstack(
        [sparse2full(c, len(self.docs_dict)) for c in docs_hdp])
    return docs_vecs
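# Why sparse2full is needed in get_hdp above: HDP yields a ragged list of
# (topic_id, weight) pairs per document, so each list must be padded to a
# fixed-length dense vector before stacking. A minimal standalone sketch
# (the length here is illustrative; get_hdp pads to the dictionary size):
import numpy as np
from gensim.matutils import sparse2full

doc_topics = [(0, 0.7), (3, 0.2)]      # one document's HDP output
dense = sparse2full(doc_topics, 150)   # pad to HDP's default truncation T=150
assert dense.shape == (150,) and np.isclose(dense[0], 0.7)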
def fit(self, training, training_info):
    # store training sets
    self.training = training
    self.training_info = training_info

    print("creating train tokens")
    train_tokens = training_info["tokens"].apply(
        lambda tokens: tokens.split(" ")).values.tolist()
    print("creating train dict")
    train_my_dict = dictionary.Dictionary(train_tokens)
    print("creating train corpus")
    train_corpus = [train_my_dict.doc2bow(token) for token in train_tokens]
    print("training Hdp model")
    if os.path.isfile('temp/model.hdp') and self.use_pretrained_model:
        self.hdp = models.HdpModel.load('temp/model.hdp')
    else:
        self.hdp = models.HdpModel(train_corpus, id2word=train_my_dict)
        self.hdp.save('temp/model.hdp')
    print("creating train Hdp matrix")
    self.hdp_train_matrix = np.array(
        [self.hdp[document] for document in train_corpus])
    self.address_books = create_address_books(training, training_info)
    self.mids_sender_recipient = create_dictionary_mids(
        training, training_info)
def gensim_feature(corpus=None):
    # Sample data for the corpus parameter:
    corpus = [["我", "来到", "成都", "春熙路"],
              ["今天", "在", "宽窄巷子", "耍", "了", "一天"],
              ["成都", "整体", "来说", "还是", "挺", "安逸", "的"],
              ["成都", "的", "美食", "真", "巴适", "惨", "了"]]
    dictionary = corpora.Dictionary(corpus)  # build the corpus dictionary
    # # Collect the ids of stopwords and of words that occur only once
    # stop_ids = [dictionary.token2id[stopword] for stopword in user_stop_word_list if stopword in dictionary.token2id]
    # once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1]
    # dictionary.filter_tokens(stop_ids + once_ids)  # remove stopwords and words occurring only once
    # dictionary.compactify()  # close the id gaps left by the removed words
    # dictionary.save('mycorpus.dict')  # save the dictionary for later reuse

    # Word-frequency features
    dfs = dictionary.dfs  # document-frequency dictionary
    for key_id, c in dfs.items():
        print(dictionary[key_id], c)

    # Convert each document to doc_bow
    doc_bow_corpus = [dictionary.doc2bow(doc_cut) for doc_cut in corpus]

    # TF-IDF features
    tfidf_model = models.TfidfModel(dictionary=dictionary)  # build the tfidf model
    tfidf_corpus = [tfidf_model[doc_bow] for doc_bow in doc_bow_corpus]  # convert each doc_bow to its tfidf vector

    # LSI features (latent semantic indexing)
    lsi_model = models.LsiModel(corpus=tfidf_corpus, id2word=dictionary, num_topics=100)  # build the lsi model
    # corpus of lsi
    lsi_corpus = [lsi_model[tfidf_doc] for tfidf_doc in tfidf_corpus]  # convert to lsi vectors

    # LDA features (topic model)
    lda_model = models.LdaModel(corpus=tfidf_corpus, id2word=dictionary, num_topics=100)  # build the lda model
    # corpus of lda
    lda_corpus = [lda_model[tfidf_doc] for tfidf_doc in tfidf_corpus]  # convert to lda vectors

    # Random Projections (RP; reduces dimensionality, light on CPU and memory)
    rp_model = models.RpModel(tfidf_corpus, num_topics=500)
    rp_corpus = [rp_model[tfidf_doc] for tfidf_doc in tfidf_corpus]  # convert to random-projection vectors

    # Hierarchical Dirichlet Process (HDP, a nonparametric Bayesian method)
    hdp_model = models.HdpModel(doc_bow_corpus, id2word=dictionary)
    hdp_corpus = [hdp_model[doc_bow] for doc_bow in doc_bow_corpus]  # convert to HDP vectors

    # Document and word vectors (Doc2Vec and Word2Vec)
    tld_list = []
    for ind, line_list in enumerate(corpus):
        tld_list.append(TaggedDocument(line_list, tags=[str(ind)]))
    d2v_model = Doc2Vec(tld_list, min_count=2, window=3, size=100,
                        sample=1e-3, negative=5, iter=15)
    # Training Doc2Vec also trains Word2Vec, so both models can be saved:
    # model.save(save_model_d2v_file_path)
    # model.save_word2vec_format(save_model_w2v_file_path, binary=True)
    # Convert the documents into a vector matrix
    docvecs = d2v_model.docvecs
    docvecs_matrix = np.asarray(docvecs)
    print(docvecs_matrix.shape)
def get_model(dictionary, corpus):
    """
    Returns a trained topic model (Hierarchical Dirichlet Process).
    It requires gensim objects (dictionary and corpus).

    :param dictionary: gensim object
    :param corpus: gensim object
    :return: model object
    """
    return models.HdpModel(corpus, id2word=dictionary)
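# Minimal end-to-end use of get_model with toy data (the show_topics keywords
# follow the current gensim signature):
from gensim import corpora

texts = [["cat", "dog"], ["dog", "fish"], ["cat", "fish", "bird"]]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(t) for t in texts]
hdp = get_model(dictionary, corpus)
print(hdp.show_topics(num_topics=5, num_words=3))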
def train(self):
    """
    Train the HDP model.

    :return: trained model
    """
    hdp_model = models.HdpModel(self.corpus, id2word=self.dictionary)
    return hdp_model
def init_HDP(self, tf_idf='No'):
    if tf_idf == 'Yes':
        corpus, BOW_user_queries = self.init_tfidf()
    else:
        corpus, BOW_user_queries = self.get_corpus()
    HDP = models.HdpModel(corpus, id2word=self.dictionary)
    #print(HDP.show_topics())
    corpus_HDP = HDP[corpus]
    HDP_user_queries = HDP[BOW_user_queries]
    return corpus_HDP, HDP_user_queries
def create_model(settings, model_type, bow_corpus, dictionary):
    print(f"Training {model_type} model. This may take several minutes "
          f"depending on the size of the corpus.")
    model = None
    if model_type == 'LDA':
        model = models.LdaModel(bow_corpus,
                                num_topics=settings['numberTopics'],
                                id2word=dictionary,
                                minimum_probability=settings['minimumProbability'])
    elif model_type == 'HDP':
        model = models.HdpModel(bow_corpus, dictionary)
    else:
        print('Invalid model')
        return
    save_model(settings['datasetName'], model, model_type)
    return model
def main():
    newData = pd.read_csv('../xsense_data/global_dataset.txt', sep=';')

    ############################### LONG WORD TRY ###############################
    ############################### 15 SIGNALS ###############################
    ## Choose features to represent as words
    ## All except altitude
    ## dataPartOne = newData.loc[:, 'Acc_X':'Pitch']
    ## dataPartTwo = newData.loc[:, 'Speed_X':'Speed_Z']
    ## newDataToWord = pd.concat([dataPartOne, dataPartTwo], axis=1)

    ############################### REDUCED WORD TRY ###############################
    ############################### 5 SIGNALS ###############################
    # .ix is deprecated; .loc with a column list is the modern equivalent.
    newDataToWord = newData.loc[:, ['Acc_X', 'Acc_Y', 'Acc_Z', 'Speed_X', 'Roll']]

    worder = WordData(newDataToWord)
    words = worder.create_words(worder.dataset)
    colWords = pd.Series(words, name='Word')
    wordDataset = pd.concat([newData, colWords], axis=1)
    wordDataset.to_csv('../xsense_data/word_global_dataset.txt', sep=';')

    docs = worder.create_text_corpus(wordDataset)
    #docs = ['aaabacdb abababdb addbaedb daecabdb badbccdb',
    #        'aeaaacdb abebabdb acdbaedc dbecadda addbbccb',
    #        'aeaaacdb abebabdb acdbaedc dbecadda addbbccb']
    texts = [doc.lower().split() for doc in docs]
    dictionary = corpora.Dictionary(texts)
    dictionary.save('data_topic_modeling/doc_dictionary.dict')
    # corpus = corpora.TextCorpus(docs)
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize('data_topic_modeling/documents.mm', corpus)

    hdp = models.HdpModel(corpus, dictionary, T=50, K=10)
    print(hdp.show_topics(num_topics=20, num_words=5))
    topicDocs = hdp[corpus]
    for x in topicDocs:
        print(x)

    # Convert the fitted HDP into an equivalent LDA model.
    alpha, beta = hdp.hdp_to_lda()
    print(alpha)
    lda_model = models.LdaModel(id2word=hdp.id2word, num_topics=len(alpha),
                                alpha=alpha, eta=hdp.m_eta)
    lda_model.expElogbeta = np.array(beta, dtype=np.float32)
    print(lda_model.show_topic(1))
def HDP(self, **config):
    gamma = config['gamma']
    kappa = config['kappa']
    tau = config['tau']
    K = config['K']
    T = config['T']
    eta = config['eta']
    self.model = models.HdpModel(self._dictionary.corpus,
                                 id2word=self._dictionary,
                                 gamma=gamma, kappa=kappa, tau=tau,
                                 K=K, T=T, eta=eta)
def transform(self, model='lda', ntopics=1, num_passes=1):
    #tfidf = models.TfidfModel(corpus=self.bag_of_word)
    #corpus_tfidf = tfidf[self.bag_of_word]
    ##return corpus_tfidf
    if model == 'lda':
        return models.LdaModel(self.bag_of_word, num_topics=ntopics,
                               id2word=self.__dictionary, passes=num_passes,
                               chunksize=10000, update_every=0,
                               distributed=True)
    elif model == 'hdp':
        return models.HdpModel(self.bag_of_word, id2word=self.__dictionary)
def calculate_topic_distribution():
    manage_nyt_dataset.topicsecription.remove({})
    #dictionary = corpora.Dictionary.load('tmp/dictionary.dict')
    #corp = corpora.BleiCorpus('tmp/corpus_nyt.hdp-c')
    corpus, dictionary = create_dictionary()
    hdpmodel = models.HdpModel(corpus, id2word=dictionary)
    hdpmodel_corp = hdpmodel[corpus]
    create_topic_document_distribution(hdpmodel_corp)
    # num_topics=-1 returns all topics; the old keyword names were topics/topn.
    for k in hdpmodel.show_topics(num_words=10, num_topics=-1, formatted=False):
        manage_nyt_dataset.topicsecription.insert_one({
            'topic': k[0],
            'tuple_terms': k[1]
        })
    return
def buildCorpus():
    from gensim import corpora, models, similarities
    import logging
    from getDocSparseVector import getDocumentCorpus, cleanAndTokenize
    import pickle

    directory = "/Users/Larry/Code/EpistemicAssistant/sampleWordDocs/"
    # Imports a set of comparison documents and tokenizes them.
    # Should not need to rebuild the corpus at each request...
    documents = getDocumentCorpus(directory)  # get document objects
    texts = []
    for doc in documents:
        texts.append(doc.tokenizedText)
    documentDictionary = corpora.Dictionary(texts)
    corpus = [documentDictionary.doc2bow(text) for text in texts]

    # Computes the HDP/nonparametric topic model. (The original `if 'hdp' in
    # locals()` cache check could never succeed at this point, so the model is
    # always built; a real cache should load the pickle below instead.)
    hdp = models.HdpModel(corpus, id2word=documentDictionary)

    pickle.dump(corpus, open(
        "/Users/Larry/Code/EpistemicAssistant/relevanceComputations/corpus.p",
        "wb"))  # save corpus
    pickle.dump(documentDictionary, open(
        "/Users/Larry/Code/EpistemicAssistant/relevanceComputations/documentDictionary.p",
        "wb"))  # save documentDictionary
    pickle.dump(hdp, open(
        "/Users/Larry/Code/EpistemicAssistant/relevanceComputations/hdp.p",
        "wb"))  # save hdp
    pickle.dump(documents, open(
        "/Users/Larry/Code/EpistemicAssistant/relevanceComputations/documents.p",
        "wb"))  # save documents
def LDA_LSI_hda_code(texts):
    dictionary = corpora.Dictionary(texts)
    print(dictionary)
    V = len(dictionary)
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpus_tfidf = models.TfidfModel(corpus)[corpus]
    # NOTE: the next line (kept from the original) discards the TF-IDF
    # weighting and feeds the raw BOW corpus to the models below.
    corpus_tfidf = corpus

    print('TF-IDF:')
    for c in corpus_tfidf:
        print(c)

    print('\nLSI Model:')
    lsi = models.LsiModel(corpus_tfidf, num_topics=20, id2word=dictionary)
    topic_result = [a for a in lsi[corpus_tfidf]]
    pprint(topic_result)
    print('LSI Topics:')
    pprint(lsi.print_topics(num_topics=20, num_words=10))
    similarity = similarities.MatrixSimilarity(lsi[corpus_tfidf])  # similarities.Similarity()
    print('Similarity:')
    pprint(list(similarity))

    print('\nLDA Model:')
    num_topics = 2
    lda = models.LdaModel(corpus_tfidf, num_topics=num_topics,
                          id2word=dictionary, alpha='auto', eta='auto',
                          minimum_probability=0.001, passes=10)
    doc_topic = [doc_t for doc_t in lda[corpus_tfidf]]
    print('Document-Topic:\n')
    pprint(doc_topic)
    for doc_topic in lda.get_document_topics(corpus_tfidf):
        print(doc_topic)
    for topic_id in range(num_topics):
        print('Topic', topic_id)
        # pprint(lda.get_topic_terms(topicid=topic_id))
        pprint(lda.show_topic(topic_id))
    similarity = similarities.MatrixSimilarity(lda[corpus_tfidf])
    print('Similarity:')
    pprint(list(similarity))

    # "hda" is really an HDP (Hierarchical Dirichlet Process) model.
    hda = models.HdpModel(corpus_tfidf, id2word=dictionary)
    topic_result = [a for a in hda[corpus_tfidf]]
    print('\n\nUSE WITH CARE--\nHDP Model:')
    pprint(topic_result)
    print('HDP Topics:')
    print(hda.print_topics(num_topics=20, num_words=10))
def startNLP(modelType):
    # This builds the corpus and the model, etc. It is also possible to use
    # these things prebuilt.
    from gensim import corpora, models
    import logging
    from getDocSparseVector import getDocumentCorpus

    #Declare globals
    #global documents, corpus, documentDictionary, hdp
    #reload(getDocSparseVector)
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    # Use hierarchical Dirichlet process topic modeling from gensim to compute
    # the relevance between documents.
    directory = "/Users/Larry/Code/EpistemicAssistant/sampleWordDocs/"
    # Imports a set of comparison documents and tokenizes them.
    # Should not need to rebuild the corpus at each request...
    documents = getDocumentCorpus(directory)  # get document objects
    texts = []
    for doc in documents:
        texts.append(doc.tokenizedText)
    documentDictionary = corpora.Dictionary(texts)
    corpus = [documentDictionary.doc2bow(text) for text in texts]

    # Computes the HDP/nonparametric topic models
    if modelType == 'hdp':
        currentModel = models.HdpModel(corpus, id2word=documentDictionary)
    elif modelType == 'tfidf':
        #hdp = models.HdpModel(corpus, id2word=documentDictionary)
        currentModel = models.TfidfModel(corpus, id2word=documentDictionary)
    elif modelType == 'lda':
        # Should try to figure out a good number of topics.
        currentModel = models.LdaModel(corpus, id2word=documentDictionary,
                                       num_topics=200)
    else:
        # The original printed the undefined name `currentModel` here and then
        # fell through to a NameError; fail explicitly instead.
        raise ValueError(modelType + ' not yet supported')

    return {
        'documents': documents,
        'corpus': corpus,
        "documentDictionary": documentDictionary,
        "currentModel": currentModel
    }
def gensim_Corpus(corpus=None):
    dictionary = corpora.Dictionary(corpus)

    # 1. Convert each doc_bow into a tfidf vector
    doc_bow_corpus = [dictionary.doc2bow(doc_cut) for doc_cut in corpus]
    tfidf_model = models.TfidfModel(dictionary=dictionary)
    tfidf_corpus = [tfidf_model[doc_bow] for doc_bow in doc_bow_corpus]
    print('tfidf_doc vector for each doc_bow:\n', tfidf_corpus)

    # 2. Hierarchical Dirichlet Process (HDP, a nonparametric Bayesian method)
    hdp_model = models.HdpModel(doc_bow_corpus, id2word=dictionary)
    hdp_corpus = [hdp_model[doc_bow] for doc_bow in doc_bow_corpus]  # convert to HDP vectors
    print('HDP:\n', hdp_corpus)

    # 3. Save the HDP model to disk (the original comment mislabeled it "RP")
    savepath = r'../dataSet/files/hdp_model.pkl'
    hdp_file = open(savepath, 'wb')
    pkl.dump(hdp_model, hdp_file)
    hdp_file.close()
    print('--- HDP model saved ---')
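# Counterpart to the pickling above: reload the saved HDP model later (the path
# is assumed to match savepath):
import pickle as pkl

with open(r'../dataSet/files/hdp_model.pkl', 'rb') as hdp_file:
    hdp_model = pkl.load(hdp_file)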
def topics(documents, dictionary, strategy='lda', num_topics=3,
           iterations=50, passes=1, **kwargs):
    """
    Strategies and best practices are:
    "lsi" - latent semantic indexing. Documents = tfidf_corpus. Num is 200-500 topics.
    "lda" - latent dirichlet allocation. Documents = corpus. Num is expert driven.
    "rp" - Random projections. Documents = tfidf_corpus. Num is 100-10000.
    "hdp" - Hierarchical Dirichlet Process. Documents = corpus. Num is not used.
    """
    if strategy == "lsi":
        # LsiModel does not accept iterations/passes; those are LDA parameters.
        model = models.LsiModel(documents, id2word=dictionary,
                                num_topics=num_topics, **kwargs)
    elif strategy == "lda":
        model = models.LdaModel(documents, id2word=dictionary,
                                num_topics=num_topics, iterations=iterations,
                                passes=passes, **kwargs)
    elif strategy == "rp":
        # RpModel only takes the corpus and num_topics.
        model = models.RpModel(documents, num_topics=num_topics, **kwargs)
    elif strategy == "hdp":
        model = models.HdpModel(documents, id2word=dictionary, **kwargs)
    else:
        raise ValueError("Unknown strategy: %s" % strategy)
    results = model[documents]
    return model, results
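# Illustrative call following the docstring's guidance: with strategy="hdp" the
# num_topics/iterations/passes arguments are ignored and the raw BOW corpus is
# passed in (toy data):
from gensim import corpora

texts = [["graph", "minors"], ["graph", "trees", "survey"]]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(t) for t in texts]
model, results = topics(corpus, dictionary, strategy="hdp")
for doc_topics in results:
    print(doc_topics)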
def _create_models(self):
    print("Create models ...")
    if self._model_type == "lda":
        self._topic_model = models.LdaModel(self._corpus, id2word=self._wdict,
                                            num_topics=self._num_topics)
    elif self._model_type == "lsi":
        self._topic_model = models.LsiModel(self._corpus, id2word=self._wdict,
                                            num_topics=self._num_topics)
    elif self._model_type == "hdp":
        self._topic_model = models.HdpModel(self._corpus, id2word=self._wdict)
    elif self._model_type == "none":
        self._topic_model = NullModel(self._corpus, id2word=self._wdict)
    else:
        raise SyntaxError("Invalid model_type '%s'" % self._model_type)
    self._index = similarities.MatrixSimilarity(
        self._topic_model[self._corpus])
def train_hdp_model(corpus, num_topics, id2word):
    print('Training HDP model...')
    hdp_output = open('models/hdp.txt', 'w')
    # Use the function's own parameters; the original body referenced the
    # undefined globals `corpus_tfidf` and `dictionary`.
    hdp = models.HdpModel(corpus, id2word=id2word)
    topic_result = [a for a in hdp[corpus]]
    print('HDP Model:', file=hdp_output)
    pprint(topic_result, stream=hdp_output)
    print('\nHDP Topics:', file=hdp_output)
    print(hdp.print_topics(num_topics=num_topics, num_words=5),
          file=hdp_output)
    print('Visualizing HDP similarity...')
    similarity = list(similarities.MatrixSimilarity(hdp[corpus]))
    print('\nSimilarity:', file=hdp_output)
    pprint(similarity, stream=hdp_output)
    draw_graph(similarity, 0.99, 'visualization/hdp_similarity.png')
    return similarity
def build_model(dataset, num_topics=100, is_hdp=True):
    print("generating dictionary and corpus...")
    dic = corpora.Dictionary(dataset)
    dic.filter_extremes(no_below=2)  # drop low-frequency tokens
    corpus = [dic.doc2bow(text) for text in dataset]
    print("constructing LDA model...")
    if is_hdp:
        # Fit HDP first, then convert it to an LDA model of matching size.
        hdp = models.HdpModel(corpus, id2word=dic)
        (alpha, beta) = hdp.hdp_to_lda()
        model = models.LdaModel(id2word=hdp.id2word, num_topics=len(alpha),
                                alpha=alpha, eta=hdp.m_eta)
        model.expElogbeta = np.array(beta, dtype=np.float32)
        num_topics = len(alpha)
    else:
        model = models.LdaMulticore(corpus, id2word=dic,
                                    num_topics=num_topics)
    print("saving model...")
    dic.save_as_text("topic_model/dic.txt")
    corpora.MmCorpus.serialize("topic_model/corpus.mm", corpus)
    model.save('topic_model/model.lda')
    return model, num_topics
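# Recent gensim versions wrap the hdp_to_lda() conversion used above in a
# single helper. Assuming such a version is installed (an assumption, not part
# of the original code), the if-branch reduces to:
def build_lda_via_hdp(corpus, dic):
    hdp = models.HdpModel(corpus, id2word=dic)
    return hdp.suggested_lda_model()  # LdaModel sized to HDP's inferred topics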
def get_hdp(self):
    hdp = models.HdpModel(self.gs_corpus, self.gs_dict)
    hdp_topics = hdp.get_topics()
    hdp_df = pd.DataFrame(hdp_topics)
    hdp_dfn = pd.DataFrame(hdp_df.unstack())
    hdp_dfn.reset_index(inplace=True)
    hdp_dfn.columns = ['token_id', 'topic_id', 'token_freq']
    self.db.put_table(hdp_dfn, 'hdp', if_exists='replace')

    # todo: Go the next step and extract topics whose word freqs are above a threshold
    thresh = 0.0005
    # Sometimes it's easier to use SQL than to figure out how to do something
    # like this in Pandas
    sql = """
    SELECT topic_id, GROUP_CONCAT(token_str, ' ') AS top_words
    FROM (
        SELECT topic_id, token_id
        FROM hdp
        WHERE token_freq > {}
        ORDER BY topic_id, token_freq DESC
    )
    JOIN token USING (token_id)
    GROUP BY topic_id
    """.format(thresh)
    hdp_topics = pd.read_sql_query(sql, self.db.conn)
    self.db.put_table(hdp_topics, 'hdp_topics')

    thresh = 0.005  # Note this is different from what's in config.ini
def identify_topics(self, labels, texts, verbose=False):
    if verbose:
        print('\tStart identifying topics ...')
    s_time = datetime.now()
    self.label_codes = np.unique(labels)
    for idx, l_code in enumerate(self.label_codes):
        if l_code != -1:
            all_tweets_of_cluster = " ".join(texts[labels == l_code])
            self.all_tweets_of_clusters.append(all_tweets_of_cluster)
            self.cleaned.append(all_tweets_of_cluster.split(' '))
    dictionary = corpora.Dictionary(self.cleaned)
    self.corpus = [
        dictionary.doc2bow(cleandoc) for cleandoc in self.cleaned
    ]
    self._model = models.HdpModel(self.corpus, dictionary)
    self.num_topics = self._model.get_topics().shape[0]
    if self._model is not None:
        for i, topic in self._model.show_topics(formatted=True,
                                                num_topics=self.num_topics,
                                                num_words=10):
            self.topics.append(topic)
        for i, topic in self._model.show_topics(formatted=False,
                                                num_topics=self.num_topics,
                                                num_words=10):
            self.topics_not_formatted.append(topic)
    if len(self.topics) < 5:
        print()  # debug marker kept from the original
    dur = datetime.now() - s_time
    if verbose:
        print('\tIdentifying topics was finished ({} seconds).'.format(
            dur.seconds))
def get_hdp(*args, **kwargs):
    return models.HdpModel(*args, **kwargs)
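# Because get_hdp forwards *args/**kwargs verbatim, HDP hyperparameters pass
# straight through to gensim (toy data; the parameter values are illustrative):
from gensim import corpora

texts = [["apple", "banana"], ["banana", "cherry", "date"]]
d = corpora.Dictionary(texts)
bow = [d.doc2bow(t) for t in texts]
model = get_hdp(bow, id2word=d, T=50, K=10, gamma=1.0)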