def run(self, kappa=1.0, tau=64.0, K=15, T=150, alpha=1, gamma=1, eta=0.01,
        scale=1.0, var_converge=0.0001, outputdir=None, random_state=0,
        *args, **kwargs):
    """Build an ``HdpModel`` and store it on ``self.model``.

    The model is constructed from ``self.corpus`` (passed as ``corpus``) and
    ``self.dictionary`` (passed as ``id2word``). Every named parameter of
    this method is forwarded under the same keyword, and any extra positional
    or keyword arguments are forwarded untouched.

    NOTE(review): extra positional arguments are forwarded alongside
    ``corpus=``/``id2word=`` keywords; confirm that callers never rely on
    them, since they would compete for the same leading parameters.
    """
    hyper_params = {
        "kappa": kappa,
        "tau": tau,
        "K": K,
        "T": T,
        "alpha": alpha,
        "gamma": gamma,
        "eta": eta,
        "scale": scale,
        "var_converge": var_converge,
        "outputdir": outputdir,
        "random_state": random_state,
    }
    self.model = HdpModel(
        *args,
        corpus=self.corpus,
        id2word=self.dictionary,
        **hyper_params,
        **kwargs
    )
    print("Done!\nCheckout hdp.model")
def getRelationDetailByHDP(sentence_list):
    """Cluster the sentences with HDP and rank words by summed topic weight.

    The sentences are segmented with ``segmentor.segListWithNerTag``; only
    tokens whose ``flag`` contains "v" or "n" are kept (presumably verb and
    noun tags -- confirm against the segmenter). An ``HdpModel`` is trained
    on the resulting bag-of-words corpus and the ``weight*word`` terms of its
    printed topics are summed per word.

    Returns:
        A list of ``(word, total_weight)`` tuples sorted by ascending weight.
    """
    from gensim import corpora
    from gensim.models import HdpModel

    tagged_sentences, _positions = segmentor.segListWithNerTag(sentence_list)
    words_list = [
        [pair.word for pair in pairs if "v" in pair.flag or "n" in pair.flag]
        for pairs in tagged_sentences
    ]

    dictionary = corpora.Dictionary(words_list)
    corpus = [dictionary.doc2bow(tokens) for tokens in words_list]
    hdp = HdpModel(corpus, dictionary)

    # Each printed topic is a string of "weight*word" terms joined by " + ".
    totals = {}
    for topic in hdp.print_topics():
        for term in str(topic[1]).split(" + "):
            star = term.index("*")
            word = str(term[star + 1:])
            weight = float(str(term[:star]))
            if word in totals:
                totals[word] += weight
            else:
                totals[word] = weight

    # A later step is meant to pick the high-frequency verbs and nouns found
    # by syntactic analysis.
    return sorted(totals.items(), key=lambda item: item[1])
def model_pcs(self, model_name, all_mashup_num, all_api_num):
    """Fit a gensim model on all mashup and API texts and return dense features.

    Rows ``0..all_mashup_num-1`` / ``0..all_api_num-1`` are indexed
    positionally, which the original author notes corresponds to the real
    ids. The sparse model output has the form
    ``[(0, 0.0322...), (1, 0.0236...)]`` and is scattered into zero matrices.

    Args:
        model_name: ``'HDP'`` or ``'TF_IDF'``.
        all_mashup_num: number of rows of the mashup feature matrix.
        all_api_num: number of rows of the API feature matrix.

    Returns:
        ``(mashup_features, api_features)`` as numpy arrays with
        ``self.num_topics`` columns.

    Raises:
        ValueError: if ``model_name`` is not one of the supported names.
    """
    if model_name not in ('HDP', 'TF_IDF'):
        raise ValueError('wrong gensim_model name!')

    train_corpus = self.mashup_dow + self.api_dow
    if model_name == 'HDP':
        self.model = HdpModel(train_corpus, self.dct)
        self.num_topics = self.model.get_topics().shape[0]
    else:
        self.model = TfidfModel(train_corpus)
        self.num_topics = len(self.dct)

    sparse_mashups = [self.model[bow] for bow in self.mashup_dow]
    sparse_apis = [self.model[bow] for bow in self.api_dow]

    self._mashup_hdp_features = np.zeros((all_mashup_num, self.num_topics))
    self._api_hdp_features = np.zeros((all_api_num, self.num_topics))

    def _scatter(dense, sparse_rows, row_count):
        # Copy the (column, value) pairs of each sparse row into ``dense``.
        for row in range(row_count):
            for column, value in sparse_rows[row]:
                dense[row][column] = value

    _scatter(self._mashup_hdp_features, sparse_mashups, all_mashup_num)
    _scatter(self._api_hdp_features, sparse_apis, all_api_num)
    return self._mashup_hdp_features, self._api_hdp_features
def build_lda_models(course_corpus, course_dictionary, mapping, course_texts):
    """Train four topic models over the same course corpus.

    Args:
        course_corpus: corpus handed to the gensim models.
        course_dictionary: ``id2word`` mapping for the gensim models.
        mapping: dict with the keys ``"modules"``, ``"lessons"`` and
            ``"items"``; each value maps an entity name to the collection of
            document ids belonging to it.
        course_texts: sequence of documents, indexed by document id, used as
            the labeled-LDA corpus.

    Returns:
        ``(lda_model, hdp_model, at_model, llda_model, llda_labels)`` where
        ``llda_labels`` holds, per document, its "M: ...", "L: ..." and
        "I: ..." labels.
    """
    # ==== Unsupervised LDA and HDP-LDA ====
    lda_model = LdaModel(corpus=course_corpus, id2word=course_dictionary)
    hdp_model = HdpModel(corpus=course_corpus, id2word=course_dictionary)

    # ==== Author-topic model: modules, lessons and items act as authors ====
    author_to_doc = {}
    for author_type in ("modules", "lessons", "items"):
        prefix = author_type[0].capitalize()
        for entity_name, entity_docs in mapping[author_type].items():
            author_to_doc["{}: {}".format(prefix, entity_name)] = entity_docs
    at_model = AuthorTopicModel(corpus=course_corpus,
                                id2word=course_dictionary,
                                author2doc=author_to_doc)

    # ==== Labeled LDA (explicitly supervised) ====
    llda_alpha, llda_beta, llda_iterations = 0.01, 0.001, 50
    label_sources = (("M", "modules"), ("L", "lessons"), ("I", "items"))
    llda_labels = []
    llda_corpus = []
    labelset = set()
    for course_text_id, course_text in enumerate(course_texts):
        doc_labels = []
        for prefix, author_type in label_sources:
            # Only the first entity containing the document contributes.
            for entity_name, doc_vec in mapping[author_type].items():
                if course_text_id in doc_vec:
                    doc_labels.append("{}: {}".format(prefix, entity_name))
                    break
        llda_labels.append(doc_labels)
        llda_corpus.append(course_text)
        labelset |= set(doc_labels)

    # NOTE(review): K is the number of documents here, not the number of
    # distinct labels gathered in ``labelset`` -- confirm this is intended.
    llda_model = LLDA(llda_alpha, llda_beta, K=len(llda_labels))
    llda_model.set_corpus(llda_corpus, llda_labels)
    llda_model.train(iteration=llda_iterations)

    # Debugging aid kept from the original:
    # phi = llda.phi()
    # for k, label in enumerate(labelset):
    #     print("\n-- label %d : %s" % (k + 1, label))
    #     for w in argsort(-phi[k + 1])[:10]:
    #         print("%s: %.4f" % (llda.vocas[w], phi[k + 1, w]))
    return lda_model, hdp_model, at_model, llda_model, llda_labels
def run_hdp(self, modelId, **kwargs):
    """Train an HDP model with the given options and record it.

    Args:
        modelId: key under which the trained model is stored.
        **kwargs: options forwarded to ``HdpModel`` as keyword arguments.

    Side effects:
        Prints ``kwargs`` and appends
        ``{modelId: {'model': <HdpModel>, 'args': kwargs}}`` to
        ``self.hdpModels``.
    """
    print(kwargs)
    # The options must be unpacked: passing the dict positionally would bind
    # the whole dict to HdpModel's third positional parameter instead of
    # applying the individual settings.
    hdpModel = HdpModel(self, self.dict, **kwargs)
    hdpData = {modelId: {'model': hdpModel, 'args': kwargs}}
    self.hdpModels.append(hdpData)
def test_hdp():
    """Train an HDP model and check that its pyLDAvis HTML export works.

    The generated file is removed again once it has been written.
    """
    html_path = 'index_hdp.html'
    corpus, dictionary = get_corpus_dictionary()
    model = HdpModel(corpus, dictionary.id2token)
    prepared = gensim_models.prepare(model, corpus, dictionary)
    pyLDAvis.save_html(prepared, html_path)
    os.remove(html_path)
def train_hdp_model(corpus, dictionary, chunksize):
    """Train and return an ``HdpModel`` seeded with ``config.SEED``.

    Args:
        corpus: bag-of-words corpus to train on.
        dictionary: ``id2word`` mapping for the corpus.
        chunksize: number of documents per training chunk.

    Returns:
        The trained ``HdpModel``. Topic words can be read from it with
        ``model.show_topics(num_topics=..., formatted=False)``.
    """
    print('HDP model')
    # The previous version also collected the top words of ten topics into a
    # local list that was never returned or used; that dead work is dropped.
    return HdpModel(corpus=corpus,
                    id2word=dictionary,
                    chunksize=chunksize,
                    random_state=config.SEED)
def topicsHDP(self, num_topics=-1, topn=20):
    """Train an HDP model on ``self.corpus`` and return its topics.

    Args:
        num_topics: how many topics to return; a negative value means
            "all topics".
        topn: how many of the most probable words to list per topic.

    Returns:
        The unformatted topic listing produced by ``HdpModel.show_topics``.
    """
    hdp = HdpModel(corpus=self.corpus, id2word=self.id2word)
    # Older gensim releases treated a negative count as "all topics"; newer
    # ones clamp it to zero and return nothing. Resolve it explicitly to the
    # model's top-level truncation size so both behave the same.
    if num_topics < 0:
        num_topics = hdp.m_T
    # The first two parameters were renamed between gensim releases
    # (topics/topn -> num_topics/num_words) but kept their positions, so they
    # are passed positionally to work with either naming.
    return hdp.show_topics(num_topics, topn, formatted=False)
def hierarchical_dirichlet_process(corpus, num_topics, id2word):
    """Train and return a Hierarchical Dirichlet Process topic model.

    HDP is fully unsupervised: it determines the number of topics it needs
    through posterior inference, which is why ``num_topics`` is accepted for
    interface compatibility but not used.

    Args:
        corpus: bag-of-words corpus to train on.
        num_topics: unused.
        id2word: mapping from word ids to words.

    Returns:
        The trained ``HdpModel``.
    """
    # print() call instead of the Python 2 print statement, which is a
    # SyntaxError on Python 3.
    print('Hierarchical Dirichlet Process')
    # The previous version called show_topics() twice and discarded both
    # results; those calls are dropped.
    return HdpModel(corpus=corpus, id2word=id2word)
def build_hdp(self):
    """Train an HDP model on ``self.get_bows()`` and keep it in ``self.hdp``.

    Progress and the elapsed wall-clock time are printed to stdout.
    """
    print("building HDP model...")
    started_at = time.time()
    self.hdp = HdpModel(corpus=self.get_bows(), id2word=self.dict)
    elapsed = time.time() - started_at
    print("HDP finished! {:.2f} seconds".format(elapsed))
def hdp(corpus, dictionary, docs, score=False):
    """Train an HDP model and optionally compute its ``c_v`` coherence.

    Args:
        corpus: bag-of-words corpus to train on.
        dictionary: ``id2word`` mapping for the corpus.
        docs: tokenised texts, used for the coherence computation.
        score: when true, also compute and print the coherence score.

    Returns:
        The trained model when ``score`` is false, otherwise the tuple
        ``(model, coherence_score)``.
    """
    print('Traiing for {} documents ......'.format(len(corpus)))
    hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)
    if not score:
        return hdpmodel

    print('calculating coherence socre for {} documents ......'.format(len(docs)))
    coherence_score = CoherenceModel(
        model=hdpmodel,
        texts=docs,
        dictionary=dictionary,
        coherence='c_v',
    ).get_coherence()
    print('\nCoherence Score: ', coherence_score)
    return hdpmodel, coherence_score
def create_hdp(num_topic, dictionary):
    """Train an HDP model on the generated corpus and print its topics.

    Args:
        num_topic: number of topics to print.
        dictionary: input handed to ``generate_corpus``, which returns the
            corpus and the dictionary actually used for training.

    Returns:
        The trained ``HdpModel``.
    """
    print("__________________________Create HDP_________________________")
    corpus, dic = generate_corpus(dictionary)
    hdpmodel = HdpModel(corpus=corpus, id2word=dic)
    # Show each topic with its seven leading words.
    for topic in hdpmodel.print_topics(num_topics=num_topic, num_words=7):
        print(topic)
    return hdpmodel
def get_topics(self, corpus, vocabulary, num_words=10):
    """Train an HDP model and return all of its topics, unformatted.

    Args:
        corpus: bag-of-words corpus to train on.
        vocabulary: ``id2word`` mapping for the corpus.
        num_words: number of words to list per topic.

    Returns:
        The unformatted topic listing produced by ``HdpModel.show_topics``.
    """
    hdpmodel = HdpModel(corpus=corpus, id2word=vocabulary)
    # Passing num_topics=-1 does not mean "all topics": the value is clamped
    # to zero and the result is an empty list. Request the model's top-level
    # truncation size (m_T) instead, which covers every topic it holds.
    return hdpmodel.show_topics(
        num_topics=hdpmodel.m_T,
        num_words=num_words,
        formatted=False,
    )
def model_pcs(self, model_name, LDA_topic_num=None):
    """Fit the chosen gensim model and return dense mashup/API features.

    The training corpus depends on ``self.mashup_only`` (mashups only, or
    mashups plus APIs) and ``self.strict_train`` (training subset, or all
    texts). The sparse model output has the form
    ``[(0, 0.0322...), (1, 0.0236...)]``.

    Args:
        model_name: ``'HDP'``, ``'TF_IDF'`` or ``'LDA'``.
        LDA_topic_num: topic count for LDA; the gensim default is used when
            it is ``None``.

    Returns:
        ``(mashup_features, api_features)`` as numpy arrays with
        ``self.num_topics`` columns.

    Raises:
        ValueError: if ``model_name`` is not one of the supported names.
    """
    if self.mashup_only:
        train_corpus = self.train_mashup_dow if self.strict_train else self.mashup_dow
    elif self.strict_train:
        train_corpus = self.train_mashup_dow + self.train_api_dow
    else:
        train_corpus = self.mashup_dow + self.api_dow

    if model_name == 'HDP':
        self.model = HdpModel(train_corpus, self.dct)
        self.num_topics = self.model.get_topics().shape[0]
        print('num_topics', self.num_topics)
    elif model_name == 'TF_IDF':
        self.model = TfidfModel(train_corpus)
        self.num_topics = len(self.dct)
    elif model_name == 'LDA':
        lda_options = {} if LDA_topic_num is None else {'num_topics': LDA_topic_num}
        self.model = LdaModel(train_corpus, **lda_options)
        self.num_topics = self.model.get_topics().shape[0]
    else:
        raise ValueError('wrong gensim_model name!')

    # Run every text through the model, then convert the sparse output into
    # a regular numpy array that has a value for every topic.
    self.mashup_features = [self.model[bow] for bow in self.mashup_dow]
    print('self.mashup_features, num:', len(self.mashup_features))
    zero_num1 = sum(len(feature) == 0 for feature in self.mashup_features)
    print('zero_num1', zero_num1)
    # Show the raw bag-of-words of every mashup that received no feature.
    for position, feature in enumerate(self.mashup_features):
        if len(feature) == 0:
            print(self.mashup_dow[position])

    self.api_features = [self.model[bow] for bow in self.api_dow]

    self._mashup_features = np.zeros((meta_data.mashup_num, self.num_topics))
    self._api_features = np.zeros((meta_data.api_num, self.num_topics))
    # Only some dimensions carry a value, so scatter them into dense rows.
    for row in range(meta_data.mashup_num):
        for column, value in self.mashup_features[row]:
            self._mashup_features[row][column] = value
    for row in range(meta_data.api_num):
        for column, value in self.api_features[row]:
            self._api_features[row][column] = value
    return self._mashup_features, self._api_features
def train_topics(args):
    """Train an HDP topic model on story texts and save it.

    Args:
        args: mapping with the keys ``"text"`` (input handed to
            ``extract_stories``) and ``"target"`` (path the model is saved
            to).

    The texts are lemmatised with spaCy, keeping nouns, adjectives, verbs and
    adverbs that are neither punctuation nor stop words, then passed through
    bigram and trigram phrase models before the dictionary, corpus and model
    are built.
    """
    print(f"Arguments: {args}")
    nlp = spacy.load("en", disable=["parser", "ner"])
    lines = extract_stories(args["text"])

    keep_pos = {'NOUN', 'ADJ', 'VERB', 'ADV'}
    docs = []
    for parsed in nlp.pipe(lines):
        docs.append([
            token.lemma_
            for token in parsed
            if token.pos_ in keep_pos
            and not token.is_punct
            and not token.is_stop
        ])
    print("Preprocessed Docs")

    # Phrase detection: learn bigrams, then trigrams on the bigrammed text.
    bigram = gensim.models.Phrases(docs, min_count=5, threshold=100)
    trigram = gensim.models.Phrases(bigram[docs], threshold=100)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)
    docs = [bigram_mod[doc] for doc in docs]
    docs = [trigram_mod[bigram_mod[doc]] for doc in docs]

    print("Create Dictionary")
    corpus_dict = corpora.Dictionary(docs)
    # Term/document frequencies in bag-of-words form.
    corpus = [corpus_dict.doc2bow(doc) for doc in docs]

    print("Train Model")
    hdp = HdpModel(corpus, corpus_dict)
    print(hdp.print_topics(num_topics=50, num_words=20))
    hdp.save(args["target"])
def stream_topic_model(self, topic: Topic, dictionary: corpora.Dictionary = None, corpus: IndexedCorpus = None, num_topics=20, max_topics_per_doc=5):
    """Build a topic model for ``topic`` and pickle it to the model path.

    Args:
        topic: topic whose ``topic_id`` is logged and whose model path is
            obtained from ``self._get_model_path``.
        dictionary: dictionary to use; loaded with ``self.load_dictionary()``
            when ``None``.
        corpus: corpus to train on; ``JsonLinesCorpus(self.file_corpus)`` is
            used when ``None``.
        num_topics: number of topics (LDA) or overall topic limit ``T`` (HDP).
        max_topics_per_doc: per-document topic limit ``K`` (HDP only).

    Raises:
        ValueError: if ``self.model`` is neither ``"lda"`` nor ``"hdp"``.
    """
    # Compare against None rather than relying on truthiness: an explicitly
    # supplied but empty dictionary/corpus must not be silently replaced by
    # the defaults, and the truthiness test may need to compute a length.
    if dictionary is None:
        dictionary = self.load_dictionary()
        logger.warning(
            "the default dictionary was loaded from file. "
            "You should keep an instance in memory instead of calling this in a loop..."
        )
    if corpus is None:
        corpus = JsonLinesCorpus(self.file_corpus)
        logger.warning(
            "the default corpus was loaded from file. You should provide a "
            "reduced corpus to increase performance (see corpus2corpus)")

    # build the model
    logger.info(
        "building a topic model with {} topics for {} documents in topic '{}'"
        .format(num_topics, len(corpus), topic.topic_id))
    t0 = time.time()
    if self.model == "lda":
        model = LdaMulticore(corpus,
                             id2word=dictionary.id2token,
                             num_topics=num_topics,
                             passes=2,
                             iterations=50,
                             chunksize=2000,
                             workers=self.n_threads)
    elif self.model == "hdp":
        # T = overall topic limit, K = max topics per document
        model = HdpModel(corpus,
                         id2word=dictionary.id2token,
                         T=num_topics,
                         K=max_topics_per_doc)
    else:
        raise ValueError("Unknown model identifier '{}'".format(self.model))
    t1 = time.time()

    # serialize
    logger.info(
        "building the model took {:.1f} s. Serializing model...".format(
            t1 - t0))
    output_path = self._get_model_path(topic)
    with util.open_by_ext(output_path, 'wb') as fp:
        pickle.dump(model, fp, protocol=4)
    logger.info(
        "model dump finished, took {:.1f} s".format(time.time() - t1))
def hdpmodel(self, corpus_t, save=False, savename=None):
    """Train an HDP model on ``corpus_t`` using ``self.word_dict``.

    :param corpus_t: corpus to train on.
    :param save: when true, the trained model is written to ``savename``.
    :param savename: destination passed to the model's ``save`` method.
    :return: the trained ``HdpModel``.
    """
    print('using Hierarchical Dirichlet Process model...')
    model = HdpModel(corpus=corpus_t, id2word=self.word_dict)
    if not save:
        return model
    print('输出hdp模型到文件:{}'.format(savename))
    model.save(savename)
    return model
def _new_model(self, X=None, y=None):
    """Create an ``HdpModel`` for ``X`` from this object's hyper-parameters.

    Each listed option is read from the attribute of the same name on
    ``self`` and forwarded as a keyword argument. ``y`` is accepted but not
    used.
    """
    option_names = (
        'max_chunks', 'max_time', 'chunksize', 'kappa', 'tau', 'K', 'T',
        'alpha', 'gamma', 'eta', 'scale', 'var_converge', 'outputdir',
        'random_state',
    )
    options = {name: getattr(self, name) for name in option_names}
    return HdpModel(X, **options)
def build_hdp_vec(docs, targets, dct=None, hdp=None):
    """Turn documents into dense HDP topic vectors.

    Args:
        docs: iterable of documents, each an iterable of tokens; tokens are
            converted to ``str``.
        targets: passed through unchanged.
        dct: existing dictionary to reuse; when ``None`` (training set) a new
            one is built from ``docs``.
        hdp: existing model to reuse; when ``None`` (training) a new one is
            trained on the corpus.

    Returns:
        ``(corpus, dense_vectors, targets, dct, hdp)`` where ``corpus`` is
        the bag-of-words corpus and ``dense_vectors`` has one row per
        document.
    """
    docs = [[str(token) for token in doc] for doc in docs]
    if dct is None:  # train set
        # Dictionary(docs) already registers every document. The previous
        # version added each document a second time, which doubled the
        # dictionary's document and token counts without adding any id.
        dct = Dictionary(docs)
    corpus = [dct.doc2bow(doc) for doc in docs]
    if hdp is None:  # train
        hdp = HdpModel(corpus, dct)
    topic_vectors = [hdp[bow] for bow in corpus]
    # NOTE(review): the dense width is the vocabulary size although the rows
    # hold topic ids -- kept for output-shape compatibility; confirm whether
    # the topic count was intended.
    dense_vectors = matutils.corpus2dense(
        topic_vectors, num_terms=len(dct.token2id)).T
    return corpus, dense_vectors, targets, dct, hdp
def train_and_save_gensim_model(model_type_str, corpus, dct, file_name='model_300.model', num_topics=None):
    """Train the requested gensim model, save it and return it.

    Args:
        model_type_str: ``"lsi"``, ``"lda"`` or ``"hdp"``.
        corpus: corpus to train on.
        dct: ``id2word`` mapping for the corpus.
        file_name: path the trained model is saved to.
        num_topics: topic count for LSI and LDA; not used by HDP.

    Returns:
        The trained model.

    Raises:
        ValueError: if ``model_type_str`` is not a supported model type.
    """
    if model_type_str == "lsi":
        model = LsiModel(corpus=corpus, num_topics=num_topics, id2word=dct)
    elif model_type_str == "lda":
        model = LdaModel(corpus=corpus,
                         alpha='auto',
                         num_topics=num_topics,
                         id2word=dct)
    elif model_type_str == "hdp":
        model = HdpModel(corpus=corpus, id2word=dct)
    else:
        # Without this branch an unknown type reached model.save() with
        # ``model`` unbound and failed with an unrelated UnboundLocalError.
        raise ValueError(
            "unknown model type {!r}; expected 'lsi', 'lda' or 'hdp'".format(
                model_type_str))
    model.save(file_name)
    return model
def get_hdp_model(doc_term_matrix, id2word, fname):
    """Return an HDP model, trained now or loaded from disk.

    When ``params['training']`` is truthy a model is trained on
    ``doc_term_matrix`` and saved through ``_save_model``; otherwise it is
    loaded through ``_load_model``. ``fname`` names the stored model in both
    cases.
    """
    if not params['training']:
        return _load_model('hdp', fname)

    hdp_model = HdpModel(
        corpus=doc_term_matrix,
        id2word=id2word,
        max_chunks=10000,
        chunksize=2000,
        kappa=0.6,
        tau=32.0,
        eta=0.05,
    )
    _save_model('hdp', hdp_model, fname=fname)
    return hdp_model
def run_model(corpus, dictionary, method='LDA', num_topics=8):
    """Train and return the topic model selected by ``method``.

    Args:
        corpus: corpus to train on.
        dictionary: ``id2word`` mapping for the corpus.
        method: ``'LDA'``, ``'LSI'`` or ``'HDP'``.
        num_topics: topic count for LDA and LSI; not used by HDP.

    Returns:
        The trained model, or ``None`` when ``method`` is not recognised.
    """
    if method == 'LDA':
        return LdaModel(corpus=corpus,
                        num_topics=num_topics,
                        id2word=dictionary,
                        iterations=400,
                        passes=20)
    if method == 'LSI':
        return LsiModel(corpus=corpus,
                        num_topics=num_topics,
                        id2word=dictionary)
    if method == 'HDP':
        # Extension of LDA for when the number of topics is unknown.
        return HdpModel(corpus=corpus, id2word=dictionary)
    return None
def topic_modeling(paratext, typel='hdp', numtopics=20, npasses=100):
    """Clean the texts, build a corpus and train an HDP or LDA model.

    Args:
        paratext: iterable of raw documents; each is passed to ``clean`` and
            split on whitespace.
        typel: ``'hdp'`` or ``'lda'``.
        numtopics: topic count for LDA.
        npasses: number of training passes for LDA.

    Returns:
        ``(model, dictionary, doc_term_matrix)``.

    Any other ``typel`` prints a message and exits with status 1.
    """
    tokenized_docs = [clean(doc).split() for doc in paratext]
    dictionary = corpora.Dictionary(tokenized_docs)
    doc_term_matrix = [dictionary.doc2bow(tokens) for tokens in tokenized_docs]

    if typel == 'hdp':
        topic_model = HdpModel(corpus=doc_term_matrix, id2word=dictionary)
    elif typel == 'lda':
        topic_model = gensim.models.ldamodel.LdaModel(
            doc_term_matrix,
            num_topics=numtopics,
            id2word=dictionary,
            passes=npasses,
            minimum_probability=0.0,
        )
    else:
        print('invalid option in topic_modeling()')
        exit(1)
    return topic_model, dictionary, doc_term_matrix
def runModels(self, number_of_topics, corpus, dictionary, start, end):
    """Train HDP and LDA models, report their topics and export a pyLDAvis page.

    Args:
        number_of_topics: topic count, given as an int or a numeric string.
        corpus: bag-of-words corpus to train on.
        dictionary: ``id2word`` mapping for the corpus.
        start: label used in the result output and in the HTML file name.
        end: label used in the result output and in the HTML file name.

    Side effects:
        Calls ``self.printResults`` for both models, saves the LDA model as
        ``lda<number_of_topics>.model`` and writes the visualisation below
        ``<pn>/topic_model_results``.
    """
    # The value was previously used both as a string ('lda' + value) and as
    # an integer (num_topics=value), so one of the two uses failed whichever
    # type was passed. Normalise once and convert explicitly where needed.
    topic_count = int(number_of_topics)

    # HDP model
    hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)
    hdpmodel.print_topics(num_topics=topic_count, num_words=10)
    hdptopics = hdpmodel.show_topics(num_topics=topic_count)
    self.printResults(number_of_topics, hdptopics, 'hdp', start, end)

    # LDA model
    ldamodel = LdaModel(corpus=corpus,
                        num_topics=topic_count,
                        id2word=dictionary,
                        random_state=100,
                        update_every=1,
                        chunksize=100,
                        passes=10,
                        alpha='auto',
                        per_word_topics=True)
    ldamodel.save('lda' + str(number_of_topics) + '.model')
    ldatopics = ldamodel.show_topics(num_topics=topic_count)
    self.printResults(number_of_topics, ldatopics, 'lda', start, end)

    # Visualise the LDA output as HTML.
    visualisation = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
    location = os.path.join(pn, 'topic_model_results')
    pyLDAvis.save_html(
        visualisation,
        os.path.join(
            location, 'LDA_Visualization' + str(number_of_topics) + "_" +
            start + "_" + end + '.html'))
def gensimTopicModelingAnalysis(self, n):
    """Run HDP topic modelling over the cluster text files and print topics.

    The ``Cluster<number>_*.txt`` files are ordered by their cluster number,
    the ordered list is written to a text file, and the texts returned by
    ``n.readMultipleFileLineWise`` are used to train the model.

    Args:
        n: helper object providing ``readMultipleFileLineWise(files)``.
    """
    def cluster_number(path):
        # Numeric part between the "Cluster" prefix and the first "_".
        tail = path.split(
            '/Users/advaitbalaji/Downloads/IslandAnalysis/Atleast2/Cluster')[1]
        return int(tail.split('_')[0])

    files = sorted(
        glob.glob("/Users/advaitbalaji/Downloads/IslandAnalysis/Atleast2/*.txt"),
        key=cluster_number)

    with open("/Users/advaitbalaji/Desktop/ListofSortedClusters.txt", "w") as listing:
        listing.writelines(path + "\n" for path in files)

    texts, _clusters = n.readMultipleFileLineWise(files)
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(tokens) for tokens in texts]
    hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)
    print(hdpmodel.show_topics())
def model_pcs(self, model_name, LDA_topic_num=None):
    """Fit the chosen gensim model and return dense mashup/API features.

    The same corpus can be processed with different models one after
    another. The training corpus depends on ``self.mashup_only`` and
    ``self.strict_train``. The sparse model output has the form
    ``[(0, 0.0322...), (1, 0.0236...)]``.

    Args:
        model_name: ``'HDP'``, ``'TF_IDF'`` or ``'LDA'``.
        LDA_topic_num: topic count for LDA; the gensim default is used when
            it is ``None``.

    Returns:
        ``(dense_mashup_features, dense_api_features)`` as numpy arrays with
        ``self.num_topics`` columns.

    Raises:
        ValueError: if ``model_name`` is not one of the supported names.
    """
    if self.mashup_only:
        train_corpus = self.train_mashup_dow if self.strict_train else self.mashup_dow
    elif self.strict_train:
        train_corpus = self.train_mashup_dow + self.api_dow
    else:
        train_corpus = self.mashup_dow + self.api_dow

    if model_name == 'HDP':
        self.model = HdpModel(train_corpus, self.dct)
        self.num_topics = self.model.get_topics().shape[0]
        print('num_topics', self.num_topics)
    elif model_name == 'TF_IDF':
        self.model = TfidfModel(train_corpus)
        self.num_topics = len(self.dct)
    elif model_name == 'LDA':
        lda_options = {} if LDA_topic_num is None else {'num_topics': LDA_topic_num}
        self.model = LdaModel(train_corpus, **lda_options)
        self.num_topics = self.model.get_topics().shape[0]
    else:
        raise ValueError('wrong gensim_model name!')

    # Run the texts through the model to get sparse feature vectors, then
    # convert them to a regular numpy layout with a value for every topic.
    # mashup_dow and api_dow hold the texts of all mashups/APIs by default,
    # so the resulting feature lists can be indexed by the global ids.
    self.mashup_features = [self.model[bow] for bow in self.mashup_dow]
    self.api_features = [self.model[bow] for bow in self.api_dow]

    self.dense_mashup_features = np.zeros(
        (data_repository.get_md().mashup_num, self.num_topics))
    self.dense_api_features = np.zeros(
        (data_repository.get_md().api_num, self.num_topics))
    # Only some dimensions carry a value, so scatter them into dense rows.
    for row in range(data_repository.get_md().mashup_num):
        for column, value in self.mashup_features[row]:
            self.dense_mashup_features[row][column] = value
    for row in range(data_repository.get_md().api_num):
        for column, value in self.api_features[row]:
            self.dense_api_features[row][column] = value
    return self.dense_mashup_features, self.dense_api_features
def comparison(texts):
    """Train LSI, HDP and LDA on ``texts`` and plot their topic coherence.

    Each model's topics are printed, the coherence of the LSI and HDP topics
    (first ten) and of all LDA topics is computed with a window size of 10,
    and the three values are shown as a bar chart.

    Args:
        texts: tokenised documents.
    """
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(tokens) for tokens in texts]

    lsimodel = LsiModel(corpus=corpus, num_topics=2, id2word=dictionary)
    print('LSI Model output')
    print(lsimodel.show_topics())

    hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)
    print('hdp model output')
    print(hdpmodel.show_topics())

    ldamodel = LdaModel(corpus=corpus, num_topics=2, id2word=dictionary)
    print('LDA Model output')
    print(ldamodel.show_topics())

    pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)

    def _topic_words(model):
        # Word lists of the model's topics, without ids and probabilities.
        return [[word for word, _prob in topic]
                for _topic_id, topic in model.show_topics(formatted=False)]

    def _coherence_of(topics):
        return CoherenceModel(topics=topics,
                              texts=texts,
                              dictionary=dictionary,
                              window_size=10).get_coherence()

    lsitopics = _topic_words(lsimodel)
    hdptopics = _topic_words(hdpmodel)
    ldatopics = _topic_words(ldamodel)

    lsi_coherence = _coherence_of(lsitopics[:10])
    hdp_coherence = _coherence_of(hdptopics[:10])
    lda_coherence = _coherence_of(ldatopics)

    def _plot_coherences(coherences, indices):
        # One bar per model, labelled with the model name.
        assert len(coherences) == len(indices)
        positions = np.arange(len(coherences))
        plt.bar(positions, coherences, width=0.2, tick_label=indices,
                align='center')
        plt.xlabel('Models')
        plt.ylabel('Coherence Value')
        plt.show()

    _plot_coherences([lsi_coherence, hdp_coherence, lda_coherence],
                     ['LSI', 'HDP', 'LDA'])
def createHDP(self, fileName='', modelName=''):
    '''
    Train an HDP model from a stored dictionary and corpus and save it.

    fileName  -> base name of the dictionary (.dict) and corpus (.mm) files
                 below the destination directory; defaults to the instance's
                 file name when empty
    modelName -> base name of the saved model (.hdp); defaults to the
                 instance's file name when empty
    '''
    if fileName == '':
        fileName = self.__fileName
    if modelName == '':
        modelName = self.__fileName

    # ``dictionary`` rather than ``dict`` so the builtin is not shadowed.
    dictionary = corpora.Dictionary.load(self.__destination + fileName + '.dict')
    mm = corpora.MmCorpus(self.__destination + fileName + '.mm')

    hdp = HdpModel(corpus=mm, id2word=dictionary)
    hdp.save(self.__destination + modelName + '.hdp')

    # print() calls instead of Python 2 print statements, which are a
    # SyntaxError on Python 3. The message names the model that was actually
    # written, which differs from the file name when modelName is given.
    print(hdp)
    print('Created HDP model %s' % modelName)
def extract_topic_model(corpus, dictionary, model, num_topics):
    """Train and return the topic model selected by ``model``.

    Args:
        corpus: corpus to train on.
        dictionary: ``id2word`` mapping for the corpus.
        model: ``'lsi'``, ``'lda'`` or ``'hdp'``.
        num_topics: topic count for LSI and LDA; not used by HDP.

    Returns:
        The trained model, or ``None`` when ``model`` is not recognised.
    """
    if model == 'lsi':
        return LsiModel(corpus=corpus,
                        num_topics=num_topics,
                        id2word=dictionary)
    if model == 'lda':
        lda_options = dict(alpha='auto',
                           eta='auto',
                           iterations=800,
                           num_topics=num_topics,
                           passes=20,
                           eval_every=None)
        return LdaModel(corpus=corpus, id2word=dictionary, **lda_options)
    if model == 'hdp':
        return HdpModel(corpus=corpus, id2word=dictionary)
    return None
def train(self, path, num_topics=20, iterations=1000, n_gram=True, lemmatization=True, stop_words=True, tfidf=True, model='lda'):
    """Train the topic cluster model.

    The data is loaded from ``path + '/output/data.csv'`` and its
    ``'content'`` column is used as the text collection.

    Args:
        path: directory prefix containing ``output/data.csv``.
        num_topics: (int) the number of topics (LDA and LSI); for HDP it is
            replaced by the number of topics the model reports.
        iterations: (int) total number of iterations (LDA).
        n_gram: accepted for compatibility; not used by this method.
        lemmatization: forwarded to ``self._preprocess`` as ``lemma``.
        stop_words: forwarded to ``self._preprocess`` as ``stop_words``.
        tfidf: when truthy, the corpus is TF-IDF weighted before training.
        model: ``'lda'``, ``'lsi'`` or ``'hdp'``.

    Raises:
        ValueError: if ``model`` is not one of the supported names.

    example:
        >>> lda = LDA_Model
        >>> lda.train(text)
    """
    # Fail fast: an unknown name used to leave self.model unset (or stale
    # from a previous run) and only surfaced later as an unrelated error.
    if model not in ('lda', 'lsi', 'hdp'):
        raise ValueError(
            "unknown model {!r}; expected 'lda', 'lsi' or 'hdp'".format(model))

    data = load_data(str(path + '/output/data.csv'))
    self.original_data = data
    self.text = list(data['content'])
    self.num_topics = num_topics
    self.iterations = iterations
    self.model_name = model

    print('preprocessing...')
    self.token = self._preprocess(self.text,
                                  lemma=lemmatization,
                                  stop_words=stop_words)
    self.id2word = Dictionary(self.token)
    self.corpus = [self.id2word.doc2bow(text) for text in self.token]

    if tfidf:
        print('calculate tfidf...')
        tfidf_model = TfidfModel(self.corpus)
        self.corpus = tfidf_model[self.corpus]

    if model == 'lda':
        self.model = LdaModel(corpus=self.corpus,
                              id2word=self.id2word,
                              num_topics=self.num_topics,
                              iterations=self.iterations)
    elif model == 'lsi':
        self.model = LsiModel(corpus=self.corpus,
                              id2word=self.id2word,
                              num_topics=self.num_topics)
    else:  # 'hdp'
        self.model = HdpModel(corpus=self.corpus, id2word=self.id2word)
        # HDP chooses its own topic count, so read it back from the model.
        self.num_topics = self.model.get_topics().shape[0]

    self.topic_key = pd.DataFrame(self._topic_key(),
                                  columns=['topic_id', 'key_words'])
    self.doc_topic = self._doc_topic()
    self.topic_doc = pd.DataFrame(self._topic_doc(),
                                  columns=['topic_id', 'document_id'])
    self.topic_sent = pd.DataFrame(
        self._readable_topic(),
        columns=['topic_id', 'most relative sentence'])