def getRelationDetailByHDP(sentence_list):
    # Obtain relation candidates via clustering.
    corpus = []
    pairs_all, position_all = segmentor.segListWithNerTag(sentence_list)
    words_list = []
    for pairs in pairs_all:
        # Keep only verbs and nouns.
        word_list = [pair.word for pair in pairs if "v" in pair.flag or "n" in pair.flag]
        words_list.append(word_list)
    # words_list = list(map(lambda pairs: map(lambda x: x.word, pairs), pairs_all))
    from gensim import corpora
    dictionary = corpora.Dictionary(words_list)
    for words in words_list:
        corpus.append(dictionary.doc2bow(words))
    from gensim.models import HdpModel
    hdp = HdpModel(corpus, dictionary)
    topics = hdp.print_topics()
    # Each topic string looks like "0.016*word + ...": split it and accumulate
    # the weight of every word across topics.
    words = {}
    for topic in topics:
        word_details = str(topic[1]).split(" + ")
        for word_detail in word_details:
            word = str(word_detail[word_detail.index("*") + 1:])
            num = float(word_detail[:word_detail.index("*")])
            if word not in words:
                words[word] = num
            else:
                words[word] += num
    words = sorted(words.items(), key=lambda d: d[1])
    return words  # afterwards, extract the high-frequency verbs/nouns from the syntactic analysis
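# An alternative sketch (not taken from any snippet in this collection): with a
# trained gensim HdpModel, show_topics(formatted=False) yields
# (topic_id, [(word, weight), ...]) pairs, so the per-word weights can be
# accumulated without splitting the print_topics() strings on " + " and "*".
from collections import defaultdict

def aggregate_topic_words(hdp, num_topics=20, num_words=20):
    weights = defaultdict(float)
    for _, word_weights in hdp.show_topics(num_topics=num_topics,
                                           num_words=num_words,
                                           formatted=False):
        for word, weight in word_weights:
            weights[word] += weight
    # Sort ascending by accumulated weight, mirroring getRelationDetailByHDP above.
    return sorted(weights.items(), key=lambda kv: kv[1])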
def fit(self, df_original, topics):
    # Create Dictionary
    self.dictionary = self._create_dictionary(df_original)
    # Create corpus
    self.corpus = self._create_corpus(df_original)
    # Train Model
    hdp = HdpModel(self.corpus, id2word=self.dictionary, T=topics)
    self.model = hdp.suggested_lda_model()
    feature_vecs = []
    for i in range(len(self.corpus)):
        top_topics = self.model.get_document_topics(self.corpus[i],
                                                    minimum_probability=0.0)
        topic_vec = [0] * topics
        for j in top_topics:
            index = j[0]
            topic_vec[index] = j[1]
        feature_vecs.append(topic_vec)
    df_lda_reduced = pd.DataFrame(feature_vecs,
                                  columns=list(range(len(feature_vecs[0]))))
    df_lda_reduced.insert(0, 'Name', list(df_original['Name'].values), False)
    df_lda_reduced = df_lda_reduced.sort_values(by=['Name'])
    return df_lda_reduced
def hierarchical_dirichlet_process(corpus, num_topics, id2word):
    '''
    HIERARCHICAL DIRICHLET PROCESS
    Advantage of HDP: fully unsupervised; it can determine the ideal number of
    topics through posterior inference.
    '''
    print('Hierarchical Dirichlet Process')
    hdp_model = HdpModel(corpus=corpus, id2word=id2word)
    hdp_model.show_topics()
    hdp_topic = hdp_model.show_topics(formatted=False)
    return hdp_model
def train_hdp_model(corpus, dictionary, chunksize):
    print('HDP model')
    model = HdpModel(corpus=corpus,
                     id2word=dictionary,
                     chunksize=chunksize,
                     random_state=config.SEED)
    # To get the topic words from the model
    topics = []
    for topic_id, topic in model.show_topics(num_topics=10, formatted=False):
        topic = [word for word, _ in topic]
        topics.append(topic)
    return model
def topicsHDP(self, num_topics=-1, topn=20):
    # HdpModel(corpus, id2word, max_chunks=None, max_time=None, chunksize=256,
    #          kappa=1.0, tau=64.0, K=15, T=150, alpha=1, gamma=1, eta=0.01,
    #          scale=1.0, var_converge=0.0001, outputdir=None)
    hdp = HdpModel(corpus=self.corpus, id2word=self.id2word)
    # show_topics(topics=20, topn=20, log=False, formatted=True)
    # Print the topN most probable words for `topics` number of topics.
    # Set topics=-1 to print all topics. Set formatted=True to return the topics
    # as a list of strings, or False as lists of (weight, word) pairs.
    return hdp.show_topics(topics=num_topics, topn=topn, formatted=False)
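# Note (version-dependent assumption, not from the snippet above): newer gensim
# releases renamed the show_topics() parameters from topics/topn to
# num_topics/num_words, so the equivalent call there would read, roughly:
#     hdp.show_topics(num_topics=num_topics, num_words=topn, formatted=False)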
def create_hdp(num_topic, dictionary):
    print("__________________________Create HDP_________________________")
    corpus, dic = generate_corpus(dictionary)
    hdpmodel = HdpModel(corpus=corpus, id2word=dic)
    topics = hdpmodel.print_topics(num_topics=num_topic, num_words=7)  # see list of topics
    for topic in topics:
        print(topic)
    return hdpmodel
def get_topics(self, corpus, vocabulary, num_words=10):
    hdpmodel = HdpModel(corpus=corpus, id2word=vocabulary)
    # The docs say num_topics=-1 returns all topics (ordered by significance)
    # and that num_words is optional, e.g. .print_topics(num_topics=20, num_words=10).
    # In practice print_topics() came back empty with -1, so don't rely on it there.
    topics = hdpmodel.show_topics(formatted=False, num_words=num_words,
                                  num_topics=-1)
    # print(hdpmodel.get_topics().shape)
    return topics
def train_topics(args):
    print(f"Arguments: {args}")
    nlp = spacy.load("en", disable=["parser", "ner"])
    files = args["text"]
    lines = extract_stories(files)

    def tokenize(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
        allowed_postags = set(allowed_postags)
        docs = nlp.pipe(texts)
        text_tokens = []
        for doc in docs:
            tokens = [
                token.lemma_ for token in doc
                if token.pos_ in allowed_postags
                and not token.is_punct and not token.is_stop
            ]
            text_tokens.append(tokens)
        return text_tokens

    docs = tokenize(lines, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
    print("Preprocessed Docs")

    bigram = gensim.models.Phrases(docs, min_count=5, threshold=100)
    trigram = gensim.models.Phrases(bigram[docs], threshold=100)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)

    def make_bigrams(texts):
        return [bigram_mod[doc] for doc in texts]

    def make_trigrams(texts):
        return [trigram_mod[bigram_mod[doc]] for doc in texts]

    docs = make_bigrams(docs)
    docs = make_trigrams(docs)

    print("Create Dictionary")
    # Create Dictionary
    corpus_dict = corpora.Dictionary(docs)
    # Create Corpus
    texts = docs
    # Term Document Frequency
    corpus = [corpus_dict.doc2bow(text) for text in texts]

    print("Train Model")
    hdp = HdpModel(corpus, corpus_dict)
    print(hdp.print_topics(num_topics=50, num_words=20))
    hdp.save(args["target"])
class HDPModel(Model, Transformer):

    def __init__(self, corpus=None, **kwargs):
        self._m = HdpModel(corpus, **kwargs)

    def fit(self, corpus):
        self._m.update(corpus)

    def transform(self, corpus):
        return self._m[corpus]

    @property
    def inst(self):
        return self._m
def hdpmodel(self, corpus_t, save=False, savename=None):
    """
    :param corpus_t: bag-of-words corpus to train on
    :param save: whether to save the trained model to disk
    :param savename: file name to save the model under
    :return: the trained HdpModel
    """
    print('using Hierarchical Dirichlet Process model...')
    hdpmodel = HdpModel(corpus=corpus_t, id2word=self.word_dict)
    if save:
        print('writing the HDP model to file: {}'.format(savename))
        hdpmodel.save(savename)
    return hdpmodel
def load(self, path='default'):
    """
    :param path: the path of the trained model.
    :return:
    """
    if path == 'default':
        path = 'model'

    file_list = os.listdir(path)
    for file in file_list:
        if file.endswith('.model'):
            self.model_name = file.split('.')[0]

    if self.model_name == 'lda':
        self.model = LdaModel.load(str(path + '/lda.model'))
    if self.model_name == 'lsi':
        self.model = LsiModel.load(str(path + '/lsi.model'))
    if self.model_name == 'hdp':
        self.model = HdpModel.load(str(path + '/hdp.model'))

    self.id2word = self.model.id2word
    if self.model_name == 'hdp':
        self.num_topics = self.model.get_topics().shape[0]
    else:
        self.num_topics = self.model.num_topics
    # self.iterations = self.model.iterations

    with open(str(path + '/original_data.pickle'), 'rb') as f:
        self.original_data = pickle.load(f)
    with open(str(path + '/text.pickle'), 'rb') as f:
        self.text = pickle.load(f)
    with open(str(path + '/token.pickle'), 'rb') as f:
        self.token = pickle.load(f)
    with open(str(path + '/corpus.pickle'), 'rb') as f:
        self.corpus = pickle.load(f)

    path = path + '/result'
    with open(str(path + '/topic_key.pickle'), 'rb') as f:
        self.topic_key = pickle.load(f)
    with open(str(path + '/doc_topic.pickle'), 'rb') as f:
        self.doc_topic = pickle.load(f)
    with open(str(path + '/topic_doc.pickle'), 'rb') as f:
        self.topic_doc = pickle.load(f)
    with open(str(path + '/topic_sent.pickle'), 'rb') as f:
        self.topic_sent = pickle.load(f)

    self.id2word = self.model.id2word
    if self.model_name == 'hdp':
        self.num_topics = self.topic_doc.shape[0]
    else:
        self.num_topics = self.model.num_topics
def model_pcs(self, model_name, all_mashup_num, all_api_num):
    # Indexing from 0 to all_num actually yields a mapping keyed by the real ids!
    # HDP result format: [(0, 0.032271167132309014), (1, 0.02362695056720504)]
    if model_name == 'HDP':
        self.model = HdpModel(self.mashup_dow + self.api_dow, self.dct)
        self.num_topics = self.model.get_topics().shape[0]
    elif model_name == 'TF_IDF':
        self.model = TfidfModel(self.mashup_dow + self.api_dow)
        self.num_topics = len(self.dct)
    else:
        raise ValueError('wrong gensim_model name!')

    mashup_hdp_features = [self.model[mashup_info] for mashup_info in self.mashup_dow]
    api_hdp_features = [self.model[api_info] for api_info in self.api_dow]

    # Densify the sparse (topic_id, weight) pairs into fixed-size feature matrices.
    self._mashup_hdp_features = np.zeros((all_mashup_num, self.num_topics))
    self._api_hdp_features = np.zeros((all_api_num, self.num_topics))
    for i in range(all_mashup_num):
        for index, value in mashup_hdp_features[i]:
            self._mashup_hdp_features[i][index] = value
    for i in range(all_api_num):
        for index, value in api_hdp_features[i]:
            self._api_hdp_features[i][index] = value
    return self._mashup_hdp_features, self._api_hdp_features
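# The densification loops above can also be written with gensim's matutils
# helper. A minimal, self-contained sketch (illustrative variable names, not
# taken from the class above):
from gensim.matutils import corpus2dense

sparse_features = [[(0, 0.7), (2, 0.3)], [(1, 1.0)]]  # (topic_id, weight) pairs per document
num_topics = 3
dense = corpus2dense(sparse_features, num_terms=num_topics).T  # shape: (num_docs, num_topics)
assert dense.shape == (2, 3)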
def run(self, kappa=1.0, tau=64.0, K=15, T=150, alpha=1, gamma=1, eta=0.01,
        scale=1.0, var_converge=0.0001, outputdir=None, random_state=0,
        *args, **kwargs):
    # K is the per-document topic truncation, T the overall topic truncation.
    self.model = HdpModel(corpus=self.corpus,
                          id2word=self.dictionary,
                          kappa=kappa, tau=tau, K=K, T=T,
                          alpha=alpha, gamma=gamma, eta=eta,
                          scale=scale, var_converge=var_converge,
                          outputdir=outputdir, random_state=random_state,
                          *args, **kwargs)
    print("Done!\nCheckout hdp.model")
def build_lda_models(course_corpus, course_dictionary, mapping, course_texts):
    # ==== Train Unsupervised LDA ====
    lda_model = LdaModel(corpus=course_corpus, id2word=course_dictionary)

    # ==== Train Unsupervised HDP-LDA ====
    hdp_model = HdpModel(corpus=course_corpus, id2word=course_dictionary)

    # ==== Train Author Topic Model ====
    # author topic LDA (authors are modules, lessons, items)
    author_to_doc = {}
    for author_type in ["modules", "lessons", "items"]:
        entity_to_doc = mapping[author_type]
        for entity_name, entity_docs in entity_to_doc.items():
            author_to_doc["{}: {}".format(author_type[0].capitalize(), entity_name)] = entity_docs
    at_model = AuthorTopicModel(corpus=course_corpus,
                                id2word=course_dictionary,
                                author2doc=author_to_doc)

    # ==== Train Labeled LDA ====
    # explicitly supervised, labeled LDA
    llda_alpha = 0.01
    llda_beta = 0.001
    llda_iterations = 50
    llda_labels = []
    llda_corpus = []
    labelset = set()
    for course_text_id in range(0, len(course_texts)):
        doc_labels = []
        # get module label name
        for module_name, doc_vec in mapping["modules"].items():
            if course_text_id in doc_vec:
                doc_labels.append("M: {}".format(module_name))
                break
        # get lesson label name
        for lesson_name, doc_vec in mapping["lessons"].items():
            if course_text_id in doc_vec:
                doc_labels.append("L: {}".format(lesson_name))
                break
        for item_name, doc_vec in mapping["items"].items():
            if course_text_id in doc_vec:
                doc_labels.append("I: {}".format(item_name))
                break
        llda_labels.append(doc_labels)
        llda_corpus.append(course_texts[course_text_id])
        labelset = labelset.union(doc_labels)

    llda_model = LLDA(llda_alpha, llda_beta, K=len(llda_labels))
    llda_model.set_corpus(llda_corpus, llda_labels)
    llda_model.train(iteration=llda_iterations)
    # phi = llda.phi()
    # for k, label in enumerate(labelset):
    #     print("\n-- label %d : %s" % (k + 1, label))
    #     for w in argsort(-phi[k + 1])[:10]:
    #         print("%s: %.4f" % (llda.vocas[w], phi[k + 1, w]))
    return lda_model, hdp_model, at_model, llda_model, llda_labels
def run_hdp(self, modelId, **kwargs):
    print(kwargs)
    # `self` is passed as the corpus here (the class is presumably iterable as a
    # bag-of-words corpus); the extra options are forwarded as keyword arguments
    # rather than as a single positional dict.
    hdpModel = HdpModel(self, self.dict, **kwargs)
    hdpData = {modelId: {'model': hdpModel, 'args': kwargs}}
    self.hdpModels.append(hdpData)
def test_hdp():
    """Trains an HDP model and tests the html outputs."""
    corpus, dictionary = get_corpus_dictionary()
    hdp = HdpModel(corpus, dictionary.id2token)
    data = gensim_models.prepare(hdp, corpus, dictionary)
    pyLDAvis.save_html(data, 'index_hdp.html')
    os.remove('index_hdp.html')
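# Aside (not part of the test above): gensim builds Dictionary.id2token lazily,
# so it can be empty until some id has been looked up. set_model() further down
# works around this with a dummy dictionary[0] access; a self-contained sketch
# of the same trick on toy data:
from gensim.corpora import Dictionary
from gensim.models import HdpModel

toy_docs = [["alpha", "beta"], ["beta", "gamma", "delta"]]
toy_dictionary = Dictionary(toy_docs)
toy_corpus = [toy_dictionary.doc2bow(d) for d in toy_docs]
_ = toy_dictionary[0]  # force id2token to be populated before handing it to the model
toy_hdp = HdpModel(toy_corpus, toy_dictionary.id2token)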
def runModels(self, number_of_topics, corpus, dictionary, start, end):
    # HDP model
    hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)
    hdpmodel.print_topics(num_topics=int(number_of_topics), num_words=10)
    hdptopics = hdpmodel.show_topics(num_topics=int(number_of_topics))
    # result_dict = addTotalTermResults(hdptopics)  # add results to total kept in a list
    # addToResults(result_dict)

    # Output results.
    self.printResults(number_of_topics, hdptopics, 'hdp', start, end)

    # LDA model
    ldamodel = LdaModel(corpus=corpus,
                        num_topics=number_of_topics,
                        id2word=dictionary,
                        random_state=100,
                        update_every=1,
                        chunksize=100,
                        passes=10,
                        alpha='auto',
                        per_word_topics=True)
    ldamodel.save('lda' + str(number_of_topics) + '.model')
    ldatopics = ldamodel.show_topics(num_topics=int(number_of_topics))
    # result_dict = addTotalTermResults(ldatopics)
    # addToResults(result_dict)
    self.printResults(number_of_topics, ldatopics, 'lda', start, end)

    # Visualize outputs in html.
    visualisation = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
    location = os.path.join(pn, 'topic_model_results')
    pyLDAvis.save_html(
        visualisation,
        os.path.join(location,
                     'LDA_Visualization' + str(number_of_topics) + "_" + start + "_" + end + '.html'))
def gensimTopicModelingAnalysis(self, n):
    files = glob.glob("/Users/advaitbalaji/Downloads/IslandAnalysis/Atleast2/*.txt")
    files = sorted(
        files,
        key=lambda x: int(
            x.split('/Users/advaitbalaji/Downloads/IslandAnalysis/Atleast2/Cluster')[1].split('_')[0]))
    with open("/Users/advaitbalaji/Desktop/ListofSortedClusters.txt", "w") as of:
        for f in files:
            of.writelines(f + "\n")
    texts, clusters = n.readMultipleFileLineWise(files)
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)
    print(hdpmodel.show_topics())
def build_hdp(self):
    """Builds an HDP model of the corpus."""
    print("building HDP model...")
    start = time.time()
    self.hdp = HdpModel(corpus=self.get_bows(), id2word=self.dict)
    end = time.time()
    print("HDP finished! {:.2f} seconds".format(end - start))
def hdp(corpus, dictionary, docs, score=False):
    print('Training for {} documents ......'.format(len(corpus)))
    hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)
    if score:
        print('calculating coherence score for {} documents ......'.format(len(docs)))
        coherence_model = CoherenceModel(model=hdpmodel, texts=docs,
                                         dictionary=dictionary, coherence='c_v')
        coherence_score = coherence_model.get_coherence()
        print('\nCoherence Score: ', coherence_score)
        return hdpmodel, coherence_score
    return hdpmodel
def comparison(texts):
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    lsimodel = LsiModel(corpus=corpus, num_topics=2, id2word=dictionary)
    print('LSI Model output')
    print(lsimodel.show_topics())

    hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)
    print('hdp model output')
    print(hdpmodel.show_topics())

    ldamodel = LdaModel(corpus=corpus, num_topics=2, id2word=dictionary)
    print('LDA Model output')
    print(ldamodel.show_topics())

    pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)

    lsitopics = [[word for word, prob in topic]
                 for topicid, topic in lsimodel.show_topics(formatted=False)]
    hdptopics = [[word for word, prob in topic]
                 for topicid, topic in hdpmodel.show_topics(formatted=False)]
    ldatopics = [[word for word, prob in topic]
                 for topicid, topic in ldamodel.show_topics(formatted=False)]

    lsi_coherence = CoherenceModel(topics=lsitopics[:10], texts=texts,
                                   dictionary=dictionary, window_size=10).get_coherence()
    hdp_coherence = CoherenceModel(topics=hdptopics[:10], texts=texts,
                                   dictionary=dictionary, window_size=10).get_coherence()
    lda_coherence = CoherenceModel(topics=ldatopics, texts=texts,
                                   dictionary=dictionary, window_size=10).get_coherence()

    def evaluate_bar_graph(coherences, indices):
        assert len(coherences) == len(indices)
        n = len(coherences)
        x = np.arange(n)
        plt.bar(x, coherences, width=0.2, tick_label=indices, align='center')
        plt.xlabel('Models')
        plt.ylabel('Coherence Value')
        plt.show()

    evaluate_bar_graph([lsi_coherence, hdp_coherence, lda_coherence],
                       ['LSI', 'HDP', 'LDA'])
def createHDP(self, fileName='', modelName=''):
    '''
    fileName  -> file for the dictionary (.dict) and corpus (.mm) files
    modelName -> model name for the HDP model to save to disk
    '''
    if fileName == '':
        fileName = self.__fileName
    if modelName == '':
        modelName = self.__fileName

    dictionary = corpora.Dictionary.load(self.__destination + fileName + '.dict')
    mm = corpora.MmCorpus(self.__destination + fileName + '.mm')

    hdp = HdpModel(corpus=mm, id2word=dictionary)
    hdp.save(self.__destination + modelName + '.hdp')
    print(hdp)
    print('Created HDP model %s' % self.__fileName)
def train(self, path, num_topics=20, iterations=1000, n_gram=True,
          lemmatization=True, stop_words=True, tfidf=True, model='lda'):
    """
    Train the topic cluster model.
    Input value:
        data: pd.DataFrame format ['id','title','content','summary']
        num_topics: (int) the number of topics
        iterations: (int) total number of iteration times
    example:
    >>> lda = LDA_Model
    >>> lda.train(text)
    """
    data = load_data(str(path + '/output/data.csv'))
    self.original_data = data
    self.text = list(data['content'])
    self.num_topics = num_topics
    self.iterations = iterations
    self.model_name = model

    print('preprocessing...')
    self.token = self._preprocess(self.text, lemma=lemmatization, stop_words=stop_words)
    self.id2word = Dictionary(self.token)
    self.corpus = [self.id2word.doc2bow(text) for text in self.token]

    if tfidf == True:
        print('calculate tfidf...')
        tfidf_model = TfidfModel(self.corpus)
        self.corpus = tfidf_model[self.corpus]

    if model == 'lda':
        self.model = LdaModel(corpus=self.corpus,
                              id2word=self.id2word,
                              num_topics=self.num_topics,
                              iterations=self.iterations)
    if model == 'lsi':
        self.model = LsiModel(corpus=self.corpus,
                              id2word=self.id2word,
                              num_topics=self.num_topics)
    if model == 'hdp':
        self.model = HdpModel(corpus=self.corpus, id2word=self.id2word)
        # HDP infers the number of topics itself; read it back from the model.
        self.num_topics = self.model.get_topics().shape[0]

    self.topic_key = pd.DataFrame(self._topic_key(), columns=['topic_id', 'key_words'])
    self.doc_topic = self._doc_topic()
    self.topic_doc = pd.DataFrame(self._topic_doc(), columns=['topic_id', 'document_id'])
    self.topic_sent = pd.DataFrame(self._readable_topic(), columns=['topic_id', 'most relative sentence'])
def add_topics(args):
    print(args)
    nlp = spacy.load("en", disable=["parser", "ner"])

    def tokenize(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
        allowed_postags = set(allowed_postags)
        docs = nlp.pipe(texts)
        text_tokens = []
        for doc in docs:
            tokens = [token.lemma_ for token in doc
                      if token.pos_ in allowed_postags
                      and not token.is_punct and not token.is_stop]
            text_tokens.append(tokens)
        return text_tokens

    model = HdpModel.load(args["topic_model"])
    corpus_dict = model.id2word
    topics = model.show_topics(num_topics=args["num_topics"],
                               num_words=args["num_terms"],
                               log=False, formatted=False)

    topics_to_save = []
    for topic in topics:
        topic_dict = {}
        topic_terms = ", ".join([t[0] for t in topic[1]])
        topic_dict["topic_id"] = int(topic[0])
        topic_dict["terms"] = topic_terms
        topics_to_save.append(topic_dict)

    database = args["database"]
    dataset_db = f"sqlite:///{database}"
    with dataset.connect(dataset_db, engine_kwargs=engine_kwargs) as db:
        db.create_table("corpus_topics")
        topic_ids = db["corpus_topics"].insert_many(topics_to_save)
        print(topic_ids)
        print(topics_to_save)

        batch = []
        for sentence in db['sentence']:
            batch.append(sentence)
            if len(batch) == args["batch_size"]:
                insert_corpus_sentence_links(batch, corpus_dict, db, model, tokenize)
                batch = []
        if len(batch) > 0:
            insert_corpus_sentence_links(batch, corpus_dict, db, model, tokenize)

        db["corpus_topics_sentences"].create_index(['sentence_id'])
        db["corpus_topics_sentences"].create_index(['topic_id'])
def model_pcs(self, model_name, LDA_topic_num=None):
    # HDP result format: [(0, 0.032271167132309014), (1, 0.02362695056720504)]
    if self.mashup_only:
        if self.strict_train:
            train_corpus = self.train_mashup_dow
        else:
            train_corpus = self.mashup_dow
    else:
        if self.strict_train:
            train_corpus = self.train_mashup_dow + self.train_api_dow
        else:
            train_corpus = self.mashup_dow + self.api_dow

    if model_name == 'HDP':
        self.model = HdpModel(train_corpus, self.dct)
        self.num_topics = self.model.get_topics().shape[0]
        print('num_topics', self.num_topics)
    elif model_name == 'TF_IDF':
        self.model = TfidfModel(train_corpus)
        self.num_topics = len(self.dct)
    elif model_name == 'LDA':
        if LDA_topic_num is None:
            self.model = LdaModel(train_corpus)
        else:
            self.model = LdaModel(train_corpus, num_topics=LDA_topic_num)
        self.num_topics = self.model.get_topics().shape[0]
    else:
        raise ValueError('wrong gensim_model name!')

    # Run the texts through the model, then convert the results to a regular
    # numpy format (one value per topic).
    # print(self.mashup_dow)
    self.mashup_features = [self.model[mashup_info] for mashup_info in self.mashup_dow]  # features of every mashup and api
    # print(self.mashup_features)
    print('self.mashup_features, num:', len(self.mashup_features))
    zero_num1 = sum([1 if len(mashup_feature) == 0 else 0 for mashup_feature in self.mashup_features])
    print('zero_num1', zero_num1)
    for i in range(len(self.mashup_features)):
        if len(self.mashup_features[i]) == 0:
            print(self.mashup_dow[i])
    self.api_features = [self.model[api_info] for api_info in self.api_dow]
    # print('when model-pcs,len of mashup_features and api_features:{},{}'.format(len(mashup_features),len(api_features)))

    self._mashup_features = np.zeros((meta_data.mashup_num, self.num_topics))
    self._api_features = np.zeros((meta_data.api_num, self.num_topics))
    for i in range(meta_data.mashup_num):
        # Only some dimensions carry a value, so densify into a regular array.
        for index, value in self.mashup_features[i]:
            self._mashup_features[i][index] = value
    for i in range(meta_data.api_num):
        for index, value in self.api_features[i]:
            self._api_features[i][index] = value
    return self._mashup_features, self._api_features
def set_model(self, lang: str, data_version: int, dictionary_version: float,
              model_version: str, param_name: str, param_version: int,
              model_file_path: str, language_processed_data: list):
    logging.info("---- Creating HDP model")
    # Make an index-to-word dictionary: accessing one id forces gensim to
    # populate dictionary.id2token, which is built lazily.
    temp = self.essentials.dictionary[0]
    model = HdpModel(corpus=self.essentials.corpus,
                     id2word=self.essentials.dictionary.id2token)
    # , alpha="symmetric", eta=self.beta, chunksize=self.chunk_size)
    model.save(model_file_path)
    self.model = model
    logging.info("---- HDP model is created")
    metrics = self.get_model_evaluation_metrics(language_processed_data)
    parameters = self.get_model_parameters()
    self.write_model_evaluation_metrics(lang, data_version, dictionary_version,
                                        model_version, param_name, param_version,
                                        metrics, parameters)
    return
def try_news_cluster():
    docs = feed_doc()
    df_threshold_lower = 50
    df_threshold_upper = 500

    dictionary = corpora.Dictionary(doc for doc in docs)
    print('dictionary ready')

    # Drop tokens whose document frequency falls outside [lower, upper].
    low_df = [tokenid for tokenid, docfreq in dictionary.dfs.items()
              if docfreq <= df_threshold_lower]
    high_df = [tokenid for tokenid, docfreq in dictionary.dfs.items()
               if docfreq > df_threshold_upper]
    dictionary.filter_tokens(low_df + high_df)
    dictionary.compactify()

    corpus = [dictionary.doc2bow(doc) for doc in feed_doc()]
    print('corpus ready')

    hdp = HdpModel(corpus, dictionary)
    for topic in hdp.print_topics(num_topics=50, num_words=20):
        print(topic)
def stream_topic_model(self, topic: Topic, dictionary: corpora.Dictionary = None,
                       corpus: IndexedCorpus = None, num_topics=20,
                       max_topics_per_doc=5):
    # load dictionary and corpus, if necessary
    if not dictionary:
        dictionary = self.load_dictionary()
        logger.warning(
            "the default dictionary was loaded from file. "
            "You should keep an instance in memory instead of calling this in a loop...")
    if not corpus:
        corpus = JsonLinesCorpus(self.file_corpus)
        logger.warning(
            "the default corpus was loaded from file. You should provide a "
            "reduced corpus to increase performance (see corpus2corpus)")

    # build the model
    logger.info(
        "building a topic model with {} topics for {} documents in topic '{}'"
        .format(num_topics, len(corpus), topic.topic_id))
    t0 = time.time()
    if self.model == "lda":
        model = LdaMulticore(corpus,
                             id2word=dictionary.id2token,
                             num_topics=num_topics,
                             passes=2,
                             iterations=50,
                             chunksize=2000,
                             workers=self.n_threads)
    elif self.model == "hdp":
        # T = overall topic limit, K = max topics per document
        model = HdpModel(corpus,
                         id2word=dictionary.id2token,
                         T=num_topics,
                         K=max_topics_per_doc)
    else:
        raise ValueError("Unknown model identifier '{}'".format(self.model))
    t1 = time.time()

    # serialize
    logger.info("building the model took {:.1f} s. Serializing model...".format(t1 - t0))
    output_path = self._get_model_path(topic)
    with util.open_by_ext(output_path, 'wb') as fp:
        pickle.dump(model, fp, protocol=4)
    logger.info("model dump finished, took {:.1f} s".format(time.time() - t1))
def hierarchical_dirichlet_process_topic_extraction():
    """
    Function performs topic extraction on Tweets using the Gensim HDP model.

    :return: None.
    """
    from gensim.models import HdpModel

    # LDA can only use raw term counts because it is a probabilistic graphical model.
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                    max_features=1000, stop_words='english')
    tf = tf_vectorizer.fit_transform(slo_feature_series)
    tf_feature_names = tf_vectorizer.get_feature_names()

    log.info("\n.fit_transform - Learn the vocabulary dictionary and return term-document matrix.")
    log.info(f"{tf}\n")
    log.info("\n.get_feature_names - Array mapping from feature integer indices to feature name")
    log.info(f"{tf_feature_names}\n")

    # Train the HDP model.
    hdp = HdpModel(corpus, dictionary)
    time.sleep(3)

    # # For use as a wrapper with the Scikit-Learn API.
    # model = HdpTransformer(id2word=dictionary)
    # distribution = model.fit_transform(corpus)

    # Display the top words for each topic.
    topic_info = hdp.print_topics(num_topics=20, num_words=10)
    for topic in topic_info:
        print(topic)
def get_num_topics(self):
    self.rev_train['title'] = self.strip_newline(self.rev_train.title)
    self.rev_test['title'] = self.strip_newline(self.rev_test.title)
    # rev_train.text[21:22].values
    words_tr = list(self.sent_to_words(self.rev_train.title))
    words_te = list(self.sent_to_words(self.rev_test.title))
    words_tr = self.remove_stopwords(words_tr)

    bigram_tr, trigram_tr = self.bigrams(words_tr)
    trigrams_tr = [trigram_tr[bigram_tr[review]] for review in words_tr]
    lemma_lg = self.lemmatization(trigrams_tr)
    with open(os.path.join('.', 'data', 'lemma_lg.pkl'), 'wb') as f:
        pickle.dump(lemma_lg, f)

    id2word_lg = gensim.corpora.Dictionary(lemma_lg)
    id2word_lg.filter_extremes(no_below=2, no_above=0.6)
    id2word_lg.compactify()
    id2word_lg.save(os.path.join('.', 'data', 'train_dict_lg'))
    corpus_lg = [id2word_lg.doc2bow(text) for text in lemma_lg]
    with open(os.path.join('.', 'data', 'corpus_lg.pkl'), 'wb') as f:
        pickle.dump(corpus_lg, f)

    hdp = HdpModel(corpus_lg, id2word_lg, chunksize=100)
    # Note: print_topics() defaults to 20 topics, so this effectively caps
    # n_topics at 20; hdp.get_topics().shape[0] would count all inferred topics.
    n_topics = len(hdp.print_topics())
    hdptopics = hdp.print_topics(num_topics=n_topics)
    for tp in hdptopics:
        print(tp)
    return n_topics
# (fragment: the opening of this LdaModel(...) call and its enclosing if-branch are not shown)
                   id2word=dictionary,
                   update_every=5,
                   chunksize=10000,
                   passes=100)
    lda.save('/tmp/model.lda')
else:
    lda = LdaModel.load('/tmp/model.lda')

lda.show_topics()
topics_matrix = lda.show_topics(formatted=False, num_words=7)
print(topics_matrix)
print(len(topics_matrix))
for topic in topics_matrix:
    i = topic[1]
    print([str(word) for word in i])

# # topics_matrix = np.array(topics_matrix)
# # topic_words = topics_matrix[:, :, 1]
# for i in topic_words:
#     print([str(word) for word in i])

# Yet another model for categorizing documents: Hierarchical Dirichlet Process.
print("HDP")
model = HdpModel(corpus, id2word=dictionary)
model.show_topics(log=True, topics=5)
# see https://radimrehurek.com/gensim/tut2.html
__author__ = 'rbshaffer'

from gensim.models import HdpModel
from gensim.corpora import BleiCorpus
from gensim.corpora import Dictionary

corpus = BleiCorpus(
    fname='/home/rbshaffer/PycharmProjects/Constitution_Similarity/const_corpus_07242015.lda-c',
    fname_vocab='/home/rbshaffer/PycharmProjects/Constitution_Similarity/const_corpus_07242015.lda-c.vocab')
dictionary = Dictionary.load('/home/rbshaffer/PycharmProjects/Constitution_Similarity/const_dic_07242015.lda-c.dic')

hdp_model = HdpModel(corpus=corpus, id2word=dictionary, max_time=28800)
hdp_model.save('/home/rbshaffer/Desktop/hdp_output_0726015.pydata')