import inspect

from gensim.models import Nmf
from gensim.models.phrases import Phrases, Phraser

import blacklab
import constants
import gensim_utils


def main(query, output_filename, window=50, topicn=50):
    # NOTE: output_filename is accepted but not yet used; topics are printed
    # to stdout.
    print('Training NMF model began')

    # Record the parameters this call was made with (kept for logging).
    frame = inspect.currentframe()
    args, _, _, values = inspect.getargvalues(frame)
    query_parameters = [(arg, values[arg]) for arg in args]

    # Fetch matches from BlackLab; each match carries the lemmatized text of
    # the match plus its context window under 'complete_match'.
    document_collection_original = blacklab.search_blacklab(
        query, window=window, lemma=True, include_match=False)
    print('Search finished')

    # Apply the trained phraser model so that collocations are merged into
    # single tokens before the dictionary is built.
    phraser_model = Phraser(Phrases.load(constants.OUTPUT_FOLDER + 'phrase_model'))
    document_collection = [
        ' '.join(phraser_model[match['complete_match'].strip().split()])
        for match in document_collection_original
    ]
    print('Phraser model done')

    # Stop-word filtering is currently disabled. If re-enabled, start from an
    # empty list (the aliasing below would otherwise append to
    # document_collection as well) and import nltk.corpus.stopwords and string.
    document_collection_filtered = document_collection
    # document_collection_filtered = []
    # for text in document_collection:
    #     new_text = [word for word in text.split()
    #                 if word not in set(stopwords.words('english'))
    #                 and word[0] in string.ascii_letters]
    #     document_collection_filtered.append(' '.join(new_text))
    print('Filtering done')

    # Build the gensim dictionary incrementally over the corpus.
    dct = None
    for text in document_collection_filtered:
        tokens = text.split()
        if dct is None:
            dct = gensim_utils.initialize_gensim_dictionary([tokens])
        else:
            gensim_utils.add_documents_to_gensim_dictionary(dct, [tokens])

    # Drop very rare and very common terms, then vectorize.
    dct.filter_extremes(no_below=10, no_above=0.95)
    gensim_corpus = [dct.doc2bow(text.split()) for text in document_collection_filtered]

    # Train the NMF model; passing id2word makes gensim report topics as words
    # rather than token ids, so no manual id-to-word lookup is needed.
    nmf = Nmf(gensim_corpus, id2word=dct, num_topics=topicn)
    for _, topic in nmf.print_topics(topicn):
        print(' '.join(term.split('*')[1].strip().strip('"')
                       for term in topic.split('+')))

    # Topic distribution of a single document, e.g.:
    # nmf.get_document_topics(gensim_corpus[0])
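# Example invocation (a sketch: the CQL query and output path below are
# hypothetical, and assume a reachable BlackLab instance plus a phrase model
# previously saved to constants.OUTPUT_FOLDER + 'phrase_model'):
#
#     main('[lemma="liberty"]', 'results/nmf_topics.txt', window=50, topicn=50)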
from gensim.models import CoherenceModel, LdaMulticore, Nmf

# Corpus is the project's corpus wrapper (it must provide getDictionary()
# and getTexts()); the module path below is assumed.
from corpus import Corpus


class TopicModel(object):

    def __init__(self):
        self.__corpus = None
        self.__modelName = None
        self.__model = None
        self.__modelFile = 'results/model.bin'
        self.__coherenceModel = None

    def setCorpus(self, corpus):
        self.__corpus = corpus

    def getCoherence(self):
        return self.__coherenceModel.get_coherence()

    def getDocumentTopics(self, document, threshold=None):
        return self.__model.get_document_topics(document,
                                                minimum_probability=threshold)

    def build(self, model_name, num_topics, chunksize, passes, corpus=None):
        self.__modelName = model_name
        # Update the corpus if a new one was passed in
        if isinstance(corpus, Corpus):
            self.__corpus = corpus
        # Build the topic model
        if model_name == 'lda':
            self.__buildLDA(num_topics, chunksize, passes)
        elif model_name == 'nmf':
            self.__buildNMF(num_topics, chunksize, passes)
        # Build the coherence model
        self.__buildCoherenceModel()

    def __buildLDA(self, num_topics, chunksize, passes):
        self.__model = LdaMulticore(self.__corpus,
                                    id2word=self.__corpus.getDictionary(),
                                    num_topics=num_topics,
                                    chunksize=chunksize,
                                    passes=passes,
                                    eval_every=None,
                                    workers=40,
                                    random_state=10)

    def __buildNMF(self, num_topics, chunksize, passes):
        self.__model = Nmf(self.__corpus,
                           id2word=self.__corpus.getDictionary(),
                           num_topics=num_topics,
                           chunksize=chunksize,
                           passes=passes,
                           eval_every=None,
                           random_state=10)

    def __buildCoherenceModel(self):
        self.__coherenceModel = CoherenceModel(model=self.__model,
                                               texts=self.__corpus.getTexts(),
                                               coherence='c_v',
                                               processes=7)

    def __printTopics(self):
        print(' Topics')
        for idx, topic in self.__model.print_topics(-1):
            print(' {}: {}'.format(idx, topic))

    def save(self):
        self.__model.save(self.__modelFile)

    def load(self, model_name):
        self.__modelName = model_name
        if model_name == 'lda':
            self.__model = LdaMulticore.load(self.__modelFile)
        elif model_name == 'nmf':
            self.__model = Nmf.load(self.__modelFile)
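# Example usage (a sketch: how a Corpus instance is constructed is
# project-specific and not shown here):
#
#     corpus = Corpus(...)  # hypothetical constructor
#     model = TopicModel()
#     model.build('nmf', num_topics=50, chunksize=2000, passes=10, corpus=corpus)
#     print('c_v coherence: {}'.format(model.getCoherence()))
#     model.save()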