def topic_model(self): """ Calculate the topic model for all the units, the probability that the comment has each of the topics :return: pandas DF[number_of_units, number_of_topics] - the probability for each comment and topic """ print('{}: Start topic model'.format( (time.asctime(time.localtime(time.time()))))) # Clean the data print('{}: Clean the data'.format( (time.asctime(time.localtime(time.time()))))) data_clean = { row['submission_id']: clean(row['submission_body']).split() for index, row in self.embedded_submission_text.iterrows() } # Creating the term dictionary of our corpus, where every unique term is assigned an index. print('{}: Create the dictionary'.format( time.asctime(time.localtime(time.time())))) dictionary = gensim.corpora.Dictionary(data_clean.values()) # Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above. print('{}: Create data term matrix'.format( time.asctime(time.localtime(time.time())))) data_term_matrix = { index: dictionary.doc2bow(doc) for index, doc in data_clean.items() } # Get topics for the data print('{}: Predict topics'.format( time.asctime(time.localtime(time.time())))) lda_model = ldamodel.LdaTransformer(num_topics=self.number_of_topics, id2word=dictionary, passes=50, minimum_probability=0) result = lda_model.transform(list(data_term_matrix.values())) print('{}: Create final topic model'.format( time.asctime(time.localtime(time.time())))) comment_ids_df = pd.DataFrame(list(data_term_matrix.keys()), columns=['submission_id']) result_columns = [i for i in range(self.number_of_topics)] topic_model_result_df = pd.DataFrame(result, columns=result_columns) print('{}: Save final topic model'.format( time.asctime(time.localtime(time.time())))) topic_model_final_result = pd.concat( [comment_ids_df, topic_model_result_df], axis=1) print('{}: Finish topic model'.format( (time.asctime(time.localtime(time.time()))))) return topic_model_final_result
from sklearn.pipeline import Pipeline
from gensim.sklearn_api import ldamodel, lsimodel


def __init__(self, n_topics=50, estimator='LDA'):
    """
    n_topics is the desired number of topics.
    To use Latent Semantic Analysis, set estimator to 'LSA';
    otherwise this defaults to Latent Dirichlet Allocation.
    """
    self.n_topics = n_topics

    if estimator == 'LSA':
        self.estimator = lsimodel.LsiTransformer(num_topics=self.n_topics)
    else:
        self.estimator = ldamodel.LdaTransformer(num_topics=self.n_topics)

    # TextNormalizer and GensimTfidfVectorizer are project-local transformers
    # assumed to be defined or imported elsewhere in this module.
    self.model = Pipeline([('norm', TextNormalizer()),
                           ('vect', GensimTfidfVectorizer()),
                           ('model', self.estimator)])
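# A minimal usage sketch, assuming the __init__ above belongs to a topic-model
# wrapper class (named TopicModels here purely for illustration) and that
# `docs` is a corpus in the format TextNormalizer expects.
def demo_lsa(docs):
    lsa = TopicModels(n_topics=20, estimator='LSA')
    # Pipeline.fit_transform normalizes, vectorizes, and projects the corpus.
    docvecs = lsa.model.fit_transform(docs)  # shape: (n_docs, n_topics)
    return docvecs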
from sklearn.pipeline import Pipeline
from gensim.sklearn_api import ldamodel


def __init__(self, n_topics=100, update_every=0, passes=1, alpha="auto",
             scorer="perplexity", include_bigram=False, bigram_path="",
             stopwords=None):
    # Use None instead of a mutable default argument for stopwords.
    stopwords = stopwords if stopwords is not None else []
    self.n_topics = n_topics
    # TextNormalizer and GensimOneHotVectorizer are project-local transformers
    # assumed to be defined or imported elsewhere in this module.
    self.model = Pipeline([
        ('norm', TextNormalizer(include_bigram=include_bigram,
                                bigram_path=bigram_path,
                                stopwords=stopwords)),
        ('vect', GensimOneHotVectorizer()),
        ('model', ldamodel.LdaTransformer(num_topics=self.n_topics,
                                          update_every=update_every,
                                          passes=passes,
                                          alpha=alpha,
                                          scorer=scorer))
    ])
    self.search = None
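# A sketch of how self.search might later be populated, assuming the intent is
# to tune num_topics with sklearn's GridSearchCV (LdaTransformer's score()
# uses the `scorer` passed above). The method name, parameter grid, and cv
# value are illustrative assumptions, not part of the original code.
from sklearn.model_selection import GridSearchCV

def tune(self, documents):
    self.search = GridSearchCV(
        self.model,
        param_grid={'model__num_topics': [50, 100, 200]},
        cv=3,
    )
    self.search.fit(documents)
    return self.search.best_params_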
import os
import time

import pandas as pd
from gensim import corpora
from gensim.sklearn_api import ldamodel


def topic_model(self):
    # Clean the data
    print('{}: Clean the data'.format(time.asctime()))
    units_clean = {
        row['comment_id']: self.clean(row['comment_body']).split()
        for index, row in self.units.iterrows()
    }
    all_data_clean = {
        row['comment_id']: self.clean(row['comment_body']).split()
        for index, row in self.all_data.iterrows()
    }

    # Create the term dictionary of the corpus, where every unique term is
    # assigned an index.
    print('{}: Create the dictionary'.format(time.asctime()))
    dictionary = corpora.Dictionary(all_data_clean.values())

    # Convert the lists of documents into document-term matrices using the
    # dictionary prepared above.
    print('{}: Create units term matrix'.format(time.asctime()))
    units_term_matrix = {
        index: dictionary.doc2bow(doc)
        for index, doc in units_clean.items()
    }
    print('{}: Create all data term matrix'.format(time.asctime()))
    all_data_term_matrix = {
        index: dictionary.doc2bow(doc)
        for index, doc in all_data_clean.items()
    }

    # Create the LDA model
    print('{}: Create model'.format(time.asctime()))
    model = ldamodel.LdaTransformer(num_topics=self.number_of_topics,
                                    id2word=dictionary, passes=50,
                                    minimum_probability=0)

    # Train the LDA model on the full document-term matrix.
    print('{}: Fit the model on all data'.format(time.asctime()))
    model = model.fit(list(all_data_term_matrix.values()))

    # Get the topic distribution for the units only.
    print('{}: Predict topics for units'.format(time.asctime()))
    result = model.transform(list(units_term_matrix.values()))

    print('{}: Create final topic model data'.format(time.asctime()))
    comment_ids_df = pd.DataFrame(list(units_term_matrix.keys()),
                                  columns=['comment_id'])
    result_columns = [
        'topic_model_' + str(i) for i in range(self.number_of_topics)
    ]
    topic_model_result_df = pd.DataFrame(result, columns=result_columns)

    print('{}: Save final topic model data'.format(time.asctime()))
    topic_model_final_result = pd.concat(
        [comment_ids_df, topic_model_result_df], axis=1)
    # data_directory is assumed to be defined at module level.
    topic_model_final_result.to_csv(
        os.path.join(data_directory, 'topic_model_CMV.csv'))
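# A minimal sketch (assumed to run right after model.fit above) showing how to
# inspect the learned topics: LdaTransformer keeps the trained gensim LdaModel
# in its `gensim_model` attribute. The helper name is an illustrative choice.
def print_top_words(model, num_topics=10, num_words=10):
    for topic_id, words in model.gensim_model.print_topics(
            num_topics=num_topics, num_words=num_words):
        print('topic {}: {}'.format(topic_id, words))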
# Assumed module-level imports for this snippet:
# from sklearn.pipeline import Pipeline
# from gensim.matutils import sparse2full
# from gensim.sklearn_api import ldamodel

    # ... tail of fit(): persist the fitted vectorizer and return self so it
    # can be chained in a sklearn Pipeline.
    self.save()
    return self

def transform(self, documents):
    def generator():
        for document in documents:
            vec = self.tfidf[self.lexicon.doc2bow(document)]
            if self.tofull:
                # sparse2full needs the target vector length, which is the
                # size of the lexicon.
                yield sparse2full(vec, len(self.lexicon))
            else:
                yield vec
    return list(generator())


if __name__ == '__main__':
    from reader import PickledCorpusReader

    corpus = PickledCorpusReader('../corpus')
    docs = [
        list(corpus.docs(fileids=fileid))[0] for fileid in corpus.fileids()
    ]

    model = Pipeline([('norm', TextNormalizer()),
                      ('vect', GensimTfidfVectorizer()),
                      ('lda', ldamodel.LdaTransformer())])
    model.fit_transform(docs)
    print(model.named_steps['norm'])
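# A self-contained sketch (not part of the original pipeline) of what the
# transform() above computes per document: a TF-IDF vector, densified with
# sparse2full to the size of the lexicon. The toy corpus is illustrative.
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.matutils import sparse2full

toy_docs = [['red', 'apple'], ['green', 'apple'], ['red', 'wine']]
lexicon = Dictionary(toy_docs)
tfidf = TfidfModel([lexicon.doc2bow(d) for d in toy_docs], id2word=lexicon)
vec = tfidf[lexicon.doc2bow(['red', 'apple'])]
print(sparse2full(vec, len(lexicon)))  # dense vector of length len(lexicon)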