class MyModel:
    """Thin wrapper around gensim corpus transformations (TF-IDF, LSI, LDA).

    Holds a dictionary and a corpus, plus lazily-built models and their
    transformed corpora.  All model attributes start as ``None`` and are
    populated by the corresponding build method (``tf_idf``, ``lsi``, ``lda``).
    """

    def __init__(self, dict_file=None, corpus_model=None, corpus_file=None):
        """Load an optional dictionary and corpus.

        :param dict_file: path to a serialized ``corpora.Dictionary``
        :param corpus_model: an already-loaded corpus object (takes priority)
        :param corpus_file: path to a Matrix-Market corpus file
        """
        self.dict_file = dict_file
        self.dictionary = None
        self.corpus = None
        if dict_file is not None:
            self.dictionary = corpora.Dictionary.load(dict_file)
        if corpus_model:
            # BUG FIX: the original assigned ``self.corpus_model`` — an
            # attribute that is never set — instead of the parameter,
            # which raised AttributeError whenever a corpus was passed in.
            self.corpus = corpus_model
        elif corpus_file:
            self.corpus = corpora.MmCorpus(corpus_file)
        self.tf_idf_model = None
        self.corpus_tf_idf = None
        self.lsi_model = None
        self.corpus_lsi = None
        self.lda_model = None
        self.corpus_lda = None

    def tf_idf(self):
        """Build a TF-IDF model from ``self.corpus`` and transform the corpus."""
        self.tf_idf_model = models.TfidfModel(corpus=self.corpus,
                                              normalize=True)
        self.corpus_tf_idf = self.tf_idf_model[self.corpus]

    def lsi(self):
        """Build a 2-topic LSI model on top of the TF-IDF corpus.

        When a dictionary is also loaded, print one of the topics for
        inspection (as the original did).
        """
        self.tf_idf()
        # The two original branches built the identical model; only the
        # debug print depended on ``self.dictionary``.  Deduplicated.
        if self.corpus_tf_idf:
            self.lsi_model = LsiModel(self.corpus_tf_idf, num_topics=2)
            self.corpus_lsi = self.lsi_model[self.corpus_tf_idf]
            if self.dictionary:
                # BUG FIX: topic ids are 0-based, so with num_topics=2 the
                # valid ids are 0 and 1 — the original asked for topic 2,
                # which raises IndexError.  Also converted the Python-2
                # print statement to a print() call.
                print(self.lsi_model.print_topic(1))

    def lda(self):
        """Build an LDA model from ``self.corpus`` and transform the corpus."""
        # BUG FIX: the original instantiated models.LsiModel here, so the
        # "lda" attributes actually held an LSI model.
        self.lda_model = models.LdaModel(corpus=self.corpus)
        self.corpus_lda = self.lda_model[self.corpus]

    def add_document_lsi(self, addition_corpus_tf_idf, addition_vector_tf_idf):
        """Fold new TF-IDF documents into the LSI model.

        :param addition_corpus_tf_idf: TF-IDF corpus of new documents to add
        :param addition_vector_tf_idf: TF-IDF vector(s) to project
        :return: LSI projection of ``addition_vector_tf_idf``
        """
        self.lsi_model.add_documents(addition_corpus_tf_idf)
        lsi_vector = self.lsi_model[addition_vector_tf_idf]
        return lsi_vector

    def save_lsi(self, name='/serialise/model.lsi'):
        """Serialize the LSI model to ``name``."""
        self.lsi_model.save(name)

    def save_lda(self, name='/serialise/model.lda'):
        """Serialize the LDA model to ``name``."""
        self.lda_model.save(name)

    @staticmethod
    def load_lsi(name='/tmp/model.lsi'):
        """Return a fresh ``MyModel`` whose LSI model is loaded from ``name``."""
        my_model = MyModel()
        my_model.lsi_model = models.LsiModel.load(name)
        return my_model
def get_topic_labels(corpus_path, n_topics, n_top_words,
                     preprocessing_steps,
                     n_cand_labels, label_min_df,
                     label_tags, n_labels,
                     lda_random_state, lda_n_iter):
    """Run the topic-labeling pipeline and return the top-k labels per topic.

    Refer the arguments to `create_parser`.

    Pipeline: load corpus -> optional preprocessing (word-length filter,
    stemming, POS tagging) -> bigram candidate-label finding (PMI) ->
    PMI word/label matrix -> LDA topic model -> label ranking.

    :return: value of ``LabelRanker.top_k_labels`` — the ``n_labels`` best
             candidate labels for each of the ``n_topics`` topics
    """
    print("Loading docs...")
    docs = load_line_corpus(corpus_path)

    # BUG FIX: the original contained a leftover gensim debugging block here
    # (building throwaway LdaModel/LsiModel instances, printing a topic, then
    # calling exit()).  The bare exit() terminated the interpreter, making
    # everything below — the entire actual pipeline — unreachable.  The dead
    # block has been removed.

    if 'wordlen' in preprocessing_steps:
        print("Word length filtering...")
        wl_filter = CorpusWordLengthFilter(minlen=3)
        docs = wl_filter.transform(docs)

    if 'stem' in preprocessing_steps:
        print("Stemming...")
        stemmer = CorpusStemmer()
        docs = stemmer.transform(docs)

    if 'tag' in preprocessing_steps:
        print("POS tagging...")
        tagger = CorpusPOSTagger()
        tagged_docs = tagger.transform(docs)

    # Parse "NN,NN"-style strings into tag tuples; ['None'] means no filter.
    tag_constraints = []
    if label_tags != ['None']:
        for tags in label_tags:
            tag_constraints.append(tuple(map(lambda t: t.strip(),
                                             tags.split(','))))
    if len(tag_constraints) == 0:
        tag_constraints = None

    print("Tag constraints: {}".format(tag_constraints))

    print("Generate candidate bigram labels(with POS filtering)...")
    finder = BigramLabelFinder('pmi', min_freq=label_min_df,
                               pos=tag_constraints)
    if tag_constraints:
        # POS-constrained finding needs the tagged corpus from the 'tag' step.
        assert 'tag' in preprocessing_steps, \
            'If tag constraint is applied, pos tagging(tag) should be performed'
        cand_labels = finder.find(tagged_docs, top_n=n_cand_labels)
    else:  # if no constraint, then use untagged docs
        cand_labels = finder.find(docs, top_n=n_cand_labels)
    print("Collected {} candidate labels".format(len(cand_labels)))

    print("Calculate the PMI scores...")
    pmi_cal = PMICalculator(
        doc2word_vectorizer=WordCountVectorizer(
            min_df=5,
            stop_words=load_lemur_stopwords()),
        doc2label_vectorizer=LabelCountVectorizer())
    pmi_w2l = pmi_cal.from_texts(docs, cand_labels)

    print("Topic modeling using LDA...")
    model = lda.LDA(n_topics=n_topics, n_iter=lda_n_iter,
                    random_state=lda_random_state)
    model.fit(pmi_cal.d2w_)

    print("\nTopical words:")
    print("-" * 20)
    for i, topic_dist in enumerate(model.topic_word_):
        # Indices of the n_top_words highest-probability words, descending.
        top_word_ids = np.argsort(topic_dist)[:-n_top_words:-1]
        topic_words = [pmi_cal.index2word_[id_] for id_ in top_word_ids]
        print('Topic {}: {}'.format(i, ' '.join(topic_words)))

    ranker = LabelRanker(apply_intra_topic_coverage=False)

    return ranker.top_k_labels(topic_models=model.topic_word_,
                               pmi_w2l=pmi_w2l,
                               index2label=pmi_cal.index2label_,
                               label_models=None,
                               k=n_labels)