def train_lda():
    """Train the LDA model.
    generate_dictionary() must be called before this method.
    """
    print("------------------")
    print("Training LDA model")
    print("------------------")

    # load dictionary, as generated by generate_dictionary()
    print("Loading dictionary...")
    dictionary = gensim.corpora.dictionary.Dictionary.load(cfg.LDA_DICTIONARY_FILEPATH)

    # generate a mapping from word id to word
    print("Generating id2word...")
    id2word = {}
    for word in dictionary.token2id:
        id2word[dictionary.token2id[word]] = word

    # initialize LDA
    print("Initializing LDA...")
    lda_model = LdaMulticore(corpus=None, num_topics=cfg.LDA_COUNT_TOPICS, id2word=id2word,
                             workers=LDA_COUNT_WORKERS, chunksize=LDA_CHUNK_SIZE)

    # train the LDA model
    print("Training...")
    examples = []
    update_every_n_windows = 25000
    windows = load_windows(load_articles(cfg.ARTICLES_FILEPATH), cfg.LDA_WINDOW_SIZE,
                           only_labeled_windows=True)
    for i, window in enumerate(windows):
        tokens_str = [token.word.lower() for token in window.tokens]
        bow = dictionary.doc2bow(tokens_str)  # each window as bag of words
        examples.append(bow)
        if len(examples) >= update_every_n_windows:
            print("Updating (at window %d of max %d)..." % (i, COUNT_EXAMPLES_FOR_LDA))
            # this is where the LDA model is trained
            lda_model.update(examples)
            examples = []
        if i >= COUNT_EXAMPLES_FOR_LDA:
            print("Reached max of %d windows." % (COUNT_EXAMPLES_FOR_LDA,))
            break

    # The remaining windows are deliberately not used for a final update, because it is
    # unclear whether each update step's results are skewed by the number of examples.
    #if len(examples) > 0:
    #    print("Updating with remaining windows...")
    #    lda_model.update(examples)

    # save trained model to HDD
    print("Saving...")
    lda_model.save(cfg.LDA_MODEL_FILEPATH)
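To make the intended call order concrete, here is a minimal sketch of running the training step and then inspecting the saved model. It assumes generate_dictionary() has already been run and that the cfg.* constants come from the project's own config module; LdaMulticore.load() and show_topics() are standard gensim calls.

# Sketch only: train, then reload the persisted model and print a few topics.
# Assumes generate_dictionary() has already written cfg.LDA_DICTIONARY_FILEPATH.
from gensim.models import LdaMulticore

train_lda()

lda_model = LdaMulticore.load(cfg.LDA_MODEL_FILEPATH)
for topic in lda_model.show_topics(num_topics=5, num_words=10):
    print(topic)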
import numpy as np
import scipy.sparse as sp
from sklearn.base import BaseEstimator, TransformerMixin
from gensim.models import LdaMulticore
from gensim.matutils import Sparse2Corpus


class Lda(BaseEstimator, TransformerMixin):
    def __init__(self, id2word=None, num_topics=25, passes=1):
        self.lda = None
        self.id2word = id2word
        self.num_topics = num_topics
        self.passes = passes

    def fit(self, X, y=None):
        """
        Parameters
        ----------
        X : [sp.csr_matrix]

        Returns
        -------
        self
        """
        if self.lda is None:
            self.lda = LdaMulticore(id2word=self.id2word,
                                    num_topics=self.num_topics,
                                    passes=self.passes)
        # stack the per-document matrices and train on the combined corpus
        X_flat = sp.vstack(X)
        self.lda.update(Sparse2Corpus(X_flat, documents_columns=False))
        return self

    def fit_transform(self, X, y=None):
        self.fit(X)
        return self.transform(X)

    def transform(self, X):
        """
        Parameters
        ----------
        X : [sp.csr_matrix]

        Returns
        -------
        topic_vectors : [np.ndarray]
            each matrix is of shape (sent_count, topic_count)
        """
        topic_vectors = []
        for doc in X:
            sents_bow = Sparse2Corpus(doc, documents_columns=False)
            gamma, _ = self.lda.inference(sents_bow)
            # divide each row by its row sum to get normalized topic distributions
            topic_dist = (gamma.T / np.sum(gamma, axis=1)).T
            topic_vectors.append(topic_dist)
        return topic_vectors
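For context, the transformer above fits a scikit-learn-style workflow. The sketch below is illustrative only: it assumes each document is given as a list of sentences and that the per-sentence bag-of-words matrices in X are produced with scikit-learn's CountVectorizer; the document texts and num_topics value are placeholders.

# Illustrative usage of the Lda transformer, assuming CountVectorizer builds the
# per-sentence bag-of-words matrices (one csr_matrix per document, one row per sentence).
from sklearn.feature_extraction.text import CountVectorizer

docs = [
    ["the cat sat on the mat", "the dog barked at the cat"],
    ["stocks fell sharply today", "the market closed lower"],
]

# one shared vocabulary across all sentences of all documents
vectorizer = CountVectorizer()
vectorizer.fit([sent for doc in docs for sent in doc])

# one csr_matrix per document, one row per sentence
X = [vectorizer.transform(doc) for doc in docs]

# id2word maps column indices back to words, as gensim expects
id2word = {idx: word for word, idx in vectorizer.vocabulary_.items()}

lda = Lda(id2word=id2word, num_topics=10)
topic_vectors = lda.fit_transform(X)
print(topic_vectors[0].shape)  # (sentence count of first document, 10)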