Example 1
import gensim
from gensim.models import LdaMulticore

# Note: cfg (the project's config module), load_articles()/load_windows() and the
# constants LDA_COUNT_WORKERS, LDA_CHUNK_SIZE and COUNT_EXAMPLES_FOR_LDA are assumed
# to be defined elsewhere in the surrounding project.
def train_lda():
    """
    Train the LDA model.
    generate_dictionary() must be called before this method.
    """
    print("------------------")
    print("Training LDA model")
    print("------------------")

    # load dictionary, as generated by generate_dictionary()
    print("Loading dictionary...")
    dictionary = gensim.corpora.dictionary.Dictionary.load(cfg.LDA_DICTIONARY_FILEPATH)

    # generate a mapping from word id to word
    print("Generating id2word...")
    id2word = {}
    for word in dictionary.token2id:
        id2word[dictionary.token2id[word]] = word

    # initialize LDA
    print("Initializing LDA...")
    lda_model = LdaMulticore(corpus=None, num_topics=cfg.LDA_COUNT_TOPICS, id2word=id2word,
                             workers=LDA_COUNT_WORKERS, chunksize=LDA_CHUNK_SIZE)

    # Train the LDA model
    print("Training...")
    examples = []
    update_every_n_windows = 25000
    windows = load_windows(load_articles(cfg.ARTICLES_FILEPATH), cfg.LDA_WINDOW_SIZE,
                           only_labeled_windows=True)
    for i, window in enumerate(windows):
        tokens_str = [token.word.lower() for token in window.tokens]
        bow = dictionary.doc2bow(tokens_str) # each window as bag of words
        examples.append(bow)
        if len(examples) >= update_every_n_windows:
            print("Updating (at window %d of max %d)..." % (i, COUNT_EXAMPLES_FOR_LDA))
            # this is where the LDA model is trained
            lda_model.update(examples)
            examples = []
        if i >= COUNT_EXAMPLES_FOR_LDA:
            print("Reached max of %d windows." % (COUNT_EXAMPLES_FOR_LDA,))
            break

    # I don't update here with the remaining windows, because I'm not sure whether each
    # update step's results are heavily influenced/skewed by the number of examples.
    #if len(examples) > 0:
    #    print("Updating with remaining windows...")
    #    lda_model.update(examples)

    # save trained model to HDD
    print("Saving...")
    lda_model.save(cfg.LDA_MODEL_FILEPATH)
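The example above only trains and saves the model. As a minimal sketch (not part of the original code), the saved dictionary and model could be loaded back for inference roughly like this; topics_for_window is a hypothetical helper name, and the tokens are assumed to be plain strings rather than the token objects used above.

import gensim
from gensim.models import LdaMulticore

def topics_for_window(tokens, dictionary_filepath, model_filepath):
    """Hypothetical helper: topic distribution of a single tokenized window."""
    # load the artifacts written by generate_dictionary() and train_lda()
    dictionary = gensim.corpora.dictionary.Dictionary.load(dictionary_filepath)
    lda_model = LdaMulticore.load(model_filepath)
    # same bag-of-words representation as used during training
    bow = dictionary.doc2bow([token.lower() for token in tokens])
    # list of (topic_id, probability) pairs for this window
    return lda_model.get_document_topics(bow)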
Example 2
import numpy as np
import scipy.sparse as sp
from gensim.matutils import Sparse2Corpus
from gensim.models import LdaMulticore
from sklearn.base import BaseEstimator, TransformerMixin


class Lda(BaseEstimator, TransformerMixin):
    def __init__(self, id2word=None, num_topics=25, passes=1):
        self.lda = None
        self.id2word = id2word
        self.num_topics = num_topics
        self.passes = passes

    def fit(self, X, y=None):
        """
        Parameters
        ----------
        X : list of sp.csr_matrix
            One bag-of-words count matrix per document, with one row per sentence.

        Returns
        -------
        self
        """
        if self.lda is None:
            # create the model lazily on the first call to fit()
            self.lda = LdaMulticore(id2word=self.id2word,
                                    num_topics=self.num_topics,
                                    passes=self.passes)
        # stack the per-document matrices into one matrix with one row per sentence
        X_flat = sp.vstack(X)
        self.lda.update(Sparse2Corpus(X_flat, documents_columns=False))
        return self

    def fit_transform(self, X, y=None):
        self.fit(X)
        return self.transform(X)

    def transform(self, X):
        """
        Parameters
        ----------
        X : list of sp.csr_matrix
            One bag-of-words count matrix per document, with one row per sentence.

        Returns
        -------
        topic_vectors : list of np.ndarray
            Each matrix is of shape (sent_count, topic_count), i.e. one topic
            distribution per sentence.
        """
        topic_vectors = []
        for doc in X:
            sents_bow = Sparse2Corpus(doc, documents_columns=False)
            # variational inference returns (unnormalized) topic weights gamma
            gamma, _ = self.lda.inference(sents_bow)
            # normalize each row of gamma so it sums to 1 -> per-sentence topic distribution
            topic_dist = (gamma.T / np.sum(gamma, axis=1)).T
            topic_vectors.append(topic_dist)
        return topic_vectors
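A hypothetical toy usage of this transformer (assuming the imports above): each document is passed in as a sparse bag-of-words count matrix with one row per sentence, and the column ids match the id2word mapping handed to the constructor.

import numpy as np
import scipy.sparse as sp

# two toy "documents"; columns correspond to the ids in id2word, rows are sentences
id2word = {0: "cat", 1: "dog", 2: "fish", 3: "bird"}
doc_a = sp.csr_matrix(np.array([[2, 0, 1, 0],
                                [0, 1, 0, 3]]))
doc_b = sp.csr_matrix(np.array([[1, 1, 0, 0]]))

lda = Lda(id2word=id2word, num_topics=2, passes=2)
topic_vectors = lda.fit_transform([doc_a, doc_b])
# topic_vectors[0] has shape (2, 2): two sentences x two topics
# topic_vectors[1] has shape (1, 2)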