Example #1

# Assumes: import time; import gensim; import pandas as pd;
# from gensim.sklearn_api import ldamodel; clean() is a project-local
# text-cleaning helper.
    def topic_model(self):
        """
        Fit a topic model over all the units: for each submission, compute
        the probability that it exhibits each topic.
        :return: pandas DataFrame [number_of_units, number_of_topics] - the
                 per-topic probability for each submission
        """
        print('{}: Start topic model'.format(
            (time.asctime(time.localtime(time.time())))))
        # Clean the data
        print('{}: Clean the data'.format(
            (time.asctime(time.localtime(time.time())))))

        data_clean = {
            row['submission_id']: clean(row['submission_body']).split()
            for index, row in self.embedded_submission_text.iterrows()
        }

        # Creating the term dictionary of our corpus, where every unique term is assigned an index.
        print('{}: Create the dictionary'.format(
            time.asctime(time.localtime(time.time()))))
        dictionary = gensim.corpora.Dictionary(data_clean.values())

        # Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
        print('{}: Create data term matrix'.format(
            time.asctime(time.localtime(time.time()))))
        data_term_matrix = {
            index: dictionary.doc2bow(doc)
            for index, doc in data_clean.items()
        }

        # Get topics for the data
        print('{}: Predict topics'.format(
            time.asctime(time.localtime(time.time()))))

        lda_model = ldamodel.LdaTransformer(num_topics=self.number_of_topics,
                                            id2word=dictionary,
                                            passes=50,
                                            minimum_probability=0)
        # The transformer must be fitted before it can transform, so use fit_transform
        result = lda_model.fit_transform(list(data_term_matrix.values()))

        print('{}: Create final topic model'.format(
            time.asctime(time.localtime(time.time()))))
        submission_ids_df = pd.DataFrame(list(data_term_matrix.keys()),
                                         columns=['submission_id'])
        result_columns = list(range(self.number_of_topics))
        topic_model_result_df = pd.DataFrame(result, columns=result_columns)

        print('{}: Save final topic model'.format(
            time.asctime(time.localtime(time.time()))))
        topic_model_final_result = pd.concat(
            [submission_ids_df, topic_model_result_df], axis=1)
        print('{}: Finish topic model'.format(
            (time.asctime(time.localtime(time.time())))))

        return topic_model_final_result
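A minimal standalone sketch of the doc2bow-to-LdaTransformer flow used above, on a toy corpus (requires gensim 3.x; the gensim.sklearn_api wrappers were removed in gensim 4.0):

# Toy corpus: each document is a list of tokens, as produced by clean().split()
import gensim
from gensim.sklearn_api import ldamodel

texts = [['cat', 'dog', 'pet'], ['stock', 'market', 'price'], ['dog', 'bark']]
dictionary = gensim.corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(t) for t in texts]

lda = ldamodel.LdaTransformer(num_topics=2, id2word=dictionary,
                              passes=10, minimum_probability=0)
doc_topics = lda.fit_transform(corpus)   # numpy array, shape [3, 2]
print(doc_topics.sum(axis=1))            # with minimum_probability=0, rows sum to ~1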
Example #2
# Assumes: from sklearn.pipeline import Pipeline;
# from gensim.sklearn_api import ldamodel, lsimodel; TextNormalizer and
# GensimTfidfVectorizer are project-local transformers.
    def __init__(self, n_topics=50, estimator='LDA'):
        """
        n_topics is the desired number of topics.

        To use Latent Semantic Analysis, set estimator to 'LSA';
        otherwise this defaults to Latent Dirichlet Allocation.
        """
        self.n_topics = n_topics

        if estimator == 'LSA':
            self.estimator = lsimodel.LsiTransformer(num_topics=self.n_topics)
        else:
            self.estimator = ldamodel.LdaTransformer(num_topics=self.n_topics)

        self.model = Pipeline([('norm', TextNormalizer()),
                               ('vect', GensimTfidfVectorizer()),
                               ('model', self.estimator)])
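A hypothetical usage sketch for the class above; the class name TopicModeler is assumed (the original name is not shown), and TextNormalizer/GensimTfidfVectorizer are assumed to accept raw text documents:

# Hypothetical usage of the LSA/LDA pipeline wrapper above.
docs = ["the cat sat on the mat", "dogs chase cats", "stock prices fell"]

lsa = TopicModeler(n_topics=10, estimator='LSA')  # Latent Semantic Analysis
lda = TopicModeler(n_topics=10)                   # defaults to LDA

doc_topics = lda.model.fit_transform(docs)        # document-topic matrix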
Example #3
# Assumes: from sklearn.pipeline import Pipeline;
# from gensim.sklearn_api import ldamodel; TextNormalizer and
# GensimOneHotVectorizer are project-local transformers.
    def __init__(self,
                 n_topics=100,
                 update_every=0,
                 passes=1,
                 alpha="auto",
                 scorer="perplexity",
                 include_bigram=False,
                 bigram_path="",
                 stopwords=None):
        # Avoid the mutable-default-argument pitfall for stopwords
        stopwords = stopwords if stopwords is not None else []
        self.n_topics = n_topics
        self.model = Pipeline([
            ('norm',
             TextNormalizer(include_bigram=include_bigram,
                            bigram_path=bigram_path,
                            stopwords=stopwords)),
            ('vect', GensimOneHotVectorizer()),
            ('model',
             ldamodel.LdaTransformer(num_topics=self.n_topics,
                                     update_every=update_every,
                                     passes=passes,
                                     alpha=alpha,
                                     scorer=scorer))
        ])
        # Placeholder for a hyperparameter search object set up later
        self.search = None
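For reference, update_every=0 selects batch (rather than online) training and alpha="auto" lets gensim learn an asymmetric document-topic prior from the corpus; the same options shown directly on the underlying LdaModel (gensim 3.x), with a toy corpus:

# The pipeline's LdaTransformer forwards these options to gensim's LdaModel.
from gensim.corpora import Dictionary
from gensim.models import LdaModel

texts = [['cat', 'dog'], ['stock', 'price'], ['dog', 'bark']]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(t) for t in texts]

lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=2,
               update_every=0,  # 0 = batch learning: update after each full pass
               passes=1,
               alpha='auto')    # learn an asymmetric prior from the data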
Example #4

# Assumes: import os, time; import pandas as pd; from gensim import corpora;
# from gensim.sklearn_api import ldamodel; data_directory is a module-level
# path constant; self.clean() is the class's text-cleaning helper.
    def topic_model(self):
        # Clean the data
        print('{}: Clean the data'.format(
            (time.asctime(time.localtime(time.time())))))
        units_clean = {
            row['comment_id']: self.clean(row['comment_body']).split()
            for index, row in self.units.iterrows()
        }
        all_data_clean = {
            row['comment_id']: self.clean(row['comment_body']).split()
            for index, row in self.all_data.iterrows()
        }
        # Creating the term dictionary of our corpus, where every unique term is assigned an index.
        print('{}: Create the dictionary'.format(
            (time.asctime(time.localtime(time.time())))))
        dictionary = corpora.Dictionary(all_data_clean.values())

        # Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
        print('{}: Create units term matrix'.format(
            (time.asctime(time.localtime(time.time())))))
        units_term_matrix = {
            index: dictionary.doc2bow(doc)
            for index, doc in units_clean.items()
        }
        print('{}: Create all data term matrix'.format(
            (time.asctime(time.localtime(time.time())))))
        all_data_term_matrix = {
            index: dictionary.doc2bow(doc)
            for index, doc in all_data_clean.items()
        }

        # Create LDA model
        print('{}: Create model'.format(
            (time.asctime(time.localtime(time.time())))))
        model = ldamodel.LdaTransformer(num_topics=self.number_of_topics,
                                        id2word=dictionary,
                                        passes=50,
                                        minimum_probability=0)
        # Train LDA model on the comments term matrix.
        print('{}: Fit the model on all data'.format(
            (time.asctime(time.localtime(time.time())))))
        model = model.fit(list(all_data_term_matrix.values()))
        # Get topics for the data
        print('{}: Predict topics for units'.format(
            (time.asctime(time.localtime(time.time())))))
        result = model.transform(list(units_term_matrix.values()))

        print('{}: Create final topic model data'.format(
            (time.asctime(time.localtime(time.time())))))
        comment_ids_df = pd.DataFrame(list(units_term_matrix.keys()),
                                      columns=['comment_id'])
        result_columns = [
            'topic_model_' + str(i) for i in range(self.number_of_topics)
        ]
        topic_model_result_df = pd.DataFrame(result, columns=result_columns)

        print('{}: Save final topic model data'.format(
            (time.asctime(time.localtime(time.time())))))
        topic_model_final_result = pd.concat(
            [comment_ids_df, topic_model_result_df], axis=1)
        topic_model_final_result.to_csv(
            os.path.join(data_directory, 'topic_model_CMV.csv'))
        self.save()
        return self
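Note the fit/transform split in the example above: both the dictionary and the LDA model are built from all_data, and only the units subset is projected, so the unit vectors live in the topic space of the full corpus. A condensed sketch of that pattern:

# Fit the topic model on everything, then score only a subset.
import gensim
from gensim.sklearn_api import ldamodel

all_texts = [['cat', 'dog'], ['stock', 'price'], ['dog', 'bark'], ['market', 'price']]
unit_texts = all_texts[:2]                        # the subset to score

dictionary = gensim.corpora.Dictionary(all_texts)
lda = ldamodel.LdaTransformer(num_topics=2, id2word=dictionary,
                              passes=10, minimum_probability=0)
lda.fit([dictionary.doc2bow(t) for t in all_texts])
unit_topics = lda.transform([dictionary.doc2bow(t) for t in unit_texts])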

Example #5

# Assumes: from gensim.matutils import sparse2full; self.tfidf is a fitted
# gensim TfidfModel, self.lexicon a gensim Dictionary, and self.tofull a
# flag selecting dense output. The __main__ block below additionally assumes
# the Pipeline, TextNormalizer and GensimTfidfVectorizer imports from Example #2.
    def transform(self, documents):
        def generator():
            for document in documents:
                # TF-IDF-weight the bag-of-words representation of the document
                vec = self.tfidf[self.lexicon.doc2bow(document)]
                if self.tofull:
                    # sparse2full needs the target length (the vocabulary size)
                    yield sparse2full(vec, len(self.lexicon))
                else:
                    yield vec

        return list(generator())
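gensim's sparse2full densifies a list of (term_id, weight) pairs into a fixed-length numpy vector, which is what downstream scikit-learn estimators expect; a quick standalone check:

from gensim.matutils import sparse2full

sparse_vec = [(0, 0.5), (3, 0.25)]   # (term_id, weight) pairs
dense = sparse2full(sparse_vec, 5)   # the length must match the vocabulary size
print(dense)                         # length-5 array: [0.5, 0.0, 0.0, 0.25, 0.0]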


if __name__ == '__main__':
    from reader import PickledCorpusReader

    corpus = PickledCorpusReader('../corpus')
    docs = [
        list(corpus.docs(fileids=fileid))[0] for fileid in corpus.fileids()
    ]

    model = Pipeline([('norm', TextNormalizer()),
                      ('vect', GensimTfidfVectorizer()),
                      ('lda', ldamodel.LdaTransformer())])

    model.fit_transform(docs)

    print(model.named_steps['norm'])
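fit_transform above returns the document-topic matrix, which the script then discards; a variant that keeps it (LdaTransformer defaults to num_topics=100):

    # Variant: capture the document-topic matrix instead of discarding it.
    doc_topics = model.fit_transform(docs)
    print(doc_topics.shape)   # (len(docs), 100) with the default num_topics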