Ejemplo n.º 1
0
def lda_move_topics(dictionary, corpus, texts, limit, start=2, step=1,
                    passes=2, workers=2):
    """
    Train one LDA model per topic count and score each of them.

    Topic counts are taken from ``range(start, limit, step)``, so ``limit``
    itself is NOT trained (exclusive upper bound).

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Upper bound (exclusive) for the number of topics
    start : First number of topics to try
    step : Increment between consecutive numbers of topics
    passes : Number of training passes over the corpus for every model
    workers : Number of worker processes handed to LdaMulticore

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : c_v coherence of each model, in training order
    perplexity_values : Value of ``model.log_perplexity(corpus)`` for each
        model, in training order (evaluated on the training corpus itself)
    """
    coherence_values = []
    perplexity_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = LdaMulticore(corpus,
                             num_topics=num_topics,
                             id2word=dictionary,
                             passes=passes,
                             workers=workers)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model,
                                        texts=texts,
                                        dictionary=dictionary,
                                        coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
        perplexity_values.append(model.log_perplexity(corpus))

    return model_list, coherence_values, perplexity_values
Ejemplo n.º 2
0
def models_codherence_perplexity(texts,
                                 bows,
                                 dic,
                                 topic_start=100,
                                 topic_end=201,
                                 step=10,
                                 chunk=10,
                                 passes=3,
                                 cores=2):
    """ Train one LDA model per topic count and score every model.

        Topic counts come from range(topic_start, topic_end, step), so
        topic_end itself is excluded.
        --------------------
        Parameter:
            texts: list of list of tokens (used for the coherence score)
            bows: list of list of BoWs (used for training and perplexity)
            dic: dictionary of id <-> word
            topic_start, topic_end, step: range of number of topics
            chunk: chunksize passed to LdaMulticore
            passes: number of passes through the whole training data
            cores: number of LdaMulticore workers

        Return:
            models, coherence_scores, perplexity_scores -- three parallel
            lists, one entry per trained model, in training order
    """
    results = []
    for n_topics in range(topic_start, topic_end, step):
        print('Building model of %d topics' % (n_topics))
        # Train the topic model for this topic count.
        lda = LdaMulticore(corpus=bows,
                           id2word=dic,
                           eta='auto',
                           num_topics=n_topics,
                           chunksize=chunk,
                           passes=passes,
                           workers=cores)
        # c_v coherence is computed on the tokenised texts.
        scorer = CoherenceModel(model=lda,
                                texts=texts,
                                dictionary=dic,
                                coherence='c_v')
        coherence = scorer.get_coherence()
        perplexity = lda.log_perplexity(bows)
        results.append((lda, coherence, perplexity))

    # Split the per-model records into the three parallel lists.
    models = [record[0] for record in results]
    coherence_scores = [record[1] for record in results]
    perplexity_scores = [record[2] for record in results]
    return models, coherence_scores, perplexity_scores
Ejemplo n.º 3
0
    def check_perplexity(self, model: LdaMulticore,
                         valid_corpus: list) -> bool:
        """
        Compute the perplexity from the per-word bound and append it to the log.

        - perplexity: essentially an entropy measure.
        - per-word bound:
          - the return value of gensim's standard ``log_perplexity`` method
          - the variational bound of documents from the corpus as
            E_q[log p(corpus)] - E_q[log q(corpus)]
          - roughly, the gap between the estimated and the true distribution
            (the model's "uncertainty") multiplied by -1

        The stored value is ``2 ** (-bound)``.

        :param model: LdaMulticore, the gensim LDA model to evaluate
        :param valid_corpus: list{iterable of (int, int or float)}, the corpus
            the bound is evaluated on
        :return: always True
        """
        bound = model.log_perplexity(valid_corpus)
        self.log.append(np.exp2(-bound))
        return True
def main():
    """Cross-validate LDA models over a range of topic counts and plot the result.

    Reads a tab-separated count table from ``args.input``, converts it to
    bag-of-words samples, and for every K-fold split trains one
    ``LdaMulticore`` model per entry of ``args.rank_range`` on the training
    fold.  The value of ``log_perplexity`` on the held-out fold is collected
    and finally drawn as a box plot (one box per topic count) that is saved
    to ``args.output_file``.
    """
    args = parse_args()
    print('* Loading data from ', args.input)
    data_df = pd.read_csv(args.input, header=0, index_col=0, sep='\t')
    samples_bow = counts_to_bow(data_df.values)

    # NOTE: no separate hold-out split is made here; every sample is used by
    # the K-fold loop below, which builds its own train/test partitions.

    print('* Training LDA models')
    print('\t - # folds', args.n_folds)
    print('\t - # Ranks', args.rank_range)

    print('* Perplexities: ')
    kf = KFold(n_splits=args.n_folds)
    perps = []
    Ks = []
    for train_index, test_index in kf.split(samples_bow):
        samples_train = [samples_bow[i] for i in train_index]
        samples_test = [samples_bow[i] for i in test_index]
        for K in args.rank_range:
            lda = LdaMulticore(
                    samples_train,
                    num_topics=K,
                    workers=args.cores)
            # Scored on the fold that was NOT used for training.
            perp = lda.log_perplexity(samples_test)
            print('\t {}: {}'.format(K, perp))
            Ks.append(K)
            perps.append(perp)

    print(perps, Ks)

    # One row per (fold, K) pair; the box plot groups the rows by K.
    perp_df = pd.DataFrame({'log_perplexity': perps, 'num_topics': Ks})
    ax = sns.boxplot(x='num_topics', y='log_perplexity', data=perp_df)
    plot = ax.get_figure()
    plot.savefig(args.output_file)
# GENSIM
# Train a gensim LdaMulticore model and measure the wall-clock training time.
# The hyper-parameters (decay, offset, NB_TOPICS, max_iterations, batch_size,
# max_e_steps, eval_every) and both corpora are defined outside this section.
start = time.time()
lda_gensim_mc = LdaMulticore(gensim_tr_corpus,
                             id2word=id2word,
                             decay=decay,
                             offset=offset,
                             num_topics=NB_TOPICS,
                             passes=max_iterations,
                             batch=False,
                             chunksize=batch_size,
                             iterations=max_e_steps,
                             eval_every=eval_every)
gn_time = time.time() - start

# Evaluate on the test corpus and turn the returned bound into a perplexity.
# NOTE(review): other snippets in this file convert the same quantity with a
# base-2 exponential (np.exp2 / 2 ** ...); np.exp is used here -- confirm this
# is intended for the comparison with the sklearn figure printed below.
log_prep_gensim_mc = lda_gensim_mc.log_perplexity(gensim_te_corpus)
preplexity_gensim_mc = np.exp(-1. * log_prep_gensim_mc)

# sk_time and sklearn_perplexity are computed outside this section.
print("gensim run time and perplexity: {}, {}".format(gn_time,
                                                      preplexity_gensim_mc))
print("sklearn run time and perplexity: {}, {}".format(sk_time,
                                                       sklearn_perplexity))

# Let's have a look at the topics
topic_words = dict()
gensim_topics = lda_gensim_mc.show_topics(formatted=False)


def sklearn_show_topics(model, feature_names, n_top_words):
    sk_topics = []
    for topic_idx, topic in enumerate(model.components_):
Ejemplo n.º 6
0
def fit_tm_gensim(
    corpus: 'gensim.corpus' = None,
    dictionary: 'Dictionary' = None,
    text: list = None,
    range_topics: list = None,
    passes: int = 10,
    per_word_topics: bool = True,
) -> dict:
    """
    fit topic modeling model gensim, multicore (using LdaMulticore)

    One model is trained per entry of `range_topics`; each model is scored
    with c_v coherence (on `text`) and `log_perplexity` (on `corpus`), and the
    wall-clock time of training plus evaluation is recorded.

    Parameters
    ----------
    corpus : 'gensim.corpus' aka {iterable of list of (int, float), scipy.sparse.csc}
        Stream of document vectors or sparse matrix of shape (`num_terms`, `num_documents`)
    dictionary: 'Dictionary' aka {dict of (int, str),  :class:`gensim.corpora.dictionary.Dictionary`}
        Mapping from word IDs to words. It is used to determine the vocabulary size,
        as well as for debugging and topic printing.
    text : list
        list text tokens like text at `corpora.Dictionary`
    range_topics: list
        list range topics, like `[20, 25, 30, 35, 40]`;
        `None` (the default) or an empty list trains nothing
    passes: int (default = 10)
        number of passes through the corpus during training
    per_word_topics : bool (default = True)
        if True, the model also computes a list of topics, sorted in descending order of most likely
        topics for each word, along with their phi values multiplied by the feature length (i.e. word count)

    Returns
    -------
    meta_model : dict
        maps every number of topics to a dict with the items
        `model`, `coherence`, `perplexity` and `time_fit`;
        `time_fit` holds `hours` and `min` (clock-style components of the
        elapsed time) and `sec` (the TOTAL elapsed time in whole seconds)
    """

    meta_model = {}

    # The signature defaults to None, so treat "nothing given" as "no models".
    if range_topics is None:
        return meta_model

    for num_topic in range_topics:
        print(f'#topic {num_topic} ..........')
        time_start = datetime.now()

        # fit models TM
        model_gensim = LdaMulticore(
            corpus=corpus,
            id2word=dictionary,
            num_topics=num_topic,
            passes=passes,
            per_word_topics=per_word_topics,  # honour the caller's choice
        )

        # evaluation
        # coherence
        coherence_model = CoherenceModel(model=model_gensim, texts=text, dictionary=dictionary, coherence='c_v')
        coherence_value = coherence_model.get_coherence()
        print(f'\tcoherence score: {coherence_value}')

        # perplexity (evaluated on the training corpus itself)
        perplexity_value = model_gensim.log_perplexity(corpus)
        print(f'\tperplexity score: {perplexity_value}')

        # time
        time_end = datetime.now() - time_start
        print(f'\n\ttime: {time_end}')

        # total_seconds() also counts whole days, which `.seconds` drops
        sec = int(time_end.total_seconds())
        hours = sec // 3600
        minutes = (sec // 60) - (hours * 60)

        meta_model[num_topic] = {
            'model': model_gensim,
            'coherence': coherence_value,
            'perplexity': perplexity_value,
            'time_fit': {'hours': hours, 'min': minutes, 'sec': sec},
        }

    return meta_model
Ejemplo n.º 7
0
                            alpha='asymmetric',
                            eval_every=eval_every,
                            workers=3,
                            random_state=seed)

    # Check resulting topics.
    listOfTopics = ldaModel.print_topics(num_topics=numberOfTopics,
                                         num_words=15)
    for index, i in enumerate(listOfTopics):
        string = str(i[1])
        for c in "0123456789+*\".":
            string = string.replace(c, "")
        string = string.replace("  ", " ")
        print(string)
    # calculate & display perplexity
    print('\nPerplexity: ', ldaModel.log_perplexity(
        corpus))  # a measure of how good the model is. lower the better.

    # calculate & display coherence
    coherenceModel = CoherenceModel(model=ldaModel,
                                    texts=document,
                                    dictionary=dictionary,
                                    coherence='c_v')
    ldaCoherence = coherenceModel.get_coherence()
    print('\nCoherence Score: ', ldaCoherence)

    # assign a file name based on the loop number so that models aren't overridden during successive iterations.
    path = './models/both/nouns_only'
    if not os.path.exists(path):
        os.makedirs(path)
    ldaModel.save(f'./models/both/nouns_only/model1-{loopNum}.model')
class GensimMalletTopicExtractor:
    """LDA topic-extraction pipeline built on gensim (LdaMallet or LdaMulticore).

    ``extract_topics`` runs the whole preprocessing chain (tokenising,
    stop-word removal, bigram/trigram detection, spaCy lemmatisation), builds
    the dictionary and bag-of-words corpus and trains one model.
    ``compute_coherence_values`` trains one model per topic count and keeps
    the one with the highest c_v coherence.  The remaining methods print
    reports about the current model.
    """

    def __init__(self, language='english', stopwords_extent=None):
        """Set up the stop-word list and initialise all pipeline state to empty.

        :param language: 'english', 'french' or 'spanish'
        :param stopwords_extent: extra stop word(s) to add to the language's
            list: a single string, or a list/tuple/set of strings
        :raises ValueError: if ``language`` is not one of the supported names
        """
        self.language2la = {
            'english': 'en',
            'french': 'fr',
            'spanish': 'es'
        }
        if language not in self.language2la:
            raise ValueError('Language must be "english", "french" or "spanish"')
        self.language = language
        self.stop_words = stopwords.words(self.language)
        # isinstance, not `is`: `x is str` only matches the type object itself,
        # so the extra stop words would never be added.
        if isinstance(stopwords_extent, str):
            # A single word: append it whole (extend() would add its letters).
            self.stop_words.append(stopwords_extent)
        elif isinstance(stopwords_extent, (list, tuple, set, frozenset)):
            self.stop_words.extend(stopwords_extent)
        self.df_topic_sents_keywords = None
        self.bigram = None
        self.bigram_phraser = None
        self.trigram = None
        self.trigram_phraser = None
        self.vis = None
        self.data = None
        self.data_words = None
        self.data_words_nostops = None
        self.data_words_bigrams = None
        self.data_words_trigrams = None
        self.nlp = None
        self.data_lemmatized = None
        self.id2word = None
        self.texts = None
        self.corpus = None
        self.mallet_path = None
        self.lda_model = None
        self.coherence_model_lda = None
        self.coherence_lda = None
        self.coherence_values = []
        self.model_list = []
        self.optimal_number_of_topics = None
        self.optimal_model = None
        self.optimal_topics = None

    @staticmethod
    def sent_to_words(sentences, remove_punctuation=True):
        """Yield the ``simple_preprocess`` token list of every sentence.

        ``remove_punctuation`` is forwarded as ``deacc``.  Per gensim's
        documentation ``deacc=True`` strips accent marks from the tokens; the
        parameter name is kept for backward compatibility.
        """
        for sentence in sentences:
            yield(simple_preprocess(str(sentence), deacc=remove_punctuation))

    def remove_stopwords(self, texts):
        """Re-tokenise every document and drop the tokens found in ``self.stop_words``."""
        # Set lookup instead of scanning the stop-word list for every token.
        stop_set = set(self.stop_words)
        return [[word for word in simple_preprocess(str(doc)) if word not in stop_set] for doc in texts]

    def make_bigrams(self, texts):
        """Fit a bigram model on ``self.data_words`` and apply it to ``texts``."""
        self.bigram = Phrases(self.data_words, min_count=5, threshold=100)
        self.bigram_phraser = Phraser(self.bigram)
        return [self.bigram_phraser[doc] for doc in texts]

    def make_trigrams(self, texts):
        """Fit a trigram model on the bigrammed ``texts`` and apply it to them.

        Requires ``make_bigrams`` to have been called first, because it uses
        ``self.bigram_phraser``.
        """
        tokens_ = self.bigram_phraser[texts]
        self.trigram = Phrases(tokens_, threshold=100)
        self.trigram_phraser = Phraser(self.trigram)
        return [self.trigram_phraser[self.bigram_phraser[doc]] for doc in texts]

    def lemmatization(self, texts, allowed_postags=None):
        """Lemmatise ``texts`` with ``self.nlp`` and keep only the allowed POS tags.

        See https://spacy.io/api/annotation for the tag set.

        :param texts: iterable of token lists
        :param allowed_postags: POS tags to keep; defaults to
            ['NOUN', 'ADJ', 'VERB', 'ADV']
        :return: list of lemma lists, one per input document
        """
        if allowed_postags is None:
            allowed_postags = ['NOUN', 'ADJ', 'VERB', 'ADV']
        texts_out = []
        for sent in texts:
            doc = self.nlp(" ".join(sent))
            texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
        return texts_out

    def view_terms_frequency(self, text_id, first_words=20):
        """Pretty-print (term, frequency) pairs of one or several documents.

        :param text_id: index of a document in ``self.corpus``, or a slice
            selecting several documents
        :param first_words: maximum number of pairs shown per document
        """
        # Human readable format of corpus (term-frequency)
        if isinstance(text_id, slice):
            documents = self.corpus[text_id]
        else:
            # A plain index selects ONE document; wrap it so that the
            # comprehension below iterates documents, not (id, freq) pairs.
            documents = [self.corpus[text_id]]
        list_ = [[(self.id2word[id_], freq) for id_, freq in text[:first_words]] for text in documents]
        pprint(list_)

    def visualize_lda(self):
        """Prepare the pyLDAvis visualisation of ``self.lda_model`` and print it."""
        # Visualize the topics
        # pyLDAvis.enable_notebook()
        self.vis = pyLDAvis.gensim.prepare(self.lda_model, self.corpus, self.id2word)
        print(self.vis)

    def instanciate_model(self, num_topics, passes, iterations,
                          enable_mallet, optimize_interval, topic_threshold, show_topics_on_creation=False):
        """Train a model on ``self.corpus`` and store it in ``self.lda_model``.

        :param num_topics: number of topics of the model
        :param passes: training passes (LdaMulticore only)
        :param iterations: training iterations
        :param enable_mallet: exactly ``True`` trains an LdaMallet model, any
            other value trains an LdaMulticore model
        :param optimize_interval: forwarded to LdaMallet
        :param topic_threshold: forwarded to LdaMallet
        :param show_topics_on_creation: print the topics after training
        """
        if enable_mallet is True:
            # Download File: http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
            os.environ.update({'MALLET_HOME': r'C:/mallet-2.0.8/'})
            self.mallet_path = 'C:\\mallet-2.0.8\\bin\\mallet'  # update this path
            self.lda_model = LdaMallet(self.mallet_path,
                                       corpus=self.corpus,
                                       num_topics=num_topics,
                                       id2word=self.id2word,
                                       iterations=iterations,
                                       optimize_interval=optimize_interval,
                                       topic_threshold=topic_threshold)
            print('Mallet LDA model built\n')
            if show_topics_on_creation is True:
                pprint(self.lda_model.show_topics(formatted=False))
        else:
            self.lda_model = LdaMulticore(corpus=self.corpus,
                                          id2word=self.id2word,
                                          num_topics=num_topics,
                                          random_state=100,
                                          chunksize=500,
                                          passes=passes,
                                          iterations=iterations,
                                          per_word_topics=True)
            print('LDA_MultiCore model built\n')
            if show_topics_on_creation is True:
                pprint(self.lda_model.print_topics())

    def extract_topics(self, data, num_topics, passes=10, iterations=500,
                       enable_mallet=True, optimize_interval=0,
                       topic_threshold=0.0):
        """Preprocess ``data``, train one model with ``num_topics`` topics and score it.

        Every intermediate result (token lists, n-grams, lemmas, dictionary,
        corpus, model, coherence) is stored on the instance.  The remaining
        parameters are forwarded to ``instanciate_model``.
        """
        self.data = data
        print('\nEXTRACTING ' + str(num_topics) + ' TOPICS')
        self.data_words = list(self.sent_to_words(self.data, True))
        # Remove Stop Words
        print('\nRemoving stopwords')
        self.data_words_nostops = self.remove_stopwords(self.data_words)
        # Form Bigrams
        print('Looking for bigrams')
        self.data_words_bigrams = self.make_bigrams(self.data_words_nostops)
        # Form Trigrams (make_trigrams applies the bigram phraser itself)
        print('Looking for trigrams')
        self.data_words_trigrams = self.make_trigrams(self.data_words_nostops)
        # Initialize the spaCy model with the parser and NER disabled (for efficiency)
        # python3 -m spacy download en
        print('Loading Spacy with ' + self.language + ' dictionary')
        self.nlp = spacy.load(self.language2la[self.language], disable=['parser', 'ner'])
        # Do lemmatization keeping only noun, adj, vb, adv
        print('Lemmatizing')
        self.data_lemmatized = self.lemmatization(self.data_words_trigrams,
                                                  allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
        # Create Dictionary
        print('Creating dictionary')
        self.id2word = corpora.Dictionary(self.data_lemmatized)
        # Create Corpus
        print('Creating corpus')
        self.texts = self.data_lemmatized
        # Term Document Frequency
        print('Computing document frequency')
        self.corpus = [self.id2word.doc2bow(text) for text in self.texts]
        # Build LDA model
        print('\nEnable_mallet is', enable_mallet, '\n')
        self.instanciate_model(num_topics, passes, iterations,
                               enable_mallet, optimize_interval, topic_threshold,
                               show_topics_on_creation=True)
        # Print the model's log_perplexity value on the training corpus, if
        # the model offers that method.
        # NOTE(review): this is the bound returned by gensim, not the
        # perplexity itself -- interpret the sign/direction accordingly.
        if hasattr(self.lda_model, 'log_perplexity'):
            print('\nPerplexity: ', self.lda_model.log_perplexity(self.corpus))

        # Compute Coherence Score
        print('\nComputing coherence model')
        self.coherence_model_lda = CoherenceModel(model=self.lda_model,
                                                  texts=self.data_lemmatized,
                                                  dictionary=self.id2word,
                                                  coherence='c_v')
        print('Getting coherence')
        self.coherence_lda = self.coherence_model_lda.get_coherence()
        print('\nCoherence Score: ', self.coherence_lda)

        if enable_mallet is False:
            self.visualize_lda()

    def view_optimal_topics(self, num_words=20):
        """Pretty-print the topics of ``self.optimal_model``."""
        pprint(self.optimal_model.print_topics(num_words=num_words))

    def compute_coherence_values(self, limit, start=2, step=3, passes=10,
                                 iterations=500, enable_mallet=True,
                                 optimize_interval=0, topic_threshold=0.0):
        """
        Compute c_v coherence for various number of topics

        One model is trained per value of ``range(start, limit, step)``
        (``limit`` is exclusive).  Results of a previous call are discarded.
        Nothing is returned; the results are stored on the instance:

        model_list : List of LDA topic models
        coherence_values : Coherence values corresponding to the LDA model with respective number of topics
        optimal_model, optimal_number_of_topics, optimal_topics : the model
            with the highest coherence, its topic count and its topics

        Parameters:
        ----------
        limit : Upper bound (exclusive) for the number of topics
        start, step : first topic count and increment
        passes, iterations, enable_mallet, optimize_interval, topic_threshold :
            forwarded to ``instanciate_model``

        Raises:
        -------
        ValueError : if the range of topic counts is empty
        """
        x = range(start, limit, step)
        if len(x) == 0:
            raise ValueError('range(start, limit, step) contains no number of topics')

        # Start from a clean slate so that the scores line up with `x` even
        # when this method is called more than once.
        self.model_list = []
        self.coherence_values = []

        for num_topics in x:
            print('\n' + '*'*10 + ' COMPUTING COHERENCE FOR ' + str(num_topics) + ' TOPICS ' + '*'*10)
            self.instanciate_model(num_topics, passes, iterations,
                                   enable_mallet, optimize_interval, topic_threshold,
                                   show_topics_on_creation=False)
            self.model_list.append(self.lda_model)
            coherence_model = CoherenceModel(model=self.lda_model,
                                             texts=self.data_lemmatized,
                                             dictionary=self.id2word,
                                             coherence='c_v')
            self.coherence_values.append(coherence_model.get_coherence())

        # Show graph
        plt.plot(x, self.coherence_values)
        plt.xlabel("Num Topics")
        plt.ylabel("Coherence score")
        # A list, not a bare string: a string would be split into characters.
        plt.legend(["coherence_values"], loc='best')
        plt.show()

        # Print the coherence scores
        for m, cv in zip(x, self.coherence_values):
            print("Num Topics =", m, " has Coherence Value of", round(cv, 4))

        optimal_model_index = self.coherence_values.index(max(self.coherence_values))
        # Index into the range so that `step` is taken into account.
        self.optimal_number_of_topics = x[optimal_model_index]
        self.optimal_model = self.model_list[optimal_model_index]
        print('\nOptimal number of topics is ' + str(self.optimal_number_of_topics) +
              ' with coherence score : ' + str(self.coherence_values[optimal_model_index]))
        self.optimal_topics = self.optimal_model.show_topics(num_topics=self.optimal_number_of_topics,
                                                             num_words=20, formatted=False)
        self.view_optimal_topics()

    def format_topics_sentences(self, ldamodel=None):
        """Build a table with the dominant topic of every document.

        :param ldamodel: model to use; defaults to ``self.optimal_model`` and
            then to ``self.lda_model``
        :return: DataFrame with the columns 'Dominant_Topic',
            'Perc_Contribution', 'Topic_Keywords' plus one unnamed column
            (label 0) holding the original text from ``self.data``
        :raises ValueError: if no model is given and none is stored
        """
        if ldamodel is None:
            ldamodel = self.optimal_model if self.optimal_model is not None else self.lda_model
        if ldamodel is None:
            raise ValueError('No model available: pass ldamodel or train a model first')

        # Models created with per_word_topics=True yield a
        # (doc_topics, word_topics, phi_values) tuple per document; only the
        # first element is the list of (topic, probability) pairs.
        wrapped = getattr(ldamodel, 'per_word_topics', False)
        keywords_by_topic = {}  # show_topic() result is the same for every document
        rows = []

        # Get main topic in each document
        for row in ldamodel[self.corpus]:
            if wrapped:
                row = row[0]
            if not row:
                # Keep one row per document so the texts stay aligned.
                rows.append((float('nan'), float('nan'), ''))
                continue
            # First pair with the highest probability => dominant topic
            topic_num, prop_topic = max(row, key=lambda x: x[1])
            if topic_num not in keywords_by_topic:
                wp = ldamodel.show_topic(topic_num)
                keywords_by_topic[topic_num] = ", ".join([word for word, prop in wp])
            rows.append((int(topic_num), round(prop_topic, 4), keywords_by_topic[topic_num]))

        # Built in one go: DataFrame.append no longer exists in pandas >= 2.0.
        sent_topics_df = pd.DataFrame(rows,
                                      columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'])

        # Add original text to the end of the output
        contents = pd.Series(self.data)
        sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
        return sent_topics_df

    def get_most_representative_documents(self):
        """Print, for every dominant topic, the text of its most representative document.

        The most representative document of a topic is the one with the
        highest 'Perc_Contribution' among the documents dominated by it.
        """
        if self.df_topic_sents_keywords is None:
            self.df_topic_sents_keywords = self.format_topics_sentences()

        # Best-scoring document of every topic group
        best_per_topic = [
            grp.sort_values(['Perc_Contribution'], ascending=False).head(1)
            for _, grp in self.df_topic_sents_keywords.groupby('Dominant_Topic')
        ]
        sent_topics_sorteddf_mallet = pd.concat(best_per_topic, axis=0)

        # Reset Index
        sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)
        # Format
        sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]

        for i, text in enumerate(sent_topics_sorteddf_mallet['Text']):
            print(i, text)

    def get_topic_distribution(self):
        """Print the per-topic document counts and shares next to the per-document table."""
        if self.df_topic_sents_keywords is None:
            self.df_topic_sents_keywords = self.format_topics_sentences()
        # Number of Documents for Each Topic
        topic_counts = self.df_topic_sents_keywords['Dominant_Topic'].value_counts()
        # Percentage of Documents for Each Topic
        topic_contribution = round(topic_counts/topic_counts.sum(), 4)
        # Topic Number and Keywords
        topic_num_keywords = self.df_topic_sents_keywords[['Dominant_Topic', 'Topic_Keywords']]
        # Concatenate Column wise
        # NOTE(review): the first frame is indexed by document and the two
        # series by topic number, so rows are aligned on those index values --
        # confirm this is the intended layout.
        df_dominant_topics = pd.concat([topic_num_keywords, topic_counts, topic_contribution], axis=1)
        # Change Column names
        df_dominant_topics.columns = ['Dominant_Topic', 'Topic_Keywords', 'Num_Documents', 'Perc_Documents']
        # Show
        print(df_dominant_topics)
    test_corpus = Sparse2Corpus(test_dtm, documents_columns=False)
    timing = []
    for workers in [8, 16]:
        for num_topics in [10, 50]:
            print('start', workers, num_topics, end=' ')
            start = time()
            lda = LdaMulticore(corpus=train_corpus,
                               num_topics=num_topics,
                               id2word=id2word,
                               chunksize=1000,
                               passes=1,
                               eval_every=None,
                               workers=workers,
                               random_state=42)
            duration = time() - start
            test_perplexity = 2**(-lda.log_perplexity(test_corpus))
            timing.append([workers, num_topics, duration, test_perplexity])
            print(format_time(duration), test_perplexity)
            pd.DataFrame(timing,
                         columns=[
                             'workers', 'num_topics', 'duration',
                             'test_perplexity'
                         ]).to_csv(f'timings_{workers}.csv', index=False)
    exit()

    test_vocab = test_dtm.count_nonzero()
    perplexity, coherence = [], []
    for num_topics, passes in model_params:
        model_path = vocab_path / str(num_topics) / str(passes)
        if not model_path.exists():
            model_path.mkdir(exist_ok=True, parents=True)