Example 1
import csv
import logging

from gensim.corpora import Dictionary
# The Mallet wrapper ships with gensim 3.x; it was removed in gensim 4.0.
from gensim.models.wrappers import LdaMallet


def lda(filename):
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    path_to_mallet_binary = "/home/xiu-xiu/Mallet/bin/mallet"

    # Read tweets from a CSV file with a 'text' column and tokenize each
    # tweet on whitespace.
    tweets = []
    with open(filename, newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        for tweet in reader:
            tweets.append(tweet['text'].split(' '))

    # Build the gensim dictionary and the bag-of-words corpus.
    dictionary = Dictionary(tweets)
    corpus = [dictionary.doc2bow(tweet) for tweet in tweets]

    # Train one Mallet LDA model per topic count and write the top 20
    # words of every topic to a text file, one topic per line.
    for num_topics in [20, 30, 50]:
        model = LdaMallet(path_to_mallet_binary,
                          corpus=corpus,
                          num_topics=num_topics,
                          id2word=dictionary)
        with open('lda' + str(num_topics) + '.txt', 'w') as result:
            for topic in range(num_topics):
                for word, prob in model.show_topic(topic, topn=20):
                    result.write(word + ' ')
                result.write('\n')
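A minimal way to invoke the function; the CSV filename below is illustrative, and the file is assumed to have a header row containing a 'text' column:

# Hypothetical invocation; 'tweets.csv' is not part of the original example.
lda('tweets.csv')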
Example 2
    def format_topics_sentences_mallet(self, ldamodel: LdaMallet, corpus,
                                       texts):
        # Requires: import pandas as pd;
        # from gensim.models.wrappers import LdaMallet
        # Rows are collected in a list and turned into a DataFrame once,
        # since DataFrame.append was removed in pandas 2.0.
        rows = []

        # Get the dominant topic, its weight and its top keywords for
        # each document
        for row in ldamodel[corpus]:
            # Sort the (topic, weight) pairs so the dominant topic comes first
            row = sorted(row, key=lambda x: x[1], reverse=True)
            topic_num, prop_topic = row[0]
            wp = ldamodel.show_topic(topic_num, topn=8)
            topic_keywords = ", ".join(word for word, _ in wp)
            rows.append([int(topic_num), round(prop_topic, 4), topic_keywords])

        sent_topics_df = pd.DataFrame(
            rows,
            columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'])

        # Add the original text to the end of the output
        contents = pd.Series(texts)
        sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
        return sent_topics_df
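A sketch of how the method might be called; the object and variable names below are stand-ins for the model, corpus and raw texts built elsewhere in the class:

# Hypothetical usage; 'helper', 'model', 'corpus' and 'texts' are assumed
# to exist and are not defined in the original snippet.
df = helper.format_topics_sentences_mallet(model, corpus, texts)
df = df.rename(columns={0: 'Text'})  # the concatenated texts column gets label 0
print(df.head())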
Example 3
    def run_lda(self, processed_sentences):
        # Requires: import nltk; from gensim import corpora;
        # from gensim.models.wrappers import LdaMallet
        lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
        # List containing the final topic keywords
        topic_top_words = []

        documents = [
            comment.split() for comment in processed_sentences if comment
        ]
        dictionary = corpora.Dictionary(documents)
        # Filter out words that occur in fewer than 5 comments or in more
        # than half of the comments
        dictionary.filter_extremes(no_below=5, no_above=0.5)
        doc_term_matrix = [dictionary.doc2bow(doc) for doc in documents]
        mallet_path = 'C:\\Mallet-2.0.8\\bin\\mallet'
        optimization_interval = 50
        lda_alpha = 1

        lda = LdaMallet(mallet_path,
                        doc_term_matrix,
                        num_topics=self.number_of_topics,
                        id2word=dictionary,
                        optimize_interval=optimization_interval,
                        alpha=lda_alpha)

        # This list contains the word probabilities given a topic
        topic_words_and_probs = []

        for i in range(self.number_of_topics):
            # Get the top number_of_lda_keywords_for_assignment words and
            # their probabilities for the topic
            topic_words_and_probs.append(
                lda.show_topic(
                    i, topn=self.number_of_lda_keywords_for_assignment))

        # Drop zero-probability words and record every surviving keyword
        for i in range(len(topic_words_and_probs)):
            temp = []
            for word, prob in topic_words_and_probs[i]:
                if prob > 0.0:
                    temp.append((word, prob))
                    self.total_topic_word.append(word)
            topic_words_and_probs[i] = temp

        # Penn Treebank noun tags. nltk.pos_tag emits NNP/NNPS for proper
        # nouns, so the NP/NPS tags used originally would never match.
        noun_tags = ['NN', 'NNS', 'NNP', 'NNPS']

        for i in range(self.number_of_topics):
            # Keep only the top keywords for the topic that are nouns
            topic_words = [word for word, prob in topic_words_and_probs[i]]

            final_topic_words = []

            for word in topic_words:
                if len(final_topic_words) >= self.number_of_lda_keywords:
                    break

                # Tag the raw word, then lemmatize it before the
                # duplicate check
                pos = nltk.pos_tag([word])
                word = lemmatizer.lemmatize(word)
                if word not in final_topic_words and pos[0][1] in noun_tags:
                    final_topic_words.append(word)
            topic_top_words.append(final_topic_words)
        return topic_top_words, topic_words_and_probs
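The optimize_interval and alpha arguments are passed straight through to Mallet, which then re-estimates the Dirichlet hyperparameters every 50 sampling iterations. A hypothetical driver for the method, with all names and values purely illustrative:

# Hypothetical driver; the class name and attribute values are
# illustrative and do not appear in the original snippet.
analyzer = CommentTopicModel()  # class assumed to define run_lda
analyzer.number_of_topics = 10
analyzer.number_of_lda_keywords = 5
analyzer.number_of_lda_keywords_for_assignment = 20
analyzer.total_topic_word = []
# 'processed_comments' stands in for a list of preprocessed comment strings
top_words, words_and_probs = analyzer.run_lda(processed_comments)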
Example 4
import json

from gensim.models import CoherenceModel
from gensim.models.wrappers import LdaMallet


def get_topics(num, corpus, id2word, output_dir, all_sentences):
    print(num)
    # 'args' is assumed to be a module-level argparse namespace holding
    # the path to the Mallet binary.
    ldamallet = LdaMallet(args.mallet_dir,
                          corpus=corpus,
                          num_topics=num,
                          prefix=output_dir + "/" + str(num),
                          workers=4,
                          id2word=id2word,
                          iterations=1000,
                          random_seed=42)
    # Score the trained model with the c_v coherence measure
    coherence_model_ldamallet = CoherenceModel(model=ldamallet,
                                               texts=all_sentences,
                                               dictionary=id2word,
                                               coherence='c_v')
    coherence_ldamallet = coherence_model_ldamallet.get_coherence()
    print('\nCoherence Score: ', coherence_ldamallet)
    # Dump the top words of every topic to a JSON file and save the model
    keywords = {
        i: ", ".join(word for word, _ in ldamallet.show_topic(i))
        for i in range(ldamallet.num_topics)
    }
    with open(output_dir + "/" + str(num) + '_words.json', 'w') as f:
        json.dump(keywords, f)
    ldamallet.save(output_dir + "/" + str(num))
    return coherence_ldamallet
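Because the function returns the coherence score, it lends itself to a simple sweep over candidate topic counts; a minimal sketch, assuming corpus, id2word and all_sentences are already built as in the earlier examples and the output directory exists:

# Hypothetical model-selection sweep; picks the topic count with the
# highest c_v coherence. All inputs are assumed to exist already.
scores = {num: get_topics(num, corpus, id2word, 'results', all_sentences)
          for num in [10, 20, 30, 50]}
best_num = max(scores, key=scores.get)
print('Best number of topics:', best_num, '(coherence %.4f)' % scores[best_num])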