Example #1
    def train(self):
        split_archives = [article.tokens for article in self.articles]

        # create dictionary and corpus
        dictionary = corpora.Dictionary(split_archives)
        dictionary.filter_extremes(no_above=self.words_no_above)
        corpus = [dictionary.doc2bow(article) for article in split_archives]
        logger.info('Created dictionary and corpus')

        # get eta to force topics
        eta = get_eta(self.num_topics, dictionary)

        # create lda model with gensim
        lda_progress = LDAProgress(self.passes)
        ldamodel = LdaMulticore(corpus,
                                num_topics=self.num_topics,
                                id2word=dictionary,
                                passes=self.passes,
                                per_word_topics=True,
                                iterations=self.iterations,
                                eta=eta,
                                workers=cpu_count())
        lda_progress.close()

        logger.info('Created Topics model')

        # print the topics (debug)
        logger.debug('Topics:')
        topics = ldamodel.print_topics(num_words=5)
        for topic in topics:
            logger.debug(topic)
        self.model = ldamodel
        self.dictionary = dictionary
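
The get_eta helper used above is project-specific and not shown here. Below is a minimal sketch of what such a topic-seeding prior could look like; the seed_words mapping and boost value are illustrative assumptions, and the only firm constraint is that gensim accepts a (num_topics, num_terms) array as eta.

import numpy as np

def get_eta(num_topics, dictionary, seed_words=None, boost=0.5):
    # Start from a small uniform prior over the whole vocabulary.
    eta = np.full((num_topics, len(dictionary)), 1.0 / len(dictionary))
    # Boost the prior of each seed word in its assigned topic, nudging
    # the model toward the "forced" topics mentioned in the comment above.
    for topic_id, words in (seed_words or {}).items():
        for word in words:
            if word in dictionary.token2id:
                eta[topic_id, dictionary.token2id[word]] += boost
    return eta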
def createlda(num_topics, filename):
    dumppick(filename)
    texts, texts_tf_idf, dictionary = loadpcik()
    # Topic classification with LSI
    """
    print("**************LSI*************")
    lsi = models.lsimodel.LsiModel(corpus=texts, id2word=dictionary, num_topics=20)    # initialize an LSI transformation
    texts_lsi = lsi[texts_tf_idf]                # transform into the LSI vector space
    print(lsi.print_topics(num_topics=20, num_words=10))
    """
    # Topic classification with LDA
    print("**************LDA*************")
    #ppl = []
    #for i in range(1,50,1):
    #texts = shuffle(texts)
    #texts_train = texts[:int(24012*(0.9))]
    #texts_vad = texts[int(24012*(0.9)):]
    lda = LdaMulticore(corpus=texts,
                       iterations=1000,
                       id2word=dictionary,
                       num_topics=num_topics,
                       passes=200,
                       per_word_topics=True)
    #texts_lda = lda[texts_tf_idf]
    out = open("./ldamd/{}tpc-tpc".format(num_topics),
               mode="w",
               encoding="utf8")
    print(lda.print_topics(num_topics=num_topics, num_words=10), file=out)
    lda.save("./ldamd/{}tpc+{}".format(num_topics, filename[9:18]))
    #ppl.append(np.exp2(-lda.log_perplexity(texts_vad))/i)
    return lda, texts, texts_tf_idf, dictionary
class LdaProcessor(object):
    def __init__(self, token_docs, **filter_extremes_args):
        """
        token_docs : a list of lists of word or n-gram or sentence tokens.
            Eg, [['the','crazy','cat'],['that','doggone','dog']]
        """
        self.token_docs = token_docs
        self.id2word = corpora.Dictionary(token_docs)
        if filter_extremes_args:
            print('filtering words with extreme frequencies')
            self.id2word.filter_extremes(**filter_extremes_args)
        # initialize the bow_corpus
        self.reset_bow_corpus(token_docs)

        print('Got %i total tokens (words)' % len(self.id2word))

    def reset_bow_corpus(self, documents):
        """set or reset the corpus with the given documents"""
        self.bow_corpus = [self.id2word.doc2bow(doc) for doc in documents]
        return None

    def train_lda(self, num_topics, **kwargs):
        print('training LDA...')
        self.lda = LdaMulticore(self.bow_corpus, id2word=self.id2word, num_topics=num_topics, **kwargs)
        return self

    def word_topics(self, num_words=10):
        return [topic[1] for topic in self.lda.print_topics(num_topics=self.lda.num_topics, num_words=num_words)]

    # utility functions
    def significant_topic_terms(self, topicid):
        raise NotImplementedError()
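
A minimal usage sketch for LdaProcessor, assuming gensim's corpora and LdaMulticore are imported as the class requires; the toy documents and parameters are illustrative only.

docs = [['the', 'crazy', 'cat'], ['that', 'doggone', 'dog'],
        ['the', 'lazy', 'cat'], ['a', 'crazy', 'dog']]
processor = LdaProcessor(docs, no_below=1, no_above=0.95)
processor.train_lda(num_topics=2, passes=5, workers=2)
print(processor.word_topics(num_words=3))  # top terms per topic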
Example #4
import re
from collections import Counter

import pandas as pd
from gensim.corpora import Dictionary
from gensim.models import LdaMulticore


def generate_tags(tokens: list) -> list:
    """Perform LDA topic modelling to acquire tags.

    Args:
        tokens (list): List of token lists, one per document.

    Returns:
        tags_list (list): List of appropriate tags for the
        given tokens.
    """
    id2word = Dictionary(tokens)
    corpus = [id2word.doc2bow(d) for d in tokens]
    model = LdaMulticore(
        corpus=corpus,
        id2word=id2word,
        random_state=42,
        num_topics=10,
        passes=2,
        workers=1
    )
    words = [re.findall(r'"([^"]*)"', t[1]) for t in model.print_topics()]
    wordcount = Counter(words[0] + words[1] + words[2] + words[3] + words[4])
    tags = pd.DataFrame.from_dict(
        wordcount, orient='index', columns=['number']
    )
    tags = tags.drop(tags[tags['number'] <= 1].index)
    tags = tags.sort_values(by=['number'], ascending=False).T
    tags_list = [word for word in tags.columns]
    return tags_list
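
A hypothetical call; with a corpus this small the resulting tag list may well be empty, since words counted only once across the top topics are dropped.

sample_docs = [['sleep', 'dream', 'night'],
               ['coffee', 'morning', 'sleep'],
               ['night', 'coffee', 'dream']] * 5
print(generate_tags(sample_docs))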
Example #5
    def print_terms(self, model: LdaMulticore):
        topics = []
        for topic in model.print_topics(num_topics=self.n_topics,
                                        num_words=10):
            topics.append([(s.split('*"')[1].split('"')[0],
                            float(s.split('*"')[0]))
                           for s in str(topic[1]).split('+ ')])
        pprint(topics)
Example #6
class LdaMaker:
    def __init__(self, corpora, num_topics, print_topics=True):
        self.num_topics = num_topics

        self.tokenizer = nltk.tokenize.TreebankWordTokenizer()
        self.stemmer = nltk.stem.snowball.RussianStemmer()

        corpora_tokenized = [
            self.tokenizer.tokenize(
                (self._keep_only_russian_chars(str(doc).lower())))
            for doc in corpora
        ]

        corpora_stemmed = []
        for doc in corpora_tokenized:
            stemmed_doc = [
                self.stemmer.stem(token) for token in doc
                if token not in ru_stopwords
            ]
            stemmed_doc = [
                token for token in stemmed_doc if token not in ru_stopwords
            ]
            corpora_stemmed.append(stemmed_doc)

        self.dictionary = gensim.corpora.Dictionary(corpora_stemmed)
        corpora_bow = [self.dictionary.doc2bow(doc) for doc in corpora_stemmed]
        # self.tfidf = gensim.models.TfidfModel(corpora_bow)
        # corpora_tfidf = self.tfidf[corpora_bow]

        self.lda = LdaMulticore(num_topics=self.num_topics,
                                corpus=corpora_bow,
                                id2word=self.dictionary)

        if print_topics:
            for s in self.lda.print_topics():
                print(s)

    def get(self, doc):
        doc = self.tokenizer.tokenize(
            self._keep_only_russian_chars(doc.lower()))
        doc = [
            self.stemmer.stem(token) for token in doc
            if token not in ru_stopwords
        ]
        doc = [token for token in doc if token not in ru_stopwords]
        doc = self.dictionary.doc2bow(doc)
        # doc = self.tfidf[doc]
        return self.lda[doc]

    @staticmethod
    def _keep_only_russian_chars(s):
        new_s = ''
        for c in s:
            if 'а' <= c <= 'я' or 'А' <= c <= 'Я':
                new_s += c
            else:
                new_s += ' '
        return new_s
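
Illustrative use of LdaMaker, assuming nltk, gensim, and a ru_stopwords list are available in the module as the class requires; the Russian strings are toy documents.

docs = ['Это пример документа о кошках',
        'Еще один пример текста о собаках',
        'Кошки и собаки живут дома']
maker = LdaMaker(docs, num_topics=2, print_topics=False)
print(maker.get('пример текста о кошках'))  # [(topic_id, probability), ...]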
def main():
    corpus = []
    dictionary = corpora.Dictionary()
    #tokenized_doc = pd.Series()
    print("start")
    idx = 1
    print("Topics(", NUM_TOPICS, "개)")
    print("Docs (", NUM_DOCS - idx, "개)")
    while (True):
        if idx > NUM_DOCS: break
        print("##", idx, "~", idx + DOC_SPLIT - 1, "docs")
        print("docs loading...")
        news_df = get_posts_df(get_coll(), idx, DOC_SPLIT)

        print("docs tokenizing...")
        tokenized_doc = news_df['text'].apply(lambda x: tkn_func(x, idx))

        print("make Dict...")
        dictionary.add_documents(tokenized_doc)
        print("Token to Corpus...")
        corpus += [dictionary.doc2bow(text) for text in tokenized_doc]
        idx += DOC_SPLIT
        get_time()
        print()
    ## single core
    # ldamodel = gensim.models.ldamodel.LdaModel(
    # 			corpus,
    # 			num_topics = NUM_TOPICS,
    # 			id2word = dictionary,
    # 			passes=20) # passes: number of training passes over the corpus
    ## multicore
    get_time()
    print("Model Learning...")
    ldamodel = LdaMulticore(corpus,
                            num_topics=NUM_TOPICS,
                            id2word=dictionary,
                            passes=20,
                            workers=4)
    topics = ldamodel.print_topics(num_words=5)  # limit words shown per topic
    # each topic and its words' contributions
    for topic in topics:
        print(topic)
    for i, topic_list in enumerate(ldamodel[corpus]):
        if i == 5:
            break
        print('Topic distribution for document', i, ':', topic_list)
    get_time()
    print("model saving...")
    save_model(ldamodel, dictionary)
    visual(ldamodel, corpus, dictionary)
    print("end")
def write_results(lda_model: lda.LdaMulticore, df_topic_doc_keywords,
                  topicsFile: str, topicToDocFile: str):
    # Format
    df_dominant_topic = df_topic_doc_keywords.reset_index()
    df_dominant_topic.columns = [
        'Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords',
        'Text'
    ]
    df_selected = df_dominant_topic[['Document_No', 'Dominant_Topic']]
    #print (df_selected.head(100))
    np.savetxt(topicToDocFile, df_selected.values, fmt='%s')

    with open(topicsFile, "w") as file:
        pprint(lda_model.print_topics(-1, 10), file)
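
write_results expects a dataframe indexed by document number whose four columns line up with the names assigned above. A hedged sketch of building that input follows; the function name and construction are illustrative, not from the original project.

import pandas as pd

def build_topic_doc_keywords(lda_model, corpus, texts):
    rows = []
    for doc_bow, text in zip(corpus, texts):
        # Dominant topic = highest-probability topic for this document.
        doc_topics = sorted(lda_model.get_document_topics(doc_bow),
                            key=lambda pair: pair[1], reverse=True)
        top_id, top_prob = doc_topics[0]
        keywords = ', '.join(word for word, _ in lda_model.show_topic(top_id))
        rows.append([top_id, round(top_prob, 4), keywords, text])
    return pd.DataFrame(rows, columns=['Dominant_Topic', 'Topic_Perc_Contrib',
                                       'Keywords', 'Text'])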
Example #9
def main():
    corpus = []
    dictionary = corpora.Dictionary()

    print("start")
    print("docs loading...")
    df = pd.read_csv("news_data.csv")
    idx = 0
    last = len(df)
    while True:
        if idx > last: break
        print("##", idx, "docs")
        news_df = df.loc[idx:idx, :]
        print("docs tokenizing...")
        tokenized_doc = news_df['text'].apply(lambda x: tkn_func(x, idx))
        print("make Dict...")
        dictionary.add_documents(tokenized_doc)
        print("Token to Corpus...")
        corpus += [dictionary.doc2bow(text) for text in tokenized_doc]
        idx += 1
        get_time()
        print()
    ## single core
    # ldamodel = gensim.models.ldamodel.LdaModel(
    # 			corpus,
    # 			num_topics = NUM_TOPICS,
    # 			id2word = dictionary,
    # 			passes=20) # passes: number of training passes over the corpus
    ## multicore
    get_time()
    print("Model Learning...")
    ldamodel = LdaMulticore(corpus,
                            num_topics=NUM_TOPICS,
                            id2word=dictionary,
                            passes=20,
                            workers=4)
    topics = ldamodel.print_topics(num_words=5)  # limit words shown per topic
    # each topic and its words' contributions
    for topic in topics:
        print(topic)
    for i, topic_list in enumerate(ldamodel[corpus]):
        if i == 5:
            break
        print('Topic distribution for document', i, ':', topic_list)
    get_time()
    print("model saving...")
    save_model(ldamodel, dictionary)
    visual(ldamodel, corpus, dictionary)
    print("end")
Example #10
def test_lda_model():
    dictionary = Dictionary(TOKEN_SETS)
    bags_of_words = [dictionary.doc2bow(tokens) for tokens in TOKEN_SETS]
    lda = LdaMulticore(corpus=bags_of_words,
                       id2word=dictionary,
                       random_state=723812,
                       passes=10,
                       workers=4)
    response = lda.print_topics()
    assert isinstance(response, list)
    assert isinstance(response[0], tuple)
    assert isinstance(response[0][0], np.int64)
    assert isinstance(response[0][1], str)
    topic_strings = [topic_str for topic_str in response[0][1].split(" + ")]
    assert topic_strings[0] == '0.067*"sleep"'
def learn(corpus):
    dictionary = Dictionary.load('lda.dict')
    lda = LdaMulticore(corpus=corpus, id2word=dictionary, num_topics=NUM_TOPICS, chunksize=10000, passes=5)
    for line in lda.print_topics(NUM_TOPICS):
        print(line)
    lda.save('lda.gensim')
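
learn() assumes a dictionary was saved to lda.dict beforehand and that NUM_TOPICS is defined at module level; a minimal setup sketch with illustrative toy documents:

from gensim.corpora import Dictionary

NUM_TOPICS = 2
docs = [['sleep', 'dream', 'night'], ['coffee', 'morning', 'sleep']]
dictionary = Dictionary(docs)
dictionary.save('lda.dict')
learn([dictionary.doc2bow(doc) for doc in docs])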
# topic_number, number_of_articles, top_words
#==============================================================================
def get_topic(n):
    doc_lda = model[doc_list[n]]
    current_prob = 0
    for var in doc_lda:
        if var[1] > current_prob:
            current_prob = var[1]
            topic_num = var[0]
    return topic_num, re.sub(r'[+.0123456789*]', '', str(topic[topic_num]))


doc_list = []
for var in matutils.Sparse2Corpus(X, documents_columns=False):
    doc_list.append(var)

topic = model.print_topics(num_topics=topic_number, num_words=50)

# store topic with probability
with open(folder_name + 'topic_with_prob_' + str(topic_number) + '_topics.txt',
          'w') as new:
    for i in range(topic_number):
        new.write('{}\t{}\n'.format(str(i), topic[i]))

fin_sum = []
for i in range(len(doc_list)):
    fin_sum.append(get_topic(i)[0])
topic_count = co.Counter(fin_sum)

#path = '/Users/royyang/Desktop/trending_project/re_categorization_ehow/top_words_28topics.txt'
path = folder_name + 'top_words_for_' + str(topic_number) + '_topics.txt'
Example #14
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]

    # add tokens to list
    other_texts.append(stemmed_tokens)

other_corpus = [dictionary.doc2bow(text) for text in other_texts]

# unseen_doc = other_corpus[2]

# vector = ldamodel[unseen_doc]

# print(vector)

# generate LDA model-------------------------------------------------------------------------

my_loop_num_topics = [2, 5, 8, 10, 15, 20, 25, 30, 35, 40, 45, 50, 100]

for i in my_loop_num_topics:
    my_num_topics = i
    print(my_num_topics)
    # ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=my_num_topics, id2word = dictionary, passes=20)
    myldamodel = LdaMulticore(corpus,
                              num_topics=my_num_topics,
                              id2word=dictionary,
                              workers=3,
                              alpha=1e-5,
                              eta=5e-1)
    print(myldamodel.print_topics(num_topics=my_num_topics, num_words=5))
    print(myldamodel.log_perplexity(corpus))
    print(myldamodel.log_perplexity(other_corpus))
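
Note that log_perplexity returns a per-word log-likelihood bound (base 2), not a perplexity, which is why the commented-out ppl line in createlda above exponentiates it. A small helper sketch of that conversion, assuming numpy:

import numpy as np

def perplexity(model, held_out_corpus):
    # Conventional perplexity from gensim's base-2 per-word bound.
    return np.exp2(-model.log_perplexity(held_out_corpus))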
Example #15
    #

    dictionary = Dictionary(token_stream(NOVELS_DIRPATH))
    dictionary.filter_extremes(no_below=10, no_above=0.66) # excludes terms like "the", "to", "and", "of", "i", etc.
    print("-------------")
    print("TOKENS", len(dictionary.token2id), list(dictionary.token2id.items())[0:4], "...")

    bags_of_words = [dictionary.doc2bow(tokens) for tokens in token_stream(NOVELS_DIRPATH)]
    print("-------------")
    print("BAGS OF WORDS (CORPUS)", len(bags_of_words), bags_of_words[0])

    lda = LdaMulticore(corpus=bags_of_words, id2word=dictionary, random_state=723812, num_topics=15, passes=10, workers=4)
    print("-------------")
    print("LDA MODEL", type(lda))

    results = lda.print_topics()
    print("-------------")
    print("TOPICS (RAW RESULTS)...")
    print(results)

    parsed_topics = parse_topics(lda)
    print("-------------")
    print("TOPICS (PARSED RESULTS)...")
    pprint(parsed_topics)

    # h/t: https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/#11createthedictionaryandcorpusneededfortopicmodeling
    topics = lda[bags_of_words]
    print(topics[0]) #> [(4, 0.3149784), (7, 0.47801575), (13, 0.20485382)]

    # a measure of how good the model is. lower the better.
    print("Perplexity:", lda.log_perplexity(bags_of_words)) #> -7.74115184561741
Example #16
dictionary = corpora.Dictionary(texts)

# convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(text) for text in texts]

# generate LDA model
my_num_topics = 30
# ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=my_num_topics, id2word = dictionary, passes=20)
ldamodel = LdaMulticore(corpus,
                        num_topics=my_num_topics,
                        id2word=dictionary,
                        workers=3,
                        alpha=1e-5,
                        eta=5e-1)

print(ldamodel.print_topics(num_topics=my_num_topics, num_words=5))
print(corpus[0])
print(corpus[1])
print(corpus[2])
print(ldamodel[corpus[0]])
print(ldamodel[corpus[1]])
print(ldamodel[corpus[2]])

print(ldamodel.print_topics(20))

#----------------------------------------------------------------------
new_texts_set = [
    'comedy collection comedy favorites', 'alternative punk blue',
    'human game computer house'
]
Example #17
    print(type(dictionary), type(corpus))

    #path where dtm file is installed
    dtm_path = "/home/ankit081190/NLP/dtm/dtm/dtm"

    #model = DtmModel(dtm_path, corpus, time_seq, num_topics=1,
    #                 id2word=corpus.dictionary, initialize_lda=True)

    model = LdaMulticore(corpus, num_topics=10, id2word=dictionary)

    model.save("DTModelMultiCore_" + files + ".model")
    #Gives top 25 topics

    tp = model.show_topics(num_topics=25, log=False, formatted=True)
    print(model.print_topics(num_topics=25))
    data = pyLDAvis.gensim.prepare(model, corpus, dictionary)
    pyLDAvis.save_html(data, 'index_lda_' + files + '.html')

    cnt = Counter(tp)
    with codecs.open("topicsMultiLDA" + files + ".txt", "w", "utf-8") as f:
        for i, j in cnt:
            print(i, j)
            f.write("\nFor Topic Number " + str(i) + ":\n" +
                    str(j) + "\n")

    #for i, j in cnt:
    #    print("\nFor topic number: ", i, "\n")
    #    print(j)
    #for i in range(0, model.num_topics-1):
start_time = time.time()
model = LdaMulticore(
    matutils.Sparse2Corpus(X, documents_columns=False),
    num_topics=7,
    passes=10,
    id2word=dict([(i, s) for i, s in enumerate(vocab)]),
    workers=7,
)
print("--- %s seconds ---" % (time.time() - start_time))

# Get all topics from training
doc_list = []
for var in matutils.Sparse2Corpus(X, documents_columns=False):
    doc_list.append(var)

topic = model.print_topics(num_topics=7, num_words=10)

fin_sum = []
for i in range(len(doc_list)):
    fin_sum.append(get_topic(i)[0])
topic_count = co.Counter(fin_sum)

for i, var in enumerate(topic):
    print([i, re.sub(r'[+.0123456789*]', '', str(var)), topic_count[i]])

# [topic, topic_words, doc_title]
for i in range(100):
    print([get_topic(i), titles[i]])

#help(model)
Example #20
def start(num_topics, kind):
    data = loader.load_data(kind)
    df = pd.DataFrame(data)
    cleaner.clean(df)

    nlps = {
        'it': spacy.load('it_core_news_lg'),
        'en': spacy.load('en_core_web_lg'),
        'fr': spacy.load('fr'),
        'de': spacy.load('de')
    }

    tokenizers = {
        'it': Tokenizer(nlps['it'].vocab),
        'en': Tokenizer(nlps['en'].vocab),
        'fr': Tokenizer(nlps['fr'].vocab),
        'de': Tokenizer(nlps['de'].vocab)
    }

    # Customize stop words by adding to the default list
    stop_words = []
    stop_words += nlps['it'].Defaults.stop_words
    stop_words += nlps['en'].Defaults.stop_words
    stop_words += nlps['fr'].Defaults.stop_words
    stop_words += nlps['de'].Defaults.stop_words
    stop_words += s.ALL_STOPWORDS
    stop_words = set(stop_words)

    # ALL_STOP_WORDS = spacy + gensim + wordcloud
    ALL_STOP_WORDS = stop_words.union(SW).union(stopwords)

    cleaner.remove_stopwords(df, tokenizers, ALL_STOP_WORDS)
    cleaner.lemmas(df, nlps)

    tok.tokenize_text(df)

    # Create a id2word dictionary
    id2word = Dictionary(df['lemma_tokens'])
    print(len(id2word))

    # Filtering Extremes
    id2word.filter_extremes(no_below=2, no_above=.99)
    print(len(id2word))

    # Creating a corpus object
    corpus = [id2word.doc2bow(d) for d in df['lemma_tokens']]

    # Instantiating a Base LDA model
    base_model = LdaMulticore(corpus=corpus,
                              num_topics=num_topics,
                              id2word=id2word,
                              workers=12,
                              passes=5)

    # Filtering for words
    words = [re.findall(r'"([^"]*)"', t[1]) for t in base_model.print_topics()]

    # Create Topics
    topics = [' '.join(t[0:10]) for t in words]

    # Getting the topics
    for id, t in enumerate(topics):
        print(f"------ Topic {id} ------")
        print(t, end="\n\n")

    # Compute Perplexity
    # a measure of how good the model is. lower the better
    base_perplexity = base_model.log_perplexity(corpus)
    print('\nPerplexity: ', base_perplexity)

    # Compute Coherence Score
    coherence_model = CoherenceModel(model=base_model,
                                     texts=df['lemma_tokens'],
                                     dictionary=id2word,
                                     coherence='c_v')
    coherence_lda_model_base = coherence_model.get_coherence()
    print('\nCoherence Score: ', coherence_lda_model_base)

    lda_display = pyLDAvis.gensim.prepare(base_model, corpus, id2word)
    d = pyLDAvis.display(lda_display)

    today = date.today()
    directory_path = f"/home/marco/Scrivania/tirocinio-unicredit/lda-html/{kind}/{today}/"
    if not os.path.exists(directory_path):
        os.makedirs(directory_path)
    f = open(
        f"/home/marco/Scrivania/tirocinio-unicredit/lda-html/{kind}/{today}/{num_topics}.html",
        'w')
    f.write(d.data)
    f.close()

    vectorizer = CountVectorizer()
    data_vectorized = vectorizer.fit_transform(df['lemmas_back_to_text'])

    # Define Search Param
    search_params = {
        'n_components': [10, 15, 20, 25, 30],
        'learning_decay': [.5, .7, .9]
    }

    # Init the Model
    lda = LatentDirichletAllocation()

    # Init Grid Search Class
    model = GridSearchCV(lda, param_grid=search_params)

    # Do the Grid Search
    model.fit(data_vectorized)

    # Best Model
    best_lda_model = model.best_estimator_

    # Model Parameters
    print("Best Model's Params: ", model.best_params_)

    # Log Likelihood Score
    print("Best Log Likelihood Score: ", model.best_score_)

    # Perplexity
    print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))
class LDA():

    corpus = None
    model = None
    dictionary = None
    util = None
    loaded = False
    topicLabelling = defaultdict(int)

    def __init__(self, utilObj=None, logfilename=None):
        if utilObj is not None:
            self.util = utilObj
        elif logfilename is not None:
            self.util = Utilities.Utility()
            self.util.setupLogFileLoc(logfilename)

        self.util.startTimeTrack()

    def labelTopics(self, modelFilename):

        if (os.path.exists(modelFilename + '.label')):
            f = open(modelFilename + '.label', "rb")
            self.topicLabelling = pickle.load(f)
            f.close()
        else:
            #Label file not available, performing manual labelling. (One time operation)
            topics = self.model.show_topics(num_topics=100, num_words=20)
            print(
                'You will be shown a series of words and asked to label the topic in the form of an integer\n'
            )
            for topic in topics:
                print('The words affiliated with this topic are as follows\n',
                      topic[1])
                print(
                    '\033[92m' +
                    'Please label as one of these \n(0) EDUCATION\n(1) SKILLS\n(2) PERSONAL DETAILS\n(3) WORK EXPERIENCE'
                    + '\033[0m')
                mappedTopicInt = input(
                    'Please enter a new integer for this topic: ')
                self.topicLabelling[topic[0]] = mappedTopicInt
            f = open(modelFilename + '.label', "wb")
            pickle.dump(self.topicLabelling, f)
            f.close()

    def buildCorpus(self, folderListOfCorpus=None, maxdocs=-1):
        """
        For each folder
            for each cvd2v in in folder
                Get tokens from Utility tokenise and then form into a string
                Append string into a list (This forms a document)
        """
        self.util.logDebug('LDA', 'Building and fitting corpus ')
        documentList = []
        maxDocPerFolder = int(maxdocs / len(folderListOfCorpus.split(',')))
        docCounter = 0
        for folder in folderListOfCorpus.split(','):
            self.util.logDebug('LDA', 'Processing ' + folder)
            for filename in sorted(glob.iglob(folder + '/*.cvd2v')):
                if (docCounter <= maxDocPerFolder):
                    fileContent = self.util.tokensToStr(
                        self.util.tokenize(
                            self.util.readFileContent(filename=filename),
                            removeStopwords=True,
                            toLowercase=True,
                            replaceSlash=True,
                            flatEmail=True,
                            flatMonth=True,
                            flatNumber=True,
                            lemmatize=True), ' ')
                    documentList.append(fileContent)
                    docCounter = docCounter + 1
                else:
                    docCounter = 0
                    break

        self.util.logDebug(
            'LDA',
            str(len(documentList)) + ' documents loaded in ' +
            self.util.stopTimeTrack())
        texts = [document.lower().split() for document in documentList]
        self.util.logDebug('LDA', 'Number of documents: ' + str(len(texts)))
        self.util.logDebug('LDA', 'Text example: ' + str(texts[0]))
        self.dictionary = Dictionary(texts)

        self.corpus = [self.dictionary.doc2bow(text) for text in texts]
        self.util.logDebug('LDA',
                           'Corpus built in ' + self.util.stopTimeTrack())

    def trainModel(self, noOfTopics=4, dstFilename=None):
        workers = 30
        iterations = 400
        passes = 20

        self.util.logDebug('LDA', 'Training model...')
        self.model = LdaMulticore(self.corpus,
                                  workers=workers,
                                  num_topics=noOfTopics,
                                  id2word=self.dictionary,
                                  eval_every=None,
                                  iterations=iterations,
                                  passes=passes)
        self.util.logDebug('LDA',
                           'Model trained in ' + self.util.stopTimeTrack())
        print(self.model.print_topics())
        self.saveModel(dstFilename)
        self.loaded = True

    def saveModel(self, filename):
        self.util.logDebug('LDA', 'Saving model to ' + filename)
        self.model.save(filename)
        self.dictionary.save(filename + '.dict')
        MmCorpus.serialize(filename + '.corpus', self.corpus)
        self.util.logDebug('LDA', 'Saved in ' + self.util.stopTimeTrack())

    def loadModel(self, filename):
        self.util.logDebug('LDA', 'Loading model from ' + filename)
        self.model = LdaMulticore.load(fname=filename)
        self.dictionary = Dictionary.load(fname=filename + '.dict')
        self.corpus = MmCorpus(filename + '.corpus')
        print(self.dictionary)
        print(self.model.print_topic(0, topn=5))
        print(self.model.print_topic(1, topn=5))
        print(self.model.print_topic(2, topn=5))
        print(self.model.print_topic(3, topn=5))
        self.loaded = True
        self.util.logDebug('LDA',
                           'Model loaded in ' + self.util.stopTimeTrack())
        self.labelTopics(filename)

    def getTopTopic(self, inferenceOutput):
        thisDict = defaultdict(int)
        probList = []
        for topic, prob in inferenceOutput:
            thisDict[str(prob)] = topic
            probList.append(prob)
        largestProb = max(probList)
        mostLikelyTopic = thisDict[str(largestProb)]
        return mostLikelyTopic

    def infer_topic_proba(self, string):
        import numpy as np
        prediction = [0.0, 0.0, 0.0, 0.0]
        if (self.loaded):
            bow = self.dictionary.doc2bow(self.util.tokenize(string))
            results = self.model.get_document_topics(bow)
            for result in results:
                prediction[result[0]] = result[1]
        else:
            self.util.logError('LDA', 'Model is not loaded, cannot infer')
        prediction = np.array(prediction)
        return prediction

    def infer_topic(self, string):
        results = None
        if (self.loaded):
            bow = self.dictionary.doc2bow(self.util.tokenize(string))
            results = self.model.get_document_topics(bow)
        else:
            self.util.logError('LDA', 'Model is not loaded, cannot infer')
        results = self.getTopTopic(results)
        return results

    def visualizeLDA(self, filename):

        dictionary = Dictionary.load(filename + '.dict')
        corpus = MmCorpus(filename + '.corpus')
        lda = LdaMulticore.load(filename)
        self.util.logDebug('LDA', 'Preparing HTML ')
        ldavis = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
        self.util.logDebug('LDA',
                           'HTML prepared in ' + self.util.stopTimeTrack())
        pyLDAvis.save_html(ldavis, filename + '.html')
        self.util.logDebug('LDA', 'HTML saved in ' + self.util.stopTimeTrack())


#
# lda = LDA(logfilename='/home/kah1/test.log')
# lda.loadModel('/u01/bigdata/02d_d2vModel1/CvLda4TopicModel.model')
# lda.labelTopics()
Example #22
#==============================================================================
def get_topic(n):
    doc_lda = model[doc_list[n]]
    current_prob = 0
    for var in doc_lda:
        if var[1] > current_prob:
            current_prob = var[1]
            topic_num = var[0]
    return topic_num, re.sub(r'[+.0123456789*]', '', str(topic[topic_num]))


doc_list = []
for var in matutils.Sparse2Corpus(X, documents_columns=False):
    doc_list.append(var)

topic = model.print_topics(num_topics=9, num_words=50)

# store topic with probability
with open(
        '/Users/royyang/Desktop/trending_project/re_categorization_ls/topic_with_prob.txt',
        'w') as new:
    for i in range(9):
        new.write('{}\t{}\n'.format(str(i), topic[i]))

fin_sum = []
for i in range(len(doc_list)):
    fin_sum.append(get_topic(i)[0])
topic_count = co.Counter(fin_sum)

#path = '/Users/royyang/Desktop/trending_project/re_categorization_ehow/top_words_28topics.txt'
path = '/Users/royyang/Desktop/trending_project/re_categorization_ls/top_words_9topics.txt'
        #     #     if x in idmap:
        #     #         return x
        #     #     else:
        #     #         return -1
        #     for idx, (doc_id, document) in enumerate(corpus.documents.items()):
        #         if idx % 1000 == 0:
        #             logger.info("remapping: %d documents finished" % idx)
        #         # corpus.documents[doc_id] = [check_and_replace(oldid) for oldid in document]
        #         corpus.documents[doc_id] = [idmap[oldid] for oldid in document if oldid in idmap]

        corpus.save_tbmm_corpus(args.corpus_filename)

        if args.train_lda:
            # from gensim.models.ldamodel import LdaModel
            from gensim.models.ldamulticore import LdaMulticore

            # setting metadata to False is required because of the format the
            # log-perplexity code expects get_texts output to be in.
            corpus.metadata = False
            lda = LdaMulticore(workers=19,
                               corpus=corpus,
                               id2word=corpus.dictionary,
                               num_topics=20,
                               eval_every=100,
                               chunksize=100,
                               passes=5)

            lda.print_topics(20)

            lda.save(args.corpus_filename + ".tbmm_lda.model")
Example #24
def main():
    df = read_forum_json('json/levergunscommunity.com.json')
    corpus, dictionary = generate_corpus(df)
    lda = LdaMulticore(corpus, num_topics=20, id2word=dictionary, workers=3)
    lda.print_topics(num_topics=20, num_words=20)
Example #25
#     workers=3,
#     eval_every=eval_every)

# Build LDA model
# lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
#                                            id2word=id2word,
#                                            num_topics=35, 
#                                            random_state=100,
#                                            update_every=1,
#                                            chunksize=100,
#                                            passes=10,
#                                            alpha='auto',
#                                            per_word_topics=True)


pprint(model.print_topics())
doc_lda = model[corpus]
print(doc_lda[4])
print(model.get_document_topics(corpus)[1])

# Compute Perplexity
print('\nPerplexity: ', model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(model, corpus, dictionary)
vis
Example #26
    tokens.append(doc_tokens)

# Makes tokens column
df['tokens'] = tokens

id2word = Dictionary(df['tokens'])

id2word.filter_extremes(no_below=2, no_above=.99)

corpus = [id2word.doc2bow(d) for d in df['tokens']]


# Instantiating a Base LDA model 
base_model = LdaMulticore(corpus=corpus, num_topics=10, id2word=id2word, workers=12, passes=5)

words = [re.findall(r'"([^"]*)"',t[1]) for t in base_model.print_topics()]

topics = [' '.join(t[0:10]) for t in words]

# Getting the topics
for id, t in enumerate(topics): 
    print(f"------ Topic {id} ------")
    print(t, end="\n\n")

p=pyLDAvis.gensim.prepare(base_model, corpus, id2word)
pyLDAvis.save_html(p, 'biden_lda.html')

base_model.save('biden_model.gensim')

biden_df = df
sentence_length = [len(tokens) for tokens in clean]
#len([i for i in sentence_length if i > 100])

#LDA Model - Select key concerns
dictionary = corpora.Dictionary(clean)
dictionary.filter_extremes(no_below=100, no_above=0.5, keep_n=10000)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in clean]
tfidf = models.TfidfModel(doc_term_matrix)
corpus_tfidf = tfidf[doc_term_matrix]

ldamodel = LdaMulticore(corpus_tfidf,
                        num_topics=7,
                        id2word=dictionary,
                        passes=100)
print(*ldamodel.print_topics(num_topics=7, num_words=20), sep='\n')
lda_display = pyLDAvis.gensim.prepare(ldamodel,
                                      doc_term_matrix,
                                      dictionary,
                                      sort_topics=False)

# keywords dictionary
# key_lda = {'bonus',
#  'business',
#  'career',
#  'change',
#  'collaboration',
#  'communication',
#  'consumer',
#  'cost',
#  'customer',