Beispiel #1
0
def hierarchical_dirichlet_process(corpus, num_topics, id2word):
    """Train a Hierarchical Dirichlet Process (HDP) topic model.

    Advantage of HDP: it is fully unsupervised and can determine the number
    of topics it needs through posterior inference, so no topic count is
    handed to the model.

    Args:
        corpus: Corpus forwarded to ``HdpModel``.
        num_topics: Unused; kept so existing callers keep working.
        id2word: Id-to-word mapping forwarded to ``HdpModel``.

    Returns:
        The trained ``HdpModel`` instance.
    """
    # print() with a single argument behaves identically on Python 2 and 3;
    # the former print statement was a SyntaxError on Python 3.
    print('Hierarchical Dirichlet Process')
    hdp_model = HdpModel(corpus=corpus, id2word=id2word)
    # The result is discarded, exactly as before. One call is retained in case
    # callers rely on the topics having been computed once.
    # NOTE(review): confirm whether this call is needed at all.
    hdp_model.show_topics(formatted=False)
    return hdp_model
Beispiel #2
0
def train_hdp_model(corpus, dictionary, chunksize):
    """Fit an ``HdpModel`` and return it.

    Args:
        corpus: Corpus forwarded to ``HdpModel``.
        dictionary: Id-to-word mapping forwarded as ``id2word``.
        chunksize: Chunk size forwarded to ``HdpModel``.

    Returns:
        The trained model. The topic word lists gathered below are local
        only and are not returned.
    """
    print('HDP model')
    hdp = HdpModel(
        corpus=corpus,
        id2word=dictionary,
        chunksize=chunksize,
        random_state=config.SEED,
    )
    # Words of up to ten topics, with the second element of each pair dropped.
    shown = hdp.show_topics(num_topics=10, formatted=False)
    topic_words = [[term for term, _ in entries] for _, entries in shown]
    return hdp
    def topicsHDP(self, num_topics=-1, topn=20):
        """Train an HDP model on ``self.corpus`` and return its unformatted topics.

        ``num_topics`` and ``topn`` are forwarded to ``show_topics`` as the
        ``topics`` and ``topn`` keyword arguments.

        NOTE(review): those keyword names follow the signature
        ``show_topics(topics=20, topn=20, log=False, formatted=True)``;
        confirm that the installed gensim version still accepts them.
        """
        model = HdpModel(corpus=self.corpus, id2word=self.id2word)
        # formatted=False asks for structured topics instead of strings.
        unformatted = model.show_topics(topics=num_topics,
                                        topn=topn,
                                        formatted=False)
        return unformatted
Beispiel #4
0
    def topicsHDP(self, num_topics=-1, topn=20):
        """Build an HDP model from ``self.corpus`` / ``self.id2word`` and list its topics.

        The topics are requested with ``formatted=False``; ``num_topics`` is
        passed as the ``topics`` keyword and ``topn`` as ``topn``.

        NOTE(review): the ``topics``/``topn`` keywords follow the signature
        ``show_topics(topics=20, topn=20, log=False, formatted=True)``;
        verify them against the gensim version in use.
        """
        trained = HdpModel(corpus=self.corpus, id2word=self.id2word)
        options = {'topics': num_topics, 'topn': topn, 'formatted': False}
        return trained.show_topics(**options)
def comparison(texts):
    """Fit LSI, HDP and LDA topic models on ``texts`` and chart their coherence.

    Each model's topics are printed right after training. The words of every
    shown topic are then scored with ``CoherenceModel`` (``window_size=10``)
    and the three scores are drawn as a bar chart.

    Args:
        texts: Documents passed to ``Dictionary`` and ``CoherenceModel``
            (presumably lists of tokens; verify against the caller).
    """

    def words_per_topic(model):
        # Keep only the word of every (word, prob) pair, topic by topic.
        shown = model.show_topics(formatted=False)
        return [[word for word, _ in pairs] for _, pairs in shown]

    def coherence_of(topics):
        scorer = CoherenceModel(topics=topics, texts=texts,
                                dictionary=dictionary, window_size=10)
        return scorer.get_coherence()

    def evaluate_bar_graph(coherences, indices):
        # One bar per model, labelled with the matching entry of ``indices``.
        assert len(coherences) == len(indices)
        positions = np.arange(len(coherences))
        plt.bar(positions, coherences, width=0.2, tick_label=indices,
                align='center')
        plt.xlabel('Models')
        plt.ylabel('Coherence Value')
        plt.show()

    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(doc) for doc in texts]

    # Train and report the models in the same order as before.
    lsimodel = LsiModel(corpus=corpus, num_topics=2, id2word=dictionary)
    print('LSI Model output')
    print(lsimodel.show_topics())

    hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)
    print('hdp model output')
    print(hdpmodel.show_topics())

    ldamodel = LdaModel(corpus=corpus, num_topics=2, id2word=dictionary)
    print('LDA Model output')
    print(ldamodel.show_topics())

    # The prepared visualisation is not kept.
    pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)

    lsitopics = words_per_topic(lsimodel)
    hdptopics = words_per_topic(hdpmodel)
    ldatopics = words_per_topic(ldamodel)

    # LSI and HDP are limited to their first ten topics; LDA uses all shown.
    lsi_coherence = coherence_of(lsitopics[:10])
    hdp_coherence = coherence_of(hdptopics[:10])
    lda_coherence = coherence_of(ldatopics)

    evaluate_bar_graph([lsi_coherence, hdp_coherence, lda_coherence],
                       ['LSI', 'HDP', 'LDA'])
Beispiel #6
0
    def get_topics(self, corpus, vocabulary, num_words=10):
        """Train an HDP model and return its topics in unformatted form.

        Args:
            corpus: Corpus forwarded to ``HdpModel``.
            vocabulary: Id-to-word mapping forwarded as ``id2word``.
            num_words: Number of words requested per topic.

        Returns:
            The result of ``show_topics(formatted=False, num_words=num_words)``.
        """
        hdpmodel = HdpModel(corpus=corpus, id2word=vocabulary)
        # The docs say num_topics=-1 returns every topic, but in practice it
        # returns an empty list. The argument is therefore left unspecified
        # so the library default applies; previously -1 was still passed.
        return hdpmodel.show_topics(formatted=False, num_words=num_words)
Beispiel #7
0
    def runModels(self, number_of_topics, corpus, dictionary, start, end):
        """Train HDP and LDA models, report their topics and save an LDA visualisation.

        Args:
            number_of_topics: Topic count; accepted as an int or a numeric
                string (it is used both as a count and inside file names).
            corpus: Corpus forwarded to both models.
            dictionary: Id-to-word mapping forwarded to both models.
            start: String used in the visualisation file name.
            end: String used in the visualisation file name.

        Side effects:
            Saves the LDA model as ``lda<N>.model`` and writes
            ``LDA_Visualization<N>_<start>_<end>.html`` under
            ``<pn>/topic_model_results``. ``pn`` is not defined in this
            method and must be resolvable from an enclosing scope.
        """
        # Normalise once. The original concatenated the raw value into the
        # model file name, which raised TypeError whenever an int was passed.
        topic_count = int(number_of_topics)
        topic_label = str(number_of_topics)

        # HDP model
        hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)
        hdpmodel.print_topics(num_topics=topic_count, num_words=10)
        hdptopics = hdpmodel.show_topics(num_topics=topic_count)

        # output results
        self.printResults(number_of_topics, hdptopics, 'hdp', start, end)

        # LDA model
        ldamodel = LdaModel(corpus=corpus,
                            num_topics=topic_count,
                            id2word=dictionary,
                            random_state=100,
                            update_every=1,
                            chunksize=100,
                            passes=10,
                            alpha='auto',
                            per_word_topics=True)

        ldamodel.save('lda' + topic_label + '.model')
        ldatopics = ldamodel.show_topics(num_topics=topic_count)
        self.printResults(number_of_topics, ldatopics, 'lda', start, end)

        visualisation = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)

        location = os.path.join(pn, 'topic_model_results')

        # visualise outputs in html
        html_name = ('LDA_Visualization' + topic_label + "_" + start + "_" +
                     end + '.html')
        pyLDAvis.save_html(visualisation, os.path.join(location, html_name))
Beispiel #8
0
 def gensimTopicModelingAnalysis(self, n):
     """Train an HDP model on the cluster text files and print its topics.

     The ``*.txt`` files of the hard-coded ``Atleast2`` directory are sorted
     by the integer that follows ``Cluster`` in their path, the sorted list
     is written to ``ListofSortedClusters.txt``, and the files are read
     through ``n.readMultipleFileLineWise``.

     Args:
         n: Object providing ``readMultipleFileLineWise(files)``, which
             returns a pair whose first element is used as the texts.
     """
     cluster_prefix = '/Users/advaitbalaji/Downloads/IslandAnalysis/Atleast2/Cluster'

     def cluster_number(path):
         # Integer between the "Cluster" prefix and the first underscore.
         return int(path.split(cluster_prefix)[1].split('_')[0])

     found = glob.glob(
         "/Users/advaitbalaji/Downloads/IslandAnalysis/Atleast2/*.txt")
     files = sorted(found, key=cluster_number)

     with open("/Users/advaitbalaji/Desktop/ListofSortedClusters.txt",
               "w") as of:
         of.writelines(path + "\n" for path in files)

     texts, _clusters = n.readMultipleFileLineWise(files)
     dictionary = Dictionary(texts)
     corpus = [dictionary.doc2bow(tokens) for tokens in texts]
     hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)
     print(hdpmodel.show_topics())
Beispiel #9
0
    #Now we go through preprocessing for each article
    #for a in articles:
    #print(articles[0])

    processed_articles = []
    for a in articles:
        processed_articles.append(preprocess(a))

    dictionary = g.corpora.Dictionary(processed_articles)
    bow_corpus = [dictionary.doc2bow(doc) for doc in processed_articles]
    bow_doc_x = bow_corpus[1]

    #for i in range(len(bow_doc_x)):
    #    print("Word {} (\"{}\") appears {} time.".format(bow_doc_x[i][0], dictionary[bow_doc_x[i][0]], bow_doc_x[i][1]))
    """
    lda_model =  g.models.LdaMulticore(bow_corpus, 
                                   num_topics = 10, 
                                   id2word = dictionary,                                    
                                   passes = 10,
                                   workers = 2)

    for idx, topic in lda_model.print_topics(-1):
        print("Topic: {} \nWords: {}".format(idx, topic ))
        print("\n")
    """

    hdpmodel = HdpModel(corpus=bow_corpus, id2word=dictionary)
    for t in hdpmodel.show_topics():
        print('TOPIC: ')
        print(t)
Beispiel #10
0
# Script: train LSI, HDP and LDA models on `texts` and print their topics.
dictionary = Dictionary(texts)

# (word id, number of times word appears in document)
corpus = [dictionary.doc2bow(text) for text in texts]

# latent semantic indexing, a popular information retrieval method,
# which works by decomposing the original matrix of words to
# maintain key topics. Gensim's implementation uses an SVD.
lsi_model = LsiModel(corpus=corpus, num_topics=2, id2word=dictionary)
# Five topics are requested although the model is built with num_topics=2.
lsi_topics = lsi_model.show_topics(num_topics=5)
print(lsi_topics)

# hierarchical dirichlet process is an unsupervised topic model which
# determines the number of topics on its own
hdp_model = HdpModel(corpus=corpus, id2word=dictionary)
hdp_topics = hdp_model.show_topics()
print(hdp_topics)

# latent dirichlet allocation
lda_model = LdaModel(corpus=corpus, num_topics=2, id2word=dictionary)
lda_topics = lda_model.show_topics()
print(lda_topics)

# Reduce every topic to its list of words, dropping the value paired with
# each word (named `prob` here).
lsi_topics_clean = [[
    word for word, prob in topic
] for topic_id, topic in lsi_model.show_topics(formatted=False)]
print(lsi_topics_clean)
hdp_topics_clean = [[
    word for word, prob in topic
] for topic_id, topic in hdp_model.show_topics(formatted=False)]
print(hdp_topics_clean)
	
	for word in temp:
		docmap[word]=i

		
	i=i+1
	train_text.append(temp)

# Script: train an HDP model on `train_text` and store its topic words and
# the word -> document map through the `mycol3` / `mycol4` handles.
print("no of entries in train text is %d"%len(train_text))

dictionary = Dictionary(train_text)

corpus = [dictionary.doc2bow(text) for text in train_text]


hdpmodel1 = HdpModel(corpus=corpus, id2word=dictionary)


# Formatted output: each entry is unpacked below as (topic, word string).
x=hdpmodel1.show_topics(num_topics=30,num_words=200)


# Map str(topic) -> list of words. Everything except ASCII letters and
# spaces is stripped from the formatted topic string before splitting on
# whitespace.
twords={}
for topic,word in x:
	twords[str(topic)]=(re.sub('[^A-Za-z ]+', '', word)).split()


# NOTE(review): mycol3 / mycol4 are defined outside this snippet; presumably
# MongoDB collections — confirm.
mycol3.insert_one(twords)
mycol4.insert_one(docmap)


def build_hdp(corpus, id2word):
    """Train an HDP model and collect the words of its shown topics.

    Args:
        corpus: Corpus forwarded to ``HdpModel``.
        id2word: Id-to-word mapping forwarded to ``HdpModel``.

    Returns:
        A ``(model, topics)`` pair, where ``topics`` holds one list of words
        per topic returned by ``show_topics(formatted=False)``.
    """
    model = HdpModel(corpus=corpus, id2word=id2word, chunksize=2000)
    topic_words = []
    for _, weighted_words in model.show_topics(formatted=False):
        # Drop the value paired with each word.
        topic_words.append([word for word, _ in weighted_words])
    return model, topic_words
        article = []
        article_spell = []

# Script: train HDP, LSI and LDA models and write the HDP and LSI topics to
# ./tm_results.txt.
print('Generating corpus...')
# generate dictionary and corpus, once for the raw texts and once for the
# `texts_spell` variant
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
dictionary_spell = Dictionary(texts_spell)
corpus_spell = [dictionary_spell.doc2bow(text) for text in texts_spell]

# HDP Hierarchical Dirichlet Process - unsupervised method that determines number of topics itself
print('HDP Hierarchical Dirichlet Process')
hdp_model = HdpModel(corpus=corpus, id2word=dictionary)
# 'w' truncates the results file; the LSI section below appends to it.
with open('./tm_results.txt', 'w') as f:
    f.write('Without Spelling Correction\nHDP\n')
    # One line per topic: first element, tab, second element.
    for topic in hdp_model.show_topics(formatted=True):
        f.write('{}\t{}\n'.format(topic[0], topic[1]))

# The count of topics returned by show_topics() is reused as the topic
# count for LSI and LDA.
# NOTE(review): show_topics() is called with its default limit, so this is
# presumably capped by that default rather than by what HDP inferred — confirm.
num_topics = len(hdp_model.show_topics())

# LSI: Latent Semantic Indexing
print('LSI Latent Symantex Indexing')
lsi_model = LsiModel(corpus=corpus, num_topics=num_topics, id2word=dictionary)
with open('./tm_results.txt', 'a') as f:
    f.write('LSI\n')
    for topic in lsi_model.show_topics(formatted=True):
        f.write('{}\t{}\n'.format(topic[0], topic[1]))

# LDA Latent Dirichlet Allocation
print('LDA Latent Dirichlet Allocation')
lda_model = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary)
Beispiel #14
0
    if count % 100000 == 0:
        print(count, end=' ')

# Script: persist the texts, dictionary and corpus, train an HDP model and
# report the coherence of its first ten shown topics.
with open("texts.txt", "wb") as fp:  # Pickling
    pickle.dump(texts, fp)
# The message names the file that was actually written; it used to say
# "texts.csv" although the pickle goes to texts.txt.
print('texts.txt created')

# Kept from the original; `bigram` is not used in the visible part of the
# script.
bigram = gensim.models.Phrases(texts)

dictionary = Dictionary(texts)
dictionary.save("hdp_dictionary.dict")
print("Dictionary saved as hdp_dictionary.dict")
corpus = [dictionary.doc2bow(text) for text in texts]
MmCorpus.serialize('hdp_corpus.mm', corpus)
print('Corpus saved as hdp_corpus.mm')

hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)

hdpmodel.save('hdp_model_spacy.gensim')
print('hdp model created')

# Words of every shown topic, with the value paired with each word dropped.
hdptopics = [[word for word, prob in topic]
             for topicid, topic in hdpmodel.show_topics(formatted=False)]

# Only the first ten topics are scored.
hdp_coherence = CoherenceModel(topics=hdptopics[:10],
                               texts=texts,
                               dictionary=dictionary,
                               window_size=10).get_coherence()

print(f"The topic coherence is {hdp_coherence}")
Beispiel #15
0
                   id2word=dictionary,
                   update_every=5,
                   chunksize=10000,
                   passes=100)
    lda.save('/tmp/model.lda')
else:
    lda = LdaModel.load('/tmp/model.lda')
# Inspect the LDA topics; the result of this first call is discarded.
lda.show_topics()
# Unformatted topics, seven words each.
topics_matrix = lda.show_topics(formatted=False, num_words=7)

print(topics_matrix)
print(len(topics_matrix))

# Print the second element of every topic entry, item by item, as strings.
for topic in topics_matrix:
    i = topic[1]
    print([str(word) for word in i])
#
# topics_matrix = np.array(topics_matrix)
#
# topic_words = topics_matrix[:, :, 1]
# for i in topic_words:
#     print([str(word) for word in i])


# one more model for categorising documents: Hierarchical Dirichlet Process
print("HDP")
model = HdpModel(corpus, id2word=dictionary)
# NOTE(review): `topics=` is an older keyword name for show_topics; confirm
# that the installed gensim version accepts it.
model.show_topics(log=True, topics=5)

#  see https://radimrehurek.com/gensim/tut2.html
# Script: build a corpus from the lemmatised data and train LSI, HDP and LDA
# models on it. `id2word` is defined outside this snippet.
# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]
# View the first 30 entries of the first document
print(corpus[:1][0][:30])
#corpus.shape

#Making the LSI Model
lsimodel = LsiModel(corpus=corpus, num_topics=10, id2word=id2word)
lsimodel.show_topics()

#Making the HDP Model
hdpmodel = HdpModel(corpus=corpus, id2word=id2word)
hdpmodel.show_topics()

#Making the LDA Model
# corpus.shape
ldamodel = LdaModel(corpus=corpus, num_topics=10, id2word=id2word)
ldamodel.show_topics()

#Making the LDA Model with 3 topics
ldamodel_3_topics = LdaModel(corpus=corpus, num_topics=3, id2word=id2word)
ldamodel_3_topics.show_topics()

# Notebook usage: the bare show_topics() / prepare() results above and below
# are only visible when a notebook cell echoes them.
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(ldamodel, corpus, id2word)

# Words of every shown LSI topic, with the value paired with each word dropped.
lsitopics = [[word for word, prob in topic] for topicid, topic in lsimodel.show_topics(formatted=False)]
Beispiel #17
0
    texts, id2word, corpus, full_data = pickle.load(handle)
# texts, id2word, corpus = generate_corpus.generate_corpus()

# View the first document of the corpus
print(corpus[:1])

# Human readable format of corpus (term-frequency); the value is not stored,
# so it is only visible when echoed (e.g. in a notebook).
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

# Hard-coded, machine-specific location of the word2vec binary.
vectors_path = "/Users/ismglv/dev/lda2vec/GoogleNews-vectors-negative300.bin"
keyed_vectors = gensim.models.KeyedVectors.load_word2vec_format(vectors_path,
                                                                binary=True)

hdpmodel = HdpModel(corpus=corpus, id2word=id2word)

hdpmodel.show_topics()

# c_v coherence for the HDP model; the score itself is only computed by the
# commented-out get_coherence() call below.
coherence_model_hdp = CoherenceModel(model=hdpmodel,
                                     texts=texts,
                                     dictionary=id2word,
                                     coherence='c_v')

# coherence_hdp = coherence_model_hdp.get_coherence()
# print('\nCoherence Score: ', coherence_hdp)

# Same model scored with the word2vec-based measure, using the vectors
# loaded above.
coherence_model_cw2v = CoherenceModel(model=hdpmodel,
                                      texts=texts,
                                      dictionary=id2word,
                                      coherence='c_w2v',
                                      keyed_vectors=keyed_vectors)
# coherence_model_cnmpi = CoherenceModel(model=hdpmodel, texts=texts, dictionary=id2word, coherence='c_npmi')
Beispiel #18
0
def run():
    '''
    Run the topic models (HDP and LDA) and the coherence evaluation.

    Prompts for the upper bound on the number of topics, a comma-separated
    list of filter terms and the government material selector. It retrieves
    and preprocesses the matching content, then for every topic count from 2
    up to (but excluding) the entered number it trains an HDP and an LDA
    model, prints their results, saves the LDA model as ``lda<i>.model`` and
    writes an HTML visualisation into ``<project>/results``. Finally the
    coherence evaluation is run and printed.
    '''
    # raw_input only exists on Python 2; fall back to input on Python 3 so
    # the prompts work on both.
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    topicN = read_line("Number of topics:  ")
    fll = read_line("Filter Terms: ")
    gov = read_line("Government Material: ")
    flt = fll.split(",")

    ##filter based on government type
    ##filter based on sentences around mountain pine beetle
    ##do one filter at a time and then both together

    # Project root: everything before "src" in this file's absolute path.
    pn = os.path.abspath(__file__)
    pn = pn.split("src")[0]

    p = PatternMatcher()

    content = p.retrieveContent(flt, gov)
    results = integrateText(content)

    # The bigram model is trained on the text before it is preprocessed.
    bigram = gensim.models.Phrases(results)

    results = preProcsText(results)

    train_texts = process_texts(bigram, results)

    print('start')

    dictionary = Dictionary(train_texts)
    corpus = [dictionary.doc2bow(text) for text in train_texts]

    # Output directory is the same for every iteration.
    location = os.path.join(pn, 'results')

    # Last topic count that was evaluated. It stays at 2 when the loop below
    # does not run; the original read the loop variable after the loop, which
    # raised NameError in that case.
    iiT = 2

    #topics are tested based on a given topic number
    for i in range(2, int(topicN), 1):
        iiT = i

        print('run evaluation: ' + str(i))

        del listResults[:]

        #hdp model
        hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)
        hdptopics = hdpmodel.show_topics(num_topics=i)

        result_dict = addTotalTermResults(hdptopics)

        #add results to total kept in a list
        addToResults(result_dict)

        printResults(i, 'hdp')
        del listResults[:]

        #lda model
        ldamodel = LdaModel(corpus=corpus, num_topics=i, id2word=dictionary)
        ldamodel.save('lda' + str(i) + '.model')
        ldatopics = ldamodel.show_topics(num_topics=i)

        result_dict = addTotalTermResults(ldatopics)
        addToResults(result_dict)
        printResults(i, 'lda')

        del listResults[:]

        visualisation2 = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)

        #visualize outputs
        pyLDAvis.save_html(
            visualisation2,
            os.path.join(location, 'LDA_Visualization' + str(i) + '.html'))

    print('evaluate graph')

    #coherence model evaluation
    lmlist, c_v = evaluate_graph(dictionary=dictionary, corpus=corpus,
                                 texts=train_texts, limit=iiT)

    #coherence model results
    printEvaluation(lmlist, c_v, iiT)
Beispiel #19
0
    lexeme.is_stop = True

# Script: split the cleaned text into per-document lists of lemmas, apply
# bigram phrases, train an HDP model and print its topics.
doc = nlp(clean(text))

texts, article = [], []
for w in doc:
    # if it's not a stop word or punctuation mark, add it to our article!
    if w.text != '\n' and not w.is_stop and not w.is_punct and not w.like_num:
        # we add the lemmatized version of the word
        article.append(w.lemma_)
    # if it's a new line, it means we're onto our next document
    if w.text == '\n':
        texts.append(article)
        article = []
# NOTE(review): lemmas collected after the last newline are never appended
# to `texts`; confirm the input always ends with a newline.

#pprint(texts)

bigram = gensim.models.Phrases(texts)

# Apply the bigram model to every document.
texts = [bigram[line] for line in texts]

dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

#pprint(texts)

hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)

# print() with a single argument works on Python 2 and 3; the former print
# statement was a SyntaxError on Python 3.
print(hdpmodel.show_topics())
                           'coherence': coherence_lda},
                         index = topic_count_lda)

# Show the first ten rows of the LDA results table (notebook echo).
topics_lda.head(10)

# %%
# One line plot per column of the LDA results table.
lines = topics_lda.plot.line(subplots = True)

# %% [markdown]
# ##### Gensim also includes Hierarchical Dirichlet process (HDP). HDP is a powerful mixed-membership model for 
# the unsupervised analysis of grouped data. Unlike its finite counterpart, latent Dirichlet allocation, 
# the HDP topic model infers the number of topics from the data. Here we have used Online HDP, 
# which provides the speed of online variational Bayes with the modeling flexibility of the HDP.
#
# See https://radimrehurek.com/gensim/models/hdpmodel.html

# %%
# Create a HDP model - default for hdp is 150
# NOTE(review): "150" presumably refers to the model's default topic
# truncation level — confirm against the HdpModel documentation.
hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)

# %%
# Up to 20 topics as formatted strings; echoed by the notebook.
hdptopics = hdpmodel.show_topics(num_topics = 20, formatted=True)
hdptopics

# %%
# Topic matrix of the model; its shape is echoed by the notebook.
hdp_topics = hdpmodel.get_topics()
hdp_topics.shape

# %%
# Result is not stored; only visible as notebook output.
hdpmodel.hdp_to_lda()
Beispiel #21
0
        #content.download()
        #content.parse()
        #content.nlp()
        article = {}

        # stemmed_words = set(stem_tokens(content.cleaned_text, stemmer))
        article['keywords'] = text
        article['url'] = u
        article['title'] = content.title
        articles.append(article)
    except:
        continue

# add existing articles to new articles: the first element of every
# timeline entry is placed ahead of the freshly collected articles
if len(timelines) > 0:
    recent_timelines = [*map(lambda t: t[0], timelines)]
    articles = recent_timelines + articles

# One 'keywords' entry per article forms the training texts.
texts = [*map(lambda x: x['keywords'], articles)]

dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

hdp = HdpModel(corpus, dictionary)
#print(hdp.print_topics(num_topics=3, num_words=10))
# NOTE(review): num_topics=-1 is documented as "all topics"; verify that the
# installed gensim version does not return an empty list for it.
print(hdp.show_topics(num_topics=-1, num_words=10))

topics = hdp.print_topics(num_topics=-1)
# NOTE(review): `texst` looks like a leftover debugging assignment; confirm
# it is unused before removing it.
texst = 1
                                learning_offset=50.,
                                random_state=0)
# Fit the LDA estimator on the term-frequency matrix and print its top words.
lda.fit(tf)
print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

# Build a gensim dictionary and corpus from the whitespace-split paragraphs.
print("processing dict and corpus")
list_all_paragraphe_split = [sentence.split()
                             for sentence in list_all_paragraphe_filtered]
dictionary = Dictionary(list_all_paragraphe_split)
corpus = [dictionary.doc2bow(text) for text in list_all_paragraphe_split]

print("Topics in HDP model :")
hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)
hdptopics = hdpmodel.show_topics(formatted=False)
# NOTE(review): this counts the topics returned by show_topics() with its
# default limit, which presumably caps the number reported — confirm.
print("there are "+str(len(hdptopics))+" topics in the data")
# Plot the first element returned by hdp_to_lda().
alpha = hdpmodel.hdp_to_lda()[0]
plt.figure()
plt.plot(alpha)
plt.show()

# Fit the NMF model
print("Fitting the NMF model (generalized Kullback-Leibler divergence) with "
      "tf-idf features, n_samples=%d and n_features=%d..."
      % (len(list_all_paragraphe_filtered), n_features))
nmf = NMF(n_components=n_components, random_state=1,
          beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,
          l1_ratio=.5).fit(tfidf)
print("done with nmf ")
Beispiel #23
0
dictionary = Dictionary(train_texts)
corpus = [dictionary.doc2bow(text) for text in train_texts]

for i in range(10, 100, 10):
    lsimodel = LsiModel(corpus=corpus, num_topics=i, id2word=dictionary)

    lsitopics = lsimodel.show_topics(num_topics=i)

    result_dict = addTotalTermResults(lsitopics)
    addToResults(result_dict)
    printResults(i, 'lsi')

    del listResults[:]
    hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)

    hdpmodel.show_topics()

    hdptopics = hdpmodel.show_topics(num_topics=i)

    result_dict = addTotalTermResults(hdptopics)

    #add results to total kept in a list
    addToResults(result_dict)

    printResults(i, 'hdp')
    del listResults[:]

    ldamodel = LdaModel(corpus=corpus, num_topics=i, id2word=dictionary)

    ldatopics = ldamodel.show_topics(num_topics=i)