class LdaMalletHandler:
    """Convenience wrapper around gensim's Mallet LDA wrapper.

    Handles training, persistence under ``ldamodels/<model_name>/`` and
    topic/document queries through a lazily created ``DocumentRetriever``.
    """

    def __init__(self, mallet_path):
        # Path to the Mallet binary (e.g. ".../mallet-2.0.8/bin/mallet").
        self.mallet_path = mallet_path

    def _model_dir(self):
        # Single source of truth for the on-disk layout of model artifacts.
        return "ldamodels/" + self.model_name + "/"

    def _retriever(self):
        # Lazily build the DocumentRetriever from Mallet's doc-topics file;
        # previously this init block was duplicated in three methods.
        if not hasattr(self, 'doc_retriever'):
            self.doc_retriever = DocumentRetriever(self.model.fdoctopics())
        return self.doc_retriever

    def run_model(self, model_name, corpus, **kwargs):
        """Train a Mallet LDA model on `corpus` (a list of token lists)."""
        self.model_name = model_name
        self.dictionary = Dictionary(corpus)
        corpus_bow = [self.dictionary.doc2bow(text) for text in corpus]
        os.makedirs("ldamodels/" + model_name, exist_ok=True)
        self.model = LdaMallet(self.mallet_path, corpus_bow,
                               id2word=self.dictionary,
                               prefix="./" + self._model_dir(), **kwargs)

    def save_model(self):
        """Persist the trained model and its dictionary to disk."""
        self.model.save(self._model_dir() + "model.model")
        self.dictionary.save(self._model_dir() + "dict.dict")

    def load_model(self, model_name):
        """Load a previously saved model/dictionary pair named `model_name`."""
        self.model_name = model_name
        self.dictionary = corpora.Dictionary.load(self._model_dir() + "dict.dict")
        self.model = LdaMallet.load(self._model_dir() + "model.model")
        # The pickled model may carry a stale binary path; restore ours.
        self.model.mallet_path = self.mallet_path

    def doc_topics(self, doc_idx):
        """Return the topic distribution of training document `doc_idx`."""
        return self._retriever().doc_topics(doc_idx)

    def ext_doc_topics(self, ext_doc):
        """Topic distribution of an unseen tokenized doc, best topic first."""
        doc_bow = self.dictionary.doc2bow(ext_doc)
        doc_topics = self.model[doc_bow]
        doc_topics.sort(key=lambda x: x[1], reverse=True)
        return doc_topics

    def ext_doc_n_most_similar(self, ext_doc, n=5, metric='cosine'):
        """Return the `n` training documents most similar to `ext_doc`."""
        doc_bow = self.dictionary.doc2bow(ext_doc)
        # Keep only the weights; topic ids are implicit in list position.
        topics = [weight for _, weight in self.model[doc_bow]]
        return self._retriever().n_most_similar(topics, n=n, metric=metric)

    def n_most_representative(self, topic, n=3):
        """Return the `n` documents closest to a pure `topic` distribution."""
        topics = np.zeros(self.model.num_topics)
        topics[topic] = 1
        return self._retriever().n_most_similar(topics, n=n)

    def get_string_topics(self, num_topics=-1, num_words=10):
        """Return each topic as a space-joined string of its top words.

        `num_topics=-1` means all topics.
        """
        if num_topics == -1:
            num_topics = self.model.num_topics
        string_topics = []
        for topic in self.model.print_topics(num_topics=num_topics,
                                             num_words=num_words):
            # print_topics yields strings like '0.123*"word" + ...'; after
            # splitting on the quote character the words sit at odd indices.
            splitted = topic[1].split("\"")
            result = [splitted[2 * i + 1] for i in range(len(splitted) // 2)]
            string_topics.append(" ".join(result))
        return string_topics
# ---- Example 2 (scraped snippet separator) ----
    def mallet_lda(self, num):
        """Train a Mallet LDA model with `num` topics on self.data['token'].

        Returns the top 6 words for each of the `num` topics, as produced
        by gensim's ``print_topics``.
        """
        id2word = corpora.Dictionary(self.data['token'])
        texts = self.data['token']
        corpus = [id2word.doc2bow(text) for text in texts]
        # BUG FIX: Mallet reads the MALLET_HOME environment variable (all
        # caps). The original 'Mallet_HOME' key is a different variable on
        # case-sensitive platforms and was never seen by Mallet.
        os.environ['MALLET_HOME'] = 'C:\\Mallet'
        mallet_path = 'C:\\Mallet\\bin\\mallet'
        ldamallet = LdaMallet(mallet_path,
                              corpus=corpus,
                              num_topics=num,
                              id2word=id2word)
        return ldamallet.print_topics(num, num_words=6)
# ---- Example 3 (scraped snippet separator) ----
def lda(bow, df, vocab):
    """Fit a 5-topic Mallet LDA model and print its topics and per-document
    topic mixtures.

    `bow` is turned into a corpus file which is then reloaded from
    'corpus.npy'; `vocab` maps term ids to words. `df` is accepted for
    interface compatibility.
    """
    # Generate the corpus on disk, then load the persisted copy.
    text_to_corpus(bow)
    corpus = np.load('corpus.npy')

    mallet_binary = './mallet-2.0.8/bin/mallet'
    model = LdaMallet(mallet_binary,
                      corpus=corpus,
                      num_topics=5,
                      workers=4,
                      id2word=vocab)

    # Dump every topic's top-50 words, then each document's topic mixture.
    for topic in model.print_topics(num_topics=-1, num_words=50):
        print(topic)
    for mixture in model[corpus]:
        print(mixture)
# Drop tokens that appear in more than half of the documents.
dictionary.filter_extremes(no_above=0.5)

# Convert to document term matrix (corpus)
doc_term_mat_train = [dictionary.doc2bow(doc) for doc in docs_train]
doc_term_mat_test = [dictionary.doc2bow(doc) for doc in docs_test]

# Location of the Mallet binary (Windows layout).
path_to_mallet_binary = r'C:\mallet\bin\mallet'
if __name__ == "__main__":
    # Train a 10-topic Mallet LDA model; optimize_interval=50 lets Mallet
    # re-estimate hyperparameters every 50 iterations.
    model = LdaMallet(path_to_mallet_binary,
                      corpus=doc_term_mat_train,
                      alpha=5,
                      num_topics=10,
                      id2word=dictionary,
                      optimize_interval=50)

    topics = model.print_topics()
    for topic in topics:
        print(topic)

    # Compute Coherence Score for base model
    coherence_model_lda = CoherenceModel(model=model,
                                         corpus=doc_term_mat_train,
                                         texts=docs_train,
                                         dictionary=dictionary,
                                         coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    # NOTE(review): `ldamallet` must be the gensim wrapper *module* here,
    # not a model instance — confirm the import at the top of the file.
    gensim_model = ldamallet.malletmodel2ldamodel(model)
    # Visualize the topics
    vis_prepared = pyLDAvis.gensim.prepare(gensim_model, doc_term_mat_train,
                                           dictionary)
    pyLDAvis.save_html(vis_prepared, "mallet.html")
# ---- Example 5 (scraped snippet separator) ----
    # Create the vocabulary
    # NOTE(review): `files`, `doc_scanner`, `tokenize_file`, `args` and
    # `report` come from the enclosing (not visible) scope.
    for ii in files:
        doc_scanner.scan(tokenize_file(ii))

    # Initialize the documents
    docs = doc_scanner.docs
    dictionary = Dictionary(docs)
    corpus = [dictionary.doc2bow(doc) for doc in docs]

    # start = time.time()
    # gensim_lda = gensim.models.LdaModel(corpus, id2word=dictionary, num_topics=args.num_topics, iterations=args.num_iterations)
    # time_took = time.time() - start
    # report(gensim_lda.print_topics(num_topics=10, num_words=50), filename="gensim", limit=50)
    # print(("Total time it took: %0.5f seconds" % (time_took)))

    # Hard-coded location of the Mallet binary on the author's machine.
    mallet_file = "/home/jihwangk/Desktop/GitDir/Mallet/bin/mallet"
    # start = time.time()
    mallet_lda = LdaMallet(mallet_file,
                           corpus=corpus,
                           num_topics=args.num_topics,
                           id2word=dictionary,
                           iterations=args.num_iterations)
    # time_took = time.time() - start
    # Convert the Mallet wrapper into a native gensim LdaModel so the same
    # reporting path can be used.
    mallet_lda = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(
        mallet_lda, iterations=args.num_iterations)
    report(mallet_lda.print_topics(num_topics=10, num_words=50),
           filename="mallet",
           limit=50)
    # print(("Total time it took: %0.5f seconds" % (time_took)))
# ---- Example 6 (scraped snippet separator) ----

# %% topic model estimation
"""
I focus on two models:
    - 8 topics, ~ local optimum
    - 30 topic, ~ global optimum
"""

# model with 8 topics
# --+ estimate model
# NOTE(review): `mallet_path`, `corpus` and `dictionary` are defined
# earlier in the file (not visible here).
lda_8 = LdaMallet(
    mallet_path, corpus=corpus, id2word=dictionary, num_topics=8, random_seed=123
)
# --+ print topics (20 words per topic)
lda_8.print_topics(num_topics=8, num_words=20)
# --+ translate topic modeling outcome
# Rebinds lda_8 to a native gensim LdaModel so show_topics is available.
lda_8 = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(lda_8)

# --+ term-to-topic probabilities (10 words per topic)
top_terms_line = lda_8.show_topics(num_topics=8, num_words=10)
# ----+ rearrange data on top 10 terms per topic
# Parses show_topics strings of the form '0.123*"term" + ...' into rows of
# [topic id, rank of term within topic, weight, term].
top_terms_m = []
for i in top_terms_line:
    topic_num = i[0]
    prob_terms = i[1].split("+")
    for term_sort, term in enumerate(prob_terms):
        weight = float(term.split("*")[0])
        term = term.split("*")[1].strip('"| ')
        top_terms_m.append([topic_num, term_sort, weight, term])
df = pd.DataFrame(top_terms_m)
# ---- Example 7 (scraped snippet separator) ----
def build_lda_model(CIKs, num_topics, ngram_num):
    """Build a Mallet LDA topic model over the 14D-9 filings of `CIKs`.

    Reads every file under data/14d9/<CIK>/, keeps words longer than two
    characters, detects collocations up to quadgrams, keeps only tokens
    that are exactly `ngram_num`-grams, then trains a Mallet LDA model
    with `num_topics` topics.

    Returns a tuple ``(formatted_topics, dominant_records,
    representative_records)`` where `formatted_topics` is a list (one entry
    per topic) of ``{'weight', 'term'}`` dicts and the other two are the
    dict-record exports of the analytics DataFrames.
    """
    documents = []

    main_path = dirname(realpath(__file__)) + "/data/14d9"
    for CIK in CIKs:
        cik_dir = main_path + '/' + CIK
        files = [f for f in listdir(cik_dir) if isfile(join(cik_dir, f))]
        for file in files:
            try:
                with open(cik_dir + '/' + file,
                          "r",
                          encoding="latin-1") as f:
                    for row in f:
                        # One document per line; drop very short words.
                        document = [
                            word for word in row.split(" ") if len(word) > 2
                        ]
                        documents.append(document)
            except IOError as e:
                # Best-effort: skip unreadable files, keep going.
                print("Couldn't open file (%s)." % e)

    # Add bigrams, trigrams and quadgrams: three chained Phrases passes,
    # each detecting collocations in the previous pass's output (this
    # replaces three copy-pasted blocks).
    for _ in range(3):
        phraser = Phrases(documents)
        documents = [phraser[line] for line in documents]
    # Keep only tokens that are exactly ngram_num-grams: an n-gram joined
    # by Phrases contains n-1 underscores.
    documents = [
        [word for word in document if word.count('_') == (ngram_num - 1)]
        for document in documents
    ]

    # Dictionary
    dct = corpora.Dictionary(documents)

    # Corpus
    corpus = [dct.doc2bow(line) for line in documents]

    # Mallet needs MALLET_HOME plus the path to its launcher script.
    environ['MALLET_HOME'] = dirname(realpath(__file__)) + '/mallet-2.0.8/'
    mallet_path = dirname(realpath(__file__)) + "/mallet-2.0.8/bin/mallet"
    lda_mallet = LdaMallet(mallet_path,
                           corpus=corpus,
                           num_topics=num_topics,
                           id2word=dct,
                           iterations=ITERATIONS)

    # Show Topics
    print("LDA Model MALLET")
    for idx in range(num_topics):
        print("Topic #%s-" % idx, lda_mallet.print_topic(idx, 10))

    # Format topic and percentage for api export; topic strings look like
    # '0.015*"term" + 0.012*"other" + ...'.
    formatted_topics = []
    for _, topic_str in lda_mallet.print_topics():
        current_topic = []
        for percent_topic in topic_str.split(' + '):
            percent, term = percent_topic.split('*')
            current_topic.append({
                'weight': float(percent) * 1000,
                'term': term[1:-1]  # strip the surrounding quotes
            })
        formatted_topics.append(current_topic)

    # Create df for analytics over topics
    df_dominant_topic, df_representative_topic = create_topic_analytics(
        lda_mallet, corpus, documents)

    return formatted_topics, df_dominant_topic.to_dict(
        'records'), df_representative_topic.to_dict('records')
'''
I focus on two models:
    - 8 topics, ~ local optimum
    - 30 topic, ~ global optimum
'''

# model with 9 topics
# --+ estimate model
# NOTE(review): `mallet_path`, `corpus` and `dictionary` are defined
# earlier in the file (not visible here).
lda_9 = LdaMallet(mallet_path,
                  corpus=corpus,
                  id2word=dictionary,
                  num_topics=9,
                  random_seed=123)

# --+ print topics (20 words per topic)
lda_9.print_topics(num_topics=9, num_words=20)
# --+ translate topic modeling outcome
# Rebinds lda_9 to a native gensim LdaModel so show_topics is available.
lda_9 = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(lda_9)
# --+ term-to-topic probabilities (10 words per topic)
top_terms_line = lda_9.show_topics(num_topics=9, num_words=10)

# ----+ rearrange data on top 10 terms per topic
# Parses show_topics strings of the form '0.123*"term" + ...' into rows of
# [topic id, rank of term within topic, weight, term].
top_terms_m = []
for i in top_terms_line:
    topic_num = i[0]
    prob_terms = i[1].split('+')
    for term_sort, term in enumerate(prob_terms):
        weight = float(term.split('*')[0])
        term = term.split('*')[1].strip('"| ')
        top_terms_m.append([topic_num, term_sort, weight, term])
df = pd.DataFrame(top_terms_m)