def topics_by_lda(self,
                      tokenized_corpus_path,
                      num_topics=20,
                      num_words=10,
                      max_lines=10000,
                      split="\s+",
                      max_df=100):
        """
        读入经过分词的文件并且对其进行 LDA 训练

        Arguments:
        tokenized_corpus_path -> string -- 经过分词的语料集地址
        num_topics -> integer -- 主题数目
        num_words -> integer -- 主题词数目
        max_lines -> integer -- 每次读入的最大行数
        split -> string -- 文档的词之间的分隔符
        max_df -> integer -- 避免常用词,过滤超过该阈值的词
        """

        # Holds the entire corpus
        corpus = []

        with open(tokenized_corpus_path, 'r',
                  encoding='utf-8') as tokenized_corpus:

            flag = 0

            for document in tokenized_corpus:

                # Stop once the maximum number of lines has been read
                if flag >= max_lines:
                    break

                # Append the tokenized document to the corpus
                corpus.append(re.split(split, document))

                flag += 1

        # Build the bag-of-words (BOW) representation of the corpus
        (vocab, DTM) = self.corpus2dtm(corpus, max_df=max_df)

        # Train the LDA model
        lda = LdaMulticore(matutils.Sparse2Corpus(DTM,
                                                  documents_columns=False),
                           num_topics=num_topics,
                           id2word=dict(enumerate(vocab)),
                           workers=4)

        # Print and return the topic data
        topics = lda.show_topics(num_topics=num_topics,
                                 num_words=num_words,
                                 formatted=False,
                                 log=False)

        for ti, topic in enumerate(topics):
            print("Topic", ti, ":", " ".join(word[0] for word in topic[1]))

        return topics
Example #2
# Assumed imports for this snippet; `Pipe` is the surrounding project's pipeline base class.
import numpy as np
from gensim.matutils import Scipy2Corpus
from gensim.models import LdaMulticore


class LDA(Pipe):
    """
    LDA (Latent Dirichlet Allocation) model
    for unsupervised topic modeling.

    Takes vectors and returns topic vectors,
    which can be used for clustering.
    """
    input = Pipe.type.vecs
    output = Pipe.type.vecs

    def __init__(self, n_topics=5):
        self.n_topics = n_topics
        self.trained = False

    def __call__(self, vecs):
        """
        Return topic vectors.
        """
        if not self.trained:
            self.train(vecs)
            self.trained = True

        distribs = []
        for distrib in self.m[Scipy2Corpus(vecs)]:
            distribs.append([t[1] for t in distrib])
        distribs = np.array(distribs)
        return distribs

    def train(self, vecs):
        """
        Build the topic model.
        """
        corp = Scipy2Corpus(vecs)
        self.m = LdaMulticore(corp,
                              num_topics=self.n_topics,
                              iterations=1000,
                              workers=3)

    def print_topics(self, vectorizer):
        vocab = vectorizer.vocabulary
        # Note: assumes an older gensim where show_topics(formatted=False)
        # yields (prob, word_id) pairs per topic.
        for topic in self.m.show_topics(num_topics=self.n_topics,
                                        num_words=10,
                                        formatted=False):
            print([vocab[int(ix)] for prob, ix in topic])
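A hedged usage sketch (the project's `Pipe` base class must be importable for the class definition to run; the vectors here are made-up term counts, one array per document):

import numpy as np

docs = [np.array([2.0, 1.0, 0.0, 0.0]),
        np.array([0.0, 0.0, 3.0, 1.0])]
lda = LDA(n_topics=2)
print(lda(docs))  # trains on first call, then returns one topic vector per document

One caveat: gensim drops topics whose inferred weight falls below minimum_probability (0.01 by default), so the inner lists can end up ragged; passing minimum_probability=0 to LdaMulticore in train would keep them fixed-length.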
Example #3
# Imports required by this snippet
import time

import numpy as np
from gensim.models import LdaMulticore


def fit_numtopics(train_corpus, test_corpus, id2word, num_topics_list, iters, workers, chunksize, logfilename, save=True):
    """
    Fit one LDA model per entry in num_topics_list and log held-out perplexity for each.

    Args:
        num_topics_list: list of topic counts; a model is fitted for each
        save: indicates whether each fitted model should be saved
    Returns:
        topics_dict: a dictionary of topic lists, keyed by the number of topics
    """
    topics_dict = {}
    logfile = open(logfilename, 'w')
    for num_topics in num_topics_list:

        print('training', num_topics)
        np.random.seed(NUM)  # NUM is a module-level seed constant

        start_time = time.time()
        model = LdaMulticore(corpus=train_corpus, id2word=id2word,
                             num_topics=num_topics, iterations=iters,
                             eval_every=None, workers=workers,
                             chunksize=chunksize)
        end_time = time.time()

        if save:
            fname = 'data\\orig_' + str(num_topics) + 'topics.lda'
            model.save(fname)

        # log_perplexity returns a per-word log2 likelihood bound;
        # perplexity = 2 ** (-bound), lower is better
        per_word_bound = model.log_perplexity(test_corpus)
        perplexity = np.exp2(-1.0 * per_word_bound)

        logfile.write('\n' + 'num_topics: ' + str(num_topics) + '\n')
        logfile.write('perplexity: ' + str(perplexity) + '\n')
        logfile.write('train_time: ' + str(end_time - start_time) + '\n' + 'Topics: \n')

        topics = model.show_topics(num_topics=num_topics, num_words=20)
        topics_dict[str(num_topics)] = topics
        for topic in topics:
            # show_topics returns (topic_id, string) tuples; stringify before writing
            logfile.write('\n\t' + str(topic) + '\n')

    logfile.close()
    return topics_dict
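A hedged usage sketch for the function above (toy texts; NUM is defined here because the function reads it as a module-level seed constant, and the corpus is reused as its own held-out set purely for illustration):

from gensim.corpora import Dictionary

NUM = 42  # seed constant the function expects at module level
texts = [["cat", "dog", "cat"], ["stock", "market"], ["dog", "market", "price"]]
id2word = Dictionary(texts)
bow = [id2word.doc2bow(t) for t in texts]
topics = fit_numtopics(bow, bow, id2word, num_topics_list=[2, 3],
                       iters=50, workers=2, chunksize=10,
                       logfilename='lda_fit.log', save=False)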
Example #5
# Assumed imports for this snippet (`model`, `texts`, and the DataFrame `ls`
# of per-document topic weights are expected to be in scope already):
import gensim
import pandas as pd
import matplotlib.pyplot as plt

# Mean weight of each topic column
for i in ls.columns:
    print(f'{ls[i].mean(): .4f}')

jv_avg = [(0,0.1793),(1,0.0007),(2,0.0792),(3,0.0382),(4,0.1750),(5,0.0628),(6,0.0770),(7,0.0147),(8,0.3506),(9,0.0226)]
rk_avg = [(0,0.0544),(1,0.0014),(2,0.0610),(3,0.0123),(4,0.2093),(5,0.0467),(6,0.1689),(7,0.0021),(8,0.4197),(9,0.0242)]
ls_avg = [(0,0.1349),(1,0.0009),(2,0.1084),(3,0.0072),(4,0.1119),(5,0.0581),(6,0.1487),(7,0.0282),(8,0.3402),(9,0.0616)]
# Cosine similarity between two sparse (topic_id, weight) vectors
gensim.matutils.cossim(ls_avg, rk_avg)
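gensim.matutils.cossim operates directly on these sparse (id, weight) lists; a quick cross-check of what it returns:

a, b = dict(ls_avg), dict(rk_avg)
dot = sum(a[k] * b.get(k, 0.0) for k in a)
norm = (sum(v * v for v in a.values()) ** 0.5) * (sum(v * v for v in b.values()) ** 0.5)
print(dot / norm)                              # manual cosine similarity
print(gensim.matutils.cossim(ls_avg, rk_avg))  # matches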



from collections import Counter

# Corpus-wide frequency of every token, to compare against topic weights
topics = model.show_topics(formatted=False)
data_flat = [w for w_list in texts for w in w_list]
counter = Counter(data_flat)

out = []
for i, topic in topics:
    for word, weight in topic:
        out.append([word, i, weight, counter[word]])

df = pd.DataFrame(out, columns=['word', 'topic_id', 'importance', 'word_count'])

import matplotlib.colors as mcolors

# Plot word count and weights of topic keywords, one panel per topic
fig, axes = plt.subplots(5, 2, figsize=(16, 10), sharey=True, dpi=160)
cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]
for i, ax in enumerate(axes.flatten()):
    # The original snippet is truncated here; the loop body below is a
    # minimal reconstruction sketch: paired bars of word count vs. topic weight.
    sub = df.loc[df.topic_id == i, :]
    ax.bar(x='word', height='word_count', data=sub, color=cols[i], width=0.5, alpha=0.3, label='Word Count')
    ax.twinx().bar(x='word', height='importance', data=sub, color=cols[i], width=0.2, label='Weights')
    ax.set_title('Topic: ' + str(i), color=cols[i])
    ax.tick_params(axis='x', labelrotation=30)
plt.tight_layout()
plt.show()
Example #6
            data_dir = './%s_data' % data
            dictionary = Dictionary.load(os.path.join(data_dir, 'ne_nedf_weighting.dict'))
            bow_news = load_model(os.path.join(data_dir, 'ne8_nedf_%s_weighting.bow' % topn_concepts))
            dict_id2token = dict(dictionary.items())

            lda = LdaMulticore(bow_news, id2word=dict_id2token, num_topics=n_topics,
                               passes=passes, iterations=iterations,
                               eval_every=eval_every, workers=workers, random_state=random_state)

            name = 'ne8_nedf_%s_topic%s_passes%s_iteration%s_random%s' % (topn_concepts, n_topics, passes, iterations, random_state)
            result_dir = os.path.join(data_dir, name)
            if not os.path.exists(result_dir):
                os.mkdir(result_dir)

            lda.save(os.path.join(result_dir, 'lda_model'))

            topics = lda.show_topics(num_topics=n_topics, num_words=20, log=False, formatted=False)
            with open(os.path.join(result_dir, 'topics.txt'), 'w', encoding='utf-8') as f:
                for topic in topics:
                    f.write('topic ' + str(topic[0]) + ':\n')
                    for t in topic[1]:
                        f.write(t[0] + ': ' + str(t[1]) + '\n')
                    f.write('\n')

            endtime = datetime.datetime.now()
            duration = (endtime - starttime).seconds
            duration_list.append(duration)
            print('Total running time:', (endtime - starttime).seconds, 'seconds.')
        # Average running time across runs
        print(sum(duration_list) / len(duration_list))


Example #7

wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
wiki.save(outp + '_corpus.pkl.bz2')
# load back the id->word mapping directly from file
# this seems to save more memory, compared to keeping the wiki.dictionary object from above
dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')

# build tfidf
if os.path.exists(outp + '_tfidf.mm'):
    mm = gensim.corpora.MmCorpus(outp + '_tfidf.mm')
else:
    tfidf = TfidfModel(wiki, id2word=dictionary, normalize=True)
    #tfidf.save(outp + '.tfidf_model')

    # save tfidf vectors in matrix market format
    mm = tfidf[wiki]
    MmCorpus.serialize(outp + '_tfidf.mm', mm, progress_cnt=10000)

logger.info("finished pre-processing, starting LDA %s", program)

lda = LdaMulticore(mm, id2word=dictionary, workers=10, num_topics=ntopics)
lda.save(model_name)
topics = lda.show_topics(num_topics=ntopics, num_words=30)
print(topics)
logger.info("finished LDA %s", program)

toptopics = lda.top_topics(corpus=wiki,
                           dictionary=lda.id2word,
                           coherence='u_mass')
logger.info("top topicsL %s", 'u_mass')
print(toptopics)
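top_topics returns one (term list, coherence score) pair per topic, so the scores can be summarized directly; a short hedged sketch:

avg_coherence = sum(score for _, score in toptopics) / len(toptopics)
logger.info("mean u_mass coherence: %s", avg_coherence)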
Example #8

neg = carReviews.loc[carReviews.Vader_Rating <= 2.5, ['EntireReview']].sample(3).values
for n in neg:
    print('------>', n[0])

# LDA topic modelling

# Approach 1
reviews = carReviews["ReviewTokens"]
dictionary = corpora.Dictionary(reviews)
# Term-frequency (bag-of-words) representation of each review
doc_term_matrix = [dictionary.doc2bow(rev) for rev in reviews]
# Perform LDA
ldamodel = LdaMulticore(corpus=doc_term_matrix, num_topics=8, id2word=dictionary,
                        chunksize=2000, passes=20, per_word_topics=True)

# Get the highlighted topics
topics = ldamodel.show_topics()
lda_display = pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary, sort_topics=False)

# Save the interactive HTML view to a file
pyLDAvis.save_html(lda_display, "lda_8_topics.html")

pprint(ldamodel.show_topics(formatted=False))

# Calculate coherence score
def compute_coherence_score(lda_model, reviews):
    coherence = CoherenceModel(lda_model, texts=reviews, dictionary=dictionary, coherence="c_v")
    return coherence.get_coherence(), coherence.get_coherence_per_topic()

coh_score, coh_by_topic = compute_coherence_score(ldamodel, reviews)
print(coh_by_topic, coh_score)
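Since compute_coherence_score works for any fitted model, a hedged follow-on sketch that scans candidate topic counts by c_v coherence (the counts and passes=5 are arbitrary choices for illustration):

for k in (4, 6, 8, 10):
    candidate = LdaMulticore(corpus=doc_term_matrix, num_topics=k, id2word=dictionary, passes=5)
    score, _ = compute_coherence_score(candidate, reviews)
    print(k, round(score, 4))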
Example #9
# Assumed imports for this snippet; `Vectorizer` comes from the surrounding project.
import numpy as np
from gensim.matutils import Scipy2Corpus
from gensim.models import LdaMulticore


class Model():
    """
    LDA (Latent Dirichlet Allocation) model
    for unsupervised topic modeling.

    TO DO:
        - this model has to be rebuilt for each comment section as new comments come in - what's the best way to manage that?

    Notes:
        - tried LDA on individual sentences, doesn't work as well.
    """

    def __init__(self, n_topics=5, verbose=False):
        self.verbose = verbose
        self.n_topics = n_topics
        self.vectr = Vectorizer()

    def train(self, comments):
        """
        Build the topic model from a list of documents (strings).

        Assumes documents have been pre-processed (e.g. stripped of HTML, etc)
        """
        docs = [c.body for c in comments]
        vecs = self.vectr.vectorize(docs, train=True)
        corp = Scipy2Corpus(vecs)
        self.m = LdaMulticore(corp, num_topics=self.n_topics, iterations=1000, workers=3)

        if self.verbose:
            self.print_topics()

    def featurize(self, docs):
        """
        Return topic vectors for documents.
        """
        vecs = self.vectr.vectorize(docs)

        dists = []
        for dist in self.m[Scipy2Corpus(vecs)]:
            dists.append([t[1] for t in dist])
        dists = np.array(dists)
        return dists

    def cluster(self, comments):
        """
        Build clusters out of most likely topics.
        """

        # If no model exists, train it.
        if not hasattr(self, 'm'):
            self.train(comments)

        clusters = [[] for _ in range(self.n_topics)]
        dists = self.featurize([c.body for c in comments])
        for i, comment in enumerate(comments):
            topic = dists[i].argmax()
            clusters[topic].append(comment)

        return clusters

    def identify(self, docs):
        """
        Labels a list of documents with
        their topic and probability for that topic.
        """
        vecs = self.vectr.vectorize(docs)
        dists = self.featurize(docs)
        for i, doc in enumerate(docs):
            topic = dists[i].argmax()
            proba = dists[i][topic]
            yield doc, topic, proba

    def print_topics(self):
        vocab = self.vectr.vocabulary
        # Note: assumes an older gensim where show_topics(formatted=False)
        # yields (prob, word_id) pairs per topic.
        for topic in self.m.show_topics(num_topics=self.n_topics, num_words=10, formatted=False):
            print([vocab[int(ix)] for prob, ix in topic])
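A hedged usage sketch (only runs inside the project that provides `Vectorizer`; the `Comment` namedtuple stands in for whatever comment objects expose a `.body` attribute):

from collections import namedtuple

Comment = namedtuple('Comment', ['body'])
comments = [Comment('the battery life is great'),
            Comment('battery drains too fast'),
            Comment('shipping was very slow')]

model = Model(n_topics=2)
clusters = model.cluster(comments)  # trains on first use, then groups by most likely topic
for t, cluster in enumerate(clusters):
    print(t, [c.body for c in cluster])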
Example #10
        dictionary.id2token[uid] = token

    print(type(dictionary), type(corpus))

    #path where dtm file is installed
    dtm_path = "/home/ankit081190/NLP/dtm/dtm/dtm"

    #model = DtmModel(dtm_path, corpus, time_seq, num_topics=1,
    #                 id2word=corpus.dictionary, initialize_lda=True)

    model = LdaMulticore(corpus, num_topics=10, id2word=dictionary)

    model.save("DTModelMultiCore_" + files + ".model")
    #Gives top 25 topics

    tp = model.show_topics(num_topics=25, log=False, formatted=True)
    print model.print_topics(num_topics=25)
    data = pyLDAvis.gensim.prepare(model, corpus, dictionary)
    pyLDAvis.save_html(data, 'index_lda_' + files + '.html')

    cnt = Counter(tp)
    with codecs.open("topicsMultiLDA" + files + ".txt", "w", "utf-8") as f:
        for i, j in cnt:
            print i, j
            f.write("\nFor Topic Number " + str(i) + ":\n" +
                    str(j).decode("utf-8") + "\n")
        f.close()

    #for i, j in cnt:
    #    print "\nFor topic number: " ,i, "\n";
    #    print j.decode("utf-8")
Example #12
                beta=beta,
                iter=num_iterations)
            print(run_id)

            output_file = output_file_template.format(run_id=run_id)

            # Train and save
            print('Training...')
            model = LdaMulticore(corpus,
                                 alpha=alpha,
                                 eta=beta,
                                 passes=50,
                                 id2word=dictionary,
                                 num_topics=num_topics,
                                 iterations=num_iterations)
            # model.save(output_file)
            print('Done training')

            # Print top 10 words in topics, if desired
            if print_topics:
                topics = model.show_topics(num_topics=4, formatted=False)
                for topic in topics:
                    for tup in topic[1]:
                        print(tup[0] + ": " + str(tup[1]))
                    print('\n')

            # Evaluate perplexity: log_perplexity returns a per-word log2
            # bound, so perplexity = 2 ** (-bound)
            ll = model.log_perplexity(test_corpus)
            print("LL:   " + str(ll))
            print("Perp: " + str(np.exp2(-ll)))
Example #13
    # When `dictionary` is given, gensim builds the IDF weights from it and ignores `corpus`
    tfidf = TfidfModel(corpus, id2word=dictionary, dictionary=dictionary)
    filtered_corpus = []
    for doc in corpus:
        res = tfidf[doc]
        res.sort(key=lambda tup: tup[1], reverse=True)
        # take the top quarter most significant words as meaningful
        meaningful = [t[0] for t in res[0:len(res) // 4]]
        filtered_corpus.append([t for t in doc if t[0] in meaningful])
    corpus = filtered_corpus
    print("TD-IDF finished!")

    # train a new model.
    print("Training model...")
    lda = LdaMulticore(corpus, num_topics=100, id2word=dictionary, passes=1000, iterations=100000)
    print("Model trained!")

    # save the trained model.
    print("Saving model...")
    safe_mkdirs('model')
    lda.save('model/lda_model')
    print("Model saved!")

    # print('\nDocuments and their topics:')
    # for doc in corpus:
    #     print(lda[doc])

    topics = lda.show_topics(num_topics=-1, formatted=False)
    print('Topics and their related words:')
    for topic in topics:
        print(topic)
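A follow-up sketch (hedged: assumes the save above succeeded and `dictionary` is still in scope) that reloads the model and infers the topic mixture of one new document:

from gensim.models import LdaMulticore

lda = LdaMulticore.load('model/lda_model')
new_doc = dictionary.doc2bow(['some', 'unseen', 'tokens'])  # hypothetical tokens
print(lda.get_document_topics(new_doc))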