Example #1
import codecs
import sys
from multiprocessing import cpu_count

import jieba
import numpy as np
from gensim import corpora
from gensim.models import LdaMulticore, TfidfModel
from gensim.models.word2vec import LineSentence

# `stopwords` and `UnicodeWriter` are assumed to be defined elsewhere in the module.

def train_lda():
	"""
	Usage: python Wechat_LDA.py wechat.csv
	"""
	with open(sys.argv[1], 'r', encoding='utf-8') as wx:
		for f in wx:
			seg = jieba.cut(f)
			seg = [word for word in seg if word not in stopwords]
			with codecs.open('wechat_seg.txt', encoding='utf-8', mode='a') as wx_seg:
				wx_seg.write(' '.join(seg) + '\n')  # one segmented document per line

	# Pass the file path, not an open handle: LineSentence is iterated twice
	# below, and a file object would be exhausted after the first pass.
	documents = 'wechat_seg.txt'
	dictionary = corpora.Dictionary(LineSentence(documents))
	corpus = [dictionary.doc2bow(text) for text in LineSentence(documents)]
	tfidf_model = TfidfModel(corpus, id2word=dictionary, normalize=True)
	tfidf_model.save('wechat_seg.txt.tfidf_model')
	# corpora.MmCorpus.serialize('wechat_seg.txt.tfidf_model.mm', tfidf_model[corpus])
	lda_model = LdaMulticore(corpus, id2word=dictionary, num_topics=100, workers=cpu_count() - 1)
	lda_model.save('wechat_lda_model.pkl')

	topics = [lda_model[doc] for doc in corpus]

	# Count how often each topic is assigned across documents.
	counts = np.zeros(100)
	for top_doc in topics:
		for ti, _ in top_doc:
			counts[ti] += 1

	# Save the 64 top words of the most frequent topic, scaled to integer weights.
	words = lda_model.show_topic(counts.argmax(), 64)
	with open('top_words.txt', 'w') as tw:
		writer = UnicodeWriter(tw)
		for w in words:
			writer.writerow((w[0], int(float(w[1]) * 1000)))
Example #2
    def createLDA(self, fileName='', modelName='', ldaPasses='', topicNum=''):
        '''
        fileName -> file for the dictionary (.dict) and corpus (.mm) files
        modelName -> model name for LDA to save to disk
        ldaPasses -> number of passes, 10 by default
        topicNum -> number of topics to generate, 100 by default
        '''
        if fileName == '':
            fileName = self.__fileName

        if ldaPasses == '':
            ldaPasses = self.__ldaPasses

        if topicNum == '':
            topicNum = self.__topicNum

        if modelName == '':
            modelName = fileName + '_' + str(ldaPasses) + 'P_' + str(topicNum) + 'T'

        # `dictionary` rather than `dict`, to avoid shadowing the built-in.
        dictionary = corpora.Dictionary.load(self.__destination + fileName + '.dict')
        mm = corpora.MmCorpus(self.__destination + fileName + '.mm')

        #lda = models.ldamodel.LdaModel(corpus=mm, id2word=dictionary, num_topics=6, update_every=1, chunksize=10000, passes=10)
        lda = LdaMulticore(corpus=mm, num_topics=topicNum, id2word=dictionary, chunksize=30000, passes=ldaPasses, workers=3)
        lda.save(self.__destination + modelName + '.lda')
        #=======================================================================
        # print(lda)
        #=======================================================================
        print('Created LDA model %s' % self.__fileName)
Example #3
    def _build_lda(self, name, corpus, num_topics=30, words_to_save=200, multicore=True):
        # `TopicModel`, `Topic`, `TopicWord`, and `settings` are Django models
        # and settings assumed to be imported elsewhere in this module.
        from gensim.models import LdaMulticore, LdaModel

        gdict = self.gensim_dictionary

        if multicore:
            lda = LdaMulticore(corpus=corpus,
                               num_topics=num_topics,
                               workers=3,
                               id2word=gdict)
        else:
            lda = LdaModel(corpus=corpus,
                           num_topics=num_topics,
                           id2word=gdict)

        model = TopicModel(name=name, dictionary=self)
        model.save()

        topics = []
        for i in range(num_topics):
            topic = lda.show_topic(i, topn=words_to_save)
            alpha = lda.alpha[i]

            topicm = Topic(model=model, name="?", alpha=alpha, index=i)
            topicm.save()
            topics.append(topicm)

            words = []
            # Recent gensim (>= 1.0) returns (word, probability) pairs here.
            for word_text, prob in topic:
                word_index = gdict.token2id[word_text]
                word_id = self.get_word_id(word_index)
                tw = TopicWord(topic=topicm,
                               word_id=word_id, word_index=word_index,
                               probability=prob)
                words.append(tw)
            TopicWord.objects.bulk_create(words)

            most_likely_word_scores = topicm.word_scores\
                .order_by('-probability')\
                .prefetch_related('word')
                
            topicm.name = ', '.join([score.word.text for score in most_likely_word_scores[:3]])
            topicm.save()

            if settings.DEBUG:
                # clear Django's DEBUG query log so it cannot grow unboundedly
                from django.db import reset_queries

                reset_queries()

        model.save_to_file(lda)

        return (model, lda)
Example #4
    def _build_lda(self, name, corpus, num_topics=30, words_to_save=200):
        from gensim.models import LdaMulticore

        gdict = self.gensim_dictionary

        lda = LdaMulticore(corpus=corpus,
                           num_topics=num_topics,
                           workers=3,
                           id2word=gdict)

        model = TopicModel(name=name, dictionary=self)
        model.save()

        topics = []
        for i in range(num_topics):
            topic = lda.show_topic(i, topn=words_to_save)
            alpha = lda.alpha[i]

            topicm = Topic(model=model, name="?", alpha=alpha, index=i)
            topicm.save()
            topics.append(topicm)

            words = []
            # Recent gensim (>= 1.0) returns (word, probability) pairs here.
            for word_text, prob in topic:
                word_index = gdict.token2id[word_text]
                word_id = self.get_word_id(word_index)
                tw = TopicWord(topic=topicm,
                               word_id=word_id, word_index=word_index,
                               probability=prob)
                words.append(tw)
            TopicWord.objects.bulk_create(words)

            if settings.DEBUG:
                # clear Django's DEBUG query log so it cannot grow unboundedly
                from django.db import reset_queries

                reset_queries()

        model.save_to_file(lda)

        return (model, lda)
Example #5
    def load_from_file(self):
        from gensim.models import LdaMulticore

        return LdaMulticore.load("lda_out_%d.model" % self.id)
Example #6

import time

from gensim.models import LdaMulticore
from wordcloud import WordCloud

# `dictionary`, `text_list`, and `start_time` are assumed to come from the
# preprocessing steps earlier in the original script.

### Create BOW corpus ###
corpus = [dictionary.doc2bow(text) for text in text_list]

print("--- Corpus made: %s minutes ---" % round(((time.time() - start_time)/60),2)) 



start_lda_time = time.time()

#################################
######### Train LDA  ############
#################################

lda_model = LdaMulticore(corpus, num_topics=4, id2word=dictionary, passes=150, workers = 3)
final_topics = lda_model.show_topics()

print("--- LDA trained : %s minutes ---" % round(((time.time() - start_lda_time)/60),2)) 


#################################
##### Display WordCloud #########
#################################
curr_topic = 0
wc = WordCloud(background_color="black", max_words=2000, max_font_size=40, width=120, height=120, random_state=42)
for line in final_topics:
    line = line[1]
    scores = [float(x.split("*")[0]) for x in line.split(" + ")]
    words = [x.split("*")[1].strip().strip('"') for x in line.split(" + ")]
    # The original snippet breaks off after `freqs = []`; a plausible
    # completion renders each topic's word weights as a cloud image:
    freqs = dict(zip(words, scores))
    wc.generate_from_frequencies(freqs)
    wc.to_file("topic_%d.png" % curr_topic)
    curr_topic += 1
Example #7
from gensim import corpora, models, similarities
import time
import numpy as np
from gensim.models import LdaMulticore as LdaModel
import argparse

if __name__ == '__main__':
    lda_filename = 'lda/samsung.lda'
    corpus = corpora.MmCorpus('lda/samsung1.mm')
    dictionary = corpora.Dictionary.load('lda/samsung1.dict')
    start = time.time()
    lda = LdaModel(corpus,
                   num_topics=10,
                   alpha=1. / 100,
                   eta=.2,
                   chunksize=10000,
                   workers=11,
                   passes=10,
                   decay=0.75,
                   id2word=dictionary)

    print('Saving model')
    end = time.time()
    elapsed = end - start
    print "Time taken for LDA training: ", elapsed, "seconds."
    lda.print_topics()
    lda.save(lda_filename)
    print("lda saved in %s " % lda_filename)
Example #8
import multiprocessing

from gensim.models import LdaMulticore

# `dict_doc`, `process_doc`, and `preprocess` are assumed to be defined by
# the preprocessing code earlier in the original script.
bow_corpus = [dict_doc.doc2bow(x) for x in process_doc]
# print(bow_corpus)
# lda_model = LdaMulticore(bow_corpus,num_topics=8,id2word=dict_doc,workers=2,passes=8)
# lda_model = LdaMulticore(bow_corpus)
# print(lda_model)

#for unseen document
unseen_doc = "Javascript add row and calculate multiple rows from html"
bow_vec = dict_doc.doc2bow(preprocess(unseen_doc))
# print(bow_vec)



if __name__ == '__main__':
    multiprocessing.freeze_support() #for windows system
    lda_model = LdaMulticore(corpus=bow_corpus, id2word=dict_doc, num_topics=4)
    # print(lda)
    for idx, topic in lda_model.print_topics(-1):
        print("Topic: {} \nWords: {}".format(idx, topic))
        print("\n")

#     print("*******************************************")
#     for index, score in sorted(lda_model[bow_vec], key=lambda tup: -1 * tup[1]):
#         print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))
    print(lda_model[bow_vec])
    output = lda_model[bow_vec]


    b = sorted(output, key=lambda x: x[1], reverse=True)
    print(b[0][0])
Example #9
import re
import string
from pprint import pprint

from gensim import corpora
from gensim.models import LdaMulticore, LsiModel
from nltk.tokenize import word_tokenize

# `clean` is assumed to be the list of pre-cleaned strings from earlier steps.

# remove digits; the original also had a redundant `filter(...)` call that
# the list comprehension immediately overwrote, so only the latter is kept
regex = re.compile("[0-9]")
no_numbers = [i for i in clean if not regex.search(i)]

# remove punctuation
no_numbers = [''.join(char for char in s if char not in string.punctuation) for s in no_numbers]

# tokenization of the pre-cleaned text
tokenized_sents = [word_tokenize(i) for i in no_numbers]
print(tokenized_sents)

# TOPIC MODELLING

# create dictionary
dictionary = corpora.Dictionary(tokenized_sents)
print(dictionary)

# create corpus
corpus = [dictionary.doc2bow(sent) for sent in tokenized_sents]
print(corpus)

# train the LDA Model
lda_model = LdaMulticore(corpus = corpus, id2word = dictionary, num_topics = 8)
lda_model.save('lda_model.model')
print(lda_model.print_topics(-1))

# create a LSI topic model
lsi_model = LsiModel(corpus = corpus, id2word = dictionary, num_topics = 8, decay = 0.5)
pprint(lsi_model.print_topics(-1))
Example #10
from gensim.corpora import Dictionary
from gensim.matutils import corpus2csc, corpus2dense
from gensim.models import CoherenceModel, LdaMulticore
from sklearn.base import BaseEstimator, TransformerMixin


class GensimLDAVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self,
                 num_topics,
                 return_dense=True,
                 max_df=0.5,
                 min_df=5,
                 **lda_params):
        """
        :param num_topics: number of topics for the LDA model
        :param return_dense: transform function returns dense or not
        :param max_df: maximum word documentfrequency. Should be given as
        :param min_df: minimum word documentfrequency. Similar to max_df.
        :param lda_params: parameters for the constructor of
        gensim.model.Ldamulticore
        """
        super().__init__()
        self.lda: LdaMulticore = None
        self.corpus = None
        self.lda_params = lda_params
        self.lda_params["num_topics"] = num_topics
        self.is_dense = return_dense
        self.max_df = max_df
        self.min_df = min_df

    def fit(self, docs):
        """
        :param docs: List of split strings.
        :return: GensimLDAVectorizer
        """
        id2word = Dictionary(docs)
        id2word.filter_extremes(self.min_df, self.max_df)
        self.corpus = [id2word.doc2bow(d) for d in docs]
        self.lda = LdaMulticore(corpus=self.corpus,
                                id2word=id2word,
                                **self.lda_params)
        return self

    def transform(self, docs):
        """
        :param docs: List of split strings.
        :return: numpy.ndarray
        """
        cur_bow = [self.lda.id2word.doc2bow(d) for d in docs]
        lda_bag_of_topics = [self.lda[c] for c in cur_bow]
        num_terms = self.lda.num_topics
        if self.is_dense:
            return corpus2dense(lda_bag_of_topics, num_terms).T
        return corpus2csc(lda_bag_of_topics, num_terms).T

    def fit_transform(self, docs, y=None, **fit_params):
        return self.fit(docs).transform(docs)

    def evaluate_coherence(self, docs, coherence="c_v"):
        """
        :param docs: List[List[str]]
        :param coherence: one of the coherence methods stated in
        gensim.models.CoherenceModel
        :return: gensim.models.CoherenceModel
        """
        return CoherenceModel(model=self.lda,
                              texts=docs,
                              corpus=self.corpus,
                              coherence=coherence,
                              processes=self.lda_params.get("workers", -1))

    def save(self, fname, *args, **kwargs):
        self.lda.save(fname, *args, **kwargs)

    @classmethod
    def load(cls,
             fname,
             return_dense=True,
             max_df=0.5,
             min_df=5,
             *args,
             **kwargs):
        lda = LdaMulticore.load(fname, *args, **kwargs)
        vectorizer = cls(lda.num_topics,
                         return_dense=return_dense,
                         max_df=max_df,
                         min_df=min_df,
                         alpha=lda.alpha,
                         eta=lda.eta,
                         iterations=lda.iterations,
                         random_state=lda.random_state,
                         workers=lda.workers)
        # Reuse the trained model rather than discarding it and refitting;
        # the original built a fresh LdaMulticore() here and also called the
        # constructor with mismatched positional arguments.
        vectorizer.lda = lda
        return vectorizer
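
# A minimal usage sketch of the vectorizer above, on hypothetical toy data;
# extra keyword arguments (here `workers=1`) are forwarded to LdaMulticore.
if __name__ == '__main__':
    docs = [["machine", "learning", "topic"],
            ["deep", "neural", "network"],
            ["topic", "model", "inference"]]
    vec = GensimLDAVectorizer(num_topics=2, min_df=1, max_df=1.0, workers=1)
    doc_topic = vec.fit_transform(docs)  # shape: (n_docs, num_topics)
    print(doc_topic.shape)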
Example #11
from sklearn.datasets import fetch_20newsgroups
from nltk.stem import WordNetLemmatizer
from gensim.corpora import Dictionary
from gensim.models import LdaMulticore, LsiModel

# `tokenize_lemmatize` is assumed to be a helper defined earlier in the script.

news_train = fetch_20newsgroups(subset='train')

# Tokenization and lemmatization
wnl = WordNetLemmatizer()
news_train_lemma = [
    tokenize_lemmatize(article, wnl.lemmatize) for article in news_train.data
]

# Build the gensim dictionary and corpus structures
dict_train = Dictionary(news_train_lemma)
mmCorpus_train = [dict_train.doc2bow(article) for article in news_train_lemma]

# Latent Semantic Analysis
lsi_train = LsiModel(corpus=mmCorpus_train, num_topics=40, id2word=dict_train)

for i in range(40):
    print('topic' + str(i) + ' :')
    print(lsi_train.print_topic(i))

# Latent Dirichlet Allocation

lda_train = LdaMulticore(corpus=mmCorpus_train,
                         num_topics=40,
                         id2word=dict_train,
                         workers=5)

for i in range(40):
    print('topic' + str(i) + ' :')
    print(lda_train.print_topic(i))
Example #12
import csv
import pickle
from typing import List

from gensim.corpora import Dictionary
from gensim.models import CoherenceModel, LdaMulticore

# `logger`, `clean_documents`, and `topics_word_bar` are assumed to be
# defined elsewhere in the original module.


def label_documents(documents: List, LVL, SET):
    OPTIMAL_TOPICS = 10
    PREPROCESSINGs = ["lemmed"]
    for PREPROCESSING in PREPROCESSINGs:
        # texts = clean_documents([str(doc) for doc in documents], PREPROCESSING)
        # pickle.dump(texts, open("{}_{}_{}_text".format(SET, LVL, PREPROCESSING), "wb"))
        texts = pickle.load(
            open("{}_{}_{}_text".format(SET, LVL, PREPROCESSING), "rb"))
        dictionary = Dictionary(texts)
        print('Number of unique tokens: %d' % len(dictionary))
        dictionary.filter_extremes(no_below=20, no_above=0.5)
        print('Number filtered of unique tokens: %d' % len(dictionary))

        logger.info("Clean complete")
        logger.info("Dictionary complete")
        corpus = [dictionary.doc2bow(text) for text in texts]
        logger.info("Corpus complete")
        # model = LdaMulticore(corpus=corpus, id2word=dictionary, iterations=1000, num_topics=OPTIMAL_TOPICS, workers=3)

        # model.save("{}_{}_1000_model_cv_coherence_{}".format(LVL, OPTIMAL_TOPICS, PREPROCESSING))

        model = LdaMulticore.load("{}_{}_1000_model_cv_coherence_{}".format(
            LVL, OPTIMAL_TOPICS, PREPROCESSING))
        coherence_model = CoherenceModel(model=model,
                                         texts=texts,
                                         dictionary=dictionary,
                                         coherence='c_v')
        print("Coherence {}".format(coherence_model.get_coherence()))
        topics = []
        for document in corpus:
            topic = model.get_document_topics(document)
            topics.append(topic)

        with open("topics_results_{}_{}_{}.csv".format(SET, LVL,
                                                       PREPROCESSING),
                  "wt",
                  encoding="utf8",
                  newline="") as outf:
            writer = csv.writer(outf)
            for document_topics in topics:
                sorted_topics = sorted(document_topics, key=lambda x: -x[1])
                if sorted_topics:
                    best = [sorted_topics[0][0]]
                else:
                    best = [-1]
                    print("NONE ERROR")
                writer.writerow(best)

        x = model.show_topics(num_topics=OPTIMAL_TOPICS,
                              num_words=10,
                              formatted=True)
        # Below Code Prints Topics and Words
        with open("topics_keywords_{}_{}_{}".format(SET, LVL, PREPROCESSING),
                  "wt",
                  encoding="utf8",
                  newline="") as outf:
            writer = csv.writer(outf)
            for t in x:
                topic = t[0]
                words = [(word_score.split("*")[1].strip()[1:-1],
                          float(word_score.split("*")[0].strip()))
                         for word_score in t[1].split("+")]
                sort_words = sorted(words, key=lambda z: z[1])
                print(str(topic) + " " + str(sort_words))
                words = [w for w, s in sort_words]
                score = [s for w, s in sort_words]
                topics_word_bar(words, score, t[0])
                writer.writerow([topic] + [sort_words])

    return
Example #13
# +
# Fit a single version of the LDA model.
num_topics = 10
chunksize = 5000
passes = 4
iterations = 200
eval_every = 1  # Log perplexity estimates every update (slow; set to None to skip)

# Accessing the dictionary once forces gensim to populate id2token,
# which is built lazily and would otherwise be empty.
_ = dictionary[0]
id2word = dictionary.id2token

lda_model = LdaMulticore(
    corpus=tfidf_corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='symmetric',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every,
    workers=4  # four worker processes (gensim suggests cpu_count() - 1)
)

top_topics = lda_model.top_topics(tfidf_corpus)
pprint(top_topics)
# -

# Gensim calculates the [intrinsic coherence score](http://qpleple.com/topic-coherence-to-evaluate-topic-models/) for
# each topic. Averaging across all of the topics in the model gives a model-level coherence score. Coherence
# measures the strength of association between the words in a topic cluster, and is intended as an objective
# way to evaluate the quality of the topic clusters. Higher scores are better.
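#
# As a minimal sketch of that averaging: each entry of `top_topics` above pairs
# a topic's word list with its coherence score, so the model-level score is the mean.

# +
avg_topic_coherence = sum(t[1] for t in top_topics) / num_topics
print('Average topic coherence: %.4f' % avg_topic_coherence)
# -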