Example #1
import csv

from gensim import corpora
from gensim.models import CoherenceModel, LdaMulticore


class LDA_Doc(object):
    def __init__(self, list_of_list_of_words):
        print('Initializing the LDA_Doc instance')
        self.document_word_list = list_of_list_of_words

    def create_dictionary(self):
        print('Creating dictionary of words in list of list of words')
        # Dictionary could be prepared after removing stop words and lemmatizing the words
        self.dict = corpora.Dictionary(self.document_word_list)

    def create_corpus(self):
        print('Creating corpus with id for each document')
        self.corpus = [
            self.dict.doc2bow(line) for line in self.document_word_list
        ]

    def create_lda_model(self):
        print('Initializing lda model')
        self.model = LdaMulticore(corpus=self.corpus,
                                  id2word=self.dict,
                                  random_state=100,
                                  num_topics=20,
                                  passes=10,
                                  chunksize=1000,
                                  batch=False,
                                  alpha='asymmetric',
                                  decay=0.5,
                                  offset=64,
                                  eta=None,
                                  eval_every=0,
                                  iterations=100,
                                  gamma_threshold=0.001,
                                  per_word_topics=True)

    def compute_optimal_number_of_topic(self):
        # e.g. sweep num_topics and keep the model with the best coherence
        pass

    def compute_coherence_score(self):
        # UMass coherence needs only the corpus and dictionary
        coherence_model = CoherenceModel(model=self.model,
                                         corpus=self.corpus,
                                         dictionary=self.dict,
                                         coherence='u_mass')
        return coherence_model.get_coherence()

    def compute_complexity_perplexity(self):
        # log_perplexity returns a per-word likelihood bound;
        # perplexity is 2 ** (-bound)
        return 2 ** (-self.model.log_perplexity(self.corpus))

    def saving_topicsKeywords_to_csv(self, path, collectionName,
                                     docFolderName):
        # Saving LDA model to disk
        # self.model.save(path+'/'+docFolderName+'/'+collectionName+'/lda_model')
        print('Saving LdaModel model topics with top 10 keywords')
        topics_list = []

        for t in range(self.model.num_topics):
            topics_list.append([' ' + x[0] for x in self.model.show_topic(t)])

        with open(path + '/' + docFolderName + '/' + collectionName +
                  '/lda_text.csv',
                  'w',
                  newline='') as out:
            csv_out = csv.writer(out)
            for row in topics_list:
                csv_out.writerow(row)
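A minimal driver for this class, with toy token lists and a hypothetical output location (the target directory is assumed to exist already):

docs = [['topic', 'modeling', 'with', 'gensim'],
        ['latent', 'dirichlet', 'allocation', 'example']]
lda_doc = LDA_Doc(docs)
lda_doc.create_dictionary()
lda_doc.create_corpus()
lda_doc.create_lda_model()
print(lda_doc.compute_coherence_score())
lda_doc.saving_topicsKeywords_to_csv('/tmp', 'my_collection', 'my_docs')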
Example #2
import codecs
import sys
from multiprocessing import cpu_count

import jieba
import numpy as np
from gensim import corpora
from gensim.models import LdaMulticore, TfidfModel
from gensim.models.word2vec import LineSentence


def train_lda():
    """
    Usage: python Wechat_LDA.py wechat.csv
    """
    # Segment each document with jieba, drop stop words, and write one
    # space-separated line per document. `stopwords` is assumed to be
    # defined at module level.
    with open(sys.argv[1], 'r') as wx, \
            codecs.open('wechat_seg.txt', encoding='utf-8', mode='w') as wx_seg:
        for f in wx:
            seg = [word for word in jieba.cut(f) if word not in stopwords]
            wx_seg.write(' '.join(seg) + '\n')

    # Pass the path rather than an open handle so the file can be
    # iterated more than once.
    dictionary = corpora.Dictionary(LineSentence('wechat_seg.txt'))
    corpus = [dictionary.doc2bow(text) for text in LineSentence('wechat_seg.txt')]
    tfidf_model = TfidfModel(corpus, id2word=dictionary, normalize=True)
    tfidf_model.save('wechat_seg.txt.tfidf_model')
    # corpora.MmCorpus.serialize('wechat_seg.txt.tfidf_model.mm', tfidf_model[corpus])
    lda_model = LdaMulticore(corpus, id2word=dictionary, num_topics=100,
                             workers=cpu_count() - 1)
    lda_model.save('wechat_lda_model.pkl')

    topics = [lda_model[doc] for doc in corpus]

    # Count how often each topic shows up across all documents
    counts = np.zeros(100)
    for top_doc in topics:
        for ti, _ in top_doc:
            counts[ti] += 1

    # Save the top 64 words of the most common topic
    words = lda_model.show_topic(counts.argmax(), 64)
    with open('top_words.txt', 'w') as tw:
        writer = UnicodeWriter(tw)  # project-specific unicode CSV helper
        for w in words:
            writer.writerow((w[0], int(float(w[1]) * 1000)))
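Because both models are persisted above, a later session can reload them without retraining; a short sketch using gensim's load methods:

from gensim.models import LdaMulticore, TfidfModel

lda_model = LdaMulticore.load('wechat_lda_model.pkl')
tfidf_model = TfidfModel.load('wechat_seg.txt.tfidf_model')
print(lda_model.show_topic(0, topn=10))  # ten strongest words of topic 0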
Example #3
    def _build_lda(self,
                   name,
                   corpus,
                   num_topics=30,
                   words_to_save=200,
                   multicore=True):
        from gensim.models import LdaMulticore, LdaModel

        # `TopicModel`, `Topic`, and `TopicWord` are Django models defined
        # elsewhere in this project.
        gdict = self.gensim_dictionary

        if multicore:
            lda = LdaMulticore(corpus=corpus,
                               num_topics=num_topics,
                               workers=3,
                               id2word=gdict)
        else:
            lda = LdaModel(corpus=corpus, num_topics=num_topics, id2word=gdict)

        model = TopicModel(name=name, dictionary=self)
        model.save()

        topics = []
        for i in range(num_topics):
            topic = lda.show_topic(i, topn=words_to_save)
            alpha = lda.alpha[i]

            topicm = Topic(model=model, name="?", alpha=alpha, index=i)
            topicm.save()
            topics.append(topicm)

            words = []
            # NOTE: this unpacking matches the old (pre-1.0) gensim order
            # (probability, word); newer gensim returns (word, probability)
            for prob, word_text in topic:
                word_index = gdict.token2id[word_text]
                word_id = self.get_word_id(word_index)
                tw = TopicWord(topic=topicm,
                               word_id=word_id,
                               word_index=word_index,
                               probability=prob)
                words.append(tw)
            TopicWord.objects.bulk_create(words)

            most_likely_word_scores = topicm.word_scores\
                .order_by('-probability')\
                .prefetch_related('word')

            topicm.name = ', '.join(
                [score.word.text for score in most_likely_word_scores[:3]])
            topicm.save()

            if settings.DEBUG:
                # prevent memory leaks
                from django.db import connection

                connection.queries = []

        model.save_to_file(lda)

        return (model, lda)
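The multicore branch above hardcodes workers=3; a common alternative, as Example #2 does, is to derive the worker count from the machine and leave one core for the parent process. A sketch, assuming corpus and gdict as in the method above:

from multiprocessing import cpu_count

from gensim.models import LdaMulticore

workers = max(1, cpu_count() - 1)  # leave one core free
lda = LdaMulticore(corpus=corpus, num_topics=30, workers=workers, id2word=gdict)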
Example #4
    def _build_lda(self, name, corpus, num_topics=30, words_to_save=200):
        from gensim.models import LdaMulticore

        gdict = self.gensim_dictionary

        lda = LdaMulticore(corpus=corpus,
                           num_topics=num_topics,
                           workers=3,
                           id2word=gdict)

        model = TopicModel(name=name, dictionary=self)
        model.save()

        topics = []
        for i in range(num_topics):
            topic = lda.show_topic(i, topn=words_to_save)
            alpha = lda.alpha[i]

            topicm = Topic(model=model, name="?", alpha=alpha, index=i)
            topicm.save()
            topics.append(topicm)

            words = []
            # NOTE: this unpacking matches the old (pre-1.0) gensim order
            # (probability, word); newer gensim returns (word, probability)
            for prob, word_text in topic:
                word_index = gdict.token2id[word_text]
                word_id = self.get_word_id(word_index)
                tw = TopicWord(topic=topicm,
                               word_id=word_id,
                               word_index=word_index,
                               probability=prob)
                words.append(tw)
            TopicWord.objects.bulk_create(words)

            if settings.DEBUG:
                # prevent memory leaks
                from django.db import connection

                connection.queries = []

        model.save_to_file(lda)

        return (model, lda)
Example #5
    print("created corpus")
    print('Number of unique tokens: %d' % len(dictionary))
    print('Number of documents: %d' % len(comments_corpus))

    num_topics = 150
    if args.load:
        model = LdaMulticore.load("topic_models/model_comments")
    else:
        model = LdaMulticore(comments_corpus, id2word=dictionary, num_topics=num_topics)
        print("model done")
        model.save("topic_models/model_comments")

    print(model.print_topics(20))

    top_topics = model.top_topics(comments_corpus) #, num_words=20)

    # Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
    avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
    print('Average topic coherence: %.4f.' % avg_topic_coherence)

    #from pprint import pprint
    #pprint(top_topics)

    for _ in range(10):
        idx = np.random.randint(0, len(comments_text))

        print("comment: {} - topics: {}".format(comments_text[idx],
                [(model.show_topic(tid, topn=10), v) for tid, v
                in model[comments_corpus[idx]] if v > 0.15]))
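The loop above keeps every topic whose weight exceeds 0.15; when only the single dominant topic per comment is wanted, a small helper against the same model and corpus is enough (a sketch):

def dominant_topic(model, bow):
    # model[bow] yields (topic_id, probability) pairs; take the argmax
    return max(model[bow], key=lambda pair: pair[1])

topic_id, prob = dominant_topic(model, comments_corpus[0])
print("dominant topic %d (p=%.2f)" % (topic_id, prob))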
Example #6
def train_LDA_model(data, num_topics, CPUs):
    # Helpers `sent_to_words`, `remove_stopwords`, `lemmatization`, and
    # `Convert`, plus the global `lda_data_dir`, are assumed to be defined
    # elsewhere in this module.

    # Pre-processing: split each document into sentences, then tokenize
    sentences = [nltk.tokenize.sent_tokenize(doc) for doc in data]
    sentences = [val for sublist in sentences for val in sublist]
    data_words = list(sent_to_words(sentences))

    # Remove Stop Words
    data_words_nostops = remove_stopwords(data_words)

    # Initialize spaCy English model, keeping only the tagger component (for efficiency)
    # python3 -m spacy download en  (newer spaCy versions name it 'en_core_web_sm')
    nlp = spacy.load('en', disable=['parser', 'ner'])

    # Do lemmatization keeping only noun, adj, vb, adv
    data_lemmatized = lemmatization(
        data_words_nostops, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

    # Create Dictionary
    id2word = corpora.Dictionary(data_lemmatized)

    # Create Corpus
    texts = data_lemmatized

    # Term Document Frequency
    corpus = [id2word.doc2bow(text) for text in texts]

    # ## Train LDA Model

    # Build LDA model
    lda_model = LdaMulticore(corpus=corpus,
                             id2word=id2word,
                             num_topics=num_topics,
                             random_state=50,
                             chunksize=100,
                             passes=10,
                             per_word_topics=True,
                             workers=CPUs)

    model_dest = lda_data_dir + 'LDA_model/all_years_2007_2017/lda_model_all_years.model'
    lda_model.save(model_dest)

    # Print the Keyword in the 10 topics
    pprint(lda_model.print_topics())
    doc_lda = lda_model[corpus]

    # Visualize the topics
    vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
    storage_dest_lda_html = lda_data_dir + 'LDA_model/all_years_2007_2017/all_years_2007_2017_local_lda.html'
    pyLDAvis.save_html(vis, storage_dest_lda_html)

    wordcloud_dest = lda_data_dir + 'LDA_model/all_years_2007_2017/wordclouds/'

    for t in range(lda_model.num_topics):
        plt.figure()
        dictionary = {}
        plt.imshow(WordCloud().fit_words(
            Convert(lda_model.show_topic(t, 30), dictionary)))
        plt.axis("off")
        plt.title("Topic_" + str(t))
        # Save before showing: with non-interactive backends, calling
        # plt.show() first can leave savefig with a blank figure
        plt.savefig(wordcloud_dest + "Topic #" + str(t) +
                    '.png')  # set location on server
        plt.show()

    return lda_model
Example #7
# Pick the most probable topic for each document;
# `tm_results` is assumed to come from an earlier cell.
corpus_topics = [
    sorted(topics, key=lambda record: -record[1])[0] for topics in tm_results
]

# In[ ]:

# Get the top 20 significant terms and their probabilities for each topic using ldamallet
topics = [[(term, round(wt, 3))
           for term, wt in ldamallet.show_topic(n, topn=20)]
          for n in range(0, ldamallet.num_topics)]

# In[ ]:

# Get the top 20 significant terms and their probabilities for each topic using LDA multicore
topics_ldamulticore = [[(term, round(wt, 3))
                        for term, wt in ldamulticore.show_topic(n, topn=20)]
                       for n in range(0, ldamulticore.num_topics)]

# In[ ]:

import pickle
from gensim.models import CoherenceModel

# Load previously trained models from disk
with open("\\Users\\hamed\\Desktop\\ECE 143 Project Data Files\\ldamodel_100_QAT.pkl", "rb") as fp:
    ldamodel = pickle.load(fp)
with open("\\Users\\hamed\\Desktop\\ECE 143 Project Data Files\\ldamulticore_100_QAT.pkl", "rb") as fp:
    ldamulticore = pickle.load(fp)
def LDA_model_train_out_of_time(df, features, num_topics=30, subset=True, CPUs=6):
    # Helpers `TimeBasedCV`, `prepare_LDA_text`, `Convert`, `zipdir`,
    # `create_dist_matrix`, and `create_h_clustering`, plus the globals
    # `lda_data_dir` and `main_dir`, are assumed to be defined elsewhere.

    dest_all_model = dict()

    X = df[features]

    tbcv = TimeBasedCV(train_period=3, test_period=1, freq='years')
    tbcv_folds = tbcv.split(df, validation_split_date=datetime.date(2008, 12, 31), date_column='sec_filing_date')
    k_folds = len(tbcv_folds)
    for k_index, (train_index, test_index) in enumerate(tbcv_folds):
        
        train_years_start = min(X.loc[train_index]['sec_filing_date']).year
        train_years_end = max(X.loc[train_index]['sec_filing_date']).year
        val_year = min(X.loc[test_index]['sec_filing_date']).year

        data_train = X.loc[train_index].drop('sec_filing_date', axis=1)
        data_val = X.loc[test_index].drop('sec_filing_date', axis=1)
        
        print("=========================================")
        print("==== K Fold Validation step => %d/%d ======" % (k_index+1, k_folds))
        print("=========================================")
        

        start = time.time()
        data_train = data_train.values.tolist() 
        data_train = [val for sublist in data_train for val in sublist]
        id2word_train, texts_train, corpus_train = prepare_LDA_text(data_train, subset = subset)
        end = time.time()
        print("Preparing training text took: " + str(end - start))
        
        start = time.time()
        data_val = data_val.values.tolist() 
        data_val = [val for sublist in data_val for val in sublist]
        id2word_val, texts_val, corpus_val = prepare_LDA_text(data_val, subset = subset)
        end = time.time()
        print("Preparing validation text took: " + str(end - start))
        


        #Train LDA on Training data
        start = time.time()
        lda_model_train = LdaMulticore(corpus=corpus_train,
                                       id2word=id2word_train,
                                       num_topics=num_topics,
                                       random_state=50,
                                       chunksize=100,
                                       passes=10,
                                       per_word_topics=True,
                                       workers=CPUs)
        
        doc_lda_train = lda_model_train[corpus_train]
        
        folder_train = str(train_years_start) + '_' + str(train_years_end)
        if not os.path.exists(lda_data_dir + 'LDA_model/' + folder_train + '/'):
            os.makedirs(lda_data_dir + 'LDA_model/' + folder_train + '/')
        dest_train = lda_data_dir + 'LDA_model/' + folder_train + '/' + 'lda_' + folder_train + '.model'

        lda_model_train.save(main_dir + dest_train)
        end = time.time()
        print("Train LDA on training data took: " + str(end - start))

        with open(lda_data_dir + 'LDA_model/' + folder_train + '/' + 'id2word.pkl', "wb") as fp:
            pickle.dump(id2word_train, fp)
        with open(lda_data_dir + 'LDA_model/' + folder_train + '/' + "texts.txt", "wb") as fp:
            pickle.dump(texts_train, fp)
        with open(lda_data_dir + 'LDA_model/' + folder_train + '/' + "corpus.txt", "wb") as fp:
            pickle.dump(corpus_train, fp)
        
        #Train LDA on Validation data
        start = time.time()
        lda_model_val = LdaMulticore(corpus=corpus_val,
                                     id2word=id2word_val,
                                     num_topics=num_topics,
                                     random_state=50,
                                     chunksize=100,
                                     passes=10,
                                     per_word_topics=True,
                                     workers=CPUs)
        
        doc_lda_val = lda_model_val[corpus_val]
        
        folder_val = str(val_year)
        if not os.path.exists(lda_data_dir + 'LDA_model/' + folder_val + '/'):
            os.makedirs(lda_data_dir + 'LDA_model/' + folder_val + '/')
        dest_val = lda_data_dir + 'LDA_model/' + folder_val + '/' + 'lda_' + folder_val + '.model'
        lda_model_val.save(main_dir + dest_val)        
        end = time.time()
        print("Train LDA on validation data took: " + str(end - start))
        
        with open(lda_data_dir + 'LDA_model/' + folder_val + '/' + 'id2word.pkl', "wb") as fp:
            pickle.dump(id2word_val, fp)
        with open(lda_data_dir + 'LDA_model/' + folder_val + '/' + "texts.txt", "wb") as fp:
            pickle.dump(texts_val, fp)
        with open(lda_data_dir + 'LDA_model/' + folder_val + '/' + "corpus.txt", "wb") as fp:
            pickle.dump(corpus_val, fp)
        
        dest_all_model[str(k_index+1)] = (dest_train, dest_val)

        #Create Visualization
        start = time.time()
        pyLDAvis.enable_notebook()
        vis_train = pyLDAvis.gensim.prepare(lda_model_train, corpus_train, id2word_train, sort_topics=False)
        dest_train_vs = lda_data_dir + 'LDA_model/' + folder_train + '/' + 'vis_' + folder_train + '.html'
        pyLDAvis.save_html(vis_train, dest_train_vs)
        end = time.time()
        print("Train LDA visualization took: " + str(end - start))

        
        start = time.time()
        pyLDAvis.enable_notebook()
        vis_val = pyLDAvis.gensim.prepare(lda_model_val, corpus_val, id2word_val, sort_topics=False)
        dest_train_val = lda_data_dir + 'LDA_model/' + folder_val + '/' + 'vis_' + folder_val + '.html'
        pyLDAvis.save_html(vis_val, dest_train_val)
        end = time.time()
        print("Validation LDA visualization took: " + str(end - start))

        # Create Word Clouds
        # Train
        for t in range(lda_model_train.num_topics):
            plt.figure()
            dictionary = {} 
            plt.imshow(WordCloud().fit_words(Convert(lda_model_train.show_topic(t, 30), dictionary)))
            plt.axis("off")
            plt.title("Topic_" + str(t + 1))
            plt.savefig("wordclouds/Topic #" + str(t + 1)+'.png') # set location on server
            plt.close()
        
        dest_train_zip = lda_data_dir + 'LDA_model/' + folder_train + '/' + 'wordclouds_' + folder_train + '.zip'
        zipf = zipfile.ZipFile(dest_train_zip, 'w', zipfile.ZIP_DEFLATED)
        zipdir('wordclouds/', zipf)
        zipf.close()

        # Val
        for t in range(lda_model_val.num_topics):
            plt.figure()
            dictionary = {} 
            plt.imshow(WordCloud().fit_words(Convert(lda_model_val.show_topic(t, 30), dictionary)))
            plt.axis("off")
            plt.title("Topic_" + str(t + 1))
            plt.savefig("wordclouds/Topic #" + str(t + 1) +'.png') # set location on server
            plt.close()
        
        dest_val_zip = lda_data_dir + 'LDA_model/' + folder_val + '/' + 'wordclouds_' + folder_val + '.zip'
        zipf = zipfile.ZipFile(dest_val_zip, 'w', zipfile.ZIP_DEFLATED)
        zipdir('wordclouds/', zipf)
        zipf.close()
        
        # Matching topics
        start = time.time()
        # Using the cosine distance requires customizing the 'LdaMulticore.diff'
        # method in the gensim package. To avoid errors, the code uses the
        # Jaccard distance, but this can be changed to cosine if needed.
        array_distance = create_dist_matrix(dest_train, dest_val, distance='jaccard', num_words=300, normed=True)
        title = 'Matching Topics (' + str(train_years_start) + '-' + str(train_years_end) + ' vs. ' + str(val_year) + ')' + '\n'
        location = lda_data_dir + 'LDA_model/matching_' + str(val_year) + '.png'
        create_h_clustering(array_distance, n_topics=num_topics, title=title, location=location)
        end = time.time()
        print("Matching topics took: " + str(end - start))

    return dest_all_model
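Driven end to end, the function expects a frame with a 'sec_filing_date' column plus the text features; in this sketch, `df` is such a frame and the 'filing_text' column name is an assumption:

features = ['sec_filing_date', 'filing_text']  # 'filing_text' is hypothetical
dest_all_model = LDA_model_train_out_of_time(df, features, num_topics=30,
                                             subset=True, CPUs=6)
for fold, (train_path, val_path) in dest_all_model.items():
    print(fold, train_path, val_path)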