def main():
    doc = get_doc()
    print('doc len:', len(doc))
    train_texts = list(build_texts(doc))
    print('train len:', len(train_texts))
    # bigram collocation detection
    bigram = gensim.models.Phrases(train_texts, min_count=10)
    # nltk stopwords list
    stops = set(stopwords.words('english'))
    train_texts = process_texts(train_texts, bigram, stops)
    print('bigramed train_texts', len(train_texts))
    vocabulary = Dictionary(train_texts)
    print('vocab size:', len(vocabulary))
    # remove extremes: drop words appearing in fewer than 3 documents
    # or in more than 30% of the documents
    vocabulary.filter_extremes(no_below=3, no_above=0.3)
    # vocabulary.filter_n_most_frequent(50)  # alternatively, drop the 50 most common tokens
    # filter_tokens(bad_ids=None, good_ids=None)
    corpus = [vocabulary.doc2bow(text) for text in train_texts]
    print('corpus size:', len(corpus))
    lda = LdaModel(corpus=corpus,
                   id2word=vocabulary,
                   num_topics=10,
                   chunksize=1500,
                   iterations=200,
                   alpha='auto')
    print(pd.DataFrame(
        [[word for rank, (word, prob) in enumerate(words)]
         for topic_id, words in lda.show_topics(formatted=False,
                                                num_words=6,
                                                num_topics=35)]))
def lda(docs, k):
    """Latent Dirichlet allocation topic model.

    Uses Gensim's LdaModel after tokenizing using scikit-learn's
    TfidfVectorizer.

    Parameters
    ----------
    k : integer
        Number of topics.
    """
    from gensim.matutils import Sparse2Corpus
    from gensim.models import LdaModel

    # Use a scikit-learn vectorizer rather than Gensim's equivalent
    # for speed and consistency with LSA and k-means.
    vect = _vectorizer()
    corpus = vect.fit_transform(fetch(d) for d in docs)
    corpus = Sparse2Corpus(corpus)
    model = LdaModel(corpus=corpus, num_topics=k)
    topics = model.show_topics(formatted=False)
    vocab = vect.get_feature_names()
    # return [(vocab[int(idx)], w) for topic in topics for w, idx in topic]
    return [[(vocab[int(idx)], w) for w, idx in topic] for topic in topics]
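A hypothetical way to exercise the snippet above, assuming `_vectorizer()` returns a scikit-learn text vectorizer and `fetch(d)` returns a document's raw text (both are project helpers not shown here); it also assumes an older Gensim/scikit-learn where `show_topics(formatted=False)` yields `(weight, word_id)` pairs and `get_feature_names()` is still available:

# Illustrative stand-ins for the project's own helpers (assumptions, not the originals).
from sklearn.feature_extraction.text import TfidfVectorizer

def _vectorizer():
    return TfidfVectorizer(stop_words='english')

def fetch(d):
    return d  # here each "document" is already a raw string

sample_docs = [
    "topic models find latent themes in text",
    "word embeddings capture similarity between words",
    "latent dirichlet allocation is a probabilistic topic model",
]
for topic in lda(sample_docs, k=2):
    print(topic[:5])  # top (word, weight) pairs for each topic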
def get(self, s, e):
    # Load our data without any treatment
    dataObject = getDatas()
    data = dataObject.get()
    dataEpisode = dataObject.getDataEpisode(data, s, e)

    # Preprocess the words of the episode
    tokenEpisode = []
    tokenEpisode.append(
        [token for token in self.preprocessEpisode(dataEpisode)])
    dictionnaryEpisode = Dictionary(tokenEpisode)

    # Create our model corpus
    model_corpus = []
    for episode in tokenEpisode:
        model_corpus.append(dictionnaryEpisode.doc2bow(episode))

    # Create our list of topics with the LDA model
    topicsList = []
    string = "Voici les sujets recurrents pour l'episode " + e + " de la saison " + s
    topicsList.append(string)
    lda_model = LdaModel(
        corpus=model_corpus,
        id2word=dictionnaryEpisode,
        num_topics=3
    )  # We choose to keep only the 3 most significant topics
    for topic_id, topic_keywords in lda_model.show_topics(formatted=False):
        string = "=== Pour le sujet au mot cle principal '" + str(
            lda_model.show_topic(topic_id, topn=1)[0][0]) + "', les mots clefs representatifs sont ==="
        topicsList.append(string)
        # Browse the keywords of each topic
        for keyword in topic_keywords:
            string = "-> " + str(keyword[0]) + " (" + str(keyword[1]) + ")"
            topicsList.append(string)

    # Return our list of topics
    return topicsList
def get_topic(text): np.random.seed(100) nlp = spacy.load('en') my_stop_words = [ u'say', u'\'s', u'Mr', u'be', u'said', u'says', u'saying', u'get' ] for stopword in my_stop_words: lexeme = nlp.vocab[stopword] lexeme.is_stop = True doc = nlp(text) article = [] texts = [] for w in doc: # if it's not a stop word or punctuation mark, add it to our article! if w.text != '\n' and not w.is_stop and not w.is_punct and not w.like_num: # we add the lematized version of the word article.append(w.lemma_) texts.append(article) # getting bigrams out of words using gensim bigram = gensim.models.Phrases(texts) texts = [bigram[line] for line in texts] # Creating corpus with our words dictionary = Dictionary(texts) corpus = [dictionary.doc2bow(i) for i in texts] # Applying LDA and LSI models lsimodel = LsiModel(corpus=corpus, num_topics=10, id2word=dictionary) ldamodel = LdaModel(corpus=corpus, num_topics=10, id2word=dictionary) lsitopics = [[word for word, prob in topic] for topicid, topic in lsimodel.show_topics(formatted=False)] ldatopics = [[word for word, prob in topic] for topicid, topic in ldamodel.show_topics(formatted=False)] topics = [] for i in ldatopics: topics.append(i[0]) tags = nltk.pos_tag(topics) # removing verbs as generally nouns are topics lfinaltopics = [ word for word, pos in tags if pos != 'VB' and pos != 'VBD' and pos != 'VBN' and pos != 'VBP' and pos != 'VBZ' and pos != 'VBG' and pos != 'JJ' and pos != 'RB' ] ldafinaltopics = list(set(lfinaltopics)) lstopics = [] for i in lsitopics: for j in i: lstopics.append(j) ltags = nltk.pos_tag(lstopics) lsifinaltopics = [ word for word, pos in ltags if pos != 'VB' and pos != 'VBD' and pos != 'VBN' and pos != 'VBP' and pos != 'VBZ' and pos != 'VBG' and pos != 'RB' and pos != 'JJ' ] # Intersection of results from both models finaltopics = list(set(ldafinaltopics) & set(lsifinaltopics)) final_topics = [] for i in finaltopics: if len(i) >= 2: final_topics.append(i) return final_topics
def ret_top_model(corpus):
    """
    Since LDAmodel is a probabilistic model, it comes up with different topics
    each time we run it. To control the quality of the topic model we produce,
    we can see what the interpretability of the best topic is and keep
    evaluating the topic model until this threshold is crossed.

    Returns:
    -------
    lm: Final evaluated topic model
    top_topics: ranked topics in decreasing order. List of tuples
    """
    top_topics = [(0, 0)]
    rounds = 1
    high = 0.0
    out_lm = None
    # while top_topics[0][1] < 0.97 and rounds < 2:  # 0.97
    while True:
        lm = LdaModel(corpus=corpus,
                      num_topics=20,
                      id2word=dictionary,
                      minimum_probability=0)
        coherence_values = {}
        for n, topic in lm.show_topics(num_topics=-1, formatted=False):
            topic = [word for word, _ in topic]
            cm = CoherenceModel(topics=[topic],
                                texts=train_texts,
                                dictionary=dictionary,
                                window_size=10)
            coherence_values[n] = cm.get_coherence()
        top_topics = sorted(coherence_values.items(),
                            key=operator.itemgetter(1),
                            reverse=True)
        if high < top_topics[0][1]:
            high = top_topics[0][1]
            out_lm = lm
        print('round ', rounds, ':', top_topics[0][1])
        if rounds > 2:
            break
        rounds += 1
    return out_lm, top_topics, high
def topic_analysis(df, nTopics=5, cleanTextCol='cleaned_text'): df[cleanTextCol]=df[cleanTextCol].fillna('') cleandata = df[cleanTextCol].fillna('').apply(lambda x: x.split(' ')) dictionary = corpora.Dictionary(cleandata) tokens = [dictionary.doc2bow(d) for d in cleandata] model = LdaModel(tokens, num_topics=nTopics, id2word=dictionary, update_every=1, chunksize=50, passes=10, per_word_topics=True, alpha='auto') docweights = [model.get_document_topics(t, minimum_probability=0) for t in tokens] doctopics = pd.DataFrame(docweights).apply(lambda x: x.apply(lambda y: y[-1] if y else 0)) doctopics.columns = [f'topic{n+1}' for n in doctopics.columns] doctopics['KeyTopic']=doctopics.apply(lambda y:doctopics.columns[y==y.max()][0], axis=1) # create topicdescribe topics = model.show_topics(num_words=6) keywords = [re.findall(r'\*"(.*?)"',d[1]) for d in topics] weights = [re.findall(r'([\d\.]+)\*', d[1]) for d in topics] kwdf= pd.DataFrame(keywords, columns=[f'keyword_{n}' for n in range(len(keywords[0]))]) wtdf= pd.DataFrame(weights, columns=[f'weight_{n}' for n in range(len(weights[0]))]) topicDescribe = kwdf.merge(wtdf,left_index=True, right_index=True) topicDescribe[sorted(topicDescribe.columns, key=lambda x:x.split('_')[-1])] topicDescribe['KeyTopic'] = [f'topic{n+1}' for n in range(len(topics))] topicDescribe['TopicKeywords'] = [' '.join(k) for k in keywords] topicDescribe['DocCount'] = doctopics['KeyTopic'].value_counts().sort_index().values topicDescribe = topicDescribe[['KeyTopic']+[col for col in topicDescribe.columns if \ col != 'KeyTopic']] doctopics= doctopics.merge(topicDescribe[['KeyTopic','TopicKeywords']], on='KeyTopic', how='left') return doctopics, topicDescribe, model, tokens, dictionary
def ret_top_model(threshold, corpus, dictionary, texts):
    """
    Since LDAmodel is a probabilistic model, it comes up with different topics
    each time we run it. To control the quality of the topic model we produce,
    we can see what the interpretability of the best topic is and keep
    evaluating the topic model until this threshold is crossed.

    Returns:
    -------
    lm: Final evaluated topic model
    top_topics: ranked topics in decreasing order. List of tuples
    """
    top_topics = [(0, 0)]
    while top_topics[0][1] < threshold:
        lm = LdaModel(corpus=corpus, id2word=dictionary)
        coherence_values = {}
        for n, topic in lm.show_topics(num_topics=-1, formatted=False):
            topic = [word for word, _ in topic]
            cm = CoherenceModel(topics=[topic],
                                texts=texts,
                                dictionary=dictionary,
                                window_size=10)
            coherence_values[n] = cm.get_coherence()
        top_topics = sorted(coherence_values.items(),
                            key=operator.itemgetter(1),
                            reverse=True)
    return lm, top_topics
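A minimal, illustrative call of `ret_top_model` (the token lists and the 0.5 threshold below are made-up inputs, not part of the original code):

from gensim.corpora import Dictionary

texts = [
    ['human', 'machine', 'interface', 'computer'],
    ['graph', 'trees', 'minors', 'survey'],
    ['user', 'interface', 'response', 'time', 'computer'],
]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# Re-train until the best topic's coherence clears the threshold.
lm, top_topics = ret_top_model(0.5, corpus, dictionary, texts)
print(top_topics[:3])  # [(topic_id, coherence), ...] in decreasing order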
def word_cloud(self, model: LdaModel, stopwords_path, save_path):
    with open(stopwords_path, 'r', encoding='utf8') as f:
        words = f.readlines()
    stopwords = add_stop_words(words)
    print('stop words added')
    word_cloud = PersianWordCloud(only_persian=True,
                                  max_words=10,
                                  stopwords=stopwords,
                                  width=800,
                                  height=800,
                                  background_color='black',
                                  min_font_size=1,
                                  max_font_size=300)
    topics = model.show_topics(formatted=False)
    for i, topic in enumerate(topics):
        topic_words = dict(topic[1])
        print(topic_words)
        new = {}
        for word in topic_words.keys():
            reshaped = get_display(arabic_reshaper.reshape(word))
            new[reshaped] = topic_words[word]
        print(new)
        word_cloud.generate_from_frequencies(new)
        image = word_cloud.to_image()
        image.show()
        s = save_path + '_topic_' + str(i) + '.png'
        print(s)
        image.save(s)
def chosen_lda(corpus, dictionary, data, n_topics, alpha=.1, eta=0.01):
    '''
    This function trains a Gensim LDA model on chosen hyperparameters

    Arguments:
    ----------
    corpus : matrix-format corpus (BOW or TF-IDF)
    dictionary : corpus-related dictionary
    data : text data for coherence score computation
    n_topics : number of desired topics
    alpha : alpha parameter (from 0 to infinity)
    eta : beta parameter (from 0 to infinity)

    Outputs:
    ----------
    lda : trained model
    '''
    lda = LdaModel(corpus=corpus,
                   id2word=dictionary,
                   num_topics=n_topics,
                   random_state=100,
                   alpha=alpha,
                   eta=eta)
    ldatopics = [[word for word, prob in topic]
                 for topicid, topic in lda.show_topics(formatted=False)]
    lda_coherence = CoherenceModel(topics=ldatopics,
                                   texts=data,
                                   dictionary=dictionary,
                                   window_size=10).get_coherence()
    print(lda_coherence)
    lda.print_topics(num_topics=n_topics)
    lda.save('../03_Dump/model')
    return lda
def do_cluster(obj, query):
    texts = [article['title'] for article in obj]
    processor = Processor(query)
    tokens = [processor.get_tokens(text) for text in texts]
    dictionary = corpora.Dictionary(tokens)
    corpus = [dictionary.doc2bow(token) for token in tokens]
    # roughly one cluster per five articles (num_topics must be an integer)
    num_clusters = len(texts) // 5
    model = LdaModel(corpus,
                     num_topics=num_clusters,
                     id2word=dictionary,
                     update_every=5,
                     chunksize=10000,
                     passes=50)  # size 10
    topic_matrix = model.show_topics(formatted=False, num_topics=num_clusters)
    clusters = [{
        "keywords": [str(word) for word, _ in topic[1]],
        "articles": []
    } for topic in topic_matrix]
    for i, document in enumerate(corpus):
        topic = np.array(model.get_document_topics(document))
        cluster = int(topic[np.argmax(topic[:, 1])][0])
        clusters[cluster]['articles'].append(obj[i])
    return clusters
def comparison(texts):
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    lsimodel = LsiModel(corpus=corpus, num_topics=2, id2word=dictionary)
    print('LSI Model output')
    print(lsimodel.show_topics())

    hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)
    print('hdp model output')
    print(hdpmodel.show_topics())

    ldamodel = LdaModel(corpus=corpus, num_topics=2, id2word=dictionary)
    print('LDA Model output')
    print(ldamodel.show_topics())

    pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)

    lsitopics = [[word for word, prob in topic]
                 for topicid, topic in lsimodel.show_topics(formatted=False)]
    hdptopics = [[word for word, prob in topic]
                 for topicid, topic in hdpmodel.show_topics(formatted=False)]
    ldatopics = [[word for word, prob in topic]
                 for topicid, topic in ldamodel.show_topics(formatted=False)]

    lsi_coherence = CoherenceModel(topics=lsitopics[:10],
                                   texts=texts,
                                   dictionary=dictionary,
                                   window_size=10).get_coherence()
    hdp_coherence = CoherenceModel(topics=hdptopics[:10],
                                   texts=texts,
                                   dictionary=dictionary,
                                   window_size=10).get_coherence()
    lda_coherence = CoherenceModel(topics=ldatopics,
                                   texts=texts,
                                   dictionary=dictionary,
                                   window_size=10).get_coherence()

    def evaluate_bar_graph(coherences, indices):
        assert len(coherences) == len(indices)
        n = len(coherences)
        x = np.arange(n)
        plt.bar(x, coherences, width=0.2, tick_label=indices, align='center')
        plt.xlabel('Models')
        plt.ylabel('Coherence Value')
        plt.show()

    evaluate_bar_graph([lsi_coherence, hdp_coherence, lda_coherence],
                       ['LSI', 'HDP', 'LDA'])
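A small driver for `comparison` (the documents are illustrative; any list of token lists will do):

docs = [
    "the cat sat on the mat with another cat",
    "dogs and cats are popular pets",
    "stock markets fell sharply on monday",
    "investors worry about rising interest rates",
]
texts = [doc.lower().split() for doc in docs]
comparison(texts)  # prints each model's topics and plots their coherence scores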
def LDA_gensim(corpus, num_topics, id2word, passes, iterations, coherence):
    '''
    LATENT DIRICHLET ALLOCATION
    Generative model that assumes each document is a mixture of topics,
    and each topic is a mixture of words.
    '''
    print('Latent Dirichlet Allocation')

    # id2word = id2word.id2token  # make an index-to-word dictionary
    lda_model = LdaModel(corpus=corpus,
                         num_topics=num_topics,
                         id2word=id2word,
                         passes=passes,
                         iterations=iterations)
    lda_topics = lda_model.show_topics(formatted=False)

    # compute coherence score (text_input is expected to be defined at module level)
    coherence_model = CoherenceModel(model=lda_model,
                                     texts=text_input,
                                     dictionary=id2word,
                                     coherence=coherence)
    coherence_lda = coherence_model.get_coherence()
    print('\nCoherence Score: ', coherence_lda)
    return lda_model
class LDA(GenericModel): """ Wrapper for Gensim LdaModel and LdaMulticore """ def __init__(self, *args, **kwargs): """ All provided arguments will be passed to LdaModel or LdaMulticore constructors (the latter in case 'workers' is present in keyword arguments) :param args: positional arguments to initialize model with :param kwargs: keyword arguments to pass to model constructor """ if 'workers' in kwargs.keys(): self.__model__ = LdaMulticore(*args, **kwargs) else: self.__model__ = LdaModel(*args, **kwargs) def fit(self, data: Any, *args, **kwargs): # Actually, I think there is no need for this as # we can simply use update() for uninitialized model self.__model__.update(corpus=data, *args, **kwargs) def update(self, data: Any, *args, **kwargs): self.__model__.update(corpus=data, *args, **kwargs) def get_topics(self, docs: Optional[Iterable[Any]] = None, *args, **kwargs): if docs is None: topics = self.__model__.show_topics(formatted=False, *args, **kwargs) else: topics = map( partial(self.__model__.get_document_topics, per_word_topics=True), docs) topics, t_copy, t_copy_1 = tee(topics, 3) ids = map(lambda x: x[0], topics) words = map(lambda x: x[1], t_copy) words = map(lambda x: list(zip(*x))[0], words) scores = map(lambda x: x[1], t_copy_1) scores = map(lambda x: list(zip(*x))[1], scores) topics = zip(ids, zip(words, scores)) return topics
def topicModeling(corpus, dictionary, texts): ldamodel = LdaModel(corpus=corpus, num_topics=3, id2word=dictionary, passes=5) x = ldamodel.show_topics() #show generated topics #---------------------------------------------------------- sent_topics_df = pd.DataFrame() # Get main topic in each document for i, row in enumerate(ldamodel[corpus]): row = sorted(row, key=lambda x: (x[1]), reverse=True) # Get the Dominant topic, Perc Contribution and Keywords for each document for j, (topic_num, prop_topic) in enumerate(row): if j == 0: # => dominant topic wp = ldamodel.show_topic(topic_num) topic_keywords = ", ".join([word for word, prop in wp]) sent_topics_df = sent_topics_df.append(pd.Series( [int(topic_num), round(prop_topic, 4), topic_keywords]), ignore_index=True) else: break sent_topics_df.columns = [ 'Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords' ] # Add original text to the end of the output contents = pd.Series(texts) sent_topics_df = pd.concat([sent_topics_df, contents], axis=1) #-------Generate Visualization------------------------------ pyLDAvis.enable_notebook() topicModel = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary) pyLDAvis.save_html( topicModel, '/Users/[email protected]/Documents/projects/PEM/elon.html') pyLDAvis.show(topicModel) return x, sent_topics_df
def runModels(self, number_of_topics, corpus, dictionary, start, end): #do hdp model hdpmodel = HdpModel(corpus=corpus, id2word=dictionary) hdpmodel.print_topics(num_topics=int(number_of_topics), num_words=10) hdptopics = hdpmodel.show_topics(num_topics=int(number_of_topics)) # result_dict=addTotalTermResults(hdptopics) #add results to total kept in a list # addToResults(result_dict) #output results self.printResults(number_of_topics, hdptopics, 'hdp', start, end) #d lda model ldamodel = LdaModel(corpus=corpus, num_topics=number_of_topics, id2word=dictionary, random_state=100, update_every=1, chunksize=100, passes=10, alpha='auto', per_word_topics=True) ldamodel.save('lda' + number_of_topics + '.model') ldatopics = ldamodel.show_topics(num_topics=int(number_of_topics)) # result_dict=addTotalTermResults(ldatopics) # addToResults(result_dict) self.printResults(number_of_topics, ldatopics, 'lda', start, end) visualisation = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary) location = os.path.join(pn, 'topic_model_results') #visualize outputs in html pyLDAvis.save_html( visualisation, os.path.join( location, 'LDA_Visualization' + str(number_of_topics) + "_" + start + "_" + end + '.html'))
def prepare_twitter_data(data_file, type_of_analysis): labels = [] text_fake, text_normal = '', '' df = pd.read_csv(data_file, sep='|', encoding='utf-8', keep_default_na=False) print('removing duplicates') df = utils.remove_duplicates(df) print('getting preprocessed train articles') idx = 0 for key, item in enumerate(df['article_text']): idx += 1 if df['is_fake'].values[key] == 1: text_fake += get_preprocessed_text(item) labels.append('FAKE') else: text_normal += get_preprocessed_text(item) labels.append('NOT_FAKE') if idx % 100 == 0: print('got {} of {} preprocessed train articles'.format(idx, len(df))) print('Finished gathering train text items') train = pd.DataFrame() train['data'] = df[type_of_analysis] train['labels'] = df['is_fake'] # TOPIC MODELLING nlp = German() stop_words = get_stop_words('de') stop_words.append('foto') stop_words.append('⬅') for stopword in stop_words: lexeme = nlp.vocab[stopword] lexeme.is_stop = True texts = get_spacy_corpus(train['data'], nlp, logging=True, topic_modelling=True) bigram = gensim.models.Phrases(texts) texts = [bigram[line] for line in texts] dictionary = Dictionary(texts) corpus = [dictionary.doc2bow(text) for text in texts] ldamodel = LdaModel(corpus=corpus, num_topics=10, id2word=dictionary) print(ldamodel.show_topics()) return do_create_twitter(train, None)
def BasicLDA(doclist, num_topics):
    start = time.clock()
    num_topics = num_topics
    texts = clean(doclist)
    print(texts[1])
    # frequency = {}
    # for text in texts:
    #     for token in text:
    #         if token not in frequency:
    #             frequency[token] = 0
    #         else:
    #             frequency[token] += 1
    dictionary = corpora.Dictionary(texts)
    size_dictionary = len(dictionary)
    corpus = [dictionary.doc2bow(text) for text in texts]
    lda = LdaModel(corpus=corpus,
                   id2word=dictionary,
                   num_topics=num_topics,
                   chunksize=500,
                   passes=10,
                   iterations=100)
    topics = []
    for i in lda.show_topics(num_topics=-1, num_words=20):
        print(i)
        topics.append(i)
    for i in lda.get_document_topics(corpus):  # i is each document's topic distribution, in bag-of-words order
        s = str(i)
        pattern1 = r'\((\d+),'
        a = re.findall(pattern1, s)
        print(a)  # the topic ids contained in this document
        word_list = []  # all topics contained in the current document
        for idx in a:  # take the topic id
            w = topics[int(idx)]  # take that topic's word distribution
            word_list.append(w)  # store the word distributions in topic-id order
        l = [list(k)[1] for k in i]  # list(k)[1] is the probability of each topic
        doc2top = {}
        for num in range(len(l)):
            doc2top[l[num]] = word_list[num]
        print(doc2top)
        break
        # print(list(chain.from_iterable(zip(l, word_list))))
    elapsed = time.clock() - start
    return lda, corpus, dictionary, size_dictionary, elapsed
def build_topic(name,jsoname,contentField="text",usetfidf=False,output=False,plot=True,filetype='csv',sfilename=None): data_JLM=None if sfilename: with open(sfilename) as f:fr_stop = [i.split('\n')[0] for i in f.readlines()] else:fr_stop = get_stop_words('fr') try: with open(jsoname) as json_data: data_JLM = json.load(json_data) docs=data_JLM["tweets"] dictionary = gensim.corpora.Dictionary(docs) corpus=[dictionary.doc2bow(doc) for doc in docs] except: docs=[] corpus=[] dictionary = gensim.corpora.Dictionary(docs) if filetype=='csv':data = pd.read_csv(name,sep=';')[contentField]# else:exit() tagger=treetaggerwrapper.TreeTagger(TAGLANG='fr',TAGDIR="../ouest-france2/TreeTagger") for text in data: try:tags=tagger.tag_text(text) except:continue doc=[] for i in tags: tmp=i.split("\t") if len(tmp)<3 or len(tmp[0])<4:continue if tmp[1] in ["NOM","VER"] and tmp[-1] not in fr_stop and "’" not in tmp[-1] :doc+=[tmp[-1]] if "cantine" in doc and "porc" in doc:docs+=[doc] dic=dictionary.doc2bow(doc,allow_update=True) corpus+=[dic] with open(jsoname, 'w') as outfile: json.dump({"tweets":docs}, outfile) if usetfidf: tfidf = gensim.models.TfidfModel(corpus) corpus=tfidf[corpus] #kmeanTest(docs,10) ldamodel=LdaModel(corpus=corpus, id2word=dictionary,num_topics=5) if output:print(ldamodel.show_topics(num_topics=-1, num_words=10)) if plot: data = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary) pyLDAvis.show(data)
def topicsLDA_gensim(self, num_topics=10, num_words=10, num_iterations=2000,
                     chunksize=20000, decay=0.5):
    lda = LdaModel(corpus=self.corpus,
                   num_topics=num_topics,
                   id2word=self.id2word,
                   chunksize=chunksize,
                   iterations=num_iterations,
                   alpha='auto',
                   eta='auto',
                   decay=decay)

    # documents for each topic
    if self.doc2class:
        doc_idx = 0
        for line in lda[self.corpus]:
            # get topic with maximum percentage
            if line:
                topic_idx = max(line, key=lambda item: item[1])[0]
            else:
                # if there is no topic, assign a random one
                topic_idx = random.randint(0, num_topics - 1)
            # make the dictionary
            if self.doc2topicLDA_gensim.get(self.doc2class[doc_idx]) is None:
                self.doc2topicLDA_gensim[self.doc2class[doc_idx]] = {}
                for i in range(0, num_topics):
                    self.doc2topicLDA_gensim[self.doc2class[doc_idx]][i] = 0
            self.doc2topicLDA_gensim[self.doc2class[doc_idx]][topic_idx] += 1
            doc_idx += 1
        print(self.doc2topicLDA_gensim)

    # return topics
    return lda.show_topics(num_topics=num_topics,
                           num_words=num_words,
                           formatted=False)
def ret_top_model():
    top_topics = [(0, 0)]
    rounds = 1
    high = 0.0
    out_lm = None
    # while top_topics[0][1] < 0.97 and rounds < 2:  # 0.97
    while True:
        lm = LdaModel(corpus=corpus, num_topics=20, id2word=dictionary)
        coherence_values = {}
        for n, topic in lm.show_topics(num_topics=-1, formatted=False):
            topic = [word for word, _ in topic]
            cm = CoherenceModel(topics=[topic],
                                texts=train_texts,
                                dictionary=dictionary,
                                window_size=10)
            coherence_values[n] = cm.get_coherence()
        top_topics = sorted(coherence_values.items(),
                            key=operator.itemgetter(1),
                            reverse=True)
        if high < top_topics[0][1]:
            high = top_topics[0][1]
            out_lm = lm
        print('round ', rounds, ':', top_topics[0][1])
        if rounds > 2:
            break
        rounds += 1
    return out_lm, top_topics, high
def test_gm_lda():
    test_file_path = "training_data/positive_sent_sample"
    text_file_path = "training_data/seg_sent_without_label"
    stop_words_file_path = "filter_words/stop_words"
    text_file = codecs.open(test_file_path, "r", encoding='utf8')
    stop_words_file = codecs.open(stop_words_file_path, 'r', encoding='utf8')
    list = []
    stop_words = []
    for stop_w in stop_words_file:
        stop_words.append(stop_w.replace("\r\n", ""))
    for line in text_file:
        line = remove_stop_words(line, stop_words)
        list.append(line.replace(" \r\n", "").split(" "))
    '''
    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    tf_idf = transformer.fit_transform(vectorizer.fit_transform(list))
    words = vectorizer.get_feature_names()
    weight = tf_idf.toarray()
    '''
    dic = corpora.Dictionary(list)  # build the document dictionary; each word maps to an index
    corpus = [dic.doc2bow(text) for text in list]  # word counts converted to vector-space format
    ids = dic.token2id
    lda = LdaModel(
        corpus=corpus,
        id2word=dic,
        num_topics=20,
        alpha='auto',
    )
    result_list = lda.show_topics(num_topics=20, num_words=30)
    for tup in result_list:
        print(tup[1])
        print("============================")
def find_topic():
    """
    LdaModel params
        passes: Number of passes through the entire corpus
        chunk_size: how many documents to load into memory
        update_every: number of chunks to process prior to moving onto the M step of EM
    """
    with gzip.open(config['fun2vec']['corpus'], 'rb') as f:
        words = pickle.load(f)
    # build the dictionary
    dictionary = corpora.Dictionary(words)
    dictionary.filter_extremes(no_below=30, no_above=0.3)
    # build the corpus
    corpus = [dictionary.doc2bow(_words) for _words in words]
    # corpora.MmCorpus.serialize('cop.mm', corpus)
    lda = LdaModel(corpus, num_topics=10, chunksize=10000,
                   update_every=2, id2word=dictionary)
    lda.save(config['topic_model'])
    pprint(lda.show_topics(num_words=20))
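Once `find_topic` has saved the model, it can be reloaded and inspected later; a minimal sketch using the same `config` mapping as above:

from gensim.models import LdaModel

lda = LdaModel.load(config['topic_model'])
for topic_id, words in lda.show_topics(num_topics=10, num_words=20, formatted=False):
    print(topic_id, [word for word, _ in words])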
class LdaWord2VecModel:
    def __init__(self, corpus, w2v_size=100, topics=100, w2v_path='', lda_path=''):
        """initialize LdaWord2VecModel

        initialize and train the LdaWord2VecModel according to the args.

        Args:
            corpus: the corpus used to train, a sequence of sequences of words.
            w2v_size: size of word vector, 100 by default.
            topics: num_topics of LDA model, 100 by default.
            w2v_path: the path to load or save the word vector model, '' by default,
                which means not loading or saving.
            lda_path: the path to load or save the LDA model, '' by default,
                which means not loading or saving.
        """
        # keep the constructor arguments
        self.topics = topics
        self.w2v_size = w2v_size
        # train or load the word-vector model
        if w2v_path != '':
            if os.path.exists(w2v_path):
                self.w2v_model = Word2Vec.load(w2v_path)
            else:
                self.w2v_model = Word2Vec(corpus, size=w2v_size)
                self.w2v_model.save(w2v_path)
        else:
            self.w2v_model = Word2Vec(corpus, size=w2v_size)
        # train or load the LDA model
        if lda_path != '':
            if os.path.exists(lda_path):
                self.lda_model = LdaModel.load(lda_path)
            else:
                word_dict = Dictionary(corpus)
                bow_corpus = self.BowCorpus(word_dict, corpus)
                self.lda_model = LdaModel(bow_corpus, id2word=word_dict, num_topics=topics)
                self.lda_model.save(lda_path)
        else:
            word_dict = Dictionary(corpus)
            bow_corpus = self.BowCorpus(word_dict, corpus)
            self.lda_model = LdaModel(bow_corpus, id2word=word_dict, num_topics=topics)
        # compute the topic vectors as weighted sums of word vectors
        topic_bow = self.lda_model.show_topics(num_topics=-1)
        self.topic_vecs = []
        for topic in topic_bow:
            vec = np.zeros(w2v_size, dtype=float)
            for word_tuple in topic[1].split(' + '):
                weight, word = word_tuple.split('*')
                if word[1:-1] in self.w2v_model.wv:
                    vec += self.w2v_model.wv[word[1:-1]] * float(weight)
            self.topic_vecs.append(vec)

    def get_topics(self, topics=10, words=10):
        return self.lda_model.show_topics(num_topics=topics, num_words=words)

    def get_topic_vecs(self):
        return self.topic_vecs

    def get_word_vecs(self):
        return self.w2v_model.wv

    def predict(self, doc):
        # compute the document vector
        doc_vec = np.zeros(self.w2v_size, dtype=float)
        for sent in doc:
            for word in sent:
                if word in self.w2v_model.wv:
                    doc_vec += self.w2v_model.wv[word]
        # find the topic vector with the highest cosine similarity
        topic = -1
        cos_max = 0
        for i in range(len(self.topic_vecs)):
            cos = np.dot(doc_vec, self.topic_vecs[i]) / np.sqrt(
                sum(doc_vec**2) * sum(self.topic_vecs[i]**2))
            if cos >= cos_max:
                topic = i
                cos_max = cos
        return topic

    class BowCorpus:
        def __init__(self, word_dict, corpus):
            self.word_dict = word_dict
            self.corpus = corpus

        def __iter__(self):
            for doc in self.corpus:
                yield self.word_dict.doc2bow(doc)
eval_every=1, chunksize=4000, passes=20, iterations=100, alpha='auto', eta='auto', random_state=42) print("Run for %d topics in %.2f mins" % (num_topics, (perf_counter() - start) / 60)) start = perf_counter() # pprint(lda_model.print_topics(num_topics=num_topics, num_words=num_keywords)) print("\n\n") topics_shown = lda_model.show_topics(num_topics=num_topics, num_words=num_keywords, formatted=False) # print(topics_shown) word_freq = {} word_sum = {} topics_to_save = {} for num, rep in topics_shown: if num not in topics_to_save: topics_to_save[num] = {} for word, freq in rep: if word not in word_freq: word_freq[word] = 0 word_freq[word] = word_freq[word] + 1 if word not in word_sum: word_sum[word] = 0 word_sum[word] = word_sum[word] + freq
def return_suggested_articles(request): """ returns suggested articles based on topic of one currently being viewed Parameters ---------- request : request (flask.Request): The request object Returns ------- JSON of google search queries for articles to read """ # get the requested json for the webpage request_json = request.get_json(silent=True) # get the headline and article headline = request_json['headline'] article = request_json['article'] print('requested json headline and article text') # make into one text file combined_article = headline + '. ' + article # set to 1 for single doc lda, 0 for tfidf do_single_document_LDA = 1 # number of query words to return n_search_words = 5 # can identify ngrams, but slows down performance do_ngrams = 1 ### SINGLE DOC LDA PARAMS # set the number of topics to generate (5 seems to work pretty well) num_lda_topics = 5 # set the number of passes n_passes = 10 # if avoiding repeated words (only relevant if num_lda_topics > 1) do_unique_search_words = 0 print('Downloading stop words') # download stopwords list # if use_bucket: download_blob('debiaser_data', 'sw1k.csv', '/tmp/sw1k.csv') # load stop words into pandas and then into list stop_words = pd.read_csv('/tmp/sw1k.csv') # remove from memory os.remove('/tmp/sw1k.csv') stop_words = stop_words['term'] stop_words = [word for word in stop_words] # # adding some custom words stop_words.append('said') stop_words.append('youre') stop_words.append('mph') stop_words.append('inc') stop_words.append('cov') stop_words.append('jr') stop_words.append('dr') stop_words.append('ads') stop_words.append('cookies') stop_words.append('factset') print('Downloading news organizations from AllSidesMedia') # download all_sides_media list # if use_bucket: download_blob('debiaser_data', 'allsides_final_plus_others_with_domains.csv', '/tmp/allsides_final_plus_others_with_domains.csv') # load domain names into dataframe and then get only names and all_sides = pd.read_csv('/tmp/allsides_final_plus_others_with_domains.csv') # remove from memory os.remove('/tmp/allsides_final_plus_others_with_domains.csv') # get the domain # all_sides_names = all_sides['name'] all_sides_domains = all_sides['domain'] # all_sides_names_domains = pd.concat([all_sides_names,all_sides_domains],axis=1) # get dictionary of entities in article # entity_dict = entity_recognizer(combined_article,nlp) if do_single_document_LDA: print('splitting article into sentences') # break up into sentences combined_article = tokenize.sent_tokenize(combined_article) else: # make into one element list for downstream processing combined_article = [combined_article] print('pre processing article text') # process article article_processed = process_all_articles(combined_article, nlp) print('removing stopwords') # remove stopwords article_processed = remove_stopwords(article_processed, stop_words) # floor for the frequency of words to remove # word_frequency_threshold = 1 # get corpus, dictionary, bag of words # processed_corpus, processed_dictionary, bow_corpus = get_simple_corpus_dictionary_bow(article_processed, # word_frequency_threshold) if do_single_document_LDA: if do_ngrams: # load bigram trigram quadgram models bigram_mod_fname = '/tmp/bigram_mod.pkl' trigram_mod_fname = '/tmp/trigram_mod.pkl' quadgram_mod_fname = '/tmp/quadgram_mod.pkl' download_blob('debiaser_data', 'bigram_mod.pkl', bigram_mod_fname) download_blob('debiaser_data', 'trigram_mod.pkl', trigram_mod_fname) download_blob('debiaser_data', 'quadgram_mod.pkl', quadgram_mod_fname) with 
open(bigram_mod_fname, 'rb') as pickle_file: bigram_mod = pickle.load(pickle_file) with open(trigram_mod_fname, 'rb') as pickle_file: trigram_mod = pickle.load(pickle_file) with open(quadgram_mod_fname, 'rb') as pickle_file: quadgram_mod = pickle.load(pickle_file) print('FINDING QUADGRAMS') # make up to quad grams article_processed = make_quadgrams(article_processed, bigram_mod, trigram_mod, quadgram_mod) # remove to free memory os.remove(bigram_mod_fname) os.remove(trigram_mod_fname) os.remove(quadgram_mod_fname) print('generating dictionary and bag of words vector...') start = time.process_time() processed_corpus, processed_dictionary, bow_corpus = get_simple_corpus_dictionary_bow( article_processed) print('TIME FOR GENERATING DICTIONARY AND BOW VECTOR') print(time.process_time() - start) print('generating lda model...') start = time.process_time() # generate the LDA model lda = LdaModel(corpus=bow_corpus, num_topics=num_lda_topics, id2word=processed_dictionary, passes=n_passes) print('TIME FOR GENERATING LDA MODEL') print(time.process_time() - start) # get the topics from the lda model lda_topics = lda.show_topics(formatted=False) # ALL INTERESTING BUT DEPRECATED FOR NOW # WILL FOLLOW SIMPLER APPROACH: # Just take top word in each generated topic # get top words per topic lda_top_topic_words_string, lda_top_topic_words_list = get_lda_top_topic_words( lda_topics, num_lda_topics, do_unique_search_words, n_search_words) # doing tfidf else: # specify file name tfidf_matrix_filename = '/tmp/tfidf_matrix.pkl' # download the tfidf matrix print('DOWNLOADING TFIDF MODEL') download_blob('debiaser_data', 'tfidf_matrix.pkl', tfidf_matrix_filename) with open(tfidf_matrix_filename, 'rb') as pickle_file: tfidf = pickle.load(pickle_file) # remove from memory os.remove(tfidf_matrix_filename) if do_ngrams: # load bigram trigram quadgram models bigram_mod_fname = '/tmp/bigram_mod.pkl' trigram_mod_fname = '/tmp/trigram_mod.pkl' quadgram_mod_fname = '/tmp/quadgram_mod.pkl' download_blob('debiaser_data', 'bigram_mod.pkl', bigram_mod_fname) download_blob('debiaser_data', 'trigram_mod.pkl', trigram_mod_fname) download_blob('debiaser_data', 'quadgram_mod.pkl', quadgram_mod_fname) with open(bigram_mod_fname, 'rb') as pickle_file: bigram_mod = pickle.load(pickle_file) with open(trigram_mod_fname, 'rb') as pickle_file: trigram_mod = pickle.load(pickle_file) with open(quadgram_mod_fname, 'rb') as pickle_file: quadgram_mod = pickle.load(pickle_file) # make up to quad grams combined_article = make_quadgrams(combined_article, bigram_mod, trigram_mod, quadgram_mod) # remove to free memory os.remove(bigram_mod_fname) os.remove(trigram_mod_fname) os.remove(quadgram_mod_fname) # download dictionary id2word_fname = '/tmp/id2word.pkl' download_blob('debiaser_data', 'id2word_ec2.pkl', id2word_fname) with open(id2word_fname, 'rb') as pickle_file: processed_dictionary = pickle.load(pickle_file) # remove to free memory os.remove(id2word_fname) print('GENERATING BOW VECTOR FOR ARTICLE') # get bag of words representation bow_corpus_article = [ processed_dictionary.doc2bow(text) for text in combined_article ] print('GETTING TF IDF SCORE') tfidf_vector = tfidf[bow_corpus_article[0]] # sort the tfidf vector tfidf_vector = sorted(tfidf_vector, key=getKey, reverse=True) # if there are fewer words than search words, then just use how many words there are if len(tfidf_vector) < n_search_words: n_search_words = len(tfidf_vector) top_tfidf_values = [ tfidf_vector[i][0] for i in range(0, n_search_words) ] print(top_tfidf_values) 
top_words_list = [ processed_dictionary[i].replace("_", " ") for i in top_tfidf_values ] top_words_string = ' ' for word in top_words_list: if word not in top_words_string: top_words_string += ' ' + word # get dictionary of google queries queries_dict = {} for domain in all_sides_domains: # if this is single document lda if do_single_document_LDA: query = 'www.news.google.com/search?q=site:' + domain + lda_top_topic_words_string # if this is tfidf else: query = 'www.news.google.com/search?q=site:' + domain + top_words_string queries_dict[domain] = query return json.dumps(queries_dict)
# coherence_model_lda = CoherenceModel(model=lda_model, texts=nps_comment_filtered,
#                                      dictionary=id2word, coherence='c_v')
# coherence_lda = coherence_model_lda.get_coherence()
# print('\nCoherence Score: ', coherence_lda)

cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]  # more colors: 'mcolors.XKCD_COLORS'

cloud = WordCloud(stopwords=stop_words,
                  background_color='white',
                  width=2500,
                  height=1800,
                  max_words=10,
                  colormap='tab10',
                  color_func=lambda *args, **kwargs: cols[i],
                  prefer_horizontal=1.0)

topics = lda_model.show_topics(formatted=False)

fig, axes = plt.subplots(2, 2, figsize=(10, 10), sharex=True, sharey=True)

for i, ax in enumerate(axes.flatten()):
    fig.add_subplot(ax)
    topic_words = dict(topics[i][1])
    cloud.generate_from_frequencies(topic_words, max_font_size=300)
    plt.gca().imshow(cloud)
    plt.gca().set_title('Topic ' + str(i), fontdict=dict(size=16))
    plt.gca().axis('off')

plt.subplots_adjust(wspace=0, hspace=0)
plt.axis('off')
plt.margins(x=0, y=0)
def lda_analysis(input_data, num_topics=3, random_state=1): # treat each set of documents as a separate corpus and find topics? for key, value in input_data.items(): _texts = [] for k, v in input_data[key].items(): _texts.append(' '.join(input_data[key][k]['lemmas'])) texts = [simple_preprocess(doc) for doc in _texts] dictionary = corpora.Dictionary(texts) corpus = [dictionary.doc2bow(line) for line in texts] # build lda model: lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, random_state=random_state) # get document topic distribution: doc_topic_dist = get_corpus_topics(_texts, lda_model) topic_terms = lda_model.show_topics(num_words=100) # get top words for each topic: topic_term_dict = OrderedDict() rel_terms = [] for topic_dist in topic_terms: topic_id = topic_dist[0] topic_term_dict[topic_id] = {} topic_terms = topic_dist[1] for _split in topic_terms.split('+'): topic_term_prob = _split.split('*')[0] topic_term = str(_split.split('*')[1]).replace('"', '').strip() topic_term_dict[topic_id][topic_term] = float(topic_term_prob) # rel_terms.append(topic_term) summary_sentences = {} sen_ranker = [] # calculate rank for each sentence with respect to each topic: for k, v in input_data[key].items(): sen = k # sen = sen.lower() sen_length = len(sen.split(' ')) sen_id = input_data[key][sen]['doc_id'] if sen_length <= 7: continue sen_topic = [] # compute score for each topic: for topic in range(num_topics): rel_sen_terms = list( set(input_data[key][k]['lemmas']) & set(topic_term_dict[topic].keys())) sen_score = 0 for term in rel_sen_terms: sen_score += topic_term_dict[topic][term] sen_topic.append((topic, sen_score, sen, sen_id)) # select top one from sen_topic and append to sen_ranker: top_sen_topic = sorted(sen_topic, key=lambda x: x[1], reverse=True)[0] sen_ranker.append(top_sen_topic) for _sen in sen_ranker: topic = _sen[0] sen_score = _sen[1] sen = _sen[2] sen_id = _sen[3] input_data[key][sen].update({"LDAscore": sen_score}) input_data[key][sen].update({"lda_topic_id": topic}) return input_data
def topic_model_gensim_lda(col: str, prefix=None, min_topics=19,max_topics=19,step=2) -> None: def trigram_bow_generator(filepath: str): ''' generator function to read docs from a file and yield a bag-of-words representation ''' for doc in LineSentence(filepath): yield trigram_dictionary.doc2bow(doc) if prefix is None: prefix = '' # for topic modeling trigram_docs_filepath = data_dir_processed / f'{prefix}{col}_transformed_docs_all.txt' print(f'Loading input file {trigram_docs_filepath}') trigram_dictionary_filepath = data_dir_processed / f'{prefix}{col}_trigram_dict_all.dict' trigram_bow_filepath = data_dir_processed / f'{prefix}{col}_trigram_bow_corpus_all.mm' #resp_whytfa_trigram_transformed_docs_all.txt # turn to posix filepaths until gensim supports this # trigram_docs_filepath = trigram_docs_filepath.as_posix() trigram_docs_filepath = trigram_docs_filepath.as_posix() trigram_dictionary_filepath = trigram_dictionary_filepath.as_posix() trigram_bow_filepath = trigram_bow_filepath.as_posix() # TODO - change 1 == 1 lines to overwrite_interim # this is a bit time consuming - make the if statement True # if you want to learn the dictionary yourself. if 1 == 1: trigram_docs = LineSentence(trigram_docs_filepath) # learn the dictionary by iterating over all of the docs trigram_dictionary = Dictionary(trigram_docs) print(trigram_dictionary) #for k, v in trigram_dictionary.iteritems(): # print (f'{k}, {v}') # filter tokens that are very rare or too common from # the dictionary (filter_extremes) and reassign integer ids (compactify) trigram_dictionary.filter_extremes(no_below=min_absolute_frequency, no_above=max_relative_frequency, keep_n=max_features, ) trigram_dictionary.compactify() print(trigram_dictionary) #for k, v in trigram_dictionary.iteritems(): # print (f'{k}, {v}') if verbose: logger.info(f'Saving trigram dictionary: {trigram_dictionary_filepath} {len(trigram_dictionary)}') trigram_dictionary.save(trigram_dictionary_filepath) # load the finished dictionary from disk if verbose: logger.info(f'Loading trigram dictionary: {trigram_dictionary_filepath}') trigram_dictionary = Dictionary.load(trigram_dictionary_filepath) # this is a bit time consuming - make the if statement True # if you want to build the bag-of-words corpus yourself. if 1 == 1: # generate bag-of-words representations for # all docs and save them as a matrix if verbose: print(f'Saving corpus: {trigram_bow_filepath}') MmCorpus.serialize(trigram_bow_filepath, trigram_bow_generator(trigram_docs_filepath)) # load the finished bag-of-words corpus from disk if verbose: print(f'Loading corpus: {trigram_bow_filepath}') trigram_bow_corpus = MmCorpus(trigram_bow_filepath) num_topics_range = range(min_topics, max_topics + 1, step) #iterations = 2000 #chunksize = 100 # more than the number of docs? passes = 10 # iterations = 400 iterations = 100 # chunksize = len(trigram_bow_corpus) chunksize = 100 # more than the number of docs? eta = 'auto' #eval_every = None # Don't evaluate model perplexity, takes too much time. 
workers=1 print(f'cpu_count:{cpu_count()}') alpha='auto' if multicore: # for multicore; one fewer than the number of cores workers = cpu_count() - 1 if verbose: print(f'Multiprocessing with {workers} cores (one fewer than the number of cores)') else: # for singnle core; cannot use in multicore alpha = 'auto' # now_str = datetime.now(timezone('US/Pacific')).strftime('%Y-%m-%d-%H-%M-%S') now_str = ''#datetime.now().strftime('%Y-%m-%d-%H-%M-%S') save_dir = data_dir_processed / f'{prefix}{col}_gensim_lda_models_{now_str}' if not save_dir.exists(): save_dir.mkdir(parents=True, exist_ok=True) # save_dir_s3 = f'{data_dir_processed_s3}/{prefix}{col}_gensim_lda_models_{now_str}' # lm_list = [] c_v = [] u_mass = [] perp = [] #alg='LDA' alg='Mallet' for num_topics in num_topics_range: if(alg == 'Mallet'): logger.info('Using Mallet...') #try the Mallet implementation ldamallet = LdaMallet(mallet_path, corpus=trigram_bow_corpus, num_topics=num_topics, id2word=trigram_dictionary,workers=workers,iterations=iterations) ldamallet_filepath = (save_dir / f'gensim_ldamallet_{num_topics}_topics').as_posix() ldamallet.save(ldamallet_filepath) for t in ldamallet.show_topics(num_topics=-1, num_words=10, formatted=False): words = [w[0] for w in t[1]] logger.info('topic {:2d}\t{}'.format(t[0], ' '.join(words))) # Show Topics #print(ldamallet.show_topics(formatted=False)) # Compute Coherence Score cm = CoherenceModel(model=ldamallet, texts=trigram_docs, dictionary=trigram_dictionary, coherence='c_v') c_v.append(cm.get_coherence()) cm = CoherenceModel(model=ldamallet, corpus=trigram_bow_corpus, dictionary=trigram_dictionary, coherence='u_mass')#, processes=workers) u_mass.append(cm.get_coherence()) #perp_lower_bound = ldamallet.log_perplexity(trigram_bow_corpus) #perp.append(2**(-perp_lower_bound)) perp.append(0) else: logger.info('Using LDA...') #TODO: try with and without alpha ldamodel = LdaModel(corpus=trigram_bow_corpus, id2word=trigram_dictionary, num_topics=num_topics, passes=passes, iterations=iterations, chunksize=chunksize, eta=eta, #eval_every=eval_every, alpha=alpha, random_state=np.random.RandomState(seed=10101010), ) #ldamodel = LdaMulticore(corpus=trigram_bow_corpus, id2word=trigram_dictionary, # num_topics=num_topics, passes=passes, iterations=iterations, # chunksize=chunksize, eta=eta, #eval_every=eval_every, # random_state=np.random.RandomState(seed=10101010), # workers=workers # ) ldamodel_filepath = (save_dir / f'gensim_lda_{num_topics}_topics').as_posix() ldamodel.save(ldamodel_filepath) for t in ldamodel.show_topics(num_topics=-1, num_words=50, formatted=False): words = [w[0] for w in t[1]] logger.info('topic {:2d}\t{}'.format(t[0], ' '.join(words))) cm = CoherenceModel(model=ldamodel, texts=trigram_docs, dictionary=trigram_dictionary, coherence='c_v')#, processes=workers) c_v.append(cm.get_coherence()) cm = CoherenceModel(model=ldamodel, corpus=trigram_bow_corpus, dictionary=trigram_dictionary, coherence='u_mass') #, processes=workers) u_mass.append(cm.get_coherence()) perp_lower_bound = ldamodel.log_perplexity(trigram_bow_corpus) perp.append(2**(-perp_lower_bound)) coh_perp = pd.DataFrame( data=np.array([c_v, u_mass, perp]).T, columns=['c_v', 'u_mass', 'perp'], index=list(num_topics_range)) coh_perp.index.name = 'num_topics' coh_perp_filepath = save_dir / 'coherence_perplexity.csv' coh_perp.to_csv(coh_perp_filepath) logger.info('coherence_docs={0}, coherence_corpus={1}, perplexity={2}'.format(c_v, u_mass, perp))
def upload_file(): """ Upload csv files and create: * ~/out/corpus.dict * ~/out/corpus.lda * ~/out/corpus.lda.state * ~/out/corpus.mm * ~/out/corpus.mm.index * ~/out/corpus_doclabels.txt * ~/out/corpus_topics.txt * ~/mycorpus.txt As well as (for example): * ~/swcorp/Doyle_AStudyinScarlet.txt * ~/swcorp/Lovecraft_AttheMountainofMadness.txt * etc. """ # INPUT # columns to read from csv file columns = ['ParagraphId', 'TokenId', 'Lemma', 'CPOS', 'NamedEntity'] # parts-of-speech to include into the model pos_tags = ['ADJ', 'NN', 'V'] # stopwords regex = re.compile('\w+') stopwords = request.files['stoplist'] stopwords = str(stopwords.readlines()) stopwords = regex.findall(stopwords) stopwords.extend(("'", "'d", "'s")) # temporary solution print(stopwords) # document size (in words) doc_size = 1000 # uses the pipeline's ParagraphId to split text into documents, # overrides doc_size - 1: on, 0: off doc_split = 0 # no. of topics to be generated no_of_topics = 30 # no. of lda iterations - usually, the more the better, but # increases computing time no_of_passes = 1 # perplexity estimation every n chunks - # the smaller the better, but increases computing time eval = 1 # documents to process at once chunk = 100 # "symmetric", "asymmetric", "auto", or array # (default: a symmetric 1.0/num_topics prior) affects sparsity of # the document-topic (theta) distribution alpha = "symmetric" # custom alpha may increase topic coherence, but may also produce # more topics with zero probability alpha = np.array([ 0.02, 0.02, # 0.02, 0.03, 0.03, 0.03, 0.04, 0.04, 0.04, 0.05, 0.05, 0.04, 0.04, # 0.04, 0.03, 0.03, 0.03, 0.02, 0.02, 0.02]) # can be a number (int/float), an array, or None # affects topic-word (lambda) distribution - not necessarily # beneficial to topic coherence eta = None # PREPROCESSING files = request.files.getlist('files') docs = [] doc_labels = [] print("\n reading files ...\n") for file in files: file_label = secure_filename(file.filename).split('.')[0] df = pd.read_csv(file, sep="\t", quoting=csv.QUOTE_NONE) df = df[columns] df = df.groupby('CPOS') doc = pd.DataFrame() for p in pos_tags: # collect only the specified parts-of-speech doc = doc.append(df.get_group(p)) # construct documents if doc_split: # size according to paragraph id doc = doc.groupby('ParagraphId') for para_id, para in doc: docs.append(para['Lemma'].values.astype(str)) doc_labels.append( ''.join([file_label, " #", str(para_id)])) else: # size according to doc_size doc = doc.sort_values(by='TokenId') i = 1 while(doc_size < doc.shape[0]): docs.append( doc[:doc_size]['Lemma'].values.astype(str)) doc_labels.append( ''.join([file_label, " #", str(i)])) doc = doc.drop(doc.index[:doc_size]) i += 1 docs.append(doc['Lemma'].values.astype(str)) doc_labels.append(''.join([file_label, " #", str(i)])) if not os.path.exists(os.path.join(os.getcwd(), "swcorp")): os.makedirs(os.path.join(os.getcwd(), "swcorp")) swpath = os.path.join('swcorp', "".join(file_label)) with open(swpath + ".txt", 'w', encoding="utf-8") as text: text.write(" ".join( word for word in doc['Lemma'].values.astype(str) if word not in stopwords)) print("\n normalizing and vectorizing ...\n") # texts = [ # [word for word in doc if word not in stopwords] for doc in docs] print("\n stopwords removed ...\n") print("\n writing mastercorpus ...\n") mastercorpus = os.path.join(os.getcwd(), 'mycorpus.txt') with open(mastercorpus, 'w', encoding="utf-8") as data: folder = glob.glob("swcorp/*") for text in folder: with open(text, 'r', encoding="utf-8") as text: textline = [re.sub( 
r'\\n\\r', '', document) for document in ' '.join( text.read().split())] if text != folder[-1]: data.write("".join(textline) + "\n") else: data.write("".join(textline)) # MAIN PART mastercorpus = os.path.join(os.getcwd(), 'mycorpus.txt') dictionary = corpora.Dictionary( line.lower().split() for line in open( mastercorpus, encoding="utf-8")) class MyCorpus(object): def __iter__(self): for line in open('mycorpus.txt'): # assume there's one document per line, tokens # separated by whitespace yield dictionary.doc2bow(line.lower().split()) # corpus = buildCorpus(mastercorpus, dictionary) corpus = MyCorpus() # corpus = glob.glob("swcorpus/*") if not os.path.exists("out"): os.makedirs("out") # if not os.path.exists(os.path.join(os.path.join(os.getcwd(), # 'out'), foldername)): os.makedirs(os.path.join # (os.path.join(os.getcwd(), 'out'), foldername)) MmCorpus.serialize( os.path.join(os.path.join(os.getcwd(), "out"), '.'.join( ['corpus.mm'])), corpus) mm = MmCorpus('out/corpus.mm') print(mm) # doc_labels = glob.glob("corpus/*") print("fitting the model ...\n") model = LdaModel( corpus=mm, id2word=dictionary, num_topics=no_of_topics, passes=no_of_passes, eval_every=eval, chunksize=chunk, alpha=alpha, eta=eta) # model = LdaMulticore(corpus=corpus, id2word=dictionary, # num_topics=no_of_topics, passes=no_of_passes, # eval_every=eval, chunksize=chunk, alpha=alpha, eta=eta) print(model, "\n") topics = model.show_topics(num_topics=no_of_topics) for item, i in zip(topics, enumerate(topics)): print("topic #"+str(i[0])+": "+str(item)+"\n") print("saving ...\n") if not os.path.exists("out"): os.makedirs("out") # if not os.path.exists(os.path.join(os.path.join(os.getcwd(), # 'out'), foldername)): # os.makedirs(os.path.join(os.path.join(os.getcwd(), 'out'), # foldername)) with open( os.path.join(os.path.join(os.getcwd(), "out"), ''.join( ["corpus_doclabels.txt"])), "w", encoding="utf-8") as f: for item in doc_labels: f.write(item + "\n") with open( os.path.join(os.path.join(os.getcwd(), "out"), ''.join( ["corpus_topics.txt"])), "w", encoding="utf-8") as f: for item, i in zip(topics, enumerate(topics)): f.write( "".join(["topic #", str(i[0]), ": ", str(item), "\n"])) dictionary.save( os.path.join(os.path.join(os.getcwd(), "out"), '.'.join( ['corpus', 'dict']))) # MmCorpus.serialize( # os.path.join(os.path.join(os.getcwd(), "out"), '.'.join( # [foldername, 'mm'])), corpus) model.save( os.path.join(os.path.join(os.getcwd(), "out"), '.'.join( ['corpus', 'lda']))) print("\n ta-daaaa ...\n") # VISUALIZATION no_of_topics = model.num_topics no_of_docs = len(doc_labels) doc_topic = np.zeros((no_of_docs, no_of_topics)) for doc, i in zip(corpus, range(no_of_docs)): # topic_dist is a list of tuples (topic_id, topic_prob) topic_dist = model.__getitem__(doc) for topic in topic_dist: doc_topic[i][topic[0]] = topic[1] # get plot labels topic_labels = [] for i in range(no_of_topics): # show_topic() returns tuples (word_prob, word) topic_terms = [x[0] for x in model.show_topic(i, topn=3)] topic_labels.append(" ".join(topic_terms)) # cf. https://de.dariah.eu/tatom/topic_model_visualization.html if no_of_docs > 20 or no_of_topics > 20: plt.figure(figsize=(20, 20)) # if many items, enlarge figure plt.pcolor(doc_topic, norm=None, cmap='Reds') plt.yticks(np.arange(doc_topic.shape[0])+1.0, doc_labels) plt.xticks( np.arange(doc_topic.shape[1])+0.5, topic_labels, rotation='90') plt.gca().invert_yaxis() plt.colorbar(cmap='Reds') plt.tight_layout() plt.savefig("./static/corpus_heatmap.svg") return render_template('success.html')
vocab = Dictionary.load_from_text('./vocab.txt')
corpus = UnlabeledCorpus('./rumor_train.csv', vocab)
valid_corpus = UnlabeledCorpus('./rumor_valid.csv', vocab)
valid_sentences = [doc for doc in valid_corpus][5000:]

# varying number of topics
# result = {}
# for num_topics in [2, 4, 8, 16, 32, 64]:
#     best_value = -100
#     for i in range(5):
#         model = LdaModel(corpus=corpus, id2word=vocab, num_topics=num_topics)
#         likelihood = model.log_perplexity(valid_sentences)
#         best_value = max(best_value, likelihood)
#     result[num_topics] = best_value
#
# for num_topics, likelihood in result.iteritems():
#     print 'num_topics: %d, best word_likelihood: %f' % (num_topics, likelihood)

model = LdaModel(corpus=corpus, id2word=vocab, num_topics=8, passes=2)
model.save('./lda_model.txt')

# print topics to a file
topics = model.show_topics(num_topics=100, num_words=50)
with codecs.open('./topics.txt', 'w', 'utf-8') as out_f:
    for topic in topics:
        topic_id, topic_str = topic[0], topic[1]
        out_f.write('%d:\n%s\n' % (topic_id, topic_str))
        out_f.write('\n')
def create_lda_model(): logging.info('about to create all docs from chunks') start_time = datetime.datetime.now() create_all_docs() end_time = datetime.datetime.now() logging.info('total time is: %s', end_time - start_time) logging.info('about to load all docs') with open('./resources/LDA_processing/all_docs.pkl', mode='rb') as f: all_docs = pickle.load(f) logging.info('about to load english words') with open('./resources/LDA_input/english_full_list.txt') as f: english_words = f.read().splitlines() good_english_words = set(english_words[75:21000]) del english_words logging.info('about to remove all stop-words and unknown words') texts = [] for i, doc in enumerate(all_docs): filtered_doc = [word for word in doc if word in good_english_words] texts.append(filtered_doc) if i % 5000 == 0: logging.info('Finished doc: %s', i) logging.info('about to release memory of all_docs and english_words') del all_docs del good_english_words logging.info('about to save texts') with open('./resources/LDA_processing/texts.pkl', mode='wb') as f: pickle.dump(texts, f) logging.info('about to load texts') with open('./resources/LDA_processing/texts.pkl', mode='rb') as f: texts = pickle.load(f) logging.info('about to create dictionary') dictionary = corpora.Dictionary(texts) keys = dictionary.keys() logging.info('dict size before filter: %s', len(keys)) dictionary.filter_extremes(keep_n=150000) dictionary.filter_extremes(no_below=150, no_above=0.05) keys = dictionary.keys() logging.info('dict size after filter: %s', len(keys)) dictionary.save('./resources/LDA_processing/lda.dict') dictionary.save_as_text('./resources/LDA_processing/lda_dict.txt') logging.info('about to create corpus') corpus = [dictionary.doc2bow(text) for text in texts] logging.info('about to save corpus as mm file') corpora.MmCorpus.serialize('./resources/LDA_processing/corpus.mm', corpus) logging.info('about to load dictionary file') dictionary = corpora.Dictionary.load('./resources/LDA_processing/lda.dict') logging.info('about to load corpus as mm file') corpus = corpora.MmCorpus('./resources/LDA_processing/corpus.mm') logging.info('about to start LDA model') lda = LdaModel(corpus, id2word=dictionary, num_topics=num_topics) logging.info('finished LDA model') logging.info('about to save ldaModel') lda.save('./resources/LDA_processing/LdaModel') logging.info('about to load ldaModel') lda = LdaModel.load('./resources/LDA_processing/LdaModel') logging.info('about to find topics') topics = lda.show_topics(num_topics=num_topics, num_words=10000, log=True, formatted=False) logging.info('about to save topics') with open('./resources/LDA_processing/topics.pkl', mode='wb') as f: pickle.dump(topics, f) dict_word_sets = find_words_from_lda_model() with open('./resources/LDA_processing/dict_word_sets.pkl', mode='wb') as f: pickle.dump(dict_word_sets, f) topics_words = extract_words_from_word_sets() with open('./resources/LDA_result/topic_words', mode='wt', encoding='utf-8') as f: f.write('\n'.join(topics_words))
MmCorpus.serialize(corpusPath, corpus)
mm = MmCorpus(corpusPath)

doc_labels = makeDocLabels(path)

log.info('fitting the model ...')

# fitting the model
model = LdaModel(corpus=mm, id2word=dictionary, num_topics=no_of_topics,
                 passes=no_of_passes, eval_every=eval, chunksize=chunk,
                 alpha=alpha, eta=eta)

log.info('generated topics...')

# print topics
topics = model.show_topics(num_topics=no_of_topics)

for item, i in zip(topics, enumerate(topics)):
    log.info('topic #%s: %s', i[0], item)

log.info('saving results...')

# create output folder
if not os.path.exists("out"):
    os.makedirs("out")

# save doc_labels for further use
with open(os.path.join(os.path.join(os.getcwd(), "out"), ''.join([foldername, "_doclabels.txt"])), "w", encoding="utf-8") as f:
    for item in doc_labels:
        f.write(item + "\n")

# save topics for further use
# Count words in the 'objective', keeping only those that occur at least 5 times
vectorizer = fe.text.CountVectorizer(stop_words='english', min_df=5)
X = vectorizer.fit_transform(h2020.objective)

# Convert to gensim format
corpus = Sparse2Corpus(X, documents_columns=False)

# Create mapping from word IDs (integers) to words (strings)
id2word = dict(enumerate(vectorizer.get_feature_names()))

# Fit LDA model with 10 topics
lda = LdaModel(corpus=corpus, num_topics=10, id2word=id2word)

# Show top 5 words for each of the 10 topics
lda.show_topics(num_topics=10, num_words=5)

'''
word2vec using gensim
'''

# Convert adjectives and verbs to corresponding lemmas using spaCy
objectives = [
    [x.lemma_ if x.pos == spacy.parts_of_speech.ADJ or
     x.pos == spacy.parts_of_speech.VERB
     else x.text
     for x in en(text)]
    for text in h2020.objective
]

# Fit word2vec model
w2c = Word2Vec(sentences=objectives, size=100, window=5, min_count=5)
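To score a new project objective against the LDA model fitted above, the same vectorizer and bag-of-words conversion can be reused; a sketch assuming `vectorizer`, `lda` and `Sparse2Corpus` from the snippet are still in scope (the example text is made up):

new_objective = ["Develop novel storage technologies for renewable energy"]
X_new = vectorizer.transform(new_objective)
bow_new = list(Sparse2Corpus(X_new, documents_columns=False))

# Topic mixture of the first (and only) new document
print(lda[bow_new[0]])  # [(topic_id, probability), ...]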
dictionary ) pyLDAvis.save_html(p, '../../results/reports/onsite_search_terms_lda_2017_2019_20_topic.html') # The size of the bubble measures the importance of the topics, relative to the data. # # The terms are ordered by saliency (how much the term tells you about the topic). # # The relevance slider can be used to adjust saliency scores. num_topics = 20 censored = [9] [x for x in ldamodel.show_topics(num_topics=num_topics) if x[0] not in censored] # ### Tracking trends over time # # Given a gensim model, label a corpus by topic and plot them over time. How do they change relative to one another? # # Top topics may follow similar trends to global search patterns. Instead, look at "% of searches that are topic". # First, we need to label the training data from tqdm import tqdm tqdm.pandas()
# We will use Latent Dirichlet Allocation to try to categorise the abstracts.
# The first run is slow because the model has to be trained.
print("lda")
lda_filename = 'model.lda'
if not os.path.isfile(lda_filename):
    lda = LdaModel(corpus, num_topics=5,
                   id2word=dictionary,
                   update_every=5,
                   chunksize=10000,
                   passes=100)
    lda.save('/tmp/model.lda')
else:
    lda = LdaModel.load('/tmp/model.lda')
lda.show_topics()
topics_matrix = lda.show_topics(formatted=False, num_words=7)
print(topics_matrix)
print(len(topics_matrix))

for topic in topics_matrix:
    i = topic[1]
    print([str(word) for word in i])

# topics_matrix = np.array(topics_matrix)
# topic_words = topics_matrix[:, :, 1]
# for i in topic_words:
#     print([str(word) for word in i])