def lda():
    # remove stop words
    stopwords = codecs.open('../conf/stop_words_ch.txt', mode='r', encoding='utf8').readlines()
    stopwords = [w.strip() for w in stopwords]
    fp = codecs.open('D:\\nlp\\corpora\\segs.txt', mode='r', encoding='utf8')
    train = []
    for line in fp:
        line = line.split()
        train.append([w for w in line if w not in stopwords])

    dictionary = corpora.Dictionary(train)
    corpus = [dictionary.doc2bow(text) for text in train]
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=100)
    lda.print_topics(30)
    # print topic id=20
    lda.print_topic(20)
    # save/load model
    lda.save('D:\\nlp\\corpora\\news.model')
def lda_vector(dataset: list, refer_dictionary=None, refer_lda_model=None):
    if refer_dictionary is None:
        refer_docs = [
            [token for (i, token) in enumerate(sample['essay_lemma'])
             if sample['essay_is_stop'][i] is False and token not in [',', '.', '?']]
            for sample in dataset
        ]
        refer_dictionary = Dictionary(refer_docs)
        refer_doc2bow = [refer_dictionary.doc2bow(text) for text in refer_docs]
        refer_lda_model = LdaModel(corpus=refer_doc2bow, id2word=refer_dictionary, num_topics=10,
                                   dtype=np.float64, passes=10, minimum_probability=0.0)

    doc = [
        [token for (i, token) in enumerate(sample['essay_lemma'])
         if sample['essay_is_stop'][i] is False and token not in [',', '.', '?']]
        for sample in dataset
    ]
    doc_bow_s = [refer_dictionary.doc2bow(text) for text in doc]
    doc_vecs = [refer_lda_model[doc_bow] for doc_bow in doc_bow_s]
    for (sample, doc_vec) in zip(dataset, doc_vecs):
        for topic_prob in doc_vec:
            sample['topic' + str(topic_prob[0] + 1)] = topic_prob[1]
    return refer_dictionary, refer_lda_model
def train_model(dictionary, corpus):
    chunksize = int(math.ceil(len(corpus) / 1000.0)) * 1000
    passes = 20
    iterations = 400
    eval_every = None  # Don't evaluate model perplexity; it takes too much time.

    # Make an index-to-word dictionary.
    temp = dictionary[0]  # This is only to "load" the dictionary.
    id2word = dictionary.id2token

    model = LdaModel(
        corpus=corpus,
        id2word=id2word,
        chunksize=chunksize,
        alpha='auto',
        eta='auto',
        iterations=iterations,
        num_topics=topics,  # `topics` is expected to be defined in the enclosing scope
        passes=passes,
        eval_every=eval_every,
        per_word_topics=True
    )
    return model
def find_topic():
    """
    LdaModel params
        passes: number of passes through the entire corpus
        chunksize: how many documents to load into memory
        update_every: number of chunks to process prior to moving on to the M step of EM
    """
    with gzip.open(config['fun2vec']['corpus'], 'rb') as f:
        words = pickle.load(f)
    # Build the dictionary
    dictionary = corpora.Dictionary(words)
    dictionary.filter_extremes(no_below=30, no_above=0.3)
    # Build the corpus
    corpus = [dictionary.doc2bow(_words) for _words in words]
    # corpora.MmCorpus.serialize('cop.mm', corpus)
    lda = LdaModel(corpus, num_topics=10, chunksize=10000, update_every=2, id2word=dictionary)
    lda.save(config['topic_model'])
    pprint(lda.show_topics(num_words=20))
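# A minimal follow-up sketch (not from the original snippet): reload the model saved by
# find_topic() and infer topics for a new, already-tokenized document. It assumes the same
# config['topic_model'] path; the dictionary is recovered from the model's id2word attribute.
def infer_topics_example(tokens):
    lda = LdaModel.load(config['topic_model'])
    bow = lda.id2word.doc2bow(tokens)    # map tokens to (id, count) pairs
    return lda.get_document_topics(bow)  # [(topic_id, probability), ...]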
def evaluate_graph(dictionary, corpus, texts, limit):
    """
    Function to display the num_topics - coherence graph for LDA using c_v coherence

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : list of tokenized texts
    limit : topic limit

    Returns:
    -------
    lda_list : list of LDA topic models
    c_v : coherence values corresponding to the LDA model with the respective number of topics
    """
    c_v = []
    lda_list = []
    for num_topics in range(1, limit + 1):
        print("Topic %d" % num_topics)
        lm = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary)
        lda_list.append(lm)
        cm = CoherenceModel(model=lm, texts=texts, dictionary=dictionary, coherence='c_v')
        # cm = CoherenceModel(model=lm, texts=texts, corpus=corpus, dictionary=dictionary, coherence='c_v')
        c_v.append(cm.get_coherence())

    # Show graph
    x = range(1, limit + 1)
    plt.plot(x, c_v)
    plt.xlabel("num_topics")
    plt.ylabel("Coherence score")
    plt.legend(("c_v",), loc='best')
    plt.show()

    return lda_list, c_v
def create_lda_model(project, corpus, id2word, name, use_level=True, force=False):
    model_fname = project.full_path + name + str(project.num_topics)
    if use_level:
        model_fname += project.level
    model_fname += '.lda.gz'

    if not os.path.exists(model_fname) or project.force or force:
        if corpus:
            update_every = None  # run in batch mode if we have a pre-supplied corpus
        else:
            update_every = 1

        model = LdaModel(
            corpus=corpus,
            id2word=id2word,
            alpha=project.alpha,
            eta=project.eta,
            chunksize=project.chunksize,
            passes=project.passes,
            num_topics=project.num_topics,
            iterations=project.iterations,
            eval_every=None,  # disable perplexity tests for speed
            update_every=update_every,
        )
        if corpus:
            model.save(model_fname)
    else:
        model = LdaModel.load(model_fname)

    return model, model_fname
def LDA(tokens, start, stop, step=1):
    dictionary = Dictionary(tokens)
    corpus = [dictionary.doc2bow(text) for text in tokens]
    model_list = []
    coherence_values = []
    max_topic_num = 0
    for i in range(start, stop, step):
        print('steps ', i)
        model = LdaModel(corpus, id2word=dictionary, num_topics=i + 1)  # LDA model
        model_list.append(model)
        coherence_model_lda = CoherenceModel(model, texts=tokens, dictionary=dictionary,
                                             coherence='c_v')  # coherence model
        coherence_lda = coherence_model_lda.get_coherence()  # calculate the coherence score
        if i != start and coherence_lda > max(coherence_values):
            max_topic_num = i
        coherence_values.append(coherence_lda)

    x = range(start, stop, step)
    plt.plot(x, coherence_values)
    plt.xlabel("Num Topics")
    plt.ylabel("Coherence score")
    plt.legend(("coherence_values",), loc='best')
    plt.show()  # show the coherence scores with pyplot

    max_ind = coherence_values.index(max(coherence_values))
    model_list[max_ind].save("result_model")
    prepared_data = gensimvis.prepare(model_list[max_ind], corpus=corpus, dictionary=dictionary)
    pyLDAvis.save_html(prepared_data, 'res.html')  # save the LDA result as an HTML file
    pyLDAvis.save_json(prepared_data, 'res.json')  # save the LDA result as a JSON file
    return model_list[max_ind], coherence_values[max_ind], max_topic_num
def fit_lda(self):
    '''
    Read in serialized cards from disk.
    Fit the LdaModel for the class. Only operates on class fields.
    '''
    print('\nRun Gensim LdaModel on serialized documents')
    start = datetime.now()

    # Feed params from built_corpus into the LDA model
    self.lda_model = LdaModel(corpus=self.corpus_cards,
                              num_topics=self.n_topics,
                              id2word=self.built_corpus.vocabulary_,
                              distributed=False,
                              chunksize=2000,
                              passes=10,
                              update_every=1,
                              alpha='symmetric',
                              eta=None,
                              decay=0.7,
                              offset=10.0,
                              eval_every=10,
                              iterations=self.max_iter,
                              gamma_threshold=0.001,
                              minimum_probability=0.01,
                              random_state=None,
                              ns_conf=None,
                              minimum_phi_value=0.01,
                              per_word_topics=False,
                              callbacks=None)

    end = datetime.now()
    print("    Time taken: {} on {} topics, max iterations: {}".format(
        end - start, self.n_topics, self.max_iter))
    end = datetime.now()
    print("    Time taken: {}".format(end - start))
def make_lda_model(self, num_topics=11):
    '''
    Build an optimized LDA model.
    Prints a coherence score for sanity checking
    (EDA has revealed the target coherence to be ~0.39).
    '''
    print(' - Building LDA model with {} topics'.format(num_topics))
    dictionary = corpora.Dictionary(self.token_list)
    corpus = [dictionary.doc2bow(text) for text in self.token_list]

    # Set up mallet path
    # os.environ.update({'MALLET_HOME': r'anaconda3/lib/python3.7/site-packages/mallet-2.0.8/'})
    # mallet_path = '/anaconda3/lib/python3.7/site-packages/mallet-2.0.8/bin/mallet'  # update this path
    # # Make model:
    # ldamallet = LdaMallet(mallet_path, corpus=corpus, num_topics=num_topics, id2word=dictionary)

    ldamodel = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=20)

    # Get coherence score:
    coherence_score = CoherenceModel(model=ldamodel, texts=self.token_list,
                                     dictionary=dictionary, coherence='c_v').get_coherence()

    # model_topics = optimal_model.show_topics(formatted=False)
    # print topics
    pp.pprint(ldamodel.print_topics(num_words=6))
    print(" - Num Topics: {}. Coherence Value of: {:2.3f}".format(num_topics, coherence_score))

    self.all_topics = ldamodel.print_topics(num_words=6)
    self.ldamodel = ldamodel
    self.corpus = corpus
    self.dictionary = dictionary
    self.coherence_score = coherence_score
def fine_tune_lda(corpus, dictionary, texts, limit, start=2, step=2):
    """
    Compute c_v coherence for various numbers of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : list of input texts
    limit : max number of topics

    Returns:
    -------
    model_list : list of LDA topic models
    coherence_values : coherence values corresponding to the LDA model with the respective number of topics
    n_topics : number of topics
    """
    coherence_values = []
    model_list = []
    n_topics = []
    for num_topics in range(start, limit, step):
        print('\nTraining with n_topics = {}, training sample = {}.'.format(num_topics, len(corpus)))
        model = LdaModel(corpus=corpus,
                         id2word=dictionary,
                         random_state=2,
                         alpha='auto',
                         eta='auto',
                         num_topics=num_topics)
        # distributed=True)  # alpha='auto' is not implemented in distributed LDA
        model_list.append(model)
        print('Calculating coherence score based on {} samples.'.format(len(texts)))
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
        n_topics.append(num_topics)
        print("{}: {}".format(num_topics, coherence_values[-1]))

    return model_list, coherence_values, n_topics
def initiate_LDA(docs=None, no_topics=100):
    '''
    Initiates an LDA model over the given documents and returns the per-document
    topic distributions, the dictionary and the trained model.
    '''
    finalDict = dictbuild()
    stringSplit = re.compile('\\s')
    count = 0
    stop = stopwords.words('english')
    for_corpus = []
    for line in docs:
        lines = stringSplit.split(line.lower())
        for_corpus.append(lines)
        finalDict.add_documents([lines])
        count += 1

    # filter_tokens expects token ids, so map the stop words to their ids first
    stop_ids = [finalDict.token2id[w] for w in stop if w in finalDict.token2id]
    finalDict.filter_tokens(stop_ids)
    once_ids = [tokenid for tokenid, docfreq in finalDict.dfs.items() if docfreq < 30]  # 5642
    finalDict.filter_tokens(once_ids)
    finalDict.compactify()

    corpus = [finalDict.doc2bow(line) for line in for_corpus]
    n = no_topics
    lda_model = LdaModel(corpus, num_topics=n, id2word=finalDict)

    topicsDist = []
    for x in corpus:
        topics = lda_model[x]
        temp = [0] * n
        for t in topics:
            temp[t[0]] = t[1]
        topicsDist.append(temp)
    print("topics generated")
    topicsDist = np.asarray(topicsDist)
    return topicsDist, finalDict, lda_model
def compute_coherence_values(dictionary, corpus, texts, id2word, topics_limit, topics_start, topics_step):
    """
    Compute c_v coherence for various numbers of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : list of input texts
    topics_limit : max number of topics

    Returns:
    -------
    model_list : list of LDA topic models
    coherence_values : coherence values corresponding to the LDA model with the respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(topics_start, topics_limit, topics_step):
        model = LdaModel(corpus=corpus,
                         id2word=id2word,
                         num_topics=num_topics,
                         alpha='auto',
                         eta='auto',
                         random_state=203495)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence = coherencemodel.get_coherence()
        coherence_values.append(coherence)
        print('num_topics:', num_topics, 'coherence:', coherence)

    return model_list, coherence_values
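# A brief usage sketch (not from the original source): pick the model with the highest
# coherence from the sweep above. The variables dictionary, corpus, texts and id2word are
# assumed to have been prepared as in the surrounding snippets.
models, scores = compute_coherence_values(dictionary, corpus, texts, id2word,
                                          topics_limit=20, topics_start=2, topics_step=2)
best_model = models[scores.index(max(scores))]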
def fit_lda(documents, corpus, vocab, K: int, alpha: float = None, eta: float = None, eval=False):
    """
    Fit LDA from a scipy CSR matrix (data).
    :param vocab: a dictionary over the words
    :param K: number of topics
    :param alpha: the alpha prior weight of topics in documents
    :param eta: the eta prior weight of words in topics
    :return: an LDA model trained on the data and vocab
    """
    if eval:
        logging.basicConfig(filename='logstuff.log',
                            format="%(asctime)s:%(levelname)s:%(message)s",
                            level=logging.NOTSET)
        perplexity_logger = PerplexityMetric(corpus=corpus, logger='shell')
        convergence_logger = ConvergenceMetric(logger='shell')
        coherence_cv_logger = CoherenceMetric(corpus=corpus, logger='shell', coherence='c_v', texts=documents)
        print("fitting lda...")
        model = LdaModel(corpus=corpus, id2word=vocab, num_topics=K,
                         eval_every=1, passes=20, iterations=100, random_state=100, update_every=1,
                         callbacks=[convergence_logger, perplexity_logger, coherence_cv_logger])
    else:
        print("fitting lda...")
        model = LdaMulticore(corpus=corpus, id2word=vocab, num_topics=K,
                             alpha=alpha, eta=eta, passes=20, iterations=100)
    return model
def evaluate_graph_for_max_label(dictionary, corpus, texts, limit):
    """
    Function to display the num_topics - coherence graph for LDA using c_v coherence

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : list of tokenized texts
    limit : topic limit

    Returns:
    -------
    lm_list : list of LDA topic models
    c_v : coherence values corresponding to the LDA model with the respective number of topics
    """
    c_v = []
    lm_list = []
    warnings.filterwarnings("ignore")
    for num_topics in range(2, limit):
        lm = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary)
        lm_list.append(lm)
        cm = CoherenceModel(model=lm, texts=texts, dictionary=dictionary, coherence='c_v')
        c_v.append(cm.get_coherence())

    # Show graph
    x = range(2, limit)
    best_index = c_v.index(max(c_v))  # index of the highest coherence value
    print(max(c_v), best_index)
    plt.plot(x, c_v)
    plt.xlabel("num_topics")
    plt.ylabel("Coherence score")
    plt.legend(("c_v",), loc='best')
    plt.show()

    return lm_list, c_v
def get_topics(chapters, nb_topics=10, nb_words=10):
    chapters_token = [[line.split(' ') for line in c] for c in chapters]
    dictionary = [corpora.Dictionary(c) for c in chapters_token]
    corpus_chapters = [[dictionary[i].doc2bow(token) for token in c]
                       for i, c in enumerate(chapters_token)]
    lda_chapters = [LdaModel(corpus_c, num_topics=nb_topics, passes=3)
                    for corpus_c in corpus_chapters]

    topics = list()
    for i, lda_c in enumerate(lda_chapters):
        topics.append([])
        for j in range(nb_topics):
            words = []
            # Look at the nb_words words most strongly associated with the topic
            for x in lda_c.show_topic(j, topn=nb_words):
                words.append(dictionary[i][int(x[0])])
            # print(words)
            topics[i].append(words)
    return topics
def LDAmodel(words, num_topics=5, num_words=5):
    """
    1. the number of words
    2. the mixture of topics, e.g. 1/2 the topic "health" and 1/2 the topic "vegetables"
    3. the probability of a topic depends on its dominance
    """
    dictionary = corpora.Dictionary(words)
    # Term-document frequency
    corpus = [dictionary.doc2bow(word) for word in words]
    # Save them!
    pickle.dump(corpus, open('corpus.pkl', 'wb'))
    dictionary.save('dictionary.gensim')

    # Train the model
    ldamodel = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary, passes=20)
    # lda_model = LdaModel(corpus=corpus, id2word=id2word, num_topics=20, random_state=100,
    #                      update_every=1, chunksize=100, passes=10, alpha='auto', per_word_topics=True)
    topics = ldamodel.print_topics(num_topics=num_topics, num_words=num_words)

    # Validation
    # Perplexity: a measure of how good the model is; lower is better.
    val_perplexity = ldamodel.log_perplexity(corpus)
    # Coherence score
    coherence_ldamodel = CoherenceModel(model=ldamodel, texts=words, dictionary=dictionary, coherence='c_v')
    val_coherence = coherence_ldamodel.get_coherence()

    return topics, val_perplexity, val_coherence
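# A small usage sketch (not from the original source): calling LDAmodel() on a toy
# tokenized corpus. The example documents below are made up purely for illustration.
toy_docs = [['health', 'diet', 'vegetables'],
            ['exercise', 'health', 'running'],
            ['vegetables', 'cooking', 'recipe']]
topics, perplexity, coherence = LDAmodel(toy_docs, num_topics=2, num_words=3)
print(topics, perplexity, coherence)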
for num_topics in NT:
    run += 1
    print("Run " + str(run) + " out of " + str(runs))
    writer.writerows([["Data size", "Topics", "no_above", "Chunksize", "Passes", "Iteration"],
                      [size, num_topics, no_above, chunksize, passes, iterations],
                      []])
    lda = LdaModel(mm_used,
                   num_topics=num_topics,
                   chunksize=chunksize,
                   id2word=dictionary,
                   passes=passes,
                   iterations=iterations,
                   eval_every=eval_every)
    lst = []
    for topic in lda.print_topics(-1, 10):
        terms = [x[0] for x in lda.get_topic_terms(topic[0], topn=10)]
        term_strings = [str(dictionary[term]) for term in terms]
        str_topic = []
        str_topic.append("Topic " + str(topic[0] + 1))
    return quran_MI, quran_Chi, ot_MI, ot_Chi, nt_MI, nt_Chi


""" Part 2: Text Analysis ----> execute writing method """
if not os.path.exists('MI_&_ChiSquare.txt'):
    write_results('MI_&_ChiSquare.txt')

""" Part 2: Text Analysis ----> LDA """
common_texts = quran_D + ot_D + nt_D
common_dictionary = Dictionary(common_texts)
common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]
lda = LdaModel(common_corpus, num_topics=20, random_state=1000, id2word=common_dictionary)

""" Part 2: Text Analysis ----> method for finding the top 3 average scores per topic
    and the top 10 tokens for those top 3 """
def Top3_and_Top10(corpora_doc_list):
    all_list = []
    for q in corpora_doc_list:
        each = common_dictionary.doc2bow(q)
        all_list.append(lda.get_document_topics(bow=each, minimum_probability=0.00))
    flatten = itertools.chain.from_iterable
    score_collection = list(flatten(all_list))
# Log to file (you'll probably want to delete this after)
import os
import logging

f_path = os.path.join(os.getenv('DATA_PATH'), 'interim', 'onsite_search_gensim.log')
logging.basicConfig(filename=f_path,
                    format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

# Running a test first...

# Latent Dirichlet Allocation (LDA) model
from gensim.models import LdaModel

ldamodel = LdaModel(
    corpus=df_search_terms.corpus.dropna().tolist()[:10],
    num_topics=5,
    id2word=dictionary,
)

f_path

# IPython shell alias (notebook cell)
cat '/Volumes/GoogleDrive/My Drive/ga-data-mining/data/interim/onsite_search_gensim.log'

# Training on the full dataset (running externally `onsite_search_gensim_lda.py`)

# Latent Dirichlet Allocation (LDA) model
# from gensim.models import LdaModel
                                         dictionary=dictionary,
                                         coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return topic_list, coherence_values


# Code starts here
topic_list, coherence_value_list = compute_coherence_values(dictionary=dictionary,
                                                            corpus=doc_term_matrix,
                                                            texts=doc_clean,
                                                            limit=41,
                                                            start=1,
                                                            step=5)
print(topic_list)

max_index = coherence_value_list.index(max(coherence_value_list))
print(max_index)

opt_topic = topic_list[max_index]
print(opt_topic)

lda_model = LdaModel(corpus=doc_term_matrix,
                     num_topics=opt_topic,
                     id2word=dictionary,
                     iterations=10,
                     passes=30,
                     random_state=0)

# printing the topics
pprint(lda_model.print_topics(5))
# Run LDA on the word-segmented text
import codecs
from gensim import corpora
from gensim.models import LdaModel

train = []
# Read the word segmentation results
fp = codecs.open('fenci_result.txt', 'r', encoding='utf-8')
for line in fp:
    line = line.split()
    train.append([w for w in line])

dictionary = corpora.Dictionary(train)
corpus = [dictionary.doc2bow(text) for text in train]
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=5, passes=20)

for topic in lda.print_topics(num_words=2):
    termNumber = topic[0]
    print(topic[0], ':', sep='')
    listOfTerms = topic[1].split('+')
    for term in listOfTerms:
        listItems = term.split('*')
        print('  ', listItems[1], '(', listItems[0], ')', sep='')
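# An alternative sketch (not in the original snippet): gensim's show_topic() already returns
# (word, probability) pairs, so the string parsing of print_topics() output above can be avoided.
for topic_id in range(lda.num_topics):
    print(topic_id, ':', sep='')
    for word, prob in lda.show_topic(topic_id, topn=2):
        print('  ', word, '(', prob, ')', sep='')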
import pandas as pd
import pickle
import numpy as np
import gensim
from gensim.models import LdaModel

# ## Load cleaned text, dictionary, corpus
CLEAN_DATA_PATH = '/Users/richardkuzma/coding/analysis/monster/data/cleaned/'
jobs_cleaned_filename = 'monster_jobs_cleaned_text.pkl'
with open(CLEAN_DATA_PATH + jobs_cleaned_filename, 'rb') as f:
    jobs_cleaned = pickle.load(f)

jobs_dict_filename = 'monster_jobs_dict.pkl'
with open(CLEAN_DATA_PATH + jobs_dict_filename, 'rb') as f:
    dictionary = pickle.load(f)

jobs_corpus_filename = 'monster_jobs_corpus.pkl'
with open(CLEAN_DATA_PATH + jobs_corpus_filename, 'rb') as f:
    jobs_corpus = pickle.load(f)

"""Select number of topics"""
num_topics = 20

### make and save model
print('Making LDA model with np version {}'.format(np.__version__))
model = LdaModel(corpus=jobs_corpus, num_topics=num_topics, id2word=dictionary)

print('Saving model..')
MODEL_PATH = '/Users/richardkuzma/coding/analysis/monster/models/'
filename = 'LDA_' + str(num_topics) + '_topics.model'
model.save(MODEL_PATH + filename)
print('Saved model.\nPath: ' + MODEL_PATH + '\nname: ' + filename)
'''
LDA using gensim
'''
# Count words in the 'objective', keeping only those that occur at least 5 times
vectorizer = fe.text.CountVectorizer(stop_words='english', min_df=5)
X = vectorizer.fit_transform(h2020.objective)

# Convert to gensim format
corpus = Sparse2Corpus(X, documents_columns=False)

# Create mapping from word IDs (integers) to words (strings)
id2word = dict(enumerate(vectorizer.get_feature_names()))

# Fit LDA model with 10 topics
lda = LdaModel(corpus=corpus, num_topics=10, id2word=id2word)

# Show top 5 words for each of the 10 topics
lda.show_topics(num_topics=10, num_words=5)

'''
word2vec using gensim
'''
# Convert adjectives and verbs to their corresponding lemmas using spaCy
objectives = [
    [x.lemma_ if x.pos == spacy.parts_of_speech.ADJ or
                 x.pos == spacy.parts_of_speech.VERB
     else x.text
     for x in en(text)]
    for text in h2020.objective
]
def topic_mining(self, _active_dataset):
    """
    Internal function to run topic mining and save the trained models
    @params:
        _active_dataset - Required : name of the active dataset (str)
    """
    # TODO: move saving to files into the models layer
    print("topic mining start..")
    vectorizer = TfidfVectorizer(max_df=0.5, max_features=500, min_df=2,
                                 stop_words='english', use_idf=True)
    _cousines, _reviews = self.data_service.get_reviews_for_cousines()
    print("text uploaded")
    text = _reviews
    X = vectorizer.fit_transform(text)
    print("text transformed")

    # mapping from feature id to actual word
    id2words = {}
    for i, word in enumerate(vectorizer.get_feature_names()):
        id2words[i] = word
    corpus = matutils.Sparse2Corpus(X, documents_columns=False)

    print("train LDA models")
    #####################################################################
    _model_name = "LDA10"
    self.modelLDA_10 = LdaModel(corpus, num_topics=10, id2word=id2words)
    self.model_save(self.modelLDA_10, _model_name, _active_dataset)
    _cousines2topics = self.modelLDA_10.get_document_topics(corpus, minimum_probability=0)
    _topics2cousines = []
    for i, _topics_weight in enumerate(_cousines2topics):
        _topics2cousines.append([_cousines[i], _topics_weight])
    with open(config.path2data + _active_dataset + "." + _model_name + "_"
              + config.path2topics2cousines, 'wb') as f:
        pickle.dump(_topics2cousines, f)

    _rests_topics = []
    _rests = self.data_service.get_rests()
    for _rest in _rests:
        _cousines = self.data_service.get_cousines_for_rest(_rest[0])
        _rest_vector = []
        for _rc in _cousines:
            for _c in _topics2cousines:
                if _c[0] == _rc:
                    if not _rest_vector:
                        # _rest_vector = _c[1]
                        for _t, _w in _c[1]:
                            _rest_vector.append([_t, float(_w)])
                    else:
                        for _t, _w in _c[1]:
                            _rest_vector[_t][1] = (_rest_vector[_t][1] + float(_w)) / 2
        _rests_topics.append([_rest, _rest_vector, _cousines])
    with open(config.path2data + _active_dataset + "." + _model_name + "_"
              + config.path2rests2topics, 'wb') as f:
        pickle.dump(_rests_topics, f)

    #####################################################################
    _model_name = "LDA15"
    self.modelLDA_15 = LdaModel(corpus, num_topics=15, id2word=id2words)
    self.model_save(self.modelLDA_15, _model_name, _active_dataset)
    _cousines, _reviews = self.data_service.get_reviews_for_cousines()
    _cousines2topics = self.modelLDA_15.get_document_topics(corpus, minimum_probability=0)
    _topics2cousines = []
    for i, _topics_weight in enumerate(_cousines2topics):
        _topics2cousines.append([_cousines[i], _topics_weight])
    with open(config.path2data + _active_dataset + "." + _model_name + "_"
              + config.path2topics2cousines, 'wb') as f:
        pickle.dump(_topics2cousines, f)

    _rests_topics = []
    _rests = self.data_service.get_rests()
    for _rest in _rests:
        _cousines = self.data_service.get_cousines_for_rest(_rest[0])
        _rest_vector = []
        for _rc in _cousines:
            for _c in _topics2cousines:
                if _c[0] == _rc:
                    if not _rest_vector:
                        # _rest_vector = _c[1]
                        for _t, _w in _c[1]:
                            _rest_vector.append([_t, float(_w)])
                    else:
                        for _t, _w in _c[1]:
                            _rest_vector[_t][1] = (_rest_vector[_t][1] + float(_w)) / 2
        _rests_topics.append([_rest, _rest_vector, _cousines])
    with open(config.path2data + _active_dataset + "."
+ _model_name + "_" + config.path2rests2topics, 'wb') as f: pickle.dump(_rests_topics, f) ##################################################################### _model_name = "LDA20" self.modelLDA_20 = LdaModel(corpus, num_topics=20, id2word=id2words) self.model_save(self.modelLDA_20, _model_name, _active_dataset) _cousines, _reviews = self.data_service.get_reviews_for_cousines() _cousines2topics = self.modelLDA_20.get_document_topics( corpus, minimum_probability=0) _topics2cousines = [] for i, _topics_weight in enumerate(_cousines2topics): _topics2cousines.append([_cousines[i], _topics_weight]) with open(config.path2data + _active_dataset + "." + _model_name + "_" + config.path2topics2cousines, 'wb') as f: pickle.dump(_topics2cousines, f) _rests_topics = [] _rests = self.data_service.get_rests() for _rest in _rests: _cousines = self.data_service.get_cousines_for_rest(_rest[0]) _rest_vector = [] for _rc in _cousines: for _c in _topics2cousines: if _c[0] == _rc: if not _rest_vector: # _rest_vector = _c[1] for _t, _w in _c[1]: _rest_vector.append([_t, float(_w)]) else: for _t, _w in _c[1]: _rest_vector[_t][1] = ( _rest_vector[_t][1] + float(_w)) / 2 _rests_topics.append([_rest, _rest_vector, _cousines]) with open(config.path2data + _active_dataset + "." + _model_name + "_" + config.path2rests2topics, 'wb') as f: pickle.dump(_rests_topics, f) ################################################################### print("TRAIN MODELS DONE") return self.modelLDA_10
from gensim.models import LdaModel

num_topics = 10
chunksize = 1500
passes = 20
iterations = 400
eval_every = None

temp = dictionary[0]  # only to "load" the dictionary
id2word = dictionary.id2token

model = LdaModel(corpus=corpus,
                 id2word=id2word,
                 chunksize=chunksize,
                 alpha='auto',
                 eta='auto',
                 iterations=iterations,
                 num_topics=num_topics,
                 passes=passes,
                 eval_every=eval_every)

top_topics = model.top_topics(corpus)

from pprint import pprint
pprint(top_topics)

import pyLDAvis.gensim
lda_display = pyLDAvis.gensim.prepare(model, corpus, dictionary)
        prev_word = word
    processed_texts.append(processed_text)

# LDA MODEL
dictionary = Dictionary(processed_texts)
corpus = [dictionary.doc2bow(doc) for doc in processed_texts]

# MODEL TRAINING
num_topics = 1
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=num_topics,
    iterations=5,
    passes=10,
    alpha='auto'
)

# DUMP DATA TO FILES
word_dict = {}
today = date.today()
today_path = '../data/topic_today_EN.csv'
hist_path = '../data/topic_history_EN.csv'

for i in range(num_topics):
    words = lda_model.show_topic(i, topn=10)
    word_dict['date'] = today
    word_dict['Topic'] = [i[0] for i in words]
# for doc in corpus:
#     for word, freq in doc:
#         if word not in idf:
#             idf[word] = 0
#         idf[word] = idf[word] + freq
# print(sorted([(id2word[wid], fr) for wid, fr in idf.items()], key=lambda x: x[1], reverse=True))

# FINAL LDA MODEL
num_topics = 46
num_keywords = 100

lda_model = LdaModel(corpus=corpus,
                     id2word=id2word,
                     num_topics=num_topics,
                     update_every=1,
                     eval_every=1,
                     chunksize=4000,
                     passes=20,
                     iterations=100,
                     alpha='auto',
                     eta='auto',
                     random_state=42)

print("Run for %d topics in %.2f mins" % (num_topics, (perf_counter() - start) / 60))
start = perf_counter()

# pprint(lda_model.print_topics(num_topics=num_topics, num_words=num_keywords))
print("\n\n")
topics_shown = lda_model.show_topics(num_topics=num_topics, num_words=num_keywords, formatted=False)
def return_suggested_articles(request):
    """
    Returns suggested articles based on the topic of the one currently being viewed

    Parameters
    ----------
    request : flask.Request
        The request object

    Returns
    -------
    JSON of Google search queries for articles to read
    """
    # get the requested json for the webpage
    request_json = request.get_json(silent=True)

    # get the headline and article
    headline = request_json['headline']
    article = request_json['article']

    print('requested json headline and article text')

    # make into one text file
    combined_article = headline + '. ' + article

    # set to 1 for single-document LDA, 0 for tf-idf
    do_single_document_LDA = 1

    # number of query words to return
    n_search_words = 5

    # can identify ngrams, but slows down performance
    do_ngrams = 1

    ### SINGLE DOC LDA PARAMS
    # set the number of topics to generate (5 seems to work pretty well)
    num_lda_topics = 5

    # set the number of passes
    n_passes = 10

    # whether to avoid repeated words (only relevant if num_lda_topics > 1)
    do_unique_search_words = 0

    print('Downloading stop words')

    # download stopwords list
    # if use_bucket:
    download_blob('debiaser_data', 'sw1k.csv', '/tmp/sw1k.csv')

    # load stop words into pandas and then into a list
    stop_words = pd.read_csv('/tmp/sw1k.csv')

    # remove from memory
    os.remove('/tmp/sw1k.csv')

    stop_words = stop_words['term']
    stop_words = [word for word in stop_words]

    # adding some custom words
    stop_words.append('said')
    stop_words.append('youre')
    stop_words.append('mph')
    stop_words.append('inc')
    stop_words.append('cov')
    stop_words.append('jr')
    stop_words.append('dr')
    stop_words.append('ads')
    stop_words.append('cookies')
    stop_words.append('factset')

    print('Downloading news organizations from AllSidesMedia')

    # download all_sides_media list
    # if use_bucket:
    download_blob('debiaser_data', 'allsides_final_plus_others_with_domains.csv',
                  '/tmp/allsides_final_plus_others_with_domains.csv')

    # load domain names into a dataframe and then get only the names and domains
    all_sides = pd.read_csv('/tmp/allsides_final_plus_others_with_domains.csv')

    # remove from memory
    os.remove('/tmp/allsides_final_plus_others_with_domains.csv')

    # get the domain
    # all_sides_names = all_sides['name']
    all_sides_domains = all_sides['domain']
    # all_sides_names_domains = pd.concat([all_sides_names, all_sides_domains], axis=1)

    # get dictionary of entities in article
    # entity_dict = entity_recognizer(combined_article, nlp)

    if do_single_document_LDA:
        print('splitting article into sentences')
        # break up into sentences
        combined_article = tokenize.sent_tokenize(combined_article)
    else:
        # make into a one-element list for downstream processing
        combined_article = [combined_article]

    print('pre processing article text')
    # process article
    article_processed = process_all_articles(combined_article, nlp)

    print('removing stopwords')
    # remove stopwords
    article_processed = remove_stopwords(article_processed, stop_words)

    # floor for the frequency of words to remove
    # word_frequency_threshold = 1

    # get corpus, dictionary, bag of words
    # processed_corpus, processed_dictionary, bow_corpus = get_simple_corpus_dictionary_bow(article_processed,
    #                                                                                       word_frequency_threshold)

    if do_single_document_LDA:
        if do_ngrams:
            # load bigram trigram quadgram models
            bigram_mod_fname = '/tmp/bigram_mod.pkl'
            trigram_mod_fname = '/tmp/trigram_mod.pkl'
            quadgram_mod_fname = '/tmp/quadgram_mod.pkl'

            download_blob('debiaser_data', 'bigram_mod.pkl', bigram_mod_fname)
            download_blob('debiaser_data', 'trigram_mod.pkl', trigram_mod_fname)
            download_blob('debiaser_data', 'quadgram_mod.pkl', quadgram_mod_fname)
            with open(bigram_mod_fname, 'rb') as pickle_file:
                bigram_mod = pickle.load(pickle_file)
            with open(trigram_mod_fname, 'rb') as pickle_file:
                trigram_mod = pickle.load(pickle_file)
            with open(quadgram_mod_fname, 'rb') as pickle_file:
                quadgram_mod = pickle.load(pickle_file)

            print('FINDING QUADGRAMS')
            # make up to quad grams
            article_processed = make_quadgrams(article_processed, bigram_mod, trigram_mod, quadgram_mod)

            # remove to free memory
            os.remove(bigram_mod_fname)
            os.remove(trigram_mod_fname)
            os.remove(quadgram_mod_fname)

        print('generating dictionary and bag of words vector...')
        start = time.process_time()
        processed_corpus, processed_dictionary, bow_corpus = get_simple_corpus_dictionary_bow(article_processed)
        print('TIME FOR GENERATING DICTIONARY AND BOW VECTOR')
        print(time.process_time() - start)

        print('generating lda model...')
        start = time.process_time()
        # generate the LDA model
        lda = LdaModel(corpus=bow_corpus,
                       num_topics=num_lda_topics,
                       id2word=processed_dictionary,
                       passes=n_passes)
        print('TIME FOR GENERATING LDA MODEL')
        print(time.process_time() - start)

        # get the topics from the lda model
        lda_topics = lda.show_topics(formatted=False)

        # ALL INTERESTING BUT DEPRECATED FOR NOW
        # WILL FOLLOW SIMPLER APPROACH:
        # just take the top word in each generated topic

        # get top words per topic
        lda_top_topic_words_string, lda_top_topic_words_list = get_lda_top_topic_words(
            lda_topics, num_lda_topics, do_unique_search_words, n_search_words)

    # doing tfidf
    else:
        # specify file name
        tfidf_matrix_filename = '/tmp/tfidf_matrix.pkl'

        # download the tfidf matrix
        print('DOWNLOADING TFIDF MODEL')
        download_blob('debiaser_data', 'tfidf_matrix.pkl', tfidf_matrix_filename)

        with open(tfidf_matrix_filename, 'rb') as pickle_file:
            tfidf = pickle.load(pickle_file)

        # remove from memory
        os.remove(tfidf_matrix_filename)

        if do_ngrams:
            # load bigram trigram quadgram models
            bigram_mod_fname = '/tmp/bigram_mod.pkl'
            trigram_mod_fname = '/tmp/trigram_mod.pkl'
            quadgram_mod_fname = '/tmp/quadgram_mod.pkl'

            download_blob('debiaser_data', 'bigram_mod.pkl', bigram_mod_fname)
            download_blob('debiaser_data', 'trigram_mod.pkl', trigram_mod_fname)
            download_blob('debiaser_data', 'quadgram_mod.pkl', quadgram_mod_fname)

            with open(bigram_mod_fname, 'rb') as pickle_file:
                bigram_mod = pickle.load(pickle_file)
            with open(trigram_mod_fname, 'rb') as pickle_file:
                trigram_mod = pickle.load(pickle_file)
            with open(quadgram_mod_fname, 'rb') as pickle_file:
                quadgram_mod = pickle.load(pickle_file)

            # make up to quad grams
            combined_article = make_quadgrams(combined_article, bigram_mod, trigram_mod, quadgram_mod)

            # remove to free memory
            os.remove(bigram_mod_fname)
            os.remove(trigram_mod_fname)
            os.remove(quadgram_mod_fname)

        # download dictionary
        id2word_fname = '/tmp/id2word.pkl'
        download_blob('debiaser_data', 'id2word_ec2.pkl', id2word_fname)

        with open(id2word_fname, 'rb') as pickle_file:
            processed_dictionary = pickle.load(pickle_file)

        # remove to free memory
        os.remove(id2word_fname)

        print('GENERATING BOW VECTOR FOR ARTICLE')
        # get bag of words representation
        bow_corpus_article = [processed_dictionary.doc2bow(text) for text in combined_article]

        print('GETTING TF IDF SCORE')
        tfidf_vector = tfidf[bow_corpus_article[0]]

        # sort the tfidf vector
        tfidf_vector = sorted(tfidf_vector, key=getKey, reverse=True)

        # if there are fewer words than search words, then just use how many words there are
        if len(tfidf_vector) < n_search_words:
            n_search_words = len(tfidf_vector)

        top_tfidf_values = [tfidf_vector[i][0] for i in range(0, n_search_words)]
        print(top_tfidf_values)
        top_words_list = [processed_dictionary[i].replace("_", " ") for i in top_tfidf_values]

        top_words_string = ' '
        for word in top_words_list:
            if word not in top_words_string:
                top_words_string += ' ' + word

    # get dictionary of google queries
    queries_dict = {}

    for domain in all_sides_domains:
        # if this is single document lda
        if do_single_document_LDA:
            query = 'www.news.google.com/search?q=site:' + domain + lda_top_topic_words_string
        # if this is tfidf
        else:
            query = 'www.news.google.com/search?q=site:' + domain + top_words_string

        queries_dict[domain] = query

    return json.dumps(queries_dict)
## Remove infrequent and overly frequent words
# filter words that appear too rarely (no_below, an absolute count, here 5)
# or too often (no_above, a fraction of documents, here more than 75%)
dictionary.filter_extremes(no_below=5, no_above=0.75)

corpus = [dictionary.doc2bow(preprocess(review)) for review in reviews]

#### LDA ####
lda_model = LdaModel(
    corpus=corpus,  # this runs your LDA
    id2word=dictionary,
    random_state=100,
    num_topics=10,
    passes=5,
    chunksize=10000,
    alpha='asymmetric',
    decay=0.5,
    offset=64,
    eta=None,
    eval_every=0,
    iterations=100,
    gamma_threshold=0.001,
    per_word_topics=True)

## See the topics
lda_model.print_topics(-1)               # this allows you to observe the topics
lda_model.get_topic_terms(0, topn=10)    # this provides the top 10 words in topic 0
lda_model.log_perplexity(corpus)         # this computes the log perplexity
lda_model.get_document_topics(corpus[0])
with open(os.path.join(path, 'data.tsv'), encoding='utf8') as f:
    reader = csv.reader(f, delimiter="\t")
    for line in reader:
        labels = line[0].split(', ')
        multi_hot_labels.append(labels)
        c = line[1:]
        c = clean_data(c)
        context.extend(c)

# convert to multi-hot encoding
mlb = MultiLabelBinarizer()
labels = mlb.fit_transform(multi_hot_labels)
label_list = list(mlb.classes_)

token_context = [word_tokenize(x) for x in context]
token_list = []
for x in token_context:
    temp = [i for i in x if i not in stop_words]
    token_list.append(temp)
token_context = [clean_data(x) for x in token_list]
del token_list

common_dictionary = Dictionary(token_context)
common_corpus = [common_dictionary.doc2bow(text) for text in token_context]

# Train the model on the corpus.
lda = LdaModel(common_corpus, id2word=common_dictionary, alpha='auto', num_topics=3, passes=5)
print(lda.show_topic(2, 20))