def lda_and_SVC(train_group, train_group_label, test_data, test_label, num_topics):
    dct, bow = create_bow(train_group)
    lda_model = LdaModel(corpus=bow, num_topics=num_topics, id2word=dct)

    document_topics = []
    for doc_bow in bow:
        ls = []
        for top, prob in lda_model.get_document_topics(bow=doc_bow, minimum_probability=0.0):
            ls.append(prob)
        document_topics.append(ls)

    t_document_topics = []
    for doc in test_data:
        doc_bow = dct.doc2bow(doc)
        ls = []
        for top, prob in lda_model.get_document_topics(bow=doc_bow, minimum_probability=0.0):
            ls.append(prob)
        t_document_topics.append(ls)

    clf = svm.LinearSVC()
    clf.fit(document_topics, train_group_label)
    y_pred = clf.predict(t_document_topics)
    return metrics.accuracy_score(test_label, y_pred)
def topic_analysis(df, nTopics=5, cleanTextCol='cleaned_text'):
    df[cleanTextCol] = df[cleanTextCol].fillna('')
    cleandata = df[cleanTextCol].fillna('').apply(lambda x: x.split(' '))
    dictionary = corpora.Dictionary(cleandata)
    tokens = [dictionary.doc2bow(d) for d in cleandata]
    model = LdaModel(tokens, num_topics=nTopics, id2word=dictionary, update_every=1,
                     chunksize=50, passes=10, per_word_topics=True, alpha='auto')

    docweights = [model.get_document_topics(t, minimum_probability=0) for t in tokens]
    doctopics = pd.DataFrame(docweights).apply(lambda x: x.apply(lambda y: y[-1] if y else 0))
    doctopics.columns = [f'topic{n+1}' for n in doctopics.columns]
    doctopics['KeyTopic'] = doctopics.apply(lambda y: doctopics.columns[y == y.max()][0], axis=1)

    # create topicdescribe
    topics = model.show_topics(num_words=6)
    keywords = [re.findall(r'\*"(.*?)"', d[1]) for d in topics]
    weights = [re.findall(r'([\d\.]+)\*', d[1]) for d in topics]
    kwdf = pd.DataFrame(keywords, columns=[f'keyword_{n}' for n in range(len(keywords[0]))])
    wtdf = pd.DataFrame(weights, columns=[f'weight_{n}' for n in range(len(weights[0]))])
    topicDescribe = kwdf.merge(wtdf, left_index=True, right_index=True)
    topicDescribe = topicDescribe[sorted(topicDescribe.columns, key=lambda x: x.split('_')[-1])]
    topicDescribe['KeyTopic'] = [f'topic{n+1}' for n in range(len(topics))]
    topicDescribe['TopicKeywords'] = [' '.join(k) for k in keywords]
    topicDescribe['DocCount'] = doctopics['KeyTopic'].value_counts().sort_index().values
    topicDescribe = topicDescribe[['KeyTopic'] + [col for col in topicDescribe.columns if col != 'KeyTopic']]

    doctopics = doctopics.merge(topicDescribe[['KeyTopic', 'TopicKeywords']], on='KeyTopic', how='left')
    return doctopics, topicDescribe, model, tokens, dictionary
def do_cluster(obj, query):
    texts = [article['title'] for article in obj]
    processor = Processor(query)
    tokens = [processor.get_tokens(text) for text in texts]
    dictionary = corpora.Dictionary(tokens)
    corpus = [dictionary.doc2bow(token) for token in tokens]

    num_clusters = len(texts) // 5  # num_topics must be an integer
    model = LdaModel(corpus, num_topics=num_clusters, id2word=dictionary,
                     update_every=5, chunksize=10000, passes=50)
    # size 10
    topic_matrix = model.show_topics(formatted=False, num_topics=num_clusters)
    clusters = [{
        "keywords": [str(word) for word, _ in topic[1]],
        "articles": []
    } for topic in topic_matrix]

    for i, document in enumerate(corpus):
        topic = np.array(model.get_document_topics(document))
        cluster = int(topic[np.argmax(topic[:, 1])][0])
        clusters[cluster]['articles'].append(obj[i])
    return clusters
def get_topics(data, filepath='./data/spam_topics.pkl'):
    if not os.path.exists(filepath):
        import pyLDAvis.gensim
        from gensim.corpora import Dictionary
        from gensim.models import LdaModel, CoherenceModel

        texts = [sample['lemmas'] for sample in data]
        dictionary = Dictionary(texts)
        dictionary.filter_extremes(no_below=20, no_above=0.4)
        corpus = [dictionary.doc2bow(text) for text in texts]

        chunksize = 500
        passes = 5
        iterations = 400
        eval_every = None

        temp = dictionary[0]  # This is only to "load" the dictionary.
        id2word = dictionary.id2token

        best_coherence = 0
        best_model_filepath = ''
        for num_topics in list(range(2, 20)):
            for alpha in ['asymmetric', 'symmetric']:
                for eta in ['symmetric', 'auto']:
                    # use a separate name so the output pickle path in `filepath`
                    # is not overwritten by the grid-search model paths
                    model_filepath = 'out/topics/{}_{}_{}'.format(num_topics, alpha, eta)
                    model = LdaModel(corpus=corpus, id2word=id2word, chunksize=chunksize,
                                     alpha=alpha, eta=eta, iterations=iterations,
                                     num_topics=num_topics, passes=passes,
                                     eval_every=eval_every)
                    coherence = float(CoherenceModel(model=model, texts=texts,
                                                     dictionary=dictionary,
                                                     coherence='c_v').get_coherence())
                    model_filepath += '_{:.4f}'.format(coherence)
                    model.save(model_filepath + '_model.pkl')
                    prepared = pyLDAvis.gensim.prepare(model, corpus, dictionary)
                    pyLDAvis.save_html(prepared, model_filepath + '_plot.html')
                    if coherence > best_coherence:
                        best_coherence = coherence
                        best_model_filepath = model_filepath + '_model.pkl'

        model = LdaModel.load(best_model_filepath)
        print('Best model: {}'.format(best_model_filepath))

        topics = [x[0] for x in model.top_topics(corpus=corpus, texts=texts,
                                                 dictionary=dictionary, topn=100)]
        data_topics = []
        for i, text in enumerate(texts):
            data_topics.append({k: v for k, v in model.get_document_topics(
                dictionary.doc2bow(text), minimum_probability=0.0)})
        pickle.dump([topics, data_topics], open(filepath, 'wb'))
    else:
        [topics, data_topics] = pickle.load(open(filepath, 'rb'))

    for i in range(len(data_topics)):
        data[i]['topics'] = data_topics[i]
    return topics, data
def get_document_topics_from_model(bow, lda: LdaModel) -> Dict[int, float]:
    """
    A method used concurrently in create_document_topics

    :param bow: the bag-of-words representation of a document
    :param lda: the lda model
    :return: a dict with the topics in the given document based on the lda model
    """
    query = lda.get_document_topics(bow, minimum_probability=0.0)  # 1/K is alternative threshold
    return dict(query)
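# Added sketch (not part of the original snippets): the comment above mentions 1/K as an
# alternative cutoff. One way to apply it is to pass minimum_probability=1/num_topics so
# only topics with at least the uniform share of probability mass are returned. The helper
# name below is hypothetical; `lda` and `bow` are assumed to be a trained gensim LdaModel
# and a doc2bow vector, as in the function above.
def get_salient_document_topics(bow, lda: LdaModel) -> Dict[int, float]:
    threshold = 1.0 / lda.num_topics  # 1/K cutoff instead of 0.0
    return dict(lda.get_document_topics(bow, minimum_probability=threshold))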
def Lda_topic_model(docs, dictionary, nb_topics, true_labels):
    k = 5
    lda = LdaModel(docs, num_topics=k, id2word=dictionary, passes=10)
    top_words = [[word[::-1] for word, _ in lda.show_topic(topic_id, topn=50)]
                 for topic_id in range(lda.num_topics)]
    top_betas = [[beta for _, beta in lda.show_topic(topic_id, topn=50)]
                 for topic_id in range(lda.num_topics)]

    nb_words = 12
    f, ax = plt.subplots(3, 2, figsize=(20, 15))
    for i in range(nb_topics):
        # ax = plt.subplot(gs[i])
        m, n = np.unravel_index(i, shape=(3, 2))[0], np.unravel_index(i, shape=(3, 2))[1]
        ax[m, n].barh(range(nb_words), top_betas[i][:nb_words],
                      align='center', color='green', ecolor='black')
        ax[m, n].invert_yaxis()
        ax[m, n].set_yticks(range(nb_words))
        ax[m, n].set_yticklabels(top_words[i][:nb_words])
        ax[m, n].set_title("Topic " + str(i))
    plt.show()

    # get distribution of docs on topics.
    dist_on_topics = lda.get_document_topics(docs)
    topic_predict = []
    for d in dist_on_topics:
        p = 0
        win_topic = 0
        print(d)
        for i, t in enumerate(d):
            if t[1] > p:
                p = t[1]
                win_topic = t[0]
        print(win_topic)
        topic_predict.append(win_topic)

    mat = confusion_matrix(true_labels, topic_predict)
    print(mat)
    cluster_to_class = {}
    for i in range(5):
        cluster_to_class[i] = np.argmax(mat[:, i])
    custom_labels = [cluster_to_class[c] for c in topic_predict]
    print("accuracy:", accuracy_score(true_labels, custom_labels))
    print("f1_score micro: ", f1_score(true_labels, custom_labels, average='micro'))
    print("f1_score: macro", f1_score(true_labels, custom_labels, average='macro'))
    print("NMI", NMI(true_labels, custom_labels))
def calculate_fitness(self, gene):
    # Make LDA model
    self.fitness_budget -= 1
    lda = LdaModel(corpus=self.corpus, id2word=self.dictionary,
                   num_topics=gene.n, alpha=gene.a)
    if self.objective == 'coherence':
        cm = CoherenceModel(model=lda, corpus=self.corpus, coherence='u_mass')
        result = cm.get_coherence()
    elif self.objective == 'silhouette':
        labels = []
        word_cntLst = []
        if (len(self.corpus) < 2):
            gene.set_fitness(-99)
            return -99
        for text in self.corpus:
            # Make label list
            topic_probLst = lda.get_document_topics(text)
            if (len(topic_probLst) == 0):
                print("LDA is f****d")
                print("GA.py gene.a = ", gene.a)
                if (0 in gene.a):
                    print("calculate fitness: Zero in a")
                if (0 in gene.b):
                    print("calculate fitness: Zero in b")
                gene.set_fitness(-99)
                return -99
            labels.append(max(topic_probLst, key=lambda tup: tup[1])[0])
            # Make word count list
            words = [0] * self.vocab_size
            for tup in text:
                words[tup[0]] = tup[1]
            word_cntLst.append(words[:])
        # Calculate silhouette score
        if (len(np.unique(labels)) < 2):
            gene.set_fitness(-99)
            return -99
        result = metrics.silhouette_score(word_cntLst, labels, metric='cosine')
    gene.set_fitness(result)
    return result
class LDA():
    def __init__(self, K, data, AMask, params, name, dataName):
        self.K = K          # [int] nb of topics
        self.AMask = AMask  # [n_a,n_d float] matrix of author participation to each paper (1 if author participated to paper)
        self.n_a, self.n_d = self.AMask.shape  # [int] nb authors
        self.D = data
        self.n_dic, self.n_d = self.D.shape
        self.name = name
        self.train_C_ = []
        self.train_param = params['train_param']
        for d in range(self.n_d):
            self.train_C_.append([(k, self.D[k, d]) for k in range(self.n_dic)])
        self.dataName = dataName

    def train(self):
        self.LDA = LdaModel(self.train_C_, num_topics=self.K, decay=0.5, offset=1024, passes=80)
        self.phi = self.LDA.get_topics().transpose()
        self.theta = np.zeros((self.K, self.n_d))
        for d in range(self.n_d):
            tmp = self.LDA.get_document_topics(self.train_C_[d])
            ind = [c for (c, b) in tmp]
            self.theta[ind, d] = [b for (c, b) in tmp]
        self.D_reb = self.phi.dot(self.theta)
        self.A = normalize(self.AMask, 'l1', 0)
        return ()

    def save(self, path):
        ''' path example '''
        toSave = {}
        toSave['theta'] = self.theta
        toSave['phi'] = self.phi
        toSave['A'] = self.A
        toSave['K'] = self.K
        toSave['train_param'] = self.train_param
        with open(path + self.name + '_' + self.dataName + '.pkl', 'wb') as output:
            pickle.dump(toSave, output, pickle.HIGHEST_PROTOCOL)
class TopicModel(object):
    def __init__(self, documents, cut=True, num_topics=10, min_length=1):
        from cla.util.util import CutDocument
        from gensim.corpora import Dictionary
        from gensim.models import LdaModel

        self.document = CutDocument(documents, cut, cleanup=True, min_length=min_length)
        self.dictionary = Dictionary(self.document)
        self.model = LdaModel(BowCorpus(self.document, self.dictionary),
                              id2word=self.dictionary, num_topics=num_topics)

    def topic_words(self, topic_id, limit=10):
        return self.model.show_topic(topicid=topic_id, topn=limit)

    def identify_topic(self, words):
        return self.model.get_document_topics(self.dictionary.doc2bow(words))
def BasicLDA(doclist, num_topics):
    start = time.perf_counter()
    num_topics = num_topics
    texts = clean(doclist)
    print(texts[1])
    # frequency = {}
    # for text in texts:
    #     for token in text:
    #         if token not in frequency:
    #             frequency[token] = 0
    #         else:
    #             frequency[token] += 1
    dictionary = corpora.Dictionary(texts)
    size_dictionary = len(dictionary)
    corpus = [dictionary.doc2bow(text) for text in texts]
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics,
                   chunksize=500, passes=10, iterations=100)

    topics = []
    for i in lda.show_topics(num_topics=-1, num_words=20):
        print(i)
        topics.append(i)

    for i in lda.get_document_topics(corpus):  # i is each document's topic distribution, in bag-of-words order
        s = str(i)
        pattern1 = r'\((\d+),'
        a = re.findall(pattern1, s)
        print(a)  # extract the topic ids contained in this document
        word_list = []  # stores all topics contained in the current document
        for idx in a:  # take the topic id
            w = topics[int(idx)]  # take that topic's word distribution
            word_list.append(w)  # store the word distributions in topic order
        l = [list(k)[1] for k in i]  # list(k)[1] is the probability of each topic
        doc2top = {}
        for num in range(len(l)):
            doc2top[l[num]] = word_list[num]
        print(doc2top)
        break
        # print(list(chain.from_iterable(zip(l, word_list))))

    elapsed = time.perf_counter() - start
    return lda, corpus, dictionary, size_dictionary, elapsed
class MyLda:
    def __init__(self, myDictionary, num_topics=100, topic_threshold=0.15):
        self.num_topics = num_topics
        self.topic_threshold = topic_threshold
        self.myDictionary = myDictionary
        self.model = LdaModel(self.myDictionary.doc2bows,
                              id2word=self.myDictionary.dictionary,
                              num_topics=num_topics)
        self.topic2ids, self.id2topics = self.get_mappings()
        self.coherenceModel = None
        print("- Created MyLda with {} topics".format(self.num_topics))

    def get_mappings(self):
        topic2ids, id2topics = defaultdict(list), defaultdict(list)
        for i, doc2bow in enumerate(self.myDictionary.doc2bows):
            topic_pairs = self.model.get_document_topics(doc2bow)
            for j, (topic, prob) in enumerate(topic_pairs):
                if prob >= self.topic_threshold or j == 0:
                    topic2ids[topic].append(i)
                    id2topics[i].append(topic)
        return topic2ids, id2topics

    def get_topic_terms(self, topic):
        terms = self.model.get_topic_terms(topic)
        return terms

    def get_top_topic(self):
        top_topics = self.model.top_topics(corpus=self.myDictionary.doc2bows)
        average = sum([t[1] for t in top_topics]) / self.num_topics
        return top_topics, average

    def get_perplexity(self):
        return self.model.log_perplexity(self.myDictionary.doc2bows)

    def get_coherence(self):
        if not self.coherenceModel:
            self.coherenceModel = CoherenceModel(model=self.model,
                                                 corpus=self.myDictionary.doc2bows,
                                                 dictionary=self.myDictionary.dictionary,
                                                 coherence='u_mass')
        return self.coherenceModel.get_coherence()
def lda_topics(processed_data: list, n_topics: int = 10, learning_decay: float = 0.5,
               learning_offset: float = 1.0, max_iter: int = 50,
               n_words: int = 10) -> Tuple[list, list]:
    """
    lda_topics performs LDA topic modeling on the input data

    :param processed_data: list of preprocessed segments
    :param n_topics: number of topics to extract from the corpus
    :param learning_decay: learning decay parameter for LDA
    :param learning_offset: learning offset parameter for LDA
    :param max_iter: max. number of iterations
    :param n_words: number of topic representatives
    :return:
        - topics - list of topics (and their representatives)
        - doc_topics - list of predicted topics, one for each segment
    """
    dictionary = corpora.Dictionary(processed_data)
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in processed_data]
    lda_model = LdaModel(doc_term_matrix, id2word=dictionary, num_topics=n_topics,
                         offset=learning_offset, random_state=42, update_every=1,
                         iterations=max_iter, passes=10, alpha='auto', eta="auto",
                         decay=learning_decay, per_word_topics=True)

    topics = []
    for i_t, topic_word_dist in enumerate(lda_model.get_topics()):
        topic = [lda_model.id2word[w_id] for w_id, _ in lda_model.get_topic_terms(i_t, topn=n_words)]
        topics.append(topic)

    # getting documents topic labels
    doc_topics = []
    for doc in doc_term_matrix:
        doc_t_dist = sorted(lda_model.get_document_topics(doc), key=lambda item: item[1], reverse=True)
        t, _ = doc_t_dist[0]
        doc_topics.append(t)

    assert len(doc_topics) == len(processed_data)
    return topics, doc_topics
class LDAVecGen():
    def __init__(self, path):
        with open(path, 'rb') as fp:
            data = pickle.load(fp)
        self.id_list = data['id']
        self.doc_list = list(map(lambda doc: doc.split(' '), data['doc']))

    def fit_model(self, topic_num):
        self.dictionary = Dictionary(self.doc_list)
        self.corpus = [self.dictionary.doc2bow(doc) for doc in self.doc_list]
        self.model = LdaModel(corpus=self.corpus, id2word=self.dictionary, num_topics=topic_num)

    def out(self, model_path, col_name):
        buffer = []
        vecs = self.model.get_document_topics(self.corpus)
        col = db[col_name]
        for id, vec in zip(self.id_list, vecs):
            buffer.append({
                'fulltextid': id,
                'vec': [[item[0], float(item[1])] for item in vec]
            })
            if len(buffer) >= 1000:
                col.insert_many(buffer)
                buffer.clear()
        if len(buffer) > 0:
            col.insert_many(buffer)
            buffer.clear()
        if not os.path.exists(model_path):
            os.makedirs(model_path)
        self.dictionary.save(os.path.join(model_path, 'lda.dic'))
        self.model.save(os.path.join(model_path, 'lda.model'))
def build_lda_model(tokens_tags, pos_tags, use_nouns=True, use_verbs=True, use_all=False,
                    num_of_topics=8, passes=25, verbose=True):
    path = os.getcwd()[:os.getcwd().rfind('/')]
    topics_filename = str(num_of_topics) + "topics"
    if use_nouns:
        topics_filename += "_nouns"
    if use_verbs:
        topics_filename += "_verbs"
    if use_all:
        topics_filename += "_all"

    # Set the LDA, Dictionary and Corpus filenames
    lda_filename = path + "/models/topic_models/lda_" + topics_filename + ".model"
    dict_filename = path + "/res/topic_data/dict/dict_" + topics_filename + ".dict"
    corpus_filename = path + "/res/topic_data/corpus/corpus_" + topics_filename + ".mm"

    # Build a topic model if it wasn't created yet
    if not os.path.exists(lda_filename):
        # Extract the lemmatized documents
        docs = []
        for index in range(len(tokens_tags)):
            tokens = tokens_tags[index].split()
            pos = pos_tags[index].split()
            docs.append(data_proc.extract_lemmatized_tweet(tokens, pos, use_verbs, use_nouns, use_all))

        # Compute the dictionary and save it
        dictionary = Dictionary(docs)
        dictionary.filter_extremes(keep_n=40000)
        dictionary.compactify()
        Dictionary.save(dictionary, dict_filename)

        # Compute the bow corpus and save it
        corpus = [dictionary.doc2bow(d) for d in docs]
        MmCorpus.serialize(corpus_filename, corpus)

        if verbose:
            print("\nCleaned documents:", docs)
            print("\nDictionary:", dictionary)
            print("\nCorpus in BoW form:", corpus)

        # Start training an LDA Model
        start = time.time()
        print("\nBuilding the LDA topic model...")
        lda_model = LdaModel(corpus=corpus, num_topics=num_of_topics, passes=passes, id2word=dictionary)
        lda_model.save(lda_filename)
        end = time.time()
        print("Completion time for building LDA model: %.3f s = %.3f min"
              % ((end - start), (end - start) / 60.0))

        if verbose:
            print("\nList of words associated with each topic:")
            lda_topics = lda_model.show_topics(formatted=False)
            lda_topics_list = [[word for word, prob in topic] for topic_id, topic in lda_topics]
            print([t for t in lda_topics_list])

    # Load the previously saved dictionary
    dictionary = Dictionary.load(dict_filename)

    # Load the previously saved corpus
    mm_corpus = MmCorpus(corpus_filename)

    # Load the previously saved LDA model
    lda_model = LdaModel.load(lda_filename)

    # Print the top 10 words for each topic
    if verbose:
        for topic_id in range(num_of_topics):
            print("\nTop 10 words for topic ", topic_id)
            print([dictionary[word_id] for (word_id, prob)
                   in lda_model.get_topic_terms(topic_id, topn=10)])

    index = 0
    if verbose:
        for doc_topics, word_topics, word_phis in lda_model.get_document_topics(mm_corpus, per_word_topics=True):
            print('Index ', index)
            print('Document topics:', doc_topics)
            print('Word topics:', word_topics)
            print('Phi values:', word_phis)
            print('-------------- \n')
            index += 1
    return dictionary, mm_corpus, lda_model
class LDAModel(object):
    """ """

    def __init__(self, path, model_file, dictionary_file, corpus_file, num_topics=21):
        """
        Perform data preprocessing and build the training and test sets.

        class biological分子与细胞_cleaned.csv : 12
        class biological现代生物技术专题_cleaned.csv : 14
        class biological生物技术实践_cleaned.csv : 16
        class biological生物科学与社会_cleaned.csv : 18
        class biological稳态与环境_cleaned.csv : 110
        class biological遗传与进化_cleaned.csv : 112
        class geography人口与城市_cleaned.csv : 42
        class geography区域可持续发展_cleaned.csv : 44
        class geography地球与地图_cleaned.csv : 46
        class geography宇宙中的地球_cleaned.csv : 48
        class geography生产活动与地域联系_cleaned.csv : 410
        class history古代史_cleaned.csv : 52
        class history现代史_cleaned.csv : 54
        class history近代史_cleaned.csv : 56
        class political公民道德与伦理常识_cleaned.csv : 102
        class political时事政治_cleaned.csv : 104
        class political生活中的法律常识_cleaned.csv : 106
        class political科学思维常识_cleaned.csv : 108
        class political科学社会主义常识_cleaned.csv : 1010
        class political经济学常识_cleaned.csv : 1012

        :param file: corpus file
        :param ratio: train/test split ratio
        :return lda: the trained LDA model
        """
        dirs = os.listdir(path)
        x_list = []
        item_x = []
        labels = []
        multiLabels = []
        label11 = 0
        for file in dirs:
            # print(os.path.join(path, file))
            path2 = os.path.join(path, file)
            if os.path.isdir(path2):
                category = file
                dirs2 = os.listdir(path2)
                label12 = 0
                for file2 in dirs2:
                    file3 = os.path.join(path2, file2)
                    if os.path.isfile(file3) and file2.endswith('_cleaned.csv'):
                        print('class {}{} : {}{}'.format(file, file2, label11, label12))
                        src_df = pd.read_csv(file3)
                        src_df = parallelize(src_df, data_fram_proc)  # upsampling
                        # merged_df = pd.concat([src_df['items'], src_df['knowledge']], axis=1)
                        src_df['item'] = src_df['items'] + src_df['knowledge']
                        x = np.array(src_df['item']).tolist()
                        item_x += x
                        x = [[word for word in doc.split(' ') if word != ""] for doc in x]
                        x_list += x  # list
                        # labels += ['__label__'+str(label11)+''+str(label12) for i in range(len(x))]
                        fn = str(file2).replace('_cleaned.csv', '').replace('\t', '').replace('\n', '')
                        labels += ['__label__' + str(file) + '_' + fn for i in range(len(x))]
                        bug = 0
                        mls = np.array(src_df['label']).tolist()
                        multiLabels += [str(file).replace('_', ' ') + ' ' + fn + ' ' +
                                        str(ml).replace('\t', '').replace('\n', '') for ml in mls]
                        bug = 1
                        label12 += 1
                label11 += 1

        c = {'label': labels, 'item': item_x, 'multiLabels': multiLabels}  # merge into a new dict c
        df = pd.DataFrame(c)  # build a DataFrame from c
        df.to_csv(corpus_file, index=None, header=True)

        # build the Dictionary ("token2id") from the tokenised documents
        self.dictionary = Dictionary(x_list)
        # convert the texts to bag-of-words form (id: freq)
        self.corpus = [self.dictionary.doc2bow(text) for text in x_list]
        # train the LDA model with the requested number of topics
        self.lda = LdaModel(self.corpus, id2word=self.dictionary, num_topics=num_topics)
        # inspect the results
        results = self.lda.print_topics(num_topics, num_words=50)
        for result in results:
            print(result)
        # Save model to disk.
        self.lda.save(model_file)
        self.dictionary.save_as_text(dictionary_file)

    def __retrain(self, model_file, other_texts):
        """
        lda = LdaModel.load(model_file)
        other_corpus = [self.dictionary.doc2bow(text) for text in other_texts]
        lda.update(other_corpus)
        """

    def getDocSVector(self):
        self.docSVector = []
        for d in self.corpus:
            self.docSVector.append(self.lda.get_document_topics(d, minimum_probability=0))
        return self.docSVector
num_topics = 30
lda_model = LdaModel(corpus=corpus, id2word=id2word, alpha='auto', eta='auto', num_topics=num_topics)

pickle.dump(lda_model, open("gensim_lda_model.p", "wb"))
pickle.dump(dictionary, open("gensim_lda_dictionary.p", "wb"))
diccc = pickle.load(open("gensim_lda_dictionary.p", "rb"))

s = ''
doc_topic = []
for i in range(len(corpus)):
    doc_topic.append(lda_model.get_document_topics(corpus[i]))

# build a dense (num_docs x num_topics) matrix of zeros
doc_topic_temp = []
for i in range(len(corpus)):
    temp = []
    for j in range(0, num_topics):
        a = 0
        temp.append(a)
    doc_topic_temp.append(temp)

longg = len(doc_topic)
for i in range(0, longg):
    # print(doc_topic[i])
    # print(i)
    kk = len(doc_topic[i])
    for a in range(kk):
        # the original snippet is truncated here; presumably the sparse
        # (topic_id, prob) pairs are written into the dense matrix:
        doc_topic_temp[i][doc_topic[i][a][0]] = doc_topic[i][a][1]
class CustomLDA:
    '''customized lda model'''

    def __init__(self, documents, titles, dictionary):
        self.documents = documents
        self.titles = titles
        self.dictionary = dictionary
        self.model = None
        return

    def train(
        self,
        train_data,
        val_data,
        output_path,
        num_topics=1000,
        iterations=100,
        chunksize=2000,
        passes=1,
        eval_every=1,
    ):
        '''
        train/val a model and save the trained model.

        Args:
            train_data (DataFrame): training data
            val_data (DataFrame): validation data
            output_path: where to save models
            num_topics: the number of topics
            iterations: train iterations
            eval_every: eval model every `eval_every` iterations

        Returns:
            model object
        '''
        val_data = pd.concat([train_data, val_data], ignore_index=True)
        self.model = LdaModel(
            corpus=self.documents.bow.tolist(),
            id2word=self.dictionary.id2token,
            alpha='auto',
            eta='auto',
            iterations=iterations,
            num_topics=num_topics,
            passes=passes,
            chunksize=chunksize,
            eval_every=eval_every,
            callbacks=[
                # utils.EpochSaver(output_path),
                # utils.EpochLogger(log_start=True),
                # utils.SupervisedEvalute(val_data, documents, titles)
                # CoherenceMetric(corpus=documents.bow, logger='shell'),
                # ConvergenceMetric(logger='shell'),
            ],
        )
        return self

    def validate(self, data):
        '''validate this model'''
        prediction = self.predict(data)
        mrr = utils.calculate_MRR(prediction)
        return mrr

    def predict(self, data, progress=True):
        '''
        make a prediction

        Args:
            data: input data
            progress: whether progress bar should be displayed

        Returns:
            prediction results
        '''
        if progress:
            tqdm.pandas('prediction')
            data = data.progress_apply(
                self.sort_candidates,
                axis=1,
            )
        else:
            data = data.apply(
                self.sort_candidates,
                axis=1,
            )
        return data

    def sort_candidates(self, series, log_before=False, log_after=False):
        '''
        sort candidate titles contained in series.

        Args:
            series: pd.Series with index[title_id, candidates]
            log_before: whether this func should log candidates before sorting
            log_after: whether this func should log candidates after sorting

        Returns:
            series
        '''
        title_info = self.titles.loc[series.title_id]
        if log_before:
            print(list(map(
                lambda doc_id: utils.get_coherence(
                    self.model.get_document_topics(self.documents.loc[doc_id].bow),
                    title_info.bow
                ),
                series.candidates,
            )))
        series.candidates = sorted(
            series.candidates,
            key=lambda doc_id: -utils.get_coherence(
                self.model.get_document_topics(self.documents.loc[doc_id].bow, 0),
                title_info.bow
            ),
        )
        if log_after:
            print(list(map(
                lambda doc_id: self.get_coherence(
                    self.model.get_document_topics(self.documents.loc[doc_id].bow, 0),
                    title_info.bow
                ),
                series.candidates,
            )))
            print()
        return series

    def save(self, path):
        '''save'''
        self.model.save(path)
        return self

    def load(self, path):
        '''load'''
        self.model = LdaModel.load(path)
        return self
def LDA(texts, num_topics, token2tag, token2tag_dic, dic):
    num_topics = num_topics
    # Map every word in the input texts to its tag number based on similarity;
    # replace words with their ids and add unseen words to the dictionary.
    # Build the tf-idf representation of the original (un-replaced) texts.
    dictionary = corpora.Dictionary(texts)
    token2id = dictionary.token2id
    id2token = {v: k for k, v in token2id.items()}
    # dictionary.save('patent_dictionary.dict')
    corpus = [dictionary.doc2bow(text) for text in texts]
    # corpora.MmCorpus.serialize('patent_corpuse.mm', corpus)
    tfidf = models.TfidfModel(corpus)
    # tfidf.save('patent_tfidf.model')
    corpus_tfidf = tfidf[corpus]

    sort_keywords = []
    # Sort each document's tf-idf scores in descending order and keep the top
    # three words as keywords (fewer if the document is shorter).
    for doc in corpus_tfidf:
        sorttfidf = sorted(doc, key=lambda x: x[1], reverse=True)
        n = 0
        if len(sorttfidf) >= 3:
            sort_keywords.append([id2token[sorttfidf[0][0]], id2token[sorttfidf[1][0]], id2token[sorttfidf[2][0]]])
        elif len(sorttfidf) == 2:
            sort_keywords.append([id2token[sorttfidf[0][0]], id2token[sorttfidf[1][0]]])
        elif len(sorttfidf) == 1:
            sort_keywords.append([id2token[sorttfidf[0][0]]])

    for text in texts:
        for word in text:
            text[text.index(word)] = str(token2tag[word])

    topic = {}
    for key, item in token2tag_dic.items():
        topicword = ' '.join(list(item))
        # print(topicword)
        topic[str(key)] = topicword
        insert = "insert into topicword_info(主题词ID,主题词) value (%s,%s)"
        # print(insert, [key, topicword])
        cur.execute(insert, [key, topicword])
        con.commit()

    start = time.perf_counter()
    dictionary = corpora.Dictionary(texts)
    dictionary.save('patent_dictionary.dict')
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize('patent_corpuse.mm', corpus)
    tfidf = models.TfidfModel(corpus)
    tfidf.save('patent_tfidf.model')
    corpus_tfidf = tfidf[corpus]
    # sort_keywords = []
    # # sort tf-idf in descending order and keep the top three words as keywords
    # for doc in corpus_tfidf:
    #     sorttfidf = sorted(doc, key=lambda x: x[1], reverse=True)
    #     n = 0
    #     if len(sorttfidf) >= 3:
    #         sort_keywords.append([sorttfidf[0][0], sorttfidf[1][0], sorttfidf[2][0]])
    #     else:
    #         sort_keywords.append([sorttfidf[0][0]])
    # print('keywords', sort_keywords)

    size_dictionary = len(dictionary)  # vocabulary size
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics,
                   chunksize=500, passes=100, iterations=100)
    lda.save('patent_lda.model')

    topic1 = {}  # word distribution of each topic, ordered from topic 0 upwards
    for i in lda.show_topics(num_topics=-1, num_words=20):
        s = str(i)
        pattern1 = r'"(\d+)"'
        a = re.findall(pattern1, s)  # extract the word-cluster tags
        topic1[i[0]] = [topic[a[0]], topic[a[1]], topic[a[2]], topic[a[3]], topic[a[4]]]
        insert = 'insert into topic_info(主题ID,主题词ID1,主题词ID2,主题词ID3,主题词ID4) values(%s,%s,%s,%s,%s)'
        # print(insert, [i[0], a[0], a[1], a[2], a[3]])
        cur.execute(insert, [i[0], a[0], a[1], a[2], a[3]])
        con.commit()
        # word_list = []
        # for i in a:
        #     w = token2tag_dic[int(i)]
        #     word_list.append(w)
        # pattern2 = r'"\d+"'
        # st = re.split(pattern2, s)
        # topics.append(list(chain.from_iterable(zip(st, word_list))))
        # print(list(chain.from_iterable(zip(st, word_list))))

    c = 0
    topics = []
    ppp = []
    for i in lda.get_document_topics(corpus):  # i is each document's topic distribution, in bag-of-words order
        s = str(i)
        ppp.append([])
        vec = [0.0] * 30
        for j in i:
            a = int(j[0])
            vec[a] = j[1]
            ppp[-1].extend([j[1], topic1[j[0]]])
        topics.append(vec)
        # print(vec)
        pattern1 = r'\((\d+),'
        a = re.findall(pattern1, s)
        # print(a)  # extract the topic ids contained in this document
        keyword = sort_keywords[c]
        # print(keyword)
        # guanjianci = ' '.join([str(token2tag_dic[int(j)]) for j in keyword])
        guanjianci = ' '.join(keyword)  # join the keywords by index
        detail = dic[c]
        name, abstract, id = detail['专利名称'], detail['专利摘要'], detail['专利号']
        if len(a) >= 4:
            insert = 'insert into patent_info(专利名,专利摘要,专利号,关键词,主题ID1,主题ID2,主题ID3,主题ID4,主题向量) values(%s,%s,%s,%s,%s,%s,%s,%s,%s)'
            cur.execute(insert, [name, abstract, id, guanjianci, a[0], a[1], a[2], a[3], str(vec)])
            con.commit()
            c += 1
        else:
            if len(a) == 1:
                insert = 'insert into patent_info(专利名,专利摘要,专利号,关键词,主题ID1,主题向量) values(%s,%s,%s,%s,%s,%s)'
                cur.execute(insert, [name, abstract, id, guanjianci, a[0], str(vec)])
                con.commit()
                c += 1
            elif len(a) == 2:
                insert = 'insert into patent_info(专利名,专利摘要,专利号,关键词,主题ID1,主题ID2,主题向量) values(%s,%s,%s,%s,%s,%s,%s)'
                cur.execute(insert, [name, abstract, id, guanjianci, a[0], a[1], str(vec)])
                con.commit()
                c += 1
            elif len(a) == 3:
                insert = 'insert into patent_info(专利名,专利摘要,专利号,关键词,主题ID1,主题ID2,主题ID3,主题向量) values(%s,%s,%s,%s,%s,%s,%s,%s)'
                cur.execute(insert, [name, abstract, id, guanjianci, a[0], a[1], a[2], str(vec)])
                con.commit()
                c += 1

    for i in ppp:
        print(i)
    with open('lda文本的主题词', 'w', encoding='utf8') as f:
        f.write(str(ppp))
    # word_list = []  # stores all topics contained in the current document
    # for idx in a:  # take the topic id
    #     w = topics[int(idx)]  # take that topic's word distribution
    #     word_list.append(w)  # store the word distributions in topic order
    # l = [list(k)[1] for k in i]  # list(k)[1] is the probability of each topic
    # doc2top = {}
    # for num in range(len(l)):
    #     doc2top[l[num]] = word_list[num]
    # print(doc2top)
    # print(list(chain.from_iterable(zip(l, word_list))))
    # doctopic = []
    # for i in lda.get_document_topics(corpus)[:]:
    #     listj = []
    #     for j in i:
    #         listj.append(j[1])
    #     bz = listj.index(max(listj))
    #     k = i[bz][0]
    #     doctopic.append(k)
    with open('lda_topic', 'w', encoding='utf8') as f:
        f.write(str(topics))
    elapsed = time.perf_counter() - start
    return lda, corpus, dictionary, size_dictionary, elapsed
tokens = vectorizer.get_feature_names()
vocab_size = len(tokens)
pd.Series(tokens).to_csv(token_path, index=False)

id2word = pd.Series(tokens).to_dict()
corpus = Sparse2Corpus(dtm, documents_columns=False)
# dictionary = Dictionary.from_corpus(corpus=train_corpus, id2word=id2word)

# for n_topics in [3, 5, 7, 10, 15, 20, 25, 30, 35, 40, 45, 50, 60, 75, 100]:
for n_topics in [5, 10, 15, 20, 30]:
    print(n_topics, end=' ', flush=True)
    lda = LdaModel(corpus=corpus, num_topics=n_topics, id2word=id2word)

    doc_topics = pd.DataFrame()
    for i, topics in enumerate(lda.get_document_topics(corpus)):
        doc_topics = pd.concat([
            doc_topics,
            pd.DataFrame(topics, columns=['topic', 'value']).assign(doc=i)
        ])
    doc_topics.to_csv(model_path / f'doc_topics_{key}_{n_topics}.csv', index=False)

    model_file = datapath((model_path / f'{key}_{n_topics}').resolve())
    lda.save(model_file)

    train_lda = LdaModel(corpus=train_corpus, num_topics=n_topics,
                         id2word=pd.Series(train_tokens).to_dict())
    # see https://radimrehurek.com/gensim/models/ldamodel.html#gensim.models.ldamodel.LdaModel.log_perplexity
    test_perplexity = 2 ** (-train_lda.log_perplexity(test_corpus))
lda = LdaModel(
    corpus=corpus,
    num_topics=num_topics,
    id2word=dct,
    alpha='auto',
    random_state=100,
    # eta=None,
    update_every=1,
    chunksize=chunk_size,
    minimum_probability=0.0,
    # iterations=100,
    # gamma_threshold=0.001,
    passes=10,
    per_word_topics=True)

lda.get_document_topics(bow=corpus, per_word_topics=True)
tpl = lda.print_topics(num_topics=6, num_words=5)
topic, contrib = zip(*tpl)

t2 = time.time()
print("Time to train LDA model on", len(df), "articles:", (t2 - t1) / 60, "min")

top_k_topics = lda.top_topics(corpus, topn=5, dictionary=dct, texts=train_df['tokenized'])
indx = [i + 1 for i in range(6)]
contrib = np.transpose(contrib)

#%%
tpl = lda.print_topics(num_topics=6, num_words=5)
class TopicsService():
    data_service = None
    model = gensim.models.basemodel.BaseTopicModel()

    def model_save(self, _model, _model_name, _active_dataset):
        """
        Save trained model to disk
        @params:
            _model          - Required : model (gensim.models.LdaModel)
            _model_name     - Required : name of the active topic model (Str)
            _active_dataset - Required : name of active dataset (Str)
        """
        result = self.data_service.save_model(_model, _model_name, _active_dataset)
        return result

    def model_load(self, _model_name, _active_dataset):
        """
        Load pre-trained model from disk
        @params:
            _model_name     - Required : name of the topic mining model (Str)
            _active_dataset - Required : name of active dataset (Str)
        """
        # TODO: check here that all the required files have been created
        # TODO: move the loading into the data-access layer
        try:
            self.model = models.LdaModel.load(
                config.path2data + _active_dataset + "." + _model_name + ".model")
            # self.model = models.LdaModel.load(config.path2data + "lda.model")
        except Exception as e:
            self.model = self.process_topics_mining(_active_dataset)
            self.model = models.LdaModel.load(
                config.path2data + _active_dataset + "." + _model_name + ".model")
        return True

    def __init__(self, _model_name, _active_dataset):
        """
        Constructor
        @params:
            _model_name     - Required : name of the active topic model (Str)
            _active_dataset - Required : name of active dataset (Str)
        """
        self.data_service = DataService(_active_dataset)
        self.model_load(_model_name, _active_dataset)

    def topic_mining(self, _active_dataset):
        """
        Internal function to run topic mining and save the trained models
        @params:
            _active_dataset - Required : name of active dataset (Str)
        """
        # TODO: move the saving to files into the models layer
        print("topic mining start..")
        vectorizer = TfidfVectorizer(max_df=0.5, max_features=500, min_df=2,
                                     stop_words='english', use_idf=True)
        _cousines, _reviews = self.data_service.get_reviews_for_cousines()
        print("text uploaded")
        text = _reviews
        X = vectorizer.fit_transform(text)
        print("text transformed")

        # mapping from feature id to actual word
        id2words = {}
        for i, word in enumerate(vectorizer.get_feature_names()):
            id2words[i] = word
        corpus = matutils.Sparse2Corpus(X, documents_columns=False)

        print("train LDA models")
        #####################################################################
        _model_name = "LDA10"
        self.modelLDA_10 = LdaModel(corpus, num_topics=10, id2word=id2words)
        self.model_save(self.modelLDA_10, _model_name, _active_dataset)
        _cousines2topics = self.modelLDA_10.get_document_topics(corpus, minimum_probability=0)
        _topics2cousines = []
        for i, _topics_weight in enumerate(_cousines2topics):
            _topics2cousines.append([_cousines[i], _topics_weight])
        with open(config.path2data + _active_dataset + "." + _model_name + "_"
                  + config.path2topics2cousines, 'wb') as f:
            pickle.dump(_topics2cousines, f)
        _rests_topics = []
        _rests = self.data_service.get_rests()
        for _rest in _rests:
            _cousines = self.data_service.get_cousines_for_rest(_rest[0])
            _rest_vector = []
            for _rc in _cousines:
                for _c in _topics2cousines:
                    if _c[0] == _rc:
                        if not _rest_vector:
                            # _rest_vector = _c[1]
                            for _t, _w in _c[1]:
                                _rest_vector.append([_t, float(_w)])
                        else:
                            for _t, _w in _c[1]:
                                _rest_vector[_t][1] = (_rest_vector[_t][1] + float(_w)) / 2
            _rests_topics.append([_rest, _rest_vector, _cousines])
        with open(config.path2data + _active_dataset + "." + _model_name + "_"
                  + config.path2rests2topics, 'wb') as f:
            pickle.dump(_rests_topics, f)
        #####################################################################
        _model_name = "LDA15"
        self.modelLDA_15 = LdaModel(corpus, num_topics=15, id2word=id2words)
        self.model_save(self.modelLDA_15, _model_name, _active_dataset)
        _cousines, _reviews = self.data_service.get_reviews_for_cousines()
        _cousines2topics = self.modelLDA_15.get_document_topics(corpus, minimum_probability=0)
        _topics2cousines = []
        for i, _topics_weight in enumerate(_cousines2topics):
            _topics2cousines.append([_cousines[i], _topics_weight])
        with open(config.path2data + _active_dataset + "." + _model_name + "_"
                  + config.path2topics2cousines, 'wb') as f:
            pickle.dump(_topics2cousines, f)
        _rests_topics = []
        _rests = self.data_service.get_rests()
        for _rest in _rests:
            _cousines = self.data_service.get_cousines_for_rest(_rest[0])
            _rest_vector = []
            for _rc in _cousines:
                for _c in _topics2cousines:
                    if _c[0] == _rc:
                        if not _rest_vector:
                            # _rest_vector = _c[1]
                            for _t, _w in _c[1]:
                                _rest_vector.append([_t, float(_w)])
                        else:
                            for _t, _w in _c[1]:
                                _rest_vector[_t][1] = (_rest_vector[_t][1] + float(_w)) / 2
            _rests_topics.append([_rest, _rest_vector, _cousines])
        with open(config.path2data + _active_dataset + "." + _model_name + "_"
                  + config.path2rests2topics, 'wb') as f:
            pickle.dump(_rests_topics, f)
        #####################################################################
        _model_name = "LDA20"
        self.modelLDA_20 = LdaModel(corpus, num_topics=20, id2word=id2words)
        self.model_save(self.modelLDA_20, _model_name, _active_dataset)
        _cousines, _reviews = self.data_service.get_reviews_for_cousines()
        _cousines2topics = self.modelLDA_20.get_document_topics(corpus, minimum_probability=0)
        _topics2cousines = []
        for i, _topics_weight in enumerate(_cousines2topics):
            _topics2cousines.append([_cousines[i], _topics_weight])
        with open(config.path2data + _active_dataset + "." + _model_name + "_"
                  + config.path2topics2cousines, 'wb') as f:
            pickle.dump(_topics2cousines, f)
        _rests_topics = []
        _rests = self.data_service.get_rests()
        for _rest in _rests:
            _cousines = self.data_service.get_cousines_for_rest(_rest[0])
            _rest_vector = []
            for _rc in _cousines:
                for _c in _topics2cousines:
                    if _c[0] == _rc:
                        if not _rest_vector:
                            # _rest_vector = _c[1]
                            for _t, _w in _c[1]:
                                _rest_vector.append([_t, float(_w)])
                        else:
                            for _t, _w in _c[1]:
                                _rest_vector[_t][1] = (_rest_vector[_t][1] + float(_w)) / 2
            _rests_topics.append([_rest, _rest_vector, _cousines])
        with open(config.path2data + _active_dataset + "." + _model_name + "_"
                  + config.path2rests2topics, 'wb') as f:
            pickle.dump(_rests_topics, f)
        ###################################################################
        print("TRAIN MODELS DONE")
        return self.modelLDA_10

    def get_topics(self, _num_words=10):
        """
        Return a list of topics
        @params:
            _num_words - Option : number of words for each topic to return (Int)
        """
        _topics = []
        for i, item in enumerate(self.model.show_topics(num_topics=20,
                                                        num_words=_num_words,
                                                        formatted=False)):
            _topic_words = []
            for term, weight in item[1]:
                _topic_words.append([term, weight])
            _topics.append([i, _topic_words])
        return _topics

    def process_topics_mining(self, _active_dataset):
        """
        Process topic mining and save trained models
        @params:
            _active_dataset - Required : name of active dataset (Str)
        """
        _result = self.topic_mining(_active_dataset)
        return _result

    def get_review_number(self):
        """
        Return the number of reviews in the active dataset
        @params:
        """
        return self.data_service.get_review_number()
from gensim.models import LdaModel
from gensim import corpora
from pprint import pprint
import pandas as pd

DIR = "LDAs/"
data_set = "smartplugs1130-merged-lemmatized"
model_type = "nt100na0.1-1"
model = data_set + model_type
lda = LdaModel.load(DIR + model)

mm_file = data_set + '.mm'
mm = corpora.MmCorpus(mm_file)

DIR = "data/"
filename = data_set + ".csv"
file = DIR + filename
df = pd.read_csv(file)
reviews = df.Review

r = 3
review = mm[r]
topic_dist = lda.get_document_topics(review)
topics = [x[0] for x in topic_dist]
print(topics)
pprint(lda.print_topics(-1, 10))
class W2V_cpp2(W2V_base):
    def __init__(self, n_topic, path, folder):
        self.n_topic = n_topic
        W2V_base.__init__(self, path, folder)

        # process dict (iterate over a copy of the keys, since entries are deleted below)
        for prod_id in list(self.idx2prod.keys()):
            prod = self.idx2prod[prod_id]
            n_prod_id = prod_id - len(self.word_count) - 1
            del self.idx2prod[prod_id]
            self.idx2prod[n_prod_id] = prod
            self.prod2idx[prod] = n_prod_id

        for user_id in list(self.idx2user.keys()):
            user = self.idx2user[user_id]
            n_user_id = user_id - len(self.word_count) - len(self.prod2idx) - 1
            del self.idx2user[user_id]
            self.idx2user[n_user_id] = user
            self.user2idx[user] = n_user_id

    def train(self):
        data = []
        entity2id = {}
        id2entity = []
        for obj in self.data:
            doc = []
            obj_sents = obj["text_data"]
            entity = obj["prod"]
            if entity not in entity2id:
                entity2id[entity] = len(entity2id)
                id2entity.append(entity)
            doc_id = entity2id[entity]
            for obj_sent in obj_sents:
                for pair in obj_sent:
                    if pair[0] >= 0:
                        doc.append((pair[0], doc_id))
            data.append(doc)

        self.ldamodel = LdaModel(corpus=data, id2word=self.idx2word, num_topics=self.n_topic)

        f_entity = open("lda/prod.txt", "w")
        f_model = open("lda/model.txt", "w")
        f_model.write(str(len(entity2id)))
        f_model.write(" ")
        f_model.write(str(self.n_topic))
        f_model.write("\n")
        for entity in id2entity:
            f_entity.write(entity)
            f_entity.write("\n")
            f_model.write(entity)
            f_model.write(" ")
            distr = self.ldamodel.get_document_topics(data[1], minimum_phi_value=0, minimum_probability=0)
            distr = [pair[1] for pair in distr]
            for prod in distr:
                f_model.write(str(prod))
                f_model.write(" ")
            f_model.write("\n")
        self.ldamodel.save("lda/model_200")
# the opening of this call is truncated in the original snippet; given how `model`
# is used below, it is presumably an assignment of the form `model = LdaModel(`
model = LdaModel(
    corpus=corpus,
    id2word=dictionary.id2token,
    chunksize=1000,
    alpha='asymmetric',
    eta='auto',
    iterations=iterations,
    num_topics=args.num_topics,
    passes=passes,
    eval_every=None
)

topic_tokens = []
for topicid in range(args.num_topics):
    topic_tokens.append([dictionary.id2token[k[0]]
                         for i, k in enumerate(model.get_topic_terms(topicid, topn=4))
                         if i < 2 or k[1] > 0.025])

paper_topic_data = []
for paper, paper_bow in zip(data, corpus):
    topic_distr = model.get_document_topics(paper_bow, minimum_probability=0)
    paper_topic_data.append({
        "key": paper["key"],
        "year": paper["year"],
        "title": paper["title"],
        "topic_distr": {t: float(p) for t, p in topic_distr}
    })

with open(args.outpath, 'w') as f:
    json.dump({
        "topics": topic_tokens,
        "paper_data": paper_topic_data
    }, f)
dct = Dictionary(corpus)
# transform into a bag-of-words corpus
corpus = [dct.doc2bow(doc) for doc in corpus]
# train lda model
lda = LdaModel(corpus, num_topics=2, id2word=dct)

# get_document_topics(bow, minimum_probability=None, minimum_phi_value=None, per_word_topics=False)
# Parameters:
#   bow (list) – Bag-of-words representation of the document to get topics for.
#   minimum_probability (float) – Ignore topics with probability below this value (None by default).
#       If set to None, a value of 1e-8 is used to prevent 0s.
#   per_word_topics (bool) – If True, also returns a list of topics, sorted in descending order of
#       most likely topics for that word. It also returns a list of word_ids and each word's
#       corresponding topics' phi_values, multiplied by feature length (i.e., word count).
#   minimum_phi_value (float) – if per_word_topics is True, this represents a lower bound on the
#       term probabilities that are included (None by default). If set to None, a value of 1e-8 is
#       used to prevent 0s.
# Returns:
#   topic distribution for the given document bow, as a list of (topic_id, topic_probability) 2-tuples.
test = dct.doc2bow("I love Kitten".lower().strip().split())
print(lda.get_document_topics(test))
print(lda[test])

# get_term_topics(word_id, minimum_probability=None)
# returns the topics associated with the given word.
# Each topic is represented as a tuple of (topic_id, term_probability).
print(lda.get_term_topics(0))

# ----- show the composition of a given topic -----
# get_topic_terms(topicid, topn=10)
# output: list, format: [(word_id, probability), ... ].
print(lda.get_topic_terms(0))

# show_topic(topicno, topn=10)
print(lda.show_topic(0))

# print_topic(topicno, topn=10) returns a String,
# format: '-0.340 * "category" + 0.298 * "$M$" + 0.183 * "algebra" + ... '.
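# Added illustration (not part of the original snippet): the comment block above also documents
# per_word_topics, which the calls above do not demonstrate. A minimal sketch of unpacking that
# richer return value, reusing the `lda` model and `test` bow defined above.
doc_topics, word_topics, phi_values = lda.get_document_topics(test, per_word_topics=True)
print(doc_topics)   # [(topic_id, topic_probability), ...]
print(word_topics)  # [(word_id, [topic_id, ...]), ...] most likely topics per word
print(phi_values)   # [(word_id, [(topic_id, phi_value), ...]), ...]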
def basicLDA(texts, num_topics):
    num_topics = num_topics
    # build the dictionary and bag-of-words corpus for the un-replaced texts
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    start = time.perf_counter()
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics,
                   chunksize=500, passes=100, iterations=100)
    lda.save('patent_basiclda.model')

    topic = {}  # word distribution of each topic, ordered from topic 0 upwards
    for i in lda.show_topics(num_topics=-1, num_words=20):
        s = str(i)
        pattern1 = "[\u4e00-\u9fa5]+"
        a = re.findall(pattern1, s)  # extract the Chinese word-cluster tags
        topic[i[0]] = i[1]
        # word_list = []
        # for i in a:
        #     w = token2tag_dic[int(i)]
        #     word_list.append(w)
        # pattern2 = r'"\d+"'
        # st = re.split(pattern2, s)
        # topics.append(list(chain.from_iterable(zip(st, word_list))))
        # print(list(chain.from_iterable(zip(st, word_list))))

    c = 0
    ppp = []
    topics = []
    for i in lda.get_document_topics(corpus):  # i is each document's topic distribution, in bag-of-words order
        s = str(i)
        vec = [0.0] * 30
        ppp.append([])
        for j in i:
            a = int(j[0])
            vec[a] = j[1]
            ppp[-1].extend([j[1], topic[j[0]]])
        topics.append(vec)

    for i in ppp:
        print(i)
    with open('basiclda文本的主题词', 'w', encoding='utf8') as f:
        f.write(str(ppp))
    # print(vec)
    # pattern1 = r'\((\d+),'
    # a = re.findall(pattern1, s)
    # print(a)  # extract the topic ids contained in each document
    # print(keyword)
    # guanjianci = ' '.join([str(token2tag_dic[int(j)]) for j in keyword])
    # word_list = []  # stores all topics contained in the current document
    # for idx in a:  # take the topic id
    #     w = topics[int(idx)]  # take that topic's word distribution
    #     word_list.append(w)  # store the word distributions in topic order
    # l = [list(k)[1] for k in i]  # list(k)[1] is the probability of each topic
    # doc2top = {}
    # for num in range(len(l)):
    #     doc2top[l[num]] = word_list[num]
    # print(doc2top)
    # print(list(chain.from_iterable(zip(l, word_list))))
    # doctopic = []
    # for i in lda.get_document_topics(corpus)[:]:
    #     listj = []
    #     for j in i:
    #         listj.append(j[1])
    #     bz = listj.index(max(listj))
    #     k = i[bz][0]
    #     doctopic.append(k)
    with open('basiclda_topic', 'w', encoding='utf8') as f:
        f.write(str(topics))
    elapsed = time.perf_counter() - start
    return lda, corpus, dictionary, elapsed
# the opening of this call is truncated in the original snippet; the earlier arguments
# (corpus, id2word, num_topics, ...) are missing. Given the usage below, the call
# presumably assigns to `lda_model`.
lda_model = LdaModel(
    chunksize=10000,
    alpha='asymmetric',
    decay=0.5,
    offset=64,
    eta=None,
    eval_every=0,
    iterations=100,
    gamma_threshold=0.001,
    per_word_topics=True)

## See the topics
lda_model.print_topics(-1)  # this allows to observe the topics
lda_model.get_topic_terms(0, topn=10)  # this provides the top 10 words in topic 0
lda_model.log_perplexity(corpus)  # this computes the log perplexity
lda_model.get_document_topics(corpus[0])  # This provides the document topic distribution. Note that by default, when a document has a low probability on a topic, it is not displayed
lda_model.get_document_topics(corpus[0], minimum_probability=0)  # This provides the document topic distribution. Here, every topic and associated probability are printed.

### Document topic ####
# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt

vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
pyLDAvis.show(vis)
print('Reading dataset')
data = pd.read_parquet(args.input_filepath)

print('Normalizing text')
data.text = data.text.map(nlp.normalize_text)

print('Building docterm matrix')
docterm, dictionary = nlp.get_docterm_matrix(data.text)
doclength = np.array([sum(x[1] for x in doc) for doc in docterm])

print('Training LDA model')
lda = LdaModel(docterm, num_topics=args.n_topics)

print('Getting document topics')
doctopics = corpus2csc([lda.get_document_topics(doc) for doc in docterm])
termtopics = lda.get_topics()

print('Computing topic volume time series')
topic_volume_over_time = nlp.get_topic_volume_over_time(data, doctopics, 20)

print('Computing topic coordinates')
topic_coordinates = nlp.get_topic_coordinates(termtopics, method='mds')
topic_proportions = nlp.get_topic_proportions(doctopics, doclength)

print('Computing term frequencies')
term_frequencies = nlp.get_term_frequencies(docterm, termtopics, topic_proportions, doclength)

print('Computing term ranks per topic')
term_ranks = nlp.get_topic_term_ranks(docterm, termtopics)
texts = [[word for word in document.lower().split() if word not in stoplist]
         for document in documents]

from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

texts = [[token for token in text if frequency[token] >= 1] for text in texts]

from pprint import pprint  # pretty-printer

dictionary = corpora.Dictionary(texts)
# dictionary.save('/tmp/deerwester.dict')  # store the dictionary, for future reference
# print(dictionary)

corpus = [dictionary.doc2bow(text) for text in texts]
# corpora.MmCorpus.serialize('/tmp/deerwester.mm', corpus)

lda = LdaModel(corpus, num_topics=2)

# on a new document:
new_doc = "pretty obvious that when i write my tellall memoir someday there will be four to six"
new_vec = dictionary.doc2bow(new_doc.lower().split())

print(lda.print_topic(0))
print(lda.show_topic(1))
print(lda.get_document_topics(new_vec))
def gensim_lda_topic_modelling(path, documents, num_of_topics=6, passes=50, verbose=True,
                               plotTopicsResults=True):
    dictionary = Dictionary(documents)
    corpus = [dictionary.doc2bow(doc) for doc in documents]
    if verbose:
        print("Cleaned documents:\n", documents)
        print("\nDictionary:\n", dictionary)
        print("\nCorpus in BoW form: \n", corpus)

    start = time.time()
    ldamodel = LdaModel(corpus=corpus, num_topics=num_of_topics, passes=passes, id2word=dictionary)
    end = time.time()
    print("Completion time for building LDA model: %.3f s = %.3f min"
          % ((end - start), (end - start) / 60.0))

    ldatopics = ldamodel.show_topics(formatted=False)
    ldatopics_words = [[[word, prob] for word, prob in topic] for topicid, topic in ldatopics]

    if verbose:
        print("\nList of words associated with each topic:\n")
        for i in range(len(ldatopics_words)):
            print("\nTopic %d:\n" % i)
            for w, p in ldatopics_words[i]:
                print(p, " - ", w)

    if plotTopicsResults:
        plot_top_10_words_per_topic(path, ldatopics_words, num_topics=6, num_top_words=10)

    all_documents_topics = [(doc_topics, word_topics, word_phis)
                            for doc_topics, word_topics, word_phis
                            in ldamodel.get_document_topics(corpus, per_word_topics=True)]
    all_doc_topics = []
    for i in range(len(all_documents_topics)):
        doc_topics, word_topics, phi_values = all_documents_topics[i]
        all_doc_topics.append([doc_topics[i][1] for i in range(len(doc_topics))])
        if verbose:
            print('Document topics:', doc_topics)
            print('Word topics:', word_topics)
            print('Phi values:', phi_values)
            print('-------------- \n')

    if plotTopicsResults:
        plot_share_of_topics(path, all_doc_topics, no_random_tweets=10)

    # Plot words coloured differently depending on the topic
    for doc in documents[0:100]:
        if len(doc) > 4:
            color_words(ldamodel, doc)
class W2V_cpp2(W2V_base):
    def __init__(self, n_topic, path_review, path_business, folder):
        self.n_topic = n_topic
        print('Init W2V_base')
        W2V_base.__init__(self, path_review, path_business, folder)

        print('Process idx2prod')
        # process dict (iterate over a copy of the keys, since entries are deleted below)
        for prod_id in list(self.idx2prod.keys()):
            prod = self.idx2prod[prod_id]
            n_prod_id = prod_id - len(self.word_count) - 1
            del self.idx2prod[prod_id]
            self.idx2prod[n_prod_id] = prod
            self.prod2idx[prod] = n_prod_id

        print('Process idx2user')
        for user_id in list(self.idx2user.keys()):
            user = self.idx2user[user_id]
            n_user_id = user_id - len(self.word_count) - len(self.prod2idx) - 1
            del self.idx2user[user_id]
            self.idx2user[n_user_id] = user
            self.user2idx[user] = n_user_id
        print('Init W2V_cpp2 done')

    def train(self):
        data = []
        entity2id = {}
        id2entity = []
        print('Loading data')
        for obj in self.data:
            doc = []
            obj_sents = obj["text_data"]
            entity = obj["prod"]
            if entity not in entity2id:
                entity2id[entity] = len(entity2id)
                id2entity.append(entity)
            doc_id = entity2id[entity]
            for obj_sent in obj_sents:
                for pair in obj_sent:
                    if pair[0] >= 0:
                        doc.append((pair[0], doc_id))
            data.append(doc)

        print('Start training')
        self.ldamodel = LdaModel(corpus=data, id2word=self.idx2word, num_topics=self.n_topic)
        print('Training complete, start exporting')

        f_entity = open(home + "/Data/yelp/lda/prod.txt", "w")
        f_model = open(home + "/Data/yelp/lda/model.txt", "w")
        f_model.write(str(len(entity2id)))
        f_model.write(" ")
        f_model.write(str(self.n_topic))
        f_model.write("\n")
        for entity in id2entity:
            f_entity.write(entity)
            f_entity.write("\n")
            f_model.write(entity)
            f_model.write(" ")
            distr = self.ldamodel.get_document_topics(data[1], minimum_phi_value=0, minimum_probability=0)
            distr = [pair[1] for pair in distr]
            for prod in distr:
                f_model.write(str(prod))
                f_model.write(" ")
            f_model.write("\n")
        self.ldamodel.save(home + "/Data/yelp/lda/model_200")
train_data, train_labels = parse("wikisection_dataset_json/wikisection_en_city_train.json")
test_data, test_labels = parse("wikisection_dataset_json/wikisection_en_city_test.json")

dct, bow = create_bow(train_data)
print("Preprocessing is finished!")

lda_model = LdaModel(corpus=bow, num_topics=NUM_TOPICS, id2word=dct)
print("Lda is finished!")

document_topics = []
for doc_bow in bow:
    ls = []
    for top, prob in lda_model.get_document_topics(bow=doc_bow, minimum_probability=0.0):
        ls.append(prob)
    document_topics.append(ls)

t_document_topics = []
for doc in test_data:
    doc_bow = dct.doc2bow(doc)
    ls = []
    for top, prob in lda_model.get_document_topics(bow=doc_bow, minimum_probability=0.0):
        ls.append(prob)
    t_document_topics.append(ls)

clf = svm.LinearSVC()
clf.fit(document_topics, train_labels)
y_pred = clf.predict(t_document_topics)