def get_lda_model():
    """
    (50, 28767) -- train an LDA model and print its topics.
    :return:
    """
    text_array = list()
    with open("jobs-unigrams-filter") as f:
        for line in tqdm(f):
            line = line.strip().split(" ")
            line.remove(line[0])  # drop the first token of each line
            text_array.append(line)

    dictionary = Dictionary(text_array)
    # print(common_dictionary)
    common_corpus = [dictionary.doc2bow(text) for text in text_array]

    # Train the model on the corpus.
    lda = LdaModel(common_corpus, id2word=dictionary, num_topics=50, passes=10, iterations=1000)

    temp_file = datapath("LDA_twitter")
    lda.save(temp_file)

    topics = lda.get_topics()
    print(topics.shape)

    topic_list = lda.print_topics(50)
    for topic in topic_list:
        print(topic)
def save_topic_word_matrix(lda: LdaModel, name: str):
    matrix = lda.get_topics()
    threshold = 1 / matrix.shape[1]
    matrix = np.where(matrix < threshold, 0, matrix)
    matrix = sp.csr_matrix(matrix)
    sp.save_npz(name, matrix)
    return matrix
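A minimal usage sketch for the function above, assuming a trained LdaModel named `lda` and `scipy.sparse` imported as `sp`; the file name is illustrative. The saved sparse matrix can be read back with scipy's `load_npz`:

import scipy.sparse as sp

sparse_topics = save_topic_word_matrix(lda, "topic_word_matrix.npz")
reloaded = sp.load_npz("topic_word_matrix.npz")
assert reloaded.shape == lda.get_topics().shape  # (num_topics, vocabulary_size)
print("non-zero entries kept:", reloaded.nnz)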
def __init__(self, normalized_text, visualize=False):
    _id = id(self)
    _name = self.__class__.__name__
    _length = len(normalized_text)

    print(_name, _id, "Establishing bigrams")
    bi_gram = Phrases(normalized_text)
    normalized_text = [bi_gram[line] for line in normalized_text]

    print(_name, _id, "Establishing word dictionary")
    dictionary = Dictionary(normalized_text)
    dictionary_words = [dictionary.doc2bow(text) for text in normalized_text]

    print(_name, _id, "Constructing LDA model")
    lda_model = LdaModel(corpus=dictionary_words, num_topics=10, id2word=dictionary)

    if visualize:
        filename = 'visualization.json'
        visualization = pyLDAvis.gensim.prepare(lda_model, dictionary_words, dictionary)
        pyLDAvis.save_json(visualization, filename)
        with open(filename) as json_data:
            visual = json.load(json_data)
        self._topics = visual
    else:
        self._topics = lda_model.get_topics()

    print(_name, _id, "Topic modelling done")
def add_topical_network(
    result: res_pb.TopicQueryResult,
    topic_model: LdaModel,
    dictionary: Dictionary,
    graph_db: Sqlite3Graph,
    bow_db: Sqlite3Bow,
) -> None:
    """
    Adds the topical_network field to the result proto. Creates this network
    by the weighted Jaccard similarity of topics. The source and target words
    are assigned indices -1 and -2.
    """
    # Size: n_topics X vocab_size
    term_topic_mat = topic_model.get_topics()
    num_topics, vocab_size = term_topic_mat.shape

    source_word = estimate_plaintext_from_graph_key(
        graph_key=result.source,
        graph_db=graph_db,
        bow_db=bow_db,
    )
    assert source_word is not None, \
        f"Failed to find plaintext entry for {result.source}"
    source_word_idx = dictionary.token2id[source_word]
    source_graph_idx = -1
    source_vec = np.zeros(vocab_size)
    source_vec[source_word_idx] = 1

    target_word = estimate_plaintext_from_graph_key(
        graph_key=result.target,
        graph_db=graph_db,
        bow_db=bow_db,
    )
    assert target_word is not None, \
        f"Failed to find plaintext entry for {result.target}"
    target_word_idx = dictionary.token2id[target_word]
    target_graph_idx = -2
    target_vec = np.zeros(vocab_size)
    target_vec[target_word_idx] = 1

    graph_idx2vec = {
        topic_idx: term_topic_mat[topic_idx, :]
        for topic_idx in range(num_topics)
    }
    graph_idx2vec[source_graph_idx] = source_vec
    graph_idx2vec[target_graph_idx] = target_vec

    # Set all node names
    for idx in range(num_topics):
        result.topical_network.nodes[idx].name = f"Topic: {idx}"
    result.topical_network.nodes[source_graph_idx].name = \
        f"Source: '{result.source}' -- '{source_word}'"
    result.topical_network.nodes[target_graph_idx].name = \
        f"Target: '{result.target}' -- '{target_word}'"

    # Set all edges:
    for i, j, sim in _all_pairs_jaccard_comparisions(graph_idx2vec):
        result.topical_network.nodes[i].neighbors[j] = sim
def get_LDA_model(text_array):
    """
    (30, 27445)
    """
    dictionary = Dictionary(text_array)
    # print(common_dictionary)
    common_corpus = [dictionary.doc2bow(text) for text in text_array]

    # Train the model on the corpus.
    lda = LdaModel(common_corpus, id2word=dictionary, num_topics=30, passes=5, iterations=500)

    temp_file = datapath("LDA_twitter")
    lda.save(temp_file)

    topics = lda.get_topics()
    print(topics.shape)
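A short follow-up sketch, assuming the function above has already run: the model persisted with `lda.save(datapath("LDA_twitter"))` can be reloaded with the `LdaModel.load` classmethod and queried for its topic-term matrix.

from gensim.models import LdaModel
from gensim.test.utils import datapath

# Reload the model saved by get_LDA_model (same temp_file path as above).
lda = LdaModel.load(datapath("LDA_twitter"))
print(lda.num_topics)          # 30
print(lda.get_topics().shape)  # (30, vocabulary_size)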
class LDA():
    def __init__(self, K, data, AMask, params, name, dataName):
        self.K = K                             # [int] number of topics
        self.AMask = AMask                     # [n_a, n_d float] author-participation matrix (1 if the author contributed to the paper)
        self.n_a, self.n_d = self.AMask.shape  # number of authors, number of documents
        self.D = data
        self.n_dic, self.n_d = self.D.shape
        self.name = name
        self.train_C_ = []
        self.train_param = params['train_param']
        for d in range(self.n_d):
            self.train_C_.append([(k, self.D[k, d]) for k in range(self.n_dic)])
        self.dataName = dataName

    def train(self):
        self.LDA = LdaModel(self.train_C_, num_topics=self.K, decay=0.5, offset=1024, passes=80)
        self.phi = self.LDA.get_topics().transpose()
        self.theta = np.zeros((self.K, self.n_d))
        for d in range(self.n_d):
            tmp = self.LDA.get_document_topics(self.train_C_[d])
            ind = [c for (c, b) in tmp]
            self.theta[ind, d] = [b for (c, b) in tmp]
        self.D_reb = self.phi.dot(self.theta)
        self.A = normalize(self.AMask, 'l1', 0)
        return ()

    def save(self, path):
        '''
        path example
        '''
        toSave = {}
        toSave['theta'] = self.theta
        toSave['phi'] = self.phi
        toSave['A'] = self.A
        toSave['K'] = self.K
        toSave['train_param'] = self.train_param
        with open(path + self.name + '_' + self.dataName + '.pkl', 'wb') as output:
            pickle.dump(toSave, output, pickle.HIGHEST_PROTOCOL)
def lda_topics(processed_data: list,
               n_topics: int = 10,
               learning_decay: float = 0.5,
               learning_offset: float = 1.0,
               max_iter: int = 50,
               n_words: int = 10) -> Tuple[list, list]:
    """
    lda_topics performs LDA topic modeling on the input data

    :param processed_data: list of preprocessed segments
    :param n_topics: number of topics to extract from the corpus
    :param learning_decay: learning decay parameter for LDA
    :param learning_offset: learning offset parameter for LDA
    :param max_iter: max. number of iterations
    :param n_words: number of topic representatives
    :return:
        - topics - list of topics (and their representatives)
        - doc_topics - list of predicted topics, one for each segment
    """
    dictionary = corpora.Dictionary(processed_data)
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in processed_data]

    lda_model = LdaModel(doc_term_matrix,
                         id2word=dictionary,
                         num_topics=n_topics,
                         offset=learning_offset,
                         random_state=42,
                         update_every=1,
                         iterations=max_iter,
                         passes=10,
                         alpha='auto',
                         eta="auto",
                         decay=learning_decay,
                         per_word_topics=True)

    topics = []
    for i_t, topic_word_dist in enumerate(lda_model.get_topics()):
        topic = [lda_model.id2word[w_id] for w_id, _ in lda_model.get_topic_terms(i_t, topn=n_words)]
        topics.append(topic)

    # getting document topic labels
    doc_topics = []
    for doc in doc_term_matrix:
        doc_t_dist = sorted(lda_model.get_document_topics(doc), key=lambda item: item[1], reverse=True)
        t, _ = doc_t_dist[0]
        doc_topics.append(t)

    assert len(doc_topics) == len(processed_data)
    return topics, doc_topics
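A minimal call of `lda_topics` on a toy corpus, assuming the function and its imports (`corpora`, `LdaModel`, `Tuple`) are already in scope; the tokenized segments below are purely illustrative.

docs = [
    ["topic", "model", "lda", "corpus"],
    ["neural", "network", "training", "loss"],
    ["corpus", "dictionary", "bow", "lda"],
]
topics, doc_topics = lda_topics(docs, n_topics=2, n_words=3)
print(topics)      # e.g. [["lda", "corpus", ...], ["network", ...]]
print(doc_topics)  # one topic id per input segment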
class gensim_data(object):
    def __init__(self, mashup_descriptions, api_descriptions, mashup_categories=None, api_categories=None,
                 tag_times=2, mashup_only=False, strict_train=False):
        self.mashup_only = mashup_only
        self.strict_train = strict_train

        # Merge text and tag information: all information of one mashup/api goes into a single row
        if tag_times > 0 and mashup_categories is not None:
            assert len(mashup_descriptions) == len(mashup_categories)
            self.mashup_dow = []
            for i in range(len(mashup_descriptions)):
                # Simply concatenate text and tags (repeated tag_times); is there a better way to weight occurrences?
                self.mashup_dow.append(mashup_descriptions[i] + mashup_categories[i] * tag_times)
        else:
            self.mashup_dow = mashup_descriptions
        self.mashup_dow = [[str(index) for index in indexes] for indexes in self.mashup_dow]  # 2-D list
        # print(self.mashup_dow[0])

        if tag_times > 0 and api_categories is not None:
            assert len(api_descriptions) == len(api_categories)
            self.api_dow = []
            for i in range(len(api_descriptions)):
                self.api_dow.append(api_descriptions[i] + api_categories[i] * tag_times)
        else:
            self.api_dow = api_descriptions
        self.api_dow = [[str(index) for index in indexes] for indexes in self.api_dow]

        if not self.mashup_only and not self.strict_train:
            self.dct = Dictionary(self.mashup_dow + self.api_dow)
        if self.mashup_only and self.strict_train:
            # encodings of the mashups/apis used for training
            self.train_mashup_dow = [self.mashup_dow[m_id] for m_id in dataset.crt_ds.his_mashup_ids]
            self.dct = Dictionary(self.train_mashup_dow)
            self.train_mashup_dow = [self.dct.doc2bow(mashup_info) for mashup_info in self.train_mashup_dow]  # (word id, count)

        # In any case, compute a feature for every mashup/api
        self.mashup_dow = [self.dct.doc2bow(mashup_info) for mashup_info in self.mashup_dow]  # (word id, count) for every mashup text
        print('self.mashup_dow, num:', len(self.mashup_dow))
        zero_num = sum([1 if len(mashup_info) == 0 else 0 for mashup_info in self.mashup_dow])
        print('zero_num', zero_num)
        self.api_dow = [self.dct.doc2bow(api_info) for api_info in self.api_dow]
        # print('len of self.mashup_dow,self.api_dow:{},{}'.format(len(self.mashup_dow),len(self.api_dow)))

        self.num_topics = 0
        self.model = None               # model used to process the text
        self._mashup_features = None    # feature vectors extracted from the text
        self._api_features = None
        self.mashup_topics = None       # top-N topics of each text
        self.api_topics = None

    # Only record whether a word appears in a text (binary); used for cosine and Jaccard similarity
    def get_binary_v(self):
        dict_size = len(self.dct)
        mashup_binary_matrix = np.zeros((meta_data.mashup_num, dict_size))
        api_binary_matrix = np.zeros((meta_data.api_num, dict_size))
        mashup_words_list = []  # all words that appear in each mashup
        api_words_list = []
        for i in range(meta_data.mashup_num):
            temp_words_list, _ = zip(*self.mashup_dow[i])
            mashup_words_list.append(temp_words_list)
            for j in temp_words_list:  # indices of the words that appear
                mashup_binary_matrix[i][j] = 1.0
        for i in range(meta_data.api_num):
            temp_words_list, _ = zip(*self.api_dow[i])
            api_words_list.append(temp_words_list)
            for j in temp_words_list:  # indices of the words that appear
                api_binary_matrix[i][j] = 1.0
        return mashup_binary_matrix, api_binary_matrix, mashup_words_list, api_words_list

    def model_pcs(self, model_name, LDA_topic_num=None):
        # HDP result format: [(0, 0.032271167132309014), (1, 0.02362695056720504)]
        if self.mashup_only:
            if self.strict_train:
                train_corpus = self.train_mashup_dow
            else:
                train_corpus = self.mashup_dow
        else:
            if self.strict_train:
                train_corpus = self.train_mashup_dow + self.train_api_dow
            else:
                train_corpus = self.mashup_dow + self.api_dow

        if model_name == 'HDP':
            self.model = HdpModel(train_corpus, self.dct)
            self.num_topics = self.model.get_topics().shape[0]
            print('num_topics', self.num_topics)
        elif model_name == 'TF_IDF':
            self.model = TfidfModel(train_corpus)
            self.num_topics = len(self.dct)
        elif model_name == 'LDA':
            if LDA_topic_num is None:
                self.model = LdaModel(train_corpus)
            else:
                self.model = LdaModel(train_corpus, num_topics=LDA_topic_num)
            self.num_topics = self.model.get_topics().shape[0]
        else:
            raise ValueError('wrong gensim_model name!')

        # Process the text with the model, then convert to a dense np array (a value for every topic)
        # print(self.mashup_dow)
        self.mashup_features = [self.model[mashup_info] for mashup_info in self.mashup_dow]  # feature of every mashup and api
        # print(self.mashup_features)
        print('self.mashup_features, num:', len(self.mashup_features))
        zero_num1 = sum([1 if len(mashup_feature) == 0 else 0 for mashup_feature in self.mashup_features])
        print('zero_num1', zero_num1)
        for i in range(len(self.mashup_features)):
            if len(self.mashup_features[i]) == 0:
                print(self.mashup_dow[i])
        self.api_features = [self.model[api_info] for api_info in self.api_dow]
        # print('when model-pcs,len of mashup_features and api_features:{},{}'.format(len(mashup_features),len(api_features)))

        self._mashup_features = np.zeros((meta_data.mashup_num, self.num_topics))
        self._api_features = np.zeros((meta_data.api_num, self.num_topics))
        for i in range(meta_data.mashup_num):  # only some dimensions have values; convert to a regular array
            for index, value in self.mashup_features[i]:
                self._mashup_features[i][index] = value
        for i in range(meta_data.api_num):
            for index, value in self.api_features[i]:
                self._api_features[i][index] = value
        return self._mashup_features, self._api_features

    def get_topTopics(self, topTopicNum=3):  # select the top-K topics with highest probability  [(), (), ...]
        mashup_topics = []
        api_topics = []
        for index in range(meta_data.mashup_num):
            sorted_mashup_feature = sorted(self.mashup_features[index], key=lambda x: x[1], reverse=True)
            try:
                topic_indexes, _ = zip(*sorted_mashup_feature)
            except:  # sometimes mashup_bow is non-empty but mashup_feature is empty
                topic_indexes = random.sample(range(meta_data.mashup_num), topTopicNum)
                # print(self.mashup_dow[index])
                # print(self.mashup_features[index])
                # print(sorted_mashup_feature)
                # raise ValueError('wrong 138!')
            num = min(len(topic_indexes), topTopicNum)
            mashup_topics.append(topic_indexes[:num])
        for index in range(meta_data.api_num):
            sorted_api_feature = sorted(self.api_features[index], key=lambda x: x[1], reverse=True)
            try:
                topic_indexes, _ = zip(*sorted_api_feature)
            except:
                topic_indexes = random.sample(range(meta_data.api_num), topTopicNum)
            num = min(len(topic_indexes), topTopicNum)
            api_topics.append(topic_indexes[:num])
        return mashup_topics, api_topics
class gensim_data(object):
    def __init__(self, mashup_descriptions, mashup_categories, api_descriptions, api_categories,
                 mashup_only=False, tag_times=2, strict_train=False):
        """
        Use gensim to process the text and tags of mashups and apis, producing feature
        representations, topic representations, etc.
        :param tag_times: whether to use tags when encoding, and how many times to repeat them
        :param mashup_only: whether to use mashup information only (for clustering mashups with LDA)
        :param strict_train: whether to use information from the training set only
        """
        self.strict_train = strict_train
        self.mashup_only = mashup_only
        self.num_topics = 0
        self.model = None                                               # model used to process the text
        self.mashup_features, self.api_features = None, None            # sparse feature vectors obtained from the model
        self.dense_mashup_features, self.dense_api_features = None, None  # dense feature vectors after processing
        self.mashup_topics, self.api_topics = None, None                # top-N topics of each text

        def initialize():
            # Merge text and tag information: all information of one mashup/api goes into a single row
            if tag_times > 0:
                assert len(mashup_descriptions) == len(mashup_categories)
                self.mashup_dow = []
                for i in range(len(mashup_descriptions)):
                    # Simply concatenate text and tags; is there a better way?
                    self.mashup_dow.append(mashup_descriptions[i] + mashup_categories[i] * tag_times)
            else:
                self.mashup_dow = mashup_descriptions

            if tag_times > 0:
                assert len(api_descriptions) == len(api_categories)
                self.api_dow = []
                for i in range(len(api_descriptions)):
                    self.api_dow.append(api_descriptions[i] + api_categories[i] * tag_times)
            else:
                self.api_dow = api_descriptions

            if self.strict_train:
                # encodings of the mashups/apis used for training
                self.train_mashup_dow = [self.mashup_dow[m_id] for m_id in data_repository.get_ds().his_mashup_ids]
                self.dct = Dictionary(self.train_mashup_dow)
                self.train_mashup_dow = [self.dct.doc2bow(mashup_info) for mashup_info in self.train_mashup_dow]  # (word id, count)
            else:
                self.dct = Dictionary(self.mashup_dow + self.api_dow)

            # compute a feature for every mashup/api
            self.mashup_dow = [self.dct.doc2bow(mashup_info) for mashup_info in self.mashup_dow]  # (word id, count) for every mashup text
            self.api_dow = [self.dct.doc2bow(api_info) for api_info in self.api_dow]
            # print('len of self.mashup_dow,self.api_dow:{},{}'.format(len(self.mashup_dow),len(self.api_dow)))

        initialize()

    def encode(self, docs):  # encode the texts with the built dictionary
        return list(map(self.dct.doc2idx, docs))

    def get_feas(self, docs):  # encode and obtain feature vectors
        dows = list(map(self.dct.doc2idx, docs))
        feas = [self.model[dow] for dow in dows]
        return feas

    def get_all_encoded_comments(self):
        self.unpadded_encoded_mashup_texts = self.encode(get_iterable_values(data_repository.get_md().mashup_df, 'final_description'))
        self.unpadded_encoded_mashup_tags = self.encode(get_iterable_values(data_repository.get_md().mashup_df, 'Categories'))
        self.unpadded_encoded_api_texts = self.encode(get_iterable_values(data_repository.get_md().api_df, 'final_description'))
        self.unpadded_encoded_api_tags = self.encode(get_iterable_values(data_repository.get_md().api_df, 'Categories'))

    # Only record whether a word appears in a text (binary); used for cosine and Jaccard similarity
    def get_binary_v(self):
        dict_size = len(self.dct)
        mashup_binary_matrix = np.zeros((data_repository.get_md().mashup_num + 1, dict_size))
        api_binary_matrix = np.zeros((data_repository.get_md().api_num + 1, dict_size))
        mashup_words_list = []  # all words that appear in each mashup
        api_words_list = []
        for id in range(data_repository.get_md().mashup_num + 1):
            temp_words_list, _ = zip(*self.mashup_dow[id])
            mashup_words_list.append(temp_words_list)
            for j in temp_words_list:  # indices of the words that appear
                mashup_binary_matrix[id][j] = 1.0
        for id in range(data_repository.get_md().api_num + 1):
            temp_words_list, _ = zip(*self.api_dow[id])
            api_words_list.append(temp_words_list)
            for j in temp_words_list:  # indices of the words that appear
                api_binary_matrix[id][j] = 1.0
        return mashup_binary_matrix, api_binary_matrix, mashup_words_list, api_words_list

    def model_pcs(self, model_name, LDA_topic_num=None):
        # Apply a model and return mashup/api features; different models can be applied to the same corpus in turn.
        # HDP result format: [(0, 0.032271167132309014), (1, 0.02362695056720504)]
        if self.mashup_only:
            if self.strict_train:
                train_corpus = self.train_mashup_dow
            else:
                train_corpus = self.mashup_dow
        else:
            if self.strict_train:
                train_corpus = self.train_mashup_dow + self.api_dow
            else:
                train_corpus = self.mashup_dow + self.api_dow

        if model_name == 'HDP':
            self.model = HdpModel(train_corpus, self.dct)
            self.num_topics = self.model.get_topics().shape[0]
            print('num_topics', self.num_topics)
        elif model_name == 'TF_IDF':
            self.model = TfidfModel(train_corpus)
            self.num_topics = len(self.dct)
        elif model_name == 'LDA':
            if LDA_topic_num is None:
                self.model = LdaModel(train_corpus)
            else:
                self.model = LdaModel(train_corpus, num_topics=LDA_topic_num)
            self.num_topics = self.model.get_topics().shape[0]
        else:
            raise ValueError('wrong gensim_model name!')

        # Use the model to get sparse feature vectors, then convert to dense np format (a value for every topic)
        # *** Since mashup_dow and api_dow cover all mashup/api texts by default, the resulting feature lists can be indexed by the global ids ***
        self.mashup_features = [self.model[mashup_info] for mashup_info in self.mashup_dow]  # feature of every mashup and api
        self.api_features = [self.model[api_info] for api_info in self.api_dow]
        self.dense_mashup_features = np.zeros((data_repository.get_md().mashup_num, self.num_topics))
        self.dense_api_features = np.zeros((data_repository.get_md().api_num, self.num_topics))
        for i in range(data_repository.get_md().mashup_num):  # only some dimensions have values; convert to a regular array
            for index, value in self.mashup_features[i]:
                self.dense_mashup_features[i][index] = value
        for i in range(data_repository.get_md().api_num):
            for index, value in self.api_features[i]:
                self.dense_api_features[i][index] = value
        return self.dense_mashup_features, self.dense_api_features

    def get_topTopics(self, topTopicNum=3):  # select the top-K topics with highest probability  [(), (), ...]
        mashup_topics = []
        api_topics = []
        for index in range(data_repository.get_md().mashup_num):
            sorted_mashup_feature = sorted(self.mashup_features[index], key=lambda x: x[1], reverse=True)
            try:
                topic_indexes, _ = zip(*sorted_mashup_feature)
            except:  # sometimes mashup_bow is non-empty but mashup_feature is empty
                topic_indexes = random.sample(range(data_repository.get_md().mashup_num), topTopicNum)
            num = min(len(topic_indexes), topTopicNum)
            mashup_topics.append(topic_indexes[:num])
        for index in range(data_repository.get_md().api_num):
            sorted_api_feature = sorted(self.api_features[index], key=lambda x: x[1], reverse=True)
            try:
                topic_indexes, _ = zip(*sorted_api_feature)
            except:
                topic_indexes = random.sample(range(data_repository.get_md().api_num), topTopicNum)
            num = min(len(topic_indexes), topTopicNum)
            api_topics.append(topic_indexes[:num])
        return mashup_topics, api_topics
def abandon():
    stopWords = set(stopwords.words('english'))
    for w in string.punctuation:
        stopWords.add(w)

    stops_words = [
        "rt", "…", "...", "URL", "http", "https", "“", "”", "‘", "’",
        "get", "2", "new", "one", "i'm", "make", "go", "good", "say",
        "says", "know", "day", "..", "take", "got", "1", "going", "4",
        "3", "two", "n", "like", "via", "u", "would", "still", "first",
        "really", "watch", "see", "even", "that's", "look", "way",
        "last", "said", "let", "twitter", "ever", "always", "another",
        "many", "things", "may", "big", "come", "keep", "5", "time",
        "much", "want", "think", "us", "love", "people", "need"
    ]
    for w in stops_words:
        stopWords.add(w)

    tokenizer = CustomTweetTokenizer(preserve_case=False,
                                     reduce_len=True,
                                     strip_handles=False,
                                     normalize_usernames=False,
                                     normalize_urls=True,
                                     keep_allupper=False)

    cnt = Counter()
    texts = []
    # comm = json.load(open("data/louvain_rst.json"))
    # users_comm = {str(u) for u in comm if comm[u] == 0}
    # print(len(users_comm))

    # loading data
    data = pd.read_csv("data/ira-tweets-ele.csv", usecols=["tweet_text", "userid"])
    for i, row in tqdm(data.iterrows()):
        # if row["userid"] not in users_comm:
        #     continue
        words = tokenizer.tokenize(row["tweet_text"])
        words = [w for w in words if w not in stopWords and w]
        # if words[0] == "RT":
        #     continue
        for w in words:
            cnt[w] += 1
        texts.append(words)
    print(len(texts))
    json.dump(cnt.most_common(), open("data/word_cloud.json", "w"), indent=2)

    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(t) for t in texts]

    def average_distance(v_tops):
        _sum = 0
        _cnt = 0
        for i in range(len(v_tops)):
            for j in range(i + 1, len(v_tops)):
                _sum += scipy.spatial.distance.cosine(v_tops[i], v_tops[j])
                _cnt += 1
        return _sum / _cnt

    with open("data/IRA_topics.txt", "w") as f:
        for n in range(2, 12):
            print(f"N = {n}")
            lda = LdaModel(corpus, num_topics=n, random_state=42)
            v_topics = lda.get_topics()
            lda.save(f"model/lda-ira-{n}.mod")
            # pprint(lda.print_topics())
            f.write(f"Perplexity: {lda.log_perplexity(corpus)}")  # a measure of how good the model is; lower is better.

            # Compute Coherence Score ('c_v' coherence needs the tokenized texts, not the bow corpus)
            coherence_model_lda = CoherenceModel(model=lda, texts=texts, dictionary=dictionary, coherence='c_v')
            coherence_lda = coherence_model_lda.get_coherence()
            f.write(f"Coherence Score: {coherence_lda}")
            f.write(f"~Average distance: {average_distance(v_topics)}\n")

            # show
            x = lda.show_topics(num_topics=n, num_words=20, formatted=False)
            topics_words = [(tp[0], [wd[0] for wd in tp[1]]) for tp in x]
            dictionary.id2token = {v: k for k, v in dictionary.token2id.items()}

            # Below code prints topics and words
            for topic, words in topics_words:
                f.write(str(topic) + " :: " + str([dictionary.id2token[int(w)] for w in words]) + "\n")
            f.write("\n")
id2word = tokenizer.decoder

######################################
# Set training parameters.
num_topics = 20
chunksize = 2000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

model = LdaModel(corpus=topical_dataset,
                 id2word=id2word,
                 chunksize=chunksize,
                 alpha='auto',
                 eta='auto',
                 iterations=iterations,
                 num_topics=num_topics,
                 passes=passes,
                 eval_every=eval_every)

top_topics = model.top_topics(topical_dataset)
np.save("topical_dataset_topics.npy", model.get_topics())

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

from pprint import pprint
pprint(top_topics)

pickle.dump(top_topics, open("top_topical_dataset_topics.p", "wb"))
# data preparation
train_C_ = []
for d in range(n_doc):
    train_C_.append([(k, M_train[k, d]) for k in range(n_dic)])

#%%
l_k = [20]  # [20, 50, 100, 150]
n_k = len(l_k)
store = np.zeros((n_k, 5))

#%%
i = 0
for K in l_k:
    t = time.time()

    LDA = LdaModel(train_C_, num_topics=K)
    phiLDA = LDA.get_topics().transpose()
    thetaLDA = np.zeros((K, n_doc))
    for d in range(n_doc):
        tmp = LDA.get_document_topics(train_C_[d])
        ind = [c for (c, b) in tmp]
        thetaLDA[ind, d] = [b for (c, b) in tmp]

    t1 = time.time()
    print('LDA for k = ' + str(K), ', time = ' + str(t1 - t))
    t = t1

    # aLDA on LDA
    init = {}
    init['A'] = np.eye(n_doc)
    init['theta'] = thetaLDA
    init['phi'] = phiLDA
    aLDALDA = aLDA_estimator(K, M_train, np.eye(n_doc), 5, 5, 1, True, init)
# Set training parameters.
num_topics = 20
chunksize = 2000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

model = LdaModel(
    corpus=wiki,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)

top_topics = model.top_topics(wiki)
np.save("wiki_topics.npy", model.get_topics())

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

from pprint import pprint
pprint(top_topics)

pickle.dump(top_topics, open("top_topics.p", "wb"))
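The artifacts written above can be read back later; a short sketch, assuming the same file names as in the snippet:

import pickle
import numpy as np

wiki_topics = np.load("wiki_topics.npy")  # shape: (num_topics, vocabulary_size)
print(wiki_topics.shape)

with open("top_topics.p", "rb") as fh:
    top_topics = pickle.load(fh)
print(top_topics[0][1])  # coherence score of the most coherent topic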
print(lda.get_document_topics(test))
print(lda[test])

# arguments: (word_id, minimum_probability=None)
# The topics associated with the given word.
# Each topic is represented as a tuple of (topic_id, term_probability).
print(lda.get_term_topics(0))

# ----- print the composition of a given topic -----
# arguments: (topicid, topn=10)
# output: list, format: [(word, probability), ...].
print(lda.get_topic_terms(0))

# arguments: (topicno, topn=10)
print(lda.show_topic(0))

# output: string, format: '-0.340 * "category" + 0.298 * "$M$" + 0.183 * "algebra" + ...'.
# arguments: (topicno, topn=10)
print(lda.print_topic(0))

# ----- print the composition of all topics -----
# default arguments: (num_topics=10, num_words=10, log=False, formatted=True)
# output: string, format: [(0, '-0.340 * "category" + 0.298 * "$M$" + 0.183 * "algebra" + ...'), ...]
print(lda.show_topics())

# [num_topics, vocabulary_size] array of floats (self.dtype)
# which represents the term-topic matrix learned during inference.
print(lda.get_topics())

# ----- save and load model -----
lda.save(fname="lda_model")
lda = LdaModel.load(fname="lda_model")
print(lda[test])
def get_topic_word_matrix(lda: LdaModel) -> np.ndarray:
    return lda.get_topics()
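A sketch of how the returned matrix is typically consumed, assuming a trained `LdaModel` and its `Dictionary` (both imported from gensim); `top_words_per_topic` is a hypothetical helper. The top-n word ids per topic come from an argsort over each row:

import numpy as np

def top_words_per_topic(lda: LdaModel, dictionary: Dictionary, topn: int = 10):
    # Each row of the topic-word matrix is a probability distribution over the vocabulary.
    matrix = get_topic_word_matrix(lda)
    top_ids = np.argsort(-matrix, axis=1)[:, :topn]
    return [[dictionary[int(w)] for w in row] for row in top_ids]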
k_list = [10, 15, 20, 30]
nb_k = len(k_list)

aLDA_store_train = np.zeros((nb_fold, nb_k))
aLDA_store_test = np.zeros((nb_fold, nb_k))
LDA_store_train = np.zeros((nb_fold, nb_k))
LDA_store_test = np.zeros((nb_fold, nb_k))

aLDAgen = aLDA_generator(n_dic, n_w, A_mask, K, alpha, beta, gamma)
aLDAgen.itialise()

for f in range(nb_fold):
    train_Z, train_C, train_D, train_C_ = aLDAgen.generate()
    for k in range(nb_k):
        aLDA = aLDA_estimator(k_list[k], train_C, A_mask, 3, 3, 1, False)
        aLDA.gd_ll(0.05, 60, 0, 0.0, 0, 1)

        LDA = LdaModel(train_C_, num_topics=k_list[k])
        phiGen = LDA.get_topics().transpose()
        thetaGen = 0 * aLDA.thetaStar
        for d in range(n_a):
            tmp = LDA.get_document_topics(train_C_[d])
            ind = [c for (c, b) in tmp]
            thetaGen[ind, d] = [b for (c, b) in tmp]

        aLDA_store_train[f, k] = aLDA.llgd[-1, 0]
        LDA_store_train[f, k] = loglikaLDA(thetaGen, phiGen, A_mask, train_D, alpha, beta, 1)
        aLDA_store_test[f, k] = np.sum(np.sum(np.log(aLDA.phiStar.dot(aLDA.thetaStar).dot(aLDA.AStar)) * (aLDAgen.phi.dot(aLDAgen.theta).dot(aLDAgen.A)))) * n_w
        LDA_store_test[f, k] = np.sum(np.sum(np.log(phiGen.dot(thetaGen).dot(aLDA.AStar)) * (aLDAgen.phi.dot(aLDAgen.theta).dot(aLDAgen.A)))) * n_w
    print(f)

# plt.plot(aaa.llgd[:, 0])
# print('elapsed' + str(time.time() - t1))
tpl = lda.print_topics(num_topics=6, num_words=5)
topic, contrib = zip(*tpl)
DTdist = pd.DataFrame(
    contrib,
    columns=[
        "Top 5 words that contribute to each topic with associated probability"
    ],
    index=indx)
distLatex = DTdist.to_latex(index=True, index_names="Topics")

# document distribution
doc_distribution = np.array([
    tup[1]
    for tup in lda.get_document_topics(bow=corpus, per_word_topics=False)
])
obj = lda.get_topics()
a = lda.inference(corpus)
print(doc_distribution[:853])

# training corpus document-by-topic matrix
doc_topic_dist_corpus = np.array([[tup[1] for tup in lst] for lst in lda[corpus]])

save_obj(lda, 'LDA_MODEL_APPLICATION')

#%%
lda = load_obj('LDA_MODEL_APPLICATION')

fig, axes = plt.subplots(2, 3, figsize=(20, 10), sharex=True, sharey=True)
for i, ax in enumerate(axes.flatten()):
    fig.add_subplot(ax)
    plt.imshow(
        WordCloud(background_color="white").fit_words(
            dict(lda.show_topic(i, 200))))
    num_topics=num_topics,
    chunksize=chunksize,
    passes=passes,
    iterations=iterations,
    eval_every=eval_every
    # alpha='auto',
    # eta='auto',
)

# top_topics = model.top_topics(corpus)
# print(top_topics)
# print(type(top_topics))
# avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
# print('Average topic coherence: %.4f.' % avg_topic_coherence)

tops = model.get_topics()
np.savetxt("lda_output.csv", tops, delimiter=",")
# print(len(tops[0]))
# print(tops)

# from pprint import pprint
# pprint(top_topics)

POLE_topics = {'has_POLE': {}, 'no_POLE': {}}
MSI_topics = {'MSI_high': {}, 'MSI_medium': {}, 'MSI_low': {}}

with open('counts_with_metadata.csv', mode='r') as f:
    reader = csv.reader(f)
    header = next(reader)
    index = 0
    cluster_range, n_rand_init=1)
# plot_max_modularities(modularities, cluster_range)
nbre_cluster = modularities.index(max(modularities)) + cluster_range[0]

liste_doc_traite = pickle.load(open("docment_traites.p", 'rb'))
corpus = [
    objet_texte.txt_lemma_str for doc in liste_doc_traite
    for (objet_texte, nature, docID) in doc
]

bow = Text2BowTransformer()
corpus_2 = bow.fit_transform(corpus)

model_lda = LdaModel(corpus_2, num_topics=nbre_cluster, id2word=bow.gensim_model)
topics_lda = model_lda.get_topics()
model_lsi = LsiModel(corpus_2, num_topics=nbre_cluster, id2word=bow.gensim_model)

dico_id2mot = bow.gensim_model.id2token


def subsampling(*arg):
    m = max(arg)
    rep = [False] * len(arg)
    ind = arg.index(m)
    rep[ind] = m
    return rep


def mot_clustering(model, dico):
print('Reading dataset')
data = pd.read_parquet(args.input_filepath)

print('Normalizing text')
data.text = data.text.map(nlp.normalize_text)

print('Building docterm matrix')
docterm, dictionary = nlp.get_docterm_matrix(data.text)
doclength = np.array([sum(x[1] for x in doc) for doc in docterm])

print('Training LDA model')
lda = LdaModel(docterm, num_topics=args.n_topics)

print('Getting document topics')
doctopics = corpus2csc([lda.get_document_topics(doc) for doc in docterm])
termtopics = lda.get_topics()

print('Computing topic volume time series')
topic_volume_over_time = nlp.get_topic_volume_over_time(data, doctopics, 20)

print('Computing topic coordinates')
topic_coordinates = nlp.get_topic_coordinates(termtopics, method='mds')
topic_proportions = nlp.get_topic_proportions(doctopics, doclength)

print('Computing term frequencies')
term_frequencies = nlp.get_term_frequencies(docterm, termtopics, topic_proportions, doclength)

print('Computing term ranks per topic')
term_ranks = nlp.get_topic_term_ranks(docterm, termtopics)
def lda_train(p_generate, theta_generate, phi_generate, num_topics, num_docs):
    import matplotlib.pyplot as plt
    from gensim.models import LdaModel, LdaMulticore
    import gensim.downloader as api
    from gensim.utils import simple_preprocess, lemmatize
    import nltk
    from nltk.corpus import stopwords
    from gensim import corpora
    import re
    import pyLDAvis
    import logging
    import numpy as np
    import scipy
    import sys
    from itertools import permutations
    from gensim.models import CoherenceModel

    np.set_printoptions(threshold=sys.maxsize)
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.setLevel(level=logging.INFO)

    if __name__ == '__main__':
        __spec__ = None

    # Load dictionary and corpus
    dct = corpora.Dictionary.load('dct.dict')
    corpus = corpora.MmCorpus('corpus.mm')
    num_words = len(dct)

    # Step 4: Train the LDA model
    lda_model = LdaModel(corpus=corpus,
                         id2word=None,
                         num_topics=num_topics,
                         random_state=100,
                         update_every=1,
                         chunksize=100,
                         passes=10,
                         alpha='auto',
                         per_word_topics=True,
                         minimum_probability=0)

    # save the model
    lda_model.save('lda_model.model')

    # See the topics
    i = 0
    theta_matrix = np.zeros((num_docs, num_topics))
    for c in lda_model[corpus]:
        print(i)
        print("Document Topics      : ", c[0])  # [(Topics, Perc Contrib)]
        for j in range(theta_matrix.shape[1]):
            theta_matrix[i, j] = c[0][j][1]
        i = i + 1
        # print("Word id, Topics      : ", c[1][:])   # [(Word id, [Topics])]
        # print("Phi Values (word id) : ", c[2][:])   # [(Word id, [(Topic, Phi Value)])]
        # print("Word, Topics         : ", [(dct[wd], topic) for wd, topic in c[1][:]])   # [(Word, [Topics])]
        # print("Phi Values (word)    : ", [(dct[wd], topic) for wd, topic in c[2][:]])   # [(Word, [(Topic, Phi Value)])]
        # print("------------------------------------------------------\n")

    for j in range(num_topics):
        print("Topic", j)
        for i in range(len(lda_model.get_topic_terms(j, 10))):
            print(dct[lda_model.get_topic_terms(j, 10)[i][0]],
                  lda_model.get_topic_terms(j, 10)[i][1])

    phi_matrix = lda_model.get_topics()
    row_sums = theta_matrix.sum(axis=1)
    theta_matrix_new = theta_matrix / row_sums[:, np.newaxis]
    p = np.matmul(theta_matrix_new, phi_matrix)
    p_logit = scipy.special.logit(p)
    for i in range(p_logit.shape[0]):
        print(i)
        print(p_logit[i, ])

    p_logit_generate = np.load('p_logit_generate.npy')
    p_generate = np.load('p_generate.npy')
    theta_generate = np.load('theta_generate.npy')
    phi_generate = np.load('phi_generate.npy')

    corr_p = np.zeros((1, num_docs))
    corr_p_logit = np.zeros((1, num_docs))
    cosine_p = np.zeros((1, num_docs))
    for i in range(p_logit.shape[0]):
        corr_p_logit[0, i] = np.corrcoef(p_logit[i, ], p_logit_generate[i, ])[1, 0]
        corr_p[0, i] = np.corrcoef(p[i, ], p_generate[i, ])[1, 0]
        cosine_p[0, i] = scipy.spatial.distance.cosine(p[i, ], p_generate[i, ])

    corr_avg_p_inter = np.mean(corr_p)
    cosine_avg_p_inter = np.mean(cosine_p)
    corr_avg_p_logit_inter = np.mean(corr_p_logit)
    # Average of the correlation matrix for the word distributions of each document (shape num_docs x num_docs)
    corr_avg_p_wordDist = np.mean(np.corrcoef(p))
    # Average of the correlation matrix for the document distributions of each word (shape num_words x num_words)
    corr_avg_p_docDist = np.mean(np.corrcoef(np.transpose(p)))
    corr_avg_pgenerate_wordDist = np.mean(np.corrcoef(p_generate))
    corr_avg_pgenerate_docDist = np.mean(np.corrcoef(np.transpose(p_generate)))

    theta = theta_matrix_new
    phi = phi_matrix

    # This section compiles the correlation and cosine of every column-arrangement
    # combination of the topic model (theta_matrix)
    compilation_corr_theta = []
    compilation_cosine_theta = []
    compilation_corr_phi = []
    compilation_cosine_phi = []
    compilation_KL_theta = []
    compilation_KL_phi = []
    l = list(permutations(range(1, num_topics + 1)))
    for combi in range(len(l)):
        v_theta = np.zeros([num_docs, num_topics])
        v_phi = np.zeros([num_topics, num_words])
        for tid in range(num_topics):
            v_theta[:, tid] = theta[:, l[combi][tid] - 1]
            v_phi[tid, :] = phi[l[combi][tid] - 1, :]

        corr_theta = np.zeros((1, num_docs))
        cosine_theta = np.zeros((1, num_docs))
        KL_theta = np.zeros((1, num_docs))
        corr_phi = np.zeros((1, num_topics))
        cosine_phi = np.zeros((1, num_topics))
        KL_phi = np.zeros((1, num_topics))

        for i in range(theta_generate.shape[0]):
            corr_theta[0, i] = np.corrcoef(v_theta[i, :], theta_generate[i, :])[1, 0]
            cosine_theta[0, i] = scipy.spatial.distance.cosine(v_theta[i, :], theta_generate[i, :])
            KL_theta[0, i] = scipy.stats.entropy(theta_generate[i, :], v_theta[i, :])
        compilation_corr_theta.append(corr_theta.mean())
        compilation_cosine_theta.append(cosine_theta.mean())
        compilation_KL_theta.append(KL_theta.mean())

        for i in range(phi_generate.shape[0]):
            corr_phi[0, i] = np.corrcoef(v_phi[i, :], phi_generate[i, :])[1, 0]
            cosine_phi[0, i] = scipy.spatial.distance.cosine(v_phi[i, :], phi_generate[i, :])
            KL_phi[0, i] = scipy.stats.entropy(phi_generate[i, :], v_phi[i, :])
        compilation_corr_phi.append(corr_phi.mean())
        compilation_cosine_phi.append(cosine_phi.mean())
        compilation_KL_phi.append(KL_phi.mean())

    compilation_cosine_phi = np.array(compilation_cosine_phi)
    compilation_corr_phi = np.array(compilation_corr_phi)
    compilation_KL_phi = np.array(compilation_KL_phi)
    compilation_cosine_theta = np.array(compilation_cosine_theta)
    compilation_corr_theta = np.array(compilation_corr_theta)
    compilation_KL_theta = np.array(compilation_KL_theta)

    alignment = compilation_KL_phi.argmin()
    if alignment != compilation_cosine_phi.argmin() \
            or alignment != compilation_cosine_theta.argmin() \
            or alignment != compilation_corr_theta.argmax() \
            or alignment != compilation_corr_phi.argmax() \
            or alignment != compilation_KL_theta.argmin():
        print('Warning!!! The alignments are not coherent.')

    # Determining the final correlation and cosine values
    v_theta = np.zeros([num_docs, num_topics])
    v_phi = np.zeros([num_topics, num_words])
    for tid in range(num_topics):
        v_theta[:, tid] = theta[:, l[alignment][tid] - 1]
        v_phi[tid, :] = phi[l[alignment][tid] - 1, :]

    corr_theta = np.zeros((1, num_docs))
    cosine_theta = np.zeros((1, num_docs))
    KL_theta = np.zeros((1, num_docs))
    corr_phi = np.zeros((1, num_topics))
    cosine_phi = np.zeros((1, num_topics))
    KL_phi = np.zeros((1, num_topics))
    for i in range(theta_generate.shape[0]):
        corr_theta[0, i] = np.corrcoef(v_theta[i, :], theta_generate[i, :])[1, 0]
        cosine_theta[0, i] = scipy.spatial.distance.cosine(v_theta[i, :], theta_generate[i, :])
        KL_theta[0, i] = scipy.stats.entropy(theta_generate[i, :], v_theta[i, :])
    for i in range(phi_generate.shape[0]):
        corr_phi[0, i] = np.corrcoef(v_phi[i, :], phi_generate[i, :])[1, 0]
        cosine_phi[0, i] = scipy.spatial.distance.cosine(v_phi[i, :], phi_generate[i, :])
        KL_phi[0, i] = scipy.stats.entropy(phi_generate[i, :], v_phi[i, :])

    corr_theta = corr_theta.mean()
    cosine_theta = cosine_theta.mean()
    KL_theta = KL_theta.mean()
    corr_phi = corr_phi.mean()
    cosine_phi = cosine_phi.mean()
    KL_phi = KL_phi.mean()

    words_id = np.arange(num_words)

    # coherence_model_lda = CoherenceModel(model=lda_model, texts=corpus, dictionary=dct, coherence='c_v')
    # coherence_lda = coherence_model_lda.get_coherence()
    # print('\nCoherence Score: ', coherence_lda)

    return (v_phi, corr_theta, corr_phi, cosine_theta, cosine_phi, KL_theta, KL_phi)