def visualize(m, show_emb=True):
    if not os.path.exists('./results'):
        os.makedirs('./results')

    m.eval()

    queries = []  # no query words set; the neighbor loop below is a no-op until this is filled

    ## visualize topics using monte carlo
    with torch.no_grad():
        print('#' * 100)
        print('Visualize topics...')
        topics_words = []
        gammas = m.get_beta()
        for k in range(args.num_topics):
            gamma = gammas[k]
            top_words = list(gamma.cpu().numpy().argsort()[-args.num_words + 1:][::-1])
            topic_words = [vocab[a] for a in top_words]
            topics_words.append(' '.join(topic_words))
            print('Topic {}: {}'.format(k, topic_words))

        if show_emb:
            ## visualize word embeddings by using V to get nearest neighbors
            print('#' * 100)
            print('Visualize word embeddings by using output embedding matrix')
            try:
                embeddings = m.rho.weight  # Vocab_size x E
            except AttributeError:
                embeddings = m.rho  # Vocab_size x E
            for word in queries:
                print('word: {} .. neighbors: {}'.format(
                    word, nearest_neighbors(word, embeddings, vocab)))
            print('#' * 100)
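# Most snippets in this section assume a nearest_neighbors(word, embeddings, vocab)
# helper imported from the ETM codebase's utils. A minimal sketch of what it does,
# assuming vocab is a list of strings and embeddings is a torch tensor or Parameter;
# the real helper may rank, truncate, or filter differently:
import numpy as np

def nearest_neighbors(word, embeddings, vocab, num_neighbors=20):
    # Rank every vocabulary word by cosine similarity to the query word's
    # row of the embedding matrix rho (Vocab_size x E).
    vectors = embeddings.detach().cpu().numpy()
    query = vectors[vocab.index(word)]          # raises ValueError if word is OOV
    sims = vectors @ query
    sims = sims / (np.linalg.norm(vectors, axis=1) * np.linalg.norm(query) + 1e-12)
    closest = sims.argsort()[::-1][:num_neighbors]  # includes the query word itself
    return [vocab[i] for i in closest]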
def visualize(model, num_topics=num_topics, num_words=num_words, vocab=idx2word,
              show_emb=True, tokenizer=tokenizer, bert_model=bert):
    """
    This is a cool visualisation function.
    Takes as input the model so far and shows the discovered embeddings!
    """
    model.eval()  # set the net in evaluation mode

    # set a few words to query
    queries = [
        'insurance', 'weather', 'particles', 'religion', 'man', 'love',
        'intelligence', 'money', 'politics', 'health', 'people', 'family'
    ]

    ## visualize topics using monte carlo (sampling from the posterior I guess)
    with torch.no_grad():  # no gradient computation - makes the forward pass lighter
        print('-' * 20)
        print('Visualize topics...')
        topics_words = []
        gammas = model.get_beta()  # topic distributions
        for k in range(num_topics):
            gamma = gammas[k]
            top_words = list(gamma.cpu().numpy().argsort()[-num_words + 1:][::-1])
            topic_words = [vocab[a] for a in top_words]
            topics_words.append(' '.join(topic_words))
            print('Topic {}: {}'.format(k, topic_words))

        if show_emb:
            ## visualize word embeddings by using V to get nearest neighbors
            print('-' * 20)
            print('Visualize word embeddings by using output embedding matrix')
            # extract the embeddings from the model!
            try:
                embeddings = model.rho.weight  # Vocab_size x E
            except AttributeError:
                embeddings = model.rho  # Vocab_size x E
            for word in queries:
                # extract the Bert representation of the word
                inputs = tokenizer(word, return_tensors="pt")
                outputs = bert_model(**inputs).last_hidden_state[0]
                # already grad-free inside torch.no_grad()
                if outputs.size()[0] > 1:  # word split into several sub-tokens: aggregate
                    outputs = torch.sum(outputs, dim=0)
                nns = utils.nearest_neighbors(q=outputs,
                                              embeddings=embeddings,
                                              vocab=list(vocab.values()))
                print('word: {} .. neighbors: {}'.format(word, nns))  # utility function
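# The defaults above bind module-level `tokenizer` and `bert` objects at
# definition time. A plausible setup for them (assumed, not shown in the
# snippet) with Hugging Face transformers:
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
bert = AutoModel.from_pretrained('bert-base-uncased')
bert.eval()  # inference only; visualize() wraps the forward pass in torch.no_grad()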
def visualize(self, args, vocabulary, show_emb=False):
    Path.cwd().joinpath("results").mkdir(parents=True, exist_ok=True)
    self.eval()

    model_path = str(Path.home().joinpath(
        "Projects", "Personal", "balobi_nini", 'models',
        'embeddings_one_gram_fast_tweets_only'))
    model_gensim = FT_gensim.load(model_path)  # need to update this ..

    queries = [
        'felix', 'covid', 'pprd', '100jours', 'beni', 'adf', 'muyembe', 'fally'
    ]

    ## visualize topics using monte carlo
    results_file_name = "topic_results_{}_{}.txt".format(args.batch_size, args.epochs)
    results_file_name = Path.cwd().joinpath("results", results_file_name)
    with torch.no_grad():
        print('#' * 100)
        print('Visualize topics...')
        topics_words = []
        gammas = self.get_beta()
        for k in range(args.num_topics):
            gamma = gammas[k]
            top_words = list(gamma.cpu().numpy().argsort()[-args.num_words + 1:][::-1])
            topic_words = [vocabulary[a].strip() for a in top_words]
            topics_words.append(' '.join(topic_words))
            with open(results_file_name, "a") as results_file:
                results_file.write('Topic {}: {}\n'.format(k, topic_words))
        with open(results_file_name, "a") as results_file:
            results_file.write(10 * '#' + '\n')  # But this could have been done as a function

        if show_emb:
            ## visualize word embeddings by using V to get nearest neighbors
            print('#' * 100)
            print('Visualize word embeddings by using output embedding matrix')
            try:
                embeddings = self.rho.weight  # Vocab_size x E
            except AttributeError:
                embeddings = self.rho  # Vocab_size x E
            for word in queries:
                print('word: {} .. neighbors: {}'.format(
                    word, nearest_neighbors(model_gensim, word)))
            print('#' * 100)
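# This variant queries a pre-trained FastText model instead of rho, so the
# assumed helper has a different signature. A minimal sketch using gensim's
# actual most_similar API (FT_gensim is presumably gensim.models.fasttext.FastText):
def nearest_neighbors(model_gensim, word, num_neighbors=10):
    # most_similar returns (word, cosine_similarity) pairs; FastText can build
    # vectors for out-of-vocabulary words from character n-grams.
    return [w for w, _ in model_gensim.wv.most_similar(word, topn=num_neighbors)]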
def visualize(m, show_emb=True):
    # Visualize the model.
    # How the variables in the code are used to extract topics from documents:
    # the whole paper boils down to estimating self.rho and beta; re-read the
    # code alongside the paper.
    if not os.path.exists('./results'):
        os.makedirs('./results')

    m.eval()

    queries = [
        'andrew', 'computer', 'sports', 'religion', 'man', 'love',
        'intelligence', 'money', 'politics', 'health', 'people', 'family'
    ]

    ## visualize topics
    with torch.no_grad():
        print('#' * 100)
        print('Visualize topics...')
        topics_words = []
        gammas = m.get_beta()
        # m.get_beta() returns each topic's distribution over words.
        # gammas holds one vector per topic, e.g. 5 x 3072: 5 rows for
        # 5 topics, 3072 columns for the vocabulary size.
        for k in range(args.num_topics):
            gamma = gammas[k]
            top_words = list(
                gamma.cpu().numpy().argsort()[-args.num_words + 1:][::-1])
            # sort to keep only the top-scoring words
            topic_words = [vocab[a] for a in top_words]
            topics_words.append(' '.join(topic_words))
            print('Topic {}: {}'.format(k, topic_words))

        if show_emb:
            ## visualize word embeddings by using V to get nearest neighbors
            ## show each word's context neighbors; the embeddings are trained
            ## CBOW-style, as in word2vec
            print('#' * 100)
            print('Visualize word embeddings by using output embedding matrix')
            # The lookup involves vectors of shape (3072, 300) and a query of
            # shape (300,): the 3072-word vocabulary is embedded into
            # 300-dimensional word vectors.
            try:
                embeddings = m.rho.weight  # Vocab_size x E
            except AttributeError:
                embeddings = m.rho  # Vocab_size x E
            # embeddings holds the word vectors, shape 3072 x 300
            for word in queries:
                print('word: {} neighbors: {}'.format(
                    word, nearest_neighbors(word, embeddings, vocab)))
            print('#' * 100)
def visualize(m, show_emb=True):
    if not os.path.exists('./results'):
        os.makedirs('./results')

    m.eval()

    # queries = ['andrew', 'computer', 'sports', 'religion', 'man', 'love',
    #            'intelligence', 'money', 'politics', 'health', 'people', 'family']
    queries = [
        "sentence", "punishment", "guilt", "murder", "vote", "woman", "man",
        "innocent", "London", "crime", "female", "slave", "chattle", "foreigner",
        "foreign", "theft", "robbery", "rape", "thievery", "larceny", "burglary",
        "assault", "hanging", "prison", "convict"
    ]

    ## visualize topics using monte carlo
    with torch.no_grad():
        print('#' * 100)
        print('Visualize topics...')
        topics_words = []
        gammas = m.get_beta()
        for k in range(args.num_topics):
            gamma = gammas[k]
            top_words = list(gamma.cpu().numpy().argsort()[-args.num_words + 1:][::-1])
            topic_words = [vocab[a] for a in top_words]
            topics_words.append(' '.join(topic_words))
            print('Topic {}: {}'.format(k, topic_words))

        if show_emb:
            ## visualize word embeddings by using V to get nearest neighbors
            print('#' * 100)
            print('Visualize word embeddings by using output embedding matrix')
            try:
                embeddings = m.rho.weight  # Vocab_size x E
            except AttributeError:
                embeddings = m.rho  # Vocab_size x E
            for word in queries:
                try:
                    print('word: {} .. neighbors: {}'.format(
                        word, nearest_neighbors(word, embeddings, vocab)))
                except ValueError:
                    print(word + " not in vocabulary. Skipping...")
                    continue
            print('#' * 100)
def visualize(m, show_emb=True):
    if not os.path.exists('./results'):
        os.makedirs('./results')

    m.eval()

    queries = [
        'medical', 'computer', 'sports', 'religion', 'man', 'love',
        'intelligence', 'money', 'politics', 'health', 'people', 'family'
    ]

    ## visualize topics using monte carlo
    with torch.no_grad():
        print('#' * 100)
        print('Visualize topics...')
        topics_words = []
        gammas = m.get_beta()
        for k in range(args.num_topics):
            gamma = gammas[k]
            top_words = list(gamma.cpu().numpy().argsort()[-args.num_words + 1:][::-1])
            topic_words = [vocab[a] for a in top_words]
            topics_words.append(' '.join(topic_words))
            print('Topic {}: {}'.format(k, topic_words))
            # the with block closes the file; an explicit f.close() is redundant
            with open('/mnt/nas6/users/wangweixuan/ETM/results/topic_words_de.txt',
                      'a') as f:
                f.write(str(k))
                f.write(str(topic_words))
                f.write('\n')

        if show_emb:
            ## visualize word embeddings by using V to get nearest neighbors
            print('#' * 100)
            print('Visualize word embeddings by using output embedding matrix')
            try:
                embeddings = m.rho.weight  # Vocab_size x E
            except AttributeError:
                embeddings = m.rho  # Vocab_size x E
            for word in queries:
                print('word: {} .. neighbors: {}'.format(
                    word, nearest_neighbors(word, embeddings, vocab)))
            print('#' * 100)
def visualize(self, vocab, show_emb=True):
    if not os.path.exists('./results'):
        os.makedirs('./results')

    self.eval()

    # Examples of English queries
    # queries = ['andrew', 'computer', 'sports', 'religion', 'man', 'love',
    #            'intelligence', 'money', 'politics', 'health', 'people', 'family']
    queries = ['محمد']

    # visualize topics using monte carlo
    with torch.no_grad():
        print('#' * 100)
        print('Visualize topics...')
        topics_words = []
        gammas = self.get_beta()
        for k in range(self.config_dict['model_params']['num_topics']):
            gamma = gammas[k]
            top_words = list(
                gamma.cpu().numpy().argsort()
                [-self.config_dict['evaluation_params']['num_words'] + 1:][::-1])
            topic_words = [vocab[a] for a in top_words]
            topics_words.append(' '.join(topic_words))
            print('Topic {}: {}'.format(k, topic_words))

        if show_emb:
            # visualize word embeddings by using V to get nearest neighbors
            print('#' * 100)
            print('Visualize word embeddings by using output embedding matrix')
            try:
                embeddings = self.rho.weight  # Vocab_size x E
            except AttributeError:
                embeddings = self.rho  # Vocab_size x E
            for word in queries:
                print('word: {} .. neighbors: {}'.format(
                    word, nearest_neighbors(word, embeddings, vocab)))
            print('#' * 100)
def visualize(self, args, vocabulary):
    """Visualizes topics and embeddings and word usage evolution."""
    self.eval()
    with torch.no_grad():
        alpha = self.mu_q_alpha
        beta = self.get_beta(alpha)
        print('beta: ', beta.size())
        print('\n')
        print('#' * 100)
        print('Visualize topics...')
        times = [0, 10, 40]
        topics_words = []
        for k in range(args.num_topics):
            for t in times:
                gamma = beta[k, t, :]
                top_words = list(
                    gamma.cpu().numpy().argsort()[-args.num_words + 1:][::-1])
                topic_words = [vocabulary[a] for a in top_words]
                topics_words.append(' '.join(topic_words))
                print('Topic {} .. Time: {} ===> {}'.format(k, t, topic_words))

        print('\n')
        print('Visualize word embeddings ...')
        queries = [
            'economic', 'assembly', 'security', 'management', 'debt', 'rights', 'africa'
        ]
        try:
            embeddings = self.rho.weight  # Vocab_size x E
        except Exception:
            embeddings = self.rho  # Vocab_size x E
        for word in queries:
            print('word: {} .. neighbors: {}'.format(
                word, nearest_neighbors(word, embeddings, vocabulary, args.num_words)))
        print('#' * 100)
def visualize(m, show_emb=True):
    if not os.path.exists('./results'):
        os.makedirs('./results')

    m.eval()

    # queries = ['andrew', 'computer', 'sports', 'religion', 'man', 'love',
    #            'intelligence', 'money', 'politics', 'health', 'people', 'family']
    # queries = ['biology', 'gene', 'calling', 'cancer', 'experiment']
    queries = ['biology']

    ## visualize topics using monte carlo
    with torch.no_grad():
        print('#' * 100)
        print('Visualize topics...')
        topics_words = []
        gammas = m.get_beta()
        for k in range(args.num_topics):
            gamma = gammas[k]
            top_words = list(gamma.cpu().numpy().argsort()[-args.num_words + 1:][::-1])
            topic_words = [vocab[a] for a in top_words]
            topics_words.append(' '.join(topic_words))
            print('Topic {}: {}'.format(k, topic_words))

        if show_emb:
            ## visualize word embeddings by using V to get nearest neighbors
            print('#' * 100)
            print('Visualize word embeddings by using output embedding matrix')
            try:
                embeddings = m.rho.weight  # Vocab_size x E
            except AttributeError:
                embeddings = m.rho  # Vocab_size x E
            for word in queries:
                print('word: {} .. neighbors: {}'.format(
                    word, nearest_neighbors(word, embeddings, vocab)))
            print('#' * 100)
def visualize(epoch, writer):
    """Visualizes topics and embeddings and word usage evolution."""
    model.eval()
    with torch.no_grad():
        alpha = model.mu_q_alpha
        beta = model.get_beta(alpha)
        print('beta: ', beta.size())
        print('\n')
        print('#' * 100)
        print('Visualize topics...')
        times = [0, 10, 40]
        topics_words = []
        for k in range(args.num_topics):
            for t in times:
                gamma = beta[k, t, :]
                top_words = list(
                    gamma.cpu().numpy().argsort()[-args.num_words + 1:][::-1])
                topic_words = [vocab[a] for a in top_words]
                topics_words.append(' '.join(topic_words))
                print('Topic {} .. Time: {} ===> {}'.format(k, t, topic_words))

        print('\n')
        print('Visualize word embeddings ...')
        queries = [
            'economy', 'vietnam', 'islam', 'climate', 'debt', 'electricity', 'africa'
        ]
        try:
            embeddings = model.rho.weight  # Vocab_size x E
        except AttributeError:
            embeddings = model.rho  # Vocab_size x E
        for word in queries:
            print('word: {} .. neighbors: {}'.format(
                word, nearest_neighbors(word, embeddings, vocab, args.num_words)))
        print('#' * 100)
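# In the two dynamic variants above, get_beta(alpha) is time-indexed: beta has
# shape [num_topics, num_times, vocab_size], so beta[k, t, :] is topic k's word
# distribution at time slice t. A toy illustration of the indexing, with
# made-up shapes:
import torch

num_topics, num_times, vocab_size = 3, 50, 8
beta = torch.softmax(torch.randn(num_topics, num_times, vocab_size), dim=-1)
for t in [0, 10, 40]:                # the same time slices the snippets probe
    gamma = beta[1, t, :]            # topic 1 at time t; sums to 1
    print(t, gamma.argsort(descending=True)[:3].tolist())  # top-3 word ids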
def train_knn_gcn(self, args):
    features, nfeats, labels, nclasses, train_mask, val_mask, test_mask = load_data(args)
    val_accuracies = []
    test_accuracies = []

    # build a kNN graph over the node features and use it as the fixed adjacency
    Adj = torch.from_numpy(nearest_neighbors(features, args.k, args.knn_metric))
    Adj = normalize(Adj, args.normalization, args.sparse)

    if torch.cuda.is_available():
        Adj = Adj.cuda()        # guard the transfer so CPU-only runs don't crash
        features = features.cuda()

    if args.half_val_as_train:
        val_mask, train_mask = self.half_val_as_train(val_mask, train_mask)

    for trial in range(args.ntrials):
        val_accu, test_accu, best_model = self.train_classification_gcn(
            Adj, features, nfeats, labels, nclasses, train_mask, val_mask,
            test_mask, args)
        val_accuracies.append(val_accu.item())
        test_accuracies.append(test_accu.item())

    self.print_results(val_accuracies, test_accuracies)
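# In train_knn_gcn, nearest_neighbors(features, k, metric) is a different helper
# from the topic-model one: it returns a dense kNN adjacency matrix over node
# features. A minimal sketch with scikit-learn; the project's own helper may
# symmetrize or weight the graph differently:
import numpy as np
from sklearn.neighbors import kneighbors_graph

def nearest_neighbors(features, k, metric):
    # connectivity mode yields a 0/1 adjacency; include_self keeps self-loops
    adj = kneighbors_graph(np.asarray(features), n_neighbors=k, metric=metric,
                           mode='connectivity', include_self=True)
    return np.asarray(adj.todense(), dtype=np.float32)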
print('batch: {}/{}'.format(idx, len(indices)))

thetaWeightedAvg = thetaWeightedAvg.squeeze().cpu().numpy() / cnt
print('\nThe 10 most used topics are {}'.format(
    thetaWeightedAvg.argsort()[::-1][:10]))

## show topics
beta = model.get_beta()
topic_indices = list(np.random.choice(args.num_topics, 10))  # 10 random topics
print('\n')
for k in range(args.num_topics):  # topic_indices:
    gamma = beta[k]
    top_words = list(gamma.cpu().numpy().argsort()[-args.num_words + 1:][::-1])
    topic_words = [vocab[a] for a in top_words]
    print('Topic {}: {}'.format(k, topic_words))

if args.train_embeddings:
    ## show etm embeddings
    try:
        rho_etm = model.rho.weight.cpu()
    except AttributeError:
        rho_etm = model.rho.cpu()
    queries = []  # no query words set; fill in to print neighbors
    print('\n')
    print('ETM embeddings...')
    for word in queries:
        print('word: {} .. etm neighbors: {}'.format(
            word, nearest_neighbors(word, rho_etm, vocab)))
    print('\n')
    thetaWeightedAvg.argsort()[::-1][:10]))

# Now we show the topics
# A nice visualisation is always welcome
beta = etm_model.get_beta()
topic_indices = list(np.random.choice(num_topics, 10))  # 10 random topics
print('\n')
for k in range(num_topics):  # topic_indices:
    gamma = beta[k]
    top_words = list(gamma.cpu().numpy().argsort()[-num_words + 1:][::-1])
    topic_words = [idx2word[a] for a in top_words]
    print('Topic {}: {}'.format(k, topic_words))

# Why not, also showing a few embeddings
if train_embeddings:
    # get embeddings from the model
    try:
        rho_etm = etm_model.rho.weight.cpu()
    except AttributeError:
        rho_etm = etm_model.rho.cpu()
    queries = [
        'andrew', 'woman', 'computer', 'sports', 'religion', 'man', 'love',
        'intelligence', 'money', 'politics', 'health', 'people', 'family'
    ]
    print('\n')
    print('ETM embeddings...')
    for word in queries:
        print('word: {} .. etm neighbors: {}'.format(
            word, utils.nearest_neighbors(word, rho_etm, idx2word)))
    print('\n')
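# Every snippet in this section ranks topic words with the same slice:
# argsort()[-num_words + 1:][::-1]. Note it returns num_words - 1 indices,
# not num_words (an off-by-one inherited from the original ETM code). Toy check:
import numpy as np

gamma = np.array([0.05, 0.40, 0.10, 0.30, 0.15])
num_words = 3
top_words = list(gamma.argsort()[-num_words + 1:][::-1])
print(top_words)  # [1, 3] -> only 2 indices, highest probability first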
    topic_words = [vocab[a] for a in top_words]
    print('Topic {}: {}'.format(k, topic_words))

if args.train_embeddings:
    ## show etm embeddings
    try:
        rho_etm = model.rho.weight.cpu()
    except AttributeError:
        rho_etm = model.rho.cpu()
    # queries = ['andrew', 'woman', 'computer', 'sports', 'religion', 'man', 'love',
    #            'intelligence', 'money', 'politics', 'health', 'people', 'family']
    queries = []  # no query words set; fill in to print neighbors
    print('\n')
    print('ETM embeddings...')
    for word in queries:
        print('word: {} .. etm neighbors: {}'.format(
            word, nearest_neighbors(word, rho_etm, vocab)))
    print('\n')

indices = torch.tensor(range(args.num_docs_test))
indices = torch.split(indices, 1)
topics = []
for idx, ind in enumerate(indices):
    print(idx, ind)
    data_batch = data.get_batch(test_tokens, test_counts, ind, args.vocab_size, device)
    topic = predict(data_batch, args.num_topics, model)
    topics.append(topic)
print(topics)
# Greek category labels (Health, Entertainment, Health, World, Health, Health,
# Business, Greece, Entertainment, Greece, Technology, Sports):
# keys_list = ["Υγεία", "Ψυχαγωγία", "Υγεία", "Κόσμος", "Υγεία", "Υγεία", "Επιχείρηση",
#              "Ελλάδα", "Ψυχαγωγία", "Ελλάδα", "Τεχνολογία", "Σπορ"]