def get_cluster_quality():
    """Computes theta for the full data, then reports per-time-slice topic
    coherence and an overall topic quality score.
    """
    print('Getting vocabulary ...')
    data_file = os.path.join(args.data_path, 'min_df_{}'.format(args.min_df))
    vocab, cluster_valid = data.get_all_data(data_file, temporal=True)
    vocab_size = len(vocab)
    topics_distributions = []

    # get data (the `train` split comes from module scope)
    print('Getting full data ...')
    tokens = train['tokens']
    counts = train['counts']
    times = train['times']
    num_times = len(np.unique(times))
    num_docs = len(tokens)
    rnn_inp = data.get_rnn_input(tokens, counts, times, num_times, vocab_size, num_docs)

    model.eval()
    with torch.no_grad():
        indices = torch.split(torch.tensor(range(num_docs)), args.eval_batch_size)
        eta = get_eta(rnn_inp)
        for idx, ind in enumerate(indices):
            data_batch, times_batch = data.get_batch(
                tokens, counts, ind, vocab_size, args.emb_size, temporal=True, times=times)
            sums = data_batch.sum(1).unsqueeze(1)
            if args.bow_norm:
                normalized_data_batch = data_batch / sums
            else:
                normalized_data_batch = data_batch
            eta_td = eta[times_batch.type('torch.LongTensor')]
            theta = get_theta(eta_td, normalized_data_batch)

        print('\n')
        print('Get topic coherence...')
        print('train_tokens: ', train_tokens[0])
        # beta (topics x times x vocab) and train_tokens come from module scope
        TC_all = []
        cnt_all = []
        for tt in range(args.num_times):
            tc, cnt = get_topic_coherence(beta[:, tt, :].detach().numpy(), train_tokens, vocab)
            TC_all.append(tc)
            cnt_all.append(cnt)
        print('TC_all: ', TC_all)
        TC_all = torch.tensor(TC_all)
        print('TC_all: ', TC_all.size())

        print('\n')
        print('Get topic quality...')
        # `diversity` is expected to be a topic-diversity score computed elsewhere
        # (a sketch of one possible helper follows this function); note that `tc`
        # here is the coherence of the last time slice only.
        quality = tc * diversity
        print('Topic Quality is: {}'.format(quality))
        print('#' * 100)
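# NOTE: a minimal sketch of the topic-diversity term used in `quality = tc * diversity`
# above. This is an assumption, not part of the original code: diversity is taken as
# the fraction of unique words among the top-`num_tops` words of every topic in a
# time slice (the usual ETM/DETM-style definition). The helper name and the
# beta_t layout (num_topics x vocab_size) are hypothetical.
def _diversity_helper(beta_t, num_tops=25):
    """Fraction of unique words in the top-`num_tops` words across all topics."""
    num_topics = beta_t.shape[0]
    top_words = []
    for k in range(num_topics):
        top_words.extend(np.argsort(beta_t[k])[-num_tops:].tolist())
    return len(set(top_words)) / (num_topics * num_tops)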
def get_cluster_quality():
    """Returns cluster quality: compares the model's hard topic assignments
    against gold cluster ids using standard clustering metrics.
    """
    topics_distributions = []
    # df = pd.read_csv("/home/beck/Repositories/Data/trec2011_microblog/trec2011_2012_final.tsv", sep="\t")

    # gold cluster ids, one per document
    cluster_ids = []
    with open(os.path.join(data_file, 'topic_ids.txt'), 'r') as fp:
        for line in fp:
            cluster_ids.append(line.strip())
    # same number of docs as cluster ids
    assert len(cluster_ids) == len(tokens)

    rnn_inp = data.get_rnn_input(tokens, counts, times, num_times, vocab_size, num_docs)
    model.eval()
    with torch.no_grad():
        indices = torch.split(torch.tensor(range(num_docs)), args.eval_batch_size)
        eta = get_eta(rnn_inp)
        for idx, ind in enumerate(indices):
            data_batch, times_batch = data.get_batch(
                tokens, counts, ind, vocab_size, args.emb_size, temporal=True, times=times)
            sums = data_batch.sum(1).unsqueeze(1)
            if args.bow_norm:
                normalized_data_batch = data_batch / sums
            else:
                normalized_data_batch = data_batch
            eta_td = eta[times_batch.type('torch.LongTensor')]
            theta = get_theta(eta_td, normalized_data_batch)
            # hard-assign each document to its most probable topic
            topics_distributions += torch.argmax(theta, 1).tolist()

    # clustering metrics; the callables are expected at module level
    # (see the sketch that follows this function)
    for (name, fun) in {'NMI': NMI, 'Hom': Hom, 'Com': Com, 'VM': VM, 'Acc': Acc}.items():
        print(name, fun(cluster_ids, topics_distributions))
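# NOTE: the metric callables NMI, Hom, Com, VM and Acc used in get_cluster_quality()
# are not defined in this snippet. Below is a minimal sketch of how they could be
# wired up with scikit-learn; the Hungarian-matching clustering accuracy (`Acc`) is
# a common choice but is an assumption here, not necessarily what the original
# code used.
import numpy as np
from scipy.optimize import linear_sum_assignment
from sklearn import metrics

NMI = metrics.normalized_mutual_info_score
Hom = metrics.homogeneity_score
Com = metrics.completeness_score
VM = metrics.v_measure_score

def Acc(labels_true, labels_pred):
    """Clustering accuracy under the best one-to-one mapping of predicted to gold clusters."""
    true_ids = {c: i for i, c in enumerate(sorted(set(labels_true)))}
    pred_ids = {c: i for i, c in enumerate(sorted(set(labels_pred)))}
    w = np.zeros((len(true_ids), len(pred_ids)), dtype=np.int64)
    for t, p in zip(labels_true, labels_pred):
        w[true_ids[t], pred_ids[p]] += 1
    row_ind, col_ind = linear_sum_assignment(-w)  # maximize matched counts
    return w[row_ind, col_ind].sum() / len(labels_true)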
## get data
# 1. vocabulary
print('Getting vocabulary ...')
data_file = os.path.join(args.data_path, 'min_df_{}'.format(args.min_df))
vocab, train, valid, test = data.get_data(data_file, temporal=True)
vocab_size = len(vocab)
args.vocab_size = vocab_size

# 2. training data
print('Getting training data ...')
train_tokens = train['tokens']
train_counts = train['counts']
train_times = train['times']
args.num_times = len(np.unique(train_times))
args.num_docs_train = len(train_tokens)
train_rnn_inp = data.get_rnn_input(
    train_tokens, train_counts, train_times, args.num_times, args.vocab_size, args.num_docs_train)

# 3. dev set
print('Getting validation data ...')
valid_tokens = valid['tokens']
valid_counts = valid['counts']
valid_times = valid['times']
args.num_docs_valid = len(valid_tokens)
valid_rnn_inp = data.get_rnn_input(
    valid_tokens, valid_counts, valid_times, args.num_times, args.vocab_size, args.num_docs_valid)

# 4. test data
print('Getting testing data ...')
test_tokens = test['tokens']
test_counts = test['counts']
test_times = test['times']
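# NOTE: a minimal sketch of what `data.get_rnn_input` is assumed to produce for the
# calls above: a (num_times, vocab_size) tensor whose row t is the average
# bag-of-words vector of all documents with timestamp t, which is then fed to the
# eta RNN. This only illustrates the expected shape/semantics under that assumption;
# it is not the project's actual implementation, and `bow` is a hypothetical dense
# (num_docs, vocab_size) doc-term matrix.
def sketch_rnn_input(bow, times, num_times):
    rnn_inp = torch.zeros(num_times, bow.size(1))
    cnt = torch.zeros(num_times)
    for t in range(num_times):
        mask = (times == t)                      # documents in time slice t
        if mask.any():
            rnn_inp[t] = bow[mask].sum(0)        # summed word counts for slice t
            cnt[t] = mask.sum()
    return rnn_inp / cnt.clamp(min=1).unsqueeze(1)  # per-slice average bag of words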
## get data
# 1. vocabulary
print('Getting vocabulary ...')
data_file = os.path.join(args.data_path, 'min_df_{}'.format(args.min_df))
vocab, train_data, validation_data, test_1_data, test_2_data, test_data = data.get_data()
vocab_size = len(vocab)
args.vocab_size = vocab_size

# 2. training data
print('Getting training data ...')
_, train_times = data.get_time_columns(train_data)
args.num_times = len(np.unique(train_times))
print("the number of unique train times is ", args.num_times)
train_rnn_inp = data.get_rnn_input(train_data, args.num_times, args.vocab_size, "train")
# should save the rnn input here and load it if required (see the caching sketch below)

# 3. dev set
print('Getting validation data ...')
valid_rnn_inp = data.get_rnn_input(validation_data, args.num_times, args.vocab_size, "valid")

# 4. test data
print('Getting testing data ...')
test_rnn_inp = data.get_rnn_input(test_data, args.num_times, args.vocab_size, "test")
test_1_rnn_inp = data.get_rnn_input(test_1_data, args.num_times, args.vocab_size, "test_1")
test_2_rnn_inp = data.get_rnn_input(test_2_data, args.num_times, args.vocab_size, "test_2")
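# NOTE: a minimal caching sketch for the "should save the input here and load it if
# required" comment above. It memoizes the expensive data.get_rnn_input() call on
# disk with torch.save / torch.load; the cache directory and file naming are
# assumptions, not part of the original pipeline.
def cached_rnn_input(split_data, num_times, vocab_size, split_name, cache_dir='rnn_inp_cache'):
    os.makedirs(cache_dir, exist_ok=True)
    cache_path = os.path.join(cache_dir, '{}_rnn_inp.pt'.format(split_name))
    if os.path.exists(cache_path):
        return torch.load(cache_path)            # reuse the cached tensor
    rnn_inp = data.get_rnn_input(split_data, num_times, vocab_size, split_name)
    torch.save(rnn_inp, cache_path)              # cache for later runs
    return rnn_inp

# usage, mirroring the call sites above, e.g.:
# train_rnn_inp = cached_rnn_input(train_data, args.num_times, args.vocab_size, "train")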