Example 1
def get_cluster_quality():
    """Returns cluster quality.
    """

    print('Getting vocabulary ...')
    data_file = os.path.join(args.data_path, 'min_df_{}'.format(args.min_df))
    vocab, cluster_valid = data.get_all_data(data_file, temporal=True)
    vocab_size = len(vocab)
    topics_distributions = []

    # get data
    print('Getting full data ...')
    tokens = train['tokens']
    counts = train['counts']
    times = train['times']
    num_times = len(np.unique(times))
    num_docs = len(tokens)
    rnn_inp = data.get_rnn_input(tokens, counts, times, num_times, vocab_size, num_docs)
    model.eval()
    with torch.no_grad():
        indices = torch.split(torch.tensor(range(num_docs)), args.eval_batch_size)

        eta = get_eta(rnn_inp)

        acc_loss = 0
        cnt = 0
        for idx, ind in enumerate(indices):
            data_batch, times_batch = data.get_batch(
                tokens, counts, ind, vocab_size, args.emb_size, temporal=True, times=times)
            sums = data_batch.sum(1).unsqueeze(1)
            if args.bow_norm:
                normalized_data_batch = data_batch / sums
            else:
                normalized_data_batch = data_batch

            eta_td = eta[times_batch.type('torch.LongTensor')]
            theta = get_theta(eta_td, normalized_data_batch)
            topics_distributions += torch.argmax(theta, 1).tolist()

        print('\n')
        print('Get topic coherence...')
        print('tokens: ', tokens[0])
        TC_all = []
        cnt_all = []
        for tt in range(args.num_times):
            tc, cnt = get_topic_coherence(beta[:, tt, :].detach().numpy(), tokens, vocab)
            TC_all.append(tc)
            cnt_all.append(cnt)
        print('TC_all: ', TC_all)
        TC_all = torch.tensor(TC_all)
        print('TC_all: ', TC_all.size())
        print('\n')
        print('Get topic quality...')
        quality = tc * diversity
        print('Topic Quality is: {}'.format(quality))
        print('#'*100)
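
The example above relies on names defined elsewhere in the surrounding script (train, beta, diversity, get_eta, get_theta, get_topic_coherence). For context, a document-frequency NPMI coherence along the following lines could back the get_topic_coherence call; this is a minimal sketch under assumed inputs (beta_t as one time step's topics-by-vocabulary matrix, tokens as per-document token-id arrays), not the repository's own implementation, and its second return value only approximates the cnt unpacked above.

# Minimal sketch of an NPMI-based coherence, assuming beta_t is a
# (num_topics x vocab_size) array for one time step and tokens is a list of
# per-document token-id arrays. Not the repository's get_topic_coherence.
import numpy as np

def topic_coherence_npmi(beta_t, tokens, top_n=10):
    num_docs = len(tokens)
    doc_sets = [set(np.asarray(doc).ravel().tolist()) for doc in tokens]

    def doc_freq(w):
        return sum(1 for d in doc_sets if w in d)

    def co_doc_freq(w1, w2):
        return sum(1 for d in doc_sets if w1 in d and w2 in d)

    coherences = []
    for topic in beta_t:
        top_words = np.argsort(topic)[-top_n:]
        pair_scores = []
        for i, w1 in enumerate(top_words):
            for w2 in top_words[i + 1:]:
                d1, d2, d12 = doc_freq(w1), doc_freq(w2), co_doc_freq(w1, w2)
                if d12 == 0:
                    pair_scores.append(-1.0)  # convention: unseen pair gets the minimum NPMI
                elif d12 == num_docs:
                    pair_scores.append(1.0)   # convention: pair present in every document
                else:
                    pmi = np.log(d12 * num_docs / (d1 * d2))
                    pair_scores.append(pmi / -np.log(d12 / num_docs))
        coherences.append(float(np.mean(pair_scores)))
    # returns the average coherence and the per-topic scores
    return float(np.mean(coherences)), coherences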
Example 2
def get_cluster_quality():
    """Returns cluster quality.
    """
    topics_distributions = []
    #df = pd.read_csv("/home/beck/Repositories/Data/trec2011_microblog/trec2011_2012_final.tsv", sep="\t")
    cluster_ids = []
    with open(os.path.join(data_file, 'topic_ids.txt'), 'r') as fp:
        for line in fp:
            cluster_ids.append(line.strip())

    # same number of docs as cluster ids
    assert len(cluster_ids) == len(tokens)

    rnn_inp = data.get_rnn_input(tokens, counts, times, num_times, vocab_size,
                                 num_docs)
    model.eval()
    with torch.no_grad():
        indices = torch.split(torch.tensor(range(num_docs)),
                              args.eval_batch_size)

        eta = get_eta(rnn_inp)

        for idx, ind in enumerate(indices):
            data_batch, times_batch = data.get_batch(tokens,
                                                     counts,
                                                     ind,
                                                     vocab_size,
                                                     args.emb_size,
                                                     temporal=True,
                                                     times=times)
            sums = data_batch.sum(1).unsqueeze(1)
            if args.bow_norm:
                normalized_data_batch = data_batch / sums
            else:
                normalized_data_batch = data_batch

            eta_td = eta[times_batch.type('torch.LongTensor')]
            theta = get_theta(eta_td, normalized_data_batch)
            topics_distributions += torch.argmax(theta, 1).tolist()

    for (name, fun) in {
            'NMI': NMI,
            'Hom': Hom,
            'Com': Com,
            'VM': VM,
            'Acc': Acc
    }.items():
        print(name, fun(cluster_ids, topics_distributions))
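
NMI, Hom, Com, VM and Acc are not defined in this snippet. One way to supply them, assuming scikit-learn and SciPy are available, is sketched below; the actual script may define them differently, and the Acc implementation (Hungarian matching of predicted clusters to gold labels) is an assumption.

# Hedged sketch of the metric callables used above; assumes scikit-learn and SciPy.
import numpy as np
from scipy.optimize import linear_sum_assignment
from sklearn import metrics

NMI = metrics.normalized_mutual_info_score
Hom = metrics.homogeneity_score
Com = metrics.completeness_score
VM = metrics.v_measure_score

def Acc(labels_true, labels_pred):
    """Clustering accuracy: match predicted clusters to gold labels with the Hungarian algorithm."""
    true_ids = {l: i for i, l in enumerate(sorted(set(labels_true)))}
    pred_ids = {l: i for i, l in enumerate(sorted(set(labels_pred)))}
    counts = np.zeros((len(true_ids), len(pred_ids)), dtype=np.int64)
    for t, p in zip(labels_true, labels_pred):
        counts[true_ids[t], pred_ids[p]] += 1
    rows, cols = linear_sum_assignment(-counts)  # maximize the matched counts
    return counts[rows, cols].sum() / len(labels_true)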
Example 3
## get data
# vocabulary
print('Getting vocabulary ...')
data_file = os.path.join(args.data_path, 'min_df_{}'.format(args.min_df))
vocab, train, valid, test = data.get_data(data_file, temporal=True)
vocab_size = len(vocab)
args.vocab_size = vocab_size

# 1. training data
print('Getting training data ...')
train_tokens = train['tokens']
train_counts = train['counts']
train_times = train['times']
args.num_times = len(np.unique(train_times))
args.num_docs_train = len(train_tokens)
train_rnn_inp = data.get_rnn_input(
    train_tokens, train_counts, train_times, args.num_times, args.vocab_size, args.num_docs_train)

# 2. dev set
print('Getting validation data ...')
valid_tokens = valid['tokens']
valid_counts = valid['counts']
valid_times = valid['times']
args.num_docs_valid = len(valid_tokens)
valid_rnn_inp = data.get_rnn_input(
    valid_tokens, valid_counts, valid_times, args.num_times, args.vocab_size, args.num_docs_valid)

# 3. test data
print('Getting testing data ...')
test_tokens = test['tokens']
test_counts = test['counts']
test_times = test['times']
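
data.get_rnn_input is not shown in these examples. Given the call signature (tokens, counts, times, num_times, vocab_size, num_docs) and the fact that its output feeds get_eta, a plausible reading is that it aggregates a normalized bag-of-words per time step for the eta RNN; the sketch below makes that assumption concrete and is not the repository's actual function.

# Plausible sketch of data.get_rnn_input for the signature used above:
# a (num_times x vocab_size) tensor of normalized per-time-step counts.
import numpy as np
import torch

def get_rnn_input(tokens, counts, times, num_times, vocab_size, num_docs):
    rnn_input = torch.zeros(num_times, vocab_size)
    for doc_id in range(num_docs):
        doc_tokens = np.asarray(tokens[doc_id]).ravel()
        doc_counts = np.asarray(counts[doc_id]).ravel()
        t = int(times[doc_id])
        for token, count in zip(doc_tokens, doc_counts):
            rnn_input[t, int(token)] += float(count)
    # normalize each time slice so slices with many documents do not dominate
    row_sums = rnn_input.sum(1, keepdim=True)
    row_sums[row_sums == 0] = 1.0
    return rnn_input / row_sums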
Example 4
## get data
# vocabulary
print('Getting vocabulary ...')
data_file = os.path.join(args.data_path, 'min_df_{}'.format(args.min_df))
vocab, train_data, validation_data, test_1_data, test_2_data, test_data = data.get_data()
vocab_size = len(vocab)
args.vocab_size = vocab_size

# 1. training data
print('Getting training data ...')

_, train_times = data.get_time_columns(train_data)
args.num_times = len(np.unique(train_times))
print("the number of uniques train times are ", args.num_times)
train_rnn_inp = data.get_rnn_input(train_data, args.num_times, args.vocab_size,
                                   "train")
# should save the input here and load it if required
# 2. dev set
print('Getting validation data ...')
valid_rnn_inp = data.get_rnn_input(validation_data, args.num_times,
                                   args.vocab_size, "valid")

# 3. test data
print('Getting testing data ...')

test_rnn_inp = data.get_rnn_input(test_data, args.num_times, args.vocab_size,
                                  "test")
test_1_rnn_inp = data.get_rnn_input(test_1_data, args.num_times,
                                    args.vocab_size, "test_1")
test_2_rnn_inp = data.get_rnn_input(test_2_data, args.num_times,
                                    args.vocab_size, "test_2")
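
Regarding the comment above about saving the RNN input and loading it when required, one way to do that is a small caching wrapper around the builder; the helper name, cache directory, and file layout here are illustrative only.

# Illustrative caching wrapper for the "save the input here and load it if
# required" comment above. Helper name, cache directory and file names are assumptions.
import os
import torch

def cached_rnn_input(build_fn, split_data, num_times, vocab_size, split_name, cache_dir='rnn_inputs'):
    os.makedirs(cache_dir, exist_ok=True)
    cache_path = os.path.join(cache_dir, '{}.pt'.format(split_name))
    if os.path.exists(cache_path):
        return torch.load(cache_path)       # reuse the cached tensor
    rnn_inp = build_fn(split_data, num_times, vocab_size, split_name)
    torch.save(rnn_inp, cache_path)         # cache for the next run
    return rnn_inp

# usage, e.g.:
# train_rnn_inp = cached_rnn_input(data.get_rnn_input, train_data, args.num_times, args.vocab_size, "train")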