Example #1
def main(opts):

    line_pairs, vocab_size, idx_dict = load_data()
    print_data_stats(line_pairs, vocab_size, idx_dict)

    # Split the line pairs into an 80% train and 20% test split
    num_lines = len(line_pairs)
    num_train = int(0.8 * num_lines)
    train_pairs, test_pairs = line_pairs[:num_train], line_pairs[num_train:]
    line_pairs = train_pairs

    # Split the remaining training pairs into an 80% train and 20% validation split
    num_lines = len(line_pairs)
    num_train = int(0.8 * num_lines)
    train_pairs, val_pairs = line_pairs[:num_train], line_pairs[num_train:]

    # Group the data by the lengths of the source and target words, to form batches
    train_dict = create_dict(train_pairs)
    val_dict = create_dict(val_pairs)
    test_dict = create_dict(test_pairs)

    ##########################################################################
    ### Setup: Create Encoder, Decoder, Learning Criterion, and Optimizers ###
    ##########################################################################
    encoder = models.GRUEncoder(vocab_size=vocab_size,
                                hidden_size=opts.hidden_size,
                                opts=opts)

    if opts.no_attention:
        decoder = models.NoAttentionDecoder(vocab_size=vocab_size,
                                            hidden_size=opts.hidden_size)
    else:
        decoder = models.AttentionDecoder(vocab_size=vocab_size,
                                          hidden_size=opts.hidden_size)

    if opts.cuda:
        encoder.cuda()
        decoder.cuda()
        print("Moved models to GPU!")

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(list(encoder.parameters()) +
                           list(decoder.parameters()),
                           lr=opts.learning_rate)

    try:
        training_loop(train_dict, val_dict, idx_dict, encoder, decoder,
                      criterion, optimizer, opts)
        # Evaluation on 20% Test Data
        print("\nEvaluation on 20% Test Data")
        test_loss = evaluate(test_dict, encoder, decoder, idx_dict, criterion,
                             opts)
        print(f"Test loss {test_loss}")
    except KeyboardInterrupt:
        print('Exiting early from training.')
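
For context, `main` only works if `opts` carries the attributes the snippet reads: `hidden_size`, `no_attention`, `cuda`, and `learning_rate`. Below is a minimal sketch of building such an options object with argparse; the flag names match the attributes above, but the default values are placeholders and `training_loop` in the original script very likely expects additional fields not shown here.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--hidden_size', type=int, default=20)
parser.add_argument('--learning_rate', type=float, default=0.01)
parser.add_argument('--no_attention', action='store_true')
parser.add_argument('--cuda', action='store_true')

if __name__ == '__main__':
    main(parser.parse_args())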
Example #2
    def eval(self, train_pos, train_neg, test_pos, test_neg, embeddings_file=None, checkpoint_file=None):
        # load and clean the phrase dictionary (maps phrase ids to phrases)
        phrase_dic = clean_dictionary(pickle.load(open(config.phrase_dic, 'rb')))
        if self.model_type == 'rnn':
            if use_cuda:
                print('GPU available!!')
                device = torch.device('cuda')
            else:
                device = torch.device('cpu')

            modelcheckpoint = torch.load(checkpoint_file, map_location=device)
            vocabulary_size = len(modelcheckpoint['word2idx'])
            model = models.GRUEncoder(vocabulary_size,
                                      self.embedding_dim,
                                      self.rnn_size,
                                      self.neg_sample_num,
                                      self.batch_size,
                                      self.window_size)
            print_params(model)
            if use_cuda:
                model.cuda()

            # load the trained weights first, then switch to evaluation mode
            model.load_state_dict(modelcheckpoint['state_dict'])
            model.eval()
            print('Number of positive training samples: ', len(train_pos))
            print('Number of negative training samples: ', len(train_neg))
            print('Number of positive testing samples: ', len(test_pos))
            print('Number of negative testing samples: ', len(test_neg))
            word2idx = modelcheckpoint['word2idx']
            node_embeddings = self.create_node_embeddings(model, phrase_dic, word2idx)

        else:
            node_embeddings = load_embeddings(embeddings_file)

        if config.evaluate_cosine:
            # first calculate the cosine similarity for every edge in test_pos and in test_neg
            cosine_test_pos = get_cos_embedding(test_pos, node_embeddings, phrase_dic)
            cosine_test_neg = get_cos_embedding(test_neg, node_embeddings, phrase_dic)

            # clamp negative cosine similarities to zero
            cosine_test_pos[cosine_test_pos < 0] = 0
            cosine_test_neg[cosine_test_neg < 0] = 0

            # use the cosine similarities as predictions and build the matching labels
            test_preds = np.concatenate([cosine_test_pos, cosine_test_neg])
            test_labels = np.zeros(test_preds.shape[0])
            test_labels[:cosine_test_pos.shape[0]] = 1

            test_auc = roc_auc_score(test_labels, test_preds)
            print('node2vec Test AUC score: ', str(test_auc))

        if config.evaluate_lr:
            # reload the negative test edges from disk (overrides the test_neg argument)
            test_neg = pickle.load(open(config.test_neg, 'rb'))
            train_pos_edge_embs = get_edge_embeddings(train_pos, node_embeddings, self.model_type, phrase_dic)
            train_neg_edge_embs = get_edge_embeddings(train_neg, node_embeddings, self.model_type, phrase_dic)
            train_set = np.concatenate([train_pos_edge_embs, train_neg_edge_embs])

            # labels: 1-> link exists, 0-> false edge
            train_labels = np.zeros(len(train_set))
            train_labels[:len(train_pos_edge_embs)] = 1

            # build the test set the same way
            test_pos_edge_embs = get_edge_embeddings(test_pos, node_embeddings, self.model_type, phrase_dic)
            test_neg_edge_embs = get_edge_embeddings(test_neg, node_embeddings, self.model_type, phrase_dic)
            test_set = np.concatenate([test_pos_edge_embs, test_neg_edge_embs])

            # labels: 1-> link exists, 0-> false edge
            test_labels = np.zeros(len(test_set))
            test_labels[:len(test_pos_edge_embs)] = 1

            # train the classifier and evaluate it on the test set;
            # shuffle the train set first
            idx_list = list(range(len(train_labels)))
            shuffle(idx_list)
            train_set = train_set[idx_list]
            train_labels = train_labels[idx_list]

            # shuffle test set
            idx_list = list(range(len(test_labels)))
            shuffle(idx_list)
            test_set = test_set[idx_list]
            test_labels = test_labels[idx_list]

            classifier = LogisticRegression()
            classifier.fit(train_set, train_labels)

            # evaluate
            test_preds = classifier.predict_proba(test_set)
            false_positive_rate, true_positive_rate, thresholds = roc_curve(test_labels, test_preds[:, 1])
            average_precision = average_precision_score(test_labels, test_preds[:, 1])
            test_auc = auc(false_positive_rate, true_positive_rate)
            test_roc = roc_auc_score(test_labels, test_preds[:, 1])
            print('node2vec Test ROC score: ', str(test_roc))
            print('node2vec Test AUC score: ', str(test_auc))
            print('node2vec Test AP score: ', str(average_precision))
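
The `evaluate_cosine` branch above scores each candidate edge by the cosine similarity of its endpoint embeddings, clamps negative scores to zero, and ranks positives against negatives with ROC AUC. Here is a self-contained sketch of that scoring step; the `cosine_scores` helper and the toy data are hypothetical stand-ins for `get_cos_embedding`, which in the original additionally resolves edges through `phrase_dic`.

import numpy as np
from sklearn.metrics import roc_auc_score

def cosine_scores(edges, emb):
    # cosine similarity between the embeddings of each edge's endpoints
    u = emb[[a for a, b in edges]]
    v = emb[[b for a, b in edges]]
    dots = (u * v).sum(axis=1)
    norms = np.linalg.norm(u, axis=1) * np.linalg.norm(v, axis=1)
    return dots / np.maximum(norms, 1e-12)

# toy setup: 4 nodes embedded in 8 dimensions
rng = np.random.default_rng(0)
emb = rng.normal(size=(4, 8))
test_pos, test_neg = [(0, 1), (2, 3)], [(0, 3), (1, 2)]

preds = np.concatenate([cosine_scores(test_pos, emb), cosine_scores(test_neg, emb)])
preds[preds < 0] = 0  # clamp, exactly as in the snippet above
labels = np.zeros(preds.shape[0])
labels[:len(test_pos)] = 1
print('toy AUC:', roc_auc_score(labels, preds))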
Example #3
    def train(self):
        # initialize the model
        if self.model_type == 'rnn':
            model = models.GRUEncoder(self.vocabulary_size,
                                      self.embedding_dim,
                                      self.rnn_size,
                                      self.neg_sample_num,
                                      self.batch_size,
                                      self.window_size)
        else:
            model = models.AverageNode2Vec(self.vocabulary_size,
                                           self.embedding_dim,
                                           self.neg_sample_num,
                                           self.batch_size,
                                           self.window_size)
        print_params(model)
        params = model.parameters()
        if use_cuda:
            print('GPU available!!')
            model.cuda()

        if self.model_type == 'rnn':
            optimizer = optim.Adam(params, lr=config.lr)
        else:
            optimizer = optim.SparseAdam(params, lr=config.lr)

        dataset = Node2VecDataset(self.utils, self.neg_sample_num)
        dataloader = DataLoader(dataset=dataset,
                                batch_size=self.batch_size,
                                shuffle=False)
        
        model.train()
        for epoch in range(self.epochs):
            batch_num = 0
            batch_costs = []
            last_batch_num = -1
            # if we resume training, load the last checkpoint (only on the first epoch,
            # otherwise every epoch would reload the checkpoint and discard progress)
            if config.resume_training and epoch == 0:
                if use_cuda:
                    print('GPU available..will resume training!!')
                    device = torch.device('cuda')
                else:
                    device = torch.device('cpu')

                modelcheckpoint = torch.load(os.path.join(config.checkpoint_dir, config.checkpoint_to_load),
                                             map_location=device)
                model.load_state_dict(modelcheckpoint['state_dict'])
                optimizer.load_state_dict(modelcheckpoint['optimizer'])
                last_batch_num = modelcheckpoint['batch_num']
                self.word2idx = modelcheckpoint['word2idx']
                # last_loss = modelcheckpoint['loss']
                print("We stopped in {} batch".format(last_batch_num))
            #
            iterator = tqdm(dataloader)
            for sample in iterator:

                # when resuming, skip ahead to the batch we stopped at
                if batch_num <= last_batch_num:
                    batch_num += 1
                    continue

                # centre phrase, its positive context, and freshly drawn negative samples
                phr = sample['center']
                pos_context = sample['context']
                neg_v = np.random.choice(self.utils.sample_table, size=(len(phr) * self.neg_sample_num)).tolist()

                # map each phrase id to its sequence of word indices
                phr = [phr2idx(self.utils.phrase_dic[phr_id.item()], self.word2idx) for phr_id in phr]
                pos_context = [phr2idx(self.utils.phrase_dic[item.item()], self.word2idx) for item in pos_context]
                neg_v = [phr2idx(self.utils.phrase_dic[item], self.word2idx) for item in neg_v]

                # forward pass, backprop, and parameter update
                optimizer.zero_grad()
                loss = model(phr, pos_context, neg_v)
                loss.backward()
                optimizer.step()
                batch_costs.append(loss.cpu().item())

                # print the average cost every 5000 batches
                if batch_num % 5000 == 0:
                    print('Batches Average Loss: {}, Batches: {} '.format(
                        sum(batch_costs) / float(len(batch_costs)),
                        batch_num))
                    batch_costs = []

                # save the model every 300000 batches (skipping the trivial save at batch 0)
                if batch_num % 300000 == 0 and batch_num > 0:
                    print("Saving at {} batches".format(batch_num))
                    state = {'epoch': epoch + 1,
                             'state_dict': model.state_dict(),
                             'optimizer': optimizer.state_dict(),
                             'word2idx': self.word2idx,
                             'idx2word': self.utils.idx2word,
                             'batch_num': batch_num,
                             'loss': loss.cpu().item()}

                    save_checkpoint(state,
                                    filename=self.odir_checkpoint + '{}_checkpoint_batch_{}.pth.tar'.format(
                                        config.dataset_name,
                                        batch_num))
                ###
                batch_num += 1

            # reset the generator on the dataset so the next epoch iterates from the start
            if epoch + 1 != self.epochs:
                dataset.reset_generator()

            # save the model on each epoch
            state = {'epoch': epoch + 1,
                     'state_dict': model.state_dict(),
                     'optimizer': optimizer.state_dict(),
                     'word2idx': self.word2idx,
                     'idx2word': self.utils.idx2word}

            save_checkpoint(state, filename=self.odir_checkpoint + config.checkpoint_name.format(epoch + 1))
            # TODO do something better here
            config.checkpoint_name = config.checkpoint_name.format(epoch + 1)

        # training has finished; save the word embeddings
        print("Optimization Finished!")
        self.wv = model.save_embeddings(file_name=self.odir_embeddings + self.output_file,
                                        idx2word=self.utils.idx2word,
                                        use_cuda=use_cuda)
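
What each optimizer step above minimizes is, in all likelihood, the standard skip-gram objective with negative sampling: pull the encoding of a centre phrase towards its observed context and push it away from the sampled negatives. Below is a minimal sketch of such a loss, assuming the GRU has already encoded every phrase into a fixed-size vector; the function and its tensor shapes are illustrative, not the repo's actual `GRUEncoder.forward`.

import torch
import torch.nn.functional as F

def neg_sampling_loss(center, context, negatives):
    # center:    (B, D) encoded centre phrases
    # context:   (B, D) encoded positive context phrases
    # negatives: (B, K, D) encoded negative samples
    pos_score = (center * context).sum(dim=1)                         # (B,)
    neg_score = torch.bmm(negatives, center.unsqueeze(2)).squeeze(2)  # (B, K)
    # -log sigmoid(u.v_pos) - sum_k log sigmoid(-u.v_neg_k), averaged over the batch
    return -(F.logsigmoid(pos_score) + F.logsigmoid(-neg_score).sum(dim=1)).mean()

# toy check with random encodings
B, K, D = 3, 5, 16
loss = neg_sampling_loss(torch.randn(B, D), torch.randn(B, D), torch.randn(B, K, D))
print(loss.item())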