Exemple #1
0
def evaluate_model(evalparams):
    """Evaluate a saved relation model on one dataset split.

    Seeds torch/random, loads the model config, weights and vocab, runs
    prediction over the whole split, scores it, and optionally pickles the
    per-example probability scores to ``evalparams.out``.

    Args:
        evalparams: namespace with at least ``seed``, ``cpu``, ``cuda``,
            ``model_dir``, ``model``, ``dataset`` and ``out`` attributes.

    Returns:
        Tuple of (gold labels, predicted labels, loaded model).
    """
    torch.manual_seed(evalparams.seed)
    random.seed(1234)
    if evalparams.cpu:
        evalparams.cuda = False
    elif evalparams.cuda:  # BUG FIX: was `evalparams.cud` (AttributeError)
        # BUG FIX: was `args.seed` -- `args` is not defined in this scope
        torch.cuda.manual_seed(evalparams.seed)

    # load opt
    print(evalparams.model_dir, evalparams.model)
    # NOTE(review): model path is hard-coded to the CWD; the commented line
    # below shows the presumably intended location -- confirm which is wanted.
    #     model_file = evalparams.model_dir + "/" + evalparams.model
    model_file = 'best_model.pt'
    print("Loading model from {}".format(model_file))
    opt = torch_utils.load_config(model_file)
    model = RelationModel(opt)
    model.load(model_file)

    # load vocab; it must be the one the checkpoint was trained with
    vocab_file = evalparams.model_dir + '/vocab.pkl'
    vocab = Vocab(vocab_file, load=True)
    assert opt[
        'vocab_size'] == vocab.size, "Vocab size must match that in the saved model."

    # load data
    data_file = opt['data_dir'] + '/{}.json'.format(evalparams.dataset)
    print("Loading data from {} with batch size {}...".format(
        data_file, opt['batch_size']))
    batch = DataLoader(data_file,
                       opt['batch_size'],
                       opt,
                       vocab,
                       evaluation=True)

    helper.print_config(opt)
    # invert LABEL_TO_ID so numeric predictions can be decoded to label strings
    id2label = dict([(v, k) for k, v in constant.LABEL_TO_ID.items()])

    predictions = []
    all_probs = []
    for i, b in enumerate(batch):
        preds, probs, _ = model.predict(b)
        predictions += preds
        all_probs += probs
    predictions = [id2label[p] for p in predictions]
    p, r, f1 = scorer.score(batch.gold(), predictions, verbose=True)

    # save probability scores
    if len(evalparams.out) > 0:
        helper.ensure_dir(os.path.dirname(evalparams.out))
        with open(evalparams.out, 'wb') as outfile:
            pickle.dump(all_probs, outfile)
        print("Prediction scores saved to {}.".format(evalparams.out))

    print("Evaluation ended.")

    return (batch.gold(), predictions, model)
Exemple #2
0
def load_best_model(model_dir, model_type="predictor"):
    """Load the best checkpoint saved under *model_dir*.

    Builds either a Predictor- or a Selector-backed Trainer (depending on
    *model_type*), restores the saved weights, prints the restored config,
    and returns the loaded Trainer.
    """
    checkpoint = model_dir + "/best_model.pt"
    print("Loading model from {}".format(checkpoint))
    restored_opt = torch_utils.load_config(checkpoint)
    # Pick the component class for this trainer type.
    component_cls = Predictor if model_type == "predictor" else Selector
    trainer = Trainer(restored_opt, component_cls(restored_opt),
                      model_type=model_type)
    trainer.load(checkpoint)
    helper.print_config(restored_opt)
    return trainer
Exemple #3
0
                       evaluation=True)

# Zero-pad single-character run ids so directory names sort consistently.
model_id = opt['id'] if len(opt['id']) > 1 else '0' + opt['id']
model_save_dir = opt['save_dir'] + '/' + model_id
opt['model_save_dir'] = model_save_dir
helper.ensure_dir(model_save_dir, verbose=True)

# save config
helper.save_config(opt, model_save_dir + '/config.json', verbose=True)
vocab.save(model_save_dir + '/vocab.pkl')
# Tab-separated per-epoch log written next to the checkpoints.
file_logger = helper.FileLogger(
    model_save_dir + '/' + opt['log'],
    header="# epoch\ttrain_loss\tdev_loss\tdev_score\tbest_dev_score")

# print model info
helper.print_config(opt)

# model
if not opt['load']:
    trainer = GCNTrainer(opt, emb_matrix=emb_matrix)
else:
    # load pretrained model
    model_file = opt['model_file']
    print("Loading model from {}".format(model_file))
    model_opt = torch_utils.load_config(model_file)
    # keep the optimizer choice of the *current* run, not the checkpoint's
    model_opt['optim'] = opt['optim']
    trainer = GCNTrainer(model_opt)
    trainer.load(model_file)

# invert label->id mapping so numeric predictions can be decoded
id2label = dict([(v, k) for k, v in label2id.items()])
dev_score_history = []
Exemple #4
0
def transre_search(ffn, connect, hidden_dim, trans_layers, multi_heads,
                   ffn_ex_size, initial, final):
    """Train one TransRE hyper-parameter configuration end to end.

    Mutates the module-level ``opt`` dict with the given hyper-parameters,
    trains for ``opt['num_epoch']`` epochs with per-epoch dev evaluation and
    checkpointing, keeps the best model by dev F1, and applies lr decay for
    SGD-family optimizers.

    Returns:
        ``(model_name, dev_score)`` of the best epoch, or the string
        "unknown" if the epoch loop never ran.
    """
    opt['weighted'] = False
    opt['rnn'] = False
    opt['ffn'] = ffn
    opt['connect'] = connect
    opt['hidden_dim'] = hidden_dim
    opt['trans_layers'] = trans_layers
    opt['multi_heads'] = multi_heads
    opt['ffn_ex_size'] = ffn_ex_size
    opt['initial'] = initial
    opt['final'] = final

    # Zero-pad single-character ids; encode the hyper-parameters in the name
    # so each configuration gets its own save directory.
    id = opt['id'] if len(opt['id']) > 1 else '0' + opt['id']
    model_name =str (opt['optim']) + '_' + str (opt['lr']) + str (ffn) + '_' +str(connect)+"_"\
                + str (hidden_dim) + '_' + str (trans_layers) + '_' + str (multi_heads) + '_' + \
                str (ffn_ex_size)+'_'+str(initial)+'_'+str(final)
    # NOTE(review): '' joins memo with no separator -- '_' may have been
    # intended; confirm before relying on the directory naming scheme.
    model_name = model_name + '' + str(opt['memo'])

    model_name = str(id) + "_" + model_name

    model_save_dir = opt['save_dir'] + '/' + model_name
    opt['model_save_dir'] = model_save_dir
    helper.ensure_dir(model_save_dir, verbose=True)

    # save config
    helper.save_config(opt, model_save_dir + '/config.json', verbose=True)
    vocab.save(model_save_dir + '/vocab.pkl')
    file_logger = helper.FileLogger(
        model_save_dir + '/' + opt['log'],
        header="# epoch\ttrain_loss\tdev_loss\tdev_score\tbest_dev_score")
    helper.print_config(opt)

    if not opt['load']:
        trainer = TransTrainer(opt, emb_matrix=emb_matrix)
    else:
        # load pre-train model
        model_file = opt['model_file']
        print("Loading model from {}".format(model_file))
        model_opt = torch_utils.load_config(model_file)
        # keep the optimizer choice of the current run, not the checkpoint's
        model_opt['optim'] = opt['optim']
        trainer = TransTrainer(model_opt)
        trainer.load(model_file)

    # invert label->id mapping so numeric predictions decode to label strings
    id2label = dict([(v, k) for k, v in label2id.items()
                     ])  # the classification result
    dev_score_history = []
    dev_loss_history = []
    current_lr = opt['lr']

    global_step = 0
    format_str = '{}: step {}/{} (epoch {}/{}), loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}'
    max_steps = len(train_batch) * opt['num_epoch']

    best_result = "unknown"
    file_logger.log(str(opt['memo']))
    for epoch in range(1, opt['num_epoch'] + 1):
        train_loss = 0
        epoch_start_time = time.time()
        for i, batch in enumerate(train_batch):
            start_time = time.time()
            global_step += 1
            loss, norm = trainer.update(batch)
            train_loss += loss
            # periodic progress logging; duration covers only the last batch
            if global_step % opt['log_step'] == 0:
                duration = time.time() - start_time
                print(
                    format_str.format(datetime.now(), global_step, max_steps,
                                      epoch, opt['num_epoch'], loss, duration,
                                      current_lr))

        print("Evaluating on dev set...")
        predictions = []
        dev_loss = 0
        for i, batch in enumerate(dev_batch):
            preds, _, loss, _ = trainer.predict(batch)
            predictions += preds
            dev_loss += loss
        predictions = [id2label[p] for p in predictions]
        # normalize summed losses to an average loss per batch
        train_loss = train_loss / train_batch.num_examples * opt[
            'batch_size']  # avg loss per batch
        dev_loss = dev_loss / dev_batch.num_examples * opt['batch_size']

        acc, dev_p, dev_r, dev_f1 = scorer.score(dev_batch.gold(), predictions)
        print(
            "epoch {}: train_loss = {:.6f}, dev_loss = {:.6f}, dev_f1 = {:.4f}"
            .format(epoch, train_loss, dev_loss, dev_f1))
        dev_score = dev_f1
        file_logger.log("{}\t{:.3f}\t{:.6f}\t{:.6f}\t{:.4f}\t{:.4f}".format(
            epoch, acc, train_loss, dev_loss, dev_score,
            max([dev_score] + dev_score_history)))

        # save a checkpoint every epoch; non-keeper epochs are deleted below
        model_file = model_save_dir + '/checkpoint_epoch_{}.pt'.format(epoch)
        trainer.save(model_file, epoch)

        if epoch == 1 or dev_score > max(dev_score_history):
            copyfile(model_file, model_save_dir + '/best_model.pt')
            best_result = (model_name, dev_score)
            print("new best model saved.")
            file_logger.log(
                "new best model saved at epoch {}: {:.2f}\t{:.2f}\t{:.2f}".
                format(epoch, dev_p * 100, dev_r * 100, dev_score * 100))
        # keep only every save_epoch-th checkpoint on disk
        if epoch % opt['save_epoch'] != 0:
            os.remove(model_file)

        # lr schedule: decay once past decay_epoch when dev score stagnates,
        # but only for optimizers without their own adaptive schedule
        if len(dev_score_history
               ) > opt['decay_epoch'] and dev_score <= dev_score_history[
                   -1] and opt['optim'] in ['sgd', 'adagrad', 'adadelta']:
            current_lr *= opt['lr_decay']
            trainer.update_lr(current_lr)

        dev_score_history += [dev_score]
        dev_loss_history += [dev_loss]
        epoch_end_time = time.time()
        print("epoch time {:.3f}".format(epoch_end_time - epoch_start_time))
    return best_result
Exemple #5
0
def main():
    """Train a PCNN-NMAR model over bag-level distant-supervision data.

    Python 2 code (print statements, ``xrange``). Parses CLI args, loads a
    word2vec binary embedding and relation dictionary, builds bag-level
    training data, then trains with an optional bag-size adaptive learning
    rate, evaluating sententially each epoch and keeping the best model by
    sentential AUC.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir', type=str, default='data/')
    parser.add_argument('--save_dir', type=str, default='saved_models')

    # Model parameters
    parser.add_argument('--emb_dim', type=int, default=50, help='Word embedding dimension.')
    parser.add_argument('--pos_dim', type=int, default=5, help='Position embedding dimension.')
    parser.add_argument('--pos_limit', type=int, default=30, help='Position embedding length limit.')
    parser.add_argument('--num_conv', type=int, default=230, help='The number of convolutional filters.')
    parser.add_argument('--win_size', type=int, default=3, help='Convolutional filter size.')
    parser.add_argument('--dropout', type=float, default=0.5, help='The rate at which randomly set a parameter to 0.')
    parser.add_argument('--lr', type=float, default=0.01, help='Applies to SGD.')
    parser.add_argument('--num_epoch', type=int, default=15)
    parser.add_argument('--num_rand_start', type=int, default=30)
    parser.add_argument('--penal_scalar', type=int, default=500)

    # paired store_true/store_false flags defaulting to adaplr=True
    parser.add_argument('--adaplr', dest='adaplr', action='store_true', help='Use bag-size adaptive learning rate.')
    parser.add_argument('--no-adaplr', dest='adaplr', action='store_false')
    parser.set_defaults(adaplr=True)
    parser.add_argument('--adaplr_beta1', type=float, default=20.0)
    parser.add_argument('--adaplr_beta2', type=float, default=25.0)

    parser.add_argument('--sen_file', type=str, default='sentential_DEV.txt', help='Sentential eval dataset.')
    # NOTE(review): type=bool on argparse does not parse "False" -- any
    # non-empty string is truthy; confirm these flags behave as intended.
    parser.add_argument('--heldout_eval', type=bool, default=False, help='Perform heldout evaluation after each epoch.')
    parser.add_argument('--save_each_epoch', type=bool, default=False, help='Save the checkpoint of each epoch.')

    # parser.add_argument('--seed', type=int, default=666)
    parser.add_argument('--trial_exp', dest='trial', action='store_true', help='Use partial training data.')
    parser.set_defaults(trial=False)
    parser.add_argument('--num_trial', type=int, default=10000)
    parser.add_argument('--log_step', type=int, default=20000)
    parser.add_argument('--num_exp', type=int, default=0)

    parser.add_argument('--cuda', type=bool, default=torch.cuda.is_available())
    parser.add_argument('--cpu', action='store_true', help='Ignore CUDA.')
    args = parser.parse_args()

    if args.cpu:
        args.cuda = False

#     # Set random seed
#     torch.manual_seed(args.seed)
#     np.random.seed(args.seed)
#     random.seed(args.seed)
#     torch.backends.cudnn.deterministic = True
#     torch.backends.cudnn.benchmark = False

#     if args.cuda:
#         torch.cuda.manual_seed(args.seed)


    # make opt: the config dict shares storage with args via vars()
    opt = vars(args)

    opt['train_file'] = opt['data_dir'] + '/' + 'train.txt'
    opt['test_file'] = opt['data_dir'] + '/' + 'test.txt'
    opt['sen_dev_file'] = opt['data_dir'] + '/' + 'sentential_DEV.txt'
    opt['vocab_file'] = opt['data_dir'] + '/' + 'vec.bin'
    opt['rel_file'] = opt['data_dir'] + '/' + 'relation2id.txt'
    # data_name is the last path component, tolerating a trailing slash
    if opt['data_dir'].split('/')[-1] != '':
        opt['data_name'] = opt['data_dir'].split('/')[-1]
    else:
        opt['data_name'] = opt['data_dir'].split('/')[-2]


    # Pretrained word embedding; index 0 is reserved for the UNK token
    print "\nPretrained word embedding loaded"
    w2v_model = gensim.models.KeyedVectors.load_word2vec_format(opt['vocab_file'], binary=True)
    word_list = [u'UNK'] + w2v_model.index2word
    word_vec = w2v_model.syn0

    word2id = {}

    for id, word in enumerate(word_list):
        word2id[word] = id

    assert opt['emb_dim'] == w2v_model.syn0.shape[1]


    # Read from relation2id.txt to build a dictionary: rel2id
    rel2id = {}

    with open(opt['rel_file'],'rb') as f:
        for item in f:
            [relation, id] = item.strip('\n').split(' ')
            rel2id[relation] = int(id)

    # id2rel is the inverse list: index == relation id
    id2rel = [''] * len(rel2id)
    for relation, rel_id in rel2id.items():
        id2rel[rel_id] = relation

    opt['num_rel'] = len(rel2id)
    opt['vocab_size'] = len(word_list)


    # Load data; position-embedding ranges come from the data itself
    all_data = loader.DataLoader(opt, word2id, rel2id)
    opt['pos_e1_size'] = all_data.pos_max_e1 - all_data.pos_min_e1 + 1
    opt['pos_e2_size'] = all_data.pos_max_e2 - all_data.pos_min_e2 + 1
    opt['pos_min_e1'] = all_data.pos_min_e1
    opt['pos_min_e2'] = all_data.pos_min_e2
    opt['EP_num_train'] = len(all_data.bags_train)
    opt['EP_num_test'] = len(all_data.bags_test)


    assert opt['pos_e1_size'] == opt['pos_e2_size']

    helper.check_dir(opt['save_dir'])
    helper.print_config(opt)


    # Get KB disagreement penalty
    kb_score_all = kb_info.get_MIT_MID_score(all_data.bags_train, all_data.train_bags_label, opt, rel2id, id2rel)


    # Get hamming score
    ham_score_all = kb_info.getting_hamming_score(all_data.bags_train, all_data.train_bags_label, opt)


    # Build the model
    PCNN_NMAR_model = PCNN_NMAR(word_vec, opt)

    if opt['cuda']:
        PCNN_NMAR_model.cuda()

    # NOTE(review): loss_function is defined but the training loop below uses
    # the loss returned by the model itself -- appears unused here.
    loss_function = nn.NLLLoss()
    optimizer = optim.SGD(PCNN_NMAR_model.parameters(), lr=opt['lr'])

    print "Training starts."

    for epoch in xrange(opt['num_epoch']):

        opt['epoch'] = epoch

        start_time = time.time()

        total_loss = np.float64(0.0)

        train_part = all_data.bags_train.keys()[:]

        if opt['trial']:
            train_part = train_part[:opt['num_trial']]

        random.shuffle(train_part)


        for index, bag_name in enumerate(train_part):

            if index > 0 and index % opt['log_step'] == 0:
                print '{}: train examples {}/{} (epoch {}/{}), loss = {:.6f} '.format(datetime.now(), index, opt['EP_num_train'], epoch+1, opt['num_epoch'], total_loss)

            optimizer.zero_grad()

            sentence_list = all_data.bags_train[bag_name]
            target = all_data.train_bags_label[bag_name]
            kb_score = kb_score_all[bag_name]
            ham_score = ham_score_all[bag_name]

            # model returns the backprop-able loss and the (augmented) scalar
            BPable_loss, loss_augmented = PCNN_NMAR_model(sentence_list, target, all_data, kb_score, ham_score)

            # Check if there is search error
            assert loss_augmented >= 0

            total_loss += loss_augmented


            # Apply bag-size adaptive learning rate: down-weight large bags
            if opt['adaplr']:
                if len(sentence_list) <= opt['adaplr_beta1']:
                    adaplr_scalar = 1
                elif len(sentence_list) <= opt['adaplr_beta2']:
                    adaplr_scalar = (float(opt['adaplr_beta1']) / len(sentence_list))
                else:
                    adaplr_scalar = (float(opt['adaplr_beta1']) / len(sentence_list)) ** 2

                BPable_loss = BPable_loss * adaplr_scalar

            BPable_loss.backward()
            optimizer.step()



        stop_time = time.time()
        print 'For epoch {}/{}, training time:{}, training loss: {:.6f}'.format(epoch+1, opt['num_epoch'], stop_time - start_time, total_loss)



        # Sentential evaluation
        sen_AUC = PCNN_NMAR_model.sentential_eval(opt['sen_dev_file'], all_data, rel2id, id2rel)
        print 'The sentential AUC of P/R curve on DEV set: {:.3f}'.format(sen_AUC)


        # Heldout evaluation
        if opt['heldout_eval']:
            recall, precision = PCNN_NMAR_model.heldout_eval(all_data)
            heldout_AUC = metrics.auc(recall, precision) if len(recall) != 0 else 0
            print "The heldout AUC of P/R curve: {:.4f}".format(heldout_AUC)


        # Save parameters in each epoch
        model_file = opt['save_dir'] + '/' + opt['data_name'] + '_' + \
                    'lr{}_penal{}_epoch{}.tar'.format(opt['lr'], opt['penal_scalar'], epoch)
        # print model_file

        if opt['save_each_epoch']:
            torch.save({
                'state_dict': PCNN_NMAR_model.state_dict(),
                'config': opt
            }, model_file )


        best_file = opt['save_dir'] + '/' + opt['data_name'] + '_' + \
                    'lr{}_penal{}_best_model.tar'.format(opt['lr'], opt['penal_scalar'])

        # best_AUC is first assigned at epoch 0; later epochs rely on the
        # short-circuit `epoch == 0 or ...` to avoid a NameError
        if epoch == 0 or best_AUC < sen_AUC:

            best_AUC = sen_AUC

            torch.save({
                'state_dict': PCNN_NMAR_model.state_dict(),
                'config': opt
            }, best_file )
Exemple #6
0
def train_unbiased_model(args, biased_batch_probs):
    """Train a relation model whose updates are re-weighted by bias probabilities.

    Args:
        args: argparse.Namespace with the training configuration (vocab_dir,
            data_dir, save_dir, batch_size, num_epoch, lr, optim, etc.).
        biased_batch_probs: per-training-batch probabilities from a biased
            model; ``biased_batch_probs[i]`` is passed to ``model.update``
            for batch ``i`` each epoch.

    Side effects:
        Writes config/vocab/checkpoints under ``save_dir/<model_id>``, logs
        per-epoch metrics, dumps dev gold labels to ``label.txt``, and keeps
        the best checkpoint by dev F1 as ``best_model.pt``.
    """
    # make opt
    opt = vars(args)
    opt["num_class"] = len(constant.LABEL_TO_ID)

    # load vocab; embedding matrix rows must line up with the vocab
    vocab_file = opt['vocab_dir'] + '/vocab.pkl'
    vocab = Vocab(vocab_file, load=True)
    opt['vocab_size'] = vocab.size
    emb_file = opt['vocab_dir'] + '/embedding.npy'
    emb_matrix = np.load(emb_file)
    assert emb_matrix.shape[0] == vocab.size
    assert emb_matrix.shape[1] == opt['emb_dim']

    # load data
    print("Loading data from {} with batch size {}...".format(
        opt["data_dir"], opt["batch_size"]))
    train_batch = DataLoader(
        opt["data_dir"] + "/" + args.data_name,
        opt["batch_size"],
        opt,
        vocab,
        evaluation=False,
    )
    dev_batch = DataLoader(opt["data_dir"] + "/dev.json",
                           opt["batch_size"],
                           opt,
                           vocab,
                           evaluation=True)

    # zero-pad single-character ids so save directories sort consistently
    model_id = opt["id"] if len(opt["id"]) > 1 else "0" + opt["id"]
    model_save_dir = opt["save_dir"] + "/" + model_id
    opt["model_save_dir"] = model_save_dir
    helper.ensure_dir(model_save_dir, verbose=True)

    # save config
    helper.save_config(opt, model_save_dir + "/config.json", verbose=True)
    vocab.save(model_save_dir + "/vocab.pkl")
    file_logger = helper.FileLogger(
        model_save_dir + "/" + opt["log"],
        header="# epoch\ttrain_loss\tdev_loss\tdev_f1")

    # print model info
    helper.print_config(opt)

    # model
    model = RelationModel(opt, emb_matrix=emb_matrix)

    # invert label->id mapping so numeric predictions decode to label strings
    id2label = dict([(v, k) for k, v in constant.LABEL_TO_ID.items()])
    dev_f1_history = []
    current_lr = opt["lr"]

    global_step = 0
    global_start_time = time.time()
    format_str = (
        "{}: step {}/{} (epoch {}/{}), loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}"
    )
    max_steps = len(train_batch) * opt["num_epoch"]

    # start training
    for epoch in range(1, opt["num_epoch"] + 1):
        train_loss = 0
        for i, batch in enumerate(train_batch):
            start_time = time.time()
            global_step += 1
            # feed the matching biased-probability tensor alongside the batch
            loss = model.update(batch,
                                torch.tensor(biased_batch_probs[i]).cuda())
            train_loss += loss
            if global_step % opt["log_step"] == 0:
                duration = time.time() - start_time
                print(
                    format_str.format(
                        datetime.now(),
                        global_step,
                        max_steps,
                        epoch,
                        opt["num_epoch"],
                        loss,
                        duration,
                        current_lr,
                    ))

        # eval on dev
        print("Evaluating on dev set...")
        predictions = []
        dev_loss = 0
        for i, batch in enumerate(dev_batch):
            preds, _, loss = model.predict(batch)
            predictions += preds
            dev_loss += loss
        predictions = [id2label[p] for p in predictions]
        dev_p, dev_r, dev_f1 = scorer.score(dev_batch.gold(), predictions)

        # FIX: use a context manager so the file is closed even if write
        # raises (was a bare open/close pair)
        with open("label.txt", "w+") as f:
            f.write(str(dev_batch.gold()))

        # normalize summed losses to an average loss per batch
        train_loss = (train_loss / train_batch.num_examples * opt["batch_size"]
                      )  # avg loss per batch
        dev_loss = dev_loss / dev_batch.num_examples * opt["batch_size"]
        print(
            "epoch {}: train_loss = {:.6f}, dev_loss = {:.6f}, dev_f1 = {:.4f}"
            .format(epoch, train_loss, dev_loss, dev_f1))
        file_logger.log("{}\t{:.6f}\t{:.6f}\t{:.4f}".format(
            epoch, train_loss, dev_loss, dev_f1))

        # save a checkpoint every epoch; non-keeper epochs are removed below
        model_file = model_save_dir + "/checkpoint_epoch_{}.pt".format(epoch)
        model.save(model_file, epoch)
        if epoch == 1 or dev_f1 > max(dev_f1_history):
            copyfile(model_file, model_save_dir + "/best_model.pt")
            print("new best model saved.")
        if epoch % opt["save_epoch"] != 0:
            os.remove(model_file)

        # lr schedule: decay after 10 epochs once dev F1 stagnates, only for
        # optimizers without their own adaptive schedule
        if (len(dev_f1_history) > 10 and dev_f1 <= dev_f1_history[-1]
                and opt["optim"] in ["sgd", "adagrad"]):
            current_lr *= opt["lr_decay"]
            model.update_lr(current_lr)

        dev_f1_history += [dev_f1]
        print("")

    print("Training ended with {} epochs.".format(epoch))
def main():
    """Train a PCNN-ATT model over bag-level distant-supervision data.

    Python 2 code (print statements, ``xrange``). Requires CUDA: the model
    and loss targets are moved to the GPU unconditionally. Loads a word2vec
    binary embedding and relation dictionary, trains with SGD over shuffled
    bags, evaluates AUC each epoch, and keeps the best model by test AUC.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir', type=str, default='data/')
    # parser.add_argument('--train_file', type=str, default='data/train.txt')
    # parser.add_argument('--test_file', type=str, default='data/test.txt')
    # parser.add_argument('--vocab_file', type=str, default='data/vec.bin')
    # parser.add_argument('--rel_file', type=str, default='data/relation2id.txt')
    parser.add_argument('--save_dir', type=str, default='saved_models')

    # Model parameters
    parser.add_argument('--emb_dim',
                        type=int,
                        default=50,
                        help='Word embedding dimension.')
    parser.add_argument('--pos_dim',
                        type=int,
                        default=5,
                        help='Position embedding dimension.')
    parser.add_argument('--pos_limit',
                        type=int,
                        default=30,
                        help='Position embedding length limit.')
    parser.add_argument('--num_conv',
                        type=int,
                        default=230,
                        help='The number of convolutional filters.')
    parser.add_argument('--win_size',
                        type=int,
                        default=3,
                        help='Convolutional filter size.')
    parser.add_argument(
        '--dropout',
        type=float,
        default=0.5,
        help='The rate at which randomly set a parameter to 0.')
    parser.add_argument('--lr',
                        type=float,
                        default=0.001,
                        help='Applies to SGD.')
    parser.add_argument('--num_epoch', type=int, default=15)

    parser.add_argument('--num_trial', type=int, default=50000)
    # NOTE(review): type=bool on argparse does not parse "False" -- any
    # non-empty string is truthy; confirm this flag behaves as intended.
    parser.add_argument('--trial', type=bool, default=False)

    parser.add_argument('--cuda', type=bool, default=torch.cuda.is_available())
    parser.add_argument('--cpu', action='store_true', help='Ignore CUDA.')
    args = parser.parse_args()

    if args.cpu:
        args.cuda = False

    # make opt: the config dict shares storage with args via vars()
    opt = vars(args)

    opt['train_file'] = opt['data_dir'] + '/' + 'train.txt'
    opt['test_file'] = opt['data_dir'] + '/' + 'test.txt'
    opt['vocab_file'] = opt['data_dir'] + '/' + 'vec.bin'
    opt['rel_file'] = opt['data_dir'] + '/' + 'relation2id.txt'

    # Pretrained word embedding; index 0 is reserved for the UNK token
    print "Load pretrained word embedding"
    w2v_model = gensim.models.KeyedVectors.load_word2vec_format(
        opt['vocab_file'], binary=True)
    word_list = [u'UNK'] + w2v_model.index2word
    word_vec = w2v_model.syn0

    word_map = {}

    for id, word in enumerate(word_list):
        word_map[word] = id

    assert opt['emb_dim'] == w2v_model.syn0.shape[1]

    # Read from relation2id.txt to build a dictionary: rel_map
    rel_map = {}

    with open(opt['rel_file'], 'rb') as f:
        for item in f:
            [relation, id] = item.strip('\n').split(' ')
            rel_map[relation] = int(id)

    opt['num_rel'] = len(rel_map)
    opt['vocab_size'] = len(word_list)

    # Load data; position-embedding ranges come from the data itself
    all_data = loader.DataLoader(opt['train_file'], opt['test_file'], opt,
                                 word_map, rel_map)
    opt['pos_e1_size'] = all_data.pos_max_e1 - all_data.pos_min_e1 + 1
    opt['pos_e2_size'] = all_data.pos_max_e2 - all_data.pos_min_e2 + 1
    opt['pos_min_e1'] = all_data.pos_min_e1
    opt['pos_min_e2'] = all_data.pos_min_e2

    assert opt['pos_e1_size'] == opt['pos_e2_size']

    helper.check_dir(opt['save_dir'])
    helper.print_config(opt)

    PCNN_ATT_model = PCNN_ATT(word_vec, opt)
    # NOTE(review): .cuda() is called unconditionally despite the --cpu flag
    PCNN_ATT_model.cuda()

    loss_function = nn.NLLLoss()
    optimizer = optim.SGD(PCNN_ATT_model.parameters(), lr=opt['lr'])

    start_time = time.time()

    print "Training starts."

    for epoch in xrange(opt['num_epoch']):

        print 'The running time of epoch %d:' % (epoch),

        total_loss = torch.Tensor([0]).cuda()

        # optionally truncate to a trial subset of the training bags
        if opt['trial']:
            train_part = all_data.bags_train.keys()[:opt['num_trial']]
        else:
            train_part = all_data.bags_train.keys()[:]

        shuffle(train_part)

        for index, bag_name in enumerate(train_part):

            # if index % 10000 == 0:
            #     print 'index == ', index

            optimizer.zero_grad()

            sentence_list = all_data.bags_train[bag_name]

            # bag label comes from the first sentence's relation
            target = int(all_data.train_rel[sentence_list[0]])

            try:
                log_probs = PCNN_ATT_model(sentence_list, target, all_data)
            except:
                # re-raise after printing which bag failed
                print index, len(sentence_list)
                raise

            target = autograd.Variable(torch.LongTensor([target]).cuda())

            loss = loss_function(log_probs, target)

            loss.backward()
            optimizer.step()

            total_loss += loss.data

        # Eval and get the AUC
        recall, precision = PCNN_ATT_model.test(all_data)
        test_AUC = metrics.auc(recall, precision)

        # Save parameters in each epoch
        model_file = opt['save_dir'] + '/checkpoint_epoch_%s.tar' % epoch

        torch.save({
            'state_dict': PCNN_ATT_model.state_dict(),
        }, model_file)

        best_file = opt['save_dir'] + '/best_model.tar'

        # best_AUC is first assigned at epoch 0; later epochs rely on the
        # short-circuit `epoch == 0 or ...` to avoid a NameError
        if epoch == 0 or best_AUC < test_AUC:

            best_AUC = test_AUC

            torch.save({
                'state_dict': PCNN_ATT_model.state_dict(),
            }, best_file)

        stop_time = time.time()
        print '%f; the total loss: %f; the AUC of P/R curve: %f' % (
            stop_time - start_time, total_loss.cpu().numpy()[0], test_AUC)
        start_time = stop_time
Exemple #8
0
def main():
    """Train a transformer-based relation model with per-epoch dev evaluation.

    Reads configuration from a module-level ``args`` namespace (defined
    outside this block -- TODO confirm where it is parsed). Seeds all RNGs,
    loads vocab/embeddings/data, trains for ``num_epoch`` epochs, keeps the
    best checkpoint by dev F1, and applies an lr schedule (or an optional
    warm-up trick) for non-"noopt" optimizers.
    """
    # set top-level random seeds
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)

    if args.cpu:
        args.cuda = False
    elif args.cuda:
        # force random seed for reproducibility
        # also apply same seed to numpy in every file
        torch.backends.cudnn.deterministic = True
        torch.cuda.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)

    # make opt: the config dict shares storage with args via vars()
    opt = vars(args)
    opt['num_class'] = len(constant.LABEL_TO_ID)

    # load vocab
    vocab_file = opt['vocab_dir'] + '/vocab.pkl'
    vocab = Vocab(vocab_file, load=True)

    # in some previous experiments we saw that lower vocab size can improve performance
    # but it was in a completely different project although on the same data
    # here it seems it's much harder to get this to work
    # uncomment the following line if this is solved:
    # new_vocab_size = 30000

    opt['vocab_size'] = vocab.size
    emb_file = opt['vocab_dir'] + '/embedding.npy'
    emb_matrix = np.load(emb_file)
    # embedding matrix rows must line up with the vocab
    assert emb_matrix.shape[0] == vocab.size
    assert emb_matrix.shape[1] == opt['emb_dim']

    # load data
    print("Loading data from {} with batch size {}...".format(
        opt['data_dir'], opt['batch_size']))
    train_batch = DataLoader(opt['data_dir'] + '/train.json',
                             opt['batch_size'],
                             opt,
                             vocab,
                             evaluation=False)
    dev_batch = DataLoader(opt['data_dir'] + '/dev.json',
                           opt['batch_size'],
                           opt,
                           vocab,
                           evaluation=True)

    # zero-pad single-character ids so save directories sort consistently
    model_id = opt['id'] if len(opt['id']) > 1 else '0' + opt['id']
    model_save_dir = opt['save_dir'] + '/' + model_id
    opt['model_save_dir'] = model_save_dir
    helper.ensure_dir(model_save_dir, verbose=True)

    # save config
    helper.save_config(opt, model_save_dir + '/config.json', verbose=True)
    vocab.save(model_save_dir + '/vocab.pkl')
    file_logger = helper.FileLogger(
        model_save_dir + '/' + opt['log'],
        header="# epoch\ttrain_loss\tdev_loss\tdev_p\tdev_r\tdev_f1")

    # print model info
    helper.print_config(opt)

    # model
    model = RelationModel(opt, emb_matrix=emb_matrix)

    # invert label->id mapping so numeric predictions decode to label strings
    id2label = dict([(v, k) for k, v in constant.LABEL_TO_ID.items()])
    dev_f1_history = []
    current_lr = opt['lr']

    global_step = 0

    format_str = '{}: step {}/{} (epoch {}/{}), loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}'
    max_steps = len(train_batch) * opt['num_epoch']

    # setup the scheduler for lr decay
    # this doesn't seem to work well compared to what we already have
    # scheduler = ReduceLROnPlateau(model.optimizer, mode='min', factor=opt['lr_decay'], patience=1)

    # start training
    for epoch in range(1, opt['num_epoch'] + 1):
        # TODO: if lr warmup is used, the lr console output is not updated
        # echo the key hyper-parameters at the start of every epoch
        print(
            "Current params: " + " heads-" + str(opt["n_head"]) +
            " enc_layers-" + str(opt["num_layers_encoder"]),
            " drop-" + str(opt["dropout"]) + " scaled_drop-" +
            str(opt["scaled_dropout"]) + " lr-" + str(opt["lr"]),
            " lr_decay-" + str(opt["lr_decay"]) + " max_grad_norm-" +
            str(opt["max_grad_norm"]))
        print(
            " weight_no_rel-" + str(opt["weight_no_rel"]) + " weight_rest-" +
            str(opt["weight_rest"]) + " attn-" + str(opt["attn"]) +
            " attn_dim-" + str(opt["attn_dim"]),
            " obj_sub_pos-" + str(opt["obj_sub_pos"]) + " new_residual-" +
            str(opt["new_residual"]))
        print(
            " use_batch_norm-" + str(opt["use_batch_norm"]) +
            " relative_positions-" + str(opt["relative_positions"]),
            " decay_epoch-" + str(opt["decay_epoch"]) + " use_lemmas-" +
            str(opt["use_lemmas"]), " hidden_self-" + str(opt["hidden_self"]))

        train_loss = 0
        for i, batch in enumerate(train_batch):

            start_time = time.time()
            global_step += 1

            loss = model.update(batch)
            train_loss += float(loss)

            if global_step % opt['log_step'] == 0:
                duration = time.time() - start_time
                print(
                    format_str.format(datetime.now(), global_step, max_steps,
                                      epoch, opt['num_epoch'], loss, duration,
                                      current_lr))
            # do garbage collection,
            # as per https://discuss.pytorch.org/t/best-practices-for-maximum-gpu-utilization/13863/6
            del loss

        # eval on dev
        print("Evaluating on dev set...")
        predictions = []
        dev_loss = 0
        for i, batch in enumerate(dev_batch):
            preds, _, loss = model.predict(batch)
            predictions += preds
            dev_loss += float(loss)
            del loss

        predictions = [id2label[p] for p in predictions]
        dev_p, dev_r, dev_f1 = scorer.score(dev_batch.gold(), predictions)

        # normalize summed losses to an average loss per batch
        train_loss = train_loss / train_batch.num_examples * opt[
            'batch_size']  # avg loss per batch
        dev_loss = dev_loss / dev_batch.num_examples * opt['batch_size']
        print(
            "epoch {}: train_loss = {:.6f}, dev_loss = {:.6f}, dev_f1 = {:.4f}".format(epoch, \
                                                                                       train_loss, dev_loss, dev_f1)
        )
        file_logger.log("{}\t{:.6f}\t{:.6f}\t{:.4f}\t{:.4f}\t{:.4f}".format(
            epoch, train_loss, dev_loss, dev_p, dev_r, dev_f1))

        # save a checkpoint every epoch; non-keeper epochs are removed below
        model_file = model_save_dir + '/checkpoint_epoch_{}.pt'.format(epoch)
        model.save(model_file, epoch)
        if epoch == 1 or dev_f1 > max(dev_f1_history):
            copyfile(model_file, model_save_dir + '/best_model.pt')
            print("new best model saved.")
        if epoch % opt['save_epoch'] != 0:
            os.remove(model_file)

        # reduce learning rate if it stagnates by a certain decay rate and within given epoch patience
        # this for some reason works worth than the implementation we have afterwards
        # scheduler.step(dev_loss)

        if opt["optim"] != "noopt_adam" and opt["optim"] != "noopt_nadam":

            # do warm_up_for sgd only instead of adam
            do_warmup_trick = False

            if do_warmup_trick:
                # print("do_warmup_trick")

                # 1 and 5 first worked kind of
                # 10 and 15
                # Noam-style inverse-sqrt warm-up schedule (disabled above)
                current_lr = 10 * (360**(-0.5) *
                                   min(epoch**(-0.5), epoch * 15**(-1.5)))
                # print("current_lr", current_lr)
                model.update_lr(current_lr)

            else:
                # decay schedule # 15 is best!
                # simulate patience of x epochs
                if len(dev_f1_history
                       ) > opt['decay_epoch'] and dev_f1 <= dev_f1_history[-1]:
                    current_lr *= opt['lr_decay']
                    model.update_lr(current_lr)

        # else, update the learning rate in torch_utils.py

        dev_f1_history += [dev_f1]
        print("")

    print("Training ended with {} epochs.".format(epoch))
Exemple #9
0
def train_model(vocab_params,
                train_params,
                train_batch,
                dev_batch,
                model_id=-1):
    """Train a RelationModel and keep the best checkpoint by dev F1.

    Args:
        vocab_params: argparse.Namespace of vocab/embedding options
            (vocab_dir, emb_dim, id, save_dir, log, ...).
        train_params: argparse.Namespace of training options (seed, cpu,
            cuda, lr, lr_decay, optim, num_epoch, batch_size, log_step,
            save_epoch, ...); merged over vocab_params below.
        train_batch: iterable of training batches; exposes num_examples.
        dev_batch: iterable of dev batches; exposes num_examples and gold().
        model_id: name of the run directory under save_dir; -1 means derive
            it from opt['id'].  NOTE(review): a non-string value other than
            -1 would break the path concatenation below -- confirm callers
            always pass -1 or a string.

    Side effects: writes config.json, vocab.pkl, a tab-separated log file,
    per-epoch checkpoints, and best_model.pt under save_dir/<model_id>.
    """
    # Seed every RNG so runs are reproducible.
    torch.manual_seed(train_params.seed)
    np.random.seed(train_params.seed)
    random.seed(train_params.seed)

    # --cpu wins over --cuda; only seed CUDA when it will actually be used.
    if train_params.cpu:
        train_params.cuda = False
    elif train_params.cuda:
        torch.cuda.manual_seed(train_params.seed)

    # make opt: flatten both namespaces into one plain dict
    opt = vars(vocab_params)

    print(constant.LABEL_TO_ID)
    print(opt)
    opt['num_class'] = len(constant.LABEL_TO_ID)
    # Combine all the parameters together (train_params overrides on clash).
    opt.update(vars(train_params))

    # load vocab and the matching pretrained embedding matrix
    vocab_file = opt['vocab_dir'] + '/vocab.pkl'
    vocab = Vocab(vocab_file, load=True)
    opt['vocab_size'] = vocab.size
    emb_file = opt['vocab_dir'] + '/embedding.npy'
    emb_matrix = np.load(emb_file)
    # Embedding rows/columns must line up with the vocab and config.
    assert emb_matrix.shape[0] == vocab.size
    assert emb_matrix.shape[1] == opt['emb_dim']

    # Zero-pad single-character run ids so directories sort consistently.
    if (model_id == -1):
        model_id = opt['id'] if len(opt['id']) > 1 else '0' + opt['id']
    model_save_dir = opt['save_dir'] + '/' + model_id
    opt['model_save_dir'] = model_save_dir
    helper.ensure_dir(model_save_dir, verbose=True)

    # save config and a copy of the vocab next to the checkpoints
    helper.save_config(opt, model_save_dir + '/config.json', verbose=True)
    vocab.save(model_save_dir + '/vocab.pkl')
    file_logger = helper.FileLogger(
        model_save_dir + '/' + opt['log'],
        header="# epoch\ttrain_loss\tdev_loss\tdev_f1")

    # print model info
    helper.print_config(opt)

    # model
    model = RelationModel(opt, emb_matrix=emb_matrix)

    # Inverse label map: predicted ids back to label strings for scoring.
    id2label = dict([(v, k) for k, v in constant.LABEL_TO_ID.items()])
    dev_f1_history = []
    current_lr = opt['lr']

    global_step = 0
    global_start_time = time.time()
    format_str = '{}: step {}/{} (epoch {}/{}), loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}'
    max_steps = len(train_batch) * opt['num_epoch']

    # start training
    for epoch in range(1, opt['num_epoch'] + 1):
        train_loss = 0
        for i, batch in enumerate(train_batch):
            start_time = time.time()
            global_step += 1
            loss = model.update(batch)  # per-batch training loss
            train_loss += loss
            if global_step % opt['log_step'] == 0:
                duration = time.time() - start_time
                print(format_str.format(datetime.now(), global_step, max_steps, epoch,\
                        opt['num_epoch'], loss, duration, current_lr))

        # eval on dev
        print("Evaluating on dev set...")
        predictions = []
        dev_loss = 0
        for i, batch in enumerate(dev_batch):
            preds, _, loss = model.predict(batch)
            predictions += preds
            dev_loss += loss
        predictions = [id2label[p] for p in predictions]
        dev_p, dev_r, dev_f1 = scorer.score(dev_batch.gold(), predictions)

        # Losses were summed per batch; rescale to average loss per batch.
        train_loss = train_loss / train_batch.num_examples * opt[
            'batch_size']  # avg loss per batch
        dev_loss = dev_loss / dev_batch.num_examples * opt['batch_size']
        print("epoch {}: train_loss = {:.6f}, dev_loss = {:.6f}, dev_f1 = {:.4f}".format(epoch,\
                train_loss, dev_loss, dev_f1))
        file_logger.log("{}\t{:.6f}\t{:.6f}\t{:.4f}".format(
            epoch, train_loss, dev_loss, dev_f1))

        # save checkpoint; promote to best_model.pt on a new best dev F1
        model_file = model_save_dir + '/checkpoint_epoch_{}.pt'.format(epoch)
        model.save(model_file, epoch)
        if epoch == 1 or dev_f1 > max(dev_f1_history):
            copyfile(model_file, model_save_dir + '/best_model.pt')
            print("new best model saved.")
        # Keep disk usage bounded: delete non-milestone checkpoints.
        if epoch % opt['save_epoch'] != 0:
            os.remove(model_file)

        # lr schedule: after >10 epochs, decay whenever dev F1 stops
        # improving (only for optimizers without an adaptive schedule)
        if len(dev_f1_history) > 10 and dev_f1 <= dev_f1_history[-1] and \
                opt['optim'] in ['sgd', 'adagrad']:
            current_lr *= opt['lr_decay']
            model.update_lr(current_lr)

        dev_f1_history += [dev_f1]
        print("")

    print("Training ended with {} epochs.".format(epoch))
Exemple #10
0
def main():
    """Train an LSTM classifier and checkpoint the best model by dev ROC-AUC.

    Reads hyperparameters from get_parser(), loads data and vocab via
    load_data, builds (or resumes) an LSTMTrainer, runs opt['num_epoch']
    epochs, evaluates ROC-AUC on the dev set each epoch, and keeps the best
    checkpoint as <save_dir>/<id>/best_model.pt.  Also writes config.json
    and a tab-separated training log to the same directory.
    """
    args = get_parser()

    # Seed all RNGs for reproducibility; --cpu overrides --cuda.
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)
    if args.cpu:
        args.cuda = False
    elif args.cuda:
        torch.cuda.manual_seed(args.seed)

    # make opt: flatten the argparse namespace into a plain dict
    opt = vars(args)
    # NOTE(review): device is hard-coded to 'cuda:0' even when args.cpu is
    # set -- confirm load_data honors a CPU-only run.
    TEXT, train_batch, dev_batch = load_data(opt['batch_size'],
                                             device='cuda:0')

    vocab = TEXT.vocab
    opt['vocab_size'] = len(vocab.stoi)
    emb_matrix = vocab.vectors

    # Pretrained embeddings must line up with the vocabulary and config.
    assert emb_matrix.shape[0] == opt['vocab_size']
    assert emb_matrix.shape[1] == opt['emb_dim']

    # Zero-pad single-character run ids so directories sort consistently.
    model_id = opt['id'] if len(opt['id']) > 1 else '0' + opt['id']
    model_save_dir = opt['save_dir'] + '/' + str(model_id)
    opt['model_save_dir'] = model_save_dir
    helper.ensure_dir(model_save_dir, verbose=True)

    # save config
    path = os.path.join(model_save_dir, 'config.json')
    helper.save_config(opt, path, verbose=True)
    # vocab.save(os.path.join(model_save_dir, 'vocab.pkl'))
    file_logger = helper.FileLogger(
        os.path.join(model_save_dir, opt['log']),
        header="# epoch\ttrain_loss\tdev_loss\tdev_score\tbest_dev_score")

    # print model info
    helper.print_config(opt)

    # Build a fresh model, or resume from an existing checkpoint while
    # keeping the optimizer choice of the current run.
    if not opt['load']:
        trainer = LSTMTrainer(opt, emb_matrix)
    else:
        model_file = opt['model_file']
        print("Loading model from {}".format(model_file))
        model_opt = torch_utils.load_config(model_file)
        model_opt['optim'] = opt['optim']
        trainer = LSTMTrainer(model_opt)
        trainer.load(model_file)

    dev_score_history = []
    current_lr = opt['lr']

    global_step = 0
    format_str = '{}: step {}/{} (epoch {}/{}), loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}'
    max_steps = len(train_batch) * opt['num_epoch']

    # start training
    for epoch in range(1, opt['num_epoch'] + 1):
        train_loss = 0
        for i, batch in enumerate(train_batch):
            start_time = time.time()
            global_step += 1
            loss = trainer.update(batch)  # per-batch training loss
            train_loss += loss
            if global_step % opt['log_step'] == 0:
                duration = time.time() - start_time
                print(format_str.format(datetime.now(), global_step, max_steps, epoch, \
                    opt['num_epoch'], loss, duration, current_lr))

        # eval on dev
        print("Evaluating on dev set ...")
        predictions = []
        golds = []
        dev_loss = 0.0
        for i, batch in enumerate(dev_batch):
            preds, probs, labels, loss = trainer.predict(batch)
            predictions += preds
            golds += labels
            dev_loss += loss
        train_loss = train_loss / len(train_batch)
        dev_loss = dev_loss / len(dev_batch)
        print(accuracy_score(golds, predictions))
        # NOTE(review): roc_auc_score is fed hard predictions rather than
        # the probabilities (probs is discarded above); AUC over binary
        # predictions is degenerate -- confirm this is intended.
        dev_roc = roc_auc_score(golds, predictions)
        print(
            "epoch {}: train loss = {:.6f}, dev loss = {:.6f}, dev roc = {:.4f}"
            .format(epoch, train_loss, dev_loss, dev_roc))
        dev_score = dev_roc
        file_logger.log("{}\t{:.6f}\t{:.6f}\t{:.4f}\t{:.4f}".format(
            epoch, train_loss, dev_loss, dev_score,
            max([dev_score] + dev_score_history)))

        # save model
        # FIX: checkpoints were written with a ".py" extension; use ".pt"
        # to match best_model.pt and the sibling training scripts.
        model_file = os.path.join(model_save_dir,
                                  "checkpoint_epoch_{}.pt".format(epoch))
        trainer.save(model_file, epoch)
        if epoch == 1 or dev_score > max(dev_score_history):
            copyfile(model_file, model_save_dir + '/best_model.pt')
            print("new best model saved.")
            file_logger.log("new best model saved at epoch {}: {:.2f}"\
                .format(epoch, dev_score*100))
        # Keep disk usage bounded: delete non-milestone checkpoints.
        if epoch % opt['save_epoch'] != 0:
            os.remove(model_file)

        # Decay the LR for plateau-sensitive optimizers once dev score stalls.
        if len(dev_score_history) > opt['decay_epoch'] and dev_score <= dev_score_history[-1] and \
            opt['optim'] in ['sgd', 'adagrad', 'adadelta']:
            current_lr *= opt['lr_decay']
            trainer.update_lr(current_lr)

        dev_score_history += [dev_score]
        print("")

    print("Training ended with {} epochs.".format(epoch))
Exemple #11
0
def main():
    """Evaluate pre-trained PCNN_NMAR relation-extraction models (Python 2).

    Parses CLI options, loads pretrained word2vec embeddings and the
    relation2id mapping, builds the dataset via loader.DataLoader, then for
    each selected checkpoint optionally runs sentential and/or heldout
    evaluation.  With --tune, every "<data_name>*.tar" checkpoint under
    --model_dir is evaluated; otherwise only --model_dir/--model_name is.
    """

    parser = argparse.ArgumentParser()
    parser.add_argument('--model_dir',
                        type=str,
                        default='saved_models/',
                        help='Directory of the model.')
    parser.add_argument('--model_name',
                        type=str,
                        default='best_model.tar',
                        help='Name of the model file.')
    parser.add_argument('--data_dir', type=str, default='data/')
    parser.add_argument('--out',
                        type=str,
                        default='',
                        help="Save model predictions to this dir.")

    # Model hyperparameters (must match the checkpoint being loaded).
    parser.add_argument('--emb_dim',
                        type=int,
                        default=50,
                        help='Word embedding dimension.')
    parser.add_argument('--pos_dim',
                        type=int,
                        default=5,
                        help='Position embedding dimension.')
    parser.add_argument('--pos_limit',
                        type=int,
                        default=30,
                        help='Position embedding length limit.')
    parser.add_argument('--num_conv',
                        type=int,
                        default=230,
                        help='The number of convolutional filters.')
    parser.add_argument('--win_size',
                        type=int,
                        default=3,
                        help='Convolutional filter size.')
    parser.add_argument(
        '--dropout',
        type=float,
        default=0.5,
        help='The rate at which randomly set a parameter to 0.')
    parser.add_argument('--lr',
                        type=float,
                        default=0.001,
                        help='Applies to SGD.')
    parser.add_argument('--num_epoch', type=int, default=15)
    parser.add_argument('--seed', type=int, default=666)

    # NOTE(review): argparse's type=bool converts any non-empty string
    # (including "False") to True; action='store_true' would be safer for
    # these flags -- confirm how they are passed on the command line.
    parser.add_argument('--sentential_eval',
                        type=bool,
                        default=False,
                        help='Perform sentential evaluation.')
    parser.add_argument('--sen_file',
                        type=str,
                        default='',
                        help='Sentential eval dataset.')

    parser.add_argument('--heldout_eval',
                        type=bool,
                        default=False,
                        help='Perform heldout evaluation after each epoch.')
    parser.add_argument('--print_config',
                        type=bool,
                        default=False,
                        help='Print out the configuration of the model.')

    parser.add_argument(
        '--tune',
        type=bool,
        default=False,
        help=
        'Perform sentential evaluation for all models in the same directory.')

    parser.add_argument('--cuda', type=bool, default=torch.cuda.is_available())
    # parser.add_argument('--gpu_num', type=int, default=0)
    parser.add_argument('--cpu', action='store_true', help='Ignore CUDA.')
    args = parser.parse_args()

    # --cpu overrides CUDA availability.
    if args.cpu:
        args.cuda = False

    # make opt: flatten the argparse namespace into a plain dict
    opt = vars(args)

    # Derive dataset file paths and a dataset name from --data_dir
    # (handles both trailing-slash and no-trailing-slash forms).
    opt['train_file'] = opt['data_dir'] + '/' + 'train.txt'
    opt['test_file'] = opt['data_dir'] + '/' + 'test.txt'
    opt['vocab_file'] = opt['data_dir'] + '/' + 'vec.bin'
    opt['rel_file'] = opt['data_dir'] + '/' + 'relation2id.txt'
    if opt['data_dir'].split('/')[-1] != '':
        opt['data_name'] = opt['data_dir'].split('/')[-1]
    else:
        opt['data_name'] = opt['data_dir'].split('/')[-2]

    # Pretrained word embedding (binary word2vec format via gensim).
    print "\nPretrained word embedding loaded"
    w2v_model = gensim.models.KeyedVectors.load_word2vec_format(
        opt['vocab_file'], binary=True)
    # Prepend an UNK token at index 0 of the vocabulary.
    # NOTE(review): word_vec (syn0) has no row for UNK, so ids from word2id
    # are offset by one relative to word_vec rows -- confirm PCNN_NMAR
    # accounts for this when it consumes word_vec.
    word_list = [u'UNK'] + w2v_model.index2word
    word_vec = w2v_model.syn0

    # Map each word to its integer id (UNK -> 0).
    word2id = {}

    for id, word in enumerate(word_list):
        word2id[word] = id

    assert opt['emb_dim'] == w2v_model.syn0.shape[1]

    # Read from relation2id.txt to build a dictionary: rel2id
    rel2id = {}

    with open(opt['rel_file'], 'rb') as f:
        for item in f:
            [relation, id] = item.strip('\n').split(' ')
            rel2id[relation] = int(id)

    # Inverse mapping: relation id -> relation name.
    id2rel = [''] * len(rel2id)
    for relation, rel_id in rel2id.items():
        id2rel[rel_id] = relation

    opt['num_rel'] = len(rel2id)
    opt['vocab_size'] = len(word_list)

    # Load data; record position-embedding ranges and bag counts in opt.
    all_data = loader.DataLoader(opt, word2id, rel2id)
    opt['pos_e1_size'] = all_data.pos_max_e1 - all_data.pos_min_e1 + 1
    opt['pos_e2_size'] = all_data.pos_max_e2 - all_data.pos_min_e2 + 1
    opt['pos_min_e1'] = all_data.pos_min_e1
    opt['pos_min_e2'] = all_data.pos_min_e2
    opt['EP_num_train'] = len(all_data.bags_train)
    opt['EP_num_test'] = len(all_data.bags_test)

    assert opt['pos_e1_size'] == opt['pos_e2_size']

    # With --tune, sweep every matching checkpoint; otherwise just one.
    if opt['tune']:
        model_file_list = sorted(
            glob.glob(args.model_dir + opt['data_name'] + "*.tar"))
    else:
        model_file_list = [args.model_dir + '/' + args.model_name]

    for model_file in model_file_list:

        # Load input model weights (and its saved config) from checkpoint.
        print("Load model: {}".format(model_file.split('/')[-1]))
        PCNN_NMAR_model = PCNN_NMAR(word_vec, opt)
        checkpoint = torch.load(model_file)
        PCNN_NMAR_model.load_state_dict(checkpoint['state_dict'])
        model_config = torch.load(model_file)['config']

        if opt['print_config']:
            helper.print_config(model_config)

        if opt['cuda']:
            PCNN_NMAR_model.cuda()

        # Sentential evaluation: AUC of the P/R curve on --sen_file.
        if opt['sentential_eval']:

            print "Sentential evaluaiton starts."

            sen_file = opt['data_dir'] + '/' + opt['sen_file']
            sen_AUC = PCNN_NMAR_model.sentential_eval(sen_file, all_data,
                                                      rel2id, id2rel)
            print "The sentential AUC of P/R curve on {} is {:.3f}".format(
                opt['sen_file'], sen_AUC)

            print "Sentential evaluaiton ends.\n"

        # Heldout evaluation: AUC of the P/R curve over the test bags
        # (0 when the recall list comes back empty).
        if opt['heldout_eval']:

            print "Heldout evaluation starts."

            recall, precision = PCNN_NMAR_model.heldout_eval(all_data)
            heldout_AUC = metrics.auc(recall,
                                      precision) if len(recall) != 0 else 0
            print "The heldout AUC of P/R curve is {:.4f}".format(heldout_AUC)

            print "Heldout evaluaiton ends."