Ejemplo n.º 1
0
def main():
    """Train a keyphrase model: load options, set up logging, check devices, run training."""
    # load settings for training
    opt = init_opt(description='train.py')
    logging = init_logging(logger_name='train.py', log_file=opt.log_file, redirect_to_stdout=False)

    logging.info('EXP_PATH : ' + opt.exp_path)

    logging.info('Parameters:')
    # Plain loop instead of a side-effect-only list comprehension
    # (the comprehension built a throwaway list of Nones).
    for k, v in opt.__dict__.items():
        logging.info('%s    :    %s' % (k, str(v)))

    logging.info('======================  Checking GPU Availability  =========================')
    if torch.cuda.is_available():
        # Normalize a single device id into a list so len() below is valid.
        if isinstance(opt.device_ids, int):
            opt.device_ids = [opt.device_ids]
        logging.info('Running on %s! devices=%s' % ('MULTIPLE GPUs' if len(opt.device_ids) > 1 else '1 GPU', str(opt.device_ids)))
    else:
        logging.info('Running on CPU!')

    try:
        train_data_loader, valid_data_loader, test_data_loader, word2id, id2word, vocab = load_data_vocab(opt)
        model = init_model(opt)
        optimizer_ml, optimizer_rl, criterion = init_optimizer_criterion(model, opt)
        train_model(model, optimizer_ml, optimizer_rl, criterion, train_data_loader, valid_data_loader, test_data_loader, opt)
    except Exception as e:
        # Log the full traceback, then re-raise so the process exits non-zero.
        logging.error(e, exc_info=True)
        raise
 def __init__(self):
     """Initialize the predictor: load model options via config and call self.load().

     NOTE(review): the enclosing KeyphrasePredictor class (and its load()
     method) is defined outside this view — presumably load() restores the
     trained model; confirm against the class definition.
     """
     super(KeyphrasePredictor, self).__init__()
     # Parse/collect model options from the project config module.
     self.model_opts = config.init_opt(description='predictor')
     # Commented-out alternative option-loading paths kept from the original:
     # self.vocab_path = self.model_opts.vocab#os.path.join(self.model_opts.data, 'kp20k', 'kp20k.vocab.pt')
     # parser = argparse.ArgumentParser(description='predictor',formatter_class=argparse.ArgumentDefaultsHelpFormatter)
     # config.preprocess_opts(parser)
     # self.opt = parser.parse_args([])
     self.load()
def main():
    """Predict keyphrases with beam search on the kp20k test set and print the output."""
    opt = config.init_opt(description='predict_keyphrase.py')
    logger = config.init_logging('predict_keyphrase',
                                 opt.exp_path + '/output.log',
                                 redirect_to_stdout=False)

    logger.info('EXP_PATH : ' + opt.exp_path)

    logger.info('Parameters:')
    # Plain loop instead of a side-effect-only list comprehension.
    for k, v in opt.__dict__.items():
        logger.info('%s    :    %s' % (k, str(v)))

    logger.info(
        '======================  Checking GPU Availability  ========================='
    )
    if torch.cuda.is_available():
        # Normalize a single device id into a list so len() below is valid.
        if isinstance(opt.device_ids, int):
            opt.device_ids = [opt.device_ids]
        logger.info('Running on %s! devices=%s' %
                    ('MULTIPLE GPUs' if len(opt.device_ids) > 1 else '1 GPU',
                     str(opt.device_ids)))
    else:
        logger.info('Running on CPU!')

    try:
        one2one, one2many = generate_dataset()
        test_data_loaders, word2id, id2word, vocab = load_vocab_and_testsets(
            opt, one2one, one2many)
        model = init_model(opt)
        generator = SequenceGenerator(model,
                                      eos_id=opt.word2id[pykp.io.EOS_WORD],
                                      beam_size=opt.beam_size,
                                      max_sequence_length=opt.max_sent_length)

        # Only the kp20k test set is evaluated here; zip truncates to one pair.
        for testset_name, test_data_loader in zip(['kp20k'],
                                                  test_data_loaders):
            logger.info('Evaluating %s' % testset_name)
            output = predict_beam_search(
                generator,
                test_data_loader,
                opt,
                title='test_%s' % testset_name,
                predict_save_path=None
            )  #opt.pred_path + '/%s_test_result/' % (testset_name))
            print(output)
    except Exception as e:
        # NOTE(review): unlike the training entry points, the exception is
        # logged but NOT re-raised — confirm the silent exit is intentional.
        logger.error(e, exc_info=True)
Ejemplo n.º 4
0
def main():
    """Train a keyphrase model, evaluating on multiple named valid/test datasets."""
    # load settings for training
    opt = init_opt(description='train.py')
    logging = init_logging(logger_name='train.py',
                           log_file=opt.log_file,
                           redirect_to_stdout=False)

    logging.info('EXP_PATH : ' + opt.exp_path)

    logging.info('Parameters:')
    # Plain loop instead of a side-effect-only list comprehension.
    for k, v in opt.__dict__.items():
        logging.info('%s    :    %s' % (k, str(v)))

    logging.info(
        '======================  Checking GPU Availability  ========================='
    )
    # BUG FIX: this line previously called `logger.info(...)`, but only
    # `logging` is defined in this function, so it raised NameError at runtime.
    logging.info("torch.cuda.is_available()={}".format(
        torch.cuda.is_available()))

    if torch.cuda.is_available():
        # Normalize a single gpu id into a list so len() below is valid.
        if isinstance(opt.gpuid, int):
            opt.gpuid = [opt.gpuid]
        logging.info('Running on %s! devices=%s' %
                     ('MULTIPLE GPUs' if len(opt.gpuid) > 1 else '1 GPU',
                      str(opt.gpuid)))
    else:
        logging.info('Running on CPU!')

    try:
        train_data_loader, valid_data_loader, _, _, _, _ = load_data_vocab_for_training(
            opt)
        # ignore the previous test_data_loader
        valid_data_loaders, _, _, _ = load_vocab_and_datasets_for_testing(
            dataset_names=opt.test_dataset_names, type='valid', opt=opt)
        test_data_loaders, _, _, _ = load_vocab_and_datasets_for_testing(
            dataset_names=opt.test_dataset_names, type='test', opt=opt)
        model = init_model(opt)
        optimizer_ml, optimizer_rl, criterion = init_optimizer_criterion(
            model, opt)
        train_model(model, optimizer_ml, optimizer_rl, criterion,
                    train_data_loader, valid_data_loaders, test_data_loaders,
                    opt)
    except Exception as e:
        # Log the full traceback, then re-raise so the process exits non-zero.
        logging.error(e, exc_info=True)
        raise
Ejemplo n.º 5
0
def main():
    """Train (or test-only run) a keyphrase model with encoder-specific options.

    Options are parsed by init_opt and then overridden in-code for one of three
    encoder types: 'transformer', 'bert', or the RNN fallback.
    """
    # load settings for training
    opt = init_opt(description='train.py')

    # In-code overrides of the parsed options.
    opt.useGpu = 1
    opt.encoder_type = 'bert'
    opt.useCLF = True
    opt.data = 'data/kp20k/kp20k'
    opt.vocab = 'data/kp20k/kp20k.vocab.pt'

    if opt.encoder_type == 'transformer':
        opt.batch_size = 32
        opt.d_inner = 2048
        opt.enc_n_layers = 4
        opt.dec_n_layers = 2
        opt.n_head = 8
        opt.d_k = 64
        opt.d_v = 64
        opt.d_model = 512
        opt.word_vec_size = 512
        opt.run_valid_every = 5000000
        opt.save_model_every = 20000
        opt.decode_old = True
        #opt.copy_attention = False
    elif opt.encoder_type == 'bert':
        opt.useOnlyTwo = False
        opt.avgHidden = False
        opt.useZeroDecodeHidden = True
        opt.useSameEmbeding = False
        opt.batch_size = 10
        opt.max_sent_length = 10
        opt.run_valid_every = 40000
        opt.decode_old = False
        opt.beam_search_batch_size = 10
        opt.bert_model = 'bert-base-uncased'
        opt.tokenizer = BertTokenizer.from_pretrained(opt.bert_model)
    else:
        # RNN fallback encoder.
        opt.enc_layers = 2
        opt.bidirectional = True
        opt.decode_old = True
        opt.run_valid_every = 2

    # When onlyTest is flipped on, ML training is disabled and a saved
    # checkpoint is loaded for evaluation-only runs.
    opt.onlyTest = False
    if opt.onlyTest:
        opt.train_ml = False
        opt.run_valid_every = 5
        opt.beam_size = 64
        opt.beam_search_batch_size = 128
        #opt.train_from = 'exp/kp20k.ml.copy.20181129-193506/model/kp20k.ml.copy.epoch=1.batch=20000.total_batch=20000.model'
        opt.train_from = 'exp/kp20k.ml.copy.20181128-153121/model/kp20k.ml.copy.epoch=2.batch=15495.total_batch=38000.model'
        #opt.train_from = 'exp/kp20k.ml.copy.20181117-190121/model/kp20k.ml.copy.epoch=3.batch=14172.total_batch=56000.model'

    logging = init_logging(logger_name='train.py',
                           log_file=opt.log_file,
                           redirect_to_stdout=False)

    logging.info('EXP_PATH : ' + opt.exp_path)

    logging.info('Parameters:')
    # Plain loop instead of a side-effect-only list comprehension.
    for k, v in opt.__dict__.items():
        logging.info('%s    :    %s' % (k, str(v)))

    logging.info(
        '======================  Checking GPU Availability  ========================='
    )
    if torch.cuda.is_available() and opt.useGpu:
        # Normalize a single gpu id into a list so len() below is valid.
        if isinstance(opt.gpuid, int):
            opt.gpuid = [opt.gpuid]
        logging.info('Running on %s! devices=%s' %
                     ('MULTIPLE GPUs' if len(opt.gpuid) > 1 else '1 GPU',
                      str(opt.gpuid)))
    else:
        logging.info('Running on CPU!')

    try:
        train_data_loader, valid_data_loader, test_data_loader, word2id, id2word, vocab = load_data_vocab(
            opt)
        model = init_model(opt)
        if torch.cuda.is_available() and opt.useGpu:
            model.cuda()
        print("model:")
        print(model)
        print()
        optimizer_ml, optimizer_rl, criterion = init_optimizer_criterion(
            model, opt)
        if torch.cuda.is_available() and opt.useGpu:
            criterion.cuda()

        train_model(model, optimizer_ml, optimizer_rl, criterion,
                    train_data_loader, valid_data_loader, test_data_loader,
                    opt)
    except Exception as e:
        # Log the full traceback, then re-raise so the process exits non-zero.
        logging.error(e, exc_info=True)
        raise
Ejemplo n.º 6
0
def main():
    """Evaluate a trained model with beam search on all configured valid/test datasets."""
    opt = config.init_opt(description='predict.py')
    logger = config.init_logging('predict',
                                 opt.exp_path + '/output.log',
                                 redirect_to_stdout=False)

    logger.info('EXP_PATH : ' + opt.exp_path)

    logger.info('Parameters:')
    # Plain loop instead of a side-effect-only list comprehension.
    for k, v in opt.__dict__.items():
        logger.info('%s    :    %s' % (k, str(v)))

    logger.info(
        '======================  Checking GPU Availability  ========================='
    )
    if torch.cuda.is_available():
        # Normalize a single device id into a list so len() below is valid.
        if isinstance(opt.device_ids, int):
            opt.device_ids = [opt.device_ids]
        logger.info('Running on %s! devices=%s' %
                    ('MULTIPLE GPUs' if len(opt.device_ids) > 1 else '1 GPU',
                     str(opt.device_ids)))
    else:
        logger.info('Running on CPU!')

    try:
        valid_data_loaders, word2id, id2word, vocab = load_vocab_and_datasets_for_testing(
            dataset_names=opt.test_dataset_names, type='valid', opt=opt)
        test_data_loaders, _, _, _ = load_vocab_and_datasets_for_testing(
            dataset_names=opt.test_dataset_names, type='test', opt=opt)

        # Attach the vocabulary to opt so downstream components can reach it.
        opt.word2id = word2id
        opt.id2word = id2word
        opt.vocab = vocab

        model = init_model(opt)
        generator = SequenceGenerator(model,
                                      eos_id=opt.word2id[pykp.io.EOS_WORD],
                                      beam_size=opt.beam_size,
                                      max_sequence_length=opt.max_sent_length)

        valid_score_dict = evaluate_multiple_datasets(
            generator,
            valid_data_loaders,
            opt,
            title='valid',
            predict_save_path=opt.pred_path)
        test_score_dict = evaluate_multiple_datasets(
            generator,
            test_data_loaders,
            opt,
            title='test',
            predict_save_path=opt.pred_path)

        # test_data_loaders, word2id, id2word, vocab = load_vocab_and_datasets(opt)
        # for testset_name, test_data_loader in zip(opt.test_dataset_names, test_data_loaders):
        #     logger.info('Evaluating %s' % testset_name)
        #     evaluate_beam_search(generator, test_data_loader, opt,
        #                          title='test_%s' % testset_name,
        #                          predict_save_path=opt.pred_path + '/%s_test_result/' % (testset_name))

    except Exception as e:
        # NOTE(review): the exception is logged but not re-raised — confirm
        # the silent exit is intentional for prediction runs.
        logger.error(e, exc_info=True)
Ejemplo n.º 7
0
def main():
    """Evaluate a saved checkpoint with beam search, with in-code option overrides.

    Overrides the parsed options for one of three encoder families
    ('transformer*', 'bert*', or the RNN fallback) before loading the model.
    """
    opt = config.init_opt(description='predict.py')

    # In-code overrides of the parsed options.
    opt.data = 'data3/kp20k/kp20k'
    opt.vocab = 'data3/kp20k/kp20k.vocab.pt'
    #opt.train_from = 'exp/kp20k.ml.copy.20181129-193506/model/kp20k.ml.copy.epoch=1.batch=20000.total_batch=20000.model'
    opt.train_from = 'exp/kp20k.ml.copy.20181128-153121/model/kp20k.ml.copy.epoch=2.batch=15495.total_batch=38000.model'

    opt.useGpu = 0
    opt.encoder_type = 'rnn'
    opt.useCLF = False

    if opt.encoder_type.startswith('transformer'):
        opt.batch_size = 32
        opt.d_inner = 2048
        opt.enc_n_layers = 4
        opt.dec_n_layers = 2
        opt.n_head = 8
        opt.d_k = 64
        opt.d_v = 64
        opt.d_model = 512
        opt.word_vec_size = 512
        opt.run_valid_every = 5000000
        opt.save_model_every = 20000
        opt.decode_old = True
        # opt.copy_attention = False
    elif opt.encoder_type.startswith('bert'):
        opt.useOnlyTwo = False
        opt.avgHidden = True
        opt.useZeroDecodeHidden = False
        opt.useSameEmbeding = False
        opt.batch_size = 10
        opt.max_sent_length = 10
        opt.run_valid_every = 20000
        opt.decode_old = False
        opt.beam_search_batch_size = 10
        opt.bert_model = 'bert-base-uncased'
        opt.tokenizer = BertTokenizer.from_pretrained(opt.bert_model)
        if opt.encoder_type == 'bert_low':
            opt.copy_attention = False
    else:
        # RNN fallback encoder.
        opt.enc_layers = 2
        opt.bidirectional = True
        opt.decode_old = True

    logger = config.init_logging('predict',
                                 opt.exp_path + '/output.log',
                                 redirect_to_stdout=False)

    logger.info('EXP_PATH : ' + opt.exp_path)

    logger.info('Parameters:')
    # Plain loop instead of a side-effect-only list comprehension.
    for k, v in opt.__dict__.items():
        logger.info('%s    :    %s' % (k, str(v)))

    logger.info(
        '======================  Checking GPU Availability  ========================='
    )
    if torch.cuda.is_available() and opt.useGpu:
        # Normalize a single gpu id into a list so len() below is valid.
        if isinstance(opt.gpuid, int):
            opt.gpuid = [opt.gpuid]
        logger.info('Running on %s! devices=%s' %
                    ('MULTIPLE GPUs' if len(opt.gpuid) > 1 else '1 GPU',
                     str(opt.gpuid)))
    else:
        logger.info('Running on CPU!')

    try:
        test_data_loaders, word2id, id2word, vocab = load_vocab_and_testsets(
            opt)
        model = init_model(opt)
        if torch.cuda.is_available() and opt.useGpu:
            model.cuda()

        generator = SequenceGenerator(model,
                                      opt.word_vec_size if opt.encoder_type
                                      == 'transformer' else opt.vocab_size,
                                      eos_id=opt.word2id[pykp.io.EOS_WORD],
                                      beam_size=opt.beam_size,
                                      max_sequence_length=opt.max_sent_length,
                                      useGpu=opt.useGpu)

        for testset_name, test_data_loader in zip(opt.test_dataset_names,
                                                  test_data_loaders):
            logger.info('Evaluating %s' % testset_name)
            evaluate_beam_search(generator,
                                 test_data_loader,
                                 opt,
                                 title='test_%s' % testset_name,
                                 predict_save_path=opt.pred_path +
                                 '/%s_test_result/' % (testset_name))

    except Exception as e:
        # NOTE(review): the exception is logged but not re-raised — confirm
        # the silent exit is intentional for prediction runs.
        logger.error(e, exc_info=True)