Esempio n. 1
0
def main():
    """Parse the command-line options and run model training.

    Registers the preprocess, model, train, predict and transformer option
    groups on a single parser, post-processes the parsed options with
    ``process_opt`` and forces both input-feeding flags off before the data
    loaders, the model and the optimizers are built.

    Any exception raised while loading data, building the model or training
    is written to the log together with its traceback and is not re-raised.
    """
    parser = argparse.ArgumentParser(
        description='train.py',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    config.preprocess_opts(parser)
    config.model_opts(parser)
    config.train_opts(parser)
    config.predict_opts(parser)
    config.transformer_opts(parser)

    opt = parser.parse_args()
    opt = process_opt(opt)
    # Both input-feeding flags are switched off regardless of the parsed values.
    opt.input_feeding = False
    opt.copy_input_feeding = False

    logger = config.init_logging(logger_name=None,
                                 log_file=opt.exp_path + '/output.log',
                                 stdout=True)
    try:
        # A hard-coded `opt.train_from = 'model/...'` checkpoint override used
        # to sit here, commented out; it has been dropped as dead code.
        train_data_loader, word2id, id2word, vocab, eval_dataloader = load_data_vocab(
            opt)
        model = init_model(opt)

        # The second optimizer is forwarded to train_model, so it is given a
        # real name instead of the throwaway `_`.
        optimizer_ml, optimizer_rl, criterion = init_optimizer_criterion(
            model, opt)
        train_model(model, optimizer_ml, optimizer_rl, criterion,
                    train_data_loader, opt, eval_dataloader)
    except Exception:
        # Top-level boundary: record the traceback in the log file.
        logger.exception("message")
def main():
    """Parse the command-line options, log them and run model training.

    Registers the preprocess, model, train and predict option groups,
    post-processes the parsed options with ``process_opt`` and forces both
    input-feeding flags off. Every option is then written to the log before
    the data loaders, model and optimizers are created and training starts.

    Any exception raised while loading data, building the model or training
    is written to the log together with its traceback and is not re-raised.
    """
    parser = argparse.ArgumentParser(
        description='train.py',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    config.preprocess_opts(parser)
    config.model_opts(parser)
    config.train_opts(parser)
    config.predict_opts(parser)
    opt = parser.parse_args()
    opt = process_opt(opt)
    # Both input-feeding flags are switched off regardless of the parsed values.
    opt.input_feeding = False
    opt.copy_input_feeding = False

    logger = config.init_logging(logger_name=None,
                                 log_file=opt.exp_path + '/output.log',
                                 stdout=True)

    logger.info('Parameters:')
    # A plain loop: the former list comprehension was built only for its
    # logging side effect and its result was thrown away.
    for k, v in opt.__dict__.items():
        logger.info('%s    :    %s' % (k, str(v)))

    try:
        (train_data_loader, valid_data_loader, test_data_loader,
         word2id, id2word, vocab) = load_data_vocab(opt)
        model = init_model(opt)
        optimizer_ml, optimizer_rl, criterion = init_optimizer_criterion(
            model, opt)
        train_model(model, optimizer_ml, optimizer_rl, criterion,
                    train_data_loader, valid_data_loader, test_data_loader,
                    opt)
    except Exception:
        # Top-level boundary: record the traceback in the log file.
        logger.exception("message")
Esempio n. 3
0
def main():
    """Parse the command-line options and run beam-search prediction.

    Steps performed:
      1. Register the preprocess, model, train, predict and transformer
         option groups and parse the command line.
      2. Seed torch when a positive seed is given and default ``opt.gpuid``
         to 0 when CUDA is available and no GPU id is set.
      3. Derive the experiment name (``predict.`` prefix plus ``.copy`` and
         direction suffixes) and fill it, with the time mark, into the
         experiment and prediction path templates.
      4. Create both output directories and initialise logging.
      5. Load the test data and the model, wrap the model in a
         ``SequenceGenerator`` and call ``evaluate_beam_search``.

    Any exception raised in step 5 is written to the log together with its
    traceback and is not re-raised.
    """
    parser = argparse.ArgumentParser(
        description='predict.py',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    config.preprocess_opts(parser)
    config.model_opts(parser)
    config.train_opts(parser)
    config.predict_opts(parser)
    config.transformer_opts(parser)
    opt = parser.parse_args()

    if opt.seed > 0:
        torch.manual_seed(opt.seed)

    if torch.cuda.is_available() and not opt.gpuid:
        opt.gpuid = 0

    opt.exp = 'predict.' + opt.exp
    if getattr(opt, 'copy_model', False):
        opt.exp += '.copy'

    # Every run gets exactly one direction suffix. The former nested
    # `if`/`else` attached the `else` to the `hasattr` check, so a model with
    # bidirectional=False received no suffix at all.
    if getattr(opt, 'bidirectional', False):
        opt.exp += '.bi-directional'
    else:
        opt.exp += '.uni-directional'

    # Fill experiment name and time mark into the path templates. `in` also
    # catches a template that starts with the placeholder, which the former
    # `find('%s') > 0` test skipped.
    if '%s' in opt.exp_path:
        opt.exp_path = opt.exp_path % (opt.exp, opt.timemark)
        opt.pred_path = opt.pred_path % (opt.exp, opt.timemark)

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(opt.exp_path, exist_ok=True)
    os.makedirs(opt.pred_path, exist_ok=True)

    logger = config.init_logging(logger_name=None,
                                 log_file=opt.exp_path + '/output.log',
                                 stdout=True)

    try:
        # NOTE(review): this hard-coded checkpoint path replaces whatever
        # value `opt.train_from` had after parsing -- confirm it is intended.
        opt.train_from = 'model/kp20k.ml.copy.bi-directional.20180908-054257/kp20k.ml.copy.bi-directional.epoch=9.batch=2932.model'
        test_data_loader, word2id, id2word, vocab = load_data_vocab(
            opt, load_train=False)
        model = init_model(opt)

        generator = SequenceGenerator(
            model, opt,
            eos_id=opt.word2id[pykp.io.EOS_WORD],
            beam_size=opt.beam_size,
            max_sequence_length=opt.max_sent_length,
        )

        evaluate_beam_search(
            generator, test_data_loader, opt,
            title='predict',
            save_path=opt.pred_path +
            '/[epoch=%d,batch=%d,total_batch=%d]test_result.csv' % (0, 0, 0))

    except Exception:
        # Top-level boundary: record the traceback in the log file.
        logger.exception("message")
Esempio n. 4
0
def main():
    """Parse the command-line options, prepare output paths and train.

    Steps performed:
      1. Register the preprocess, model, train and predict option groups and
         parse the command line.
      2. Seed torch when a positive seed is given and default ``opt.gpuid``
         to 0 when CUDA is available and no GPU id is set.
      3. Extend the experiment name with ``.copy`` and direction suffixes and
         fill it, with the time mark, into the experiment and save path
         templates.
      4. Create both output directories, initialise logging and log every
         option.
      5. Load the data, build the model, optimizer and criterion, and call
         ``train_model``.

    Any exception raised in step 5 is written to the log together with its
    traceback and is not re-raised.
    """
    parser = argparse.ArgumentParser(
        description='train.py',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    config.preprocess_opts(parser)
    config.model_opts(parser)
    config.train_opts(parser)
    config.predict_opts(parser)
    opt = parser.parse_args()

    if opt.seed > 0:
        torch.manual_seed(opt.seed)

    print(opt.gpuid)
    if torch.cuda.is_available() and not opt.gpuid:
        opt.gpuid = 0

    if getattr(opt, 'copy_model', False):
        opt.exp += '.copy'

    # Every run gets exactly one direction suffix. The former nested
    # `if`/`else` attached the `else` to the `hasattr` check, so a model with
    # bidirectional=False received no suffix at all.
    if getattr(opt, 'bidirectional', False):
        opt.exp += '.bi-directional'
    else:
        opt.exp += '.uni-directional'

    # Fill experiment name and time mark into the path templates. `in` also
    # catches a template that starts with the placeholder, which the former
    # `find('%s') > 0` test skipped.
    if '%s' in opt.exp_path:
        opt.exp_path = opt.exp_path % (opt.exp, opt.timemark)
        opt.save_path = opt.save_path % (opt.exp, opt.timemark)

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(opt.exp_path, exist_ok=True)
    os.makedirs(opt.save_path, exist_ok=True)

    config.init_logging(opt.exp_path + '/output.log')

    logging.info('Parameters:')
    # A plain loop: the former list comprehension was built only for its
    # logging side effect and its result was thrown away.
    for k, v in opt.__dict__.items():
        logging.info('%s    :    %s' % (k, str(v)))

    try:
        (train_data_loader, valid_data_loader, test_data_loader,
         word2id, id2word, vocab) = load_data_vocab(opt)
        model = init_model(opt)
        optimizer, criterion = init_optimizer_criterion(model, opt)
        train_model(model, optimizer, criterion, train_data_loader,
                    valid_data_loader, test_data_loader, opt)
    except Exception:
        # Top-level boundary: record the traceback in the log.
        logging.exception("message")
def main():
    """Parse the command-line options, prepare output paths and train.

    The function registers the preprocess, model, train and predict option
    groups, parses the command line, seeds torch for a positive seed and
    defaults ``opt.gpuid`` to 0 when CUDA is available and no GPU id is set.
    It then builds the experiment name, fills the experiment and save path
    templates, creates both directories, initialises logging, logs every
    option and finally loads the data, builds the model, optimizer and
    criterion, and calls ``train_model``.

    Any exception raised while loading data, building the model or training
    is written to the log together with its traceback and is not re-raised.
    """
    parser = argparse.ArgumentParser(
        description='train.py',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    config.preprocess_opts(parser)
    config.model_opts(parser)
    config.train_opts(parser)
    config.predict_opts(parser)
    opt = parser.parse_args()

    if opt.seed > 0:
        torch.manual_seed(opt.seed)

    print(opt.gpuid)
    if torch.cuda.is_available() and not opt.gpuid:
        opt.gpuid = 0

    if getattr(opt, 'copy_model', False):
        opt.exp += '.copy'

    # Append exactly one direction suffix. Previously the `else` belonged to
    # the `hasattr` test, so bidirectional=False produced no suffix.
    if getattr(opt, 'bidirectional', False):
        opt.exp += '.bi-directional'
    else:
        opt.exp += '.uni-directional'

    # Fill experiment name and time mark into the path templates; `in` also
    # matches a placeholder at position 0, unlike `find('%s') > 0`.
    if '%s' in opt.exp_path:
        opt.exp_path = opt.exp_path % (opt.exp, opt.timemark)
        opt.save_path = opt.save_path % (opt.exp, opt.timemark)

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(opt.exp_path, exist_ok=True)
    os.makedirs(opt.save_path, exist_ok=True)

    config.init_logging(opt.exp_path + '/output.log')

    logging.info('Parameters:')
    # Loop instead of a list comprehension used only for its side effect.
    for k, v in opt.__dict__.items():
        logging.info('%s    :    %s' % (k, str(v)))

    try:
        (train_data_loader, valid_data_loader, test_data_loader,
         word2id, id2word, vocab) = load_data_vocab(opt)
        model = init_model(opt)
        optimizer, criterion = init_optimizer_criterion(model, opt)
        train_model(model, optimizer, criterion, train_data_loader,
                    valid_data_loader, test_data_loader, opt)
    except Exception:
        # Top-level boundary: record the traceback in the log.
        logging.exception("message")
Esempio n. 6
0
def main():
    """Entry point: train a model whose embedding is loaded from disk.

    The parser receives the preprocess, model, train, predict and
    transformer option groups. After ``process_opt`` has post-processed the
    parsed options, both input-feeding flags are forced to ``False``. The
    embedding stored in ``embedding50004.pt`` is handed to
    ``model.init_embedding`` and the learning rate is overridden with 0.001
    before the optimizer is created and training starts.

    Exceptions raised during loading, model setup or training are logged
    with their traceback and are not re-raised.
    """
    arg_parser = argparse.ArgumentParser(
        description='train.py',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Register every option group on the shared parser, in the usual order.
    option_groups = (
        config.preprocess_opts,
        config.model_opts,
        config.train_opts,
        config.predict_opts,
        config.transformer_opts,
    )
    for add_options in option_groups:
        add_options(arg_parser)

    opt = process_opt(arg_parser.parse_args())
    opt.input_feeding = False
    opt.copy_input_feeding = False

    log_path = opt.exp_path + '/output.log'
    log = config.init_logging(logger_name=None, log_file=log_path, stdout=True)

    try:
        train_loader, _word2id, _id2word, _vocab, eval_loader = load_data_vocab(opt)
        net = init_model(opt)

        # Initialise the model embedding from the stored tensor file.
        net.init_embedding(torch.load('embedding50004.pt'))

        # The learning rate is fixed here, replacing the parsed value.
        opt.learning_rate = 0.001
        ml_optimizer, loss_fn = init_optimizer_criterion(net, opt)
        train_model(net, ml_optimizer, loss_fn, train_loader, opt, eval_loader)
    except Exception:
        log.exception("message")
    def wrapper(*args, **kwargs):
        beg_ts = time.time()
        retval = func(*args, **kwargs)
        end_ts = time.time()
        print(fname, "elapsed time: %f" % (end_ts - beg_ts))
        return retval

    return wrapper
# Module metadata.
__author__ = "Rui Meng"
__email__ = "*****@*****.**"

# Build the command-line parser for train.py and register the preprocess,
# model and train option groups on it. `parser` and `opt` are module-level
# names, so code further down the script can read them directly.
parser = argparse.ArgumentParser(
    description='train.py',
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
config.preprocess_opts(parser)
config.model_opts(parser)
config.train_opts(parser)
opt = parser.parse_args()

# Seed torch only when a positive seed was supplied.
if opt.seed > 0:
    torch.manual_seed(opt.seed)

# Echo the parsed GPU id, then default it to 0 when CUDA is available and
# the parsed value is falsy.
print(opt.gpuid)
if torch.cuda.is_available() and not opt.gpuid:
    opt.gpuid = 0

# Disabled: explicit device selection.
# if opt.gpuid:
#     cuda.set_device(0)

# fill time into the name

# fill time into the name
import pykp.io

# Build the command-line parser for preprocess.py. `parser` and `opt` are
# module-level names, so code further down the script can read them directly.
parser = argparse.ArgumentParser(
    description='preprocess.py',
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)

# **Preprocess Options**
# -dataset_name and -source_dataset_dir are mandatory; -output_path_prefix
# falls back to 'data'.
parser.add_argument('-dataset_name', required=True, help="Name of dataset")
parser.add_argument('-source_dataset_dir',
                    required=True,
                    help="The path to the source data (raw json).")
parser.add_argument('-output_path_prefix',
                    default='data',
                    help="Output file for the prepared data")

# Add the shared preprocess options defined in `config`, then parse.
config.preprocess_opts(parser)
opt = parser.parse_args()

# Input path of each json file:
# <source_dataset_dir>/<dataset_name>_{training,validation,testing}.json
opt.source_train_file = os.path.join(opt.source_dataset_dir,
                                     '%s_training.json' % (opt.dataset_name))
opt.source_valid_file = os.path.join(opt.source_dataset_dir,
                                     '%s_validation.json' % (opt.dataset_name))
opt.source_test_file = os.path.join(opt.source_dataset_dir,
                                    '%s_testing.json' % (opt.dataset_name))

# Output path for exporting the processed dataset:
# <output_path_prefix>/<dataset_name>
opt.output_path = os.path.join(opt.output_path_prefix, opt.dataset_name)
# Output path for the '_small' subset:
# <output_path_prefix>/<dataset_name>_small
opt.subset_output_path = os.path.join(opt.output_path_prefix,
                                      opt.dataset_name + '_small')
Esempio n. 9
0
def main():
    """Parse the command-line options and run beam-search prediction.

    Steps performed:
      1. Register the preprocess, model, train and predict option groups and
         parse the command line.
      2. Seed torch when a positive seed is given and default ``opt.gpuid``
         to 0 when CUDA is available and no GPU id is set.
      3. Derive the experiment name (``predict.`` prefix plus ``.copy`` and
         direction suffixes) and fill it, with the time mark, into the
         experiment and prediction path templates.
      4. Create both output directories, initialise logging and log every
         option.
      5. Load the data with ``load_train=False``, build the model, wrap it
         in a ``SequenceGenerator`` and call ``evaluate_beam_search`` on the
         test loader.

    Any exception raised in step 5 is written to the log together with its
    traceback and is not re-raised.
    """
    parser = argparse.ArgumentParser(
        description='predict.py',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    config.preprocess_opts(parser)
    config.model_opts(parser)
    config.train_opts(parser)
    config.predict_opts(parser)
    opt = parser.parse_args()

    if opt.seed > 0:
        torch.manual_seed(opt.seed)

    print(opt.gpuid)
    if torch.cuda.is_available() and not opt.gpuid:
        opt.gpuid = 0

    opt.exp = 'predict.' + opt.exp
    if getattr(opt, 'copy_model', False):
        opt.exp += '.copy'

    # Every run gets exactly one direction suffix. The former nested
    # `if`/`else` attached the `else` to the `hasattr` check, so a model with
    # bidirectional=False received no suffix at all.
    if getattr(opt, 'bidirectional', False):
        opt.exp += '.bi-directional'
    else:
        opt.exp += '.uni-directional'

    # Fill experiment name and time mark into the path templates. `in` also
    # catches a template that starts with the placeholder, which the former
    # `find('%s') > 0` test skipped.
    if '%s' in opt.exp_path:
        opt.exp_path = opt.exp_path % (opt.exp, opt.timemark)
        opt.pred_path = opt.pred_path % (opt.exp, opt.timemark)

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(opt.exp_path, exist_ok=True)
    os.makedirs(opt.pred_path, exist_ok=True)

    logger = config.init_logging('train', opt.exp_path + '/output.log')

    logger.info('Parameters:')
    # A plain loop: the former list comprehension was built only for its
    # logging side effect and its result was thrown away.
    for k, v in opt.__dict__.items():
        logger.info('%s    :    %s' % (k, str(v)))

    try:
        # Only the test loader is used below; the other five returned values
        # are discarded.
        _, _, test_data_loader, _, _, _ = load_data_vocab(
            opt, load_train=False)
        model = init_model(opt)

        generator = SequenceGenerator(
            model,
            eos_id=opt.word2id[pykp.io.EOS_WORD],
            beam_size=opt.beam_size,
            max_sequence_length=opt.max_sent_length)

        evaluate_beam_search(
            generator,
            test_data_loader,
            opt,
            title='predict',
            save_path=opt.pred_path +
            '/[epoch=%d,batch=%d,total_batch=%d]test_result.csv' % (0, 0, 0))

    except Exception:
        # Top-level boundary: record the traceback in the log file.
        logger.exception("message")
def generate_dataset(test_dataset_name='kp20k'):
    """Load a test set and process it with the stored kp20k vocabulary.

    The option parser is built locally and parsed with an empty argument
    list, so ``sys.argv`` is ignored and every option takes its default
    value. The vocabulary is read from
    ``<output_path_prefix>/kp20k/kp20k.vocab.pt`` and the source/target
    pairs from ``<source_dataset_root_dir>/<test_dataset_name>_testing.json``.
    The output directory ``<output_path_prefix>/<test_dataset_name>`` is
    created when missing.

    Args:
        test_dataset_name: Name of the dataset to process. Defaults to
            ``'kp20k'``, the value that used to be hard-coded. The
            vocabulary is always the kp20k one.

    Returns:
        The value returned by ``pykp.io.process_dataset`` for the test pairs.
    """
    src_fields = ['title', 'abstract']
    trg_fields = ['keyword']
    parser = argparse.ArgumentParser(
        description='preprocess_testset.py',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # **Preprocess Options**
    parser.add_argument('-source_dataset_root_dir',
                        default='test/',
                        help="The path to the source data (raw json).")

    parser.add_argument('-output_path_prefix',
                        default='data/',
                        help="Output file for the prepared data")

    config.preprocess_opts(parser)
    # An empty argument list is parsed: only the defaults are used.
    opt = parser.parse_args([])

    print("Loading Vocab...")
    opt.vocab_path = os.path.join(opt.output_path_prefix, 'kp20k',
                                  'kp20k.vocab.pt')
    print(os.path.abspath(opt.vocab_path))
    # The second positional parameter of torch.load is `map_location`, not a
    # file mode, so the former 'rb' argument has been removed.
    # NOTE: torch.load unpickles the file -- only load vocab files from a
    # trusted source.
    word2id, id2word, vocab = torch.load(opt.vocab_path)
    print('Vocab size = %d' % len(vocab))

    opt.source_test_file = os.path.join(
        opt.source_dataset_root_dir, '%s_testing.json' % (test_dataset_name))

    # Output path for exporting the processed dataset; exist_ok avoids the
    # check-then-create race of exists() + makedirs().
    opt.output_path = os.path.join(opt.output_path_prefix, test_dataset_name)
    os.makedirs(opt.output_path, exist_ok=True)

    print("Loading test data...")

    tokenized_test_pairs = pykp.io.load_src_trgs_pairs(
        source_json_path=opt.source_test_file,
        dataset_name=test_dataset_name,
        src_fields=src_fields,
        trg_fields=trg_fields,
        valid_check=True,
        opt=opt)

    print("Exporting complete dataset")

    return pykp.io.process_dataset(tokenized_test_pairs,
                                   word2id,
                                   id2word,
                                   opt,
                                   opt.output_path,
                                   dataset_name=test_dataset_name,
                                   data_type='test')