Example no. 1
def train_transfer(cfg_file: str) -> None:
    """
    Main training function. After training, also test on test data if given.

    :param cfg_file: path to configuration yaml file
    """
    cfg = load_config(cfg_file)

    # set the random seed
    set_seed(seed=cfg["pretraining"].get("random_seed", 42))

    # load the pretraining data
    pre_train_data, pre_dev_data, pre_test_data, pre_src_vocab, pre_trg_vocab = load_data(
        data_cfg=cfg["pretrained_data"])

    # build an encoder-decoder model for pretraining
    pretrained_model = build_model(cfg["model"],
                                   src_vocab=pre_src_vocab,
                                   trg_vocab=pre_trg_vocab)

    # for training management, e.g. early stopping and model selection
    trainer = TrainManager(model=pretrained_model,
                           config=cfg,
                           training_key="pretraining",
                           name_log="pre_train")

    # store copy of original training config in model dir
    shutil.copy2(cfg_file, trainer.model_dir + "/config.yaml")

    # log all entries of config
    log_cfg(cfg, trainer.logger)

    log_data_info(train_data=pre_train_data,
                  valid_data=pre_dev_data,
                  test_data=pre_test_data,
                  src_vocab=pre_src_vocab,
                  trg_vocab=pre_trg_vocab,
                  logging_function=trainer.logger.info)

    trainer.logger.info(str(pretrained_model))

    # store the vocabs
    src_vocab_file = "{}/src_vocab.txt".format(cfg["pretraining"]["model_dir"])
    pre_src_vocab.to_file(src_vocab_file)
    trg_vocab_file = "{}/trg_vocab.txt".format(cfg["pretraining"]["model_dir"])
    pre_trg_vocab.to_file(trg_vocab_file)

    # train the model
    trainer.train_and_validate(train_data=pre_train_data,
                               valid_data=pre_dev_data)

    # predict with the best model on validation and test
    # (if test data is available)
    ckpt = "{}/{}.ckpt".format(trainer.model_dir, trainer.best_ckpt_iteration)
    output_name = "{:08d}.hyps".format(trainer.best_ckpt_iteration)
    output_path = os.path.join(trainer.model_dir, output_name)
    test(cfg_file,
         ckpt=ckpt,
         output_path=output_path,
         logger=trainer.logger,
         key_training="pretraining",
         key_data="pretrained_data")

    # set the random seed for the fine-tuning stage
    set_seed(seed=cfg["training"].get("random_seed", 42))

    # load the target-task data
    train_data, dev_data, test_data, src_vocab, trg_vocab = load_data(
        data_cfg=cfg["data"])

    # build the target-task model, initialized from the pretrained one
    model = build_pretrained_model(cfg["model"],
                                   pretrained_model=pretrained_model,
                                   pretrained_src_vocab=pre_src_vocab,
                                   src_vocab=src_vocab,
                                   trg_vocab=trg_vocab)

    # for training management, e.g. early stopping and model selection
    trainer = TrainManager(model=model, config=cfg, training_key="training")

    # store copy of original training config in model dir
    shutil.copy2(cfg_file, trainer.model_dir + "/config.yaml")

    # log all entries of config
    log_cfg(cfg, trainer.logger)

    log_data_info(train_data=train_data,
                  valid_data=dev_data,
                  test_data=test_data,
                  src_vocab=src_vocab,
                  trg_vocab=trg_vocab,
                  logging_function=trainer.logger.info)

    trainer.logger.info(str(model))

    # store the vocabs
    src_vocab_file = "{}/src_vocab.txt".format(cfg["training"]["model_dir"])
    src_vocab.to_file(src_vocab_file)
    trg_vocab_file = "{}/trg_vocab.txt".format(cfg["training"]["model_dir"])
    trg_vocab.to_file(trg_vocab_file)

    # train the model
    trainer.train_and_validate(train_data=train_data, valid_data=dev_data)

    # predict with the best model on validation and test
    # (if test data is available)
    ckpt = "{}/{}.ckpt".format(trainer.model_dir, trainer.best_ckpt_iteration)
    output_name = "{:08d}.hyps".format(trainer.best_ckpt_iteration)
    output_path = os.path.join(trainer.model_dir, output_name)
    test(cfg_file,
         ckpt=ckpt,
         output_path=output_path,
         logger=trainer.logger,
         key_training="training",
         key_data="data")
Example no. 2
def train(cfg_file: str, skip_test: bool = False) -> None:
    """
    Main training function. After training, also test on test data if given.

    :param cfg_file: path to configuration yaml file
    :param skip_test: if True, skip the test run after training
    """
    cfg = load_config(cfg_file)

    # make logger
    model_dir = make_model_dir(cfg["training"]["model_dir"],
                               overwrite=cfg["training"].get(
                                   "overwrite", False))
    _ = make_logger(model_dir, mode="train")  # version string returned
    # TODO: save version number in model checkpoints

    # set the random seed
    set_seed(seed=cfg["training"].get("random_seed", 42))

    # load the data
    train_data, dev_data, test_data, src_vocab, trg_vocab = load_data(
        data_cfg=cfg["data"])

    # build an encoder-decoder model
    model = build_model(cfg["model"], src_vocab=src_vocab, trg_vocab=trg_vocab)

    # for training management, e.g. early stopping and model selection
    trainer = TrainManager(model=model, config=cfg)

    # store copy of original training config in model dir
    shutil.copy2(cfg_file, model_dir + "/config.yaml")

    # log all entries of config
    log_cfg(cfg)

    log_data_info(train_data=train_data,
                  valid_data=dev_data,
                  test_data=test_data,
                  src_vocab=src_vocab,
                  trg_vocab=trg_vocab)

    logger.info(str(model))

    # store the vocabs
    src_vocab_file = "{}/src_vocab.txt".format(cfg["training"]["model_dir"])
    src_vocab.to_file(src_vocab_file)
    trg_vocab_file = "{}/trg_vocab.txt".format(cfg["training"]["model_dir"])
    trg_vocab.to_file(trg_vocab_file)

    # train the model
    trainer.train_and_validate(train_data=train_data, valid_data=dev_data)

    if not skip_test:
        # predict with the best model on validation and test
        # (if test data is available)
        ckpt = "{}/{}.ckpt".format(model_dir, trainer.stats.best_ckpt_iter)
        output_name = "{:08d}.hyps".format(trainer.stats.best_ckpt_iter)
        output_path = os.path.join(model_dir, output_name)
        datasets_to_test = {
            "dev": dev_data,
            "test": test_data,
            "src_vocab": src_vocab,
            "trg_vocab": trg_vocab
        }
        test(cfg_file,
             ckpt=ckpt,
             output_path=output_path,
             datasets=datasets_to_test)
    else:
        logger.info("Skipping test after training")
Example no. 3
def train(cfg_file: str) -> None:
    """
    Main training function. After training, also test on test data if given.

    :param cfg_file: path to configuration yaml file
    """
    cfg = load_config(cfg_file)

    # make logger
    model_dir = make_model_dir(cfg["training"]["model_dir"],
                               overwrite=cfg["training"].get(
                                   "overwrite", False))
    _ = make_logger(model_dir, mode="train")  # version string returned
    # TODO: save version number in model checkpoints

    # set the random seed
    set_seed(seed=cfg["training"].get("random_seed", 42))

    # load the data
    train_tasks_list = []
    valid_tasks_list = []
    src_tasks = cfg["data"].get("src")
    trg_tasks = cfg["data"].get("trg")

    for src_lang, trg_lang in zip(src_tasks, trg_tasks):
        train_data, dev_data, _, _, _ = load_data(data_cfg=cfg["data"],
                                                  src_lang=src_lang,
                                                  trg_lang=trg_lang)
        train_tasks_list.append(train_data)
        valid_tasks_list.append(dev_data)

    # build the vocabularies
    logger.info("Building vocabulary...")

    src_max_size = cfg["data"].get("src_voc_limit", sys.maxsize)
    src_min_freq = cfg["data"].get("src_voc_min_freq", 1)
    trg_max_size = cfg["data"].get("trg_voc_limit", sys.maxsize)
    trg_min_freq = cfg["data"].get("trg_voc_min_freq", 1)

    src_vocab_file = cfg["data"].get("src_vocab", None)
    trg_vocab_file = cfg["data"].get("trg_vocab", None)

    # NOTE: both vocabularies are built from the first task's training data
    # only; presumably all tasks are assumed to share one vocabulary.
    src_vocab = build_vocab(field="src",
                            min_freq=src_min_freq,
                            max_size=src_max_size,
                            dataset=train_tasks_list[0],
                            vocab_file=src_vocab_file)
    trg_vocab = build_vocab(field="trg",
                            min_freq=trg_min_freq,
                            max_size=trg_max_size,
                            dataset=train_tasks_list[0],
                            vocab_file=trg_vocab_file)

    # build an encoder-decoder model
    model = build_model(cfg["model"], src_vocab=src_vocab, trg_vocab=trg_vocab)

    # for training management, e.g. early stopping and model selection
    trainer = TrainManager(model=model, config=cfg)

    # store copy of original training config in model dir
    shutil.copy2(cfg_file, model_dir + "/config.yaml")

    # log all entries of config
    log_cfg(cfg)

    # log_data_info is disabled in this variant: the loading loop above
    # discards the per-task test data, so there is no test_data to log.
    # log_data_info(train_data=train_data,
    #               valid_data=dev_data,
    #               test_data=test_data,
    #               src_vocab=src_vocab,
    #               trg_vocab=trg_vocab)

    logger.info(str(model))

    # store the vocabs
    src_vocab_file = "{}/src_vocab.txt".format(cfg["training"]["model_dir"])
    src_vocab.to_file(src_vocab_file)
    trg_vocab_file = "{}/trg_vocab.txt".format(cfg["training"]["model_dir"])
    trg_vocab.to_file(trg_vocab_file)

    # train the model
    trainer.maml_train_and_validate(train_tasks=train_tasks_list,
                                    valid_tasks=valid_tasks_list)

    # predict with the best model on validation and test
    # (if test data is available)
    ckpt = "{}/{}.ckpt".format(model_dir, trainer.stats.best_ckpt_iter)
    output_name = "{:08d}.hyps".format(trainer.stats.best_ckpt_iter)
    output_path = os.path.join(model_dir, output_name)
    test(cfg_file, ckpt=ckpt, output_path=output_path)
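The loading loop in this variant walks cfg["data"]["src"] and cfg["data"]["trg"] in parallel, one (src_lang, trg_lang) pair per task. Here is a minimal sketch of the expected data section, written as the Python dict that load_config would produce; the key names and defaults come from the code above, while the language codes are illustrative.

# Sketch of the multi-task data section (values illustrative):
data_cfg_sketch = {
    "src": ["de", "fr", "es"],    # one source language per task
    "trg": ["en", "en", "en"],    # parallel list of target languages
    "src_voc_limit": 32000,       # optional; defaults to sys.maxsize
    "src_voc_min_freq": 1,        # optional; defaults to 1
    "src_vocab": None,            # optional path to a fixed vocab file
}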
Example no. 4
def train(cfg_file: str) -> None:
    """
    Main training function. After training, also test on test data if given.

    :param cfg_file: path to configuration yaml file
    """
    cfg = load_config(cfg_file)

    # set the random seed
    set_seed(seed=cfg["training"].get("random_seed", 42))

    kb_task = bool(cfg["data"].get("kb_task", False))
    # load the data (incl. knowledge-base artifacts used when kb_task is set)
    (train_data, dev_data, test_data,
     src_vocab, trg_vocab,
     train_kb, dev_kb, test_kb,
     train_kb_lookup, dev_kb_lookup, test_kb_lookup,
     train_kb_lengths, dev_kb_lengths, test_kb_lengths,
     train_kb_truvals, dev_kb_truvals, test_kb_truvals,
     trv_vocab, canonizer,
     dev_data_canon, test_data_canon) = load_data(data_cfg=cfg["data"])

    # build an encoder-decoder model
    model = build_model(cfg["model"],
                        src_vocab=src_vocab,
                        trg_vocab=trg_vocab,
                        trv_vocab=trv_vocab,
                        canonizer=canonizer)

    # for training management, e.g. early stopping and model selection
    trainer = TrainManager(model=model, config=cfg)

    # store copy of original training config in model dir
    shutil.copy2(cfg_file, trainer.model_dir + "/config.yaml")

    # log all entries of config
    log_cfg(cfg, trainer.logger)

    log_data_info(train_data=train_data,
                  valid_data=dev_data,
                  test_data=test_data,
                  src_vocab=src_vocab,
                  trg_vocab=trg_vocab,
                  logging_function=trainer.logger.info)

    trainer.logger.info(str(model))

    # store the vocabs
    src_vocab_file = "{}/src_vocab.txt".format(cfg["training"]["model_dir"])
    src_vocab.to_file(src_vocab_file)
    trg_vocab_file = "{}/trg_vocab.txt".format(cfg["training"]["model_dir"])
    trg_vocab.to_file(trg_vocab_file)

    if kb_task:
        trv_vocab_file = "{}/trv_vocab.txt".format(
            cfg["training"]["model_dir"])
        trv_vocab.to_file(trv_vocab_file)

    # train the model
    trainer.train_and_validate(
        train_data=train_data, valid_data=dev_data, kb_task=kb_task,
        train_kb=train_kb, train_kb_lkp=train_kb_lookup,
        train_kb_lens=train_kb_lengths, train_kb_truvals=train_kb_truvals,
        valid_kb=dev_kb, valid_kb_lkp=dev_kb_lookup,
        valid_kb_lens=dev_kb_lengths, valid_kb_truvals=dev_kb_truvals,
        valid_data_canon=dev_data_canon)

    # predict with the best model on validation and test
    # (if test data is available)
    ckpt = "{}/{}.ckpt".format(trainer.model_dir, trainer.best_ckpt_iteration)
    output_name = "{:08d}.hyps".format(trainer.best_ckpt_iteration)
    output_path = os.path.join(trainer.model_dir, output_name)
    test(cfg_file, ckpt=ckpt, output_path=output_path, logger=trainer.logger)
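Everything knowledge-base-specific in this variant hangs off one flag: kb_task is read from the data section and coerced to bool, so any truthy YAML value enables the extra vocab dump and the KB arguments to train_and_validate. A minimal illustration of the lookup:

# Any truthy value under data.kb_task enables the KB branches above:
cfg = {"data": {"kb_task": True}}
assert bool(cfg["data"].get("kb_task", False)) is True

# With the key absent, the plain pipeline runs:
assert bool({}.get("kb_task", False)) is False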
Example no. 5
def train(cfg_file: str) -> None:
    """
    Main training function. After training, also test on test data if given.

    :param cfg_file: path to configuration yaml file
    """
    cfg = load_config(cfg_file)
    # set the random seed
    # torch.backends.cudnn.deterministic = True
    seed = cfg["training"].get("random_seed", 42)
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    # load the data
    train_data, dev_data, test_data, src_vocab, trg_vocab = \
        load_data(cfg=cfg)

    # build an encoder-decoder model
    model = build_model(cfg["model"], src_vocab=src_vocab, trg_vocab=trg_vocab)

    # for training management, e.g. early stopping and model selection
    trainer = TrainManager(model=model, config=cfg)

    # store copy of original training config in model dir
    shutil.copy2(cfg_file, trainer.model_dir + "/config.yaml")

    # print config
    log_cfg(cfg, trainer.logger)

    log_data_info(train_data=train_data,
                  valid_data=dev_data,
                  test_data=test_data,
                  src_vocab=src_vocab,
                  trg_vocab=trg_vocab,
                  logging_function=trainer.logger.info)
    model.log_parameters_list(logging_function=trainer.logger.info)

    trainer.logger.info(str(model))

    # store the vocabs
    src_vocab_file = "{}/src_vocab.txt".format(cfg["training"]["model_dir"])
    src_vocab.to_file(src_vocab_file)
    trg_vocab_file = "{}/trg_vocab.txt".format(cfg["training"]["model_dir"])
    trg_vocab.to_file(trg_vocab_file)

    # train the model
    trainer.train_and_validate(train_data=train_data, valid_data=dev_data)

    if test_data is not None:
        trainer.load_checkpoint("{}/{}.ckpt".format(
            trainer.model_dir, trainer.best_ckpt_iteration))
        # test model
        if "testing" in cfg.keys():
            beam_size = cfg["testing"].get("beam_size", 0)
            beam_alpha = cfg["testing"].get("alpha", -1)
        else:
            beam_size = 0
            beam_alpha = -1

        (score, loss, ppl, sources, sources_raw, references, hypotheses,
         hypotheses_raw, attention_scores) = validate_on_data(
             data=test_data, batch_size=trainer.batch_size,
             eval_metric=trainer.eval_metric, level=trainer.level,
             max_output_length=trainer.max_output_length,
             model=model, use_cuda=trainer.use_cuda, criterion=None,
             beam_size=beam_size, beam_alpha=beam_alpha)

        if "trg" in test_data.fields:
            decoding_description = "Greedy decoding" if beam_size == 0 else \
                "Beam search decoding with beam size = {} and alpha = {}"\
                    .format(beam_size, beam_alpha)
            trainer.logger.info("{:4s}: {} {} [{}]".format(
                "Test data result", score, trainer.eval_metric,
                decoding_description))
        else:
            trainer.logger.info(
                "No references given for {}.{} -> no evaluation.".format(
                    cfg["data"]["test"], cfg["data"]["src"]))

        output_path_set = "{}/{}.{}".format(trainer.model_dir, "test",
                                            cfg["data"]["trg"])
        with open(output_path_set, mode="w", encoding="utf-8") as f:
            for h in hypotheses:
                f.write(h + "\n")
        trainer.logger.info(
            "Test translations saved to: {}".format(output_path_set))