Example #1
def train(cfg_file: str) -> None:
    """
    Main training function. After training, also test on test data if given.

    :param cfg_file: path to configuration yaml file
    """
    cfg = load_config(cfg_file)

    # make logger
    model_dir = make_model_dir(cfg["training"]["model_dir"],
                               overwrite=cfg["training"].get(
                                   "overwrite", False))
    _ = make_logger(model_dir, mode="train")    # version string returned
    # TODO: save version number in model checkpoints

    # set the random seed
    set_seed(seed=cfg["training"].get("random_seed", 42))

    # load the data
    train_data, dev_data, test_data, src_vocab, trg_vocab = load_data(
        data_cfg=cfg["data"])

    # build an encoder-decoder model
    model = build_model(cfg["model"], src_vocab=src_vocab, trg_vocab=trg_vocab)

    # for training management, e.g. early stopping and model selection
    trainer = TrainManager(model=model, config=cfg)

    # store copy of original training config in model dir
    shutil.copy2(cfg_file, model_dir + "/config.yaml")

    # log all entries of config
    log_cfg(cfg)

    log_data_info(train_data=train_data, valid_data=dev_data,
                  test_data=test_data, src_vocab=src_vocab, trg_vocab=trg_vocab)

    logger.info(str(model))

    # store the vocabs
    src_vocab_file = "{}/src_vocab.txt".format(cfg["training"]["model_dir"])
    src_vocab.to_file(src_vocab_file)
    trg_vocab_file = "{}/trg_vocab.txt".format(cfg["training"]["model_dir"])
    trg_vocab.to_file(trg_vocab_file)

    # train the model
    trainer.train_and_validate(train_data=train_data, valid_data=dev_data)

    # predict with the best model on validation and test
    # (if test data is available)
    ckpt = "{}/{}.ckpt".format(model_dir, trainer.stats.best_ckpt_iter)
    output_name = "{:08d}.hyps".format(trainer.stats.best_ckpt_iter)
    output_path = os.path.join(model_dir, output_name)
    datasets_to_test = {"dev": dev_data, "test": test_data,
                        "src_vocab": src_vocab, "trg_vocab": trg_vocab}
    test(cfg_file, ckpt=ckpt, output_path=output_path,
         datasets=datasets_to_test)
Example #2
def train(cfg_file: str) -> None:
    """
    Main training function. After training, also test on test data if given.

    :param cfg_file: path to configuration yaml file
    """
    cfg = load_config(cfg_file)
    train_cfg = cfg["training"]
    data_cfg = cfg["data"]

    # set the random seed
    set_seed(seed=train_cfg.get("random_seed", 42))

    # load the data
    data = load_data(data_cfg)
    train_data = data["train_data"]
    dev_data = data["dev_data"]
    test_data = data["test_data"]
    vocabs = data["vocabs"]

    # build an encoder-decoder model
    model = build_model(cfg["model"], vocabs=vocabs)

    # for training management, e.g. early stopping and model selection
    trainer = TrainManager(model=model, config=cfg)

    # store copy of original training config in model dir
    shutil.copy2(cfg_file, join(trainer.model_dir, "config.yaml"))

    # log all entries of config
    log_cfg(cfg, trainer.logger)

    log_data_info(
        train_data=train_data,
        valid_data=dev_data,
        test_data=test_data,
        vocabs=vocabs,
        logging_function=trainer.logger.info)

    trainer.logger.info(str(model))

    # store the vocabs
    model_dir = train_cfg["model_dir"]
    for field_name, vocab in vocabs.items():
        vocab_file = join(model_dir, field_name + "_vocab.txt")
        vocab.to_file(vocab_file)

    # train the model
    trainer.train_and_validate(train_data=train_data, valid_data=dev_data)

    # predict with the best model on validation (and test, if available)
    ckpt = join(trainer.model_dir, str(trainer.best_ckpt_iteration) + ".ckpt")
    output_name = "{:08d}.hyps".format(trainer.best_ckpt_iteration)
    output_path = join(trainer.model_dir, output_name)
    test(cfg_file, ckpt=ckpt, output_path=output_path, logger=trainer.logger)
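
The two train() variants above share the same entry point: they take only the path to a YAML configuration file and read everything else from it. A minimal sketch of how such an entry point is typically invoked from the command line follows; the argparse wiring, script name and config path are illustrative assumptions, not part of the snippets themselves.

# Hypothetical command-line wrapper around the train(cfg_file) entry point above.
# The argument name and the example path are illustrative assumptions.
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Train an encoder-decoder model")
    parser.add_argument("config", type=str,
                        help="path to configuration yaml file")
    args = parser.parse_args()
    train(args.config)  # e.g. python train.py configs/my_experiment.yaml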
Example #3
def train_norm(model, cfg_file: str, skip_test: bool = False) -> None:
    """
    Main training function. After training, also test on test data if given.
    :param cfg_file: path to configuration yaml file
    :param skip_test: whether a test should be run or not after training
    """
    cfg = load_config(cfg_file)

    # make logger
    model_dir = make_model_dir(cfg["training"]["model_dir"],
                               overwrite=cfg["training"].get(
                                   "overwrite", False))
    _ = make_logger(model_dir, mode="train")  # version string returned
    # TODO: save version number in model checkpoints

    # set the random seed
    set_seed(seed=cfg["training"].get("random_seed", 42))

    # load the data
    train_data, dev_data, test_data, src_vocab, trg_vocab = load_data(
        data_cfg=cfg["data"],
        src_lang=cfg["data"].get("src"),
        trg_lang=cfg["data"].get("trg"))

    # build an encoder-decoder model
    #model = build_model(cfg["model"], src_vocab=src_vocab, trg_vocab=trg_vocab)

    # for training management, e.g. early stopping and model selection
    trainer = TrainManager(model=model, config=cfg)

    # store copy of original training config in model dir
    shutil.copy2(cfg_file, model_dir + "/config.yaml")

    # log all entries of config
    log_cfg(cfg)

    log_data_info(train_data=train_data,
                  valid_data=dev_data,
                  test_data=test_data,
                  src_vocab=src_vocab,
                  trg_vocab=trg_vocab)

    logger.info(str(model))

    # store the vocabs
    src_vocab_file = "{}/src_vocab.txt".format(cfg["training"]["model_dir"])
    src_vocab.to_file(src_vocab_file)
    trg_vocab_file = "{}/trg_vocab.txt".format(cfg["training"]["model_dir"])
    trg_vocab.to_file(trg_vocab_file)

    # train the model
    trainer.train_and_validate(train_data=train_data, valid_data=dev_data)
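
Unlike the other variants, train_norm() expects the caller to construct the model (note the commented-out build_model line above). A sketch of one way to drive it, reusing the helpers that appear in the surrounding examples, follows; the config path is an illustrative assumption.

# Hypothetical driver for train_norm(): build the model outside and pass it in.
# The config path is made up; helper signatures follow the other examples.
cfg = load_config("configs/norm.yaml")
_, _, _, src_vocab, trg_vocab = load_data(
    data_cfg=cfg["data"],
    src_lang=cfg["data"].get("src"),
    trg_lang=cfg["data"].get("trg"))
model = build_model(cfg["model"], src_vocab=src_vocab, trg_vocab=trg_vocab)
train_norm(model, "configs/norm.yaml", skip_test=True)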
Example #4
def train(cfg_file: str) -> None:
    """
    Main training function. After training, also test on test data if given.

    :param cfg_file: path to configuration yaml file
    """
    cfg = load_config(cfg_file)

    # set the random seed
    set_seed(seed=cfg["training"].get("random_seed", 42))

    # load the data
    train_data, dev_data, test_data, src_vocab, trg_vocab = load_data(
        data_cfg=cfg["data"])

    # build an encoder-decoder model
    model = build_model(cfg["model"], src_vocab=src_vocab, trg_vocab=trg_vocab)

    # for training management, e.g. early stopping and model selection
    trainer = TrainManager(model=model, config=cfg)

    # store copy of original training config in model dir
    shutil.copy2(cfg_file, trainer.model_dir + "/config.yaml")

    # log all entries of config
    log_cfg(cfg, trainer.logger)

    log_data_info(train_data=train_data,
                  valid_data=dev_data,
                  test_data=test_data,
                  src_vocab=src_vocab,
                  trg_vocab=trg_vocab,
                  logging_function=trainer.logger.info)

    trainer.logger.info(str(model))

    # store the vocabs
    src_vocab_file = "{}/src_vocab.txt".format(cfg["training"]["model_dir"])
    src_vocab.to_file(src_vocab_file)
    trg_vocab_file = "{}/trg_vocab.txt".format(cfg["training"]["model_dir"])
    trg_vocab.to_file(trg_vocab_file)

    # train the model
    trainer.train_and_validate(train_data=train_data, valid_data=dev_data)

    # predict with the best model on validation and test
    # (if test data is available)
    ckpt = "{}/{}.ckpt".format(trainer.model_dir, trainer.best_ckpt_iteration)
    output_name = "{:08d}.hyps".format(trainer.best_ckpt_iteration)
    output_path = os.path.join(trainer.model_dir, output_name)
    test(cfg_file, ckpt=ckpt, output_path=output_path, logger=trainer.logger)
Example #5
def train(cfg_file):
    """
    Main training function. After training, also test on test data if given.

    :param cfg_file: path to configuration yaml file
    :return: None
    """
    cfg = load_config(cfg_file)
    # set the random seed
    # torch.backends.cudnn.deterministic = True
    seed = cfg["training"].get("random_seed", 42)
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    # load the data
    train_data, dev_data, test_data, src_vocab, trg_vocab = \
        load_data(cfg=cfg)

    # build an encoder-decoder model
    model = build_model(cfg["model"], src_vocab=src_vocab, trg_vocab=trg_vocab)

    # for training management, e.g. early stopping and model selection
    trainer = TrainManager(model=model, config=cfg)

    # store copy of original training config in model dir
    shutil.copy2(cfg_file, trainer.model_dir + "/config.yaml")

    # print config
    log_cfg(cfg, trainer.logger)

    log_data_info(train_data=train_data,
                  valid_data=dev_data,
                  test_data=test_data,
                  src_vocab=src_vocab,
                  trg_vocab=trg_vocab,
                  logging_function=trainer.logger.info)
    model.log_parameters_list(logging_function=trainer.logger.info)

    logging.info(model)

    # store the vocabs
    src_vocab_file = "{}/src_vocab.txt".format(cfg["training"]["model_dir"])
    src_vocab.to_file(src_vocab_file)
    trg_vocab_file = "{}/trg_vocab.txt".format(cfg["training"]["model_dir"])
    trg_vocab.to_file(trg_vocab_file)

    # train the model
    trainer.train_and_validate(train_data=train_data, valid_data=dev_data)

    if test_data is not None:
        trainer.load_checkpoint("{}/{}.ckpt".format(
            trainer.model_dir, trainer.best_ckpt_iteration))
        # test model
        if "testing" in cfg.keys():
            beam_size = cfg["testing"].get("beam_size", 0)
            beam_alpha = cfg["testing"].get("alpha", -1)
        else:
            beam_size = 0
            beam_alpha = -1

        # pylint: disable=unused-variable
        score, loss, ppl, sources, sources_raw, references, hypotheses, \
            hypotheses_raw, attention_scores = validate_on_data(
                data=test_data, batch_size=trainer.batch_size,
                eval_metric=trainer.eval_metric, level=trainer.level,
                max_output_length=trainer.max_output_length,
                model=model, use_cuda=trainer.use_cuda, criterion=None,
                beam_size=beam_size, beam_alpha=beam_alpha)

        if "trg" in test_data.fields:
            decoding_description = "Greedy decoding" if beam_size == 0 else \
                "Beam search decoding with beam size = {} and alpha = {}"\
                    .format(beam_size, beam_alpha)
            trainer.logger.info("Test data result: %f %s [%s]", score,
                                trainer.eval_metric, decoding_description)
        else:
            trainer.logger.info(
                "No references given for %s.%s -> no evaluation.",
                cfg["data"]["test"], cfg["data"]["src"])

        output_path_set = "{}/{}.{}".format(trainer.model_dir, "test",
                                            cfg["data"]["trg"])
        with open(output_path_set, mode="w", encoding="utf-8") as f:
            for h in hypotheses:
                f.write(h + "\n")
        trainer.logger.info("Test translations saved to: %s", output_path_set)
Example #6
def train(cfg_file: str) -> None:
    """
    Main training function. After training, also test on test data if given.

    :param cfg_file: path to configuration yaml file
    """
    cfg = load_config(cfg_file)

    # set the random seed
    set_seed(seed=cfg["training"].get("random_seed", 42))
    shards_dir = os.path.dirname(cfg["data"]["shard_path"])
    if not os.path.exists(shards_dir):
        os.makedirs(shards_dir)

    if cfg["data"].get("shard_data", False):
        assert cfg["data"].get(
            "n_shards", 0) > 0, "n_shards needs to exist and be at least 1"
        shard_data(path=cfg["data"]["train"],
                   src_lang=cfg["data"]["src"],
                   tgt_lang=cfg["data"]["trg"],
                   n_shards=cfg["data"]["n_shards"],
                   shard_path=cfg["data"]["shard_path"])

    # load the data
    load_train_whole = cfg["data"].get("n_shards", 0) < 1
    train_data, dev_data, test_data, src_vocab, trg_vocab, src_field, trg_field = load_data(
        data_cfg=cfg["data"], load_train=load_train_whole)

    if not load_train_whole:
        sharded_iterator = ShardedEpochDatasetIterator(
            n_shards=cfg["data"]["n_shards"],
            percent_to_sample=cfg["data"].get("percent_to_sample_from_shard",
                                              1.0),
            data_path=cfg["data"]["train"],
            shard_path=cfg["data"]["shard_path"],
            extensions=(cfg["data"]["src"], cfg["data"]["trg"]),
            fields=(src_field, trg_field),
            n_epochs=cfg["training"]["epochs"],
            filter_pred=lambda x:
                len(vars(x)["src"]) <= cfg["data"]["max_sent_length"]
                and len(vars(x)["trg"]) <= cfg["data"]["max_sent_length"])
    else:
        sharded_iterator = None

    # build an encoder-decoder model
    model = build_model(cfg["model"], src_vocab=src_vocab, trg_vocab=trg_vocab)

    # for training management, e.g. early stopping and model selection
    trainer = TrainManager(model=model, config=cfg)

    # store copy of original training config in model dir
    shutil.copy2(cfg_file, trainer.model_dir + "/config.yaml")

    # log all entries of config
    log_cfg(cfg, trainer.logger)
    if load_train_whole:
        log_data_info(train_data=train_data,
                      valid_data=dev_data,
                      test_data=test_data,
                      src_vocab=src_vocab,
                      trg_vocab=trg_vocab,
                      logging_function=trainer.logger.info)

    trainer.logger.info(str(model))

    # store the vocabs
    src_vocab_file = "{}/src_vocab.txt".format(cfg["training"]["model_dir"])
    src_vocab.to_file(src_vocab_file)
    trg_vocab_file = "{}/trg_vocab.txt".format(cfg["training"]["model_dir"])
    trg_vocab.to_file(trg_vocab_file)

    # train the model
    trainer.train_and_validate(train_data=train_data,
                               valid_data=dev_data,
                               sharded_iterator=sharded_iterator)

    # predict with the best model on validation and test
    # (if test data is available)
    ckpt = "{}/{}.ckpt".format(trainer.model_dir, trainer.best_ckpt_iteration)
    output_name = "{:08d}.hyps".format(trainer.best_ckpt_iteration)
    output_path = os.path.join(trainer.model_dir, output_name)
    test(cfg_file, ckpt=ckpt, output_path=output_path, logger=trainer.logger)
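
The sharded variant above reads several extra keys from the "data" section of the config. A sketch of those keys, written as the plain Python dict that load_config would hand back, follows; every path and value is illustrative.

# Illustrative "data" section for the sharded variant above; values are made up.
sharded_data_cfg = {
    "train": "data/train",                # prefix of the parallel training files
    "src": "de",                          # source language / file extension
    "trg": "en",                          # target language / file extension
    "shard_data": True,                   # shard the training data before loading
    "n_shards": 8,                        # must be at least 1 when sharding
    "shard_path": "data/shards/train",    # where shard files are written and read
    "percent_to_sample_from_shard": 0.5,  # fraction of each shard used per epoch
    "max_sent_length": 100,               # used by filter_pred above
}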
Example #7
def train(cfg_file: str) -> None:
    """
    Main training function. After training, also test on test data if given.

    :param cfg_file: path to configuration yaml file
    """
    cfg = load_config(cfg_file)

    # set the random seed
    set_seed(seed=cfg["training"].get("random_seed", 42))

    # load the data
    train_data, dev_data, test_data, src_vocab, trg_vocab = load_data(
        data_cfg=cfg["data"])

    # build an encoder-decoder model
    model = build_model(cfg["model"], src_vocab=src_vocab, trg_vocab=trg_vocab)

    # for training management, e.g. early stopping and model selection
    trainer = TrainManager(model=model, config=cfg)

    # store copy of original training config in model dir
    shutil.copy2(cfg_file, trainer.model_dir + "/config.yaml")

    # log all entries of config
    log_cfg(cfg, trainer.logger)

    log_data_info(train_data=train_data,
                  valid_data=dev_data,
                  test_data=test_data,
                  src_vocab=src_vocab,
                  trg_vocab=trg_vocab,
                  logging_function=trainer.logger.info)

    # store the vocabs
    src_vocab_file = "{}/src_vocab.txt".format(cfg["training"]["model_dir"])
    src_vocab.to_file(src_vocab_file)
    trg_vocab_file = "{}/trg_vocab.txt".format(cfg["training"]["model_dir"])
    trg_vocab.to_file(trg_vocab_file)

    # train the model
    trainer.train_and_validate(train_data=train_data, valid_data=dev_data)

    # test the model with the best checkpoint
    if test_data is not None:

        # load checkpoint
        if trainer.best_ckpt_iteration > 0:
            checkpoint_path = "{}/{}.ckpt".format(trainer.model_dir,
                                                  trainer.best_ckpt_iteration)
        else:
            # for checkpoints written periodically via save_freq
            checkpoint_path = get_latest_checkpoint(trainer.model_dir)
        try:
            trainer.init_from_checkpoint(checkpoint_path)
        except AssertionError:
            trainer.logger.warning(
                "Checkpoint %s does not exist. "
                "Skipping testing.", checkpoint_path)
            if trainer.best_ckpt_iteration == 0 \
                and trainer.best_ckpt_score in [np.inf, -np.inf]:
                trainer.logger.warning(
                    "It seems like no checkpoint was written, "
                    "since no improvement was obtained over the initial model."
                )
            return

        # generate hypotheses for test data
        if "testing" in cfg.keys():
            beam_size = cfg["testing"].get("beam_size", 0)
            beam_alpha = cfg["testing"].get("alpha", -1)
            return_logp = cfg["testing"].get("return_logp", False)
        else:
            beam_size = 0
            beam_alpha = -1
            return_logp = False

        # pylint: disable=unused-variable
        score, loss, ppl, sources, sources_raw, references, hypotheses, \
            hypotheses_raw, attention_scores, log_probs = validate_on_data(
                data=test_data, batch_size=trainer.batch_size,
                eval_metric=trainer.eval_metric, level=trainer.level,
                max_output_length=trainer.max_output_length,
                model=model, use_cuda=trainer.use_cuda, loss_function=None,
                beam_size=beam_size, beam_alpha=beam_alpha,
                return_logp=return_logp)

        if "trg" in test_data.fields:
            decoding_description = "Greedy decoding" if beam_size == 0 else \
                "Beam search decoding with beam size = {} and alpha = {}"\
                    .format(beam_size, beam_alpha)
            trainer.logger.info("Test data result: %f %s [%s]", score,
                                trainer.eval_metric, decoding_description)
        else:
            trainer.logger.info(
                "No references given for %s.%s -> no evaluation.",
                cfg["data"]["test"], cfg["data"]["src"])

        output_path_set = "{}/{}.{}".format(trainer.model_dir, "test",
                                            cfg["data"]["trg"])
        with open(output_path_set, mode="w", encoding="utf-8") as f:
            for h in hypotheses:
                f.write("{}\n".format(h))
        trainer.logger.info("Test translations saved to: %s", output_path_set)

        if return_logp:
            output_path_set_logp = output_path_set + ".logp"
            with open(output_path_set_logp, mode="w", encoding="utf-8") as f:
                for lp in log_probs:
                    f.write("{}\n".format(lp))
            trainer.logger.info("Test log probs saved to: %s",
                                output_path_set_logp)