Example 1
    def testRandomSubset(self):
        # only a random subset should be selected for training
        current_cfg = self.data_cfg.copy()
        current_cfg["random_train_subset"] = -1

        # load the data
        train_data, dev_data, test_data, src_vocab, trg_vocab = \
            load_data(current_cfg)
        assert len(train_data) == 382

        current_cfg["random_train_subset"] = 10
        train_data, dev_data, test_data, src_vocab, trg_vocab = \
            load_data(current_cfg)
        assert len(train_data) == 10
Example 2
    def testIteratorBatchType(self):

        current_cfg = self.data_cfg.copy()
        current_cfg["level"] = "word"
        current_cfg["lowercase"] = False

        # load toy data
        data = load_data(current_cfg)
        train_data = data["train_data"]
        dev_data = data["dev_data"]
        test_data = data["test_data"]
        vocabs = data["vocabs"]
        src_vocab = vocabs["src"]
        trg_vocab = vocabs["trg"]

        # make batches by number of sentences
        train_iter = iter(make_data_iter(
            train_data, batch_size=10, batch_type="sentence"))
        batch = next(train_iter)

        self.assertEqual(batch.src[0].shape[0], 10)
        self.assertEqual(batch.trg[0].shape[0], 10)

        # make batches by number of tokens
        train_iter = iter(make_data_iter(
            train_data, batch_size=100, batch_type="token"))
        _ = next(train_iter)  # skip a batch
        _ = next(train_iter)  # skip another batch
        batch = next(train_iter)

        self.assertEqual(batch.src[0].shape[0], 8)
        self.assertEqual(np.prod(batch.src[0].shape), 88)
        self.assertLessEqual(np.prod(batch.src[0].shape), 100)
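
A minimal usage sketch (not part of the test suite) of how batches produced by make_data_iter, as exercised above, could be consumed in a plain loop. It assumes make_data_iter is importable as in the test module and that batch.src[0] is the padded source tensor, as the assertions above suggest.

import numpy as np

def count_padded_src_slots(train_data, batch_size=10):
    # sums the padded source token slots over all sentence batches,
    # using the same batch.src[0] tensor shape the assertions inspect
    total = 0
    for batch in make_data_iter(train_data, batch_size=batch_size,
                                batch_type="sentence"):
        total += int(np.prod(batch.src[0].shape))
    return total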
Example 3
    def testIteratorBatchType(self):

        current_cfg = self.data_cfg.copy()

        # load toy data
        train_data, dev_data, test_data, src_vocab, trg_vocab = \
            load_data(current_cfg)

        # make batches by number of sentences
        train_iter = iter(
            make_data_iter(train_data, batch_size=10, batch_type="sentence"))
        batch = next(train_iter)

        self.assertEqual(batch.src[0].shape[0], 10)
        self.assertEqual(batch.trg[0].shape[0], 10)

        # make batches by number of tokens
        train_iter = iter(
            make_data_iter(train_data, batch_size=100, batch_type="token"))
        _ = next(train_iter)  # skip a batch
        _ = next(train_iter)  # skip another batch
        batch = next(train_iter)

        self.assertEqual(batch.src[0].shape[0], 8)
        self.assertEqual(np.prod(batch.src[0].shape), 88)
        self.assertLessEqual(np.prod(batch.src[0].shape), 100)
Example 4
    def setUp(self):
        self.train_path = "test/data/toy/train"
        self.dev_path = "test/data/toy/dev"
        self.test_path = "test/data/toy/test"
        self.levels = ["char", "word"]  # bpe is equivalently processed to word
        self.max_sent_length = 20

        # minimal data config
        self.data_cfg = {
            "src": "de",
            "trg": "en",
            "train": self.train_path,
            "dev": self.dev_path,
            "level": "char",
            "lowercase": True,
            "max_sent_length": self.max_sent_length
        }

        # load the data
        self.train_data, self.dev_data, self.test_data, src_vocab, trg_vocab = \
            load_data(self.data_cfg)
        self.pad_index = trg_vocab.stoi[PAD_TOKEN]
        # random seeds
        seed = 42
        torch.manual_seed(seed)
        random.seed(42)
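
As a side note, a small sketch of how a padding index like the one stored above is typically used to mask padded positions; PAD_INDEX below is a placeholder value, whereas the test reads it from trg_vocab.stoi[PAD_TOKEN].

import torch

PAD_INDEX = 1  # placeholder; the test obtains it from trg_vocab.stoi[PAD_TOKEN]

batch_ids = torch.tensor([[5, 7, 9, PAD_INDEX],
                          [4, 6, PAD_INDEX, PAD_INDEX]])
pad_mask = batch_ids != PAD_INDEX       # True at real-token positions
num_real_tokens = int(pad_mask.sum())   # 5 real tokens in this toy batch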
Example 5
def load_dataset_joey(config):
    print("Creating datasets and vocabularies...")
    data_cfg = {
        "src": config["src"],
        "trg": config["tgt"],
        "train": config["data_dir"] + "/" + config["train_prefix"],
        "dev": config["data_dir"] + "/" + config["dev_prefix"],
        "level": "bpe",
        "lowercase": False,
        "max_sent_length": config["max_len"],
        "src_vocab": config["data_dir"] + "/" + config["vocab_prefix"]
                     + "." + config["src"],
        "trg_vocab": config["data_dir"] + "/" + config["vocab_prefix"]
                     + "." + config["tgt"],
    }
    train_data, dev_data, _, src_vocab, tgt_vocab = data.load_data(data_cfg)
    return train_data, dev_data, src_vocab, tgt_vocab
Example 6
def train(cfg_file: str) -> None:
    """
    Main training function. After training, also test on test data if given.

    :param cfg_file: path to configuration yaml file
    """
    cfg = load_config(cfg_file)

    # make logger
    model_dir = make_model_dir(cfg["training"]["model_dir"],
                               overwrite=cfg["training"].get(
                                   "overwrite", False))
    _ = make_logger(model_dir, mode="train")    # version string returned
    # TODO: save version number in model checkpoints

    # set the random seed
    set_seed(seed=cfg["training"].get("random_seed", 42))

    # load the data
    train_data, dev_data, test_data, src_vocab, trg_vocab = load_data(
        data_cfg=cfg["data"])

    # build an encoder-decoder model
    model = build_model(cfg["model"], src_vocab=src_vocab, trg_vocab=trg_vocab)

    # for training management, e.g. early stopping and model selection
    trainer = TrainManager(model=model, config=cfg)

    # store copy of original training config in model dir
    shutil.copy2(cfg_file, model_dir + "/config.yaml")

    # log all entries of config
    log_cfg(cfg)

    log_data_info(train_data=train_data, valid_data=dev_data,
                  test_data=test_data, src_vocab=src_vocab, trg_vocab=trg_vocab)

    logger.info(str(model))

    # store the vocabs
    src_vocab_file = "{}/src_vocab.txt".format(cfg["training"]["model_dir"])
    src_vocab.to_file(src_vocab_file)
    trg_vocab_file = "{}/trg_vocab.txt".format(cfg["training"]["model_dir"])
    trg_vocab.to_file(trg_vocab_file)

    # train the model
    trainer.train_and_validate(train_data=train_data, valid_data=dev_data)

    # predict with the best model on validation and test
    # (if test data is available)
    ckpt = "{}/{}.ckpt".format(model_dir, trainer.stats.best_ckpt_iter)
    output_name = "{:08d}.hyps".format(trainer.stats.best_ckpt_iter)
    output_path = os.path.join(model_dir, output_name)
    datasets_to_test = {"dev": dev_data, "test": test_data,
                        "src_vocab": src_vocab, "trg_vocab": trg_vocab}
    test(cfg_file, ckpt=ckpt, output_path=output_path,
         datasets=datasets_to_test)
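
A minimal, hypothetical command-line wrapper around a train() function like the one above; the actual project typically ships its own CLI, so this is only an illustration of how the config path would be passed in.

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Train an NMT model from a YAML configuration.")
    parser.add_argument("config_path", type=str,
                        help="path to the training configuration yaml file")
    args = parser.parse_args()
    train(args.config_path)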
Example 7
def train(cfg_file: str) -> None:
    """
    Main training function. After training, also test on test data if given.

    :param cfg_file: path to configuration yaml file
    """
    cfg = load_config(cfg_file)
    train_cfg = cfg["training"]
    data_cfg = cfg["data"]

    # set the random seed
    set_seed(seed=train_cfg.get("random_seed", 42))

    # load the data
    data = load_data(data_cfg)
    train_data = data["train_data"]
    dev_data = data["dev_data"]
    test_data = data["test_data"]
    vocabs = data["vocabs"]

    # build an encoder-decoder model
    model = build_model(cfg["model"], vocabs=vocabs)

    # for training management, e.g. early stopping and model selection
    trainer = TrainManager(model=model, config=cfg)

    # store copy of original training config in model dir
    shutil.copy2(cfg_file, join(trainer.model_dir, "config.yaml"))

    # log all entries of config
    log_cfg(cfg, trainer.logger)

    log_data_info(
        train_data=train_data,
        valid_data=dev_data,
        test_data=test_data,
        vocabs=vocabs,
        logging_function=trainer.logger.info)

    trainer.logger.info(str(model))

    # store the vocabs
    model_dir = train_cfg["model_dir"]
    for field_name, vocab in vocabs.items():
        vocab_file = join(model_dir, field_name + "_vocab.txt")
        vocab.to_file(vocab_file)

    # train the model
    trainer.train_and_validate(train_data=train_data, valid_data=dev_data)

    # predict with the best model on validation (and test, if available)
    ckpt = join(trainer.model_dir, str(trainer.best_ckpt_iteration) + ".ckpt")
    output_name = "{:08d}.hyps".format(trainer.best_ckpt_iteration)
    output_path = join(trainer.model_dir, output_name)
    test(cfg_file, ckpt=ckpt, output_path=output_path, logger=trainer.logger)
Example 8
    def setUp(self):
        seed = 42
        torch.manual_seed(seed)
        self.cfg = {
            "data": {
                "src": "de",
                "trg": "en",
                "train": "test/data/toy/train",     # needed for vocab
                "test": "test/data/toy/test",
                "level": "word",
                "lowercase": False,
                "max_sent_length": 10
            },
            "testing": {
                "bpe_type": None,
                "beam_size": 5,
                "alpha": 1.0
            },
            "training": {
                "batch_size": 2,
                "batch_type": "sentence",
                "eval_metric": "bleu"
            },
            "model": {
                "tied_embeddings": False,
                "tied_softmax": False,
                "encoder": {
                    "type": "transformer",
                    "hidden_size": 12,
                    "ff_size": 24,
                    "embeddings": {"embedding_dim": 12},
                    "num_layers": 1,
                    "num_heads": 4
                },
                "decoder": {
                    "type": "transformer",
                    "hidden_size": 12,
                    "ff_size": 24,
                    "embeddings": {"embedding_dim": 12},
                    "num_layers": 1,
                    "num_heads": 4
                },
            }
        }

        # load data
        _, _, test_data, src_vocab, trg_vocab = load_data(
            self.cfg["data"], datasets=["train", "test"])
        self.test_data = test_data
        self.parsed_cfg = parse_test_args(self.cfg, mode="translate")

        # build model
        self.model = build_model(self.cfg["model"],
                                 src_vocab=src_vocab, trg_vocab=trg_vocab)
Example 9
def train(cfg_file: str) -> None:
    """
    Main training function. After training, also test on test data if given.

    :param cfg_file: path to configuration yaml file
    """
    cfg = load_config(cfg_file)

    # set the random seed
    set_seed(seed=cfg["training"].get("random_seed", 42))

    print(f'Loading data...')
    # load the data
    train_data, dev_data, _, trg_vocab = load_data(data_cfg=cfg["data"],
                                                   get_test=False)

    print(f'Building model...')
    # build an encoder-decoder model
    model = build_model(cfg["model"], trg_vocab=trg_vocab)

    # for training management, e.g. early stopping and model selection
    trainer = TrainManager(model=model, config=cfg)

    # store copy of original training config in model dir
    shutil.copy2(cfg_file, trainer.model_dir + "/config.yaml")

    # log all entries of config
    log_cfg(cfg, trainer.logger)

    # log_data_info(train_data=train_data, valid_data=dev_data,
    #               test_data=test_data, trg_vocab=trg_vocab,
    #               logging_function=trainer.logger.info)

    trainer.logger.info(str(model))

    # store the vocabs
    # trg_vocab_file = "{}/trg_vocab.txt".format(cfg["training"]["model_dir"])
    # trg_vocab.to_file(trg_vocab_file)

    print(f'Initiating Training...')
    # train the model
    trainer.train_and_validate(train_data=train_data, valid_data=dev_data)

    # predict with the best model on validation and test
    # (if test data is available)
    ckpt = "{}/{}.ckpt".format(trainer.model_dir, trainer.best_ckpt_iteration)
    output_name = "{:08d}.hyps".format(trainer.best_ckpt_iteration)
    output_path = os.path.join(trainer.model_dir, output_name)
    return test(cfg_file,
                ckpt=ckpt,
                output_path=output_path,
                logger=trainer.logger,
                trg_vocab=trg_vocab)
Example 10
def train_norm(model, cfg_file: str, skip_test: bool = False) -> None:
    """
    Main training function. After training, also test on test data if given.

    :param model: externally built model to train
    :param cfg_file: path to configuration yaml file
    :param skip_test: whether a test should be run or not after training
    """
    cfg = load_config(cfg_file)

    # make logger
    model_dir = make_model_dir(cfg["training"]["model_dir"],
                               overwrite=cfg["training"].get(
                                   "overwrite", False))
    _ = make_logger(model_dir, mode="train")  # version string returned
    # TODO: save version number in model checkpoints

    # set the random seed
    set_seed(seed=cfg["training"].get("random_seed", 42))

    # load the data
    train_data, dev_data, test_data, src_vocab, trg_vocab = load_data(
        data_cfg=cfg["data"],
        src_lang=cfg["data"].get("src"),
        trg_lang=cfg["data"].get("trg"))

    # build an encoder-decoder model
    #model = build_model(cfg["model"], src_vocab=src_vocab, trg_vocab=trg_vocab)

    # for training management, e.g. early stopping and model selection
    trainer = TrainManager(model=model, config=cfg)

    # store copy of original training config in model dir
    shutil.copy2(cfg_file, model_dir + "/config.yaml")

    # log all entries of config
    log_cfg(cfg)

    log_data_info(train_data=train_data,
                  valid_data=dev_data,
                  test_data=test_data,
                  src_vocab=src_vocab,
                  trg_vocab=trg_vocab)

    logger.info(str(model))

    # store the vocabs
    src_vocab_file = "{}/src_vocab.txt".format(cfg["training"]["model_dir"])
    src_vocab.to_file(src_vocab_file)
    trg_vocab_file = "{}/trg_vocab.txt".format(cfg["training"]["model_dir"])
    trg_vocab.to_file(trg_vocab_file)

    # train the model
    trainer.train_and_validate(train_data=train_data, valid_data=dev_data)
Example 11
def test(cfg_file,
         ckpt: str,
         output_path: str = None,
         save_attention: bool = False,
         logger: logging.Logger = None) -> None:
    """
    Main test function. Handles loading a model from checkpoint, generating
    translations and storing them and attention plots.

    :param cfg_file: path to configuration file
    :param ckpt: path to checkpoint to load
    :param output_path: path to output
    :param save_attention: whether to save the computed attention weights
    :param logger: log output to this logger (creates new logger if not set)
    """

    if logger is None:
        logger = logging.getLogger(__name__)
        FORMAT = '%(asctime)-15s - %(message)s'
        logging.basicConfig(format=FORMAT)
        logger.setLevel(level=logging.DEBUG)

    cfg = load_config(cfg_file)

    if "test" not in cfg["data"].keys():
        raise ValueError("Test data must be specified in config.")

    # when checkpoint is not specified, take latest (best) from model dir;
    # model_dir and step are also needed below for the attention plot paths
    model_dir = cfg["training"]["model_dir"]
    step = "best"
    if ckpt is None:
        ckpt = get_latest_checkpoint(model_dir)
        if ckpt is None:
            raise FileNotFoundError(
                "No checkpoint found in directory {}.".format(model_dir))
        try:
            step = ckpt.split(model_dir + "/")[1].split(".ckpt")[0]
        except IndexError:
            step = "best"

    batch_size = cfg["training"]["batch_size"]
    batch_type = cfg["training"].get("batch_type", "sentence")
    use_cuda = cfg["training"].get("use_cuda", False)
    level = cfg["data"]["level"]
    eval_metric = cfg["training"]["eval_metric"]
    max_output_length = cfg["training"].get("max_output_length", None)

    # load the data
    _, dev_data, test_data, src_vocab, trg_vocab = load_data(
        data_cfg=cfg["data"])

    data_to_predict = {"dev": dev_data, "test": test_data}

    # load model state from disk
    model_checkpoint = load_checkpoint(ckpt, use_cuda=use_cuda)

    # build model and load parameters into it
    model = build_model(cfg["model"], src_vocab=src_vocab, trg_vocab=trg_vocab)
    model.load_state_dict(model_checkpoint["model_state"])

    if use_cuda:
        model.cuda()

    # whether to use beam search for decoding, 0: greedy decoding
    if "testing" in cfg.keys():
        beam_size = cfg["testing"].get("beam_size", 0)
        beam_alpha = cfg["testing"].get("alpha", -1)
    else:
        beam_size = 0
        beam_alpha = -1

    for data_set_name, data_set in data_to_predict.items():

        #pylint: disable=unused-variable
        score, loss, ppl, sources, sources_raw, references, hypotheses, \
        hypotheses_raw, attention_scores = validate_on_data(
            model, data=data_set, batch_size=batch_size,
            batch_type=batch_type, level=level,
            max_output_length=max_output_length, eval_metric=eval_metric,
            use_cuda=use_cuda, loss_function=None, beam_size=beam_size,
            beam_alpha=beam_alpha)
        #pylint: enable=unused-variable

        if "trg" in data_set.fields:
            decoding_description = "Greedy decoding" if beam_size == 0 else \
                "Beam search decoding with beam size = {} and alpha = {}".\
                    format(beam_size, beam_alpha)
            logger.info("%4s %s: %6.2f [%s]", data_set_name, eval_metric,
                        score, decoding_description)
        else:
            logger.info("No references given for %s -> no evaluation.",
                        data_set_name)

        if save_attention:
            if attention_scores:
                attention_name = "{}.{}.att".format(data_set_name, step)
                attention_path = os.path.join(model_dir, attention_name)
                logger.info(
                    "Saving attention plots. This might take a while..")
                store_attention_plots(attentions=attention_scores,
                                      targets=hypotheses_raw,
                                      sources=[s for s in data_set.src],
                                      indices=range(len(hypotheses)),
                                      output_prefix=attention_path)
                logger.info("Attention plots saved to: %s", attention_path)
            else:
                logger.warning("Attention scores could not be saved. "
                               "Note that attention scores are not available "
                               "when using beam search. "
                               "Set beam_size to 0 for greedy decoding.")

        if output_path is not None:
            output_path_set = "{}.{}".format(output_path, data_set_name)
            with open(output_path_set, mode="w", encoding="utf-8") as out_file:
                for hyp in hypotheses:
                    out_file.write(hyp + "\n")
            logger.info("Translations saved to: %s", output_path_set)
Example 12
def train(cfg_file: str) -> None:
    """
    Main training function. After training, also test on test data if given.

    :param cfg_file: path to configuration yaml file
    """
    cfg = load_config(cfg_file)

    # set the random seed
    set_seed(seed=cfg["training"].get("random_seed", 42))

    kb_task = bool(cfg["data"].get("kb_task", False))
    # load the data

    train_data, dev_data, test_data,\
        src_vocab, trg_vocab,\
        train_kb, dev_kb, test_kb,\
        train_kb_lookup, dev_kb_lookup, test_kb_lookup,\
        train_kb_lengths, dev_kb_lengths, test_kb_lengths,\
        train_kb_truvals, dev_kb_truvals, test_kb_truvals,\
        trv_vocab, canonizer,\
        dev_data_canon, test_data_canon\
            = load_data(data_cfg=cfg["data"])

    # build an encoder-decoder model
    model = build_model(cfg["model"],
                        src_vocab=src_vocab,
                        trg_vocab=trg_vocab,
                        trv_vocab=trv_vocab,
                        canonizer=canonizer)

    # for training management, e.g. early stopping and model selection
    trainer = TrainManager(model=model, config=cfg)

    # store copy of original training config in model dir
    shutil.copy2(cfg_file, trainer.model_dir + "/config.yaml")

    # log all entries of config
    log_cfg(cfg, trainer.logger)

    log_data_info(train_data=train_data,
                  valid_data=dev_data,
                  test_data=test_data,
                  src_vocab=src_vocab,
                  trg_vocab=trg_vocab,
                  logging_function=trainer.logger.info)

    trainer.logger.info(str(model))

    # store the vocabs
    src_vocab_file = "{}/src_vocab.txt".format(cfg["training"]["model_dir"])
    src_vocab.to_file(src_vocab_file)
    trg_vocab_file = "{}/trg_vocab.txt".format(cfg["training"]["model_dir"])
    trg_vocab.to_file(trg_vocab_file)

    if kb_task:
        trv_vocab_file = "{}/trv_vocab.txt".format(
            cfg["training"]["model_dir"])
        trv_vocab.to_file(trv_vocab_file)

    # train the model
    trainer.train_and_validate(
        train_data=train_data, valid_data=dev_data, kb_task=kb_task,
        train_kb=train_kb, train_kb_lkp=train_kb_lookup,
        train_kb_lens=train_kb_lengths, train_kb_truvals=train_kb_truvals,
        valid_kb=dev_kb, valid_kb_lkp=dev_kb_lookup,
        valid_kb_lens=dev_kb_lengths, valid_kb_truvals=dev_kb_truvals,
        valid_data_canon=dev_data_canon)

    # predict with the best model on validation and test
    # (if test data is available)
    ckpt = "{}/{}.ckpt".format(trainer.model_dir, trainer.best_ckpt_iteration)
    output_name = "{:08d}.hyps".format(trainer.best_ckpt_iteration)
    output_path = os.path.join(trainer.model_dir, output_name)
    test(cfg_file, ckpt=ckpt, output_path=output_path, logger=trainer.logger)
Example 13
    def testDataLoading(self):
        # test all combinations of configuration settings
        for test_path in [None, self.test_path]:
            for level in self.levels:
                for lowercase in [True, False]:
                    current_cfg = self.data_cfg.copy()
                    current_cfg["level"] = level
                    current_cfg["lowercase"] = lowercase
                    if test_path is not None:
                        current_cfg["test"] = test_path

                    # load the data
                    train_data, dev_data, test_data, src_vocab, trg_vocab = \
                        load_data(current_cfg)

                    self.assertIs(type(train_data), TranslationDataset)
                    self.assertIs(type(dev_data), TranslationDataset)
                    if test_path is not None:
                        # test has no target side
                        self.assertIs(type(test_data), MonoDataset)

                    # check the number of examples loaded
                    if level == "char":
                        # training set is filtered to max_sent_length
                        expected_train_len = 5
                    else:
                        expected_train_len = 382
                    expected_testdev_len = 20  # dev and test have the same len
                    self.assertEqual(len(train_data), expected_train_len)
                    self.assertEqual(len(dev_data), expected_testdev_len)
                    if test_path is None:
                        self.assertIsNone(test_data)
                    else:
                        self.assertEqual(len(test_data), expected_testdev_len)

                    # check the segmentation: src and trg attributes are lists
                    self.assertIs(type(train_data.examples[0].src), list)
                    self.assertIs(type(train_data.examples[0].trg), list)
                    self.assertIs(type(dev_data.examples[0].src), list)
                    self.assertIs(type(dev_data.examples[0].trg), list)
                    if test_path is not None:
                        self.assertIs(type(test_data.examples[0].src), list)
                        self.assertFalse(hasattr(test_data.examples[0], "trg"))

                    # check the length filtering of the training examples
                    self.assertFalse(any([len(ex.src) > self.max_sent_length for
                                          ex in train_data.examples]))
                    self.assertFalse(any([len(ex.trg) > self.max_sent_length for
                                          ex in train_data.examples]))

                    # check the lowercasing
                    if lowercase:
                        self.assertTrue(
                            all([" ".join(ex.src).lower() == " ".join(ex.src)
                                 for ex in train_data.examples]))
                        self.assertTrue(
                            all([" ".join(ex.src).lower() == " ".join(ex.src)
                                 for ex in dev_data.examples]))
                        self.assertTrue(
                            all([" ".join(ex.trg).lower() == " ".join(ex.trg)
                                 for ex in train_data.examples]))
                        self.assertTrue(
                            all([" ".join(ex.trg).lower() == " ".join(ex.trg)
                                 for ex in dev_data.examples]))
                        if test_path is not None:
                            self.assertTrue(
                                all([" ".join(ex.src).lower() == " ".join(
                                    ex.src) for ex in test_data.examples]))

                    # check the first example from the training set
                    expected_srcs = {"char": "Danke.",
                                     "word": "David Gallo: Das ist Bill Lange."
                                             " Ich bin Dave Gallo."}
                    expected_trgs = {"char": "Thank you.",
                                     "word": "David Gallo: This is Bill Lange. "
                                             "I'm Dave Gallo."}
                    if level == "char":
                        if lowercase:
                            comparison_src = list(expected_srcs[level].lower())
                            comparison_trg = list(expected_trgs[level].lower())
                        else:
                            comparison_src = list(expected_srcs[level])
                            comparison_trg = list(expected_trgs[level])
                    else:
                        if lowercase:
                            comparison_src = expected_srcs[level].lower().\
                                split()
                            comparison_trg = expected_trgs[level].lower(). \
                                split()
                        else:
                            comparison_src = expected_srcs[level].split()
                            comparison_trg = expected_trgs[level].split()
                    self.assertEqual(train_data.examples[0].src, comparison_src)
                    self.assertEqual(train_data.examples[0].trg, comparison_trg)
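
For reference, a sketch of the length filter the assertions above rely on (assumed semantics, not the project's actual implementation): an example is kept only if both sides fit within max_sent_length.

def within_max_length(example, max_sent_length=20):
    # keep an example only if source and target are short enough
    return (len(example.src) <= max_sent_length
            and len(example.trg) <= max_sent_length)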
Example 14
def train(cfg_file: str) -> None:
    """
    Main training function. After training, also test on test data if given.

    :param cfg_file: path to configuration yaml file
    """
    cfg = load_config(cfg_file)

    # set the random seed
    set_seed(seed=cfg["training"].get("random_seed", 42))

    # load the data
    train_data, dev_data, test_data, src_vocab, trg_vocab = load_data(
        data_cfg=cfg["data"])

    # build an encoder-decoder model
    model = build_model(cfg["model"], src_vocab=src_vocab, trg_vocab=trg_vocab)

    # for training management, e.g. early stopping and model selection
    trainer = TrainManager(model=model, config=cfg)

    # store copy of original training config in model dir
    shutil.copy2(cfg_file, trainer.model_dir + "/config.yaml")

    # log all entries of config
    log_cfg(cfg, trainer.logger)

    log_data_info(train_data=train_data,
                  valid_data=dev_data,
                  test_data=test_data,
                  src_vocab=src_vocab,
                  trg_vocab=trg_vocab,
                  logging_function=trainer.logger.info)

    # store the vocabs
    src_vocab_file = "{}/src_vocab.txt".format(cfg["training"]["model_dir"])
    src_vocab.to_file(src_vocab_file)
    trg_vocab_file = "{}/trg_vocab.txt".format(cfg["training"]["model_dir"])
    trg_vocab.to_file(trg_vocab_file)

    # train the model
    trainer.train_and_validate(train_data=train_data, valid_data=dev_data)

    # test the model with the best checkpoint
    if test_data is not None:

        # load checkpoint
        if trainer.best_ckpt_iteration > 0:
            checkpoint_path = "{}/{}.ckpt".format(trainer.model_dir,
                                                  trainer.best_ckpt_iteration)
        else:
            # fall back to the latest checkpoint saved by save_freq
            checkpoint_path = get_latest_checkpoint(trainer.model_dir)
        try:
            trainer.init_from_checkpoint(checkpoint_path)
        except AssertionError:
            trainer.logger.warning(
                "Checkpoint %s does not exist. "
                "Skipping testing.", checkpoint_path)
            if trainer.best_ckpt_iteration == 0 \
                and trainer.best_ckpt_score in [np.inf, -np.inf]:
                trainer.logger.warning(
                    "It seems like no checkpoint was written, "
                    "since no improvement was obtained over the initial model."
                )
            return

        # generate hypotheses for test data
        if "testing" in cfg.keys():
            beam_size = cfg["testing"].get("beam_size", 0)
            beam_alpha = cfg["testing"].get("alpha", -1)
            return_logp = cfg["testing"].get("return_logp", False)
        else:
            beam_size = 0
            beam_alpha = -1
            return_logp = False

        # pylint: disable=unused-variable
        score, loss, ppl, sources, sources_raw, references, hypotheses, \
            hypotheses_raw, attention_scores, log_probs = validate_on_data(
                data=test_data, batch_size=trainer.batch_size,
                eval_metric=trainer.eval_metric, level=trainer.level,
                max_output_length=trainer.max_output_length,
                model=model, use_cuda=trainer.use_cuda, loss_function=None,
                beam_size=beam_size, beam_alpha=beam_alpha,
                return_logp=return_logp)

        if "trg" in test_data.fields:
            decoding_description = "Greedy decoding" if beam_size == 0 else \
                "Beam search decoding with beam size = {} and alpha = {}"\
                    .format(beam_size, beam_alpha)
            trainer.logger.info("Test data result: %f %s [%s]", score,
                                trainer.eval_metric, decoding_description)
        else:
            trainer.logger.info(
                "No references given for %s.%s -> no evaluation.",
                cfg["data"]["test"], cfg["data"]["src"])

        output_path_set = "{}/{}.{}".format(trainer.model_dir, "test",
                                            cfg["data"]["trg"])
        with open(output_path_set, mode="w", encoding="utf-8") as f:
            for h in hypotheses:
                f.write("{}\n".format(h))
        trainer.logger.info("Test translations saved to: %s", output_path_set)

        if return_logp:
            output_path_set_logp = output_path_set + ".logp"
            with open(output_path_set_logp, mode="w", encoding="utf-8") as f:
                for log_prob in log_probs:
                    f.write("{}\n".format(log_prob))
            trainer.logger.info("Test log probs saved to: %s",
                                output_path_set_logp)
Example 15
def test(cfg_file,
         ckpt: str,
         output_path: str = None,
         save_attention: bool = False,
         logger: logging.Logger = None) -> None:
    """
    Main test function. Handles loading a model from checkpoint, generating
    translations and storing them and attention plots.

    :param cfg_file: path to configuration file
    :param ckpt: path to checkpoint to load
    :param output_path: path to output
    :param save_attention: whether to save the computed attention weights
    :param logger: log output to this logger (creates new logger if not set)
    """

    if logger is None:
        logger = logging.getLogger(__name__)
        FORMAT = '%(asctime)-15s - %(message)s'
        logging.basicConfig(format=FORMAT)
        logger.setLevel(level=logging.DEBUG)

    cfg = load_config(cfg_file)

    if "test" not in cfg["data"].keys():
        raise ValueError("Test data must be specified in config.")

    # when checkpoint is not specified, take latest (best) from model dir;
    # model_dir and step are also needed below for the attention plot paths
    model_dir = cfg["training"]["model_dir"]
    step = "best"
    if ckpt is None:
        ckpt = get_latest_checkpoint(model_dir)
        if ckpt is None:
            raise FileNotFoundError(
                "No checkpoint found in directory {}.".format(model_dir))
        try:
            step = ckpt.split(model_dir + "/")[1].split(".ckpt")[0]
        except IndexError:
            step = "best"

    batch_size = cfg["training"].get("eval_batch_size",
                                     cfg["training"]["batch_size"])
    batch_type = cfg["training"].get(
        "eval_batch_type", cfg["training"].get("batch_type", "sentence"))
    use_cuda = cfg["training"].get("use_cuda", False)
    level = cfg["data"]["level"]
    eval_metric = cfg["training"]["eval_metric"]
    max_output_length = cfg["training"].get("max_output_length", None)

    # load the data
    _, dev_data, test_data,\
    src_vocab, trg_vocab,\
    _, dev_kb, test_kb,\
    _, dev_kb_lookup, test_kb_lookup, \
    _, dev_kb_lengths, test_kb_lengths,\
    _, dev_kb_truvals, test_kb_truvals, \
    trv_vocab, canon_fun,\
         dev_data_canon, test_data_canon \
        = load_data(
        data_cfg=cfg["data"]
    )

    report_entf1_on_canonicals = cfg["training"].get(
        "report_entf1_on_canonicals", False)

    kb_task = (test_kb is not None)

    data_to_predict = {"dev": dev_data, "test": test_data}

    # load model state from disk
    model_checkpoint = load_checkpoint(ckpt, use_cuda=use_cuda)

    # build model and load parameters into it
    model = build_model(cfg["model"],
                        src_vocab=src_vocab,
                        trg_vocab=trg_vocab,
                        trv_vocab=trv_vocab,
                        canonizer=canon_fun)
    model.load_state_dict(model_checkpoint["model_state"])

    # FIXME for the moment, for testing, try overriding model.canonize with canon_fun from test functions loaded data
    # should hopefully not be an issue with gridsearch results...

    if use_cuda:
        model.cuda()  # move to GPU

    # whether to use beam search for decoding, 0: greedy decoding
    if "testing" in cfg.keys():
        beam_size = cfg["testing"].get("beam_size", 0)
        beam_alpha = cfg["testing"].get("alpha", -1)
    else:
        beam_size = 0
        beam_alpha = -1

    for data_set_name, data_set in data_to_predict.items():

        if data_set_name == "dev":
            kb_info = [
                dev_kb, dev_kb_lookup, dev_kb_lengths, dev_kb_truvals,
                dev_data_canon
            ]
        elif data_set_name == "test":
            kb_info = [
                test_kb, test_kb_lookup, test_kb_lengths, test_kb_truvals,
                test_data_canon
            ]
        else:
            raise ValueError((data_set_name, data_set))

        #pylint: disable=unused-variable
        score, loss, ppl, sources, sources_raw, references, hypotheses, \
        hypotheses_raw, attention_scores, kb_att_scores, ent_f1, ent_mcc = validate_on_data(
            model,
            data=data_set,
            batch_size=batch_size,
            batch_type=batch_type,
            level=level,
            max_output_length=max_output_length,
            eval_metric=eval_metric,
            use_cuda=use_cuda,
            loss_function=None,
            beam_size=beam_size,
            beam_alpha=beam_alpha,
            kb_task = kb_task,
            valid_kb=kb_info[0],
            valid_kb_lkp=kb_info[1],
            valid_kb_lens=kb_info[2],
            valid_kb_truvals=kb_info[3],
            valid_data_canon=kb_info[4],
            report_on_canonicals=report_entf1_on_canonicals
            )
        """
                batch_size=self.eval_batch_size,
                data=valid_data,
                eval_metric=self.eval_metric,
                level=self.level, 
                model=self.model,
                use_cuda=self.use_cuda,
                max_output_length=self.max_output_length,
                loss_function=self.loss,
                beam_size=0,  
                batch_type=self.eval_batch_type,
                kb_task=kb_task,
                valid_kb=valid_kb,
                valid_kb_lkp=valid_kb_lkp,
                valid_kb_lens=valid_kb_lens,
                valid_kb_truvals=valid_kb_truvals
        """
        #pylint: enable=unused-variable

        if "trg" in data_set.fields:
            decoding_description = "Greedy decoding" if beam_size == 0 else \
                "Beam search decoding with beam size = {} and alpha = {}".\
                    format(beam_size, beam_alpha)

            logger.info("%4s %s: %6.2f f1: %6.2f mcc: %6.2f [%s]",
                        data_set_name, eval_metric, score, ent_f1, ent_mcc,
                        decoding_description)
        else:
            logger.info("No references given for %s -> no evaluation.",
                        data_set_name)

        if save_attention:
            if attention_scores:
                attention_name = "{}.{}.att".format(data_set_name, step)
                attention_path = os.path.join(model_dir, attention_name)

                logger.info(
                    "Saving attention plots. This might take a while..")
                store_attention_plots(attentions=attention_scores,
                                      targets=hypotheses_raw,
                                      sources=data_set.src,
                                      indices=range(len(hypotheses)),
                                      output_prefix=attention_path)
                logger.info("Attention plots saved to: %s", attention_path)
            if kb_att_scores:
                kb_att_name = "{}.{}.kbatt".format(data_set_name, step)
                kb_att_path = os.path.join(model_dir, kb_att_name)
                store_attention_plots(
                    attentions=kb_att_scores,
                    targets=hypotheses_raw,
                    sources=list(data_set.kbsrc),  #TODO
                    indices=range(len(hypotheses)),
                    output_prefix=kb_att_path,
                    kb_info=(dev_kb_lookup, dev_kb_lengths,
                             list(data_set.kbtrg)))
                logger.info("KB Attention plots saved to: %s", kb_att_path)

            else:
                logger.warning("Attention scores could not be saved. "
                               "Note that attention scores are not available "
                               "when using beam search. "
                               "Set beam_size to 0 for greedy decoding.")

        if output_path is not None:
            output_path_set = "{}.{}".format(output_path, data_set_name)
            with open(output_path_set, mode="w", encoding="utf-8") as out_file:
                for hyp in hypotheses:
                    out_file.write(hyp + "\n")
            logger.info("Translations saved to: %s", output_path_set)
Example 16
def test(cfg_file,
         ckpt,  # str or list now
         output_path: str = None,
         save_attention: bool = False,
         logger: logging.Logger = None) -> None:
    """
    Main test function. Handles loading a model from checkpoint, generating
    translations and storing them and attention plots.

    :param cfg_file: path to configuration file
    :param ckpt: path to checkpoint to load
    :param output_path: path to output
    :param save_attention: whether to save the computed attention weights
    :param logger: log output to this logger (creates new logger if not set)
    """

    if logger is None:
        logger = logging.getLogger(__name__)
        FORMAT = '%(asctime)-15s - %(message)s'
        logging.basicConfig(format=FORMAT)
        logger.setLevel(level=logging.DEBUG)

    cfg = load_config(cfg_file)
    train_cfg = cfg["training"]
    data_cfg = cfg["data"]
    test_cfg = cfg["testing"]

    if "test" not in data_cfg.keys():
        raise ValueError("Test data must be specified in config.")

    # when checkpoint is not specified, take latest (best) from model dir;
    # model_dir and step are also needed below for the attention plots
    model_dir = train_cfg["model_dir"]
    step = "best"
    if ckpt is None:
        ckpt = get_latest_checkpoint(model_dir)
        if ckpt is None:
            raise FileNotFoundError("No checkpoint found in directory {}."
                                    .format(model_dir))
        try:
            step = ckpt.split(model_dir + "/")[1].split(".ckpt")[0]
        except IndexError:
            step = "best"

    batch_size = train_cfg.get("eval_batch_size", train_cfg["batch_size"])
    batch_type = train_cfg.get("eval_batch_type", train_cfg.get("batch_type", "sentence"))
    use_cuda = train_cfg.get("use_cuda", False)
    src_level = data_cfg.get("src_level", data_cfg.get("level", "word"))
    trg_level = data_cfg.get("trg_level", data_cfg.get("level", "word"))

    eval_metric = train_cfg["eval_metric"]
    if isinstance(eval_metric, str):
        eval_metric = [eval_metric]
    attn_metric = train_cfg.get("attn_metric", [])
    if isinstance(attn_metric, str):
        attn_metric = [attn_metric]
    max_output_length = train_cfg.get("max_output_length", None)

    # load the data
    data = load_data(data_cfg)
    dev_data = data["dev_data"]
    test_data = data["test_data"]
    vocabs = data["vocabs"]

    data_to_predict = {"dev": dev_data, "test": test_data}

    # load model state from disk
    if isinstance(ckpt, str):
        ckpt = [ckpt]
    individual_models = []
    for c in ckpt:
        model_checkpoint = load_checkpoint(c, use_cuda=use_cuda)

        # build model and load parameters into it
        m = build_model(cfg["model"], vocabs=vocabs)
        m.load_state_dict(model_checkpoint["model_state"])
        individual_models.append(m)
    if len(individual_models) == 1:
        model = individual_models[0]
    else:
        model = EnsembleModel(*individual_models)

    if use_cuda:
        model.cuda()

    # whether to use beam search for decoding, 0: greedy decoding
    if "testing" in cfg.keys():
        beam_sizes = test_cfg.get("beam_size", 0)
        beam_alpha = test_cfg.get("alpha", 0)
    else:
        beam_sizes = 0
        beam_alpha = 0
    if isinstance(beam_sizes, int):
        beam_sizes = [beam_sizes]
    assert beam_alpha >= 0, "Use alpha >= 0"

    for beam_size in beam_sizes:
        for data_set_name, data_set in data_to_predict.items():

            #pylint: disable=unused-variable
            scores, sources, sources_raw, references, hypotheses, \
            hypotheses_raw, attention_scores, scores_by_lang, by_lang = validate_on_data(
                model, data=data_set, batch_size=batch_size,
                batch_type=batch_type,
                src_level=src_level, trg_level=trg_level,
                max_output_length=max_output_length, eval_metrics=eval_metric,
                attn_metrics=attn_metric,
                use_cuda=use_cuda, loss_function=None, beam_size=beam_size,
                beam_alpha=beam_alpha, save_attention=save_attention)
            #pylint: enable=unused-variable

            if "trg" in data_set.fields:
                labeled_scores = sorted(scores.items())
                eval_report = ", ".join("{}: {:.5f}".format(n, v)
                                        for n, v in labeled_scores)
                decoding_description = "Greedy decoding" if beam_size == 0 else \
                    "Beam search decoding with beam size = {} and alpha = {}".\
                        format(beam_size, beam_alpha)
                logger.info("%4s %s: [%s]",
                            data_set_name, eval_report, decoding_description)
                if scores_by_lang is not None:
                    for metric, lang_scores in scores_by_lang.items():
                        # make a report
                        lang_report = [metric]
                        numbers = sorted(lang_scores.items())
                        lang_report.extend(["{}: {:.5f}".format(k, v)
                                            for k, v in numbers])

                        logger.info("\n\t".join(lang_report))
            else:
                logger.info("No references given for %s -> no evaluation.",
                            data_set_name)

            if save_attention:
                # currently this will break for transformers
                if attention_scores:
                    #attention_name = "{}.{}.att".format(data_set_name, step)
                    #attention_path = os.path.join(model_dir, attention_name)
                    logger.info("Saving attention plots. This might take a while..")
                    store_attention_plots(attentions=attention_scores,
                                          targets=hypotheses_raw,
                                          sources=[s for s in data_set.src],
                                          indices=range(len(hypotheses)),
                                          model_dir=model_dir,
                                          steps=step,
                                          data_set_name=data_set_name)
                    logger.info("Attention plots saved to: %s", model_dir)
                else:
                    logger.warning("Attention scores could not be saved. "
                                   "Note that attention scores are not available "
                                   "when using beam search. "
                                   "Set beam_size to 0 for greedy decoding.")

            if output_path is not None:
                for lang, ref_and_hyp in by_lang.items():
                    if lang is None:
                        # monolingual case
                        output_path_set = "{}.{}".format(output_path, data_set_name)
                    else:
                        output_path_set = "{}.{}.{}".format(output_path, lang, data_set_name)
                    if isinstance(ref_and_hyp[0], str):
                        hyps = ref_and_hyp
                    else:
                        hyps = [hyp for (ref, hyp) in ref_and_hyp]
                    with open(output_path_set, mode="w", encoding="utf-8") as out_file:
                        for hyp in hyps:
                            out_file.write(hyp + "\n")
                    logger.info("Translations saved to: %s", output_path_set)
Example 17
def test(cfg_file,
         ckpt: str,
         output_path: str = None,
         save_attention: bool = False,
         logger: Logger = None) -> None:
    """
    Main test function. Handles loading a model from checkpoint, generating
    translations and storing them and attention plots.

    :param cfg_file: path to configuration file
    :param ckpt: path to checkpoint to load
    :param output_path: path to output
    :param save_attention: whether to save the computed attention weights
    :param logger: log output to this logger (creates new logger if not set)
    """

    if logger is None:
        logger = make_logger()

    cfg = load_config(cfg_file)

    # when checkpoint is not specified, take latest (best) from model dir
    step = "best"
    model_dir = cfg["training"]["model_dir"]
    if ckpt is None:
        ckpt = get_latest_checkpoint(model_dir)
        if ckpt is None:
            raise FileNotFoundError(
                "No checkpoint found in directory {}.".format(model_dir))
        try:
            step = ckpt.split(model_dir + "/")[1].split(".ckpt")[0]
        except IndexError:
            step = "best"

    architecture = cfg["model"].get("architecture", "encoder-decoder")
    batch_size = cfg["training"].get("eval_batch_size",
                                     cfg["training"]["batch_size"])
    batch_type = cfg["training"].get(
        "eval_batch_type", cfg["training"].get("batch_type", "sentence"))
    use_cuda = cfg["training"].get("use_cuda", False)
    level = cfg["data"]["level"]
    eval_metric = cfg["training"]["eval_metric"]
    max_output_length = cfg["training"].get("max_output_length", None)

    # original encoder-decoder testing
    if architecture == "encoder-decoder":
        if "test" not in cfg["data"].keys():
            raise ValueError("Test data must be specified in config.")
        # load the data
        _, dev_data, test_data, src_vocab, trg_vocab = load_data(
            data_cfg=cfg["data"])
        data_to_predict = {"dev": dev_data, "test": test_data}

        # load model state from disk
        model_checkpoint = load_checkpoint(ckpt, use_cuda=use_cuda)

        # build model and load parameters into it
        model = build_model(cfg["model"],
                            src_vocab=src_vocab,
                            trg_vocab=trg_vocab)
        model.load_state_dict(model_checkpoint["model_state"])

        if use_cuda:
            model.cuda()

        # whether to use beam search for decoding, 0: greedy decoding
        if "testing" in cfg.keys():
            beam_size = cfg["testing"].get("beam_size", 1)
            beam_alpha = cfg["testing"].get("alpha", -1)
            postprocess = cfg["testing"].get("postprocess", True)
        else:
            beam_size = 1
            beam_alpha = -1
            postprocess = True

        for data_set_name, data_set in data_to_predict.items():

            # pylint: disable=unused-variable
            score, loss, ppl, sources, sources_raw, references, hypotheses, \
            hypotheses_raw, attention_scores = validate_on_data(
                model, data=data_set, batch_size=batch_size,
                batch_type=batch_type, level=level,
                max_output_length=max_output_length, eval_metric=eval_metric,
                use_cuda=use_cuda, loss_function=None, beam_size=beam_size,
                beam_alpha=beam_alpha, logger=logger, postprocess=postprocess)
            # pylint: enable=unused-variable

            if "trg" in data_set.fields:
                decoding_description = "Greedy decoding" if beam_size < 2 else \
                    "Beam search decoding with beam size = {} and alpha = {}". \
                        format(beam_size, beam_alpha)
                logger.info("%4s %s: %6.2f [%s]", data_set_name, eval_metric,
                            score, decoding_description)
            else:
                logger.info("No references given for %s -> no evaluation.",
                            data_set_name)

            if save_attention:
                if attention_scores:
                    attention_name = "{}.{}.att".format(data_set_name, step)
                    attention_path = os.path.join(model_dir, attention_name)
                    logger.info(
                        "Saving attention plots. This might take a while..")
                    store_attention_plots(attentions=attention_scores,
                                          targets=hypotheses_raw,
                                          sources=data_set.src,
                                          indices=range(len(hypotheses)),
                                          output_prefix=attention_path)
                    logger.info("Attention plots saved to: %s", attention_path)
                else:
                    logger.warning(
                        "Attention scores could not be saved. "
                        "Note that attention scores are not available "
                        "when using beam search. "
                        "Set beam_size to 1 for greedy decoding.")

            if output_path is not None:
                output_path_set = "{}.{}".format(output_path, data_set_name)
                with open(output_path_set, mode="w",
                          encoding="utf-8") as out_file:
                    for hyp in hypotheses:
                        out_file.write(hyp + "\n")
                logger.info("Translations saved to: %s", output_path_set)
    else:
        # unsupervised NMT testing
        if "src2trg_test" not in cfg["data"].keys(
        ) or "trg2src_test" not in cfg["data"].keys():
            raise ValueError("Test data must be specified in config.")
        # load the data
        _, _, _, _, dev_src2trg, dev_trg2src, test_src2trg, test_trg2src, src_vocab, trg_vocab, _ = \
            load_unsupervised_data(data_cfg=cfg["data"])
        data_to_predict = {
            "src2trg": {
                "dev_src2trg": dev_src2trg,
                "test_src2trg": test_src2trg
            },
            "trg2src": {
                "dev_trg2src": dev_trg2src,
                "test_trg2src": test_trg2src
            }
        }

        # load model state from disk
        model_checkpoint = load_checkpoint(ckpt, use_cuda=use_cuda)

        # build model and load parameters into it
        model = build_model(cfg["model"],
                            src_vocab=src_vocab,
                            trg_vocab=trg_vocab)
        assert isinstance(model, UnsupervisedNMTModel)
        model.src2src_translator.load_state_dict(
            model_checkpoint["src2src_model_state"])
        model.trg2trg_translator.load_state_dict(
            model_checkpoint["trg2trg_model_state"])
        model.src2trg_translator.load_state_dict(
            model_checkpoint["src2trg_model_state"])
        model.trg2src_translator.load_state_dict(
            model_checkpoint["trg2src_model_state"])

        if use_cuda:
            model.src2src_translator.cuda()
            model.trg2trg_translator.cuda()
            model.src2trg_translator.cuda()
            model.trg2src_translator.cuda()

        # whether to use beam search for decoding, 0: greedy decoding
        if "testing" in cfg.keys():
            beam_size = cfg["testing"].get("beam_size", 1)
            beam_alpha = cfg["testing"].get("alpha", -1)
            postprocess = cfg["testing"].get("postprocess", True)
        else:
            beam_size = 1
            beam_alpha = -1
            postprocess = True

        for translation_direction, dataset_dict in data_to_predict.items():
            # choose correct translator
            if translation_direction == "src2trg":
                model_to_use = model.src2trg_translator
            else:
                model_to_use = model.trg2src_translator

            for dataset_name, dataset in dataset_dict.items():
                score, loss, ppl, sources, sources_raw, references, hypotheses, \
                hypotheses_raw, attention_scores = validate_on_data(
                    model_to_use, data=dataset, batch_size=batch_size,
                    batch_type=batch_type, level=level,
                    max_output_length=max_output_length, eval_metric=eval_metric,
                    use_cuda=use_cuda, loss_function=None, beam_size=beam_size,
                    beam_alpha=beam_alpha, logger=logger, postprocess=postprocess)

                if "trg" in dataset.fields:
                    decoding_description = "Greedy decoding" if beam_size < 2 else \
                        "Beam search decoding with beam size = {} and alpha = {}". \
                            format(beam_size, beam_alpha)
                    logger.info("%4s %s: %6.2f [%s]", dataset_name,
                                eval_metric, score, decoding_description)
                else:
                    logger.info("No references given for %s -> no evaluation.",
                                dataset_name)

                if save_attention:
                    if attention_scores:
                        attention_name = "{}.{}.att".format(dataset_name, step)
                        attention_path = os.path.join(model_dir,
                                                      attention_name)
                        logger.info(
                            "Saving attention plots. This might take a while.."
                        )
                        store_attention_plots(attentions=attention_scores,
                                              targets=hypotheses_raw,
                                              sources=dataset.src,
                                              indices=list(
                                                  range(len(hypotheses))),
                                              output_prefix=attention_path)
                        logger.info("Attention plots saved to: %s",
                                    attention_path)
                    else:
                        logger.warning(
                            "Attention scores could not be saved. "
                            "Note that attention scores are not available "
                            "when using beam search. "
                            "Set beam_size to 1 for greedy decoding.")

                if output_path is not None:
                    output_path_set = "{}.{}".format(output_path, dataset_name)
                    with open(output_path_set, mode="w",
                              encoding="utf-8") as out_file:
                        for hyp in hypotheses:
                            out_file.write(hyp + "\n")
                    logger.info("Translations saved to: %s", output_path_set)
Esempio n. 18
0
def Q_learning(cfg_file: str) -> None:
    """
    Main Q-learning training function for the DQN-based NMT setup.
    :param cfg_file: path to configuration yaml file
    """
    cfg = load_config(cfg_file)  # config is a dict
    # make logger
    model_dir = make_model_dir(cfg["training"]["model_dir"],
                               overwrite=cfg["training"].get(
                                   "overwrite", False))
    _ = make_logger(model_dir, mode="train")  # version string returned
    # TODO: save version number in model checkpoints

    # set the random seed
    set_seed(seed=cfg["training"].get("random_seed", 42))

    # load the data
    print("loadding data here")
    train_data, dev_data, test_data, src_vocab, trg_vocab = load_data(
        data_cfg=cfg["data"])
    # The training data is filtered to include sentences up to `max_sent_length`
    #     on source and target side.

    # training config:
    train_config = cfg["training"]
    shuffle = train_config.get("shuffle", True)
    batch_size = train_config["batch_size"]
    mini_BATCH_SIZE = train_config["mini_batch_size"]
    batch_type = train_config.get("batch_type", "sentence")
    outer_epochs = train_config.get("outer_epochs", 10)
    inner_epochs = train_config.get("inner_epochs", 10)
    TARGET_UPDATE = train_config.get("target_update", 10)
    Gamma = train_config.get("Gamma", 0.999)
    use_cuda = train_config["use_cuda"] and torch.cuda.is_available()

    # validation config
    validation_freq = train_config.get("validation_freq", 1000)
    ckpt_queue = queue.Queue(maxsize=train_config.get("keep_last_ckpts", 5))
    eval_batch_size = train_config.get("eval_batch_size", batch_size)
    level = cfg["data"]["level"]

    eval_metric = train_config.get("eval_metric", "bleu")
    n_gpu = torch.cuda.device_count() if use_cuda else 0
    eval_batch_type = train_config.get("eval_batch_type", batch_type)
    # eval options
    test_config = cfg["testing"]
    bpe_type = test_config.get("bpe_type", "subword-nmt")
    sacrebleu = {"remove_whitespace": True, "tokenize": "13a"}
    max_output_length = train_config.get("max_output_length", None)
    minimize_metric = True
    # initialize training statistics
    stats = TrainStatistics(
        steps=0,
        stop=False,
        total_tokens=0,
        best_ckpt_iter=0,
        best_ckpt_score=np.inf if minimize_metric else -np.inf,
        minimize_metric=minimize_metric)

    early_stopping_metric = train_config.get("early_stopping_metric",
                                             "eval_metric")

    if early_stopping_metric in ["ppl", "loss"]:
        stats.minimize_metric = True
        stats.best_ckpt_score = np.inf
    elif early_stopping_metric == "eval_metric":
        if eval_metric in [
                "bleu", "chrf", "token_accuracy", "sequence_accuracy"
        ]:
            stats.minimize_metric = False
            stats.best_ckpt_score = -np.inf

        # eval metric that has to get minimized (not yet implemented)
        else:
            stats.minimize_metric = True

    # data loader (modified from the train_and_validate function):
    # make_data_iter returns a torchtext iterator for a torchtext dataset
    # containing src and optionally trg
    train_iter = make_data_iter(train_data,
                                batch_size=batch_size,
                                batch_type=batch_type,
                                train=True,
                                shuffle=shuffle)

    # initialize the Replay Memory D with capacity N
    memory = ReplayMemory(10000)
    steps_done = 0
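    # A minimal sketch of the ReplayMemory interface assumed here (the real
    # class is imported elsewhere); only .capacity, .memory, .push and
    # .sample are relied on in this function:
    #
    #   import random
    #
    #   class ReplayMemory:
    #       def __init__(self, capacity):
    #           self.capacity = capacity
    #           self.memory = []
    #
    #       def push(self, *args):
    #           if len(self.memory) < self.capacity:
    #               self.memory.append(Transition(*args))
    #
    #       def sample(self, batch_size):
    #           return random.sample(self.memory, batch_size)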

    # initialize two DQN networks
    policy_net = build_model(cfg["model"],
                             src_vocab=src_vocab,
                             trg_vocab=trg_vocab)  # Q_network
    target_net = build_model(cfg["model"],
                             src_vocab=src_vocab,
                             trg_vocab=trg_vocab)  # Q_hat_network
    #logger.info(policy_net.src_vocab.stoi)
    #print("###############trg vocab: ", len(target_net.trg_vocab.stoi))
    #print("trg embed: ", target_net.trg_embed.vocab_size)
    if use_cuda:
        policy_net.cuda()
        target_net.cuda()

    target_net.load_state_dict(policy_net.state_dict())
    # Initialize target net Q_hat with weights equal to policy_net

    target_net.eval()  # target_net does not update its parameters (eval mode)

    # Optimizer
    optimizer = build_optimizer(config=cfg["training"],
                                parameters=policy_net.parameters())
    # Loss function
    mse_loss = torch.nn.MSELoss()

    pad_index = policy_net.pad_index
    # print('!!!'*10, pad_index)

    cross_entropy_loss = XentLoss(pad_index=pad_index)
    policy_net.loss_function = cross_entropy_loss

    # learning rate scheduling
    scheduler, scheduler_step_at = build_scheduler(
        config=train_config,
        scheduler_mode="min" if minimize_metric else "max",
        optimizer=optimizer,
        hidden_size=cfg["model"]["encoder"]["hidden_size"])

    # model parameters
    if "load_model" in train_config.keys():
        load_model_path = train_config["load_model"]
        reset_best_ckpt = train_config.get("reset_best_ckpt", False)
        reset_scheduler = train_config.get("reset_scheduler", False)
        reset_optimizer = train_config.get("reset_optimizer", False)
        reset_iter_state = train_config.get("reset_iter_state", False)

        print('settings', reset_best_ckpt, reset_iter_state, reset_optimizer,
              reset_scheduler)

        logger.info("Loading model from %s", load_model_path)
        model_checkpoint = load_checkpoint(path=load_model_path,
                                           use_cuda=use_cuda)

        # restore model and optimizer parameters
        policy_net.load_state_dict(model_checkpoint["model_state"])

        if not reset_optimizer:
            optimizer.load_state_dict(model_checkpoint["optimizer_state"])
        else:
            logger.info("Reset optimizer.")
        if not reset_scheduler:
            if model_checkpoint["scheduler_state"] is not None and \
                    scheduler is not None:
                scheduler.load_state_dict(model_checkpoint["scheduler_state"])
        else:
            logger.info("Reset scheduler.")

        if not reset_best_ckpt:
            stats.best_ckpt_score = model_checkpoint["best_ckpt_score"]
            stats.best_ckpt_iter = model_checkpoint["best_ckpt_iteration"]
            print('stats.best_ckpt_score', stats.best_ckpt_score)
            print('stats.best_ckpt_iter', stats.best_ckpt_iter)
        else:
            logger.info("Reset tracking of the best checkpoint.")

        if (not reset_iter_state and model_checkpoint.get(
                'train_iter_state', None) is not None):
            train_iter_state = model_checkpoint["train_iter_state"]

        # move parameters to cuda

        target_net.load_state_dict(policy_net.state_dict())
        # Initialize target net Q_hat with weights equal to policy_net

        target_net.eval()

        if use_cuda:
            policy_net.cuda()
            target_net.cuda()

    for i_episode in range(outer_epochs):
        # Outer loop

        # get batch
        for i, batch in enumerate(iter(train_iter)):  # joeynmt training.py 377

            # create a Batch object from torchtext batch
            # ( use class Batch from batch.py)
            # return the sentences same length (with padding) in one batch
            batch = Batch(batch, policy_net.pad_index, use_cuda=use_cuda)
            # we want batch.src and batch.trg;
            # batch.src has shape (batch_size, max_src_length_in_batch)

            # source here is represented by the word index not word embedding.

            encoder_output_batch, _, _, _ = policy_net(
                return_type="encode",
                src=batch.src,
                src_length=batch.src_length,
                src_mask=batch.src_mask,
            )

            trans_output_batch, _ = transformer_greedy(
                src_mask=batch.src_mask,
                max_output_length=max_output_length,
                model=policy_net,
                encoder_output=encoder_output_batch,
                steps_done=steps_done,
                use_cuda=use_cuda)
            #print('steps_done',steps_done)

            steps_done += 1

            #print('trans_output_batch.shape is:', trans_output_batch.shape)
            # batch_size * max_translation_sentence_length
            #print('batch.src', batch.src)
            #print('batch.trg', batch.trg)
            print('batch.trg.shape is:', batch.trg.shape)
            print('trans_output_batch', trans_output_batch)

            # reward_batch: sentence-level BLEU score for each sentence in the batch
            reward_batch = []

            for j in range(int(batch.src.shape[0])):
                all_outputs = [(trans_output_batch[j])[1:]]
                all_ref = [batch.trg[j]]
                sentence_score = calculate_bleu(model=policy_net,
                                                level=level,
                                                raw_hypo=all_outputs,
                                                raw_ref=all_ref)
                reward_batch.append(sentence_score)

            print('reward batch is', reward_batch)
            reward_batch = torch.tensor(reward_batch, dtype=torch.float)

            # reward_batch = bleu(hypotheses, references, tokenize="13a")
            # print('reward_batch.shape', reward_batch.shape)

            # make prefix and push tuples into memory
            push_sample_to_memory(model=policy_net,
                                  level=level,
                                  eos_index=policy_net.eos_index,
                                  memory=memory,
                                  src_batch=batch.src,
                                  trg_batch=batch.trg,
                                  trans_output_batch=trans_output_batch,
                                  reward_batch=reward_batch,
                                  max_output_length=max_output_length)
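            # Presumably one transition is pushed per decoding step of each
            # sentence: (source_sentence, prefix, next_word, reward, finish),
            # matching the Transition fields consumed in the inner loop below.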
            print(memory.capacity, len(memory.memory))

            if len(memory.memory) == memory.capacity:
                # inner loop
                for t in range(inner_epochs):
                    # Sample mini-batch from the memory
                    transitions = memory.sample(mini_BATCH_SIZE)
                    # transitions = [Transition(source_sentence=array([...]), prefix=array([...]),
                    #                           next_word=int, reward=float, finish=bool), ...]
                    # Each Transition is what was pushed into memory for one decoding step of one sentence.
                    mini_batch = Transition(*zip(*transitions))
                    # zip(*transitions) groups the same field across all sampled transitions, so
                    # mini_batch = Transition(source_sentence=(array([...]), ...), prefix=(array([...]), ...),
                    #                         next_word=(int, ...), reward=(float, ...), finish=(bool, ...))
                    # Each field of mini_batch is a tuple of length mini_BATCH_SIZE.
                    #print('mini_batch', mini_batch)

                    # concatenate the sampled next-word indices into one tensor
                    words = []
                    for word in mini_batch.next_word:
                        new_word = word.unsqueeze(0)
                        words.append(new_word)
                    mini_next_word = torch.cat(
                        words)  # shape (mini_BATCH_SIZE,)
                    mini_reward = torch.tensor(
                        mini_batch.reward)  # shape (mini_BATCH_SIZE,)

                    #print('mini_batch.finish', mini_batch.finish)

                    mini_is_eos = torch.Tensor(mini_batch.finish)
                    #print(mini_is_eos)

                    mini_src_length = [
                        len(item) for item in mini_batch.source_sentence
                    ]
                    mini_src_length = torch.Tensor(mini_src_length)

                    mini_src = pad_sequence(mini_batch.source_sentence,
                                            batch_first=True,
                                            padding_value=float(pad_index))
                    # shape (mini_BATCH_SIZE, max_length_src)

                    length_prefix = [len(item) for item in mini_batch.prefix]
                    mini_prefix_length = torch.Tensor(length_prefix)

                    prefix_list = []
                    for prefix_ in mini_batch.prefix:
                        prefix_ = torch.from_numpy(prefix_)
                        prefix_list.append(prefix_)

                    mini_prefix = pad_sequence(prefix_list,
                                               batch_first=True,
                                               padding_value=pad_index)
                    # shape (mini_BATCH_SIZE, max_length_prefix)

                    mini_src_mask = (mini_src != pad_index).unsqueeze(1)
                    mini_trg_mask = (mini_prefix != pad_index).unsqueeze(1)

                    #print('mini_src',  mini_src)
                    #print('mini_src_length', mini_src_length)
                    #print('mini_src_mask', mini_src_mask)
                    #print('mini_prefix', mini_prefix)
                    #print('mini_trg_mask', mini_trg_mask)

                    #print('mini_reward', mini_reward)

                    # max_length_src = torch.max(mini_src_length) #max([len(item) for item in mini_batch.source_sentence])

                    if use_cuda:
                        mini_src = mini_src.cuda()
                        mini_prefix = mini_prefix.cuda()
                        mini_src_mask = mini_src_mask.cuda()
                        mini_src_length = mini_src_length.cuda()
                        mini_trg_mask = mini_trg_mask.cuda()
                        mini_next_word = mini_next_word.cuda()

                    # print(next(policy_net.parameters()).is_cuda)
                    # print(mini_trg_mask.get_device())
                    # calculate the Q_value
                    logits_Q, _, _, _ = policy_net._encode_decode(
                        src=mini_src,
                        trg_input=mini_prefix,
                        src_mask=mini_src_mask,
                        src_length=mini_src_length,
                        trg_mask=mini_trg_mask  # (mini_prefix != pad_index).unsqueeze(1)
                    )
                    #print('mini_prefix_length', mini_prefix_length)

                    #print('logits_Q.shape', logits_Q.shape) # torch.Size([64, 99, 31716])
                    #print('logits_Q', logits_Q)

                    # length_prefix = max([len(item) for item in mini_batch.prefix])
                    # logits_Q shape: batch_size * length of the sentence * total number of words in corpus.
                    logits_Q = logits_Q[range(mini_BATCH_SIZE),
                                        mini_prefix_length.long() - 1, :]
                    #print('logits_Q_.shape', logits_Q.shape) #shape(mini_batch_size, num_words)
                    # logits shape: mini_batch_size * total number of words in corpus
                    Q_value = logits_Q[range(mini_BATCH_SIZE), mini_next_word]
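                    # In DQN terms, as set up here: the state is the pair
                    # (source sentence, target prefix) and the action is the
                    # next target word, so Q_value[k] is the logit of the
                    # chosen next_word at the last real prefix position of
                    # example k, i.e. Q(s_k, a_k).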
                    #print('mini_next_word', mini_next_word)
                    #print("Q_value", Q_value)

                    mini_prefix_add = torch.cat(
                        [mini_prefix, mini_next_word.unsqueeze(1)], dim=1)
                    #print('mini_prefix_add', mini_prefix_add)
                    mini_trg_mask_add = (mini_prefix_add !=
                                         pad_index).unsqueeze(1)
                    #print('mini_trg_mask_add', mini_trg_mask_add)

                    if use_cuda:
                        mini_prefix_add = mini_prefix_add.cuda()
                        mini_trg_mask_add = mini_trg_mask_add.cuda()

                    logits_Q_hat, _, _, _ = target_net._encode_decode(
                        src=mini_src,
                        trg_input=mini_prefix_add,
                        src_mask=mini_src_mask,
                        src_length=mini_src_length,
                        trg_mask=mini_trg_mask_add)
                    #print('mini_prefix_add.shape', mini_prefix_add.shape)
                    #print('logits_Q_hat.shape', logits_Q_hat.shape)
                    #print('mini_prefix_length.long()', mini_prefix_length.long())
                    logits_Q_hat = logits_Q_hat[range(mini_BATCH_SIZE),
                                                mini_prefix_length.long(), :]
                    Q_hat_value, _ = torch.max(logits_Q_hat, dim=1)
                    #print('Q_hat_value', Q_hat_value)

                    if use_cuda:

                        Q_hat_value = Q_hat_value.cuda()
                        mini_reward = mini_reward.cuda()
                        mini_is_eos = mini_is_eos.cuda()

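                    # Standard DQN target: y_j = r_j for terminal transitions
                    # (the prefix already ended with <eos>), otherwise
                    # y_j = r_j + Gamma * max_a Q_hat(s_{j+1}, a).
                    # The next lines compute the non-terminal case and then
                    # overwrite the terminal entries with the plain reward.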
                    yj = mini_reward.float() + Gamma * Q_hat_value
                    #print('yj', yj)
                    index = mini_is_eos.bool()  # boolean mask of finished prefixes
                    #print('mini_is_eos', mini_is_eos)
                    yj[index] = mini_reward[index]
                    #print('yj', yj)
                    #print('Q_value1', Q_value)

                    yj = yj.detach()  # detach() is not in-place; keep the target out of the graph
                    # Optimize the model
                    policy_net.zero_grad()

                    # Compute loss
                    loss = mse_loss(yj, Q_value)
                    print('loss', loss)
                    logger.info("step = {}, loss = {}".format(
                        stats.steps, loss.item()))
                    loss.backward()
                    #for param in policy_net.parameters():
                    #   param.grad.data.clamp_(-1, 1)
                    optimizer.step()

                    stats.steps += 1
                    #print('step', stats.steps)

                    if stats.steps % TARGET_UPDATE == 0:
                        #print('update the parameters in target_net.')
                        target_net.load_state_dict(policy_net.state_dict())

                    if stats.steps % validation_freq == 0:  # Validation
                        print('Start validation')

                        valid_score, valid_loss, valid_ppl, valid_sources, \
                        valid_sources_raw, valid_references, valid_hypotheses, \
                        valid_hypotheses_raw, valid_attention_scores = \
                            validate_on_data(
                                model=policy_net,
                                data=dev_data,
                                batch_size=eval_batch_size,
                                use_cuda=use_cuda,
                                level=level,
                                eval_metric=eval_metric,
                                n_gpu=n_gpu,
                                compute_loss=True,
                                beam_size=1,
                                beam_alpha=-1,
                                batch_type=eval_batch_type,
                                postprocess=True,
                                bpe_type=bpe_type,
                                sacrebleu=sacrebleu,
                                max_output_length=max_output_length
                            )
                        print(
                            'validation_loss: {}, validation_score: {}'.format(
                                valid_loss, valid_score))
                        logger.info("validation loss: %f", valid_loss)
                        print('validation perplexity:', valid_ppl)

                        if early_stopping_metric == "loss":
                            ckpt_score = valid_loss
                        elif early_stopping_metric in ["ppl", "perplexity"]:
                            ckpt_score = valid_ppl
                        else:
                            ckpt_score = valid_score
                        if stats.is_best(ckpt_score):
                            stats.best_ckpt_score = ckpt_score
                            stats.best_ckpt_iter = stats.steps
                            logger.info(
                                'Hooray! New best validation result [%s]!',
                                early_stopping_metric)
                            if ckpt_queue.maxsize > 0:
                                logger.info("Saving new checkpoint.")

                                # adapted from TrainManager._save_checkpoint:
                                """
                                Save the model's current parameters and the training state to a
                                checkpoint.
                                The training state contains the total number of training steps,
                                the total number of training tokens,
                                the best checkpoint score and iteration so far,
                                and optimizer and scheduler states.
                                """
                                model_path = "{}/{}.ckpt".format(
                                    model_dir, stats.steps)
                                model_state_dict = policy_net.module.state_dict() \
                                    if isinstance(policy_net, torch.nn.DataParallel) \
                                    else policy_net.state_dict()
                                state = {
                                    "steps": stats.steps,
                                    "total_tokens": stats.total_tokens,
                                    "best_ckpt_score": stats.best_ckpt_score,
                                    "best_ckpt_iteration":
                                    stats.best_ckpt_iter,
                                    "model_state": model_state_dict,
                                    "optimizer_state": optimizer.state_dict(),
                                    # "scheduler_state": scheduler.state_dict() if
                                    # self.scheduler is not None else None,
                                    # 'amp_state': amp.state_dict() if self.fp16 else None
                                }
                                torch.save(state, model_path)
                                if ckpt_queue.full():
                                    to_delete = ckpt_queue.get(
                                    )  # delete oldest ckpt
                                    try:
                                        os.remove(to_delete)
                                    except FileNotFoundError:
                                        logger.warning(
                                            "Wanted to delete old checkpoint %s but "
                                            "file does not exist.", to_delete)

                                ckpt_queue.put(model_path)

                                best_path = "{}/best.ckpt".format(model_dir)
                                try:
                                    # create/modify symbolic link for best checkpoint
                                    symlink_update(
                                        "{}.ckpt".format(stats.steps),
                                        best_path)
                                except OSError:
                                    # overwrite best.ckpt
                                    torch.save(state, best_path)
Esempio n. 19
0
def test(cfg_file,
         ckpt,
         output_path: str = None,
         save_attention: bool = False,
         logger: logging.Logger = None,
         data_to_test: str = None) -> None:
    """
    Main test function. Handles loading a model from checkpoint, generating
    translations and storing them and attention plots.

    :param cfg_file: path to configuration file
    :param ckpt: path to checkpoint to load
    :param output_path: path to output
    :param save_attention: whether to save the computed attention weights
    :param logger: log output to this logger (creates new logger if not set)
    :param data_to_test: if given, restrict prediction to this dataset ("dev" or "test")
    """

    if logger is None:
        logger = logging.getLogger(__name__)
        FORMAT = '%(asctime)-15s - %(message)s'
        logging.basicConfig(format=FORMAT)
        logger.setLevel(level=logging.DEBUG)

    cfg = load_config(cfg_file)
    train_cfg = cfg["training"]
    data_cfg = cfg["data"]
    test_cfg = cfg["testing"]

    if "test" not in data_cfg.keys():
        raise ValueError("Test data must be specified in config.")

    # when checkpoint is not specified, take latest (best) from model dir
    model_dir = train_cfg["model_dir"]
    if ckpt is None:
        ckpt = get_latest_checkpoint(model_dir)
        if ckpt is None:
            raise FileNotFoundError("No checkpoint at {}.".format(model_dir))
        try:
            step = ckpt.split(model_dir + "/")[1].split(".ckpt")[0]
        except IndexError:
            step = "best"

    batch_size = train_cfg.get("eval_batch_size", train_cfg["batch_size"])
    batch_type = train_cfg.get("batch_type", "sentence")
    use_cuda = train_cfg.get("use_cuda", False)
    assert "level" in data_cfg or "trg_level" in data_cfg
    trg_level = data_cfg.get("level", data_cfg["trg_level"])

    eval_metric = train_cfg["eval_metric"]
    if isinstance(eval_metric, str):
        eval_metric = [eval_metric]
    max_output_length = test_cfg.get("max_output_length",
                                     train_cfg.get("max_output_length", None))

    # load the data
    data = load_data(data_cfg)
    dev_data = data["dev_data"]
    test_data = data["test_data"]
    vocabs = data["vocabs"]

    data_to_predict = {"dev": dev_data, "test": test_data}
    if data_to_test is not None:
        assert data_to_test in data_to_predict
        data_to_predict = {data_to_test: data_to_predict[data_to_test]}

    # load model state from disk
    if isinstance(ckpt, str):
        ckpt = [ckpt]
    models = []
    for c in ckpt:
        model_checkpoint = load_checkpoint(c, use_cuda=use_cuda)

        # build model and load parameters into it
        m = build_model(cfg["model"], vocabs=vocabs)
        m.load_state_dict(model_checkpoint["model_state"])
        models.append(m)
    model = models[0] if len(models) == 1 else EnsembleModel(*models)

    if use_cuda:
        model.cuda()  # should this exist?

    # whether to use beam search for decoding, 0: greedy decoding
    beam_sizes = beam_alpha = 0
    if "testing" in cfg.keys():
        beam_sizes = test_cfg.get("beam_size", 0)
        beam_alpha = test_cfg.get("alpha", 0)
    beam_sizes = [beam_sizes] if isinstance(beam_sizes, int) else beam_sizes
    assert beam_alpha >= 0, "Use alpha >= 0"

    method = test_cfg.get("method", None)
    max_hyps = test_cfg.get("max_hyps", 1)  # only for the enumerate thing

    validate_by_label = test_cfg.get("validate_by_label",
                                     train_cfg.get("validate_by_label", False))
    forced_sparsity = test_cfg.get("forced_sparsity",
                                   train_cfg.get("forced_sparsity", False))

    for beam_size in beam_sizes:
        for data_set_name, data_set in data_to_predict.items():
            valid_results = validate_on_data(
                model,
                data=data_set,
                batch_size=batch_size,
                batch_type=batch_type,
                trg_level=trg_level,
                max_output_length=max_output_length,
                eval_metrics=eval_metric,
                use_cuda=use_cuda,
                loss_function=None,
                beam_size=beam_size,
                beam_alpha=beam_alpha,
                save_attention=save_attention,
                validate_by_label=validate_by_label,
                forced_sparsity=forced_sparsity,
                method=method,
                max_hyps=max_hyps,
                break_at_p=test_cfg.get("break_at_p", 1.0),
                break_at_argmax=test_cfg.get("break_at_argmax", False),
                short_depth=test_cfg.get("short_depth", 0))
            scores = valid_results[0]
            hypotheses, hypotheses_raw = valid_results[2:4]
            scores_by_label = valid_results[5]

            if "trg" in data_set.fields:
                log_scores(logger, data_set_name, scores, scores_by_label,
                           beam_size, beam_alpha)
            else:
                logger.info("No references given for %s -> no evaluation.",
                            data_set_name)

            attention_scores = valid_results[4]
            if save_attention and not attention_scores:
                logger.warning("Attention scores could not be saved. "
                               "Note that attention scores are not "
                               "available when using beam search. "
                               "Set beam_size to 0 for greedy decoding.")
            if save_attention and attention_scores:
                # currently this will break for transformers
                logger.info("Saving attention plots. This might be slow.")
                store_attention_plots(attentions=attention_scores,
                                      targets=hypotheses_raw,
                                      sources=[s for s in data_set.src],
                                      indices=range(len(hypotheses)),
                                      model_dir=model_dir,
                                      steps=step,
                                      data_set_name=data_set_name)
                logger.info("Attention plots saved to: %s", model_dir)

            if output_path is not None:
                output_path_set = "{}.{}".format(output_path, data_set_name)
                with open(output_path_set, mode="w", encoding="utf-8") as outf:
                    for hyp in hypotheses:
                        outf.write(hyp + "\n")
                logger.info("Translations saved to: %s", output_path_set)
Esempio n. 20
0
    def testDataLoading(self):
        # test all combinations of configuration settings
        for test_path in [None, self.test_path]:
            for level in self.levels:
                for lowercase in [True, False]:
                    current_cfg = self.data_cfg.copy()
                    current_cfg["level"] = level
                    current_cfg["lowercase"] = lowercase
                    if test_path is not None:
                        current_cfg["test"] = test_path

                    # load the data
                    data = load_data(current_cfg)
                    train_data = data["train_data"]
                    dev_data = data["dev_data"]
                    test_data = data["test_data"]
                    vocabs = data["vocabs"]
                    src_vocab = vocabs["src"]
                    trg_vocab = vocabs["trg"]

                    self.assertIs(type(train_data), TranslationDataset)
                    self.assertIs(type(dev_data), TranslationDataset)
                    if test_path is not None:
                        # test has no target side
                        self.assertIs(type(test_data), MonoDataset)

                    # check the number of examples loaded
                    # training set is filtered to max_sent_length
                    expected_train_len = 5 if level == "char" else 382
                    expected_testdev_len = 20  # dev and test have the same len
                    self.assertEqual(len(train_data), expected_train_len)
                    self.assertEqual(len(dev_data), expected_testdev_len)
                    if test_path is None:
                        self.assertIsNone(test_data)
                    else:
                        self.assertEqual(len(test_data), expected_testdev_len)

                    # check the segmentation: src and trg attributes are lists
                    for corpus in [train_data, dev_data]:
                        for side in ["src", "trg"]:
                            toks = corpus.examples[0].__dict__[side]
                            self.assertIs(type(toks), list)
                    if test_path is not None:
                        self.assertIs(type(test_data.examples[0].src), list)
                        self.assertFalse(hasattr(test_data.examples[0], "trg"))

                    # check the length filtering of the training examples
                    for side in ["src", "trg"]:
                        self.assertFalse(
                            any(len(ex.__dict__[side]) > self.max_sent_length
                                for ex in train_data.examples)
                        )

                    # check the lowercasing
                    if lowercase:
                        for corpus in [train_data, dev_data]:
                            for side in ["src", "trg"]:
                                self.assertTrue(
                                    all(" ".join(ex.__dict__[side]).islower()
                                        for ex in corpus.examples)
                                )
                        if test_path is not None:
                            self.assertTrue(
                                all(" ".join(ex.src).islower()
                                    for ex in test_data.examples)
                            )

                    # check the first example from the training set
                    expected_srcs = {"char": "Danke.",
                                     "word": "David Gallo: Das ist Bill Lange."
                                             " Ich bin Dave Gallo."}
                    expected_trgs = {"char": "Thank you.",
                                     "word": "David Gallo: This is Bill Lange. "
                                             "I'm Dave Gallo."}
                    exp_src = expected_srcs[level]
                    exp_trg = expected_trgs[level]
                    if lowercase:
                        exp_src = exp_src.lower()
                        exp_trg = exp_trg.lower()
                    if level == "char":
                        comp_src = list(exp_src)
                        comp_trg = list(exp_trg)
                    else:
                        comp_src = exp_src.split()
                        comp_trg = exp_trg.split()
                    self.assertEqual(train_data.examples[0].src, comp_src)
                    self.assertEqual(train_data.examples[0].trg, comp_trg)
Esempio n. 21
0
def test(cfg_file,
         ckpt: str,
         output_path: str = None,
         save_attention: bool = False) -> None:
    """
    Main test function. Handles loading a model from checkpoint, generating
    translations and storing them and attention plots.

    :param cfg_file: path to configuration file
    :param ckpt: path to checkpoint to load
    :param output_path: path to output
    :param save_attention: whether to save the computed attention weights
    """

    cfg = load_config(cfg_file)

    if "test" not in cfg["data"].keys():
        raise ValueError("Test data must be specified in config.")

    # when checkpoint is not specified, take latest (best) from model dir
    if ckpt is None:
        model_dir = cfg["training"]["model_dir"]
        ckpt = get_latest_checkpoint(model_dir)
        if ckpt is None:
            raise FileNotFoundError(
                "No checkpoint found in directory {}.".format(model_dir))
        try:
            step = ckpt.split(model_dir + "/")[1].split(".ckpt")[0]
        except IndexError:
            step = "best"

    batch_size = cfg["training"]["batch_size"]
    use_cuda = cfg["training"].get("use_cuda", False)
    level = cfg["data"]["level"]
    eval_metric = cfg["training"]["eval_metric"]
    max_output_length = cfg["training"].get("max_output_length", None)

    # load the data
    _, dev_data, test_data, src_vocab, trg_vocab = load_data(
        data_cfg=cfg["data"])

    data_to_predict = {"dev": dev_data, "test": test_data}

    # load model state from disk
    model_checkpoint = load_checkpoint(ckpt, use_cuda=use_cuda)

    # build model and load parameters into it
    model = build_model(cfg["model"], src_vocab=src_vocab, trg_vocab=trg_vocab)
    model.load_state_dict(model_checkpoint["model_state"])

    if use_cuda:
        model.cuda()

    # whether to use beam search for decoding, 0: greedy decoding
    if "testing" in cfg.keys():
        beam_size = cfg["testing"].get("beam_size", 0)
        beam_alpha = cfg["testing"].get("alpha", -1)
    else:
        beam_size = 0
        beam_alpha = -1

    for data_set_name, data_set in data_to_predict.items():
        if data_set is None:
            # e.g. no valid_data
            continue

        #pylint: disable=unused-variable
        score, loss, ppl, sources, sources_raw, references, hypotheses, \
        hypotheses_raw, attention_scores, logprobs = validate_on_data(
            model, data=data_set, batch_size=batch_size, level=level,
            max_output_length=max_output_length, eval_metric=eval_metric,
            use_cuda=use_cuda, loss_function=None, beam_size=beam_size,
            beam_alpha=beam_alpha)
        #pylint: enable=unused-variable

        if "trg" in data_set.fields:
            decoding_description = "Greedy decoding" if beam_size == 0 else \
                "Beam search decoding with beam size = {} and alpha = {}".\
                    format(beam_size, beam_alpha)
            print("{:4s} {}: {} [{}]".format(data_set_name, eval_metric, score,
                                             decoding_description))
        else:
            print("No references given for {} -> no evaluation.".format(
                data_set_name))

        if attention_scores is not None and save_attention:
            attention_path = "{}/{}.{}.att".format(model_dir, data_set_name,
                                                   step)
            print("Attention plots saved to: {}.xx".format(attention_path))
            store_attention_plots(attentions=attention_scores,
                                  targets=hypotheses_raw,
                                  sources=[s for s in data_set.src],
                                  indices=range(len(hypotheses)),
                                  output_prefix=attention_path)

        if output_path is not None:
            output_path_set = "{}.{}".format(output_path, data_set_name)
            with open(output_path_set, mode="w", encoding="utf-8") as out_file:
                if cfg["data"].get("post_process", True):
                    for hyp in hypotheses:
                        out_file.write(hyp + "\n")
                else:
                    for hyp in hypotheses_raw:
                        out_file.write(" ".join(hyp) + "\n")
            print("Translations saved to: {}".format(output_path_set))
Esempio n. 22
0
def train_transfer(cfg_file: str) -> None:
    """
    Main training function. After training, also test on test data if given.

    :param cfg_file: path to configuration yaml file
    """
    cfg = load_config(cfg_file)

    # set the random seed
    set_seed(seed=cfg["pretraining"].get("random_seed", 42))

    # load the data
    pre_train_data, pre_dev_data, pre_test_data, pre_src_vocab, pre_trg_vocab = load_data(
        data_cfg=cfg["pretrained_data"])

    # build an encoder-decoder model
    pretrained_model = build_model(cfg["model"],
                                   src_vocab=pre_src_vocab,
                                   trg_vocab=pre_trg_vocab)

    # for training management, e.g. early stopping and model selection
    trainer = TrainManager(model=pretrained_model,
                           config=cfg,
                           training_key="pretraining",
                           name_log="pre_train")

    # store copy of original training config in model dir
    shutil.copy2(cfg_file, trainer.model_dir + "/config.yaml")

    # log all entries of config
    log_cfg(cfg, trainer.logger)

    log_data_info(train_data=pre_train_data,
                  valid_data=pre_dev_data,
                  test_data=pre_test_data,
                  src_vocab=pre_src_vocab,
                  trg_vocab=pre_trg_vocab,
                  logging_function=trainer.logger.info)

    trainer.logger.info(str(pretrained_model))

    # store the vocabs
    src_vocab_file = "{}/src_vocab.txt".format(cfg["pretraining"]["model_dir"])
    pre_src_vocab.to_file(src_vocab_file)
    trg_vocab_file = "{}/trg_vocab.txt".format(cfg["pretraining"]["model_dir"])
    pre_trg_vocab.to_file(trg_vocab_file)

    # train the model
    trainer.train_and_validate(train_data=pre_train_data,
                               valid_data=pre_dev_data)

    # predict with the best model on validation and test
    # (if test data is available)
    ckpt = "{}/{}.ckpt".format(trainer.model_dir, trainer.best_ckpt_iteration)
    output_name = "{:08d}.hyps".format(trainer.best_ckpt_iteration)
    output_path = os.path.join(trainer.model_dir, output_name)
    test(cfg_file,
         ckpt=ckpt,
         output_path=output_path,
         logger=trainer.logger,
         key_training="pretraining",
         key_data="pretrained_data")

    # set the random seed
    set_seed(seed=cfg["training"].get("random_seed", 42))

    # load the data
    train_data, dev_data, test_data, src_vocab, trg_vocab = load_data(
        data_cfg=cfg["data"])

    # build an encoder-decoder model
    model = build_pretrained_model(cfg["model"],
                                   pretrained_model=pretrained_model,
                                   pretrained_src_vocab=pre_src_vocab,
                                   src_vocab=src_vocab,
                                   trg_vocab=trg_vocab)

    # for training management, e.g. early stopping and model selection
    trainer = TrainManager(model=model, config=cfg, training_key="training")

    # store copy of original training config in model dir
    shutil.copy2(cfg_file, trainer.model_dir + "/config.yaml")

    # log all entries of config
    log_cfg(cfg, trainer.logger)

    log_data_info(train_data=train_data,
                  valid_data=dev_data,
                  test_data=test_data,
                  src_vocab=src_vocab,
                  trg_vocab=trg_vocab,
                  logging_function=trainer.logger.info)

    trainer.logger.info(str(model))

    # store the vocabs
    src_vocab_file = "{}/src_vocab.txt".format(cfg["training"]["model_dir"])
    src_vocab.to_file(src_vocab_file)
    trg_vocab_file = "{}/trg_vocab.txt".format(cfg["training"]["model_dir"])
    trg_vocab.to_file(trg_vocab_file)

    # train the model
    trainer.train_and_validate(train_data=train_data, valid_data=dev_data)

    # predict with the best model on validation and test
    # (if test data is available)
    ckpt = "{}/{}.ckpt".format(trainer.model_dir, trainer.best_ckpt_iteration)
    output_name = "{:08d}.hyps".format(trainer.best_ckpt_iteration)
    output_path = os.path.join(trainer.model_dir, output_name)
    test(cfg_file,
         ckpt=ckpt,
         output_path=output_path,
         logger=trainer.logger,
         key_training="training",
         key_data="data")
Esempio n. 23
0
def test(cfg_file,
         ckpt: str,
         output_path: str = None,
         save_attention: bool = False,
         logger: Logger = None) -> None:
    """
    Main test function. Handles loading a model from checkpoint, generating
    translations and storing them and attention plots.

    :param cfg_file: path to configuration file
    :param ckpt: path to checkpoint to load
    :param output_path: path to output
    :param save_attention: whether to save the computed attention weights
    :param logger: log output to this logger (creates new logger if not set)
    """

    if logger is None:
        logger = make_logger()

    cfg = load_config(cfg_file)

    if "test" not in cfg["data"].keys():
        raise ValueError("Test data must be specified in config.")

    # when checkpoint is not specified, take latest (best) from model dir
    if ckpt is None:
        model_dir = cfg["training"]["model_dir"]
        ckpt = get_latest_checkpoint(model_dir)
        if ckpt is None:
            raise FileNotFoundError(
                "No checkpoint found in directory {}.".format(model_dir))
        try:
            step = ckpt.split(model_dir + "/")[1].split(".ckpt")[0]
        except IndexError:
            step = "best"

    batch_size = cfg["training"].get("eval_batch_size",
                                     cfg["training"]["batch_size"])
    batch_type = cfg["training"].get(
        "eval_batch_type", cfg["training"].get("batch_type", "sentence"))
    use_cuda = cfg["training"].get("use_cuda", False)
    level = cfg["data"]["level"]
    eval_metric = cfg["training"]["eval_metric"]
    max_output_length = cfg["training"].get("max_output_length", None)

    # load the data
    _, dev_data, test_data, src_vocab, trg_vocab = load_data(
        data_cfg=cfg["data"])

    data_to_predict = {"dev": dev_data, "test": test_data}

    # load model state from disk
    model_checkpoint = load_checkpoint(ckpt, use_cuda=use_cuda)

    # build model and load parameters into it
    model = build_model(cfg["model"], src_vocab=src_vocab, trg_vocab=trg_vocab)
    model.load_state_dict(model_checkpoint["model_state"])

    if use_cuda:
        model.cuda()

    # whether to use beam search for decoding; beam_size < 2 means greedy decoding
    if "testing" in cfg.keys():
        beam_size = cfg["testing"].get("beam_size", 1)
        beam_alpha = cfg["testing"].get("alpha", -1)
    else:
        beam_size = 1
        beam_alpha = -1

    for data_set_name, data_set in data_to_predict.items():

        #pylint: disable=unused-variable
        score, loss, ppl, sources, sources_raw, references, hypotheses, \
        hypotheses_raw, attention_scores, valid_hypotheses_full_n_best, scores = validate_on_data(
            model, data=data_set, batch_size=batch_size,
            batch_type=batch_type, level=level,
            max_output_length=max_output_length, eval_metric=eval_metric,
            use_cuda=use_cuda, loss_function=None, beam_size=beam_size,
            beam_alpha=beam_alpha, logger=logger)
        #pylint: enable=unused-variable

        if "trg" in data_set.fields:
            decoding_description = "Greedy decoding" if beam_size < 2 else \
                "Beam search decoding with beam size = {} and alpha = {}".\
                    format(beam_size, beam_alpha)
            logger.info("%4s %s: %6.2f [%s]", data_set_name, eval_metric,
                        score, decoding_description)
        else:
            logger.info("No references given for %s -> no evaluation.",
                        data_set_name)

        if save_attention:
            if attention_scores:
                attention_name = "{}.{}.att".format(data_set_name, step)
                attention_path = os.path.join(model_dir, attention_name)
                logger.info(
                    "Saving attention plots. This might take a while..")
                store_attention_plots(attentions=attention_scores,
                                      targets=hypotheses_raw,
                                      sources=data_set.src,
                                      indices=range(len(hypotheses)),
                                      output_prefix=attention_path)
                logger.info("Attention plots saved to: %s", attention_path)
            else:
                logger.warning("Attention scores could not be saved. "
                               "Note that attention scores are not available "
                               "when using beam search. "
                               "Set beam_size to 1 for greedy decoding.")

        if output_path is not None:
            # Disabled plain-text export of hypotheses and n-best lists:
            # output_path_set = "{}.{}".format(output_path, data_set_name)
            # with open(output_path_set, mode="w", encoding="utf-8") as out_file:
            #     for hyp in hypotheses:
            #         out_file.write(hyp + "\n")
            #
            # #sy_debug
            # alt_output = "{}.n_best.{}".format(output_path, data_set_name)
            # with open(alt_output, mode="w", encoding="utf-8") as out_file:
            #     for n in valid_hypotheses_full_n_best:
            #         out_file.write(n + "\n")

            #@Shiya: exporting hypothesis and associated score to .csv file
            #TODO: write_to_csv(hyps,scores)
            def write_to_csv(hyps: list, scores: list):
                import csv

                output_file = "{}.n_csv.{}".format(output_path, data_set_name)
                with open(output_file, mode="w", newline='',
                          encoding="utf-8") as out_file:
                    fieldnames = ['Predictions', 'Scores']
                    writer = csv.DictWriter(out_file, fieldnames=fieldnames)
                    writer.writeheader()

                    for prediction, score in zip(hyps, scores):
                        writer.writerow({
                            fieldnames[0]: prediction,
                            fieldnames[1]: score
                        })

            write_to_csv(valid_hypotheses_full_n_best, scores)
Esempio n. 24
0
def train(cfg_file: str) -> None:
    """
    Main training function. After training, also test on test data if given.

    :param cfg_file: path to configuration yaml file
    """
    cfg = load_config(cfg_file)

    # set the random seed
    set_seed(seed=cfg["training"].get("random_seed", 42))
    shards_dir = os.path.dirname(cfg["data"]["shard_path"])
    if not os.path.exists(shards_dir):
        os.makedirs(shards_dir)

    if cfg["data"].get("shard_data", False):
        assert cfg["data"].get(
            "n_shards", 0) > 0, "n_shards needs to exist and be at least 1"
        shard_data(path=cfg["data"]["train"],
                   src_lang=cfg["data"]["src"],
                   tgt_lang=cfg["data"]["trg"],
                   n_shards=cfg["data"]["n_shards"],
                   shard_path=cfg["data"]["shard_path"])

    # load the data
    load_train_whole = cfg["data"].get("n_shards", 0) < 1
    train_data, dev_data, test_data, src_vocab, trg_vocab, src_field, trg_field = load_data(
        data_cfg=cfg["data"], load_train=load_train_whole)

    if not load_train_whole:
        sharded_iterator = ShardedEpochDatasetIterator(
            n_shards=cfg["data"]["n_shards"],
            percent_to_sample=cfg["data"].get("percent_to_sample_from_shard",
                                              1.0),
            data_path=cfg["data"]["train"],
            shard_path=cfg["data"]["shard_path"],
            extensions=(cfg["data"]["src"], cfg["data"]["trg"]),
            fields=(src_field, trg_field),
            n_epochs=cfg["training"]["epochs"],
            filter_pred=lambda x: len(vars(x)[
                'src']) <= cfg["data"]["max_sent_length"] and len(
                    vars(x)['trg']) <= cfg["data"]["max_sent_length"])
    else:
        sharded_iterator = None

    # build an encoder-decoder model
    model = build_model(cfg["model"], src_vocab=src_vocab, trg_vocab=trg_vocab)

    # for training management, e.g. early stopping and model selection
    trainer = TrainManager(model=model, config=cfg)

    # store copy of original training config in model dir
    shutil.copy2(cfg_file, trainer.model_dir + "/config.yaml")

    # log all entries of config
    log_cfg(cfg, trainer.logger)
    if load_train_whole:
        log_data_info(train_data=train_data,
                      valid_data=dev_data,
                      test_data=test_data,
                      src_vocab=src_vocab,
                      trg_vocab=trg_vocab,
                      logging_function=trainer.logger.info)

    trainer.logger.info(str(model))

    # store the vocabs
    src_vocab_file = "{}/src_vocab.txt".format(cfg["training"]["model_dir"])
    src_vocab.to_file(src_vocab_file)
    trg_vocab_file = "{}/trg_vocab.txt".format(cfg["training"]["model_dir"])
    trg_vocab.to_file(trg_vocab_file)

    # train the model
    trainer.train_and_validate(train_data=train_data,
                               valid_data=dev_data,
                               sharded_iterator=sharded_iterator)

    # predict with the best model on validation and test
    # (if test data is available)
    ckpt = "{}/{}.ckpt".format(trainer.model_dir, trainer.best_ckpt_iteration)
    output_name = "{:08d}.hyps".format(trainer.best_ckpt_iteration)
    output_path = os.path.join(trainer.model_dir, output_name)
    test(cfg_file, ckpt=ckpt, output_path=output_path, logger=trainer.logger)
Esempio n. 25
0
    def __init__(self,
                 cfg_file,
                 ckpt: str,
                 output_path: str = None,
                 logger: Logger = None) -> None:
        """
        Recover the saved model, specified as in configuration.

        :param cfg_file: path to configuration file
        :param ckpt: path to checkpoint to load
        :param output_path: path to output
        :param logger: log output to this logger (creates new logger if not set)
        """

        if logger is None:
            logger = make_logger()

        cfg = load_config(cfg_file)

        if "test" not in cfg["data"].keys():
            raise ValueError("Test data must be specified in config.")

        #print(cfg.keys())
        if "dqn" not in cfg.keys():
            raise ValueError("dqn data must be specified in config.")
        self.model_dir = cfg["training"]["model_dir"]
        # when checkpoint is not specified, take latest (best) from model dir
        if ckpt is None:
            model_dir = cfg["training"]["model_dir"]
            ckpt = get_latest_checkpoint(model_dir)
            if ckpt is None:
                raise FileNotFoundError(
                    "No checkpoint found in directory {}.".format(model_dir))
            try:
                step = ckpt.split(model_dir + "/")[1].split(".ckpt")[0]
            except IndexError:
                step = "best"

        self.batch_size = 1  #**
        self.batch_type = cfg["training"].get(
            "eval_batch_type", cfg["training"].get("batch_type", "sentence"))
        self.use_cuda = cfg["training"].get("use_cuda", False)
        self.level = cfg["data"]["level"]
        self.eval_metric = cfg["training"]["eval_metric"]
        self.max_output_length = cfg["training"].get("max_output_length", None)

        # load the data
        train_data, dev_data, test_data, src_vocab, trg_vocab = load_data(
            data_cfg=cfg["data"])
        #Loading the DQN parameters:
        self.sample_size = cfg["dqn"]["sample_size"]
        self.lr = cfg["dqn"].get("lr", 0.01)
        self.egreed_max = cfg["dqn"].get("egreed_max", 0.9)
        self.egreed_min = cfg["dqn"].get("egreed_min", 0.01)
        self.gamma_max = cfg["dqn"].get("gamma_max", 0.9)
        self.gamma_min = cfg["dqn"].get("gamma_min", 0.5)
        self.nu_iter = cfg["dqn"]["nu_iter"]
        self.mem_cap = cfg["dqn"]["mem_cap"]
        self.beam_min = cfg["dqn"]["beam_min"]
        self.beam_max = cfg["dqn"]["beam_max"]
        self.state_type = cfg["dqn"]["state_type"]

        if self.state_type == 'hidden':
            self.state_size = cfg["model"]["encoder"]["hidden_size"] * 2
        else:
            self.state_size = cfg["model"]["encoder"]["hidden_size"]

        self.actions_size = len(src_vocab)
        self.gamma = None

        print("Sample size: ", self.sample_size)
        print("State size: ", self.state_size)
        print("Action size: ", self.actions_size)
        self.epochs = cfg["dqn"]["epochs"]

        # Init the eval net (Q) and the target net (Q_hat)
        self.eval_net = Net(self.state_size, self.actions_size)
        self.target_net = Net(self.state_size, self.actions_size)
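        # A rough sketch of the assumed Net interface (the actual class is
        # imported elsewhere): a small MLP mapping a state vector to one
        # Q-value per action, e.g.
        #
        #   class Net(nn.Module):
        #       def __init__(self, state_size, actions_size):
        #           super().__init__()
        #           self.layers = nn.Sequential(
        #               nn.Linear(state_size, 512), nn.ReLU(),
        #               nn.Linear(512, actions_size))
        #
        #       def forward(self, state):
        #           return self.layers(state)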

        # Following the DQN algorithm: start the target net as a copy of the eval net
        self.target_net.load_state_dict(self.eval_net.state_dict())

        self.learn_step_counter = 0
        self.memory_counter = 0
        self.size_memory1 = self.state_size * 2 + 2 + 1
        self.memory = np.zeros((self.mem_cap, self.size_memory1))
        self.optimizer = torch.optim.Adam(self.eval_net.parameters(),
                                          lr=self.lr)
        self.loss_func = nn.MSELoss()

        # other parameters
        self.bos_index = trg_vocab.stoi[BOS_TOKEN]
        self.eos_index = trg_vocab.stoi[EOS_TOKEN]
        self.pad_index = trg_vocab.stoi[PAD_TOKEN]

        self.data_to_train_dqn = {"train": train_data}

        #self.data_to_train_dqn = {"test": test_data}
        #self.data_to_dev = {"dev": dev_data}
        self.data_to_dev = {"dev": dev_data}
        #self.data_to_train_dqn = {"train": train_data
        #                          ,"dev": dev_data, "test": test_data}
        # load model state from disk
        model_checkpoint = load_checkpoint(ckpt, use_cuda=self.use_cuda)

        # build model and load parameters into it
        self.model = build_model(cfg["model"],
                                 src_vocab=src_vocab,
                                 trg_vocab=trg_vocab)
        self.model.load_state_dict(model_checkpoint["model_state"])

        if self.use_cuda:
            self.model.cuda()

        # decoding strategy: a beam size of 1 (or smaller) means greedy decoding
        beam_size = 1
        beam_alpha = -1

        # other, less important parameters
        self.index_fin = None
        path_tensorboard = self.model_dir + "/tensorboard_DQN/"
        self.tb_writer = SummaryWriter(log_dir=path_tensorboard, purge_step=0)
        self.dev_network_count = 0
        print(cfg["dqn"]["reward_type"])
        #Reward funtion related:
        if cfg["dqn"]["reward_type"] == "bleu_diff":
            print("You select the reward based on the Bleu score differences")
            self.Reward = self.Reward_bleu_diff
        elif cfg["dqn"]["reward_type"] == "bleu_lin":
            print(
                "You select the reward based on the linear Bleu socres, and several punishments"
            )
            self.Reward = self.Reward_lin
        else:
            print(
                "You select the reward based on the final score on the last state "
            )
            self.Reward = self.Reward_bleu_fin
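    # The three reward functions assigned above (Reward_bleu_diff, Reward_lin,
    # Reward_bleu_fin) are defined elsewhere in this class and are not shown in
    # this example. As a rough, hypothetical sketch of the "bleu_diff" idea,
    # assuming sacrebleu's sentence_bleu and space-joined token lists:
    def Reward_bleu_diff_sketch(self, trg_tokens, hyp_tokens):
        """Reward = gain in sentence BLEU contributed by the last emitted token."""
        from sacrebleu import sentence_bleu
        reference = " ".join(trg_tokens)
        before = sentence_bleu(" ".join(hyp_tokens[:-1]), [reference]).score
        after = sentence_bleu(" ".join(hyp_tokens), [reference]).score
        return after - before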
Esempio n. 26
0
def test(cfg_file,
         ckpt: str,
         batch_class: Batch = Batch,
         output_path: str = None,
         save_attention: bool = False,
         datasets: dict = None) -> None:
    """
    Main test function. Handles loading a model from checkpoint, generating
    translations and storing them and attention plots.

    :param cfg_file: path to configuration file
    :param ckpt: path to checkpoint to load
    :param batch_class: class type of batch
    :param output_path: path to output
    :param save_attention: whether to save the computed attention weights
    :param datasets: datasets to predict
    """

    cfg = load_config(cfg_file)
    model_dir = cfg["training"]["model_dir"]

    if len(logger.handlers) == 0:
        _ = make_logger(model_dir, mode="test")  # version string returned

    # when checkpoint is not specified, take latest (best) from model dir
    if ckpt is None:
        ckpt = get_latest_checkpoint(model_dir)
    # derive the step label for output file names from the checkpoint name
    try:
        step = ckpt.split(model_dir + "/")[1].split(".ckpt")[0]
    except IndexError:
        step = "best"

    # load the data
    if datasets is None:
        _, dev_data, test_data, src_vocab, trg_vocab = load_data(
            data_cfg=cfg["data"], datasets=["dev", "test"])
        data_to_predict = {"dev": dev_data, "test": test_data}
    else:  # avoid loading the data again
        data_to_predict = {"dev": datasets["dev"], "test": datasets["test"]}
        src_vocab = datasets["src_vocab"]
        trg_vocab = datasets["trg_vocab"]

    # parse test args
    batch_size, batch_type, use_cuda, device, n_gpu, level, eval_metric, \
        max_output_length, beam_size, beam_alpha, postprocess, \
        bpe_type, sacrebleu, decoding_description, tokenizer_info \
        = parse_test_args(cfg, mode="test")

    # load model state from disk
    model_checkpoint = load_checkpoint(ckpt, use_cuda=use_cuda)

    # build model and load parameters into it
    model = build_model(cfg["model"], src_vocab=src_vocab, trg_vocab=trg_vocab)
    model.load_state_dict(model_checkpoint["model_state"])

    if use_cuda:
        model.to(device)

    # multi-gpu eval
    if n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = _DataParallel(model)

    for data_set_name, data_set in data_to_predict.items():
        if data_set is None:
            continue

        dataset_file = cfg["data"][data_set_name] + "." + cfg["data"]["trg"]
        logger.info("Decoding on %s set (%s)...", data_set_name, dataset_file)

        #pylint: disable=unused-variable
        score, loss, ppl, sources, sources_raw, references, hypotheses, \
        hypotheses_raw, attention_scores = validate_on_data(
            model, data=data_set, batch_size=batch_size,
            batch_class=batch_class, batch_type=batch_type, level=level,
            max_output_length=max_output_length, eval_metric=eval_metric,
            use_cuda=use_cuda, compute_loss=False, beam_size=beam_size,
            beam_alpha=beam_alpha, postprocess=postprocess,
            bpe_type=bpe_type, sacrebleu=sacrebleu, n_gpu=n_gpu)
        #pylint: enable=unused-variable

        if "trg" in data_set.fields:
            logger.info("%4s %s%s: %6.2f [%s]", data_set_name, eval_metric,
                        tokenizer_info, score, decoding_description)
        else:
            logger.info("No references given for %s -> no evaluation.",
                        data_set_name)

        if save_attention:
            if attention_scores:
                attention_name = "{}.{}.att".format(data_set_name, step)
                attention_path = os.path.join(model_dir, attention_name)
                logger.info(
                    "Saving attention plots. This might take a while...")
                store_attention_plots(attentions=attention_scores,
                                      targets=hypotheses_raw,
                                      sources=data_set.src,
                                      indices=range(len(hypotheses)),
                                      output_prefix=attention_path)
                logger.info("Attention plots saved to: %s", attention_path)
            else:
                logger.warning("Attention scores could not be saved. "
                               "Note that attention scores are not available "
                               "when using beam search. "
                               "Set beam_size to 1 for greedy decoding.")

        if output_path is not None:
            output_path_set = "{}.{}".format(output_path, data_set_name)
            with open(output_path_set, mode="w", encoding="utf-8") as out_file:
                for hyp in hypotheses:
                    out_file.write(hyp + "\n")
            logger.info("Translations saved to: %s", output_path_set)
Esempio n. 27
0
def train(cfg_file: str) -> None:
    """
    Main training function. After training, also test on test data if given.

    :param cfg_file: path to configuration yaml file
    """
    cfg = load_config(cfg_file)

    # make logger
    model_dir = make_model_dir(cfg["training"]["model_dir"],
                               overwrite=cfg["training"].get(
                                   "overwrite", False))
    _ = make_logger(model_dir, mode="train")  # version string returned
    # TODO: save version number in model checkpoints

    # set the random seed
    set_seed(seed=cfg["training"].get("random_seed", 42))

    # load the data
    train_tasks_list = []
    valid_tasks_list = []
    src_tasks = cfg["data"].get("src")
    trg_tasks = cfg["data"].get("trg")
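    # src and trg are assumed to be parallel lists of language codes here, one
    # entry per task, e.g. src: [de, fr] and trg: [en, en] in the data config.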

    for src_lang, trg_lang in zip(src_tasks, trg_tasks):
        train_data, dev_data, _, _, _ = load_data(
            data_cfg=cfg["data"], src_lang=src_lang, trg_lang=trg_lang)
        train_tasks_list.append(train_data)
        valid_tasks_list.append(dev_data)

    # build the vocabularies from the first task's training data
    logger.info("Building vocabulary...")

    src_max_size = cfg["data"].get("src_voc_limit", sys.maxsize)
    src_min_freq = cfg["data"].get("src_voc_min_freq", 1)
    trg_max_size = cfg["data"].get("trg_voc_limit", sys.maxsize)
    trg_min_freq = cfg["data"].get("trg_voc_min_freq", 1)

    src_vocab_file = cfg["data"].get("src_vocab", None)
    trg_vocab_file = cfg["data"].get("trg_vocab", None)

    src_vocab = build_vocab(field="src",
                            min_freq=src_min_freq,
                            max_size=src_max_size,
                            dataset=train_tasks_list[0],
                            vocab_file=src_vocab_file)
    trg_vocab = build_vocab(field="trg",
                            min_freq=trg_min_freq,
                            max_size=trg_max_size,
                            dataset=train_tasks_list[0],
                            vocab_file=trg_vocab_file)

    # build an encoder-decoder model
    model = build_model(cfg["model"], src_vocab=src_vocab, trg_vocab=trg_vocab)

    # for training management, e.g. early stopping and model selection
    trainer = TrainManager(model=model, config=cfg)

    # store copy of original training config in model dir
    shutil.copy2(cfg_file, model_dir + "/config.yaml")

    # log all entries of config
    log_cfg(cfg)

    # log_data_info(train_data=train_data,
    #               valid_data=dev_data,
    #               test_data=test_data,
    #               src_vocab=src_vocab,
    #               trg_vocab=trg_vocab)

    logger.info(str(model))

    # store the vocabs
    src_vocab_file = "{}/src_vocab.txt".format(cfg["training"]["model_dir"])
    src_vocab.to_file(src_vocab_file)
    trg_vocab_file = "{}/trg_vocab.txt".format(cfg["training"]["model_dir"])
    trg_vocab.to_file(trg_vocab_file)

    # train the model
    trainer.maml_train_and_validate(train_tasks=train_tasks_list,
                                    valid_tasks=valid_tasks_list)

    # predict with the best model on validation and test
    # (if test data is available)
    ckpt = "{}/{}.ckpt".format(model_dir, trainer.stats.best_ckpt_iter)
    output_name = "{:08d}.hyps".format(trainer.stats.best_ckpt_iter)
    output_path = os.path.join(model_dir, output_name)