def testRandomSubset(self):
    # only a random subset should be selected for training
    current_cfg = self.data_cfg.copy()
    current_cfg["random_train_subset"] = -1

    # load the data
    train_data, dev_data, test_data, src_vocab, trg_vocab = \
        load_data(current_cfg)
    assert len(train_data) == 382

    current_cfg["random_train_subset"] = 10
    train_data, dev_data, test_data, src_vocab, trg_vocab = \
        load_data(current_cfg)
    assert len(train_data) == 10
def testIteratorBatchType(self):
    current_cfg = self.data_cfg.copy()

    # load toy data
    train_data, dev_data, test_data, src_vocab, trg_vocab = \
        load_data(current_cfg)

    # make batches by number of sentences
    train_iter = iter(
        make_data_iter(train_data, batch_size=10, batch_type="sentence"))
    batch = next(train_iter)

    self.assertEqual(batch.src[0].shape[0], 10)
    self.assertEqual(batch.trg[0].shape[0], 10)

    # make batches by number of tokens
    train_iter = iter(
        make_data_iter(train_data, batch_size=100, batch_type="token"))
    _ = next(train_iter)  # skip a batch
    _ = next(train_iter)  # skip another batch
    batch = next(train_iter)

    self.assertEqual(batch.src[0].shape[0], 8)
    self.assertEqual(np.prod(batch.src[0].shape), 88)
    self.assertLessEqual(np.prod(batch.src[0].shape), 100)
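# Note on batch types (my reading of the test above, not a normative spec):
# with batch_type="sentence", batch_size counts sentences per batch; with
# batch_type="token", sentences are packed until the *padded* token count
# would exceed batch_size, so the number of sentences per batch varies.
# That is why the third token batch above holds 8 sentences padded to
# length 11: np.prod((8, 11)) == 88 <= 100.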
def setUp(self):
    self.train_path = "test/data/toy/train"
    self.dev_path = "test/data/toy/dev"
    self.test_path = "test/data/toy/test"
    self.levels = ["char", "word"]  # bpe is processed the same way as word
    self.max_sent_length = 20

    # minimal data config
    self.data_cfg = {
        "src": "de",
        "trg": "en",
        "train": self.train_path,
        "dev": self.dev_path,
        "level": "char",
        "lowercase": True,
        "max_sent_length": self.max_sent_length
    }

    # load the data
    self.train_data, self.dev_data, self.test_data, src_vocab, trg_vocab = \
        load_data(self.data_cfg)
    self.pad_index = trg_vocab.stoi[PAD_TOKEN]

    # random seeds
    seed = 42
    torch.manual_seed(seed)
    random.seed(seed)
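# For reference, the equivalent "data" section of a YAML config file would
# look roughly like this (a sketch mirroring self.data_cfg above; this is
# the dict that train() passes to load_data as cfg["data"]):
#
#   data:
#       src: "de"
#       trg: "en"
#       train: "test/data/toy/train"
#       dev: "test/data/toy/dev"
#       level: "char"
#       lowercase: True
#       max_sent_length: 20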
def train(cfg_file: str) -> None:
    """
    Main training function. After training, also test on test data if given.

    :param cfg_file: path to configuration yaml file
    """
    cfg = load_config(cfg_file)

    # set the random seed
    set_seed(seed=cfg["training"].get("random_seed", 42))

    # load the data
    if cfg.get("speech", True):
        train_data, dev_data, test_data, src_vocab, trg_vocab = \
            load_audio_data(cfg=cfg)
    else:
        train_data, dev_data, test_data, src_vocab, trg_vocab = load_data(
            data_cfg=cfg["data"])

    # build an encoder-decoder model
    if cfg.get("speech", True):
        model = build_speech_model(cfg["model"],
                                   src_vocab=src_vocab,
                                   trg_vocab=trg_vocab)
    else:
        model = build_model(cfg["model"],
                            src_vocab=src_vocab,
                            trg_vocab=trg_vocab)

    # for training management, e.g. early stopping and model selection
    trainer = TrainManager(model=model, config=cfg)

    # store copy of original training config in model dir
    shutil.copy2(cfg_file, trainer.model_dir + "/config.yaml")

    # log all entries of config
    log_cfg(cfg, trainer.logger)

    log_data_info(train_data=train_data,
                  valid_data=dev_data,
                  test_data=test_data,
                  src_vocab=src_vocab,
                  trg_vocab=trg_vocab,
                  logging_function=trainer.logger.info)

    trainer.logger.info(str(model))

    # store the vocabs
    src_vocab_file = "{}/src_vocab.txt".format(cfg["training"]["model_dir"])
    src_vocab.to_file(src_vocab_file)
    trg_vocab_file = "{}/trg_vocab.txt".format(cfg["training"]["model_dir"])
    trg_vocab.to_file(trg_vocab_file)

    # train the model
    trainer.train_and_validate(train_data=train_data, valid_data=dev_data)

    # predict with the best model on validation and test
    # (if test data is available)
    ckpt = "{}/{}.ckpt".format(trainer.model_dir, trainer.best_ckpt_iteration)
    output_name = "{:08d}.hyps".format(trainer.best_ckpt_iteration)
    output_path = os.path.join(trainer.model_dir, output_name)
    test(cfg_file, ckpt=ckpt, output_path=output_path, logger=trainer.logger)
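# Usage sketch: train() takes a path to a YAML config file, so a minimal CLI
# wrapper could look like this (the argument names here are placeholders,
# not the project's actual entry point):
#
#   if __name__ == "__main__":
#       import argparse
#       parser = argparse.ArgumentParser("training")
#       parser.add_argument("config", type=str, help="path to YAML config")
#       args = parser.parse_args()
#       train(cfg_file=args.config)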
def testDataLoading(self):
    # test all combinations of configuration settings
    for test_path in [None, self.test_path]:
        for level in self.levels:
            for lowercase in [True, False]:
                current_cfg = self.data_cfg.copy()
                current_cfg["level"] = level
                current_cfg["lowercase"] = lowercase
                if test_path is not None:
                    current_cfg["test"] = test_path

                # load the data
                train_data, dev_data, test_data, src_vocab, trg_vocab = \
                    load_data(current_cfg)

                self.assertIs(type(train_data), TranslationDataset)
                self.assertIs(type(dev_data), TranslationDataset)
                if test_path is not None:
                    # test has no target side
                    self.assertIs(type(test_data), MonoDataset)

                # check the number of examples loaded
                if level == "char":
                    # training set is filtered to max_sent_length,
                    # counted in tokens, i.e. in characters here
                    expected_train_len = 5
                else:
                    expected_train_len = 382
                expected_testdev_len = 20  # dev and test have the same length
                self.assertEqual(len(train_data), expected_train_len)
                self.assertEqual(len(dev_data), expected_testdev_len)
                if test_path is None:
                    self.assertIsNone(test_data)
                else:
                    self.assertEqual(len(test_data), expected_testdev_len)

                # check the segmentation: src and trg attributes are lists
                self.assertIs(type(train_data.examples[0].src), list)
                self.assertIs(type(train_data.examples[0].trg), list)
                self.assertIs(type(dev_data.examples[0].src), list)
                self.assertIs(type(dev_data.examples[0].trg), list)
                if test_path is not None:
                    self.assertIs(type(test_data.examples[0].src), list)
                    self.assertFalse(hasattr(test_data.examples[0], "trg"))

                # check the length filtering of the training examples
                self.assertFalse(
                    any(len(ex.src) > self.max_sent_length
                        for ex in train_data.examples))
                self.assertFalse(
                    any(len(ex.trg) > self.max_sent_length
                        for ex in train_data.examples))

                # check the lowercasing
                if lowercase:
                    self.assertTrue(
                        all(" ".join(ex.src).lower() == " ".join(ex.src)
                            for ex in train_data.examples))
                    self.assertTrue(
                        all(" ".join(ex.src).lower() == " ".join(ex.src)
                            for ex in dev_data.examples))
                    self.assertTrue(
                        all(" ".join(ex.trg).lower() == " ".join(ex.trg)
                            for ex in train_data.examples))
                    self.assertTrue(
                        all(" ".join(ex.trg).lower() == " ".join(ex.trg)
                            for ex in dev_data.examples))
                    if test_path is not None:
                        self.assertTrue(
                            all(" ".join(ex.src).lower() == " ".join(ex.src)
                                for ex in test_data.examples))

                # check the first example from the training set
                expected_srcs = {
                    "char": "Danke.",
                    "word": "David Gallo: Das ist Bill Lange."
                            " Ich bin Dave Gallo."
                }
                expected_trgs = {
                    "char": "Thank you.",
                    "word": "David Gallo: This is Bill Lange. "
                            "I'm Dave Gallo."
                }
                src_str = expected_srcs[level]
                trg_str = expected_trgs[level]
                if lowercase:
                    src_str, trg_str = src_str.lower(), trg_str.lower()
                if level == "char":
                    comparison_src = list(src_str)
                    comparison_trg = list(trg_str)
                else:
                    comparison_src = src_str.split()
                    comparison_trg = trg_str.split()
                self.assertEqual(train_data.examples[0].src, comparison_src)
                self.assertEqual(train_data.examples[0].trg, comparison_trg)
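# For instance, at char level with lowercasing, the first training pair
# checked above is segmented into character lists:
#
#   src = ['d', 'a', 'n', 'k', 'e', '.']
#   trg = ['t', 'h', 'a', 'n', 'k', ' ', 'y', 'o', 'u', '.']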
def filter_noise(cfg_file, ckpt: str, output_path: str = None,
                 logger: Logger = None) -> None:
    """
    Computes perplexities on the training data with a model loaded from a
    checkpoint and writes them to a file, e.g. for filtering noisy examples.

    :param cfg_file: path to configuration file
    :param ckpt: path to checkpoint to load
    :param output_path: path to output directory
    :param logger: log output to this logger (creates new logger if not set)
    """
    if logger is None:
        logger = make_logger()

    cfg = load_config(cfg_file)

    # when checkpoint is not specified, take latest (best) from model dir
    if ckpt is None:
        model_dir = cfg["training"]["model_dir"]
        ckpt = get_latest_checkpoint(model_dir)
        if ckpt is None:
            raise FileNotFoundError(
                "No checkpoint found in directory {}.".format(model_dir))
        try:
            step = ckpt.split(model_dir + "/")[1].split(".ckpt")[0]
        except IndexError:
            step = "best"

    use_cuda = cfg["training"].get("use_cuda", False)
    max_output_length = cfg["training"].get("max_output_length", None)

    # load the data
    if cfg.get("speech", True):
        train_data, _, _, src_vocab, trg_vocab = load_audio_data(cfg=cfg)
    else:
        train_data, _, _, src_vocab, trg_vocab = load_data(
            data_cfg=cfg["data"])
    data_to_predict = ("train", train_data)

    # load model state from disk
    model_checkpoint = load_checkpoint(ckpt, use_cuda=use_cuda)

    # build model and load parameters into it
    if cfg.get("speech", True):
        model = build_speech_model(cfg["model"],
                                   src_vocab=src_vocab,
                                   trg_vocab=trg_vocab)
    else:
        model = build_model(cfg["model"],
                            src_vocab=src_vocab,
                            trg_vocab=trg_vocab)
    model.load_state_dict(model_checkpoint["model_state"])

    if use_cuda:
        model.cuda()

    pad_index = model.pad_index
    label_smoothing = 0.0
    loss_function = XentLoss(pad_index=pad_index, smoothing=label_smoothing)

    data_set_name, data_set = data_to_predict

    #pylint: disable=unused-variable
    ppls_list = generate_perplexities_on_data(
        model,
        data=data_set,
        max_output_length=max_output_length,
        use_cuda=use_cuda,
        loss_function=loss_function,
        logger=logger)
    #pylint: enable=unused-variable

    if output_path is None:
        raise ValueError("Output path must be specified.")
    if not os.path.isdir(output_path):
        os.makedirs(output_path)
    output_path_set = os.path.join(output_path,
                                   data_set_name + "_perplexities.txt")
    with open(output_path_set, "w") as outfile:
        outfile.write("\n".join(str(ppls) for ppls in ppls_list))
    logger.info("Perplexities saved to: %s", output_path_set)
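# Usage sketch (paths are placeholders): writes one perplexity per line to
# <output_path>/train_perplexities.txt, which can then be sorted or
# thresholded to drop noisy training pairs.
#
#   filter_noise("configs/my_model.yaml",
#                ckpt=None,  # None -> latest checkpoint in model_dir
#                output_path="noise_filtering")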
def test(cfg_file,
         ckpt: str,
         output_path: str = None,
         save_attention: bool = False,
         logger: Logger = None) -> None:
    """
    Main test function. Handles loading a model from checkpoint, generating
    translations and storing them and attention plots.

    :param cfg_file: path to configuration file
    :param ckpt: path to checkpoint to load
    :param output_path: path to output
    :param save_attention: whether to save the computed attention weights
    :param logger: log output to this logger (creates new logger if not set)
    """
    if logger is None:
        logger = make_logger()

    cfg = load_config(cfg_file)

    if "test" not in cfg["data"].keys():
        raise ValueError("Test data must be specified in config.")

    # model_dir and step are also needed below for saving attention plots,
    # so compute them regardless of whether a checkpoint was given
    model_dir = cfg["training"]["model_dir"]

    # when checkpoint is not specified, take latest (best) from model dir
    if ckpt is None:
        ckpt = get_latest_checkpoint(model_dir)
        if ckpt is None:
            raise FileNotFoundError(
                "No checkpoint found in directory {}.".format(model_dir))

    try:
        step = ckpt.split(model_dir + "/")[1].split(".ckpt")[0]
    except IndexError:
        step = "best"

    batch_size = cfg["training"].get("eval_batch_size",
                                     cfg["training"]["batch_size"])
    batch_type = cfg["training"].get(
        "eval_batch_type", cfg["training"].get("batch_type", "sentence"))
    use_cuda = cfg["training"].get("use_cuda", False)
    level = cfg["data"]["level"]
    eval_metric = cfg["training"]["eval_metric"]
    max_output_length = cfg["training"].get("max_output_length", None)

    # load the data
    if cfg.get("speech", True):
        _, dev_data, test_data, src_vocab, trg_vocab = load_audio_data(
            cfg=cfg)
    else:
        _, dev_data, test_data, src_vocab, trg_vocab = load_data(
            data_cfg=cfg["data"])
    data_to_predict = {"dev": dev_data, "test": test_data}

    # load model state from disk
    model_checkpoint = load_checkpoint(ckpt, use_cuda=use_cuda)

    # build model and load parameters into it
    if cfg.get("speech", True):
        model = build_speech_model(cfg["model"],
                                   src_vocab=src_vocab,
                                   trg_vocab=trg_vocab)
    else:
        model = build_model(cfg["model"],
                            src_vocab=src_vocab,
                            trg_vocab=trg_vocab)
    model.load_state_dict(model_checkpoint["model_state"])

    if use_cuda:
        model.cuda()

    # whether to use beam search for decoding; beam_size < 2: greedy decoding
    if "testing" in cfg.keys():
        beam_size = cfg["testing"].get("beam_size", 1)
        beam_alpha = cfg["testing"].get("alpha", -1)
    else:
        beam_size = 1
        beam_alpha = -1

    for data_set_name, data_set in data_to_predict.items():

        #pylint: disable=unused-variable
        score, loss, ppl, sources, sources_raw, references, hypotheses, \
            hypotheses_raw, attention_scores = validate_on_data(
                model,
                data=data_set,
                batch_size=batch_size,
                batch_type=batch_type,
                level=level,
                max_output_length=max_output_length,
                eval_metric=eval_metric,
                use_cuda=use_cuda,
                loss_function=None,
                beam_size=beam_size,
                beam_alpha=beam_alpha,
                logger=logger)
        #pylint: enable=unused-variable

        if "trg" in data_set.fields:
            decoding_description = "Greedy decoding" if beam_size < 2 else \
                "Beam search decoding with beam size = {} and alpha = {}".\
                format(beam_size, beam_alpha)
            logger.info("%4s %s: %6.4f [%s]", data_set_name, eval_metric,
                        score, decoding_description)
        else:
            logger.info("No references given for %s -> no evaluation.",
                        data_set_name)

        if save_attention:
            if attention_scores:
                attention_name = "{}.{}.att".format(data_set_name, step)
                attention_path = os.path.join(model_dir, attention_name)
                logger.info("Saving attention plots. "
                            "This might take a while..")
                store_attention_plots(attentions=attention_scores,
                                      targets=hypotheses_raw,
                                      sources=data_set.src,
                                      indices=range(len(hypotheses)),
                                      output_prefix=attention_path)
                logger.info("Attention plots saved to: %s", attention_path)
            else:
                logger.warning("Attention scores could not be saved. "
                               "Note that attention scores are not available "
                               "when using beam search. "
                               "Set beam_size to 1 for greedy decoding.")

        if output_path is not None:
            output_path_set = "{}.{}".format(output_path, data_set_name)
            with open(output_path_set, mode="w", encoding="utf-8") \
                    as out_file:
                for hyp in hypotheses:
                    out_file.write(hyp + "\n")
            logger.info("Translations saved to: %s", output_path_set)
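# Usage sketch (paths are placeholders): evaluates a checkpoint on dev and
# test, saves attention plots, and writes hypotheses to out/hyps.dev and
# out/hyps.test.
#
#   test("configs/my_model.yaml",
#        ckpt=None,               # None -> latest checkpoint in model_dir
#        output_path="out/hyps",
#        save_attention=True)     # requires greedy decoding (beam_size 1)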