def test_tied_src_trg_softmax(self):
    # test source embedding, target embedding, and softmax tying
    torch.manual_seed(self.seed)
    cfg = copy.deepcopy(self.cfg)
    cfg["model"]["decoder"]["type"] = "transformer"
    cfg["model"]["tied_embeddings"] = True
    cfg["model"]["tied_softmax"] = True
    cfg["model"]["decoder"]["embeddings"]["embedding_dim"] = 64
    cfg["model"]["encoder"]["embeddings"]["embedding_dim"] = 64

    src_vocab = trg_vocab = self.vocab
    model = build_model(cfg["model"], src_vocab=src_vocab,
                        trg_vocab=trg_vocab)

    src_weight = model.src_embed.lut.weight
    trg_weight = model.trg_embed.lut.weight
    output_weight = model.decoder.output_layer.weight
    self.assertTensorEqual(src_weight, trg_weight)
    self.assertTensorEqual(src_weight, output_weight)
    self.assertEqual(src_weight.shape, trg_weight.shape)
    self.assertEqual(trg_weight.shape, output_weight.shape)

    # writing through one tied weight must be visible through all of them
    output_weight.data.fill_(3.)
    # 6528 = 3 * vocab_size (34) * embedding_dim (64)
    self.assertEqual(output_weight.sum().item(), 6528)
    self.assertEqual(output_weight.sum().item(), src_weight.sum().item())
    self.assertEqual(output_weight.sum().item(), trg_weight.sum().item())
    self.assertEqual(src_weight.sum().item(), trg_weight.sum().item())
def test_tied_embeddings(self):
    torch.manual_seed(self.seed)
    cfg = copy.deepcopy(self.cfg)
    cfg["model"]["tied_embeddings"] = True
    cfg["model"]["tied_softmax"] = False

    src_vocab = trg_vocab = self.vocab
    model = build_model(cfg["model"], src_vocab=src_vocab,
                        trg_vocab=trg_vocab)

    self.assertEqual(src_vocab.itos, trg_vocab.itos)
    self.assertEqual(model.src_embed, model.trg_embed)
    self.assertTensorEqual(model.src_embed.lut.weight,
                           model.trg_embed.lut.weight)
    self.assertEqual(model.src_embed.lut.weight.shape,
                     model.trg_embed.lut.weight.shape)
def test_tied_softmax(self):
    torch.manual_seed(self.seed)
    cfg = copy.deepcopy(self.cfg)
    cfg["model"]["decoder"]["type"] = "transformer"
    cfg["model"]["tied_embeddings"] = False
    cfg["model"]["tied_softmax"] = True
    cfg["model"]["decoder"]["embeddings"]["embedding_dim"] = 64

    src_vocab = trg_vocab = self.vocab
    model = build_model(cfg["model"], src_vocab=src_vocab,
                        trg_vocab=trg_vocab)

    self.assertEqual(model.trg_embed.lut.weight.shape,
                     model.decoder.output_layer.weight.shape)
    self.assertTensorEqual(model.trg_embed.lut.weight,
                           model.decoder.output_layer.weight)
def test_transformer_layer_norm_init(self):
    torch.manual_seed(self.seed)
    cfg = copy.deepcopy(self.cfg)

    src_vocab = trg_vocab = self.vocab
    model = build_model(cfg["model"], src_vocab=src_vocab,
                        trg_vocab=trg_vocab)

    def check_layer_norm(m: nn.Module):
        for name, child in m.named_children():
            if isinstance(child, nn.LayerNorm):
                self.assertTensorEqual(child.weight,
                                       torch.ones([self.hidden_size]))
                self.assertTensorEqual(child.bias,
                                       torch.zeros([self.hidden_size]))
            else:
                check_layer_norm(child)

    check_layer_norm(model)
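# For orientation, a minimal, self-contained sketch of the weight tying that
# the tying tests above verify. The names here are illustrative and not part
# of this codebase; the vocabulary size 34 is inferred from the arithmetic in
# the first test (3 * 34 * 64 = 6528). Tying means the embedding and the
# output projection share a single Parameter object, so a write through one
# is visible through all of them.
def _tying_sketch():
    import torch.nn as nn

    vocab_size, embedding_dim = 34, 64
    embed = nn.Embedding(vocab_size, embedding_dim)
    # nn.Linear(embedding_dim, vocab_size) has weight shape
    # (vocab_size, embedding_dim), identical to the embedding table
    output_layer = nn.Linear(embedding_dim, vocab_size, bias=False)
    output_layer.weight = embed.weight  # tie: same Parameter object

    embed.weight.data.fill_(3.)
    assert output_layer.weight.sum().item() == 3. * vocab_size * embedding_dim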
def train(cfg_file: str) -> None:
    """
    Main training function. After training, also test on test data if given.

    :param cfg_file: path to configuration yaml file
    """
    cfg = load_config(cfg_file)

    # set the random seed
    set_seed(seed=cfg["training"].get("random_seed", 42))

    # load the data
    if cfg.get("speech", True):
        train_data, dev_data, test_data, src_vocab, trg_vocab = \
            load_audio_data(cfg=cfg)
    else:
        train_data, dev_data, test_data, src_vocab, trg_vocab = load_data(
            data_cfg=cfg["data"])

    # build an encoder-decoder model
    if cfg.get("speech", True):
        model = build_speech_model(cfg["model"], src_vocab=src_vocab,
                                   trg_vocab=trg_vocab)
    else:
        model = build_model(cfg["model"], src_vocab=src_vocab,
                            trg_vocab=trg_vocab)

    # for training management, e.g. early stopping and model selection
    trainer = TrainManager(model=model, config=cfg)

    # store copy of original training config in model dir
    shutil.copy2(cfg_file, trainer.model_dir + "/config.yaml")

    # log all entries of config
    log_cfg(cfg, trainer.logger)

    log_data_info(train_data=train_data, valid_data=dev_data,
                  test_data=test_data, src_vocab=src_vocab,
                  trg_vocab=trg_vocab,
                  logging_function=trainer.logger.info)

    trainer.logger.info(str(model))

    # store the vocabs
    src_vocab_file = "{}/src_vocab.txt".format(cfg["training"]["model_dir"])
    src_vocab.to_file(src_vocab_file)
    trg_vocab_file = "{}/trg_vocab.txt".format(cfg["training"]["model_dir"])
    trg_vocab.to_file(trg_vocab_file)

    # train the model
    trainer.train_and_validate(train_data=train_data, valid_data=dev_data)

    # predict with the best model on validation and test
    # (if test data is available)
    ckpt = "{}/{}.ckpt".format(trainer.model_dir,
                               trainer.best_ckpt_iteration)
    output_name = "{:08d}.hyps".format(trainer.best_ckpt_iteration)
    output_path = os.path.join(trainer.model_dir, output_name)
    test(cfg_file, ckpt=ckpt, output_path=output_path, logger=trainer.logger)
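# train() is driven entirely by the YAML config. The skeleton below collects
# only the keys that train() itself reads; the values are illustrative
# placeholders, except random_seed, whose default of 42 appears in the code
# above.
_EXAMPLE_TRAIN_CFG = {
    "speech": True,                     # audio pipeline vs. text pipeline
    "data": {},                         # read by load_data() in text mode
    "model": {},                        # read by build_(speech_)model()
    "training": {
        "random_seed": 42,              # passed to set_seed()
        "model_dir": "models/example",  # vocabs and config copy stored here
    },
}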
def filter_noise(cfg_file, ckpt: str, output_path: str = None,
                 logger: Logger = None) -> None:
    """
    Main noise-filtering function. Handles loading a model from checkpoint,
    computing perplexities on the training data and storing them.

    :param cfg_file: path to configuration file
    :param ckpt: path to checkpoint to load
    :param output_path: path to output
    :param logger: log output to this logger (creates new logger if not set)
    """
    if logger is None:
        logger = make_logger()

    cfg = load_config(cfg_file)

    # when checkpoint is not specified, take latest (best) from model dir
    if ckpt is None:
        model_dir = cfg["training"]["model_dir"]
        ckpt = get_latest_checkpoint(model_dir)
        if ckpt is None:
            raise FileNotFoundError(
                "No checkpoint found in directory {}.".format(model_dir))
        try:
            step = ckpt.split(model_dir + "/")[1].split(".ckpt")[0]
        except IndexError:
            step = "best"

    use_cuda = cfg["training"].get("use_cuda", False)
    max_output_length = cfg["training"].get("max_output_length", None)

    # load the data
    if cfg.get("speech", True):
        train_data, _, _, src_vocab, trg_vocab = load_audio_data(cfg=cfg)
    else:
        train_data, _, _, src_vocab, trg_vocab = load_data(
            data_cfg=cfg["data"])
    data_to_predict = ("train", train_data)

    # load model state from disk
    model_checkpoint = load_checkpoint(ckpt, use_cuda=use_cuda)

    # build model and load parameters into it
    if cfg.get("speech", True):
        model = build_speech_model(cfg["model"], src_vocab=src_vocab,
                                   trg_vocab=trg_vocab)
    else:
        model = build_model(cfg["model"], src_vocab=src_vocab,
                            trg_vocab=trg_vocab)
    model.load_state_dict(model_checkpoint["model_state"])

    if use_cuda:
        model.cuda()

    pad_index = model.pad_index
    label_smoothing = 0.0
    loss_function = XentLoss(pad_index=pad_index, smoothing=label_smoothing)

    data_set_name, data_set = data_to_predict

    # pylint: disable=unused-variable
    ppls_list = generate_perplexities_on_data(
        model, data=data_set, max_output_length=max_output_length,
        use_cuda=use_cuda, loss_function=loss_function, logger=logger)
    # pylint: enable=unused-variable

    if output_path is None:
        raise ValueError("Output path must be specified.")

    if not os.path.isdir(output_path):
        os.makedirs(output_path)
    output_path_set = os.path.join(output_path,
                                   data_set_name + "_perplexities.txt")
    with open(output_path_set, "w") as outfile:
        outfile.write("\n".join(str(ppls) for ppls in ppls_list))
    logger.info("Perplexities saved to: %s", output_path_set)
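# The per-sentence perplexities that generate_perplexities_on_data() produces
# can be sketched as the exponential of the mean token-level cross-entropy.
# This standalone example (assumed shapes and values, independent of the
# codebase) shows that relationship.
def _perplexity_sketch():
    import torch
    import torch.nn.functional as F

    vocab_size, seq_len, pad_index = 34, 7, 1
    logits = torch.randn(seq_len, vocab_size)           # decoder outputs
    # gold token ids; sampled above pad_index for simplicity
    targets = torch.randint(2, vocab_size, (seq_len,))

    # sum of token negative log-likelihoods, ignoring padding positions
    nll = F.cross_entropy(logits, targets, ignore_index=pad_index,
                          reduction="sum")
    num_tokens = (targets != pad_index).sum()
    ppl = torch.exp(nll / num_tokens)  # sentence-level perplexity
    return ppl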
def translate(cfg_file, ckpt: str, output_path: str = None) -> None:
    """
    Interactive translation function.
    Loads model from checkpoint and either translates stdin input or,
    in interactive mode, prompts for input to translate.
    The input has to be pre-processed according to the data that the model
    was trained on, i.e. tokenized or split into subwords.
    Translations are printed to stdout.

    :param cfg_file: path to configuration file
    :param ckpt: path to checkpoint to load
    :param output_path: path to output file
    """

    def _load_line_as_data(line):
        """ Create a dataset from one line via a temporary file. """
        # write src input to temporary file
        tmp_name = "tmp"
        tmp_suffix = ".src"
        tmp_filename = tmp_name + tmp_suffix
        with open(tmp_filename, "w") as tmp_file:
            tmp_file.write("{}\n".format(line))

        test_data = MonoDataset(path=tmp_name, ext=tmp_suffix,
                                field=src_field)

        # remove temporary file
        if os.path.exists(tmp_filename):
            os.remove(tmp_filename)

        return test_data

    cfg = load_config(cfg_file)

    speech_mode = cfg.get("speech", True)
    if speech_mode:
        raise NotImplementedError(
            "Translation mode isn't implemented for speech processing yet.")

    logger = make_logger()

    def _translate_data(test_data):
        """ Translates given dataset, using parameters from outer scope. """
        # pylint: disable=unused-variable
        score, loss, ppl, sources, sources_raw, references, hypotheses, \
            hypotheses_raw, attention_scores = validate_on_data(
                model, data=test_data, batch_size=batch_size,
                batch_type=batch_type, level=level,
                max_output_length=max_output_length, eval_metric="",
                use_cuda=use_cuda, loss_function=None, beam_size=beam_size,
                beam_alpha=beam_alpha, logger=logger)
        return hypotheses

    # when checkpoint is not specified, take latest (best) from model dir
    if ckpt is None:
        model_dir = cfg["training"]["model_dir"]
        ckpt = get_latest_checkpoint(model_dir)

    batch_size = cfg["training"].get(
        "eval_batch_size", cfg["training"].get("batch_size", 1))
    batch_type = cfg["training"].get(
        "eval_batch_type", cfg["training"].get("batch_type", "sentence"))
    use_cuda = cfg["training"].get("use_cuda", False)
    max_output_length = cfg["training"].get("max_output_length", None)

    # read vocabs
    src_vocab_file = cfg["data"].get(
        "src_vocab", cfg["training"]["model_dir"] + "/src_vocab.txt")
    trg_vocab_file = cfg["data"].get(
        "trg_vocab", cfg["training"]["model_dir"] + "/trg_vocab.txt")
    src_vocab = Vocabulary(file=src_vocab_file)
    trg_vocab = Vocabulary(file=trg_vocab_file)

    data_cfg = cfg["data"]
    level = data_cfg["level"]
    lowercase = data_cfg["lowercase"]

    def tok_fun(s):
        return list(s) if level == "char" else s.split()

    src_field = Field(init_token=None, eos_token=EOS_TOKEN,
                      pad_token=PAD_TOKEN, tokenize=tok_fun,
                      batch_first=True, lower=lowercase,
                      unk_token=UNK_TOKEN, include_lengths=True)
    src_field.vocab = src_vocab

    # load model state from disk
    model_checkpoint = load_checkpoint(ckpt, use_cuda=use_cuda)

    # build model and load parameters into it
    model = build_model(cfg["model"], src_vocab=src_vocab,
                        trg_vocab=trg_vocab)
    model.load_state_dict(model_checkpoint["model_state"])

    if use_cuda:
        model.cuda()

    # whether to use beam search for decoding, <2: greedy decoding
    if "testing" in cfg.keys():
        beam_size = cfg["testing"].get("beam_size", 1)
        beam_alpha = cfg["testing"].get("alpha", -1)
    else:
        beam_size = 1
        beam_alpha = -1

    if not sys.stdin.isatty():
        # input stream given (piped stdin)
        test_data = MonoDataset(path=sys.stdin, ext="", field=src_field)
        hypotheses = _translate_data(test_data)

        if output_path is not None:
            # write to output file if given
            output_path_set = "{}".format(output_path)
            with open(output_path_set, mode="w", encoding="utf-8") \
                    as out_file:
                for hyp in hypotheses:
                    out_file.write(hyp + "\n")
            logger.info("Translations saved to: %s.", output_path_set)
        else:
            # print to stdout
            for hyp in hypotheses:
                print(hyp)

    else:
        # enter interactive mode
        batch_size = 1
        batch_type = "sentence"
        while True:
            try:
                src_input = input("\nPlease enter a source sentence "
                                  "(pre-processed): \n")
                if not src_input.strip():
                    break

                # every line has to be made into a dataset
                test_data = _load_line_as_data(line=src_input)

                hypotheses = _translate_data(test_data)
                print("JoeyNMT: {}".format(hypotheses[0]))

            except (KeyboardInterrupt, EOFError):
                print("\nBye.")
                break
def test(cfg_file, ckpt: str, output_path: str = None,
         save_attention: bool = False, logger: Logger = None) -> None:
    """
    Main test function. Handles loading a model from checkpoint, generating
    translations and storing them and attention plots.

    :param cfg_file: path to configuration file
    :param ckpt: path to checkpoint to load
    :param output_path: path to output
    :param save_attention: whether to save the computed attention weights
    :param logger: log output to this logger (creates new logger if not set)
    """
    if logger is None:
        logger = make_logger()

    cfg = load_config(cfg_file)

    if "test" not in cfg["data"].keys():
        raise ValueError("Test data must be specified in config.")

    # when checkpoint is not specified, take latest (best) from model dir;
    # model_dir and step are needed later for the attention plot paths
    model_dir = cfg["training"]["model_dir"]
    if ckpt is None:
        ckpt = get_latest_checkpoint(model_dir)
        if ckpt is None:
            raise FileNotFoundError(
                "No checkpoint found in directory {}.".format(model_dir))
    try:
        step = ckpt.split(model_dir + "/")[1].split(".ckpt")[0]
    except IndexError:
        step = "best"

    batch_size = cfg["training"].get(
        "eval_batch_size", cfg["training"]["batch_size"])
    batch_type = cfg["training"].get(
        "eval_batch_type", cfg["training"].get("batch_type", "sentence"))
    use_cuda = cfg["training"].get("use_cuda", False)
    level = cfg["data"]["level"]
    eval_metric = cfg["training"]["eval_metric"]
    max_output_length = cfg["training"].get("max_output_length", None)

    # load the data
    if cfg.get("speech", True):
        _, dev_data, test_data, src_vocab, trg_vocab = load_audio_data(
            cfg=cfg)
    else:
        _, dev_data, test_data, src_vocab, trg_vocab = load_data(
            data_cfg=cfg["data"])
    data_to_predict = {"dev": dev_data, "test": test_data}

    # load model state from disk
    model_checkpoint = load_checkpoint(ckpt, use_cuda=use_cuda)

    # build model and load parameters into it
    if cfg.get("speech", True):
        model = build_speech_model(cfg["model"], src_vocab=src_vocab,
                                   trg_vocab=trg_vocab)
    else:
        model = build_model(cfg["model"], src_vocab=src_vocab,
                            trg_vocab=trg_vocab)
    model.load_state_dict(model_checkpoint["model_state"])

    if use_cuda:
        model.cuda()

    # whether to use beam search for decoding, <2: greedy decoding
    if "testing" in cfg.keys():
        beam_size = cfg["testing"].get("beam_size", 1)
        beam_alpha = cfg["testing"].get("alpha", -1)
    else:
        beam_size = 1
        beam_alpha = -1

    for data_set_name, data_set in data_to_predict.items():

        # pylint: disable=unused-variable
        score, loss, ppl, sources, sources_raw, references, hypotheses, \
            hypotheses_raw, attention_scores = validate_on_data(
                model, data=data_set, batch_size=batch_size,
                batch_type=batch_type, level=level,
                max_output_length=max_output_length,
                eval_metric=eval_metric, use_cuda=use_cuda,
                loss_function=None, beam_size=beam_size,
                beam_alpha=beam_alpha, logger=logger)
        # pylint: enable=unused-variable

        if "trg" in data_set.fields:
            decoding_description = "Greedy decoding" if beam_size < 2 else \
                "Beam search decoding with beam size = {} and alpha = {}". \
                format(beam_size, beam_alpha)
            logger.info("%4s %s: %6.4f [%s]",
                        data_set_name, eval_metric, score,
                        decoding_description)
        else:
            logger.info("No references given for %s -> no evaluation.",
                        data_set_name)

        if save_attention:
            if attention_scores:
                attention_name = "{}.{}.att".format(data_set_name, step)
                attention_path = os.path.join(model_dir, attention_name)
                logger.info("Saving attention plots. "
                            "This might take a while..")
                store_attention_plots(attentions=attention_scores,
                                      targets=hypotheses_raw,
                                      sources=data_set.src,
                                      indices=range(len(hypotheses)),
                                      output_prefix=attention_path)
                logger.info("Attention plots saved to: %s", attention_path)
            else:
                logger.warning("Attention scores could not be saved. "
                               "Note that attention scores are not available "
                               "when using beam search. "
                               "Set beam_size to 1 for greedy decoding.")

        if output_path is not None:
            output_path_set = "{}.{}".format(output_path, data_set_name)
            with open(output_path_set, mode="w", encoding="utf-8") \
                    as out_file:
                for hyp in hypotheses:
                    out_file.write(hyp + "\n")
            logger.info("Translations saved to: %s", output_path_set)