def _build(self, batch_size):
    src_time_dim = 4
    vocab_size = 7

    emb = Embeddings(embedding_dim=self.emb_size,
                     vocab_size=vocab_size,
                     padding_idx=self.pad_index)

    encoder = RecurrentEncoder(emb_size=self.emb_size,
                               num_layers=self.num_layers,
                               hidden_size=self.encoder_hidden_size,
                               bidirectional=True)

    decoder = RecurrentDecoder(hidden_size=self.hidden_size,
                               encoder=encoder,
                               attention="bahdanau",
                               emb_size=self.emb_size,
                               vocab_size=self.vocab_size,
                               num_layers=self.num_layers,
                               init_hidden="bridge",
                               input_feeding=True)

    encoder_output = torch.rand(size=(batch_size, src_time_dim,
                                      encoder.output_size))

    for p in decoder.parameters():
        torch.nn.init.uniform_(p, -0.5, 0.5)

    src_mask = torch.ones(size=(batch_size, 1, src_time_dim)) == 1
    encoder_hidden = torch.rand(size=(batch_size, encoder.output_size))

    return src_mask, emb, decoder, encoder_output, encoder_hidden
def test_recurrent_freeze(self):
    decoder = RecurrentDecoder(hidden_size=self.hidden_size,
                               encoder=self.encoders[0],
                               attention="bahdanau",
                               emb_size=self.emb_size,
                               vocab_size=self.vocab_size,
                               num_layers=self.num_layers,
                               bridge=False,
                               input_feeding=False,
                               freeze=True)
    for _, p in decoder.named_parameters():
        self.assertFalse(p.requires_grad)
def test_recurrent_decoder_size(self):
    # test all combinations of bridge, input_feeding, encoder directions
    for encoder in self.encoders:
        for bridge in [True, False]:
            for input_feeding in [True, False]:
                decoder = RecurrentDecoder(hidden_size=self.hidden_size,
                                           encoder=encoder,
                                           attention="bahdanau",
                                           emb_size=self.emb_size,
                                           vocab_size=self.vocab_size,
                                           num_layers=self.num_layers,
                                           bridge=bridge,
                                           input_feeding=input_feeding)
                self.assertEqual(decoder.rnn.hidden_size, self.hidden_size)
                self.assertEqual(decoder.att_vector_layer.out_features,
                                 self.hidden_size)
                self.assertEqual(decoder.output_layer.out_features,
                                 self.vocab_size)
                self.assertEqual(decoder.output_size, self.vocab_size)
                self.assertFalse(decoder.rnn.bidirectional)
                if bridge:
                    self.assertTrue(decoder.bridge)
                    self.assertTrue(hasattr(decoder, "bridge_layer"))
                    self.assertEqual(decoder.bridge_layer.out_features,
                                     self.hidden_size)
                else:
                    self.assertFalse(decoder.bridge)
                    self.assertFalse(hasattr(decoder, "bridge_layer"))
                if input_feeding:
                    self.assertEqual(decoder.rnn_input_size,
                                     self.emb_size + self.hidden_size)
                else:
                    self.assertEqual(decoder.rnn_input_size, self.emb_size)
def test_recurrent_decoder_type(self):
    valid_rnn_types = {"gru": GRU, "lstm": LSTM}
    for name, obj in valid_rnn_types.items():
        decoder = RecurrentDecoder(rnn_type=name,
                                   hidden_size=self.hidden_size,
                                   encoder=self.encoders[0],
                                   attention="bahdanau",
                                   emb_size=self.emb_size,
                                   vocab_size=self.vocab_size,
                                   num_layers=self.num_layers,
                                   bridge=False,
                                   input_feeding=False)
        self.assertEqual(type(decoder.rnn), obj)
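# A standalone sketch of the rnn_type dispatch the test above exercises: a
# plain string-to-class mapping over torch.nn.GRU / torch.nn.LSTM. The real
# construction happens inside RecurrentDecoder; this only mirrors the idea.
from torch.nn import GRU, LSTM

rnn_classes = {"gru": GRU, "lstm": LSTM}
rnn = rnn_classes["lstm"](input_size=8, hidden_size=8, batch_first=True)
assert isinstance(rnn, LSTM)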
def build_model(cfg: dict = None,
                src_vocab: Vocabulary = None,
                trg_vocab: Vocabulary = None) -> Model:
    """
    Build and initialize the model according to the configuration.

    :param cfg: dictionary configuration containing model specifications
    :param src_vocab: source vocabulary
    :param trg_vocab: target vocabulary
    :return: built and initialized model
    """
    src_padding_idx = src_vocab.stoi[PAD_TOKEN]
    trg_padding_idx = trg_vocab.stoi[PAD_TOKEN]

    src_embed = Embeddings(**cfg["encoder"]["embeddings"],
                           vocab_size=len(src_vocab),
                           padding_idx=src_padding_idx)

    if cfg.get("tied_embeddings", False):
        if src_vocab.itos == trg_vocab.itos:
            # share embeddings for src and trg
            trg_embed = src_embed
        else:
            raise ConfigurationError(
                "Embedding cannot be tied since vocabularies differ.")
    else:
        trg_embed = Embeddings(**cfg["decoder"]["embeddings"],
                               vocab_size=len(trg_vocab),
                               padding_idx=trg_padding_idx)

    encoder = RecurrentEncoder(**cfg["encoder"],
                               emb_size=src_embed.embedding_dim)
    decoder = RecurrentDecoder(**cfg["decoder"],
                               encoder=encoder,
                               vocab_size=len(trg_vocab),
                               emb_size=trg_embed.embedding_dim)

    model = Model(encoder=encoder, decoder=decoder,
                  src_embed=src_embed, trg_embed=trg_embed,
                  src_vocab=src_vocab, trg_vocab=trg_vocab)

    # custom initialization of model parameters
    initialize_model(model, cfg, src_padding_idx, trg_padding_idx)

    return model
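# A hedged usage sketch for build_model above. The nested dicts are unpacked
# directly into Embeddings / RecurrentEncoder / RecurrentDecoder, so the exact
# keys depend on those constructors; the values below are illustrative
# assumptions, not canonical defaults.
example_cfg = {
    "tied_embeddings": False,
    "encoder": {
        "embeddings": {"embedding_dim": 16},
        "hidden_size": 32,
        "num_layers": 1,
        "bidirectional": True,
    },
    "decoder": {
        "embeddings": {"embedding_dim": 16},
        "hidden_size": 32,
        "num_layers": 1,
        "attention": "bahdanau",
    },
}
# model = build_model(example_cfg, src_vocab=src_vocab, trg_vocab=trg_vocab)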
def test_recurrent_decoder_size(self):
    # test all combinations of init_hidden options, input_feeding,
    # and encoder directions
    for encoder in self.encoders:
        for init_hidden in ["bridge", "zero", "last"]:
            for input_feeding in [True, False]:
                decoder = RecurrentDecoder(
                    hidden_size=self.hidden_size,
                    encoder_output_size=encoder.output_size,
                    attention="bahdanau",
                    emb_size=self.emb_size,
                    vocab_size=self.vocab_size,
                    num_layers=self.num_layers,
                    init_hidden=init_hidden,
                    input_feeding=input_feeding)
                self.assertEqual(decoder.rnn.hidden_size, self.hidden_size)
                self.assertEqual(decoder.att_vector_layer.out_features,
                                 self.hidden_size)
                self.assertEqual(decoder.output_layers["vocab"].out_features,
                                 self.vocab_size)
                self.assertEqual(decoder.vocab_size, self.vocab_size)
                self.assertFalse(decoder.rnn.bidirectional)
                self.assertEqual(decoder.init_hidden_option, init_hidden)
                if init_hidden == "bridge":
                    self.assertTrue(hasattr(decoder, "bridge_layer"))
                    self.assertEqual(decoder.bridge_layer[0].out_features,
                                     self.hidden_size)
                    self.assertEqual(decoder.bridge_layer[0].in_features,
                                     encoder.output_size)
                else:
                    self.assertIsNone(decoder.bridge_layer)
                if input_feeding:
                    self.assertEqual(decoder.rnn_input_size,
                                     self.emb_size + self.hidden_size)
                else:
                    self.assertEqual(decoder.rnn_input_size, self.emb_size)
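# What init_hidden="bridge" means here, in isolation (a sketch, not the
# project's exact module): the last encoder state is projected to the decoder's
# hidden size by a learned linear layer, typically followed by a tanh.
import torch

enc_out_size, dec_hidden_size = 12, 6
bridge = torch.nn.Sequential(torch.nn.Linear(enc_out_size, dec_hidden_size),
                             torch.nn.Tanh())
encoder_hidden = torch.rand(2, enc_out_size)       # (batch, encoder output)
decoder_init = bridge(encoder_hidden)              # (batch, decoder hidden)
assert decoder_init.shape == (2, dec_hidden_size)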
def build_model(cfg: dict = None,
                src_vocab: Vocabulary = None,
                trg_vocab: Vocabulary = None):
    src_padding_idx = src_vocab.stoi[PAD_TOKEN]
    trg_padding_idx = trg_vocab.stoi[PAD_TOKEN]

    src_embed = Embeddings(**cfg["encoder"]["embeddings"],
                           vocab_size=len(src_vocab),
                           padding_idx=src_padding_idx)

    if cfg.get("tied_embeddings", False) \
            and src_vocab.itos == trg_vocab.itos:
        # share embeddings for src and trg
        trg_embed = src_embed
    else:
        trg_embed = Embeddings(**cfg["decoder"]["embeddings"],
                               vocab_size=len(trg_vocab),
                               padding_idx=trg_padding_idx)

    encoder = RecurrentEncoder(**cfg["encoder"],
                               emb_size=src_embed.embedding_dim)
    decoder = RecurrentDecoder(**cfg["decoder"],
                               encoder=encoder,
                               vocab_size=len(trg_vocab),
                               emb_size=trg_embed.embedding_dim)

    model = Model(encoder=encoder, decoder=decoder,
                  src_embed=src_embed, trg_embed=trg_embed,
                  src_vocab=src_vocab, trg_vocab=trg_vocab)

    # custom initialization of model parameters
    initialize_model(model, cfg, src_padding_idx, trg_padding_idx)

    return model
def test_recurrent_input_dropout(self):
    drop_prob = 0.5
    decoder = RecurrentDecoder(hidden_size=self.hidden_size,
                               encoder=self.encoders[0],
                               attention="bahdanau",
                               emb_size=self.emb_size,
                               vocab_size=self.vocab_size,
                               num_layers=self.num_layers,
                               bridge=False,
                               input_feeding=False,
                               dropout=drop_prob)
    input_tensor = torch.Tensor([2, 3, 1, -1])
    decoder.train()
    dropped = decoder.rnn_input_dropout(input=input_tensor)
    # eval switches off dropout
    decoder.eval()
    no_drop = decoder.rnn_input_dropout(input=input_tensor)
    # when dropout is applied, the surviving values are scaled
    # by 1 / (1 - drop_prob) (inverted dropout)
    self.assertGreaterEqual(
        (no_drop - ((1 - drop_prob) * dropped)).abs().sum(), 0)

    drop_prob = 1.0
    decoder = RecurrentDecoder(hidden_size=self.hidden_size,
                               encoder=self.encoders[0],
                               attention="bahdanau",
                               emb_size=self.emb_size,
                               vocab_size=self.vocab_size,
                               num_layers=self.num_layers,
                               bridge=False,
                               input_feeding=False,
                               dropout=drop_prob)
    all_dropped = decoder.rnn_input_dropout(input=input_tensor)
    self.assertEqual(all_dropped.sum(), 0)
    decoder.eval()
    none_dropped = decoder.rnn_input_dropout(input=input_tensor)
    self.assertTensorEqual(no_drop, none_dropped)
    self.assertTensorEqual((no_drop - all_dropped), no_drop)
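# Standalone illustration of the inverted-dropout behaviour the test above
# relies on (plain PyTorch, nothing project-specific): in train mode,
# nn.Dropout(p) zeroes entries with probability p and scales the survivors by
# 1 / (1 - p); in eval mode it is the identity.
import torch

drop = torch.nn.Dropout(p=0.5)
x = torch.ones(1000)

drop.train()
y = drop(x)                            # entries are either 0.0 or 2.0
assert set(y.unique().tolist()) <= {0.0, 2.0}

drop.eval()
assert torch.equal(drop(x), x)         # dropout disabled at evaluation time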
def test_recurrent_forward(self):
    time_dim = 4
    batch_size = 2
    # make sure the outputs match the targets
    decoder = RecurrentDecoder(hidden_size=self.hidden_size,
                               encoder=self.encoders[0],
                               attention="bahdanau",
                               emb_size=self.emb_size,
                               vocab_size=self.vocab_size,
                               num_layers=self.num_layers,
                               bridge=False,
                               input_feeding=False)
    encoder_states = torch.rand(size=(batch_size, time_dim,
                                      self.encoders[0].output_size))
    trg_inputs = torch.ones(size=(batch_size, time_dim, self.emb_size))
    # no padding, no mask
    #x_length = torch.Tensor([time_dim]*batch_size).int()
    mask = torch.ones(size=(batch_size, 1, time_dim)).byte()
    output, hidden, att_probs, att_vectors = decoder(
        trg_inputs,
        encoder_hidden=encoder_states[:, -1, :],
        encoder_output=encoder_states,
        src_mask=mask,
        unroll_steps=time_dim,
        hidden=None,
        prev_att_vector=None)
    self.assertEqual(output.shape,
                     torch.Size([batch_size, time_dim, self.vocab_size]))
    self.assertEqual(hidden.shape,
                     torch.Size([self.num_layers, batch_size,
                                 self.hidden_size]))
    self.assertEqual(att_probs.shape,
                     torch.Size([batch_size, time_dim, time_dim]))
    self.assertEqual(att_vectors.shape,
                     torch.Size([batch_size, time_dim, self.hidden_size]))
    hidden_target = torch.Tensor(
        [[[0.5977, -0.2173, 0.0900, 0.8608, -0.3638, 0.5332, -0.5538],
          [0.5977, -0.2173, 0.0900, 0.8608, -0.3638, 0.5332, -0.5538]],
         [[-0.2767, 0.4492, -0.0656, -0.2800, 0.2594, 0.1410, 0.0101],
          [-0.2767, 0.4492, -0.0656, -0.2800, 0.2594, 0.1410, 0.0101]],
         [[0.2118, 0.2190, -0.0875, 0.2177, -0.0771, -0.1014, 0.0055],
          [0.2118, 0.2190, -0.0875, 0.2177, -0.0771, -0.1014, 0.0055]]])
    output_target = torch.Tensor(
        [[[-0.2888, 0.1992, -0.1638, 0.1031, 0.3977],
          [-0.2917, 0.1922, -0.1755, 0.1093, 0.3963],
          [-0.2938, 0.1892, -0.1868, 0.1132, 0.3986],
          [-0.2946, 0.1885, -0.1964, 0.1155, 0.4019]],
         [[-0.3103, 0.2316, -0.1540, 0.0833, 0.4444],
          [-0.3133, 0.2251, -0.1653, 0.0898, 0.4433],
          [-0.3153, 0.2223, -0.1763, 0.0939, 0.4458],
          [-0.3160, 0.2217, -0.1856, 0.0963, 0.4492]]])
    att_vectors_target = torch.Tensor(
        [[[-0.4831, 0.4514, 0.2072, -0.0963, -0.3155, 0.3777, 0.1536],
          [-0.4914, 0.4421, 0.1905, -0.1247, -0.3248, 0.3846, 0.1703],
          [-0.5011, 0.4363, 0.1793, -0.1462, -0.3347, 0.3919, 0.1790],
          [-0.5102, 0.4326, 0.1715, -0.1623, -0.3442, 0.3969, 0.1827]],
         [[-0.5211, 0.5055, 0.2877, 0.0200, -0.3148, 0.4124, 0.1030],
          [-0.5291, 0.4968, 0.2718, -0.0086, -0.3241, 0.4191, 0.1200],
          [-0.5384, 0.4913, 0.2610, -0.0304, -0.3340, 0.4263, 0.1288],
          [-0.5471, 0.4879, 0.2536, -0.0467, -0.3435, 0.4311, 0.1325]]])
    self.assertTensorAlmostEqual(hidden_target, hidden)
    self.assertTensorAlmostEqual(output_target, output)
    self.assertTensorAlmostEqual(att_vectors, att_vectors_target)
    # att_probs should be a distribution over the source positions
    self.assertTensorAlmostEqual(att_probs.sum(2),
                                 torch.ones(batch_size, time_dim))
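# Why att_probs.sum(2) must be all ones: attention scores are normalised with a
# softmax over the source time axis, so every decoder step carries a probability
# distribution over source positions. A minimal, framework-only illustration:
import torch

scores = torch.randn(2, 4, 4)               # (batch, trg_time, src_time)
att_probs = torch.softmax(scores, dim=-1)   # normalise over src_time
assert torch.allclose(att_probs.sum(-1), torch.ones(2, 4))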
def build_model(cfg: dict = None,
                src_vocab: Vocabulary = None,
                trg_vocab: Vocabulary = None) -> Model:
    """
    Build and initialize the model according to the configuration.

    :param cfg: dictionary configuration containing model specifications
    :param src_vocab: source vocabulary
    :param trg_vocab: target vocabulary
    :return: built and initialized model
    """
    src_padding_idx = src_vocab.stoi[PAD_TOKEN]
    trg_padding_idx = trg_vocab.stoi[PAD_TOKEN]

    # TODO if continue-us
    src_embed = PretrainedEmbeddings(src_vocab, trg_vocab,
                                     **cfg["encoder"]["embeddings"],
                                     vocab_size=len(src_vocab),
                                     padding_idx=src_padding_idx)

    # this ties source and target embeddings
    # for softmax layer tying, see further below
    if cfg.get("tied_embeddings", False):
        if src_vocab.itos == trg_vocab.itos:
            # share embeddings for src and trg
            trg_embed = src_embed
        else:
            raise ConfigurationError(
                "Embedding cannot be tied since vocabularies differ.")
    else:
        trg_embed = PretrainedEmbeddings(src_vocab, trg_vocab,
                                         **cfg["decoder"]["embeddings"],
                                         vocab_size=len(trg_vocab),
                                         padding_idx=trg_padding_idx)

    # build encoder
    enc_dropout = cfg["encoder"].get("dropout", 0.)
    enc_emb_dropout = cfg["encoder"]["embeddings"].get("dropout", enc_dropout)
    if cfg["encoder"].get("type", "recurrent") == "transformer":
        assert cfg["encoder"]["embeddings"]["embedding_dim"] == \
               cfg["encoder"]["hidden_size"], \
               "for transformer, emb_size must be hidden_size"
        encoder = TransformerEncoder(**cfg["encoder"],
                                     emb_size=src_embed.embedding_dim,
                                     emb_dropout=enc_emb_dropout)
    else:
        encoder = RecurrentEncoder(**cfg["encoder"],
                                   emb_size=src_embed.embedding_dim,
                                   emb_dropout=enc_emb_dropout)

    # build decoder
    dec_dropout = cfg["decoder"].get("dropout", 0.)
    dec_emb_dropout = cfg["decoder"]["embeddings"].get("dropout", dec_dropout)
    if cfg["decoder"].get("type", "recurrent") == "transformer":
        decoder = TransformerDecoder(**cfg["decoder"],
                                     encoder=encoder,
                                     vocab_size=len(trg_vocab),
                                     emb_size=trg_embed.embedding_dim,
                                     emb_dropout=dec_emb_dropout)
    else:
        decoder = RecurrentDecoder(**cfg["decoder"],
                                   encoder=encoder,
                                   vocab_size=len(trg_vocab),
                                   emb_size=trg_embed.embedding_dim,
                                   emb_dropout=dec_emb_dropout)

    model = Model(encoder=encoder, decoder=decoder,
                  src_embed=src_embed, trg_embed=trg_embed,
                  src_vocab=src_vocab, trg_vocab=trg_vocab)

    # tie softmax layer with trg embeddings
    """
    if cfg.get("tied_softmax", False):
        if trg_embed.lut.weight.shape == \
                model.decoder.output_layer.weight.shape:
            # (also) share trg embeddings and softmax layer:
            model.decoder.output_layer.weight = trg_embed.lut.weight
        else:
            raise ConfigurationError(
                "For tied_softmax, the decoder embedding_dim and decoder "
                "hidden_size must be the same. "
                "The decoder must be a Transformer. "
                f"shapes: output_layer.weight: "
                f"{model.decoder.output_layer.weight.shape}; "
                f"trg_embed.lut.weight: {trg_embed.lut.weight.shape}")
    """

    # custom initialization of model parameters
    initialize_model(model, cfg, src_padding_idx, trg_padding_idx)

    return model
def build_model(cfg: dict = None,
                src_vocab: Vocabulary = None,
                trg_vocab: Vocabulary = None,
                trv_vocab: Vocabulary = None,
                canonizer=None) -> Model:
    """
    Build and initialize the model according to the configuration.

    :param cfg: dictionary configuration containing model specifications
    :param src_vocab: source vocabulary
    :param trg_vocab: target vocabulary
    :param trv_vocab: kb true value lookup vocabulary
    :return: built and initialized model
    """
    src_padding_idx = src_vocab.stoi[PAD_TOKEN]
    trg_padding_idx = trg_vocab.stoi[PAD_TOKEN]

    if "embedding_files" in cfg.keys():  # init from pretrained
        assert not cfg.get("tied_embeddings", False), \
            "TODO implement tied embeddings along with pretrained initialization"
        raise NotImplementedError(
            "TODO implement kbsrc embed loading for embedding files")
        weight_tensors = []
        for weight_file in cfg["embedding_files"]:
            with open(weight_file, "r") as f:
                weight = []
                for line in f.readlines():
                    line = line.split()
                    line = [float(x) for x in line]
                    weight.append(line)
            weight = FloatTensor(weight)
            weight_tensors.append(weight)

        # set source embeddings to pretrained embeddings
        src_embed = Embeddings(
            int(weight_tensors[0][0].shape[0]),
            False,  # TODO transformer: change to True
            len(weight_tensors[0]),
        )
        src_embed.lut.weight.data = weight_tensors[0]

        # set target embeddings to pretrained embeddings
        trg_embed = Embeddings(
            int(weight_tensors[1][0].shape[0]),
            False,  # TODO transformer: change to True
            len(weight_tensors[1]),
        )
        trg_embed.lut.weight.data = weight_tensors[1]
    else:
        src_embed = Embeddings(**cfg["encoder"]["embeddings"],
                               vocab_size=len(src_vocab),
                               padding_idx=src_padding_idx)

        if cfg.get("kb_embed_separate", False):
            kbsrc_embed = Embeddings(**cfg["encoder"]["embeddings"],
                                     vocab_size=len(src_vocab),
                                     padding_idx=src_padding_idx)
        else:
            kbsrc_embed = src_embed

        # this ties source and target embeddings
        # for softmax layer tying, see further below
        if cfg.get("tied_embeddings", False):
            if src_vocab.itos == trg_vocab.itos:
                # share embeddings for src and trg
                trg_embed = src_embed
            else:
                raise ConfigurationError(
                    "Embedding cannot be tied since vocabularies differ.")
        else:
            # latest TODO: init embeddings with
            # vocab_size = len(trg_vocab joined with kb_vocab)
            trg_embed = Embeddings(**cfg["decoder"]["embeddings"],
                                   vocab_size=len(trg_vocab),
                                   padding_idx=trg_padding_idx)

    # build encoder
    enc_dropout = cfg["encoder"].get("dropout", 0.)
    enc_emb_dropout = cfg["encoder"]["embeddings"].get("dropout", enc_dropout)
    if cfg["encoder"].get("type", "recurrent") == "transformer":
        assert cfg["encoder"]["embeddings"]["embedding_dim"] == \
               cfg["encoder"]["hidden_size"], \
               "for transformer, emb_size must be hidden_size"
        encoder = TransformerEncoder(**cfg["encoder"],
                                     emb_size=src_embed.embedding_dim,
                                     emb_dropout=enc_emb_dropout)
    else:
        encoder = RecurrentEncoder(**cfg["encoder"],
                                   emb_size=src_embed.embedding_dim,
                                   emb_dropout=enc_emb_dropout)

    # retrieve kb task info
    kb_task = bool(cfg.get("kb", False))
    # k: number of kvr attention layers in the decoder (Eric et al.; default: 1)
    k_hops = int(cfg.get("k_hops", 1))
    same_module_for_all_hops = bool(cfg.get("same_module_for_all_hops", False))
    do_postproc = bool(cfg.get("do_postproc", True))
    copy_from_source = bool(cfg.get("copy_from_source", True))
    canonization_func = None if canonizer is None else \
        canonizer(copy_from_source=copy_from_source)
    kb_input_feeding = bool(cfg.get("kb_input_feeding", True))
    kb_feed_rnn = bool(cfg.get("kb_feed_rnn", True))
    kb_multihead_feed = bool(cfg.get("kb_multihead_feed", False))
    posEncKBkeys = cfg.get("posEncdKBkeys", False)
    tfstyletf = cfg.get("tfstyletf", True)
    infeedkb = bool(cfg.get("infeedkb", False))
    outfeedkb = bool(cfg.get("outfeedkb", False))
    add_kb_biases_to_output = bool(cfg.get("add_kb_biases_to_output", True))
    kb_max_dims = cfg.get("kb_max_dims", (16, 32))  # should be tuple
    double_decoder = cfg.get("double_decoder", False)
    # actually use separate linear layers, tying only the main one
    tied_side_softmax = cfg.get("tied_side_softmax", False)
    # doesn't need to be True for 1 hop (=> BIG PERFORMANCE SAVE),
    # needs to be True for >= 2 hops
    do_pad_kb_keys = cfg.get("pad_kb_keys", True)

    if hasattr(kb_max_dims, "__iter__"):
        kb_max_dims = tuple(kb_max_dims)
    else:
        assert type(kb_max_dims) == int, kb_max_dims
        kb_max_dims = (kb_max_dims,)

    assert cfg["decoder"]["hidden_size"]
    dec_dropout = cfg["decoder"].get("dropout", 0.)
    dec_emb_dropout = cfg["decoder"]["embeddings"].get("dropout", dec_dropout)

    # build decoder
    if cfg["decoder"].get("type", "recurrent") == "transformer":
        if tfstyletf:
            decoder = TransformerDecoder(
                **cfg["decoder"],
                encoder=encoder,
                vocab_size=len(trg_vocab),
                emb_size=trg_embed.embedding_dim,
                emb_dropout=dec_emb_dropout,
                kb_task=kb_task,
                kb_key_emb_size=kbsrc_embed.embedding_dim,
                feed_kb_hidden=kb_input_feeding,
                infeedkb=infeedkb,
                outfeedkb=outfeedkb,
                double_decoder=double_decoder)
        else:
            decoder = TransformerKBrnnDecoder(
                **cfg["decoder"],
                encoder=encoder,
                vocab_size=len(trg_vocab),
                emb_size=trg_embed.embedding_dim,
                emb_dropout=dec_emb_dropout,
                kb_task=kb_task,
                k_hops=k_hops,
                kb_max=kb_max_dims,
                same_module_for_all_hops=same_module_for_all_hops,
                kb_key_emb_size=kbsrc_embed.embedding_dim,
                kb_input_feeding=kb_input_feeding,
                kb_feed_rnn=kb_feed_rnn,
                kb_multihead_feed=kb_multihead_feed)
    else:
        if not kb_task:
            decoder = RecurrentDecoder(**cfg["decoder"],
                                       encoder=encoder,
                                       vocab_size=len(trg_vocab),
                                       emb_size=trg_embed.embedding_dim,
                                       emb_dropout=dec_emb_dropout)
        else:
            decoder = KeyValRetRNNDecoder(
                **cfg["decoder"],
                encoder=encoder,
                vocab_size=len(trg_vocab),
                emb_size=trg_embed.embedding_dim,
                emb_dropout=dec_emb_dropout,
                k_hops=k_hops,
                kb_max=kb_max_dims,
                same_module_for_all_hops=same_module_for_all_hops,
                kb_key_emb_size=kbsrc_embed.embedding_dim,
                kb_input_feeding=kb_input_feeding,
                kb_feed_rnn=kb_feed_rnn,
                kb_multihead_feed=kb_multihead_feed,
                do_pad_kb_keys=do_pad_kb_keys)

    # specify generator, which is mostly just the output layer
    generator = Generator(dec_hidden_size=cfg["decoder"]["hidden_size"],
                          vocab_size=len(trg_vocab),
                          add_kb_biases_to_output=add_kb_biases_to_output,
                          double_decoder=double_decoder)

    model = Model(encoder=encoder,
                  decoder=decoder,
                  generator=generator,
                  src_embed=src_embed,
                  trg_embed=trg_embed,
                  src_vocab=src_vocab,
                  trg_vocab=trg_vocab,
                  kb_key_embed=kbsrc_embed,
                  trv_vocab=trv_vocab,
                  k_hops=k_hops,
                  do_postproc=do_postproc,
                  canonize=canonization_func,
                  kb_att_dims=len(kb_max_dims),
                  posEncKBkeys=posEncKBkeys)

    # tie softmax layer with trg embeddings
    if cfg.get("tied_softmax", False):
        if trg_embed.lut.weight.shape == \
                model.generator.output_layer.weight.shape:
            # (also) share trg embeddings and softmax layer:
            model.generator.output_layer.weight = trg_embed.lut.weight
            if model.generator.double_decoder:
                # (also also) share trg embeddings and side softmax layer
                assert hasattr(model.generator, "side_output_layer")
                if tied_side_softmax:
                    # because of distributivity this becomes
                    # O (x_1 + x_2) instead of O_1 x_1 + O_2 x_2
                    model.generator.side_output_layer.weight = \
                        trg_embed.lut.weight
        else:
            raise ConfigurationError(
                "For tied_softmax, the decoder embedding_dim and decoder "
                "hidden_size must be the same. "
                "The decoder must be a Transformer.")

    # custom initialization of model parameters
    initialize_model(model, cfg, src_padding_idx, trg_padding_idx)

    return model
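# A hedged summary of the kb-related switches the builder above reads via
# cfg.get(): the key names are taken from those calls and the defaults mirror
# the fallbacks in the code; everything else about their semantics is an
# assumption.
kb_defaults = {
    "kb": False,                        # enable the kb task
    "k_hops": 1,                        # number of kvr attention layers
    "same_module_for_all_hops": False,
    "do_postproc": True,
    "copy_from_source": True,
    "kb_input_feeding": True,
    "kb_feed_rnn": True,
    "kb_multihead_feed": False,
    "posEncdKBkeys": False,
    "tfstyletf": True,
    "infeedkb": False,
    "outfeedkb": False,
    "add_kb_biases_to_output": True,
    "kb_max_dims": (16, 32),            # tuple, or a single int
    "double_decoder": False,
    "tied_side_softmax": False,
    "pad_kb_keys": True,                # must be True for >= 2 hops
}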
def build_model(cfg: dict = None,
                src_vocab: Vocabulary = None,
                trg_vocab: Vocabulary = None) -> Model:
    """
    Build and initialize the model according to the configuration.

    :param cfg: dictionary configuration containing model specifications
    :param src_vocab: source vocabulary
    :param trg_vocab: target vocabulary
    :return: built and initialized model
    """
    logger.info("Building an encoder-decoder model...")
    src_padding_idx = src_vocab.stoi[PAD_TOKEN]
    trg_padding_idx = trg_vocab.stoi[PAD_TOKEN]

    src_embed = Embeddings(**cfg["encoder"]["embeddings"],
                           vocab_size=len(src_vocab),
                           padding_idx=src_padding_idx)

    # this ties source and target embeddings
    # for softmax layer tying, see further below
    if cfg.get("tied_embeddings", False):
        if src_vocab.itos == trg_vocab.itos:
            # share embeddings for src and trg
            trg_embed = src_embed
        else:
            raise ConfigurationError(
                "Embedding cannot be tied since vocabularies differ.")
    else:
        trg_embed = Embeddings(**cfg["decoder"]["embeddings"],
                               vocab_size=len(trg_vocab),
                               padding_idx=trg_padding_idx)

    # build encoder
    enc_dropout = cfg["encoder"].get("dropout", 0.)
    enc_emb_dropout = cfg["encoder"]["embeddings"].get("dropout", enc_dropout)
    if cfg["encoder"].get("type", "recurrent") == "transformer":
        assert cfg["encoder"]["embeddings"]["embedding_dim"] == \
               cfg["encoder"]["hidden_size"], \
               "for transformer, emb_size must be hidden_size"
        encoder = TransformerEncoder(**cfg["encoder"],
                                     emb_size=src_embed.embedding_dim,
                                     emb_dropout=enc_emb_dropout)
    else:
        encoder = RecurrentEncoder(**cfg["encoder"],
                                   emb_size=src_embed.embedding_dim,
                                   emb_dropout=enc_emb_dropout)

    # build decoder
    dec_dropout = cfg["decoder"].get("dropout", 0.)
    dec_emb_dropout = cfg["decoder"]["embeddings"].get("dropout", dec_dropout)
    if cfg["decoder"].get("type", "recurrent") == "transformer":
        decoder = TransformerDecoder(**cfg["decoder"],
                                     encoder=encoder,
                                     vocab_size=len(trg_vocab),
                                     emb_size=trg_embed.embedding_dim,
                                     emb_dropout=dec_emb_dropout)
    else:
        decoder = RecurrentDecoder(**cfg["decoder"],
                                   encoder=encoder,
                                   vocab_size=len(trg_vocab),
                                   emb_size=trg_embed.embedding_dim,
                                   emb_dropout=dec_emb_dropout)

    model = Model(encoder=encoder, decoder=decoder,
                  src_embed=src_embed, trg_embed=trg_embed,
                  src_vocab=src_vocab, trg_vocab=trg_vocab)

    # tie softmax layer with trg embeddings
    if cfg.get("tied_softmax", False):
        if trg_embed.lut.weight.shape == \
                model.decoder.output_layer.weight.shape:
            # (also) share trg embeddings and softmax layer:
            model.decoder.output_layer.weight = trg_embed.lut.weight
        else:
            raise ConfigurationError(
                "For tied_softmax, the decoder embedding_dim and decoder "
                "hidden_size must be the same. "
                "The decoder must be a Transformer.")

    # custom initialization of model parameters
    initialize_model(model, cfg, src_padding_idx, trg_padding_idx)

    # initialize embeddings from file
    pretrained_enc_embed_path = cfg["encoder"]["embeddings"].get(
        "load_pretrained", None)
    pretrained_dec_embed_path = cfg["decoder"]["embeddings"].get(
        "load_pretrained", None)
    if pretrained_enc_embed_path:
        logger.info("Loading pretrained src embeddings...")
        model.src_embed.load_from_file(pretrained_enc_embed_path, src_vocab)
    if pretrained_dec_embed_path and not cfg.get("tied_embeddings", False):
        logger.info("Loading pretrained trg embeddings...")
        model.trg_embed.load_from_file(pretrained_dec_embed_path, trg_vocab)

    logger.info("Enc-dec model built.")
    return model
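# This variant reads an optional "load_pretrained" entry per embeddings block
# and passes it to Embeddings.load_from_file. A hedged sketch of the matching
# config addition; the paths are placeholders, and the embedding file format
# load_from_file expects is an assumption about this codebase:
#
# cfg["encoder"]["embeddings"]["load_pretrained"] = "embeds/src.vec"
# cfg["decoder"]["embeddings"]["load_pretrained"] = "embeds/trg.vec"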
def test_recurrent_forward(self):
    time_dim = 4
    batch_size = 2
    # make sure the outputs match the targets
    decoder = RecurrentDecoder(hidden_size=self.hidden_size,
                               encoder=self.encoders[0],
                               attention="bahdanau",
                               emb_size=self.emb_size,
                               vocab_size=self.vocab_size,
                               num_layers=self.num_layers,
                               init_hidden="zero",
                               input_feeding=False)
    encoder_states = torch.rand(size=(batch_size, time_dim,
                                      self.encoders[0].output_size))
    trg_inputs = torch.ones(size=(batch_size, time_dim, self.emb_size))
    # no padding, no mask
    #x_length = torch.Tensor([time_dim]*batch_size).int()
    mask = torch.ones(size=(batch_size, 1, time_dim)).byte()
    output, hidden, att_probs, att_vectors = decoder(
        trg_inputs,
        encoder_hidden=encoder_states[:, -1, :],
        encoder_output=encoder_states,
        src_mask=mask,
        unroll_steps=time_dim,
        hidden=None,
        prev_att_vector=None)
    self.assertEqual(output.shape,
                     torch.Size([batch_size, time_dim, self.vocab_size]))
    self.assertEqual(hidden.shape,
                     torch.Size([batch_size, self.num_layers,
                                 self.hidden_size]))
    self.assertEqual(att_probs.shape,
                     torch.Size([batch_size, time_dim, time_dim]))
    self.assertEqual(att_vectors.shape,
                     torch.Size([batch_size, time_dim, self.hidden_size]))
    hidden_target = torch.Tensor(
        [[[0.1814, 0.5468, -0.4717, -0.7580, 0.5834, -0.4018],
          [0.4649, 0.5484, -0.2702, 0.4545, 0.1983, 0.2771],
          [-0.1752, -0.4215, 0.1941, -0.3975, -0.2317, -0.5566]],
         [[0.1814, 0.5468, -0.4717, -0.7580, 0.5834, -0.4018],
          [0.4649, 0.5484, -0.2702, 0.4545, 0.1983, 0.2771],
          [-0.1752, -0.4215, 0.1941, -0.3975, -0.2317, -0.5566]]])
    output_target = torch.Tensor(
        [[[0.2702, -0.1988, -0.1985, -0.2998, -0.2564],
          [0.2719, -0.2075, -0.2017, -0.2988, -0.2595],
          [0.2720, -0.2143, -0.2084, -0.3024, -0.2537],
          [0.2714, -0.2183, -0.2135, -0.3061, -0.2468]],
         [[0.2757, -0.1744, -0.1888, -0.3038, -0.2466],
          [0.2782, -0.1837, -0.1928, -0.3028, -0.2505],
          [0.2785, -0.1904, -0.1994, -0.3066, -0.2448],
          [0.2777, -0.1943, -0.2042, -0.3105, -0.2379]]])
    att_vectors_target = torch.Tensor(
        [[[-0.6196, -0.0505, 0.4900, 0.6286, -0.5007, -0.3721],
          [-0.6389, -0.0337, 0.4998, 0.6458, -0.5052, -0.3579],
          [-0.6396, -0.0158, 0.5058, 0.6609, -0.5035, -0.3660],
          [-0.6348, -0.0017, 0.5090, 0.6719, -0.5013, -0.3771]],
         [[-0.5697, -0.0887, 0.4515, 0.6128, -0.4713, -0.4068],
          [-0.5910, -0.0721, 0.4617, 0.6305, -0.4760, -0.3930],
          [-0.5918, -0.0544, 0.4680, 0.6461, -0.4741, -0.4008],
          [-0.5866, -0.0405, 0.4712, 0.6574, -0.4718, -0.4116]]])
    self.assertTensorAlmostEqual(hidden_target, hidden)
    self.assertTensorAlmostEqual(output_target, output)
    self.assertTensorAlmostEqual(att_vectors, att_vectors_target)
    # att_probs should be a distribution over the source positions
    self.assertTensorAlmostEqual(att_probs.sum(2),
                                 torch.ones(batch_size, time_dim))
def build_pretrained_model(cfg: dict = None,
                           pretrained_model: Model = None,
                           pretrained_src_vocab: Vocabulary = None,
                           src_vocab: Vocabulary = None,
                           trg_vocab: Vocabulary = None) -> Model:
    """
    Build and initialize the model according to the configuration,
    reusing the encoder and source embeddings of a pretrained model.

    :param cfg: dictionary configuration containing model specifications
    :param pretrained_model: model to take the encoder and the source
        embedding weights from
    :param pretrained_src_vocab: source vocabulary of the pretrained model
    :param src_vocab: source vocabulary
    :param trg_vocab: target vocabulary
    :return: built and initialized model
    """
    src_padding_idx = src_vocab.stoi[PAD_TOKEN]
    trg_padding_idx = trg_vocab.stoi[PAD_TOKEN]

    src_embed = Embeddings(**cfg["encoder"]["embeddings"],
                           vocab_size=len(src_vocab),
                           padding_idx=src_padding_idx)

    embedding_matrix = np.zeros((len(src_vocab), src_embed.embedding_dim))
    unknown_words = []
    for w in pretrained_src_vocab.itos:
        try:
            pre_ix = pretrained_src_vocab.stoi[w]
            ix = src_vocab.stoi[w]
            embedding_matrix[ix] = pretrained_model.src_embed.lut.weight[
                pre_ix].cpu().detach().numpy()
        except KeyError:
            unknown_words.append(w)

    src_embed.lut.weight = torch.nn.Parameter(
        torch.tensor(embedding_matrix, dtype=torch.float32))

    trg_embed = Embeddings(**cfg["decoder"]["embeddings"],
                           vocab_size=len(trg_vocab),
                           padding_idx=trg_padding_idx)

    # build decoder
    dec_dropout = cfg["decoder"].get("dropout", 0.)
    dec_emb_dropout = cfg["decoder"]["embeddings"].get("dropout", dec_dropout)

    encoder = pretrained_model.encoder
    encoder.train()
    set_requires_grad(encoder, True)

    # build encoder
    #enc_dropout = cfg["encoder"].get("dropout", 0.)
    #enc_emb_dropout = cfg["encoder"]["embeddings"].get("dropout", enc_dropout)
    #if cfg["encoder"].get("type", "recurrent") == "transformer":
    #    assert cfg["encoder"]["embeddings"]["embedding_dim"] == \
    #           cfg["encoder"]["hidden_size"], \
    #           "for transformer, emb_size must be hidden_size"
    #    encoder = TransformerEncoder(**cfg["encoder"],
    #                                 emb_size=src_embed.embedding_dim,
    #                                 emb_dropout=enc_emb_dropout)
    #else:
    #    encoder = RecurrentEncoder(**cfg["encoder"],
    #                               emb_size=src_embed.embedding_dim,
    #                               emb_dropout=enc_emb_dropout)

    if cfg["decoder"].get("type", "recurrent") == "transformer":
        decoder = TransformerDecoder(**cfg["decoder"],
                                     encoder=encoder,
                                     vocab_size=len(trg_vocab),
                                     emb_size=trg_embed.embedding_dim,
                                     emb_dropout=dec_emb_dropout)
    else:
        decoder = RecurrentDecoder(**cfg["decoder"],
                                   encoder=encoder,
                                   vocab_size=len(trg_vocab),
                                   emb_size=trg_embed.embedding_dim,
                                   emb_dropout=dec_emb_dropout)

    model = Model(encoder=encoder, decoder=decoder,
                  src_embed=src_embed, trg_embed=trg_embed,
                  src_vocab=pretrained_model.src_vocab, trg_vocab=trg_vocab)

    # tie softmax layer with trg embeddings
    if cfg.get("tied_softmax", False):
        if trg_embed.lut.weight.shape == \
                model.decoder.output_layer.weight.shape:
            # (also) share trg embeddings and softmax layer:
            model.decoder.output_layer.weight = trg_embed.lut.weight
        else:
            raise ConfigurationError(
                "For tied_softmax, the decoder embedding_dim and decoder "
                "hidden_size must be the same. "
                "The decoder must be a Transformer.")

    # custom initialization of model parameters
    #initialize_model(model, cfg, src_padding_idx, trg_padding_idx)

    return model
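# The vocabulary-alignment trick above in isolation: copy pretrained embedding
# rows into a new matrix via string lookup, leaving out-of-vocabulary rows at
# zero. Toy stoi/itos dicts stand in for the Vocabulary interface used above.
import numpy as np

pre_itos = ["<pad>", "cat", "dog"]
pre_stoi = {w: i for i, w in enumerate(pre_itos)}
new_stoi = {"<pad>": 0, "dog": 1, "fish": 2}

pre_weights = np.arange(9, dtype=np.float32).reshape(3, 3)  # pretrained rows
matrix = np.zeros((len(new_stoi), 3), dtype=np.float32)
for w in pre_itos:
    if w in new_stoi:                        # skip words the new vocab lacks
        matrix[new_stoi[w]] = pre_weights[pre_stoi[w]]
assert (matrix[1] == pre_weights[2]).all()   # "dog" row carried over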
def test_recurrent_forward(self):
    time_dim = 4
    batch_size = 2
    # make sure the outputs match the targets
    decoder = RecurrentDecoder(
        hidden_size=self.hidden_size,
        encoder_output_size=self.encoders[0].output_size,
        attention="bahdanau",
        emb_size=self.emb_size,
        vocab_size=self.vocab_size,
        num_layers=self.num_layers,
        init_hidden="zero",
        input_feeding=False)
    encoder_states = torch.rand(size=(batch_size, time_dim,
                                      self.encoders[0].output_size))
    trg_inputs = torch.ones(size=(batch_size, time_dim, self.emb_size))
    # no padding, no mask
    #x_length = torch.Tensor([time_dim]*batch_size).int()
    mask = torch.ones(size=(batch_size, 1, time_dim)).byte()
    output, hidden, att_probs, att_vectors = decoder(
        trg_inputs,
        encoder_hidden=encoder_states[:, -1, :],
        encoder_output=encoder_states,
        src_mask=mask,
        unroll_steps=time_dim,
        hidden=None,
        prev_att_vector=None)
    att_probs = att_probs["src_trg"]
    self.assertEqual(output.shape,
                     torch.Size([batch_size, time_dim, self.vocab_size]))
    self.assertEqual(hidden.shape,
                     torch.Size([self.num_layers, batch_size,
                                 self.hidden_size]))
    self.assertEqual(att_probs.shape,
                     torch.Size([batch_size, time_dim, time_dim]))
    self.assertEqual(att_vectors.shape,
                     torch.Size([batch_size, time_dim, self.hidden_size]))
    hidden_target = torch.Tensor(
        [[[-0.4330, 0.0563, -0.3310, 0.4228, -0.1188, -0.0436],
          [-0.4330, 0.0563, -0.3310, 0.4228, -0.1188, -0.0436]],
         [[0.1796, -0.0573, 0.3581, -0.0051, -0.3506, 0.2007],
          [0.1796, -0.0573, 0.3581, -0.0051, -0.3506, 0.2007]],
         [[-0.1954, -0.2804, -0.1885, -0.2336, -0.4033, 0.0890],
          [-0.1954, -0.2804, -0.1885, -0.2336, -0.4033, 0.0890]]])
    output_target = torch.Tensor(
        [[[-0.1533, 0.1284, -0.1100, -0.0350, -0.1126],
          [-0.1260, 0.1000, -0.1006, -0.0328, -0.0942],
          [-0.1052, 0.0845, -0.0984, -0.0327, -0.0839],
          [-0.0899, 0.0753, -0.0986, -0.0330, -0.0779]],
         [[-0.1302, 0.1310, -0.0881, -0.0362, -0.1239],
          [-0.1026, 0.1024, -0.0786, -0.0340, -0.1054],
          [-0.0817, 0.0867, -0.0765, -0.0339, -0.0951],
          [-0.0663, 0.0775, -0.0766, -0.0343, -0.0890]]])
    att_vectors_target = torch.Tensor(
        [[[-0.0351, 0.1532, 0.0301, -0.1575, 0.0526, -0.2428],
          [-0.0727, 0.1208, 0.0664, -0.1267, 0.0610, -0.2101],
          [-0.0964, 0.0932, 0.0850, -0.1058, 0.0717, -0.1949],
          [-0.1115, 0.0725, 0.0942, -0.0914, 0.0810, -0.1871]],
         [[0.0667, 0.1424, -0.1167, -0.1500, -0.0087, -0.2175],
          [0.0290, 0.1099, -0.0807, -0.1191, -0.0004, -0.1845],
          [0.0052, 0.0821, -0.0619, -0.0981, 0.0103, -0.1691],
          [-0.0101, 0.0614, -0.0527, -0.0836, 0.0195, -0.1613]]])
    self.assertTensorAlmostEqual(hidden_target, hidden)
    self.assertTensorAlmostEqual(output_target, output)
    self.assertTensorAlmostEqual(att_vectors, att_vectors_target)
    # att_probs should be a distribution over the source positions
    self.assertTensorAlmostEqual(att_probs.sum(2),
                                 torch.ones(batch_size, time_dim))
def build_unsupervised_nmt_model(
        cfg: dict = None,
        src_vocab: Vocabulary = None,
        trg_vocab: Vocabulary = None) -> UnsupervisedNMTModel:
    """
    Build an UnsupervisedNMTModel.

    :param cfg: model configuration
    :param src_vocab: Vocabulary for the src language
    :param trg_vocab: Vocabulary for the trg language
    :return: Unsupervised NMT model as specified in cfg
    """
    src_padding_idx = src_vocab.stoi[PAD_TOKEN]
    trg_padding_idx = trg_vocab.stoi[PAD_TOKEN]

    # build source and target embedding layers
    # embeddings in the encoder are pretrained and stay fixed
    loaded_src_embed = PretrainedEmbeddings(**cfg["encoder"]["embeddings"],
                                            vocab_size=len(src_vocab),
                                            padding_idx=src_padding_idx,
                                            vocab=src_vocab,
                                            freeze=True)
    loaded_trg_embed = PretrainedEmbeddings(**cfg["decoder"]["embeddings"],
                                            vocab_size=len(trg_vocab),
                                            padding_idx=trg_padding_idx,
                                            vocab=trg_vocab,
                                            freeze=True)

    # embeddings in the decoder are randomly initialised and will be learned
    src_embed = Embeddings(**cfg["encoder"]["embeddings"],
                           vocab_size=len(src_vocab),
                           padding_idx=src_padding_idx,
                           freeze=False)
    trg_embed = Embeddings(**cfg["decoder"]["embeddings"],
                           vocab_size=len(trg_vocab),
                           padding_idx=trg_padding_idx,
                           freeze=False)

    # build shared encoder
    enc_dropout = cfg["encoder"].get("dropout", 0.)
    enc_emb_dropout = cfg["encoder"]["embeddings"].get("dropout", enc_dropout)
    if cfg["encoder"].get("type", "recurrent") == "transformer":
        assert cfg["encoder"]["embeddings"]["embedding_dim"] == \
               cfg["encoder"]["hidden_size"], \
               "for transformer, emb_size must be hidden_size"
        shared_encoder = TransformerEncoder(**cfg["encoder"],
                                            emb_size=src_embed.embedding_dim,
                                            emb_dropout=enc_emb_dropout)
    else:
        shared_encoder = RecurrentEncoder(**cfg["encoder"],
                                          emb_size=src_embed.embedding_dim,
                                          emb_dropout=enc_emb_dropout)

    # build src and trg language decoder
    dec_dropout = cfg["decoder"].get("dropout", 0.)
    dec_emb_dropout = cfg["decoder"]["embeddings"].get("dropout", dec_dropout)
    if cfg["decoder"].get("type", "recurrent") == "transformer":
        src_decoder = TransformerDecoder(**cfg["decoder"],
                                         encoder=shared_encoder,
                                         vocab_size=len(src_vocab),
                                         emb_size=src_embed.embedding_dim,
                                         emb_dropout=dec_emb_dropout)
        trg_decoder = TransformerDecoder(**cfg["decoder"],
                                         encoder=shared_encoder,
                                         vocab_size=len(trg_vocab),
                                         emb_size=trg_embed.embedding_dim,
                                         emb_dropout=dec_emb_dropout)
    else:
        src_decoder = RecurrentDecoder(**cfg["decoder"],
                                       encoder=shared_encoder,
                                       vocab_size=len(src_vocab),
                                       emb_size=src_embed.embedding_dim,
                                       emb_dropout=dec_emb_dropout)
        trg_decoder = RecurrentDecoder(**cfg["decoder"],
                                       encoder=shared_encoder,
                                       vocab_size=len(trg_vocab),
                                       emb_size=trg_embed.embedding_dim,
                                       emb_dropout=dec_emb_dropout)

    # build unsupervised NMT model
    model = UnsupervisedNMTModel(loaded_src_embed, loaded_trg_embed,
                                 src_embed, trg_embed,
                                 shared_encoder,
                                 src_decoder, trg_decoder,
                                 src_vocab, trg_vocab)

    # initialise model
    # embed_initializer should be None so loaded encoder embeddings
    # won't be overwritten
    initialize_model(model.src2src_translator, cfg,
                     src_padding_idx, src_padding_idx)
    initialize_model(model.src2trg_translator, cfg,
                     src_padding_idx, trg_padding_idx)
    initialize_model(model.trg2src_translator, cfg,
                     trg_padding_idx, src_padding_idx)
    initialize_model(model.trg2trg_translator, cfg,
                     trg_padding_idx, trg_padding_idx)

    return model
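# The four initialize_model calls above cover the four translation directions
# that UnsupervisedNMTModel composes from the shared encoder and the two
# decoders. A hedged sketch of how training code might cycle over them
# (attribute names taken from the calls above; the update step is left
# abstract):
#
# for translator in (model.src2src_translator, model.src2trg_translator,
#                    model.trg2src_translator, model.trg2trg_translator):
#     ...  # e.g. denoising autoencoding for src2src/trg2trg,
#          # back-translation for src2trg/trg2src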