Example 1
    def _build(self, batch_size):
        src_time_dim = 4
        vocab_size = 7

        emb = Embeddings(embedding_dim=self.emb_size,
                         vocab_size=vocab_size,
                         padding_idx=self.pad_index)

        encoder = RecurrentEncoder(emb_size=self.emb_size,
                                   num_layers=self.num_layers,
                                   hidden_size=self.encoder_hidden_size,
                                   bidirectional=True)

        decoder = RecurrentDecoder(hidden_size=self.hidden_size,
                                   encoder=encoder,
                                   attention="bahdanau",
                                   emb_size=self.emb_size,
                                   vocab_size=self.vocab_size,
                                   num_layers=self.num_layers,
                                   init_hidden="bridge",
                                   input_feeding=True)

        encoder_output = torch.rand(size=(batch_size, src_time_dim,
                                          encoder.output_size))

        for p in decoder.parameters():
            torch.nn.init.uniform_(p, -0.5, 0.5)

        src_mask = torch.ones(size=(batch_size, 1, src_time_dim)) == 1

        encoder_hidden = torch.rand(size=(batch_size, encoder.output_size))

        return src_mask, emb, decoder, encoder_output, encoder_hidden
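A brief usage sketch (not part of the original tests): assuming this helper sits in the same test class and the decoder's forward signature matches the forward tests further below, the fixture's return values can drive a single decoding step.

    # Hypothetical helper, not from the original test suite; the forward
    # signature below is assumed to match the forward tests later in this file.
    def _sketch_single_step(self, batch_size=2):
        src_mask, emb, decoder, encoder_output, encoder_hidden = \
            self._build(batch_size=batch_size)
        trg_ids = torch.ones(batch_size, 1, dtype=torch.long)  # dummy target prefix
        outputs, hidden, att_probs, att_vectors = decoder(
            emb(trg_ids),
            encoder_hidden=encoder_hidden,
            encoder_output=encoder_output,
            src_mask=src_mask,
            unroll_steps=1,
            hidden=None,
            prev_att_vector=None)
        return outputs  # logits of shape (batch_size, 1, vocab_size)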
Example 2
 def test_recurrent_freeze(self):
     decoder = RecurrentDecoder(hidden_size=self.hidden_size,
                                encoder=self.encoders[0],
                                attention="bahdanau",
                                emb_size=self.emb_size,
                                vocab_size=self.vocab_size,
                                num_layers=self.num_layers,
                                bridge=False,
                                input_feeding=False,
                                freeze=True)
     for n, p in decoder.named_parameters():
         self.assertFalse(p.requires_grad)
Example 3
    def test_recurrent_decoder_size(self):
        # test all combinations of bridge, input_feeding, encoder directions
        for encoder in self.encoders:
            for bridge in [True, False]:
                for input_feeding in [True, False]:
                    decoder = RecurrentDecoder(hidden_size=self.hidden_size,
                                               encoder=encoder,
                                               attention="bahdanau",
                                               emb_size=self.emb_size,
                                               vocab_size=self.vocab_size,
                                               num_layers=self.num_layers,
                                               bridge=bridge,
                                               input_feeding=input_feeding)
                    self.assertEqual(decoder.rnn.hidden_size, self.hidden_size)
                    self.assertEqual(decoder.att_vector_layer.out_features,
                                     self.hidden_size)
                    self.assertEqual(decoder.output_layer.out_features,
                                     self.vocab_size)
                    self.assertEqual(decoder.output_size, self.vocab_size)
                    self.assertEqual(decoder.rnn.bidirectional, False)

                    if bridge:
                        self.assertTrue(decoder.bridge)
                        self.assertTrue(hasattr(decoder, "bridge_layer"))
                        self.assertEqual(decoder.bridge_layer.out_features,
                                         self.hidden_size)
                    else:
                        self.assertFalse(decoder.bridge)
                        self.assertFalse(hasattr(decoder, "bridge_layer"))

                    if input_feeding:
                        self.assertEqual(decoder.rnn_input_size,
                                         self.emb_size + self.hidden_size)
                    else:
                        self.assertEqual(decoder.rnn_input_size, self.emb_size)
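The `rnn_input_size` assertions reflect input feeding: the previous attention vector is concatenated to the target embedding before each RNN step, so the RNN input grows from emb_size to emb_size + hidden_size. A minimal stand-alone sketch of that concatenation (sizes below are illustrative, not taken from the test fixture):

import torch

emb_size, hidden_size, batch_size = 4, 6, 2  # illustrative sizes

trg_embedded = torch.ones(batch_size, 1, emb_size)          # current target embedding
prev_att_vector = torch.zeros(batch_size, 1, hidden_size)   # attention vector from step t-1

# with input feeding the RNN consumes [embedding ; previous attention vector]
rnn_input = torch.cat([trg_embedded, prev_att_vector], dim=2)
assert rnn_input.shape[-1] == emb_size + hidden_size  # 4 + 6 = 10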
Example 4
 def test_recurrent_decoder_type(self):
     valid_rnn_types = {"gru": GRU, "lstm": LSTM}
     for name, obj in valid_rnn_types.items():
         decoder = RecurrentDecoder(rnn_type=name,
                                    hidden_size=self.hidden_size,
                                    encoder=self.encoders[0],
                                    attention="bahdanau",
                                    emb_size=self.emb_size,
                                    vocab_size=self.vocab_size,
                                    num_layers=self.num_layers,
                                    bridge=False,
                                    input_feeding=False)
         self.assertEqual(type(decoder.rnn), obj)
Example 5
def build_model(cfg: dict = None,
                src_vocab: Vocabulary = None,
                trg_vocab: Vocabulary = None) -> Model:
    """
    Build and initialize the model according to the configuration.

    :param cfg: dictionary configuration containing model specifications
    :param src_vocab: source vocabulary
    :param trg_vocab: target vocabulary
    :return: built and initialized model
    """
    src_padding_idx = src_vocab.stoi[PAD_TOKEN]
    trg_padding_idx = trg_vocab.stoi[PAD_TOKEN]

    src_embed = Embeddings(**cfg["encoder"]["embeddings"],
                           vocab_size=len(src_vocab),
                           padding_idx=src_padding_idx)

    if cfg.get("tied_embeddings", False):
        if src_vocab.itos == trg_vocab.itos:
            # share embeddings for src and trg
            trg_embed = src_embed
        else:
            raise ConfigurationError(
                "Embedding cannot be tied since vocabularies differ.")
    else:
        trg_embed = Embeddings(**cfg["decoder"]["embeddings"],
                               vocab_size=len(trg_vocab),
                               padding_idx=trg_padding_idx)

    encoder = RecurrentEncoder(**cfg["encoder"],
                               emb_size=src_embed.embedding_dim)
    decoder = RecurrentDecoder(**cfg["decoder"],
                               encoder=encoder,
                               vocab_size=len(trg_vocab),
                               emb_size=trg_embed.embedding_dim)

    model = Model(encoder=encoder,
                  decoder=decoder,
                  src_embed=src_embed,
                  trg_embed=trg_embed,
                  src_vocab=src_vocab,
                  trg_vocab=trg_vocab)

    # custom initialization of model parameters
    initialize_model(model, cfg, src_padding_idx, trg_padding_idx)

    return model
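For orientation, a hedged sketch of the configuration dictionary this `build_model` variant expects. Only the keys read above ("encoder"/"decoder", their "embeddings" sub-dicts, and the optional "tied_embeddings" flag) come from the code; all concrete values and any extra fields are illustrative assumptions.

# Hypothetical configuration; values are placeholders, not library defaults.
cfg = {
    "tied_embeddings": False,
    "encoder": {
        "embeddings": {"embedding_dim": 16},
        "hidden_size": 64,
        "num_layers": 1,
        "bidirectional": True,
    },
    "decoder": {
        "embeddings": {"embedding_dim": 16},
        "hidden_size": 64,
        "num_layers": 1,
        "attention": "bahdanau",
    },
}
# model = build_model(cfg, src_vocab=src_vocab, trg_vocab=trg_vocab)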
Example 6
    def test_recurrent_decoder_size(self):
        # test all combinations of init_hidden, input_feeding, encoder directions
        for encoder in self.encoders:
            for init_hidden in ["bridge", "zero", "last"]:
                for input_feeding in [True, False]:
                    decoder = RecurrentDecoder(
                        hidden_size=self.hidden_size,
                        encoder_output_size=encoder.output_size,
                        attention="bahdanau",
                        emb_size=self.emb_size,
                        vocab_size=self.vocab_size,
                        num_layers=self.num_layers,
                        init_hidden=init_hidden,
                        input_feeding=input_feeding)
                    self.assertEqual(decoder.rnn.hidden_size, self.hidden_size)
                    self.assertEqual(decoder.att_vector_layer.out_features,
                                     self.hidden_size)
                    self.assertEqual(
                        decoder.output_layers["vocab"].out_features,
                        self.vocab_size)
                    self.assertEqual(decoder.vocab_size, self.vocab_size)
                    self.assertEqual(decoder.rnn.bidirectional, False)

                    self.assertEqual(decoder.init_hidden_option, init_hidden)
                    if init_hidden == "bridge":
                        self.assertTrue(hasattr(decoder, "bridge_layer"))
                        self.assertEqual(decoder.bridge_layer[0].out_features,
                                         self.hidden_size)
                        self.assertEqual(decoder.bridge_layer[0].in_features,
                                         encoder.output_size)
                    else:
                        self.assertIsNone(decoder.bridge_layer)

                    if input_feeding:
                        self.assertEqual(decoder.rnn_input_size,
                                         self.emb_size + self.hidden_size)
                    else:
                        self.assertEqual(decoder.rnn_input_size, self.emb_size)
Example 7
def build_model(cfg: dict = None,
                src_vocab: Vocabulary = None,
                trg_vocab: Vocabulary = None):
    src_padding_idx = src_vocab.stoi[PAD_TOKEN]
    trg_padding_idx = trg_vocab.stoi[PAD_TOKEN]

    src_embed = Embeddings(**cfg["encoder"]["embeddings"],
                           vocab_size=len(src_vocab),
                           padding_idx=src_padding_idx)

    if cfg.get("tied_embeddings", False) \
        and src_vocab.itos == trg_vocab.itos:
        # share embeddings for src and trg
        trg_embed = src_embed
    else:
        trg_embed = Embeddings(**cfg["decoder"]["embeddings"],
                               vocab_size=len(trg_vocab),
                               padding_idx=trg_padding_idx)

    encoder = RecurrentEncoder(**cfg["encoder"],
                               emb_size=src_embed.embedding_dim)
    decoder = RecurrentDecoder(**cfg["decoder"],
                               encoder=encoder,
                               vocab_size=len(trg_vocab),
                               emb_size=trg_embed.embedding_dim)

    model = Model(encoder=encoder,
                  decoder=decoder,
                  src_embed=src_embed,
                  trg_embed=trg_embed,
                  src_vocab=src_vocab,
                  trg_vocab=trg_vocab)

    # custom initialization of model parameters
    initialize_model(model, cfg, src_padding_idx, trg_padding_idx)

    return model
Example 8
    def test_recurrent_input_dropout(self):
        drop_prob = 0.5
        decoder = RecurrentDecoder(hidden_size=self.hidden_size,
                                   encoder=self.encoders[0],
                                   attention="bahdanau",
                                   emb_size=self.emb_size,
                                   vocab_size=self.vocab_size,
                                   num_layers=self.num_layers,
                                   bridge=False,
                                   input_feeding=False,
                                   dropout=drop_prob)
        input_tensor = torch.Tensor([2, 3, 1, -1])
        decoder.train()
        dropped = decoder.rnn_input_dropout(input=input_tensor)
        # eval switches off dropout
        decoder.eval()
        no_drop = decoder.rnn_input_dropout(input=input_tensor)
        # when dropout is applied, surviving values are scaled by 1 / (1 - p);
        # for p = 0.5 this is the same as dividing by drop_prob
        self.assertGreaterEqual((no_drop - (drop_prob * dropped)).abs().sum(),
                                0)

        drop_prob = 1.0
        decoder = RecurrentDecoder(hidden_size=self.hidden_size,
                                   encoder=self.encoders[0],
                                   attention="bahdanau",
                                   emb_size=self.emb_size,
                                   vocab_size=self.vocab_size,
                                   num_layers=self.num_layers,
                                   bridge=False,
                                   input_feeding=False,
                                   dropout=drop_prob)
        all_dropped = decoder.rnn_input_dropout(input=input_tensor)
        self.assertEqual(all_dropped.sum(), 0)
        decoder.eval()
        none_dropped = decoder.rnn_input_dropout(input=input_tensor)
        self.assertTensorEqual(no_drop, none_dropped)
        self.assertTensorEqual((no_drop - all_dropped), no_drop)
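The scaling behavior the test relies on is PyTorch's inverted dropout: in training mode surviving activations are multiplied by 1 / (1 - p), which for p = 0.5 coincides with dividing by the drop probability, while in eval mode dropout is the identity. A small self-contained illustration:

import torch

drop = torch.nn.Dropout(p=0.5)
x = torch.ones(8)

drop.train()
y = drop(x)
# surviving entries become 1 / (1 - 0.5) = 2.0, dropped entries become 0.0
assert set(y.unique().tolist()) <= {0.0, 2.0}

drop.eval()
assert torch.equal(drop(x), x)  # eval mode: identity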
Example 9
 def test_recurrent_forward(self):
     time_dim = 4
     batch_size = 2
     # make sure the outputs match the targets
     decoder = RecurrentDecoder(hidden_size=self.hidden_size,
                                encoder=self.encoders[0],
                                attention="bahdanau",
                                emb_size=self.emb_size,
                                vocab_size=self.vocab_size,
                                num_layers=self.num_layers,
                                bridge=False,
                                input_feeding=False)
     encoder_states = torch.rand(size=(batch_size, time_dim,
                                       self.encoders[0].output_size))
     trg_inputs = torch.ones(size=(batch_size, time_dim, self.emb_size))
     # no padding, no mask
     #x_length = torch.Tensor([time_dim]*batch_size).int()
     mask = torch.ones(size=(batch_size, 1, time_dim)).byte()
     output, hidden, att_probs, att_vectors = decoder(
         trg_inputs,
         encoder_hidden=encoder_states[:, -1, :],
         encoder_output=encoder_states,
         src_mask=mask,
         unrol_steps=time_dim,
         hidden=None,
         prev_att_vector=None)
     self.assertEqual(output.shape,
                      torch.Size([batch_size, time_dim, self.vocab_size]))
     self.assertEqual(
         hidden.shape,
         torch.Size([self.num_layers, batch_size, self.hidden_size]))
     self.assertEqual(att_probs.shape,
                      torch.Size([batch_size, time_dim, time_dim]))
     self.assertEqual(att_vectors.shape,
                      torch.Size([batch_size, time_dim, self.hidden_size]))
     hidden_target = torch.Tensor(
         [[[0.5977, -0.2173, 0.0900, 0.8608, -0.3638, 0.5332, -0.5538],
           [0.5977, -0.2173, 0.0900, 0.8608, -0.3638, 0.5332, -0.5538]],
          [[-0.2767, 0.4492, -0.0656, -0.2800, 0.2594, 0.1410, 0.0101],
           [-0.2767, 0.4492, -0.0656, -0.2800, 0.2594, 0.1410, 0.0101]],
          [[0.2118, 0.2190, -0.0875, 0.2177, -0.0771, -0.1014, 0.0055],
           [0.2118, 0.2190, -0.0875, 0.2177, -0.0771, -0.1014, 0.0055]]])
     output_target = torch.Tensor(
         [[[-0.2888, 0.1992, -0.1638, 0.1031, 0.3977],
           [-0.2917, 0.1922, -0.1755, 0.1093, 0.3963],
           [-0.2938, 0.1892, -0.1868, 0.1132, 0.3986],
           [-0.2946, 0.1885, -0.1964, 0.1155, 0.4019]],
          [[-0.3103, 0.2316, -0.1540, 0.0833, 0.4444],
           [-0.3133, 0.2251, -0.1653, 0.0898, 0.4433],
           [-0.3153, 0.2223, -0.1763, 0.0939, 0.4458],
           [-0.3160, 0.2217, -0.1856, 0.0963, 0.4492]]])
     att_vectors_target = torch.Tensor(
         [[[-0.4831, 0.4514, 0.2072, -0.0963, -0.3155, 0.3777, 0.1536],
           [-0.4914, 0.4421, 0.1905, -0.1247, -0.3248, 0.3846, 0.1703],
           [-0.5011, 0.4363, 0.1793, -0.1462, -0.3347, 0.3919, 0.1790],
           [-0.5102, 0.4326, 0.1715, -0.1623, -0.3442, 0.3969, 0.1827]],
          [[-0.5211, 0.5055, 0.2877, 0.0200, -0.3148, 0.4124, 0.1030],
           [-0.5291, 0.4968, 0.2718, -0.0086, -0.3241, 0.4191, 0.1200],
           [-0.5384, 0.4913, 0.2610, -0.0304, -0.3340, 0.4263, 0.1288],
           [-0.5471, 0.4879, 0.2536, -0.0467, -0.3435, 0.4311, 0.1325]]])
     self.assertTensorAlmostEqual(hidden_target, hidden)
     self.assertTensorAlmostEqual(output_target, output)
     self.assertTensorAlmostEqual(att_vectors, att_vectors_target)
      # att_probs should be a distribution over the source positions
     self.assertTensorAlmostEqual(att_probs.sum(2),
                                  torch.ones(batch_size, time_dim))
Example 10
def build_model(cfg: dict = None,
                src_vocab: Vocabulary = None,
                trg_vocab: Vocabulary = None) -> Model:
    """
    Build and initialize the model according to the configuration.

    :param cfg: dictionary configuration containing model specifications
    :param src_vocab: source vocabulary
    :param trg_vocab: target vocabulary
    :return: built and initialized model
    """
    src_padding_idx = src_vocab.stoi[PAD_TOKEN]
    trg_padding_idx = trg_vocab.stoi[PAD_TOKEN]

    # TODO if continue-us
    src_embed = PretrainedEmbeddings(src_vocab,
                                     trg_vocab,
                                     **cfg["encoder"]["embeddings"],
                                     vocab_size=len(src_vocab),
                                     padding_idx=src_padding_idx)

    # this ties source and target embeddings
    # for softmax layer tying, see further below
    if cfg.get("tied_embeddings", False):
        if src_vocab.itos == trg_vocab.itos:
            # share embeddings for src and trg
            trg_embed = src_embed
        else:
            raise ConfigurationError(
                "Embedding cannot be tied since vocabularies differ.")
    else:
        trg_embed = PretrainedEmbeddings(src_vocab,
                                         trg_vocab,
                                         **cfg["decoder"]["embeddings"],
                                         vocab_size=len(trg_vocab),
                                         padding_idx=trg_padding_idx)

    # build encoder
    enc_dropout = cfg["encoder"].get("dropout", 0.)
    enc_emb_dropout = cfg["encoder"]["embeddings"].get("dropout", enc_dropout)
    if cfg["encoder"].get("type", "recurrent") == "transformer":
        assert cfg["encoder"]["embeddings"]["embedding_dim"] == \
               cfg["encoder"]["hidden_size"], \
               "for transformer, emb_size must be hidden_size"

        encoder = TransformerEncoder(**cfg["encoder"],
                                     emb_size=src_embed.embedding_dim,
                                     emb_dropout=enc_emb_dropout)
    else:
        encoder = RecurrentEncoder(**cfg["encoder"],
                                   emb_size=src_embed.embedding_dim,
                                   emb_dropout=enc_emb_dropout)

    # build decoder
    dec_dropout = cfg["decoder"].get("dropout", 0.)
    dec_emb_dropout = cfg["decoder"]["embeddings"].get("dropout", dec_dropout)
    if cfg["decoder"].get("type", "recurrent") == "transformer":
        decoder = TransformerDecoder(**cfg["decoder"],
                                     encoder=encoder,
                                     vocab_size=len(trg_vocab),
                                     emb_size=trg_embed.embedding_dim,
                                     emb_dropout=dec_emb_dropout)
    else:
        decoder = RecurrentDecoder(**cfg["decoder"],
                                   encoder=encoder,
                                   vocab_size=len(trg_vocab),
                                   emb_size=trg_embed.embedding_dim,
                                   emb_dropout=dec_emb_dropout)

    model = Model(encoder=encoder,
                  decoder=decoder,
                  src_embed=src_embed,
                  trg_embed=trg_embed,
                  src_vocab=src_vocab,
                  trg_vocab=trg_vocab)

    # tie softmax layer with trg embeddings
    """
    if cfg.get("tied_softmax", False):
        if trg_embed.lut.weight.shape == \
                model.decoder.output_layer.weight.shape:
            # (also) share trg embeddings and softmax layer:
            model.decoder.output_layer.weight = trg_embed.lut.weight
        else:
            raise ConfigurationError(
                "For tied_softmax, the decoder embedding_dim and decoder "
                "hidden_size must be the same."
                "The decoder must be a Transformer."
                f"shapes: output_layer.weight: {model.decoder.output_layer.weight.shape}; target_embed.lut.weight:{trg_embed.lut.weight.shape}")
    """
    # custom initialization of model parameters
    initialize_model(model, cfg, src_padding_idx, trg_padding_idx)

    return model
Example 11
def build_model(cfg: dict = None,
                src_vocab: Vocabulary = None,
                trg_vocab: Vocabulary = None,
                trv_vocab: Vocabulary = None,
                canonizer=None) -> Model:
    """
    Build and initialize the model according to the configuration.

    :param cfg: dictionary configuration containing model specifications
    :param src_vocab: source vocabulary
    :param trg_vocab: target vocabulary
    :param trv_vocab: kb true value lookup vocabulary
    :param canonizer: optional callable that builds the canonization function
    :return: built and initialized model
    """
    src_padding_idx = src_vocab.stoi[PAD_TOKEN]
    trg_padding_idx = trg_vocab.stoi[PAD_TOKEN]

    if "embedding_files" in cfg.keys():  #init from pretrained
        assert not cfg.get(
            "tied_embeddings", False
        ), "TODO implement tied embeddings along with pretrained initialization"
        raise NotImplementedError(
            "TODO implement kbsrc embed loading for embedding files")
        weight_tensors = []
        for weight_file in cfg["embedding_files"]:
            with open(weight_file, "r") as f:
                weight = []
                for line in f.readlines():
                    line = line.split()
                    line = [float(x) for x in line]
                    weight.append(line)

            weight = FloatTensor(weight)
            weight_tensors.append(weight)
        # Set source Embeddings to Pretrained Embeddings
        src_embed = Embeddings(
            int(weight_tensors[0][0].shape[0]),
            False,  #TODO transformer: change to True
            len(weight_tensors[0]),
        )
        src_embed.lut.weight.data = weight_tensors[0]

        # Set target Embeddings to Pretrained Embeddings
        trg_embed = Embeddings(
            int(weight_tensors[1][0].shape[0]),
            False,  #TODO transformer: change to True
            len(weight_tensors[1]),
        )
        trg_embed.lut.weight.data = weight_tensors[1]
    else:
        src_embed = Embeddings(**cfg["encoder"]["embeddings"],
                               vocab_size=len(src_vocab),
                               padding_idx=src_padding_idx)
        if cfg.get("kb_embed_separate", False):
            kbsrc_embed = Embeddings(**cfg["encoder"]["embeddings"],
                                     vocab_size=len(src_vocab),
                                     padding_idx=src_padding_idx)
        else:
            kbsrc_embed = src_embed

        # this ties source and target embeddings
        # for softmax layer tying, see further below
        if cfg.get("tied_embeddings", False):
            if src_vocab.itos == trg_vocab.itos:
                # share embeddings for src and trg
                trg_embed = src_embed
            else:
                raise ConfigurationError(
                    "Embedding cannot be tied since vocabularies differ.")
        else:
            # Latest TODO: init embeddings with vocab_size = len(trg_vocab joined with kb_vocab)
            trg_embed = Embeddings(**cfg["decoder"]["embeddings"],
                                   vocab_size=len(trg_vocab),
                                   padding_idx=trg_padding_idx)
    # build encoder
    enc_dropout = cfg["encoder"].get("dropout", 0.)
    enc_emb_dropout = cfg["encoder"]["embeddings"].get("dropout", enc_dropout)
    if cfg["encoder"].get("type", "recurrent") == "transformer":
        assert cfg["encoder"]["embeddings"]["embedding_dim"] == \
               cfg["encoder"]["hidden_size"], \
               "for transformer, emb_size must be hidden_size"

        encoder = TransformerEncoder(**cfg["encoder"],
                                     emb_size=src_embed.embedding_dim,
                                     emb_dropout=enc_emb_dropout)
    else:
        encoder = RecurrentEncoder(**cfg["encoder"],
                                   emb_size=src_embed.embedding_dim,
                                   emb_dropout=enc_emb_dropout)

    # retrieve kb task info
    kb_task = bool(cfg.get("kb", False))
    k_hops = int(
        cfg.get("k_hops", 1)
    )  # k number of kvr attention layers in decoder (eric et al/default: 1)
    same_module_for_all_hops = bool(cfg.get("same_module_for_all_hops", False))
    do_postproc = bool(cfg.get("do_postproc", True))
    copy_from_source = bool(cfg.get("copy_from_source", True))
    canonization_func = None if canonizer is None else canonizer(
        copy_from_source=copy_from_source)
    kb_input_feeding = bool(cfg.get("kb_input_feeding", True))
    kb_feed_rnn = bool(cfg.get("kb_feed_rnn", True))
    kb_multihead_feed = bool(cfg.get("kb_multihead_feed", False))
    posEncKBkeys = cfg.get("posEncdKBkeys", False)
    tfstyletf = cfg.get("tfstyletf", True)
    infeedkb = bool(cfg.get("infeedkb", False))
    outfeedkb = bool(cfg.get("outfeedkb", False))
    add_kb_biases_to_output = bool(cfg.get("add_kb_biases_to_output", True))
    kb_max_dims = cfg.get("kb_max_dims", (16, 32))  # should be tuple
    double_decoder = cfg.get("double_decoder", False)
    tied_side_softmax = cfg.get(
        "tied_side_softmax",
        False)  # actually use separate linear layers, tying only the main one
    do_pad_kb_keys = cfg.get(
        "pad_kb_keys", True
    )  # doesn't need to be True for 1 hop (=> big performance save), needs to be True for >= 2 hops

    if hasattr(kb_max_dims, "__iter__"):
        kb_max_dims = tuple(kb_max_dims)
    else:
        assert type(kb_max_dims) == int, kb_max_dims
        kb_max_dims = (kb_max_dims, )

    assert cfg["decoder"]["hidden_size"]
    dec_dropout = cfg["decoder"].get("dropout", 0.)
    dec_emb_dropout = cfg["decoder"]["embeddings"].get("dropout", dec_dropout)

    if cfg["decoder"].get("type", "recurrent") == "transformer":
        if tfstyletf:
            decoder = TransformerDecoder(
                **cfg["decoder"],
                encoder=encoder,
                vocab_size=len(trg_vocab),
                emb_size=trg_embed.embedding_dim,
                emb_dropout=dec_emb_dropout,
                kb_task=kb_task,
                kb_key_emb_size=kbsrc_embed.embedding_dim,
                feed_kb_hidden=kb_input_feeding,
                infeedkb=infeedkb,
                outfeedkb=outfeedkb,
                double_decoder=double_decoder)
        else:
            decoder = TransformerKBrnnDecoder(
                **cfg["decoder"],
                encoder=encoder,
                vocab_size=len(trg_vocab),
                emb_size=trg_embed.embedding_dim,
                emb_dropout=dec_emb_dropout,
                kb_task=kb_task,
                k_hops=k_hops,
                kb_max=kb_max_dims,
                same_module_for_all_hops=same_module_for_all_hops,
                kb_key_emb_size=kbsrc_embed.embedding_dim,
                kb_input_feeding=kb_input_feeding,
                kb_feed_rnn=kb_feed_rnn,
                kb_multihead_feed=kb_multihead_feed)
    else:
        if not kb_task:
            decoder = RecurrentDecoder(**cfg["decoder"],
                                       encoder=encoder,
                                       vocab_size=len(trg_vocab),
                                       emb_size=trg_embed.embedding_dim,
                                       emb_dropout=dec_emb_dropout)
        else:
            decoder = KeyValRetRNNDecoder(
                **cfg["decoder"],
                encoder=encoder,
                vocab_size=len(trg_vocab),
                emb_size=trg_embed.embedding_dim,
                emb_dropout=dec_emb_dropout,
                k_hops=k_hops,
                kb_max=kb_max_dims,
                same_module_for_all_hops=same_module_for_all_hops,
                kb_key_emb_size=kbsrc_embed.embedding_dim,
                kb_input_feeding=kb_input_feeding,
                kb_feed_rnn=kb_feed_rnn,
                kb_multihead_feed=kb_multihead_feed,
                do_pad_kb_keys=do_pad_kb_keys)

    # specify generator which is mostly just the output layer
    generator = Generator(dec_hidden_size=cfg["decoder"]["hidden_size"],
                          vocab_size=len(trg_vocab),
                          add_kb_biases_to_output=add_kb_biases_to_output,
                          double_decoder=double_decoder)

    model = Model(encoder=encoder,
                  decoder=decoder,
                  generator=generator,
                  src_embed=src_embed,
                  trg_embed=trg_embed,
                  src_vocab=src_vocab,
                  trg_vocab=trg_vocab,
                  kb_key_embed=kbsrc_embed,
                  trv_vocab=trv_vocab,
                  k_hops=k_hops,
                  do_postproc=do_postproc,
                  canonize=canonization_func,
                  kb_att_dims=len(kb_max_dims),
                  posEncKBkeys=posEncKBkeys)

    # tie softmax layer with trg embeddings
    if cfg.get("tied_softmax", False):
        if trg_embed.lut.weight.shape == \
                model.generator.output_layer.weight.shape:
            # (also) share trg embeddings and softmax layer:
            model.generator.output_layer.weight = trg_embed.lut.weight
            if model.generator.double_decoder:
                # (also also) share trg embeddings and side softmax layer
                assert hasattr(model.generator, "side_output_layer")
                if tied_side_softmax:
                    # because of distributivity this becomes O (x_1+x_2) instead of O_1 x_1 + O_2 x_2
                    model.generator.side_output_layer.weight = trg_embed.lut.weight
        else:
            raise ConfigurationError(
                "For tied_softmax, the decoder embedding_dim and decoder "
                "hidden_size must be the same."
                "The decoder must be a Transformer.")

    # custom initialization of model parameters
    initialize_model(model, cfg, src_padding_idx, trg_padding_idx)

    return model
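For orientation, a hedged sketch of the top-level KB-related keys this variant reads via `cfg.get`; the key names are taken from the code above, while every value shown is illustrative rather than a documented default.

# Hypothetical KB section of the model config; values are illustrative only.
kb_cfg = {
    "kb": True,                       # enable the KB task
    "k_hops": 1,                      # number of KVR attention hops in the decoder
    "same_module_for_all_hops": False,
    "do_postproc": True,
    "copy_from_source": True,
    "kb_input_feeding": True,
    "kb_feed_rnn": True,
    "kb_multihead_feed": False,
    "posEncdKBkeys": False,
    "tfstyletf": True,                # use TransformerDecoder instead of TransformerKBrnnDecoder
    "infeedkb": False,
    "outfeedkb": False,
    "add_kb_biases_to_output": True,
    "kb_max_dims": [16, 32],          # converted to a tuple by build_model
    "double_decoder": False,
    "tied_side_softmax": False,
    "pad_kb_keys": True,              # required for two or more hops
}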
Example 12
def build_model(cfg: dict = None,
                src_vocab: Vocabulary = None,
                trg_vocab: Vocabulary = None) -> Model:
    """
    Build and initialize the model according to the configuration.

    :param cfg: dictionary configuration containing model specifications
    :param src_vocab: source vocabulary
    :param trg_vocab: target vocabulary
    :return: built and initialized model
    """
    logger.info("Building an encoder-decoder model...")
    src_padding_idx = src_vocab.stoi[PAD_TOKEN]
    trg_padding_idx = trg_vocab.stoi[PAD_TOKEN]

    src_embed = Embeddings(**cfg["encoder"]["embeddings"],
                           vocab_size=len(src_vocab),
                           padding_idx=src_padding_idx)

    # this ties source and target embeddings
    # for softmax layer tying, see further below
    if cfg.get("tied_embeddings", False):
        if src_vocab.itos == trg_vocab.itos:
            # share embeddings for src and trg
            trg_embed = src_embed
        else:
            raise ConfigurationError(
                "Embedding cannot be tied since vocabularies differ.")
    else:
        trg_embed = Embeddings(**cfg["decoder"]["embeddings"],
                               vocab_size=len(trg_vocab),
                               padding_idx=trg_padding_idx)

    # build encoder
    enc_dropout = cfg["encoder"].get("dropout", 0.)
    enc_emb_dropout = cfg["encoder"]["embeddings"].get("dropout", enc_dropout)
    if cfg["encoder"].get("type", "recurrent") == "transformer":
        assert cfg["encoder"]["embeddings"]["embedding_dim"] == \
               cfg["encoder"]["hidden_size"], \
               "for transformer, emb_size must be hidden_size"

        encoder = TransformerEncoder(**cfg["encoder"],
                                     emb_size=src_embed.embedding_dim,
                                     emb_dropout=enc_emb_dropout)
    else:
        encoder = RecurrentEncoder(**cfg["encoder"],
                                   emb_size=src_embed.embedding_dim,
                                   emb_dropout=enc_emb_dropout)

    # build decoder
    dec_dropout = cfg["decoder"].get("dropout", 0.)
    dec_emb_dropout = cfg["decoder"]["embeddings"].get("dropout", dec_dropout)
    if cfg["decoder"].get("type", "recurrent") == "transformer":
        decoder = TransformerDecoder(**cfg["decoder"],
                                     encoder=encoder,
                                     vocab_size=len(trg_vocab),
                                     emb_size=trg_embed.embedding_dim,
                                     emb_dropout=dec_emb_dropout)
    else:
        decoder = RecurrentDecoder(**cfg["decoder"],
                                   encoder=encoder,
                                   vocab_size=len(trg_vocab),
                                   emb_size=trg_embed.embedding_dim,
                                   emb_dropout=dec_emb_dropout)

    model = Model(encoder=encoder,
                  decoder=decoder,
                  src_embed=src_embed,
                  trg_embed=trg_embed,
                  src_vocab=src_vocab,
                  trg_vocab=trg_vocab)

    # tie softmax layer with trg embeddings
    if cfg.get("tied_softmax", False):
        if trg_embed.lut.weight.shape == \
                model.decoder.output_layer.weight.shape:
            # (also) share trg embeddings and softmax layer:
            model.decoder.output_layer.weight = trg_embed.lut.weight
        else:
            raise ConfigurationError(
                "For tied_softmax, the decoder embedding_dim and decoder "
                "hidden_size must be the same."
                "The decoder must be a Transformer.")

    # custom initialization of model parameters
    initialize_model(model, cfg, src_padding_idx, trg_padding_idx)

    # initialize embeddings from file
    pretrained_enc_embed_path = cfg["encoder"]["embeddings"].get(
        "load_pretrained", None)
    pretrained_dec_embed_path = cfg["decoder"]["embeddings"].get(
        "load_pretrained", None)
    if pretrained_enc_embed_path:
        logger.info("Loading pretraind src embeddings...")
        model.src_embed.load_from_file(pretrained_enc_embed_path, src_vocab)
    if pretrained_dec_embed_path and not cfg.get("tied_embeddings", False):
        logger.info("Loading pretraind trg embeddings...")
        model.trg_embed.load_from_file(pretrained_dec_embed_path, trg_vocab)

    logger.info("Enc-dec model built.")
    return model
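A hedged sketch of an embeddings sub-config that would trigger the pretrained-loading branch above; the "load_pretrained" key is the one read by the code, while the path and dimension are placeholders.

# Hypothetical embeddings section; path and dimension are placeholders.
encoder_embeddings_cfg = {
    "embedding_dim": 300,
    "load_pretrained": "my_data/src_embeddings.txt",
}
# With cfg["encoder"]["embeddings"] set to the dict above, build_model calls
# model.src_embed.load_from_file("my_data/src_embeddings.txt", src_vocab).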
Example 13
    def test_recurrent_forward(self):
        time_dim = 4
        batch_size = 2
        # make sure the outputs match the targets
        decoder = RecurrentDecoder(hidden_size=self.hidden_size,
                                   encoder=self.encoders[0],
                                   attention="bahdanau",
                                   emb_size=self.emb_size,
                                   vocab_size=self.vocab_size,
                                   num_layers=self.num_layers,
                                   init_hidden="zero",
                                   input_feeding=False)
        encoder_states = torch.rand(size=(batch_size, time_dim,
                                          self.encoders[0].output_size))
        trg_inputs = torch.ones(size=(batch_size, time_dim, self.emb_size))
        # no padding, no mask
        #x_length = torch.Tensor([time_dim]*batch_size).int()
        mask = torch.ones(size=(batch_size, 1, time_dim)).byte()
        output, hidden, att_probs, att_vectors = decoder(
            trg_inputs, encoder_hidden=encoder_states[:, -1, :],
            encoder_output=encoder_states, src_mask=mask, unroll_steps=time_dim,
            hidden=None, prev_att_vector=None)
        self.assertEqual(output.shape, torch.Size(
            [batch_size, time_dim, self.vocab_size]))
        self.assertEqual(hidden.shape, torch.Size(
            [batch_size, self.num_layers, self.hidden_size]))
        self.assertEqual(att_probs.shape, torch.Size(
            [batch_size, time_dim, time_dim]))
        self.assertEqual(att_vectors.shape, torch.Size(
            [batch_size, time_dim, self.hidden_size]))
        hidden_target = torch.Tensor(
            [[[ 0.1814,  0.5468, -0.4717, -0.7580,  0.5834, -0.4018],
              [ 0.4649,  0.5484, -0.2702,  0.4545,  0.1983,  0.2771],
              [-0.1752, -0.4215,  0.1941, -0.3975, -0.2317, -0.5566]],

             [[ 0.1814,  0.5468, -0.4717, -0.7580,  0.5834, -0.4018],
              [ 0.4649,  0.5484, -0.2702,  0.4545,  0.1983,  0.2771],
              [-0.1752, -0.4215,  0.1941, -0.3975, -0.2317, -0.5566]]])
        output_target = torch.Tensor(
            [[[ 0.2702, -0.1988, -0.1985, -0.2998, -0.2564],
             [ 0.2719, -0.2075, -0.2017, -0.2988, -0.2595],
             [ 0.2720, -0.2143, -0.2084, -0.3024, -0.2537],
             [ 0.2714, -0.2183, -0.2135, -0.3061, -0.2468]],

            [[ 0.2757, -0.1744, -0.1888, -0.3038, -0.2466],
             [ 0.2782, -0.1837, -0.1928, -0.3028, -0.2505],
             [ 0.2785, -0.1904, -0.1994, -0.3066, -0.2448],
             [ 0.2777, -0.1943, -0.2042, -0.3105, -0.2379]]])
        att_vectors_target = torch.Tensor(
            [[[-0.6196, -0.0505,  0.4900,  0.6286, -0.5007, -0.3721],
             [-0.6389, -0.0337,  0.4998,  0.6458, -0.5052, -0.3579],
             [-0.6396, -0.0158,  0.5058,  0.6609, -0.5035, -0.3660],
             [-0.6348, -0.0017,  0.5090,  0.6719, -0.5013, -0.3771]],

            [[-0.5697, -0.0887,  0.4515,  0.6128, -0.4713, -0.4068],
             [-0.5910, -0.0721,  0.4617,  0.6305, -0.4760, -0.3930],
             [-0.5918, -0.0544,  0.4680,  0.6461, -0.4741, -0.4008],
             [-0.5866, -0.0405,  0.4712,  0.6574, -0.4718, -0.4116]]])
        self.assertTensorAlmostEqual(hidden_target, hidden)
        self.assertTensorAlmostEqual(output_target, output)
        self.assertTensorAlmostEqual(att_vectors, att_vectors_target)
        # att_probs should be a distribution over the source positions
        self.assertTensorAlmostEqual(att_probs.sum(2),
                                     torch.ones(batch_size, time_dim))
Example 14
def build_pretrained_model(cfg: dict = None,
                           pretrained_model: Model = None,
                           pretrained_src_vocab: Vocabulary = None,
                           src_vocab: Vocabulary = None,
                           trg_vocab: Vocabulary = None) -> Model:
    """
    Build and initialize the model according to the configuration.

    :param cfg: dictionary configuration containing model specifications
    :param src_vocab: source vocabulary
    :param trg_vocab: target vocabulary
    :return: built and initialized model
    """
    src_padding_idx = src_vocab.stoi[PAD_TOKEN]
    trg_padding_idx = trg_vocab.stoi[PAD_TOKEN]

    src_embed = Embeddings(**cfg["encoder"]["embeddings"],
                           vocab_size=len(src_vocab),
                           padding_idx=src_padding_idx)

    embedding_matrix = np.zeros((len(src_vocab), src_embed.embedding_dim))
    unknown_words = []
    for w in pretrained_src_vocab.itos:
        try:
            pre_ix = pretrained_src_vocab.stoi[w]
            ix = src_vocab.stoi[w]
            embedding_matrix[ix] = pretrained_model.src_embed.lut.weight[
                pre_ix].cpu().detach().numpy()
        except KeyError:
            unknown_words.append(w)

    src_embed.lut.weight = torch.nn.Parameter(
        torch.tensor(embedding_matrix, dtype=torch.float32))

    trg_embed = Embeddings(**cfg["decoder"]["embeddings"],
                           vocab_size=len(trg_vocab),
                           padding_idx=trg_padding_idx)

    # build decoder
    dec_dropout = cfg["decoder"].get("dropout", 0.)
    dec_emb_dropout = cfg["decoder"]["embeddings"].get("dropout", dec_dropout)

    encoder = pretrained_model.encoder
    encoder.train()
    set_requires_grad(encoder, True)

    # build encoder
    #enc_dropout = cfg["encoder"].get("dropout", 0.)
    #enc_emb_dropout = cfg["encoder"]["embeddings"].get("dropout", enc_dropout)
    #if cfg["encoder"].get("type", "recurrent") == "transformer":
    #    assert cfg["encoder"]["embeddings"]["embedding_dim"] == \
    #           cfg["encoder"]["hidden_size"], \
    #           "for transformer, emb_size must be hidden_size"

    #    encoder = TransformerEncoder(**cfg["encoder"],
    #                                 emb_size=src_embed.embedding_dim,
    #                                 emb_dropout=enc_emb_dropout)
    #else:
    #    encoder = RecurrentEncoder(**cfg["encoder"],
    #                               emb_size=src_embed.embedding_dim,
    #                               emb_dropout=enc_emb_dropout)

    if cfg["decoder"].get("type", "recurrent") == "transformer":
        decoder = TransformerDecoder(**cfg["decoder"],
                                     encoder=encoder,
                                     vocab_size=len(trg_vocab),
                                     emb_size=trg_embed.embedding_dim,
                                     emb_dropout=dec_emb_dropout)
    else:
        decoder = RecurrentDecoder(**cfg["decoder"],
                                   encoder=encoder,
                                   vocab_size=len(trg_vocab),
                                   emb_size=trg_embed.embedding_dim,
                                   emb_dropout=dec_emb_dropout)

    model = Model(encoder=encoder,
                  decoder=decoder,
                  src_embed=src_embed,
                  trg_embed=trg_embed,
                  src_vocab=pretrained_model.src_vocab,
                  trg_vocab=trg_vocab)

    # tie softmax layer with trg embeddings
    if cfg.get("tied_softmax", False):
        if trg_embed.lut.weight.shape == \
                model.decoder.output_layer.weight.shape:
            # (also) share trg embeddings and softmax layer:
            model.decoder.output_layer.weight = trg_embed.lut.weight
        else:
            raise ConfigurationError(
                "For tied_softmax, the decoder embedding_dim and decoder "
                "hidden_size must be the same."
                "The decoder must be a Transformer.")

    # custom initialization of model parameters
    #initialize_model(model, cfg, src_padding_idx, trg_padding_idx)

    return model
Example 15
 def test_recurrent_forward(self):
     time_dim = 4
     batch_size = 2
     # make sure the outputs match the targets
     decoder = RecurrentDecoder(
         hidden_size=self.hidden_size,
         encoder_output_size=self.encoders[0].output_size,
         attention="bahdanau",
         emb_size=self.emb_size,
         vocab_size=self.vocab_size,
         num_layers=self.num_layers,
         init_hidden="zero",
         input_feeding=False)
     encoder_states = torch.rand(size=(batch_size, time_dim,
                                       self.encoders[0].output_size))
     trg_inputs = torch.ones(size=(batch_size, time_dim, self.emb_size))
     # no padding, no mask
     #x_length = torch.Tensor([time_dim]*batch_size).int()
     mask = torch.ones(size=(batch_size, 1, time_dim)).byte()
     output, hidden, att_probs, att_vectors = decoder(
         trg_inputs,
         encoder_hidden=encoder_states[:, -1, :],
         encoder_output=encoder_states,
         src_mask=mask,
         unroll_steps=time_dim,
         hidden=None,
         prev_att_vector=None)
     att_probs = att_probs["src_trg"]
     self.assertEqual(output.shape,
                      torch.Size([batch_size, time_dim, self.vocab_size]))
     self.assertEqual(
         hidden.shape,
         torch.Size([self.num_layers, batch_size, self.hidden_size]))
     self.assertEqual(att_probs.shape,
                      torch.Size([batch_size, time_dim, time_dim]))
     self.assertEqual(att_vectors.shape,
                      torch.Size([batch_size, time_dim, self.hidden_size]))
     hidden_target = torch.Tensor(
         [[[-0.4330, 0.0563, -0.3310, 0.4228, -0.1188, -0.0436],
           [-0.4330, 0.0563, -0.3310, 0.4228, -0.1188, -0.0436]],
          [[0.1796, -0.0573, 0.3581, -0.0051, -0.3506, 0.2007],
           [0.1796, -0.0573, 0.3581, -0.0051, -0.3506, 0.2007]],
          [[-0.1954, -0.2804, -0.1885, -0.2336, -0.4033, 0.0890],
           [-0.1954, -0.2804, -0.1885, -0.2336, -0.4033, 0.0890]]])
     output_target = torch.Tensor(
         [[[-0.1533, 0.1284, -0.1100, -0.0350, -0.1126],
           [-0.1260, 0.1000, -0.1006, -0.0328, -0.0942],
           [-0.1052, 0.0845, -0.0984, -0.0327, -0.0839],
           [-0.0899, 0.0753, -0.0986, -0.0330, -0.0779]],
          [[-0.1302, 0.1310, -0.0881, -0.0362, -0.1239],
           [-0.1026, 0.1024, -0.0786, -0.0340, -0.1054],
           [-0.0817, 0.0867, -0.0765, -0.0339, -0.0951],
           [-0.0663, 0.0775, -0.0766, -0.0343, -0.0890]]])
     att_vectors_target = torch.Tensor(
         [[[-0.0351, 0.1532, 0.0301, -0.1575, 0.0526, -0.2428],
           [-0.0727, 0.1208, 0.0664, -0.1267, 0.0610, -0.2101],
           [-0.0964, 0.0932, 0.0850, -0.1058, 0.0717, -0.1949],
           [-0.1115, 0.0725, 0.0942, -0.0914, 0.0810, -0.1871]],
          [[0.0667, 0.1424, -0.1167, -0.1500, -0.0087, -0.2175],
           [0.0290, 0.1099, -0.0807, -0.1191, -0.0004, -0.1845],
           [0.0052, 0.0821, -0.0619, -0.0981, 0.0103, -0.1691],
           [-0.0101, 0.0614, -0.0527, -0.0836, 0.0195, -0.1613]]])
     self.assertTensorAlmostEqual(hidden_target, hidden)
     self.assertTensorAlmostEqual(output_target, output)
     self.assertTensorAlmostEqual(att_vectors, att_vectors_target)
      # att_probs should be a distribution over the source positions
     self.assertTensorAlmostEqual(att_probs.sum(2),
                                  torch.ones(batch_size, time_dim))
Example 16
def build_unsupervised_nmt_model(
        cfg: dict = None,
        src_vocab: Vocabulary = None,
        trg_vocab: Vocabulary = None) -> UnsupervisedNMTModel:
    """
    Build an UnsupervisedNMTModel.

    :param cfg: model configuration
    :param src_vocab: Vocabulary for the src language
    :param trg_vocab: Vocabulary for the trg language
    :return: Unsupervised NMT model as specified in cfg
    """
    src_padding_idx = src_vocab.stoi[PAD_TOKEN]
    trg_padding_idx = trg_vocab.stoi[PAD_TOKEN]

    # build source and target embedding layers
    # embeddings in the encoder are pretrained and stay fixed
    loaded_src_embed = PretrainedEmbeddings(**cfg["encoder"]["embeddings"],
                                            vocab_size=len(src_vocab),
                                            padding_idx=src_padding_idx,
                                            vocab=src_vocab,
                                            freeze=True)

    loaded_trg_embed = PretrainedEmbeddings(**cfg["decoder"]["embeddings"],
                                            vocab_size=len(trg_vocab),
                                            padding_idx=trg_padding_idx,
                                            vocab=trg_vocab,
                                            freeze=True)

    # embeddings in the decoder are randomly initialised and will be learned
    src_embed = Embeddings(**cfg["encoder"]["embeddings"],
                           vocab_size=len(src_vocab),
                           padding_idx=src_padding_idx,
                           freeze=False)

    trg_embed = Embeddings(**cfg["decoder"]["embeddings"],
                           vocab_size=len(trg_vocab),
                           padding_idx=trg_padding_idx,
                           freeze=False)

    # build shared encoder
    enc_dropout = cfg["encoder"].get("dropout", 0.)
    enc_emb_dropout = cfg["encoder"]["embeddings"].get("dropout", enc_dropout)
    if cfg["encoder"].get("type", "recurrent") == "transformer":
        assert cfg["encoder"]["embeddings"]["embedding_dim"] == \
               cfg["encoder"]["hidden_size"], \
               "for transformer, emb_size must be hidden_size"

        shared_encoder = TransformerEncoder(**cfg["encoder"],
                                            emb_size=src_embed.embedding_dim,
                                            emb_dropout=enc_emb_dropout)
    else:
        shared_encoder = RecurrentEncoder(**cfg["encoder"],
                                          emb_size=src_embed.embedding_dim,
                                          emb_dropout=enc_emb_dropout)

    # build src and trg language decoder
    dec_dropout = cfg["decoder"].get("dropout", 0.)
    dec_emb_dropout = cfg["decoder"]["embeddings"].get("dropout", dec_dropout)
    if cfg["decoder"].get("type", "recurrent") == "transformer":
        src_decoder = TransformerDecoder(**cfg["decoder"],
                                         encoder=shared_encoder,
                                         vocab_size=len(src_vocab),
                                         emb_size=src_embed.embedding_dim,
                                         emb_dropout=dec_emb_dropout)
        trg_decoder = TransformerDecoder(**cfg["decoder"],
                                         encoder=shared_encoder,
                                         vocab_size=len(trg_vocab),
                                         emb_size=trg_embed.embedding_dim,
                                         emb_dropout=dec_emb_dropout)
    else:
        src_decoder = RecurrentDecoder(**cfg["decoder"],
                                       encoder=shared_encoder,
                                       vocab_size=len(src_vocab),
                                       emb_size=src_embed.embedding_dim,
                                       emb_dropout=dec_emb_dropout)
        trg_decoder = RecurrentDecoder(**cfg["decoder"],
                                       encoder=shared_encoder,
                                       vocab_size=len(trg_vocab),
                                       emb_size=trg_embed.embedding_dim,
                                       emb_dropout=dec_emb_dropout)

    # build unsupervised NMT model
    model = UnsupervisedNMTModel(loaded_src_embed, loaded_trg_embed, src_embed,
                                 trg_embed, shared_encoder, src_decoder,
                                 trg_decoder, src_vocab, trg_vocab)

    # initialise model
    # embed_initializer should be none so loaded encoder embeddings won't be overwritten
    initialize_model(model.src2src_translator, cfg, src_padding_idx,
                     src_padding_idx)
    initialize_model(model.src2trg_translator, cfg, src_padding_idx,
                     trg_padding_idx)
    initialize_model(model.trg2src_translator, cfg, trg_padding_idx,
                     src_padding_idx)
    initialize_model(model.trg2trg_translator, cfg, trg_padding_idx,
                     trg_padding_idx)

    return model