Example #1
class TestSimultaneousTranslation(unittest.TestCase):
  
  def setUp(self):
    # Seeding
    numpy.random.seed(2)
    random.seed(2)
    layer_dim = 32
    xnmt.events.clear()
    ParamManager.init_param_col()
    
    self.src_reader = PlainTextReader(vocab=Vocab(vocab_file="examples/data/head.ja.vocab"))
    self.trg_reader = PlainTextReader(vocab=Vocab(vocab_file="examples/data/head.en.vocab"))
    self.layer_dim = layer_dim
    self.src_data = list(self.src_reader.read_sents("examples/data/head.ja"))
    self.trg_data = list(self.trg_reader.read_sents("examples/data/head.en"))
    self.input_vocab_size = len(self.src_reader.vocab.i2w)
    self.output_vocab_size = len(self.trg_reader.vocab.i2w)
    self.loss_calculator = loss_calculators.MLELoss()
    
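    # Simultaneous translation model under test; read_before_write=True makes the
    # policy read the entire source sentence before any target words are written.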
    self.model = SimultaneousTranslator(
      src_reader=self.src_reader,
      trg_reader=self.trg_reader,
      src_embedder=LookupEmbedder(emb_dim=layer_dim, vocab_size=self.input_vocab_size),
      encoder=UniLSTMSeqTransducer(input_dim=layer_dim, hidden_dim=layer_dim),
      attender=MlpAttender(input_dim=layer_dim, state_dim=layer_dim, hidden_dim=layer_dim),
      decoder=AutoRegressiveDecoder(input_dim=layer_dim,
                                    rnn=UniLSTMSeqTransducer(input_dim=layer_dim, hidden_dim=layer_dim,
                                                             decoder_input_dim=layer_dim, yaml_path="decoder"),
                                    transform=AuxNonLinear(input_dim=layer_dim, output_dim=layer_dim,
                                                           aux_input_dim=layer_dim),
                                    scorer=Softmax(vocab_size=self.output_vocab_size, input_dim=layer_dim),
                                    embedder=LookupEmbedder(emb_dim=layer_dim, vocab_size=self.output_vocab_size),
                                    bridge=NoBridge(dec_dim=layer_dim, dec_layers=1)),
      policy_train_oracle=False,
      policy_test_oracle=False,
      read_before_write=True,
    )
    event_trigger.set_train(True)
    

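    # Pack the data into batches of 3, grouping sentences by target length.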
    my_batcher = batchers.TrgBatcher(batch_size=3)
    self.src, self.trg = my_batcher.pack(self.src_data, self.trg_data)
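    # Fresh computation graph; immediate_compute and check_validity make DyNet
    # evaluate expressions eagerly and flag invalid values, so tests fail early.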
    dy.renew_cg(immediate_compute=True, check_validity=True)
  
  def test_train_nll(self):
    event_trigger.set_train(True)
    mle_loss = loss_calculators.MLELoss()
    mle_loss.calc_loss(self.model, self.src[0], self.trg[0])
    event_trigger.set_train(False)
    self.model.generate(batchers.mark_as_batch([self.src_data[0]]), GreedySearch())

  def test_simult_beam(self):
    event_trigger.set_train(False)
    mle_loss = loss_calculators.MLELoss()
    mle_loss.calc_loss(self.model, self.src[0], self.trg[0])
    self.model.generate(batchers.mark_as_batch([self.src_data[0]]), BeamSearch(beam_size=2))
Example #2
class PretrainedSimpleWordEmbedderSanityTest(unittest.TestCase):
  def setUp(self):
    events.clear()
    self.input_reader = PlainTextReader(vocab=Vocab(vocab_file="examples/data/head.ja.vocab"))
    list(self.input_reader.read_sents('examples/data/head.ja'))
    ParamManager.init_param_col()

  def test_load(self):
    """
    Checks that the embeddings can be loaded, have the right dimension, and that one line matches.
    """
    embedder = LookupEmbedder(init_fastext='examples/data/wiki.ja.vec.small', emb_dim=300, vocab=self.input_reader.vocab)
    # self.assertEqual(embedder.embeddings.shape()[::-1], (self.input_reader.vocab_size(), 300))

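    # Pick one line from the raw fastText file and compare it against the
    # embedding that was loaded for the same word.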
    with open('examples/data/wiki.ja.vec.small', encoding='utf-8') as vecfile:
      test_line = next(islice(vecfile, 9, None)).split()  # Select the vector for '日'
    test_word = test_line[0]
    test_id = self.input_reader.vocab.w2i[test_word]
    test_emb = test_line[1:]

    self.assertTrue(np.allclose(embedder.embeddings.batch([test_id]).npvalue().tolist(),
                                np.array(test_emb, dtype=float).tolist(), rtol=1e-5))
Example #3
class TestTruncatedBatchTraining(unittest.TestCase):
    def setUp(self):
        xnmt.events.clear()
        ParamManager.init_param_col()

        self.src_reader = PlainTextReader(vocab=Vocab(
            vocab_file="examples/data/head.ja.vocab"))
        self.trg_reader = PlainTextReader(vocab=Vocab(
            vocab_file="examples/data/head.en.vocab"))
        self.src_data = list(
            self.src_reader.read_sents("examples/data/head.ja"))
        self.trg_data = list(
            self.trg_reader.read_sents("examples/data/head.en"))

    def assert_single_loss_equals_batch_loss(self,
                                             model,
                                             pad_src_to_multiple=1):
        """
    Tests whether single loss equals batch loss.
    Truncating src / trg sents to same length so no masking is necessary
    """
        batch_size = 5
        src_sents = self.src_data[:batch_size]
        src_min = min([x.sent_len() for x in src_sents])
        src_sents_trunc = [s.words[:src_min] for s in src_sents]
        for single_sent in src_sents_trunc:
            single_sent[src_min - 1] = Vocab.ES
            while len(single_sent) % pad_src_to_multiple != 0:
                single_sent.append(Vocab.ES)
        trg_sents = self.trg_data[:batch_size]
        trg_min = min([x.sent_len() for x in trg_sents])
        trg_sents_trunc = [s.words[:trg_min] for s in trg_sents]
        for single_sent in trg_sents_trunc:
            single_sent[trg_min - 1] = Vocab.ES

        src_sents_trunc = [
            sent.SimpleSentence(words=s) for s in src_sents_trunc
        ]
        trg_sents_trunc = [
            sent.SimpleSentence(words=s) for s in trg_sents_trunc
        ]

        single_loss = 0.0
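        # Accumulate the loss sentence by sentence, using a fresh graph each time.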
        for sent_id in range(batch_size):
            dy.renew_cg()
            train_loss, _ = MLELoss().calc_loss(
                model=model,
                src=src_sents_trunc[sent_id],
                trg=trg_sents_trunc[sent_id]).compute()
            single_loss += train_loss.value()

        dy.renew_cg()

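        # The loss over the whole batch should match the accumulated per-sentence
        # loss up to numerical precision.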
        batched_loss, _ = MLELoss().calc_loss(
            model=model,
            src=mark_as_batch(src_sents_trunc),
            trg=mark_as_batch(trg_sents_trunc)).compute()
        self.assertAlmostEqual(single_loss,
                               np.sum(batched_loss.value()),
                               places=4)

    def test_loss_model1(self):
        layer_dim = 512
        model = DefaultTranslator(
            src_reader=self.src_reader,
            trg_reader=self.trg_reader,
            src_embedder=LookupEmbedder(emb_dim=layer_dim, vocab_size=100),
            encoder=BiLSTMSeqTransducer(input_dim=layer_dim,
                                        hidden_dim=layer_dim),
            attender=MlpAttender(input_dim=layer_dim,
                                 state_dim=layer_dim,
                                 hidden_dim=layer_dim),
            decoder=AutoRegressiveDecoder(
                input_dim=layer_dim,
                embedder=LookupEmbedder(emb_dim=layer_dim, vocab_size=100),
                rnn=UniLSTMSeqTransducer(input_dim=layer_dim,
                                         hidden_dim=layer_dim,
                                         decoder_input_dim=layer_dim,
                                         yaml_path="model.decoder.rnn"),
                transform=NonLinear(input_dim=layer_dim * 2,
                                    output_dim=layer_dim),
                scorer=Softmax(input_dim=layer_dim, vocab_size=100),
                bridge=CopyBridge(dec_dim=layer_dim, dec_layers=1)),
        )
        event_trigger.set_train(False)
        self.assert_single_loss_equals_batch_loss(model)

    def test_loss_model2(self):
        layer_dim = 512
        model = DefaultTranslator(
            src_reader=self.src_reader,
            trg_reader=self.trg_reader,
            src_embedder=LookupEmbedder(emb_dim=layer_dim, vocab_size=100),
            encoder=PyramidalLSTMSeqTransducer(input_dim=layer_dim,
                                               hidden_dim=layer_dim,
                                               layers=3),
            attender=MlpAttender(input_dim=layer_dim,
                                 state_dim=layer_dim,
                                 hidden_dim=layer_dim),
            decoder=AutoRegressiveDecoder(
                input_dim=layer_dim,
                embedder=LookupEmbedder(emb_dim=layer_dim, vocab_size=100),
                rnn=UniLSTMSeqTransducer(input_dim=layer_dim,
                                         hidden_dim=layer_dim,
                                         decoder_input_dim=layer_dim,
                                         yaml_path="model.decoder.rnn"),
                transform=NonLinear(input_dim=layer_dim * 2,
                                    output_dim=layer_dim),
                scorer=Softmax(input_dim=layer_dim, vocab_size=100),
                bridge=CopyBridge(dec_dim=layer_dim, dec_layers=1)),
        )
        event_trigger.set_train(False)
        self.assert_single_loss_equals_batch_loss(model, pad_src_to_multiple=4)

    def test_loss_model3(self):
        layer_dim = 512
        model = DefaultTranslator(
            src_reader=self.src_reader,
            trg_reader=self.trg_reader,
            src_embedder=LookupEmbedder(emb_dim=layer_dim, vocab_size=100),
            encoder=BiLSTMSeqTransducer(input_dim=layer_dim,
                                        hidden_dim=layer_dim,
                                        layers=3),
            attender=MlpAttender(input_dim=layer_dim,
                                 state_dim=layer_dim,
                                 hidden_dim=layer_dim),
            decoder=AutoRegressiveDecoder(
                input_dim=layer_dim,
                embedder=LookupEmbedder(emb_dim=layer_dim, vocab_size=100),
                rnn=UniLSTMSeqTransducer(input_dim=layer_dim,
                                         hidden_dim=layer_dim,
                                         decoder_input_dim=layer_dim,
                                         yaml_path="model.decoder.rnn"),
                transform=NonLinear(input_dim=layer_dim * 2,
                                    output_dim=layer_dim),
                scorer=Softmax(input_dim=layer_dim, vocab_size=100),
                bridge=CopyBridge(dec_dim=layer_dim, dec_layers=1)),
        )
        event_trigger.set_train(False)
        self.assert_single_loss_equals_batch_loss(model)

    def test_loss_model4(self):
        layer_dim = 512
        model = DefaultTranslator(
            src_reader=self.src_reader,
            trg_reader=self.trg_reader,
            src_embedder=LookupEmbedder(emb_dim=layer_dim, vocab_size=100),
            encoder=BiLSTMSeqTransducer(input_dim=layer_dim,
                                        hidden_dim=layer_dim),
            attender=DotAttender(),
            decoder=AutoRegressiveDecoder(
                input_dim=layer_dim,
                embedder=LookupEmbedder(emb_dim=layer_dim, vocab_size=100),
                rnn=UniLSTMSeqTransducer(input_dim=layer_dim,
                                         hidden_dim=layer_dim,
                                         decoder_input_dim=layer_dim,
                                         yaml_path="model.decoder.rnn"),
                transform=NonLinear(input_dim=layer_dim * 2,
                                    output_dim=layer_dim),
                scorer=Softmax(input_dim=layer_dim, vocab_size=100),
                bridge=CopyBridge(dec_dim=layer_dim, dec_layers=1)),
        )
        event_trigger.set_train(False)
        self.assert_single_loss_equals_batch_loss(model)
Example #4
class TestBatchTraining(unittest.TestCase):
    def setUp(self):
        xnmt.events.clear()
        ParamManager.init_param_col()

        self.src_reader = PlainTextReader(vocab=Vocab(
            vocab_file="examples/data/head.ja.vocab"))
        self.trg_reader = PlainTextReader(vocab=Vocab(
            vocab_file="examples/data/head.en.vocab"))
        self.src_data = list(
            self.src_reader.read_sents("examples/data/head.ja"))
        self.trg_data = list(
            self.trg_reader.read_sents("examples/data/head.en"))

    def assert_single_loss_equals_batch_loss(self,
                                             model,
                                             pad_src_to_multiple=1):
        """
    Tests whether single loss equals batch loss.
    Here we don't truncate the target side and use masking.
    """
        batch_size = 5
        src_sents = self.src_data[:batch_size]
        src_min = min([x.sent_len() for x in src_sents])
        src_sents_trunc = [s.words[:src_min] for s in src_sents]
        for single_sent in src_sents_trunc:
            single_sent[src_min - 1] = Vocab.ES
            while len(single_sent) % pad_src_to_multiple != 0:
                single_sent.append(Vocab.ES)
        trg_sents = sorted(self.trg_data[:batch_size],
                           key=lambda x: x.sent_len(),
                           reverse=True)
        trg_max = max([x.sent_len() for x in trg_sents])
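        # Build a target mask in which 1 marks padded positions beyond each
        # sentence's actual length.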
        np_arr = np.zeros([batch_size, trg_max])
        for i in range(batch_size):
            for j in range(trg_sents[i].sent_len(), trg_max):
                np_arr[i, j] = 1.0
        trg_masks = Mask(np_arr)
        trg_sents_padded = [[w for w in s] + [Vocab.ES] *
                            (trg_max - s.sent_len()) for s in trg_sents]

        src_sents_trunc = [
            sent.SimpleSentence(words=s) for s in src_sents_trunc
        ]
        trg_sents_padded = [
            sent.SimpleSentence(words=s) for s in trg_sents_padded
        ]

        single_loss = 0.0
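        # Per-sentence losses use the unpadded targets; the batched loss below
        # uses the padded targets together with the mask.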
        for sent_id in range(batch_size):
            dy.renew_cg()
            train_loss, _ = MLELoss().calc_loss(
                model=model,
                src=src_sents_trunc[sent_id],
                trg=trg_sents[sent_id]).compute()
            single_loss += train_loss.value()

        dy.renew_cg()

        batched_loss, _ = MLELoss().calc_loss(
            model=model,
            src=mark_as_batch(src_sents_trunc),
            trg=mark_as_batch(trg_sents_padded, trg_masks)).compute()
        self.assertAlmostEqual(single_loss,
                               np.sum(batched_loss.value()),
                               places=4)

    def test_loss_model1(self):
        layer_dim = 512
        model = DefaultTranslator(
            src_reader=self.src_reader,
            trg_reader=self.trg_reader,
            src_embedder=LookupEmbedder(emb_dim=layer_dim, vocab_size=100),
            encoder=BiLSTMSeqTransducer(input_dim=layer_dim,
                                        hidden_dim=layer_dim),
            attender=MlpAttender(input_dim=layer_dim,
                                 state_dim=layer_dim,
                                 hidden_dim=layer_dim),
            decoder=AutoRegressiveDecoder(
                input_dim=layer_dim,
                embedder=LookupEmbedder(emb_dim=layer_dim, vocab_size=100),
                rnn=UniLSTMSeqTransducer(input_dim=layer_dim,
                                         hidden_dim=layer_dim,
                                         decoder_input_dim=layer_dim,
                                         yaml_path="model.decoder.rnn"),
                transform=NonLinear(input_dim=layer_dim * 2,
                                    output_dim=layer_dim),
                scorer=Softmax(input_dim=layer_dim, vocab_size=100),
                bridge=CopyBridge(dec_dim=layer_dim, dec_layers=1)),
        )
        event_trigger.set_train(False)
        self.assert_single_loss_equals_batch_loss(model)

    def test_loss_model2(self):
        layer_dim = 512
        model = DefaultTranslator(
            src_reader=self.src_reader,
            trg_reader=self.trg_reader,
            src_embedder=LookupEmbedder(emb_dim=layer_dim, vocab_size=100),
            encoder=PyramidalLSTMSeqTransducer(layers=3,
                                               input_dim=layer_dim,
                                               hidden_dim=layer_dim),
            attender=MlpAttender(input_dim=layer_dim,
                                 state_dim=layer_dim,
                                 hidden_dim=layer_dim),
            decoder=AutoRegressiveDecoder(
                input_dim=layer_dim,
                embedder=LookupEmbedder(emb_dim=layer_dim, vocab_size=100),
                rnn=UniLSTMSeqTransducer(input_dim=layer_dim,
                                         hidden_dim=layer_dim,
                                         decoder_input_dim=layer_dim,
                                         yaml_path="model.decoder.rnn"),
                transform=NonLinear(input_dim=layer_dim * 2,
                                    output_dim=layer_dim),
                scorer=Softmax(input_dim=layer_dim, vocab_size=100),
                bridge=CopyBridge(dec_dim=layer_dim, dec_layers=1)),
        )
        event_trigger.set_train(False)
        self.assert_single_loss_equals_batch_loss(model, pad_src_to_multiple=4)

    def test_loss_model3(self):
        layer_dim = 512
        model = DefaultTranslator(
            src_reader=self.src_reader,
            trg_reader=self.trg_reader,
            src_embedder=LookupEmbedder(emb_dim=layer_dim, vocab_size=100),
            encoder=BiLSTMSeqTransducer(layers=3,
                                        input_dim=layer_dim,
                                        hidden_dim=layer_dim),
            attender=MlpAttender(input_dim=layer_dim,
                                 state_dim=layer_dim,
                                 hidden_dim=layer_dim),
            decoder=AutoRegressiveDecoder(
                input_dim=layer_dim,
                embedder=LookupEmbedder(emb_dim=layer_dim, vocab_size=100),
                rnn=UniLSTMSeqTransducer(input_dim=layer_dim,
                                         hidden_dim=layer_dim,
                                         decoder_input_dim=layer_dim,
                                         yaml_path="model.decoder.rnn"),
                transform=NonLinear(input_dim=layer_dim * 2,
                                    output_dim=layer_dim),
                scorer=Softmax(input_dim=layer_dim, vocab_size=100),
                bridge=CopyBridge(dec_dim=layer_dim, dec_layers=1)),
        )
        event_trigger.set_train(False)
        self.assert_single_loss_equals_batch_loss(model)
Example #5
class TestEncoder(unittest.TestCase):
    def setUp(self):
        events.clear()
        ParamManager.init_param_col()

        src_vocab = Vocab(vocab_file="examples/data/head.ja.vocab")
        trg_vocab = Vocab(vocab_file="examples/data/head.en.vocab")
        self.src_reader = PlainTextReader(vocab=src_vocab)
        self.trg_reader = PlainTextReader(vocab=trg_vocab)
        self.src_data = list(
            self.src_reader.read_sents("examples/data/head.ja"))
        self.trg_data = list(
            self.trg_reader.read_sents("examples/data/head.en"))

    def assert_in_out_len_equal(self, model):
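        # The encoder should produce exactly one output state per input embedding.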
        dy.renew_cg()
        event_trigger.set_train(True)
        src = self.src_data[0]
        event_trigger.start_sent(src)
        embeddings = model.src_embedder.embed_sent(src)
        encodings = model.encoder.transduce(embeddings)
        self.assertEqual(len(embeddings), len(encodings))

    def test_bi_lstm_encoder_len(self):
        layer_dim = 512
        model = DefaultTranslator(
            src_reader=self.src_reader,
            trg_reader=self.trg_reader,
            src_embedder=SimpleWordEmbedder(emb_dim=layer_dim, vocab_size=100),
            encoder=BiLSTMSeqTransducer(input_dim=layer_dim,
                                        hidden_dim=layer_dim,
                                        layers=3),
            attender=MlpAttender(input_dim=layer_dim,
                                 state_dim=layer_dim,
                                 hidden_dim=layer_dim),
            decoder=AutoRegressiveDecoder(
                input_dim=layer_dim,
                embedder=SimpleWordEmbedder(emb_dim=layer_dim, vocab_size=100),
                rnn=UniLSTMSeqTransducer(input_dim=layer_dim,
                                         hidden_dim=layer_dim,
                                         decoder_input_dim=layer_dim,
                                         yaml_path="model.decoder.rnn"),
                transform=NonLinear(input_dim=layer_dim * 2,
                                    output_dim=layer_dim),
                scorer=Softmax(input_dim=layer_dim, vocab_size=100),
                bridge=CopyBridge(dec_dim=layer_dim, dec_layers=1)),
        )
        self.assert_in_out_len_equal(model)

    def test_uni_lstm_encoder_len(self):
        layer_dim = 512
        model = DefaultTranslator(
            src_reader=self.src_reader,
            trg_reader=self.trg_reader,
            src_embedder=SimpleWordEmbedder(emb_dim=layer_dim, vocab_size=100),
            encoder=UniLSTMSeqTransducer(input_dim=layer_dim,
                                         hidden_dim=layer_dim),
            attender=MlpAttender(input_dim=layer_dim,
                                 state_dim=layer_dim,
                                 hidden_dim=layer_dim),
            decoder=AutoRegressiveDecoder(
                input_dim=layer_dim,
                embedder=SimpleWordEmbedder(emb_dim=layer_dim, vocab_size=100),
                rnn=UniLSTMSeqTransducer(input_dim=layer_dim,
                                         hidden_dim=layer_dim,
                                         decoder_input_dim=layer_dim,
                                         yaml_path="model.decoder.rnn"),
                transform=NonLinear(input_dim=layer_dim * 2,
                                    output_dim=layer_dim),
                scorer=Softmax(input_dim=layer_dim, vocab_size=100),
                bridge=CopyBridge(dec_dim=layer_dim, dec_layers=1)),
        )
        self.assert_in_out_len_equal(model)

    # TODO: Update this to the new residual LSTM transducer framework
    # def test_res_lstm_encoder_len(self):
    #   layer_dim = 512
    #   model = DefaultTranslator(
    #     src_reader=self.src_reader,
    #     trg_reader=self.trg_reader,
    #     src_embedder=SimpleWordEmbedder(emb_dim=layer_dim, vocab_size=100),
    #     encoder=ResidualLSTMSeqTransducer(input_dim=layer_dim, hidden_dim=layer_dim, layers=3),
    #     attender=MlpAttender(input_dim=layer_dim, state_dim=layer_dim, hidden_dim=layer_dim),
    #     decoder=AutoRegressiveDecoder(input_dim=layer_dim,
    #                               embedder=SimpleWordEmbedder(emb_dim=layer_dim, vocab_size=100),
    #                               rnn=UniLSTMSeqTransducer(input_dim=layer_dim, hidden_dim=layer_dim, decoder_input_dim=layer_dim, yaml_path="model.decoder.rnn"),
    #                               transform=NonLinear(input_dim=layer_dim*2, output_dim=layer_dim),
    #                               scorer=Softmax(input_dim=layer_dim, vocab_size=100),
    #                               bridge=CopyBridge(dec_dim=layer_dim, dec_layers=1)),
    #   )
    #   self.assert_in_out_len_equal(model)

    def test_py_lstm_encoder_len(self):
        layer_dim = 512
        model = DefaultTranslator(
            src_reader=self.src_reader,
            trg_reader=self.trg_reader,
            src_embedder=SimpleWordEmbedder(emb_dim=layer_dim, vocab_size=100),
            encoder=PyramidalLSTMSeqTransducer(input_dim=layer_dim,
                                               hidden_dim=layer_dim,
                                               layers=3),
            attender=MlpAttender(input_dim=layer_dim,
                                 state_dim=layer_dim,
                                 hidden_dim=layer_dim),
            decoder=AutoRegressiveDecoder(
                input_dim=layer_dim,
                embedder=SimpleWordEmbedder(emb_dim=layer_dim, vocab_size=100),
                rnn=UniLSTMSeqTransducer(input_dim=layer_dim,
                                         hidden_dim=layer_dim,
                                         decoder_input_dim=layer_dim,
                                         yaml_path="model.decoder.rnn"),
                transform=NonLinear(input_dim=layer_dim * 2,
                                    output_dim=layer_dim),
                scorer=Softmax(input_dim=layer_dim, vocab_size=100),
                bridge=CopyBridge(dec_dim=layer_dim, dec_layers=1)),
        )
        event_trigger.set_train(True)
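        # With 3 pyramidal layers the sequence is downsampled by a factor of 4,
        # so inputs are padded to a multiple of 4 and the output length is
        # expected to be the input length divided by 4 (rounded up).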
        for sent_i in range(10):
            dy.renew_cg()
            src = self.src_data[sent_i].create_padded_sent(
                4 - (self.src_data[sent_i].sent_len() % 4))
            event_trigger.start_sent(src)
            embeddings = model.src_embedder.embed_sent(src)
            encodings = model.encoder.transduce(embeddings)
            self.assertEqual(int(math.ceil(len(embeddings) / float(4))),
                             len(encodings))

    def test_py_lstm_mask(self):
        layer_dim = 512
        model = DefaultTranslator(
            src_reader=self.src_reader,
            trg_reader=self.trg_reader,
            src_embedder=SimpleWordEmbedder(emb_dim=layer_dim, vocab_size=100),
            encoder=PyramidalLSTMSeqTransducer(input_dim=layer_dim,
                                               hidden_dim=layer_dim,
                                               layers=1),
            attender=MlpAttender(input_dim=layer_dim,
                                 state_dim=layer_dim,
                                 hidden_dim=layer_dim),
            decoder=AutoRegressiveDecoder(
                input_dim=layer_dim,
                embedder=SimpleWordEmbedder(emb_dim=layer_dim, vocab_size=100),
                rnn=UniLSTMSeqTransducer(input_dim=layer_dim,
                                         hidden_dim=layer_dim,
                                         decoder_input_dim=layer_dim,
                                         yaml_path="model.decoder.rnn"),
                transform=NonLinear(input_dim=layer_dim * 2,
                                    output_dim=layer_dim),
                scorer=Softmax(input_dim=layer_dim, vocab_size=100),
                bridge=CopyBridge(dec_dim=layer_dim, dec_layers=1)),
        )

        batcher = batchers.TrgBatcher(batch_size=3)
        train_src, _ = batcher.pack(self.src_data, self.trg_data)

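        # With a single pyramidal layer there is no downsampling, so the encoder's
        # output mask should match the input mask (or both should be absent).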
        event_trigger.set_train(True)
        for sent_i in range(3):
            dy.renew_cg()
            src = train_src[sent_i]
            event_trigger.start_sent(src)
            embeddings = model.src_embedder.embed_sent(src)
            encodings = model.encoder.transduce(embeddings)
            if train_src[sent_i].mask is None:
                assert encodings.mask is None
            else:
                np.testing.assert_array_almost_equal(
                    train_src[sent_i].mask.np_arr, encodings.mask.np_arr)

    def test_multihead_attention_encoder_len(self):
        layer_dim = 512
        model = DefaultTranslator(
            src_reader=self.src_reader,
            trg_reader=self.trg_reader,
            src_embedder=SimpleWordEmbedder(emb_dim=layer_dim, vocab_size=100),
            encoder=MultiHeadAttentionSeqTransducer(input_dim=layer_dim),
            attender=MlpAttender(input_dim=layer_dim,
                                 state_dim=layer_dim,
                                 hidden_dim=layer_dim),
            decoder=AutoRegressiveDecoder(
                input_dim=layer_dim,
                embedder=SimpleWordEmbedder(emb_dim=layer_dim, vocab_size=100),
                rnn=UniLSTMSeqTransducer(input_dim=layer_dim,
                                         hidden_dim=layer_dim,
                                         decoder_input_dim=layer_dim,
                                         yaml_path="model.decoder.rnn"),
                transform=NonLinear(input_dim=layer_dim * 2,
                                    output_dim=layer_dim),
                scorer=Softmax(input_dim=layer_dim, vocab_size=100),
                bridge=CopyBridge(dec_dim=layer_dim, dec_layers=1)),
        )
        self.assert_in_out_len_equal(model)
Example #6
class TestEmbedder(unittest.TestCase):
  def setUp(self):
    # Seeding
    np.random.seed(2)
    random.seed(2)
    layer_dim = 4
    xnmt.events.clear()
    ParamManager.init_param_col()
    self.src_vocab = Vocab(vocab_file="examples/data/head.ja.vocab")
    self.src_char_vocab = CharVocab(vocab_file="examples/data/head.ja.vocab")
    self.ngram_vocab = Vocab(vocab_file="examples/data/head.ngramcount.ja")
    self.trg_vocab = Vocab(vocab_file="examples/data/head.en.vocab")
    
    self.src_reader = CharFromWordTextReader(vocab=self.src_vocab, char_vocab=self.src_char_vocab)
    self.trg_reader = PlainTextReader(vocab=self.trg_vocab)

    self.layer_dim = layer_dim
    self.src_data = list(self.src_reader.read_sents("examples/data/head.ja"))
    self.trg_data = list(self.trg_reader.read_sents("examples/data/head.en"))
    self.src, self.trg = batchers.TrgBatcher(batch_size=3).pack(self.src_data, self.trg_data)
    dy.renew_cg(immediate_compute=True, check_validity=True)

  def test_lookup_composer(self):
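    # Embed a full batched sentence, then a single word id.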
    embedder = LookupEmbedder(emb_dim=self.layer_dim, vocab_size=100)
    embedder.embed_sent(self.src[1])
    embedder.embed(self.src[1][1][1])
    
  def test_sum_composer(self):
    embedder = CharCompositionEmbedder(emb_dim=self.layer_dim,
                                       composer=SumComposer(),
                                       char_vocab=self.src_char_vocab)
    embedder.embed_sent(self.src[1])

  def test_avg_composer(self):
    embedder = CharCompositionEmbedder(emb_dim=self.layer_dim,
                                       composer=AverageComposer(),
                                       char_vocab=self.src_char_vocab)
    embedder.embed_sent(self.src[1])

  def test_max_composer(self):
    embedder = CharCompositionEmbedder(emb_dim=self.layer_dim,
                                       composer=MaxComposer(),
                                       char_vocab=self.src_char_vocab)
    embedder.embed_sent(self.src[1])
    
  def test_conv_composer(self):
    composer = ConvolutionComposer(ngram_size=2,
                                   transform=NonLinear(self.layer_dim, self.layer_dim, activation="relu"),
                                   embed_dim=self.layer_dim,
                                   hidden_dim=self.layer_dim)
    embedder = CharCompositionEmbedder(emb_dim=self.layer_dim,
                                       composer=composer,
                                       char_vocab=self.src_char_vocab)
    embedder.embed_sent(self.src[1])
    
  def test_transducer_composer(self):
    composer = SeqTransducerComposer(seq_transducer=BiLSTMSeqTransducer(input_dim=self.layer_dim,
                                                                        hidden_dim=self.layer_dim))
    embedder = CharCompositionEmbedder(emb_dim=self.layer_dim,
                                       composer=composer,
                                       char_vocab=self.src_char_vocab)
    event_trigger.set_train(True)
    event_trigger.start_sent(self.src[1])
    embedder.embed_sent(self.src[1])

  def test_bagofwords_embedder(self):
    embedder = BagOfWordsEmbedder(self.layer_dim, char_vocab=self.src_char_vocab, ngram_vocab=self.ngram_vocab, ngram_size=3)
    event_trigger.set_train(True)
    event_trigger.start_sent(self.src[1])
    embedder.embed_sent(self.src[1])

  def test_bagofwords_embedder_with_word_vocab(self):
    embedder = BagOfWordsEmbedder(self.layer_dim, word_vocab=self.src_vocab, ngram_vocab=self.ngram_vocab, ngram_size=3)
    event_trigger.set_train(True)
    event_trigger.start_sent(self.src[1])
    embedder.embed_sent(self.src[1])

  def test_dyer_composer(self):
    composer = DyerHeadComposer(fwd_combinator=UniLSTMSeqTransducer(input_dim=self.layer_dim, hidden_dim=self.layer_dim),
                                bwd_combinator=UniLSTMSeqTransducer(input_dim=self.layer_dim, hidden_dim=self.layer_dim),
                                transform=AuxNonLinear(input_dim=self.layer_dim,
                                                       output_dim=self.layer_dim,
                                                       aux_input_dim=self.layer_dim))
    embedder = CharCompositionEmbedder(emb_dim=self.layer_dim,
                                       composer=composer,
                                       char_vocab=self.src_char_vocab)
    event_trigger.set_train(True)
    event_trigger.start_sent(self.src[1])
    embedder.embed_sent(self.src[1])

  def test_composite_composer(self):
    composer = DyerHeadComposer(fwd_combinator=UniLSTMSeqTransducer(input_dim=self.layer_dim, hidden_dim=self.layer_dim),
                                bwd_combinator=UniLSTMSeqTransducer(input_dim=self.layer_dim, hidden_dim=self.layer_dim),
                                transform=AuxNonLinear(input_dim=self.layer_dim,
                                                       output_dim=self.layer_dim,
                                                       aux_input_dim=self.layer_dim))
    embedder_1 = CharCompositionEmbedder(emb_dim=self.layer_dim,
                                         composer=composer,
                                         char_vocab=self.src_char_vocab)
    embedder_2 = LookupEmbedder(emb_dim=self.layer_dim, vocab_size=100)
    embedder = CompositeEmbedder(embedders=[embedder_1, embedder_2])
    event_trigger.set_train(True)
    event_trigger.start_sent(self.src[1])
    embedder.embed_sent(self.src[1])
    embedder.embed(self.src[1][0].words[0])

  def test_segmented_word(self):
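    # SegmentedWord compares equal to an identical SegmentedWord, to its character
    # id list, and to its word id; differing characters or word id break equality.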
    a = SegmentedWord([1,2,3], 10)
    b = SegmentedWord([1,2,3], 10)
    c = SegmentedWord([2,3,4], 10)
    d = SegmentedWord([1,2,3], 9)

    self.assertEqual(a, b)
    self.assertEqual(a, [1,2,3])
    self.assertEqual(a, 10)
    self.assertNotEqual(a, c)
    self.assertNotEqual(a, d)
    
    self.assertNotEqual(type(self.src[0][0][0]), SegmentedWord)
    self.assertEqual(type(self.src[0][0].words[0]), SegmentedWord)