Esempio n. 1
0
  def setUp(self):
    """Assemble a segmenting translator and one packed batch for each test.

    Seeds numpy and the stdlib RNG with 2, calls ``xnmt.events.clear()`` and
    ``ParamManager.init_param_col()``, builds a ``SegmentingSeqTransducer``
    (policy gradient, epsilon-greedy sampling, Poisson length prior) and
    plugs it into a ``DefaultTranslator``, enables training mode, then reads
    and batches the ``head.ja`` / ``head.en`` example files.

    NOTE(review): components are constructed in the same order as before;
    presumably parameter initialisation depends on it -- confirm before
    reordering anything here.
    """
    # Seed numpy and the stdlib RNG.
    numpy.random.seed(2)
    random.seed(2)
    dim = 64
    xnmt.events.clear()
    ParamManager.init_param_col()

    # Pieces the segmenting encoder is assembled from.
    self.segment_encoder_bilstm = BiLSTMSeqTransducer(input_dim=dim, hidden_dim=dim)
    self.segment_composer = SumComposer()

    # Source side is read through a char vocab file, target side through a word vocab file.
    ja_vocab = Vocab(vocab_file="examples/data/head.ja.charvocab")
    self.src_reader = CharFromWordTextReader(vocab=ja_vocab)
    en_vocab = Vocab(vocab_file="examples/data/head.en.vocab")
    self.trg_reader = PlainTextReader(vocab=en_vocab)
    mle_loss = MLELoss()
    self.loss_calculator = FeedbackLoss(child_loss=mle_loss, repeat=5)

    # Policy-gradient segmenter: baseline has one output, policy has two.
    baseline_net = Linear(input_dim=dim, output_dim=1)
    policy_net = Linear(input_dim=dim, output_dim=2)
    self.poisson_prior = PoissonPrior(mu=3.3)
    self.eps_greedy = EpsilonGreedy(eps_prob=0.0, prior=self.poisson_prior)
    self.conf_penalty = ConfidencePenalty()
    self.policy_gradient = PolicyGradient(
      input_dim=dim,
      output_dim=2,
      baseline=baseline_net,
      policy_network=policy_net,
      z_normalization=True,
      conf_penalty=self.conf_penalty,
    )
    self.length_prior = PoissonLengthPrior(lmbd=3.3, weight=1)
    final_bilstm = BiLSTMSeqTransducer(input_dim=dim, hidden_dim=dim)
    self.segmenting_encoder = SegmentingSeqTransducer(
      embed_encoder=self.segment_encoder_bilstm,
      segment_composer=self.segment_composer,
      final_transducer=final_bilstm,
      policy_learning=self.policy_gradient,
      eps_greedy=self.eps_greedy,
      length_prior=self.length_prior,
    )

    # Translator parts, created in the order the translator receives them.
    src_embedder = SimpleWordEmbedder(emb_dim=dim, vocab_size=100)
    attender = MlpAttender(input_dim=dim, state_dim=dim, hidden_dim=dim)
    trg_embedder = SimpleWordEmbedder(emb_dim=dim, vocab_size=100)
    decoder_rnn = UniLSTMSeqTransducer(input_dim=dim, hidden_dim=dim,
                                       decoder_input_dim=dim, yaml_path="decoder")
    decoder_transform = AuxNonLinear(input_dim=dim, output_dim=dim, aux_input_dim=dim)
    decoder_scorer = Softmax(vocab_size=100, input_dim=dim)
    decoder_bridge = CopyBridge(dec_dim=dim, dec_layers=1)
    decoder = AutoRegressiveDecoder(
      input_dim=dim,
      rnn=decoder_rnn,
      transform=decoder_transform,
      scorer=decoder_scorer,
      trg_embed_dim=dim,
      bridge=decoder_bridge,
    )
    self.model = DefaultTranslator(
      src_reader=self.src_reader,
      trg_reader=self.trg_reader,
      src_embedder=src_embedder,
      encoder=self.segmenting_encoder,
      attender=attender,
      trg_embedder=trg_embedder,
      decoder=decoder,
    )
    event_trigger.set_train(True)

    self.layer_dim = dim
    # Read both sides through the model's readers and pack them into one batch.
    src_sents = self.model.src_reader.read_sents("examples/data/head.ja")
    self.src_data = list(src_sents)
    trg_sents = self.model.trg_reader.read_sents("examples/data/head.en")
    self.trg_data = list(trg_sents)
    packed = batchers.TrgBatcher(batch_size=3).pack(self.src_data, self.trg_data)
    self.src, self.trg = packed
    dy.renew_cg(immediate_compute=True, check_validity=True)
Esempio n. 2
0
 def setUp(self):
   """Load vocabularies, readers and one packed batch for each test.

   Seeds numpy and the stdlib RNG with 2, calls ``xnmt.events.clear()`` and
   ``ParamManager.init_param_col()``, loads the source word/char, n-gram and
   target vocabularies, reads the ``head.ja`` / ``head.en`` example files,
   packs them with a ``TrgBatcher`` of batch size 3, and finally calls
   ``dy.renew_cg`` with immediate computation and validity checks enabled.
   """
   # Seed numpy and the stdlib RNG.
   np.random.seed(2)
   random.seed(2)
   dim = 4
   xnmt.events.clear()
   ParamManager.init_param_col()

   # Vocabularies; the word and char vocab are both built from the same file.
   self.src_vocab = Vocab(vocab_file="examples/data/head.ja.vocab")
   self.src_char_vocab = CharVocab(vocab_file="examples/data/head.ja.vocab")
   self.ngram_vocab = Vocab(vocab_file="examples/data/head.ngramcount.ja")
   self.trg_vocab = Vocab(vocab_file="examples/data/head.en.vocab")

   # Readers built on top of those vocabularies.
   self.src_reader = CharFromWordTextReader(vocab=self.src_vocab,
                                            char_vocab=self.src_char_vocab)
   self.trg_reader = PlainTextReader(vocab=self.trg_vocab)

   self.layer_dim = dim
   src_sents = self.src_reader.read_sents("examples/data/head.ja")
   self.src_data = list(src_sents)
   trg_sents = self.trg_reader.read_sents("examples/data/head.en")
   self.trg_data = list(trg_sents)
   batcher = batchers.TrgBatcher(batch_size=3)
   packed = batcher.pack(self.src_data, self.trg_data)
   self.src, self.trg = packed
   dy.renew_cg(immediate_compute=True, check_validity=True)
Esempio n. 3
0
    def setUp(self):
        """Assemble a minimal segmenting translator and one packed batch.

        Seeds numpy and the stdlib RNG with 2, calls ``xnmt.events.clear()``
        and ``ParamManager.init_param_col()``, builds a
        ``SegmentingSeqTransducer`` from a ``SumComposer`` and a BiLSTM final
        transducer, wraps it in a ``DefaultTranslator``, enables training
        mode, then reads and batches the ``head.ja`` / ``head.en`` files.

        NOTE(review): components are constructed in the same order as
        before; presumably parameter initialisation depends on it -- confirm
        before reordering anything here.
        """
        # Seed numpy and the stdlib RNG.
        numpy.random.seed(2)
        random.seed(2)
        dim = 4
        xnmt.events.clear()
        ParamManager.init_param_col()

        self.segment_composer = SumComposer()
        ja_vocab = Vocab(vocab_file="examples/data/head.ja.charvocab")
        self.src_reader = CharFromWordTextReader(vocab=ja_vocab)
        en_vocab = Vocab(vocab_file="examples/data/head.en.vocab")
        self.trg_reader = PlainTextReader(vocab=en_vocab)
        mle_loss = MLELoss()
        self.loss_calculator = FeedbackLoss(child_loss=mle_loss, repeat=5)

        final_bilstm = BiLSTMSeqTransducer(input_dim=dim, hidden_dim=dim)
        self.segmenting_encoder = SegmentingSeqTransducer(
            segment_composer=self.segment_composer,
            final_transducer=final_bilstm,
        )

        # Translator parts, created in the order the translator receives them.
        src_embedder = SimpleWordEmbedder(emb_dim=dim, vocab_size=100)
        attender = MlpAttender(input_dim=dim, state_dim=dim, hidden_dim=dim)
        decoder_rnn = UniLSTMSeqTransducer(input_dim=dim,
                                           hidden_dim=dim,
                                           decoder_input_dim=dim,
                                           yaml_path="decoder")
        decoder_transform = AuxNonLinear(input_dim=dim,
                                         output_dim=dim,
                                         aux_input_dim=dim)
        decoder_scorer = Softmax(vocab_size=100, input_dim=dim)
        decoder_embedder = SimpleWordEmbedder(emb_dim=dim, vocab_size=100)
        decoder_bridge = CopyBridge(dec_dim=dim, dec_layers=1)
        decoder = AutoRegressiveDecoder(
            input_dim=dim,
            rnn=decoder_rnn,
            transform=decoder_transform,
            scorer=decoder_scorer,
            embedder=decoder_embedder,
            bridge=decoder_bridge,
        )
        self.model = DefaultTranslator(
            src_reader=self.src_reader,
            trg_reader=self.trg_reader,
            src_embedder=src_embedder,
            encoder=self.segmenting_encoder,
            attender=attender,
            decoder=decoder,
        )
        event_trigger.set_train(True)

        self.layer_dim = dim
        # Read both sides through the model's readers and pack one batch.
        src_sents = self.model.src_reader.read_sents("examples/data/head.ja")
        self.src_data = list(src_sents)
        trg_sents = self.model.trg_reader.read_sents("examples/data/head.en")
        self.trg_data = list(trg_sents)
        packed = batchers.TrgBatcher(batch_size=3).pack(self.src_data,
                                                        self.trg_data)
        self.src, self.trg = packed
        dy.renew_cg(immediate_compute=True, check_validity=True)
Esempio n. 4
0
class TestEmbedder(unittest.TestCase):
  def setUp(self):
    # Seeding
    np.random.seed(2)
    random.seed(2)
    layer_dim = 4
    xnmt.events.clear()
    ParamManager.init_param_col()
    self.src_vocab = Vocab(vocab_file="examples/data/head.ja.vocab")
    self.src_char_vocab = CharVocab(vocab_file="examples/data/head.ja.vocab")
    self.ngram_vocab = Vocab(vocab_file="examples/data/head.ngramcount.ja")
    self.trg_vocab = Vocab(vocab_file="examples/data/head.en.vocab")
    
    self.src_reader = CharFromWordTextReader(vocab= self.src_vocab, char_vocab= self.src_char_vocab)
    self.trg_reader = PlainTextReader(vocab=self.trg_vocab)
    
    
    self.layer_dim = layer_dim
    self.src_data = list(self.src_reader.read_sents("examples/data/head.ja"))
    self.trg_data = list(self.trg_reader.read_sents("examples/data/head.en"))
    self.src, self.trg = batchers.TrgBatcher(batch_size=3).pack(self.src_data, self.trg_data)
    dy.renew_cg(immediate_compute=True, check_validity=True)

  def test_lookup_composer(self):
    embedder = LookupEmbedder(emb_dim=self.layer_dim, vocab_size=100)
    embedder.embed_sent(self.src[1])
    embedder.embed(self.src[1][1][1])
    
  def test_sum_composer(self):
    embedder = CharCompositionEmbedder(emb_dim=self.layer_dim,
                                       composer=SumComposer(),
                                       char_vocab=self.src_char_vocab)
    embedder.embed_sent(self.src[1])

  def test_avg_composer(self):
    embedder = CharCompositionEmbedder(emb_dim=self.layer_dim,
                                       composer=AverageComposer(),
                                       char_vocab=self.src_char_vocab)
    embedder.embed_sent(self.src[1])

  def test_max_composer(self):
    embedder = CharCompositionEmbedder(emb_dim=self.layer_dim,
                                       composer=MaxComposer(),
                                       char_vocab=self.src_char_vocab)
    embedder.embed_sent(self.src[1])
    
  def test_conv_composer(self):
    composer = ConvolutionComposer(ngram_size=2,
                                   transform=NonLinear(self.layer_dim, self.layer_dim, activation="relu"),
                                   embed_dim=self.layer_dim,
                                   hidden_dim=self.layer_dim)
    embedder = CharCompositionEmbedder(emb_dim=self.layer_dim,
                                       composer=composer,
                                       char_vocab=self.src_char_vocab)
    embedder.embed_sent(self.src[1])
    
  def test_transducer_composer(self):
    composer = SeqTransducerComposer(seq_transducer=BiLSTMSeqTransducer(input_dim=self.layer_dim,
                                                                        hidden_dim=self.layer_dim))
    embedder = CharCompositionEmbedder(emb_dim=self.layer_dim,
                                       composer=composer,
                                       char_vocab=self.src_char_vocab)
    event_trigger.set_train(True)
    event_trigger.start_sent(self.src[1])
    embedder.embed_sent(self.src[1])

  def test_bagofwords_embedder(self):
    embedder = BagOfWordsEmbedder(self.layer_dim, char_vocab=self.src_char_vocab, ngram_vocab= self.ngram_vocab, ngram_size=3)
    event_trigger.set_train(True)
    event_trigger.start_sent(self.src[1])
    embedder.embed_sent(self.src[1])

  def test_bagofwords_embedder_with_word_vocab(self):
    embedder = BagOfWordsEmbedder(self.layer_dim, word_vocab=self.src_vocab, ngram_vocab= self.ngram_vocab, ngram_size=3)
    event_trigger.set_train(True)
    event_trigger.start_sent(self.src[1])
    embedder.embed_sent(self.src[1])

  def test_dyer_composer(self):
    composer = DyerHeadComposer(fwd_combinator=UniLSTMSeqTransducer(input_dim=self.layer_dim, hidden_dim=self.layer_dim),
                                bwd_combinator=UniLSTMSeqTransducer(input_dim=self.layer_dim, hidden_dim=self.layer_dim),
                                transform=AuxNonLinear(input_dim=self.layer_dim,
                                                       output_dim=self.layer_dim,
                                                       aux_input_dim=self.layer_dim))
    embedder = CharCompositionEmbedder(emb_dim=self.layer_dim,
                                       composer=composer,
                                       char_vocab=self.src_char_vocab)
    event_trigger.set_train(True)
    event_trigger.start_sent(self.src[1])
    embedder.embed_sent(self.src[1])

  def test_composite_composer(self):
    composer = DyerHeadComposer(fwd_combinator=UniLSTMSeqTransducer(input_dim=self.layer_dim, hidden_dim=self.layer_dim),
                                bwd_combinator=UniLSTMSeqTransducer(input_dim=self.layer_dim, hidden_dim=self.layer_dim),
                                transform=AuxNonLinear(input_dim=self.layer_dim,
                                                       output_dim=self.layer_dim,
                                                       aux_input_dim=self.layer_dim))
    embedder_1 = CharCompositionEmbedder(emb_dim=self.layer_dim,
                                       composer=composer,
                                       char_vocab=self.src_char_vocab)
    embedder_2 = LookupEmbedder(emb_dim=self.layer_dim, vocab_size=100)
    embedder = CompositeEmbedder(embedders=[embedder_1, embedder_2])
    event_trigger.set_train(True)
    event_trigger.start_sent(self.src[1])
    embedder.embed_sent(self.src[1])
    embedder.embed(self.src[1][0].words[0])

  def test_segmented_word(self):
    a = SegmentedWord([1,2,3], 10)
    b = SegmentedWord([1,2,3], 10)
    c = SegmentedWord([2,3,4], 10)
    d = SegmentedWord([1,2,3], 9)

    self.assertEqual(a, b)
    self.assertEqual(a, [1,2,3])
    self.assertEqual(a, 10)
    self.assertNotEqual(a, c)
    self.assertNotEqual(a, d)
    
    self.assertNotEqual(type(self.src[0][0][0]), SegmentedWord)
    self.assertEqual(type(self.src[0][0].words[0]), SegmentedWord)