Example #1
src_vocab = Vocab(vocab_file="examples/data/head.ja.vocab")
trg_vocab = Vocab(vocab_file="examples/data/head.en.vocab")

batcher = SrcBatcher(batch_size=64)

inference = AutoRegressiveInference(batcher=InOrderBatcher(batch_size=1))

layer_dim = 512

model = DefaultTranslator(
  src_reader=PlainTextReader(vocab=src_vocab),
  trg_reader=PlainTextReader(vocab=trg_vocab),
  src_embedder=SimpleWordEmbedder(emb_dim=layer_dim, vocab_size=len(src_vocab)),
  encoder=BiLSTMSeqTransducer(input_dim=layer_dim, hidden_dim=layer_dim, layers=1),
  attender=MlpAttender(hidden_dim=layer_dim, state_dim=layer_dim, input_dim=layer_dim),
  decoder=AutoRegressiveDecoder(input_dim=layer_dim,
                                embedder=SimpleWordEmbedder(emb_dim=layer_dim, vocab_size=len(trg_vocab)),
                                rnn=UniLSTMSeqTransducer(input_dim=layer_dim, hidden_dim=layer_dim,
                                                         decoder_input_dim=layer_dim, yaml_path="decoder"),
                                transform=AuxNonLinear(input_dim=layer_dim, output_dim=layer_dim,
                                                       aux_input_dim=layer_dim),
                                scorer=Softmax(vocab_size=len(trg_vocab), input_dim=layer_dim),
                                bridge=CopyBridge(dec_dim=layer_dim, dec_layers=1)),
  inference=inference
)

train = SimpleTrainingRegimen(
  name=f"{EXP}",  # EXP (the experiment name) is assumed to be defined earlier in the script
  model=model,
  # remaining arguments (batcher, trainer, data files, ...) are cut off in the original excerpt
)
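For reference, this snippet assumes xnmt's standard programmatic-API imports. A minimal import header, assuming the module layout of the xnmt repository at the time these examples were written (paths may differ across versions):

from xnmt.batchers import SrcBatcher, InOrderBatcher
from xnmt.inferences import AutoRegressiveInference
from xnmt.input_readers import PlainTextReader
from xnmt.modelparts.attenders import MlpAttender
from xnmt.modelparts.bridges import CopyBridge
from xnmt.modelparts.decoders import AutoRegressiveDecoder
from xnmt.modelparts.embedders import SimpleWordEmbedder
from xnmt.modelparts.scorers import Softmax
from xnmt.modelparts.transforms import AuxNonLinear
from xnmt.models.translators import DefaultTranslator
from xnmt.train.regimens import SimpleTrainingRegimen
from xnmt.transducers.recurrent import BiLSTMSeqTransducer, UniLSTMSeqTransducer
from xnmt.vocabs import Vocab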
Example #2
    def setUp(self):
        # Seeding
        numpy.random.seed(2)
        random.seed(2)
        layer_dim = 4
        xnmt.events.clear()
        ParamManager.init_param_col()
        self.segment_encoder_bilstm = BiLSTMSeqTransducer(input_dim=layer_dim,
                                                          hidden_dim=layer_dim)
        self.segment_composer = SumComposer()

        self.src_reader = CharFromWordTextReader(vocab=Vocab(
            vocab_file="examples/data/head.ja.charvocab"))
        self.trg_reader = PlainTextReader(vocab=Vocab(
            vocab_file="examples/data/head.en.vocab"))
        self.loss_calculator = FeedbackLoss(child_loss=MLELoss(), repeat=5)

        baseline = Linear(input_dim=layer_dim, output_dim=1)
        policy_network = Linear(input_dim=layer_dim, output_dim=2)
        self.poisson_prior = PoissonPrior(mu=3.3)
        self.eps_greedy = EpsilonGreedy(eps_prob=0.0, prior=self.poisson_prior)
        self.conf_penalty = ConfidencePenalty()
        self.policy_gradient = PolicyGradient(input_dim=layer_dim,
                                              output_dim=2,
                                              baseline=baseline,
                                              policy_network=policy_network,
                                              z_normalization=True,
                                              conf_penalty=self.conf_penalty)
        self.length_prior = PoissonLengthPrior(lmbd=3.3, weight=1)
        self.segmenting_encoder = SegmentingSeqTransducer(
            embed_encoder=self.segment_encoder_bilstm,
            segment_composer=self.segment_composer,
            final_transducer=BiLSTMSeqTransducer(input_dim=layer_dim,
                                                 hidden_dim=layer_dim),
            policy_learning=self.policy_gradient,
            eps_greedy=self.eps_greedy,
            length_prior=self.length_prior,
        )

        self.model = DefaultTranslator(
            src_reader=self.src_reader,
            trg_reader=self.trg_reader,
            src_embedder=SimpleWordEmbedder(emb_dim=layer_dim, vocab_size=100),
            encoder=self.segmenting_encoder,
            attender=MlpAttender(input_dim=layer_dim,
                                 state_dim=layer_dim,
                                 hidden_dim=layer_dim),
            decoder=AutoRegressiveDecoder(
                input_dim=layer_dim,
                rnn=UniLSTMSeqTransducer(input_dim=layer_dim,
                                         hidden_dim=layer_dim,
                                         decoder_input_dim=layer_dim,
                                         yaml_path="decoder"),
                transform=AuxNonLinear(input_dim=layer_dim,
                                       output_dim=layer_dim,
                                       aux_input_dim=layer_dim),
                scorer=Softmax(vocab_size=100, input_dim=layer_dim),
                embedder=SimpleWordEmbedder(emb_dim=layer_dim, vocab_size=100),
                bridge=CopyBridge(dec_dim=layer_dim, dec_layers=1)),
        )
        event_trigger.set_train(True)

        self.layer_dim = layer_dim
        self.src_data = list(
            self.model.src_reader.read_sents("examples/data/head.ja"))
        self.trg_data = list(
            self.model.trg_reader.read_sents("examples/data/head.en"))
        my_batcher = batchers.TrgBatcher(batch_size=3)
        self.src, self.trg = my_batcher.pack(self.src_data, self.trg_data)
        dy.renew_cg(immediate_compute=True, check_validity=True)
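A test method built on this fixture would typically exercise the full segmenting pipeline by computing the loss on a packed batch. A minimal sketch, assuming xnmt's LossCalculator.calc_loss(model, src, trg) interface (the test name and body are hypothetical):

    def test_calc_loss(self):
        # hypothetical test body: computing the feedback loss runs the
        # segmenting encoder and decoder end-to-end under immediate compute
        self.loss_calculator.calc_loss(self.model, self.src[0], self.trg[0])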
Example #3
    def test_transducer_composer(self):
        enc = self.segmenting_encoder
        enc.segment_composer = SeqTransducerComposer(
            seq_transducer=BiLSTMSeqTransducer(input_dim=self.layer_dim,
                                               hidden_dim=self.layer_dim))
        event_trigger.set_train(True)
        enc.transduce(self.inp_emb(0))
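The inp_emb helper is defined elsewhere in this test class. A plausible reconstruction, assuming it embeds the idx-th packed source batch with the model's source embedder (hypothetical; the actual helper may differ):

    def inp_emb(self, idx=0):
        # hypothetical helper: embed one packed source batch so the
        # segmenting encoder can be transduced in isolation
        src = self.src[idx]
        event_trigger.start_sent(src)
        return self.model.src_embedder.embed_sent(src)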
Example #4
    def run(self):
        seed = 13
        random.seed(seed)
        np.random.seed(seed)

        EXP_DIR = os.path.dirname(__file__)
        EXP = "annotmined"

        model_file = f"{EXP_DIR}/results/{EXP}.mod"
        log_file = f"{EXP_DIR}/results/{EXP}.log"

        xnmt.tee.set_out_file(log_file, exp_name=EXP)
        # NOTE: this call has no effect; DyNet memory must be configured
        # before the dynet module is imported (see the sketch after this example)
        xnmt.tee.utils.dy.DynetParams().set_mem(1024)
        ParamManager.init_param_col()
        ParamManager.param_col.model_file = model_file

        pre_runner = PreprocRunner(
            tasks=[
                PreprocTokenize(
                    in_files=[
                        f'{EXP_DIR}/conala-corpus/{self.mined_data}.snippet',
                        f'{EXP_DIR}/conala-corpus/{self.mined_data}.intent',
                        f'{EXP_DIR}/conala-corpus/conala-dev.intent',
                        f'{EXP_DIR}/conala-corpus/conala-dev.snippet',
                        f'{EXP_DIR}/conala-corpus/conala-test.intent',
                        f'{EXP_DIR}/conala-corpus/conala-test.snippet',
                    ],
                    out_files=[
                        f'{EXP_DIR}/conala-corpus/{self.mined_data}.tmspm16000.snippet',
                        f'{EXP_DIR}/conala-corpus/{self.mined_data}.tmspm16000.intent',
                        f'{EXP_DIR}/conala-corpus/conala-dev.tmspm16000.intent',
                        f'{EXP_DIR}/conala-corpus/conala-dev.tmspm16000.snippet',
                        f'{EXP_DIR}/conala-corpus/conala-test.tmspm16000.intent',
                        f'{EXP_DIR}/conala-corpus/conala-test.tmspm16000.snippet',
                    ],
                    specs=[{
                        'filenum': 'all',
                        'tokenizers': [
                            SentencepieceTokenizer(
                                hard_vocab_limit=False,
                                train_files=[
                                    f'{EXP_DIR}/conala-corpus/{self.mined_data}.intent',
                                    f'{EXP_DIR}/conala-corpus/{self.mined_data}.snippet',
                                ],
                                vocab_size=self.vocab_size,
                                model_type=self.model_type,
                                # NOTE: unlike the other paths, model_prefix is
                                # relative to the working directory in the original
                                model_prefix=f'conala-corpus/{self.mined_data}.tmspm16000.spm')
                        ]
                    }]),
                PreprocVocab(
                    in_files=[
                        f'{EXP_DIR}/conala-corpus/{self.mined_data}.tmspm16000.intent',
                        f'{EXP_DIR}/conala-corpus/{self.mined_data}.tmspm16000.snippet',
                    ],
                    out_files=[
                        f'{EXP_DIR}/conala-corpus/{self.mined_data}.tmspm16000.intent.vocab',
                        f'{EXP_DIR}/conala-corpus/{self.mined_data}.tmspm16000.snippet.vocab',
                    ],
                    specs=[{
                        'filenum': 'all',
                        'filters': [VocabFiltererFreq(min_freq=self.min_freq)]
                    }])
            ],
            overwrite=False)

        src_vocab = Vocab(
            vocab_file=f'{EXP_DIR}/conala-corpus/{self.mined_data}.tmspm16000.intent.vocab')
        trg_vocab = Vocab(
            vocab_file=f'{EXP_DIR}/conala-corpus/{self.mined_data}.tmspm16000.snippet.vocab')

        batcher = Batcher(batch_size=64)  # NOTE: unused below; training uses WordSrcBatcher instead

        inference = AutoRegressiveInference(
            search_strategy=BeamSearch(
                len_norm=PolynomialNormalization(apply_during_search=True),
                beam_size=5),
            post_process='join-piece')
        layer_dim = self.layer_dim

        if self.embedding == 'SimpleWordEmbedding':
            model = DefaultTranslator(
                src_reader=PlainTextReader(vocab=src_vocab),
                trg_reader=PlainTextReader(vocab=trg_vocab),
                src_embedder=SimpleWordEmbedder(emb_dim=layer_dim,
                                                vocab_size=len(src_vocab)),
                encoder=BiLSTMSeqTransducer(input_dim=layer_dim,
                                            hidden_dim=layer_dim,
                                            layers=self.layers),
                attender=MlpAttender(hidden_dim=layer_dim,
                                     state_dim=layer_dim,
                                     input_dim=layer_dim),
                trg_embedder=SimpleWordEmbedder(emb_dim=layer_dim,
                                                vocab_size=len(trg_vocab)),
                decoder=AutoRegressiveDecoder(
                    input_dim=layer_dim,
                    rnn=UniLSTMSeqTransducer(
                        input_dim=layer_dim,
                        hidden_dim=layer_dim,
                    ),
                    transform=AuxNonLinear(input_dim=layer_dim,
                                           output_dim=layer_dim,
                                           aux_input_dim=layer_dim),
                    scorer=Softmax(vocab_size=len(trg_vocab),
                                   input_dim=layer_dim),
                    trg_embed_dim=layer_dim,
                    input_feeding=False,
                    bridge=CopyBridge(dec_dim=layer_dim)),
                inference=inference)

        else:
            model = DefaultTranslator(
                src_reader=PlainTextReader(vocab=src_vocab),
                trg_reader=PlainTextReader(vocab=trg_vocab),
                src_embedder=PretrainedSimpleWordEmbedder(
                    filename=self.embedding, emb_dim=100, vocab=src_vocab),
                encoder=BiLSTMSeqTransducer(input_dim=layer_dim,
                                            hidden_dim=layer_dim,
                                            layers=self.layers),
                attender=MlpAttender(hidden_dim=layer_dim,
                                     state_dim=layer_dim,
                                     input_dim=layer_dim),
                trg_embedder=PretrainedSimpleWordEmbedder(
                    filename=self.trg_embedding, emb_dim=100, vocab=trg_vocab),
                decoder=AutoRegressiveDecoder(
                    input_dim=layer_dim,
                    rnn=UniLSTMSeqTransducer(
                        input_dim=layer_dim,
                        hidden_dim=layer_dim,
                    ),
                    transform=AuxNonLinear(input_dim=layer_dim,
                                           output_dim=layer_dim,
                                           aux_input_dim=layer_dim),
                    scorer=Softmax(vocab_size=len(trg_vocab),
                                   input_dim=layer_dim),
                    trg_embed_dim=layer_dim,
                    input_feeding=False,
                    bridge=CopyBridge(dec_dim=layer_dim)),
                inference=inference)


        train = SimpleTrainingRegimen(
            name=f"{EXP}",
            model=model,
            batcher=WordSrcBatcher(avg_batch_size=64),
            trainer=AdamTrainer(alpha=self.alpha),
            patience=3,
            lr_decay=0.5,
            restart_trainer=True,
            run_for_epochs=self.epochs,
            src_file=f"{EXP_DIR}/conala-corpus/{self.mined_data}.tmspm16000.intent",
            trg_file=f"{EXP_DIR}/conala-corpus/{self.mined_data}.tmspm16000.snippet",
            dev_tasks=[
                LossEvalTask(
                    src_file=f"{EXP_DIR}/conala-corpus/conala-dev.tmspm16000.intent",
                    ref_file=f"{EXP_DIR}/conala-corpus/conala-dev.tmspm16000.snippet",
                    model=model,
                    batcher=WordSrcBatcher(avg_batch_size=64)),
                AccuracyEvalTask(
                    eval_metrics='bleu',
                    src_file=f"{EXP_DIR}/conala-corpus/conala-dev.tmspm16000.intent",
                    ref_file=f"{EXP_DIR}/conala-corpus/conala-dev.snippet",
                    hyp_file=f"results/{EXP}.dev.hyp",
                    model=model)
            ])

        evaluate = [
            AccuracyEvalTask(
                eval_metrics="bleu",
                src_file=f"{EXP_DIR}/conala-corpus/conala-test.tmspm16000.intent",
                ref_file=f"{EXP_DIR}/conala-corpus/conala-test.snippet",
                hyp_file=f"results/{EXP}.test.hyp",
                inference=inference,
                model=model)
        ]

        standard_experiment = Experiment(
            exp_global=ExpGlobal(
                default_layer_dim=layer_dim,
                dropout=0.3,
                log_file=log_file,
                model_file=model_file),
            name=EXP,
            model=model,
            train=train,
            evaluate=evaluate)

        # run experiment
        standard_experiment(
            save_fct=lambda: save_to_file(model_file, standard_experiment))

        exit()
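On the memory note in Example #4: DyNet reads its memory settings at import time, so calling set_mem() on a fresh DynetParams object after dynet has already been loaded has no effect. A minimal sketch of the supported route, using the dynet_config module (this must run before anything imports dynet, including xnmt):

import dynet_config
dynet_config.set(mem=1024)  # megabytes; must precede any `import dynet`

import xnmt  # xnmt imports dynet internally and picks up the config above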