Code Example #1
File: train.py    Project: TejNote/Korean-English-NMT
def main(config, model_weight=None, opt_weight=None):
    print("==== train.py main =====")
    def print_config(config):
        import pprint
        pp = pprint.PrettyPrinter(indent=4)
        pp.pprint(vars(config))
    print_config(config)

    if config.dsl:
        print("==== train.py config.dsl => Dataloader 실행 =====")
        loader = DataLoader(
            config.train,
            config.valid,
            (config.lang[:12], config.lang[12:]),
            batch_size=config.lm_batch_size,
            device=config.gpu_id,
            max_length=config.max_length,
            dsl=config.dsl,
        )

        from simple_nmt.lm_trainer import LanguageModelTrainer as LMTrainer
        print("==== train.py language_model 2개 실행 =====")
        language_models = [
            LanguageModel(
                len(loader.tgt.vocab),
                config.word_vec_size,
                config.hidden_size,
                n_layers=config.n_layers,
                dropout_p=config.dropout,
            ),
            LanguageModel(
                len(loader.src.vocab),
                config.word_vec_size,
                config.hidden_size,
                n_layers=config.n_layers,
                dropout_p=config.dropout,
            ),
        ]
        print("==== train.py language_model 2개 실행 end =====")
        print("==== train.py models (seq2seq) 2개 실행 =====")
        models = [
            Seq2Seq(
                len(loader.src.vocab),
                config.word_vec_size,
                config.hidden_size,
                len(loader.tgt.vocab),
                n_layers=config.n_layers,
                dropout_p=config.dropout,
            ),
            Seq2Seq(
                len(loader.tgt.vocab),
                config.word_vec_size,
                config.hidden_size,
                len(loader.src.vocab),
                n_layers=config.n_layers,
                dropout_p=config.dropout,
            ),
        ]
        print("==== train.py models (seq2seq) 2개 실행 end =====")

        loss_weights = [
            torch.ones(len(loader.tgt.vocab)),
            torch.ones(len(loader.src.vocab)),
        ]
        loss_weights[0][data_loader.PAD] = .0
        loss_weights[1][data_loader.PAD] = .0

        crits = [
            nn.NLLLoss(weight=loss_weights[0], reduction='none'),
            nn.NLLLoss(weight=loss_weights[1], reduction='none'),
        ]

        print(language_models)
        print(models)
        print(crits)

        if model_weight is not None:
            print("train.py - if model_weight is not None: 에 걸렸다!")
            for model, w in zip(models + language_models, model_weight):
                model.load_state_dict(w)
            print("model의 정체는..? ", model)

        if config.gpu_id >= 0:
            for lm, seq2seq, crit in zip(language_models, models, crits):
                lm.cuda(config.gpu_id)
                seq2seq.cuda(config.gpu_id)
                crit.cuda(config.gpu_id)

        for lm, crit in zip(language_models, crits):
            print("==== train.py for문 lm 모델 하나씩 실행 =====")
            optimizer = optim.Adam(lm.parameters())
            lm_trainer = LMTrainer(config)

            lm_trainer.train(
                lm, crit, optimizer,
                train_loader=loader.train_iter,
                valid_loader=loader.valid_iter,
                src_vocab=loader.src.vocab if lm.vocab_size == len(loader.src.vocab) else None,
                tgt_vocab=loader.tgt.vocab if lm.vocab_size == len(loader.tgt.vocab) else None,
                n_epochs=config.lm_n_epochs,
            )
            print("==== train.py for문 lm 모델 하나씩 실행 =====")


        loader = DataLoader(
            config.train,
            config.valid,
            (config.lang[:12], config.lang[12:]),
            batch_size=config.batch_size,
            device=config.gpu_id,
            max_length=config.max_length,
            # dsl=config.dsl
        )

        from simple_nmt.dual_trainer import DualSupervisedTrainer as DSLTrainer
        dsl_trainer = DSLTrainer(config)

        optimizers = [
            optim.Adam(models[0].parameters()),
            optim.Adam(models[1].parameters()),
        ]

        if opt_weight is not None:
            for opt, w in zip(optimizers, opt_weight):
                opt.load_state_dict(w)

        dsl_trainer.train(
            models,
            language_models,
            crits,
            optimizers,
            train_loader=loader.train_iter,
            valid_loader=loader.valid_iter,
            vocabs=[loader.src.vocab, loader.tgt.vocab],
            n_epochs=config.n_epochs + config.dsl_n_epochs,
            lr_schedulers=None,
        )
    else:
        loader = DataLoader(
            config.train,
            config.valid,
            (config.lang[:12], config.lang[12:]),
            batch_size=config.batch_size,
            device=config.gpu_id,
            max_length=config.max_length,
            dsl=config.dsl
        )

        #from simple_nmt.trainer import MaximumLikelihoodEstimationTrainer as MLETrainer
        # Encoder's embedding layer input size
        input_size = len(loader.src.vocab)
        # Decoder's embedding layer input size and Generator's softmax layer output size
        output_size = len(loader.tgt.vocab)
        # Declare the model
        # if config.use_transformer:
        #     model = Transformer(
        #         input_size,
        #         config.hidden_size,
        #         output_size,
        #         n_splits=config.n_splits,
        #         n_enc_blocks=config.n_layers,
        #         n_dec_blocks=config.n_layers,
        #         dropout_p=config.dropout,
        #     )
        # else:
        model = Seq2Seq(input_size,
                        config.word_vec_size,  # Word embedding vector size
                        config.hidden_size,  # LSTM's hidden vector size
                        output_size,
                        n_layers=config.n_layers,  # number of layers in LSTM
                        dropout_p=config.dropout  # dropout-rate in LSTM
                        )

        # The default weight for each class in the loss is 1, but we don't need the loss
        # for the PAD token, so set its weight to zero.
        loss_weight = torch.ones(output_size)
        loss_weight[data_loader.PAD] = 0.
        # Instead of using Cross-Entropy loss,
        # we can use Negative Log-Likelihood(NLL) loss with log-probability.
        crit = nn.NLLLoss(weight=loss_weight,
                          reduction='sum'
                          )

        print(model)
        print(crit)

        if model_weight is not None:
            model.load_state_dict(model_weight)

        # Pass models to GPU device if it is necessary.
        if config.gpu_id >= 0:
            model.cuda(config.gpu_id)
            crit.cuda(config.gpu_id)

        if config.use_adam:
            if config.use_transformer:
                # optimizer = optim.Adam(model.parameters(), lr=config.hidden_size**(-.5), betas=(.9, .98))
                optimizer = optim.Adam(model.parameters(), lr=config.lr, betas=(.9, .98))
            else: # case of rnn based seq2seq.
                optimizer = optim.Adam(model.parameters(), lr=config.lr)
        else:
            optimizer = optim.SGD(model.parameters(), lr=config.lr)

        if opt_weight is not None and config.use_adam:
            optimizer.load_state_dict(opt_weight)

        if config.use_noam_decay:
            f = lambda step: min((step + 1)**(-.5), (step + 1) * config.lr_n_warmup_steps**(-1.5))
            lr_scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=f)
        else:
            if config.lr_step > 0:
                lr_scheduler = optim.lr_scheduler.MultiStepLR(
                    optimizer,
                    milestones=[i for i in range(max(0, config.lr_decay_start - 1),
                                                 (config.init_epoch - 1) + config.n_epochs,
                                                 config.lr_step)],
                    gamma=config.lr_gamma
                )

                for _ in range(config.init_epoch - 1):
                    lr_scheduler.step()
            else:
                lr_scheduler = None

        print(optimizer)
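
Note on the use_noam_decay branch above: the LambdaLR lambda implements a Noam-style schedule, linear warmup followed by inverse-square-root decay. Below is a minimal, self-contained sketch of that multiplier; the warmup length of 4000 steps is an assumed placeholder for config.lr_n_warmup_steps.

# Sketch only: reproduces the lr_lambda from Example #1 with an assumed warmup length.
n_warmup_steps = 4000  # placeholder for config.lr_n_warmup_steps

def noam_factor(step):
    # Grows linearly with step during warmup, then decays as 1/sqrt(step).
    return min((step + 1) ** (-.5), (step + 1) * n_warmup_steps ** (-1.5))

for step in (0, 1000, 3999, 4000, 16000, 64000):
    print(step, round(noam_factor(step), 6))
# Both terms meet at step == n_warmup_steps - 1, so the multiplier peaks at
# n_warmup_steps ** -0.5; the effective learning rate is config.lr times this factor.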
Code Example #2
File: train.py    Project: back2zion/simple-nmt
def main(config, model_weight=None, opt_weight=None):
    def print_config(config):
        pp = pprint.PrettyPrinter(indent=4)
        pp.pprint(vars(config))

    print_config(config)

    if config.dsl:
        loader = DataLoader(
            config.train,
            config.valid,
            (config.lang[:2], config.lang[-2:]),
            batch_size=config.lm_batch_size,
            device=config.gpu_id,
            max_length=config.max_length,
            dsl=config.dsl,
        )

        language_models = [
            LanguageModel(
                len(loader.tgt.vocab),
                config.word_vec_size,
                config.hidden_size,
                n_layers=config.n_layers,
                dropout_p=config.dropout,
            ),
            LanguageModel(
                len(loader.src.vocab),
                config.word_vec_size,
                config.hidden_size,
                n_layers=config.n_layers,
                dropout_p=config.dropout,
            ),
        ]

        models = [
            Seq2Seq(
                len(loader.src.vocab),
                config.word_vec_size,
                config.hidden_size,
                len(loader.tgt.vocab),
                n_layers=config.n_layers,
                dropout_p=config.dropout,
            ),
            Seq2Seq(
                len(loader.tgt.vocab),
                config.word_vec_size,
                config.hidden_size,
                len(loader.src.vocab),
                n_layers=config.n_layers,
                dropout_p=config.dropout,
            ),
        ]

        loss_weights = [
            torch.ones(len(loader.tgt.vocab)),
            torch.ones(len(loader.src.vocab)),
        ]
        loss_weights[0][data_loader.PAD] = .0
        loss_weights[1][data_loader.PAD] = .0

        crits = [
            nn.NLLLoss(weight=loss_weights[0], reduction='none'),
            nn.NLLLoss(weight=loss_weights[1], reduction='none'),
        ]

        print(language_models)
        print(models)
        print(crits)

        if model_weight is not None:
            for model, w in zip(models + language_models, model_weight):
                model.load_state_dict(w)

        if config.gpu_id >= 0:
            for lm, seq2seq, crit in zip(language_models, models, crits):
                lm.cuda(config.gpu_id)
                seq2seq.cuda(config.gpu_id)
                crit.cuda(config.gpu_id)

        for lm, crit in zip(language_models, crits):
            optimizer = optim.Adam(lm.parameters())
            lm_trainer = LMTrainer(config)

            lm_trainer.train(
                lm,
                crit,
                optimizer,
                train_loader=loader.train_iter,
                valid_loader=loader.valid_iter,
                src_vocab=loader.src.vocab
                if lm.vocab_size == len(loader.src.vocab) else None,
                tgt_vocab=loader.tgt.vocab
                if lm.vocab_size == len(loader.tgt.vocab) else None,
                n_epochs=config.lm_n_epochs,
            )

        loader = DataLoader(config.train,
                            config.valid, (config.lang[:2], config.lang[-2:]),
                            batch_size=config.batch_size,
                            device=config.gpu_id,
                            max_length=config.max_length,
                            dsl=config.dsl)

        dsl_trainer = DSLTrainer(config)

        optimizers = [
            optim.Adam(models[0].parameters()),
            optim.Adam(models[1].parameters()),
        ]

        if opt_weight is not None:
            for opt, w in zip(optimizers, opt_weight):
                opt.load_state_dict(w)

        dsl_trainer.train(
            models,
            language_models,
            crits,
            optimizers,
            train_loader=loader.train_iter,
            valid_loader=loader.valid_iter,
            vocabs=[loader.src.vocab, loader.tgt.vocab],
            n_epochs=config.n_epochs + config.dsl_n_epochs,
            lr_schedulers=None,
        )
    else:
        loader = DataLoader(
            config.train,
            config.valid,
            (config.lang[:2], config.lang[-2:]),
            batch_size=config.batch_size,
            device=-1,  #config.gpu_id,
            max_length=config.max_length,
            dsl=config.dsl)

        # Encoder's embedding layer input size
        input_size = len(loader.src.vocab)
        # Decoder's embedding layer input size and Generator's softmax layer output size
        output_size = len(loader.tgt.vocab)
        # Declare the model
        if config.use_transformer:
            model = Transformer(
                input_size,
                config.hidden_size,
                output_size,
                n_splits=config.n_splits,
                n_enc_blocks=config.n_layers,
                n_dec_blocks=config.n_layers,
                dropout_p=config.dropout,
            )
        else:
            model = Seq2Seq(
                input_size,
                config.word_vec_size,  # Word embedding vector size
                config.hidden_size,  # LSTM's hidden vector size
                output_size,
                n_layers=config.n_layers,  # number of layers in LSTM
                dropout_p=config.dropout  # dropout-rate in LSTM
            )

        # The default weight for each class in the loss is 1, but we don't need the loss
        # for the PAD token, so set its weight to zero.
        loss_weight = torch.ones(output_size)
        loss_weight[data_loader.PAD] = 0.
        # Instead of using Cross-Entropy loss,
        # we can use Negative Log-Likelihood(NLL) loss with log-probability.
        crit = nn.NLLLoss(weight=loss_weight, reduction='sum')

        print(model)
        print(crit)

        if model_weight is not None:
            model.load_state_dict(model_weight)

        # Pass models to GPU device if it is necessary.
        if config.gpu_id >= 0:
            model.cuda(config.gpu_id)
            crit.cuda(config.gpu_id)

        if config.use_adam:
            if config.use_transformer:
                no_decay = ['bias', 'LayerNorm.weight']
                optimizer_grouped_parameters = [{
                    'params': [
                        p for n, p in model.named_parameters()
                        if not any(nd in n for nd in no_decay)
                    ],
                    'weight_decay':
                    0.01
                }, {
                    'params': [
                        p for n, p in model.named_parameters()
                        if any(nd in n for nd in no_decay)
                    ],
                    'weight_decay':
                    0.0
                }]

                optimizer = optim.AdamW(
                    optimizer_grouped_parameters,
                    lr=config.lr,
                )
            else:  # case of rnn based seq2seq.
                optimizer = optim.Adam(model.parameters(), lr=config.lr)
        else:
            optimizer = optim.SGD(model.parameters(), lr=config.lr)

        if opt_weight is not None and config.use_adam:
            optimizer.load_state_dict(opt_weight)

        if config.use_noam_decay:
            n_total_iterations = len(
                loader.train_iter
            ) * config.n_epochs / config.iteration_per_update
            n_warmup_steps = int(n_total_iterations * config.lr_warmup_ratio)
            lr_scheduler = get_linear_schedule_with_warmup(
                optimizer, n_warmup_steps, n_total_iterations)
        else:
            if config.lr_step > 0:
                lr_scheduler = optim.lr_scheduler.MultiStepLR(
                    optimizer,
                    milestones=[
                        i for i in range(max(0, config.lr_decay_start -
                                             1), (config.init_epoch - 1) +
                                         config.n_epochs, config.lr_step)
                    ],
                    gamma=config.lr_gamma)

                for _ in range(config.init_epoch - 1):
                    lr_scheduler.step()
            else:
                lr_scheduler = None

        print(optimizer)

        # Start training. This function is roughly equivalent to the 'fit' function in Keras.
        mle_trainer = SingleTrainer(MaximumLikelihoodEstimationEngine, config)
        mle_trainer.train(
            model,
            crit,
            optimizer,
            train_loader=loader.train_iter,
            valid_loader=loader.valid_iter,
            src_vocab=loader.src.vocab,
            tgt_vocab=loader.tgt.vocab,
            n_epochs=config.n_epochs,
            lr_scheduler=lr_scheduler,
        )

        if config.rl_n_epochs > 0:
            optimizer = optim.SGD(model.parameters(), lr=config.rl_lr)
            #optimizer = optim.Adam(model.parameters(), lr=config.rl_lr)

            mrt_trainer = SingleTrainer(MinimumRiskTrainingEngine, config)

            mrt_trainer.train(
                model,
                None,  # We don't need criterion for MRT.
                optimizer,
                train_loader=loader.train_iter,
                valid_loader=loader.valid_iter,
                src_vocab=loader.src.vocab,
                tgt_vocab=loader.tgt.vocab,
                n_epochs=config.rl_n_epochs,
            )
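
For reference, main() in these examples receives a plain attribute-style config (in the project it typically comes from an argparse parser in the same train.py). The namespace below is a hypothetical illustration: the field names are exactly the attributes Example #2 reads, while every value is a placeholder.

import argparse

# Hypothetical config for Example #2; values are placeholders, not project defaults.
config = argparse.Namespace(
    train='corpus.train', valid='corpus.valid',   # data path prefixes
    lang='enko',                                  # sliced into ('en', 'ko')
    gpu_id=-1,                                    # -1 keeps training on the CPU
    batch_size=64, lm_batch_size=128, max_length=100,
    dsl=False,                                    # False selects the MLE branch
    use_transformer=True, use_adam=True, use_noam_decay=False,
    lr=1e-3, lr_step=0, lr_gamma=.5, lr_decay_start=10,
    lr_warmup_ratio=.1, iteration_per_update=1,
    init_epoch=1, n_epochs=20, lm_n_epochs=5, dsl_n_epochs=10,
    n_layers=4, n_splits=8, hidden_size=512, word_vec_size=256, dropout=.2,
    rl_n_epochs=0, rl_lr=1e-2,
)
# main(config) would then run the non-DSL branch of Example #2.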
Code Example #3
def main(config):
    def print_config(config):
        pp = pprint.PrettyPrinter(indent=4)
        pp.pprint(vars(config))

    print_config(config)

    loader = DataLoader(
        config.train,
        config.valid,
        (config.lang[:2], config.lang[-2:]),
        batch_size=config.batch_size,
        device=-1,
        max_length=config.max_length,
        dsl=True,
    )

    src_vocab_size = len(loader.src.vocab)
    tgt_vocab_size = len(loader.tgt.vocab)

    models = get_models(src_vocab_size, tgt_vocab_size, config)

    crits = get_crits(src_vocab_size,
                      tgt_vocab_size,
                      pad_index=data_loader.PAD)

    if config.gpu_id >= 0:
        for model, crit in zip(models, crits):
            model.cuda(config.gpu_id)
            crit.cuda(config.gpu_id)

    if config.verbose >= 2:
        print(models)

    for model, crit in zip(models, crits):
        optimizer = optim.Adam(model.parameters())
        lm_trainer = LMTrainer(config)

        model = lm_trainer.train(
            model,
            crit,
            optimizer,
            train_loader=loader.train_iter,
            valid_loader=loader.valid_iter,
            src_vocab=loader.src.vocab
            if model.vocab_size == src_vocab_size else None,
            tgt_vocab=loader.tgt.vocab
            if model.vocab_size == tgt_vocab_size else None,
            n_epochs=config.n_epochs,
        )

    torch.save(
        {
            'model': [
                models[0].state_dict(),
                models[1].state_dict(),
            ],
            'config': config,
            'src_vocab': loader.src.vocab,
            'tgt_vocab': loader.tgt.vocab,
        }, config.model_fn)
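
The checkpoint written above bundles both language-model state dicts together with the config and the vocabularies. A minimal sketch of reading it back (not part of the project; the file name is a placeholder):

import torch

saved = torch.load('lm.pth', map_location='cpu')   # placeholder file name
lm_weights = saved['model']       # [models[0].state_dict(), models[1].state_dict()]
train_config = saved['config']    # the argparse config used for training
src_vocab, tgt_vocab = saved['src_vocab'], saved['tgt_vocab']

# The weights can be restored into freshly built models of matching sizes, e.g.:
# models = get_models(len(src_vocab), len(tgt_vocab), train_config)
# for model, w in zip(models, lm_weights):
#     model.load_state_dict(w)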
Code Example #4
File: train.py    Project: ppos78/simple-nmt
            for model, language_model, crit in zip(models, language_models,
                                                   crits):
                model.cuda(config.gpu_id)
                language_model.cuda(config.gpu_id)
                crit.cuda(config.gpu_id)

        if saved_data is None or config.dsl_retrain_lm is True or config.dsl_continue_train_lm is True:
            # In case of resuming the language model training
            if saved_data is not None and config.dsl_continue_train_lm is True:
                for lm, lm_weight in zip(language_models, saved_data['lms']):
                    lm.load_state_dict(lm_weight)

            # Start to train the language models for --lm_n_epochs
            for language_model, crit, is_src in zip(language_models, crits,
                                                    [False, True]):
                lm_trainer = LanguageModelTrainer(language_model,
                                                  crit,
                                                  config=config,
                                                  is_src=is_src)
                lm_trainer.train(loader.train_iter,
                                 loader.valid_iter,
                                 verbose=config.verbose)

                language_model.load_state_dict(
                    lm_trainer.best['model'])  # Pick the best one.
                print('A language model from best epoch %d is loaded.' %
                      lm_trainer.best['epoch'])

            if saved_data is not None:
                saved_data['lms'] = [lm.state_dict() for lm in language_models]
        else:
            print('Skip the language model training.')
Code Example #5
File: train.py    Project: SeonjeongHwang/simple-nmt
def main(config, model_weight=None, opt_weight=None):
    def print_config(config):
        import pprint
        pp = pprint.PrettyPrinter(indent=4)
        pp.pprint(vars(config))

    print_config(config)

    if config.dsl:
        loader = DataLoader(
            config.train,
            config.valid,
            (config.lang[:2], config.lang[-2:]),
            batch_size=config.lm_batch_size,
            device=config.gpu_id,
            max_length=config.max_length,
            dsl=config.dsl,
        )

        from simple_nmt.lm_trainer import LanguageModelTrainer as LMTrainer

        language_models = [
            LanguageModel(
                len(loader.tgt.vocab),
                config.word_vec_size,
                config.hidden_size,
                n_layers=config.n_layers,
                dropout_p=config.dropout,
            ),
            LanguageModel(
                len(loader.src.vocab),
                config.word_vec_size,
                config.hidden_size,
                n_layers=config.n_layers,
                dropout_p=config.dropout,
            ),
        ]

        models = [
            Seq2Seq(
                len(loader.src.vocab),
                config.word_vec_size,
                config.hidden_size,
                len(loader.tgt.vocab),
                n_layers=config.n_layers,
                dropout_p=config.dropout,
            ),
            Seq2Seq(
                len(loader.tgt.vocab),
                config.word_vec_size,
                config.hidden_size,
                len(loader.src.vocab),
                n_layers=config.n_layers,
                dropout_p=config.dropout,
            ),
        ]

        loss_weights = [
            torch.ones(len(loader.tgt.vocab)),
            torch.ones(len(loader.src.vocab)),
        ]
        loss_weights[0][data_loader.PAD] = .0
        loss_weights[1][data_loader.PAD] = .0

        crits = [
            nn.NLLLoss(weight=loss_weights[0], reduction='none'),
            nn.NLLLoss(weight=loss_weights[1], reduction='none'),
        ]

        print(language_models)
        print(models)
        print(crits)

        if model_weight is not None:
            for model, w in zip(models + language_models, model_weight):
                model.load_state_dict(w)

        if config.gpu_id >= 0:
            for lm, seq2seq, crit in zip(language_models, models, crits):
                lm.cuda(config.gpu_id)
                seq2seq.cuda(config.gpu_id)
                crit.cuda(config.gpu_id)

        for lm, crit in zip(language_models, crits):
            optimizer = optim.Adam(lm.parameters())
            lm_trainer = LMTrainer(config)

            lm_trainer.train(
                lm,
                crit,
                optimizer,
                train_loader=loader.train_iter,
                valid_loader=loader.valid_iter,
                src_vocab=loader.src.vocab
                if lm.vocab_size == len(loader.src.vocab) else None,
                tgt_vocab=loader.tgt.vocab
                if lm.vocab_size == len(loader.tgt.vocab) else None,
                n_epochs=config.lm_n_epochs,
            )

        loader = DataLoader(config.train,
                            config.valid, (config.lang[:2], config.lang[-2:]),
                            batch_size=config.batch_size,
                            device=config.gpu_id,
                            max_length=config.max_length,
                            dsl=config.dsl)

        from simple_nmt.dual_trainer import DualSupervisedTrainer as DSLTrainer
        dsl_trainer = DSLTrainer(config)

        optimizers = [
            optim.Adam(models[0].parameters()),
            optim.Adam(models[1].parameters()),
        ]

        if opt_weight is not None:
            for opt, w in zip(optimizers, opt_weight):
                opt.load_state_dict(w)

        dsl_trainer.train(
            models,
            language_models,
            crits,
            optimizers,
            train_loader=loader.train_iter,
            valid_loader=loader.valid_iter,
            vocabs=[loader.src.vocab, loader.tgt.vocab],
            n_epochs=config.n_epochs + config.dsl_n_epochs,
            lr_schedulers=None,
        )
    else:
        loader = DataLoader(config.train,
                            config.valid, (config.lang[:2], config.lang[-2:]),
                            batch_size=config.batch_size,
                            device=config.gpu_id,
                            max_length=config.max_length,
                            dsl=config.dsl)

        from simple_nmt.trainer import MaximumLikelihoodEstimationTrainer as MLETrainer
        # Encoder's embedding layer input size
        input_size = len(loader.src.vocab)
        # Decoder's embedding layer input size and Generator's softmax layer output size
        output_size = len(loader.tgt.vocab)
        # Declare the model
        if config.use_transformer:
            model = Transformer(
                input_size,
                config.hidden_size,
                output_size,
                n_splits=config.n_splits,
                n_enc_blocks=config.n_layers,
                n_dec_blocks=config.n_layers,
                dropout_p=config.dropout,
            )
        else:
            model = Seq2Seq(
                input_size,
                config.word_vec_size,  # Word embedding vector size
                config.hidden_size,  # LSTM's hidden vector size
                output_size,
                n_layers=config.n_layers,  # number of layers in LSTM
                dropout_p=config.dropout  # dropout-rate in LSTM
            )

        # The default weight for each class in the loss is 1, but we don't need the loss
        # for the PAD token, so set its weight to zero.
        loss_weight = torch.ones(output_size)
        loss_weight[data_loader.PAD] = 0.
        # Instead of using Cross-Entropy loss,
        # we can use Negative Log-Likelihood(NLL) loss with log-probability.
        crit = nn.NLLLoss(weight=loss_weight, reduction='sum')

        print(model)
        print(crit)

        if model_weight is not None:
            model.load_state_dict(model_weight)

        # Pass models to GPU device if it is necessary.
        if config.gpu_id >= 0:
            model.cuda(config.gpu_id)
            crit.cuda(config.gpu_id)

        if config.use_adam:
            if config.use_transformer:
                optimizer = optim.Adam(model.parameters(),
                                       lr=config.lr,
                                       betas=(.9, .98))
            else:  # case of rnn based seq2seq.
                optimizer = optim.Adam(model.parameters(), lr=config.lr)
        else:
            optimizer = optim.SGD(model.parameters(), lr=config.lr)

        if opt_weight is not None and config.use_adam:
            optimizer.load_state_dict(opt_weight)

        if config.lr_step > 0:
            lr_scheduler = optim.lr_scheduler.MultiStepLR(
                optimizer,
                milestones=[
                    i for i in range(max(0, config.lr_decay_start -
                                         1), config.n_epochs, config.lr_step)
                ],
                gamma=config.lr_gamma)
        else:
            lr_scheduler = None

        print(optimizer)

        # Start training. This function is roughly equivalent to the 'fit' function in Keras.
        mle_trainer = MLETrainer(config)
        mle_trainer.train(
            model,
            crit,
            optimizer,
            train_loader=loader.train_iter,
            valid_loader=loader.valid_iter,
            src_vocab=loader.src.vocab,
            tgt_vocab=loader.tgt.vocab,
            n_epochs=config.n_epochs,
            lr_scheduler=lr_scheduler,
        )

        if config.rl_n_epochs > 0:
            optimizer = optim.SGD(model.parameters(), lr=config.rl_lr)

            from simple_nmt.rl_trainer import MinimumRiskTrainer
            mrt_trainer = MinimumRiskTrainer(config)

            mrt_trainer.train(
                model,
                crit,
                optimizer,
                train_loader=loader.train_iter,
                valid_loader=loader.valid_iter,
                src_vocab=loader.src.vocab,
                tgt_vocab=loader.tgt.vocab,
                n_epochs=config.rl_n_epochs,
            )
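
Every example above masks the PAD token out of the loss by zeroing its class weight and feeds log-probabilities to nn.NLLLoss. A small self-contained sketch of that pattern with toy sizes (the real vocabulary size and data_loader.PAD index come from the corpus):

import torch
import torch.nn as nn
import torch.nn.functional as F

vocab_size, PAD = 10, 1                    # toy values for illustration
loss_weight = torch.ones(vocab_size)
loss_weight[PAD] = 0.                      # PAD targets contribute nothing to the loss
crit = nn.NLLLoss(weight=loss_weight, reduction='sum')

logits = torch.randn(4, vocab_size)        # 4 decoder time-steps
log_probs = F.log_softmax(logits, dim=-1)  # NLLLoss expects log-probabilities
targets = torch.tensor([3, PAD, 7, 2])     # the PAD position is ignored
print(crit(log_probs, targets))            # sum of the 3 non-PAD negative log-likelihoods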
Code Example #6
File: dual_train.py    Project: JudePark96/simple-nmt
def main(config, model_weight=None, opt_weight=None):
    def print_config(config):
        pp = pprint.PrettyPrinter(indent=4)
        pp.pprint(vars(config))

    print_config(config)

    loader = DataLoader(
        config.train,
        config.valid,
        (config.lang[:2], config.lang[-2:]),
        batch_size=config.lm_batch_size,
        device=-1,
        max_length=config.max_length,
        dsl=True,
    )

    src_vocab_size = len(loader.src.vocab)
    tgt_vocab_size = len(loader.tgt.vocab)

    language_models, models = get_models(src_vocab_size, tgt_vocab_size,
                                         config)

    crits = get_crits(src_vocab_size,
                      tgt_vocab_size,
                      pad_index=data_loader.PAD)

    if model_weight is not None:
        for model, w in zip(models + language_models, model_weight):
            model.load_state_dict(w)

    if config.gpu_id >= 0:
        for lm, seq2seq, crit in zip(language_models, models, crits):
            lm.cuda(config.gpu_id)
            seq2seq.cuda(config.gpu_id)
            crit.cuda(config.gpu_id)

    for lm, crit in zip(language_models, crits):
        optimizer = optim.Adam(lm.parameters())
        lm_trainer = LMTrainer(config)

        lm_trainer.train(
            lm,
            crit,
            optimizer,
            train_loader=loader.train_iter,
            valid_loader=loader.valid_iter,
            src_vocab=loader.src.vocab
            if lm.vocab_size == src_vocab_size else None,
            tgt_vocab=loader.tgt.vocab
            if lm.vocab_size == tgt_vocab_size else None,
            n_epochs=config.lm_n_epochs,
        )

    loader = DataLoader(
        config.train,
        config.valid,
        (config.lang[:2], config.lang[-2:]),
        batch_size=config.batch_size,
        device=-1,
        max_length=config.max_length,
        dsl=True,
    )

    dsl_trainer = DSLTrainer(config)

    if config.use_transformer:
        optimizers = [
            custom_optim.RAdam(models[0].parameters(), lr=1e-3),
            custom_optim.RAdam(models[1].parameters(), lr=1e-3),
        ]
    else:
        optimizers = [
            optim.Adam(models[0].parameters()),
            optim.Adam(models[1].parameters()),
        ]

    if config.verbose >= 2:
        print(language_models)
        print(models)
        print(crits)
        print(optimizers)

    if opt_weight is not None:
        for opt, w in zip(optimizers, opt_weight):
            opt.load_state_dict(w)

    dsl_trainer.train(
        models,
        language_models,
        crits,
        optimizers,
        train_loader=loader.train_iter,
        valid_loader=loader.valid_iter,
        vocabs=[loader.src.vocab, loader.tgt.vocab],
        n_epochs=config.n_epochs,
        lr_schedulers=None,
    )
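
To resume dual supervised training with Example #6, model_weight must be a list of four state dicts ordered to match zip(models + language_models, model_weight), and opt_weight a list matching the two optimizers. The checkpoint layout below is an assumption used only for illustration:

import torch

saved = torch.load('dual.pth', map_location='cpu')   # placeholder file name
model_weight = saved['model']   # assumed order: [seq2seq x->y, seq2seq y->x, lm_y, lm_x]
opt_weight = saved['opt']       # assumed: the two optimizer state dicts

# main(config, model_weight=model_weight, opt_weight=opt_weight)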