Example #1
def main(config, model_weight=None, opt_weight=None):
    print("==== train.py main =====")
    def print_config(config):
        import pprint
        pp = pprint.PrettyPrinter(indent=4)
        pp.pprint(vars(config))
    print_config(config)

    if config.dsl:
        print("==== train.py config.dsl => Dataloader 실행 =====")
        loader = DataLoader(
            config.train,
            config.valid,
            (config.lang[:2], config.lang[-2:]),
            batch_size=config.lm_batch_size,
            device=config.gpu_id,
            max_length=config.max_length,
            dsl=config.dsl,
        )

        from simple_nmt.lm_trainer import LanguageModelTrainer as LMTrainer
        print("==== train.py language_model 2개 실행 =====")
        language_models = [
            LanguageModel(
                len(loader.tgt.vocab),
                config.word_vec_size,
                config.hidden_size,
                n_layers=config.n_layers,
                dropout_p=config.dropout,
            ),
            LanguageModel(
                len(loader.src.vocab),
                config.word_vec_size,
                config.hidden_size,
                n_layers=config.n_layers,
                dropout_p=config.dropout,
            ),
        ]
        print("==== train.py language_model 2개 실행 end =====")
        print("==== train.py models (seq2seq) 2개 실행 =====")
        models = [
            Seq2Seq(
                len(loader.src.vocab),
                config.word_vec_size,
                config.hidden_size,
                len(loader.tgt.vocab),
                n_layers=config.n_layers,
                dropout_p=config.dropout,
            ),
            Seq2Seq(
                len(loader.tgt.vocab),
                config.word_vec_size,
                config.hidden_size,
                len(loader.src.vocab),
                n_layers=config.n_layers,
                dropout_p=config.dropout,
            ),
        ]
        print("==== train.py models (seq2seq) 2개 실행 end =====")

        loss_weights = [
            torch.ones(len(loader.tgt.vocab)),
            torch.ones(len(loader.src.vocab)),
        ]
        loss_weights[0][data_loader.PAD] = .0
        loss_weights[1][data_loader.PAD] = .0

        crits = [
            nn.NLLLoss(weight=loss_weights[0], reduction='none'),
            nn.NLLLoss(weight=loss_weights[1], reduction='none'),
        ]

        print(language_models)
        print(models)
        print(crits)

        if model_weight is not None:
            print("train.py - if model_weight is not None: 에 걸렸다!")
            for model, w in zip(models + language_models, model_weight):
                model.load_state_dict(w)
            print("model의 정체는..? ", model)

        if config.gpu_id >= 0:
            for lm, seq2seq, crit in zip(language_models, models, crits):
                lm.cuda(config.gpu_id)
                seq2seq.cuda(config.gpu_id)
                crit.cuda(config.gpu_id)

        for lm, crit in zip(language_models, crits):
            print("==== train.py for문 lm 모델 하나씩 실행 =====")
            optimizer = optim.Adam(lm.parameters())
            lm_trainer = LMTrainer(config)

            lm_trainer.train(
                lm, crit, optimizer,
                train_loader=loader.train_iter,
                valid_loader=loader.valid_iter,
                src_vocab=loader.src.vocab if lm.vocab_size == len(loader.src.vocab) else None,
                tgt_vocab=loader.tgt.vocab if lm.vocab_size == len(loader.tgt.vocab) else None,
                n_epochs=config.lm_n_epochs,
            )
            print("==== train.py for문 lm 모델 하나씩 실행 =====")


        loader = DataLoader(
            config.train,
            config.valid,
            (config.lang[:2], config.lang[-2:]),
            batch_size=config.batch_size,
            device=config.gpu_id,
            max_length=config.max_length,
            # dsl=config.dsl
        )

        from simple_nmt.dual_trainer import DualSupervisedTrainer as DSLTrainer
        dsl_trainer = DSLTrainer(config)

        optimizers = [
            optim.Adam(models[0].parameters()),
            optim.Adam(models[1].parameters()),
        ]

        if opt_weight is not None:
            for opt, w in zip(optimizers, opt_weight):
                opt.load_state_dict(w)

        dsl_trainer.train(
            models,
            language_models,
            crits,
            optimizers,
            train_loader=loader.train_iter,
            valid_loader=loader.valid_iter,
            vocabs=[loader.src.vocab, loader.tgt.vocab],
            n_epochs=config.n_epochs + config.dsl_n_epochs,
            lr_schedulers=None,
        )
    else:
        loader = DataLoader(
            config.train,
            config.valid,
            (config.lang[:2], config.lang[-2:]),
            batch_size=config.batch_size,
            device=config.gpu_id,
            max_length=config.max_length,
            dsl=config.dsl
        )

        #from simple_nmt.trainer import MaximumLikelihoodEstimationTrainer as MLETrainer
        # Encoder's embedding layer input size
        input_size = len(loader.src.vocab)
        # Decoder's embedding layer input size and Generator's softmax layer output size
        output_size = len(loader.tgt.vocab)
        # Declare the model
        # if config.use_transformer:
        #     model = Transformer(
        #         input_size,
        #         config.hidden_size,
        #         output_size,
        #         n_splits=config.n_splits,
        #         n_enc_blocks=config.n_layers,
        #         n_dec_blocks=config.n_layers,
        #         dropout_p=config.dropout,
        #     )
        # else:
        model = Seq2Seq(input_size,
                        config.word_vec_size,  # Word embedding vector size
                        config.hidden_size,  # LSTM's hidden vector size
                        output_size,
                        n_layers=config.n_layers,  # number of layers in LSTM
                        dropout_p=config.dropout  # dropout-rate in LSTM
                        )

        # Default weight for loss equals to 1, but we don't need to get loss for PAD token.
        # Thus, set a weight for PAD to zero.
        loss_weight = torch.ones(output_size)
        loss_weight[data_loader.PAD] = 0.
        # Instead of using Cross-Entropy loss,
        # we can use Negative Log-Likelihood(NLL) loss with log-probability.
        crit = nn.NLLLoss(weight=loss_weight,
                          reduction='sum'
                          )

        print(model)
        print(crit)

        if model_weight is not None:
            model.load_state_dict(model_weight)

        # Pass models to GPU device if it is necessary.
        if config.gpu_id >= 0:
            model.cuda(config.gpu_id)
            crit.cuda(config.gpu_id)

        if config.use_adam:
            if config.use_transformer:
                # optimizer = optim.Adam(model.parameters(), lr=config.hidden_size**(-.5), betas=(.9, .98))
                optimizer = optim.Adam(model.parameters(), lr=config.lr, betas=(.9, .98))
            else: # case of RNN-based seq2seq.
                optimizer = optim.Adam(model.parameters(), lr=config.lr)
        else:
            optimizer = optim.SGD(model.parameters(), lr=config.lr)

        if opt_weight is not None and config.use_adam:
            optimizer.load_state_dict(opt_weight)

        if config.use_noam_decay:
            f = lambda step: min((step + 1)**(-.5), (step + 1) * config.lr_n_warmup_steps**(-1.5))
            lr_scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=f)
        else:
            if config.lr_step > 0:
                lr_scheduler = optim.lr_scheduler.MultiStepLR(
                    optimizer,
                    milestones=[i for i in range(max(0, config.lr_decay_start - 1),
                                                 (config.init_epoch - 1) + config.n_epochs,
                                                 config.lr_step)],
                    gamma=config.lr_gamma
                )

                for _ in range(config.init_epoch - 1):
                    lr_scheduler.step()
            else:
                lr_scheduler = None

        print(optimizer)
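
The use_noam_decay branch above multiplies the base learning rate by min((step + 1)**(-.5), (step + 1) * config.lr_n_warmup_steps**(-1.5)): a linear warmup followed by an inverse-square-root decay. A minimal, self-contained sketch of that behaviour; the warmup length and base learning rate below are illustrative stand-ins, not values from any config:

# Sketch of the Noam-style warmup/decay used with LambdaLR above.
# n_warmup_steps and base_lr are illustrative; in train.py they come from config.
import torch.nn as nn
import torch.optim as optim

n_warmup_steps = 4000
base_lr = 1e-3

dummy = nn.Linear(8, 8)  # any parameters will do for the demonstration
optimizer = optim.Adam(dummy.parameters(), lr=base_lr)
f = lambda step: min((step + 1)**(-.5), (step + 1) * n_warmup_steps**(-1.5))
lr_scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=f)

for step in (0, 100, 1000, 3999, 10000, 100000):
    # LambdaLR sets lr = base_lr * f(step); the factor grows linearly until
    # step is roughly n_warmup_steps, then decays as 1 / sqrt(step).
    print(step, base_lr * f(step))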
Example #2
    if saved_data:
        loader.load_vocab(saved_data['src_vocab'], saved_data['tgt_vocab'])
    elif config.tgt_vocab_path:
        loader.load_target_vocab(pickle.load(open(config.tgt_vocab_path,
                                                  'rb')))

    print(loader.isSrcPremise.vocab.itos, file=sys.stderr)
    # Encoder's embedding layer input size
    input_size = len(loader.src.vocab)
    # Decoder's embedding layer input size and Generator's softmax layer output size
    output_size = len(loader.tgt.vocab)
    # Declare the model
    model = Seq2Seq(
        input_size,
        config.word_vec_dim,  # Word embedding vector size
        config.hidden_size,  # LSTM's hidden vector size
        output_size,
        n_layers=config.n_layers,  # number of layers in LSTM
        dropout_p=config.dropout  # dropout-rate in LSTM
    )

    # Default weight for loss equals to 1, but we don't need to get loss for PAD token.
    # Thus, set a weight for PAD to zero.
    loss_weight = torch.ones(output_size)
    loss_weight[data_loader.PAD] = 0.
    # Instead of using Cross-Entropy loss, we can use Negative Log-Likelihood(NLL) loss with log-probability.
    criterion = nn.NLLLoss(weight=loss_weight, reduction='sum')

    if not config.pretrain:
        assert config.reward_mode in [
            'nli', 'bleu', 'combined'
        ], "the reward mode should be one of ['nli', 'bleu', 'combined']"
    # if train_config.use_transformer:
    #     model = Transformer(
    #         input_size,
    #         train_config.hidden_size,
    #         output_size,
    #         n_splits=train_config.n_splits,
    #         n_enc_blocks=train_config.n_layers,
    #         n_dec_blocks=train_config.n_layers,
    #         dropout_p=train_config.dropout,
    #     )

    model = Seq2Seq(
        input_size,
        train_config.word_vec_size,
        train_config.hidden_size,
        output_size,
        n_layers=train_config.n_layers,
        dropout_p=train_config.dropout,
        #search=SingleBeamSearchSpace()
    )
    print("===== 18 model :  =====", model)

    if train_config.dsl:
        if not is_reverse:
            print("===== 19 if not is_reverse =====")
            model.load_state_dict(saved_data['model'][0])
        else:
            print("===== 20 if not is_reverse ELSE=====")
            model.load_state_dict(saved_data['model'][1])
    else:
        print("===== 21 train_config.dsl ELSE =====")
Example #4
import torch
import torch.nn as nn
import torch.optim as optim

from simple_nmt.encoder import Encoder
from simple_nmt.decoder import Decoder
from simple_nmt.seq2seq import Seq2Seq
from data_loader import DataLoader
#from train import
from hyperparams import Hyperparams

if __name__ == "__main__":
    hparams = Hyperparams()
    cuda = hparams.use_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if cuda else 'cpu')

    # NOTE: the constructor arguments are omitted in this sketch; Encoder and Decoder
    # still need their embedding/hidden-size hyperparameters from hparams.
    enc = Encoder()
    dec = Decoder()
    model = Seq2Seq(enc, dec)

    model.flatten_parameters()  # flatten the RNN weights into one contiguous block of memory
    model = nn.DataParallel(model).to(device)

    optimizer = optim.Adam(model.module.parameters(), lr=hparams.lr)

    # `criterion` is the conventional variable name for the loss function. `reduction`
    # controls how per-token losses are aggregated: the default is 'mean', but 'sum' is
    # said to be faster ('mean' is the more exact choice).
    # `ignore_index` is the target index to skip when computing the loss: PAD positions
    # do not need a loss. (PAD_token is assumed to be defined with the vocabulary.)
    criterion = nn.CrossEntropyLoss(reduction='sum',
                                    ignore_index=PAD_token).to(device)

    #########
    # Data loading & vectorization
    # (load the input (Korean sentences) and target (English sentences), and turn them
    #  into vectors (sentencepiece, embedding))
    #########
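
The comment block above is about keeping PAD out of the loss; a minimal sketch showing that `ignore_index` really does drop those positions. The PAD index of 0 and the tiny tensors are purely illustrative:

# Check that ignore_index removes PAD positions from the summed loss.
# PAD_token = 0 is an assumption for this sketch; in the project it comes from the vocabulary.
import torch
import torch.nn as nn

PAD_token = 0
logits = torch.randn(5, 7)                             # 5 time steps, vocabulary of size 7
targets = torch.tensor([3, 1, PAD_token, 4, PAD_token])

masked = nn.CrossEntropyLoss(reduction='sum', ignore_index=PAD_token)(logits, targets)

# The same value computed by hand: keep only the non-PAD positions.
keep = targets != PAD_token
by_hand = nn.CrossEntropyLoss(reduction='sum')(logits[keep], targets[keep])
print(torch.allclose(masked, by_hand))                 # True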
Example #5
    _print_config(config)

    # Load training and validation data set.
    loader = DataLoader(config.train,
                        config.valid, (config.lang[:2], config.lang[-2:]),
                        batch_size=config.batch_size,
                        device=config.gpu_id,
                        max_length=config.max_length,
                        dsl=config.dsl)

    if config.dsl:  # In case the dual supervised training mode is turned on.
        # Because we must train both models at the same time, we need to declare both of them.
        models = [
            Seq2Seq(len(loader.src.vocab),
                    config.word_vec_dim,
                    config.hidden_size,
                    len(loader.tgt.vocab),
                    n_layers=config.n_layers,
                    dropout_p=config.dropout),
            Seq2Seq(len(loader.tgt.vocab),
                    config.word_vec_dim,
                    config.hidden_size,
                    len(loader.src.vocab),
                    n_layers=config.n_layers,
                    dropout_p=config.dropout)
        ]
        # Because we also need P(src) and P(tgt), we need language models consisting of LSTMs.
        language_models = [
            LanguageModel(len(loader.tgt.vocab),
                          config.word_vec_dim,
                          config.hidden_size,
                          n_layers=config.n_layers,
Example #6
def main(config, model_weight=None, opt_weight=None):
    def print_config(config):
        import pprint
        pp = pprint.PrettyPrinter(indent=4)
        pp.pprint(vars(config))

    print_config(config)

    if config.dsl:
        loader = DataLoader(
            config.train,
            config.valid,
            (config.lang[:2], config.lang[-2:]),
            batch_size=config.lm_batch_size,
            device=config.gpu_id,
            max_length=config.max_length,
            dsl=config.dsl,
        )

        from simple_nmt.lm_trainer import LanguageModelTrainer as LMTrainer

        language_models = [
            LanguageModel(
                len(loader.tgt.vocab),
                config.word_vec_size,
                config.hidden_size,
                n_layers=config.n_layers,
                dropout_p=config.dropout,
            ),
            LanguageModel(
                len(loader.src.vocab),
                config.word_vec_size,
                config.hidden_size,
                n_layers=config.n_layers,
                dropout_p=config.dropout,
            ),
        ]

        models = [
            Seq2Seq(
                len(loader.src.vocab),
                config.word_vec_size,
                config.hidden_size,
                len(loader.tgt.vocab),
                n_layers=config.n_layers,
                dropout_p=config.dropout,
            ),
            Seq2Seq(
                len(loader.tgt.vocab),
                config.word_vec_size,
                config.hidden_size,
                len(loader.src.vocab),
                n_layers=config.n_layers,
                dropout_p=config.dropout,
            ),
        ]

        loss_weights = [
            torch.ones(len(loader.tgt.vocab)),
            torch.ones(len(loader.src.vocab)),
        ]
        loss_weights[0][data_loader.PAD] = .0
        loss_weights[1][data_loader.PAD] = .0

        crits = [
            nn.NLLLoss(weight=loss_weights[0], reduction='none'),
            nn.NLLLoss(weight=loss_weights[1], reduction='none'),
        ]

        print(language_models)
        print(models)
        print(crits)

        if model_weight is not None:
            for model, w in zip(models + language_models, model_weight):
                model.load_state_dict(w)

        if config.gpu_id >= 0:
            for lm, seq2seq, crit in zip(language_models, models, crits):
                lm.cuda(config.gpu_id)
                seq2seq.cuda(config.gpu_id)
                crit.cuda(config.gpu_id)

        for lm, crit in zip(language_models, crits):
            optimizer = optim.Adam(lm.parameters())
            lm_trainer = LMTrainer(config)

            lm_trainer.train(
                lm,
                crit,
                optimizer,
                train_loader=loader.train_iter,
                valid_loader=loader.valid_iter,
                src_vocab=loader.src.vocab
                if lm.vocab_size == len(loader.src.vocab) else None,
                tgt_vocab=loader.tgt.vocab
                if lm.vocab_size == len(loader.tgt.vocab) else None,
                n_epochs=config.lm_n_epochs,
            )

        loader = DataLoader(config.train,
                            config.valid, (config.lang[:2], config.lang[-2:]),
                            batch_size=config.batch_size,
                            device=config.gpu_id,
                            max_length=config.max_length,
                            dsl=config.dsl)

        from simple_nmt.dual_trainer import DualSupervisedTrainer as DSLTrainer
        dsl_trainer = DSLTrainer(config)

        optimizers = [
            optim.Adam(models[0].parameters()),
            optim.Adam(models[1].parameters()),
        ]

        if opt_weight is not None:
            for opt, w in zip(optimizers, opt_weight):
                opt.load_state_dict(w)

        dsl_trainer.train(
            models,
            language_models,
            crits,
            optimizers,
            train_loader=loader.train_iter,
            valid_loader=loader.valid_iter,
            vocabs=[loader.src.vocab, loader.tgt.vocab],
            n_epochs=config.n_epochs + config.dsl_n_epochs,
            lr_schedulers=None,
        )
    else:
        loader = DataLoader(config.train,
                            config.valid, (config.lang[:2], config.lang[-2:]),
                            batch_size=config.batch_size,
                            device=config.gpu_id,
                            max_length=config.max_length,
                            dsl=config.dsl)

        from simple_nmt.trainer import MaximumLikelihoodEstimationTrainer as MLETrainer
        # Encoder's embedding layer input size
        input_size = len(loader.src.vocab)
        # Decoder's embedding layer input size and Generator's softmax layer output size
        output_size = len(loader.tgt.vocab)
        # Declare the model
        if config.use_transformer:
            model = Transformer(
                input_size,
                config.hidden_size,
                output_size,
                n_splits=config.n_splits,
                n_enc_blocks=config.n_layers,
                n_dec_blocks=config.n_layers,
                dropout_p=config.dropout,
            )
        else:
            model = Seq2Seq(
                input_size,
                config.word_vec_size,  # Word embedding vector size
                config.hidden_size,  # LSTM's hidden vector size
                output_size,
                n_layers=config.n_layers,  # number of layers in LSTM
                dropout_p=config.dropout  # dropout-rate in LSTM
            )

        # Default weight for loss equals to 1, but we don't need to get loss for PAD token.
        # Thus, set a weight for PAD to zero.
        loss_weight = torch.ones(output_size)
        loss_weight[data_loader.PAD] = 0.
        # Instead of using Cross-Entropy loss,
        # we can use Negative Log-Likelihood(NLL) loss with log-probability.
        crit = nn.NLLLoss(weight=loss_weight, reduction='sum')

        print(model)
        print(crit)

        if model_weight is not None:
            model.load_state_dict(model_weight)

        # Pass models to GPU device if it is necessary.
        if config.gpu_id >= 0:
            model.cuda(config.gpu_id)
            crit.cuda(config.gpu_id)

        if config.use_adam:
            if config.use_transformer:
                optimizer = optim.Adam(model.parameters(),
                                       lr=config.lr,
                                       betas=(.9, .98))
            else:  # case of RNN-based seq2seq.
                optimizer = optim.Adam(model.parameters(), lr=config.lr)
        else:
            optimizer = optim.SGD(model.parameters(), lr=config.lr)

        if opt_weight is not None and config.use_adam:
            optimizer.load_state_dict(opt_weight)

        if config.lr_step > 0:
            lr_scheduler = optim.lr_scheduler.MultiStepLR(
                optimizer,
                milestones=[
                    i for i in range(max(0, config.lr_decay_start - 1),
                                     config.n_epochs,
                                     config.lr_step)
                ],
                gamma=config.lr_gamma)
        else:
            lr_scheduler = None

        print(optimizer)

        # Start training. This function may be equivalent to the 'fit' function in Keras.
        mle_trainer = MLETrainer(config)
        mle_trainer.train(
            model,
            crit,
            optimizer,
            train_loader=loader.train_iter,
            valid_loader=loader.valid_iter,
            src_vocab=loader.src.vocab,
            tgt_vocab=loader.tgt.vocab,
            n_epochs=config.n_epochs,
            lr_scheduler=lr_scheduler,
        )

        if config.rl_n_epochs > 0:
            optimizer = optim.SGD(model.parameters(), lr=config.rl_lr)

            from simple_nmt.rl_trainer import MinimumRiskTrainer
            mrt_trainer = MinimumRiskTrainer(config)

            mrt_trainer.train(
                model,
                crit,
                optimizer,
                train_loader=loader.train_iter,
                valid_loader=loader.valid_iter,
                src_vocab=loader.src.vocab,
                tgt_vocab=loader.tgt.vocab,
                n_epochs=config.rl_n_epochs,
            )
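
The MultiStepLR milestones in the else-branch above are just the epochs produced by range(max(0, config.lr_decay_start - 1), config.n_epochs, config.lr_step). A quick sketch with made-up numbers (not values from any config) of what that list looks like and what MultiStepLR does with it:

# Milestones produced by the expression above, with illustrative numbers.
lr_decay_start, n_epochs, lr_step, lr_gamma = 5, 20, 3, 0.5

milestones = [i for i in range(max(0, lr_decay_start - 1), n_epochs, lr_step)]
print(milestones)  # [4, 7, 10, 13, 16, 19]
# MultiStepLR multiplies the learning rate by lr_gamma each time the epoch
# counter passes one of these milestones.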
Example #7
    else:
        # Load vocabularies from the model.
        src_vocab = saved_data['src_vocab']
        tgt_vocab = saved_data['tgt_vocab']

    # Initialize the dataloader, but we don't need to read the training & test corpora.
    # All we need is to load the vocabularies from the previously trained model.
    loader = DataLoader()
    loader.load_vocab(src_vocab, tgt_vocab)
    input_size = len(loader.src.vocab)
    output_size = len(loader.tgt.vocab)

    # Declare sequence-to-sequence model.
    model = Seq2Seq(input_size,
                    train_config.word_vec_dim,
                    train_config.hidden_size,
                    output_size,
                    n_layers=train_config.n_layers,
                    dropout_p=train_config.dropout)

    if train_config.dsl:
        if not is_reverse:
            model.load_state_dict(saved_data['models'][0])
        else:
            model.load_state_dict(saved_data['models'][1])
    else:
        # Load weight parameters from the trained model.
        model.load_state_dict(saved_data['model'])
    model.eval()  # We need to turn on evaluation mode, which turns off all dropouts.
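
model.eval() only switches off train-time behaviour such as dropout; at inference time one would normally also run the forward pass under torch.no_grad() so that no autograd state is kept. A minimal, generic sketch (the small Sequential stands in for the Seq2Seq above):

# eval() disables dropout; no_grad() additionally skips autograd bookkeeping.
import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(4, 4), nn.Dropout(0.2))  # stands in for the Seq2Seq above
model.eval()
with torch.no_grad():
    out = model(torch.randn(1, 4))
print(out.requires_grad)  # False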