def main(config, model_weight=None, opt_weight=None): print("==== train.py main =====") def print_config(config): import pprint pp = pprint.PrettyPrinter(indent=4) pp.pprint(vars(config)) print_config(config) if config.dsl: print("==== train.py config.dsl => Dataloader 실행 =====") loader = DataLoader( config.train, config.valid, (config.lang[:12], config.lang[12:]), batch_size=config.lm_batch_size, device=config.gpu_id, max_length=config.max_length, dsl=config.dsl, ) from simple_nmt.lm_trainer import LanguageModelTrainer as LMTrainer print("==== train.py language_model 2개 실행 =====") language_models = [ LanguageModel( len(loader.tgt.vocab), config.word_vec_size, config.hidden_size, n_layers=config.n_layers, dropout_p=config.dropout, ), LanguageModel( len(loader.src.vocab), config.word_vec_size, config.hidden_size, n_layers=config.n_layers, dropout_p=config.dropout, ), ] print("==== train.py language_model 2개 실행 end =====") print("==== train.py models (seq2seq) 2개 실행 =====") models = [ Seq2Seq( len(loader.src.vocab), config.word_vec_size, config.hidden_size, len(loader.tgt.vocab), n_layers=config.n_layers, dropout_p=config.dropout, ), Seq2Seq( len(loader.tgt.vocab), config.word_vec_size, config.hidden_size, len(loader.src.vocab), n_layers=config.n_layers, dropout_p=config.dropout, ), ] print("==== train.py models (seq2seq) 2개 실행 end =====") loss_weights = [ torch.ones(len(loader.tgt.vocab)), torch.ones(len(loader.src.vocab)), ] loss_weights[0][data_loader.PAD] = .0 loss_weights[1][data_loader.PAD] = .0 crits = [ nn.NLLLoss(weight=loss_weights[0], reduction='none'), nn.NLLLoss(weight=loss_weights[1], reduction='none'), ] print(language_models) print(models) print(crits) if model_weight is not None: print("train.py - if model_weight is not None: 에 걸렸다!") for model, w in zip(models + language_models, model_weight): model.load_state_dict(w) print("model의 정체는..? 
", model) if config.gpu_id >= 0: for lm, seq2seq, crit in zip(language_models, models, crits): lm.cuda(config.gpu_id) seq2seq.cuda(config.gpu_id) crit.cuda(config.gpu_id) for lm, crit in zip(language_models, crits): print("==== train.py for문 lm 모델 하나씩 실행 =====") optimizer = optim.Adam(lm.parameters()) lm_trainer = LMTrainer(config) lm_trainer.train( lm, crit, optimizer, train_loader=loader.train_iter, valid_loader=loader.valid_iter, src_vocab=loader.src.vocab if lm.vocab_size == len(loader.src.vocab) else None, tgt_vocab=loader.tgt.vocab if lm.vocab_size == len(loader.tgt.vocab) else None, n_epochs=config.lm_n_epochs, ) print("==== train.py for문 lm 모델 하나씩 실행 =====") loader = DataLoader( config.train, config.valid, (config.lang[:12], config.lang[12:]), batch_size=config.batch_size, device=config.gpu_id, max_length=config.max_length, # dsl=config.dsl ) from simple_nmt.dual_trainer import DualSupervisedTrainer as DSLTrainer dsl_trainer = DSLTrainer(config) optimizers = [ optim.Adam(models[0].parameters()), optim.Adam(models[1].parameters()), ] if opt_weight is not None: for opt, w in zip(optimizers, opt_weight): opt.load_state_dict(w) dsl_trainer.train( models, language_models, crits, optimizers, train_loader=loader.train_iter, valid_loader=loader.valid_iter, vocabs=[loader.src.vocab, loader.tgt.vocab], n_epochs=config.n_epochs + config.dsl_n_epochs, lr_schedulers=None, ) else: loader = DataLoader( config.train, config.valid, (config.lang[:12], config.lang[12:]), batch_size=config.batch_size, device=config.gpu_id, max_length=config.max_length, dsl=config.dsl ) #from simple_nmt.trainer import MaximumLikelihoodEstimationTrainer as MLETrainer # Encoder's embedding layer input size input_size = len(loader.src.vocab) # Decoder's embedding layer input size and Generator's softmax layer output size output_size = len(loader.tgt.vocab) # Declare the model # if config.use_transformer: # model = Transformer( # input_size, # config.hidden_size, # output_size, # n_splits=config.n_splits, # n_enc_blocks=config.n_layers, # n_dec_blocks=config.n_layers, # dropout_p=config.dropout, # ) # else: model = Seq2Seq(input_size, config.word_vec_size, # Word embedding vector size config.hidden_size, # LSTM's hidden vector size output_size, n_layers=config.n_layers, # number of layers in LSTM dropout_p=config.dropout # dropout-rate in LSTM ) # Default weight for loss equals to 1, but we don't need to get loss for PAD token. # Thus, set a weight for PAD to zero. loss_weight = torch.ones(output_size) loss_weight[data_loader.PAD] = 0. # Instead of using Cross-Entropy loss, # we can use Negative Log-Likelihood(NLL) loss with log-probability. crit = nn.NLLLoss(weight=loss_weight, reduction='sum' ) print(model) print(crit) if model_weight is not None: model.load_state_dict(model_weight) # Pass models to GPU device if it is necessary. if config.gpu_id >= 0: model.cuda(config.gpu_id) crit.cuda(config.gpu_id) if config.use_adam: if config.use_transformer: # optimizer = optim.Adam(model.parameters(), lr=config.hidden_size**(-.5), betas=(.9, .98)) optimizer = optim.Adam(model.parameters(), lr=config.lr, betas=(.9, .98)) else: # case of rnn based seq2seq. 
                optimizer = optim.Adam(model.parameters(), lr=config.lr)
        else:
            optimizer = optim.SGD(model.parameters(), lr=config.lr)

        if opt_weight is not None and config.use_adam:
            optimizer.load_state_dict(opt_weight)

        if config.use_noam_decay:
            f = lambda step: min((step + 1)**(-.5), (step + 1) * config.lr_n_warmup_steps**(-1.5))
            lr_scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=f)
        else:
            if config.lr_step > 0:
                lr_scheduler = optim.lr_scheduler.MultiStepLR(
                    optimizer,
                    milestones=[i for i in range(max(0, config.lr_decay_start - 1),
                                                 (config.init_epoch - 1) + config.n_epochs,
                                                 config.lr_step)],
                    gamma=config.lr_gamma,
                )
                for _ in range(config.init_epoch - 1):
                    lr_scheduler.step()
            else:
                lr_scheduler = None

        print(optimizer)
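# --- Hedged sketch (not part of train.py): how the use_noam_decay LambdaLR above behaves.
# Assumes a dummy parameter, a base lr of 1e-3 and 4,000 warmup steps standing in for
# config.lr and config.lr_n_warmup_steps.
import torch
from torch import optim

dummy = torch.nn.Parameter(torch.zeros(1))
optimizer = optim.Adam([dummy], lr=1e-3)

n_warmup_steps = 4000
f = lambda step: min((step + 1)**(-.5), (step + 1) * n_warmup_steps**(-1.5))
lr_scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=f)

for step in range(5):
    optimizer.step()       # learning rate grows linearly during warmup,
    lr_scheduler.step()    # then decays proportionally to step**-0.5 afterwards
    print(step, lr_scheduler.get_last_lr())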
def main(config, model_weight=None, opt_weight=None):
    def print_config(config):
        pp = pprint.PrettyPrinter(indent=4)
        pp.pprint(vars(config))
    print_config(config)

    if config.dsl:
        loader = DataLoader(
            config.train,
            config.valid,
            (config.lang[:2], config.lang[-2:]),
            batch_size=config.lm_batch_size,
            device=config.gpu_id,
            max_length=config.max_length,
            dsl=config.dsl,
        )

        language_models = [
            LanguageModel(
                len(loader.tgt.vocab),
                config.word_vec_size,
                config.hidden_size,
                n_layers=config.n_layers,
                dropout_p=config.dropout,
            ),
            LanguageModel(
                len(loader.src.vocab),
                config.word_vec_size,
                config.hidden_size,
                n_layers=config.n_layers,
                dropout_p=config.dropout,
            ),
        ]

        models = [
            Seq2Seq(
                len(loader.src.vocab),
                config.word_vec_size,
                config.hidden_size,
                len(loader.tgt.vocab),
                n_layers=config.n_layers,
                dropout_p=config.dropout,
            ),
            Seq2Seq(
                len(loader.tgt.vocab),
                config.word_vec_size,
                config.hidden_size,
                len(loader.src.vocab),
                n_layers=config.n_layers,
                dropout_p=config.dropout,
            ),
        ]

        loss_weights = [
            torch.ones(len(loader.tgt.vocab)),
            torch.ones(len(loader.src.vocab)),
        ]
        loss_weights[0][data_loader.PAD] = .0
        loss_weights[1][data_loader.PAD] = .0
        crits = [
            nn.NLLLoss(weight=loss_weights[0], reduction='none'),
            nn.NLLLoss(weight=loss_weights[1], reduction='none'),
        ]

        print(language_models)
        print(models)
        print(crits)

        if model_weight is not None:
            for model, w in zip(models + language_models, model_weight):
                model.load_state_dict(w)

        if config.gpu_id >= 0:
            for lm, seq2seq, crit in zip(language_models, models, crits):
                lm.cuda(config.gpu_id)
                seq2seq.cuda(config.gpu_id)
                crit.cuda(config.gpu_id)

        for lm, crit in zip(language_models, crits):
            optimizer = optim.Adam(lm.parameters())
            lm_trainer = LMTrainer(config)

            lm_trainer.train(
                lm, crit, optimizer,
                train_loader=loader.train_iter,
                valid_loader=loader.valid_iter,
                src_vocab=loader.src.vocab if lm.vocab_size == len(loader.src.vocab) else None,
                tgt_vocab=loader.tgt.vocab if lm.vocab_size == len(loader.tgt.vocab) else None,
                n_epochs=config.lm_n_epochs,
            )

        loader = DataLoader(
            config.train,
            config.valid,
            (config.lang[:2], config.lang[-2:]),
            batch_size=config.batch_size,
            device=config.gpu_id,
            max_length=config.max_length,
            dsl=config.dsl,
        )

        dsl_trainer = DSLTrainer(config)

        optimizers = [
            optim.Adam(models[0].parameters()),
            optim.Adam(models[1].parameters()),
        ]

        if opt_weight is not None:
            for opt, w in zip(optimizers, opt_weight):
                opt.load_state_dict(w)

        dsl_trainer.train(
            models,
            language_models,
            crits,
            optimizers,
            train_loader=loader.train_iter,
            valid_loader=loader.valid_iter,
            vocabs=[loader.src.vocab, loader.tgt.vocab],
            n_epochs=config.n_epochs + config.dsl_n_epochs,
            lr_schedulers=None,
        )
    else:
        loader = DataLoader(
            config.train,
            config.valid,
            (config.lang[:2], config.lang[-2:]),
            batch_size=config.batch_size,
            device=-1,  # config.gpu_id,
            max_length=config.max_length,
            dsl=config.dsl,
        )

        # Encoder's embedding layer input size
        input_size = len(loader.src.vocab)
        # Decoder's embedding layer input size and Generator's softmax layer output size
        output_size = len(loader.tgt.vocab)
        # Declare the model
        if config.use_transformer:
            model = Transformer(
                input_size,
                config.hidden_size,
                output_size,
                n_splits=config.n_splits,
                n_enc_blocks=config.n_layers,
                n_dec_blocks=config.n_layers,
                dropout_p=config.dropout,
            )
        else:
            model = Seq2Seq(
                input_size,
                config.word_vec_size,       # Word embedding vector size
                config.hidden_size,         # LSTM's hidden vector size
                output_size,
                n_layers=config.n_layers,   # number of layers in LSTM
                dropout_p=config.dropout,   # dropout rate in LSTM
            )
        # Default weight for loss equals 1, but we don't need to get loss for PAD token.
        # Thus, set a weight for PAD to zero.
        loss_weight = torch.ones(output_size)
        loss_weight[data_loader.PAD] = 0.
        # Instead of using Cross-Entropy loss,
        # we can use Negative Log-Likelihood (NLL) loss with log-probability.
        crit = nn.NLLLoss(weight=loss_weight, reduction='sum')

        print(model)
        print(crit)

        if model_weight is not None:
            model.load_state_dict(model_weight)

        # Pass models to the GPU device if necessary.
        if config.gpu_id >= 0:
            model.cuda(config.gpu_id)
            crit.cuda(config.gpu_id)

        if config.use_adam:
            if config.use_transformer:
                no_decay = ['bias', 'LayerNorm.weight']
                optimizer_grouped_parameters = [
                    {
                        'params': [
                            p for n, p in model.named_parameters()
                            if not any(nd in n for nd in no_decay)
                        ],
                        'weight_decay': 0.01,
                    },
                    {
                        'params': [
                            p for n, p in model.named_parameters()
                            if any(nd in n for nd in no_decay)
                        ],
                        'weight_decay': 0.0,
                    },
                ]
                optimizer = optim.AdamW(
                    optimizer_grouped_parameters,
                    lr=config.lr,
                )
            else:  # case of RNN based seq2seq.
                optimizer = optim.Adam(model.parameters(), lr=config.lr)
        else:
            optimizer = optim.SGD(model.parameters(), lr=config.lr)

        if opt_weight is not None and config.use_adam:
            optimizer.load_state_dict(opt_weight)

        if config.use_noam_decay:
            n_total_iterations = len(loader.train_iter) * config.n_epochs / config.iteration_per_update
            n_warmup_steps = int(n_total_iterations * config.lr_warmup_ratio)

            lr_scheduler = get_linear_schedule_with_warmup(
                optimizer,
                n_warmup_steps,
                n_total_iterations,
            )
        else:
            if config.lr_step > 0:
                lr_scheduler = optim.lr_scheduler.MultiStepLR(
                    optimizer,
                    milestones=[i for i in range(max(0, config.lr_decay_start - 1),
                                                 (config.init_epoch - 1) + config.n_epochs,
                                                 config.lr_step)],
                    gamma=config.lr_gamma,
                )
                for _ in range(config.init_epoch - 1):
                    lr_scheduler.step()
            else:
                lr_scheduler = None

        print(optimizer)

        # Start training. This function may be equivalent to the 'fit' function in Keras.
        mle_trainer = SingleTrainer(MaximumLikelihoodEstimationEngine, config)
        mle_trainer.train(
            model,
            crit,
            optimizer,
            train_loader=loader.train_iter,
            valid_loader=loader.valid_iter,
            src_vocab=loader.src.vocab,
            tgt_vocab=loader.tgt.vocab,
            n_epochs=config.n_epochs,
            lr_scheduler=lr_scheduler,
        )

        if config.rl_n_epochs > 0:
            optimizer = optim.SGD(model.parameters(), lr=config.rl_lr)
            # optimizer = optim.Adam(model.parameters(), lr=config.rl_lr)
            mrt_trainer = SingleTrainer(MinimumRiskTrainingEngine, config)

            mrt_trainer.train(
                model,
                None,  # We don't need a criterion for MRT.
                optimizer,
                train_loader=loader.train_iter,
                valid_loader=loader.valid_iter,
                src_vocab=loader.src.vocab,
                tgt_vocab=loader.tgt.vocab,
                n_epochs=config.rl_n_epochs,
            )
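# --- Hedged sketch (toy module, not the repository's Transformer): the weight-decay
# grouping used in the use_transformer branch above. Parameters whose names contain
# 'bias' or 'LayerNorm.weight' go into the zero-decay group.
import torch.nn as nn
from torch import optim

class ToyBlock(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(8, 8)
        self.LayerNorm = nn.LayerNorm(8)   # attribute named so the filter below matches

toy = ToyBlock()
no_decay = ['bias', 'LayerNorm.weight']
grouped = [
    {'params': [p for n, p in toy.named_parameters()
                if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in toy.named_parameters()
                if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
]
optimizer = optim.AdamW(grouped, lr=1e-3)

for group in optimizer.param_groups:
    print(group['weight_decay'], len(group['params']))   # 0.01 -> 1 tensor, 0.0 -> 3 tensors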
def main(config):
    def print_config(config):
        pp = pprint.PrettyPrinter(indent=4)
        pp.pprint(vars(config))
    print_config(config)

    loader = DataLoader(
        config.train,
        config.valid,
        (config.lang[:2], config.lang[-2:]),
        batch_size=config.batch_size,
        device=-1,
        max_length=config.max_length,
        dsl=True,
    )

    src_vocab_size = len(loader.src.vocab)
    tgt_vocab_size = len(loader.tgt.vocab)

    models = get_models(src_vocab_size, tgt_vocab_size, config)
    crits = get_crits(src_vocab_size, tgt_vocab_size, pad_index=data_loader.PAD)

    if config.gpu_id >= 0:
        for model, crit in zip(models, crits):
            model.cuda(config.gpu_id)
            crit.cuda(config.gpu_id)

    if config.verbose >= 2:
        print(models)

    for model, crit in zip(models, crits):
        optimizer = optim.Adam(model.parameters())
        lm_trainer = LMTrainer(config)

        model = lm_trainer.train(
            model, crit, optimizer,
            train_loader=loader.train_iter,
            valid_loader=loader.valid_iter,
            src_vocab=loader.src.vocab if model.vocab_size == src_vocab_size else None,
            tgt_vocab=loader.tgt.vocab if model.vocab_size == tgt_vocab_size else None,
            n_epochs=config.n_epochs,
        )

    torch.save(
        {
            'model': [
                models[0].state_dict(),
                models[1].state_dict(),
            ],
            'config': config,
            'src_vocab': loader.src.vocab,
            'tgt_vocab': loader.tgt.vocab,
        },
        config.model_fn,
    )
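# --- Hedged sketch: reading back the checkpoint written by the torch.save() call above.
# The key names ('model', 'config', 'src_vocab', 'tgt_vocab') come from that call; the
# file name 'lm.pth' is a placeholder for config.model_fn.
import torch

saved = torch.load('lm.pth', map_location='cpu')
lm_state_dicts = saved['model']                    # one state_dict per language model
src_vocab, tgt_vocab = saved['src_vocab'], saved['tgt_vocab']
print(saved['config'])
print(len(src_vocab), len(tgt_vocab), len(lm_state_dicts))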
    for model, language_model, crit in zip(models, language_models, crits):
        model.cuda(config.gpu_id)
        language_model.cuda(config.gpu_id)
        crit.cuda(config.gpu_id)

    if saved_data is None or config.dsl_retrain_lm is True or config.dsl_continue_train_lm is True:
        # In case of resuming the language model training
        if saved_data is not None and config.dsl_continue_train_lm is True:
            for lm, lm_weight in zip(language_models, saved_data['lms']):
                lm.load_state_dict(lm_weight)

        # Start to train the language models for --lm_n_epochs
        for language_model, crit, is_src in zip(language_models, crits, [False, True]):
            lm_trainer = LanguageModelTrainer(language_model, crit, config=config, is_src=is_src)
            lm_trainer.train(loader.train_iter, loader.valid_iter, verbose=config.verbose)

            language_model.load_state_dict(lm_trainer.best['model'])  # Pick the best one.
            print('A language model from best epoch %d is loaded.' % lm_trainer.best['epoch'])

        if saved_data is not None:
            saved_data['lms'] = [lm.state_dict() for lm in language_models]
    else:
        print('Skip the language model training.')
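# --- Hedged sketch (toy stand-ins, not LanguageModelTrainer itself): the "pick the best
# one" pattern the fragment above relies on, i.e. keeping a deep copy of the state_dict
# from the epoch with the lowest validation loss.
import random
from copy import deepcopy
import torch.nn as nn

model = nn.Linear(4, 4)                       # stands in for a language model
best = {'model': None, 'epoch': 0, 'loss': float('inf')}

for epoch in range(1, 6):
    valid_loss = random.random()              # stands in for a real validation pass
    if valid_loss < best['loss']:
        best = {'model': deepcopy(model.state_dict()), 'epoch': epoch, 'loss': valid_loss}

model.load_state_dict(best['model'])
print('A language model from best epoch %d is loaded.' % best['epoch'])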
def main(config, model_weight=None, opt_weight=None):
    def print_config(config):
        import pprint
        pp = pprint.PrettyPrinter(indent=4)
        pp.pprint(vars(config))
    print_config(config)

    if config.dsl:
        loader = DataLoader(
            config.train,
            config.valid,
            (config.lang[:2], config.lang[-2:]),
            batch_size=config.lm_batch_size,
            device=config.gpu_id,
            max_length=config.max_length,
            dsl=config.dsl,
        )

        from simple_nmt.lm_trainer import LanguageModelTrainer as LMTrainer

        language_models = [
            LanguageModel(
                len(loader.tgt.vocab),
                config.word_vec_size,
                config.hidden_size,
                n_layers=config.n_layers,
                dropout_p=config.dropout,
            ),
            LanguageModel(
                len(loader.src.vocab),
                config.word_vec_size,
                config.hidden_size,
                n_layers=config.n_layers,
                dropout_p=config.dropout,
            ),
        ]

        models = [
            Seq2Seq(
                len(loader.src.vocab),
                config.word_vec_size,
                config.hidden_size,
                len(loader.tgt.vocab),
                n_layers=config.n_layers,
                dropout_p=config.dropout,
            ),
            Seq2Seq(
                len(loader.tgt.vocab),
                config.word_vec_size,
                config.hidden_size,
                len(loader.src.vocab),
                n_layers=config.n_layers,
                dropout_p=config.dropout,
            ),
        ]

        loss_weights = [
            torch.ones(len(loader.tgt.vocab)),
            torch.ones(len(loader.src.vocab)),
        ]
        loss_weights[0][data_loader.PAD] = .0
        loss_weights[1][data_loader.PAD] = .0
        crits = [
            nn.NLLLoss(weight=loss_weights[0], reduction='none'),
            nn.NLLLoss(weight=loss_weights[1], reduction='none'),
        ]

        print(language_models)
        print(models)
        print(crits)

        if model_weight is not None:
            for model, w in zip(models + language_models, model_weight):
                model.load_state_dict(w)

        if config.gpu_id >= 0:
            for lm, seq2seq, crit in zip(language_models, models, crits):
                lm.cuda(config.gpu_id)
                seq2seq.cuda(config.gpu_id)
                crit.cuda(config.gpu_id)

        for lm, crit in zip(language_models, crits):
            optimizer = optim.Adam(lm.parameters())
            lm_trainer = LMTrainer(config)

            lm_trainer.train(
                lm, crit, optimizer,
                train_loader=loader.train_iter,
                valid_loader=loader.valid_iter,
                src_vocab=loader.src.vocab if lm.vocab_size == len(loader.src.vocab) else None,
                tgt_vocab=loader.tgt.vocab if lm.vocab_size == len(loader.tgt.vocab) else None,
                n_epochs=config.lm_n_epochs,
            )

        loader = DataLoader(
            config.train,
            config.valid,
            (config.lang[:2], config.lang[-2:]),
            batch_size=config.batch_size,
            device=config.gpu_id,
            max_length=config.max_length,
            dsl=config.dsl,
        )

        from simple_nmt.dual_trainer import DualSupervisedTrainer as DSLTrainer

        dsl_trainer = DSLTrainer(config)

        optimizers = [
            optim.Adam(models[0].parameters()),
            optim.Adam(models[1].parameters()),
        ]

        if opt_weight is not None:
            for opt, w in zip(optimizers, opt_weight):
                opt.load_state_dict(w)

        dsl_trainer.train(
            models,
            language_models,
            crits,
            optimizers,
            train_loader=loader.train_iter,
            valid_loader=loader.valid_iter,
            vocabs=[loader.src.vocab, loader.tgt.vocab],
            n_epochs=config.n_epochs + config.dsl_n_epochs,
            lr_schedulers=None,
        )
    else:
        loader = DataLoader(
            config.train,
            config.valid,
            (config.lang[:2], config.lang[-2:]),
            batch_size=config.batch_size,
            device=config.gpu_id,
            max_length=config.max_length,
            dsl=config.dsl,
        )

        from simple_nmt.trainer import MaximumLikelihoodEstimationTrainer as MLETrainer

        # Encoder's embedding layer input size
        input_size = len(loader.src.vocab)
        # Decoder's embedding layer input size and Generator's softmax layer output size
        output_size = len(loader.tgt.vocab)
        # Declare the model
        if config.use_transformer:
            model = Transformer(
                input_size,
                config.hidden_size,
                output_size,
                n_splits=config.n_splits,
                n_enc_blocks=config.n_layers,
                n_dec_blocks=config.n_layers,
                dropout_p=config.dropout,
            )
        else:
            model = Seq2Seq(
                input_size,
                config.word_vec_size,       # Word embedding vector size
                config.hidden_size,         # LSTM's hidden vector size
                output_size,
                n_layers=config.n_layers,   # number of layers in LSTM
                dropout_p=config.dropout,   # dropout rate in LSTM
            )

        # Default weight for loss equals 1, but we don't need to get loss for PAD token.
        # Thus, set a weight for PAD to zero.
        loss_weight = torch.ones(output_size)
        loss_weight[data_loader.PAD] = 0.
        # Instead of using Cross-Entropy loss,
        # we can use Negative Log-Likelihood (NLL) loss with log-probability.
        crit = nn.NLLLoss(weight=loss_weight, reduction='sum')

        print(model)
        print(crit)

        if model_weight is not None:
            model.load_state_dict(model_weight)

        # Pass models to the GPU device if necessary.
        if config.gpu_id >= 0:
            model.cuda(config.gpu_id)
            crit.cuda(config.gpu_id)

        if config.use_adam:
            if config.use_transformer:
                optimizer = optim.Adam(model.parameters(), lr=config.lr, betas=(.9, .98))
            else:  # case of RNN based seq2seq.
                optimizer = optim.Adam(model.parameters(), lr=config.lr)
        else:
            optimizer = optim.SGD(model.parameters(), lr=config.lr)

        if opt_weight is not None and config.use_adam:
            optimizer.load_state_dict(opt_weight)

        if config.lr_step > 0:
            lr_scheduler = optim.lr_scheduler.MultiStepLR(
                optimizer,
                milestones=[i for i in range(max(0, config.lr_decay_start - 1),
                                             config.n_epochs,
                                             config.lr_step)],
                gamma=config.lr_gamma,
            )
        else:
            lr_scheduler = None

        print(optimizer)

        # Start training. This function may be equivalent to the 'fit' function in Keras.
        mle_trainer = MLETrainer(config)
        mle_trainer.train(
            model,
            crit,
            optimizer,
            train_loader=loader.train_iter,
            valid_loader=loader.valid_iter,
            src_vocab=loader.src.vocab,
            tgt_vocab=loader.tgt.vocab,
            n_epochs=config.n_epochs,
            lr_scheduler=lr_scheduler,
        )

        if config.rl_n_epochs > 0:
            optimizer = optim.SGD(model.parameters(), lr=config.rl_lr)

            from simple_nmt.rl_trainer import MinimumRiskTrainer

            mrt_trainer = MinimumRiskTrainer(config)
            mrt_trainer.train(
                model,
                crit,
                optimizer,
                train_loader=loader.train_iter,
                valid_loader=loader.valid_iter,
                src_vocab=loader.src.vocab,
                tgt_vocab=loader.tgt.vocab,
                n_epochs=config.rl_n_epochs,
            )
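# --- Hedged sketch (toy sizes; PAD assumed to be index 1 as in data_loader): zeroing the
# PAD weight makes padded positions contribute nothing to the summed NLL loss.
import torch
import torch.nn as nn

PAD, vocab_size = 1, 5
loss_weight = torch.ones(vocab_size)
loss_weight[PAD] = 0.
crit = nn.NLLLoss(weight=loss_weight, reduction='sum')

log_probs = torch.log_softmax(torch.randn(4, vocab_size), dim=-1)
target = torch.tensor([2, PAD, 3, PAD])       # two real tokens, two padding tokens

print(crit(log_probs, target).item())                          # PAD rows are ignored
print(nn.NLLLoss(reduction='sum')(log_probs, target).item())   # all rows counted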
def main(config, model_weight=None, opt_weight=None):
    def print_config(config):
        pp = pprint.PrettyPrinter(indent=4)
        pp.pprint(vars(config))
    print_config(config)

    loader = DataLoader(
        config.train,
        config.valid,
        (config.lang[:2], config.lang[-2:]),
        batch_size=config.lm_batch_size,
        device=-1,
        max_length=config.max_length,
        dsl=True,
    )

    src_vocab_size = len(loader.src.vocab)
    tgt_vocab_size = len(loader.tgt.vocab)

    language_models, models = get_models(src_vocab_size, tgt_vocab_size, config)
    crits = get_crits(src_vocab_size, tgt_vocab_size, pad_index=data_loader.PAD)

    if model_weight is not None:
        for model, w in zip(models + language_models, model_weight):
            model.load_state_dict(w)

    if config.gpu_id >= 0:
        for lm, seq2seq, crit in zip(language_models, models, crits):
            lm.cuda(config.gpu_id)
            seq2seq.cuda(config.gpu_id)
            crit.cuda(config.gpu_id)

    for lm, crit in zip(language_models, crits):
        optimizer = optim.Adam(lm.parameters())
        lm_trainer = LMTrainer(config)

        lm_trainer.train(
            lm, crit, optimizer,
            train_loader=loader.train_iter,
            valid_loader=loader.valid_iter,
            src_vocab=loader.src.vocab if lm.vocab_size == src_vocab_size else None,
            tgt_vocab=loader.tgt.vocab if lm.vocab_size == tgt_vocab_size else None,
            n_epochs=config.lm_n_epochs,
        )

    loader = DataLoader(
        config.train,
        config.valid,
        (config.lang[:2], config.lang[-2:]),
        batch_size=config.batch_size,
        device=-1,
        max_length=config.max_length,
        dsl=True,
    )

    dsl_trainer = DSLTrainer(config)

    if config.use_transformer:
        optimizers = [
            custom_optim.RAdam(models[0].parameters(), lr=1e-3),
            custom_optim.RAdam(models[1].parameters(), lr=1e-3),
        ]
    else:
        optimizers = [
            optim.Adam(models[0].parameters()),
            optim.Adam(models[1].parameters()),
        ]

    if config.verbose >= 2:
        print(language_models)
        print(models)
        print(crits)
        print(optimizers)

    if opt_weight is not None:
        for opt, w in zip(optimizers, opt_weight):
            opt.load_state_dict(w)

    dsl_trainer.train(
        models,
        language_models,
        crits,
        optimizers,
        train_loader=loader.train_iter,
        valid_loader=loader.valid_iter,
        vocabs=[loader.src.vocab, loader.tgt.vocab],
        n_epochs=config.n_epochs,
        lr_schedulers=None,
    )
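# --- Hedged sketch: how model_weight / opt_weight might be pulled from a checkpoint to
# resume dual supervised training. The checkpoint layout here is hypothetical; only the
# expected ordering (models + language_models, then the two optimizers) follows from the
# zip() calls in main() above.
import torch

def load_for_resume(checkpoint_fn):
    saved = torch.load(checkpoint_fn, map_location='cpu')
    model_weight = saved['model']   # hypothetical key: [x2y, y2x, lm_y, lm_x] state_dicts
    opt_weight = saved.get('opt')   # hypothetical key: optimizer state_dicts, or None
    return saved['config'], model_weight, opt_weight

# config, model_weight, opt_weight = load_for_resume('dual.pth')
# main(config, model_weight=model_weight, opt_weight=opt_weight)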