def main(config, model_weight=None, opt_weight=None):
    print("==== train.py main =====")

    def print_config(config):
        import pprint
        pp = pprint.PrettyPrinter(indent=4)
        pp.pprint(vars(config))
    print_config(config)

    if config.dsl:
        print("==== train.py config.dsl => running DataLoader =====")
        loader = DataLoader(
            config.train,
            config.valid,
            (config.lang[:12], config.lang[12:]),
            batch_size=config.lm_batch_size,
            device=config.gpu_id,
            max_length=config.max_length,
            dsl=config.dsl,
        )

        from simple_nmt.lm_trainer import LanguageModelTrainer as LMTrainer

        print("==== train.py building 2 language models =====")
        language_models = [
            LanguageModel(
                len(loader.tgt.vocab),
                config.word_vec_size,
                config.hidden_size,
                n_layers=config.n_layers,
                dropout_p=config.dropout,
            ),
            LanguageModel(
                len(loader.src.vocab),
                config.word_vec_size,
                config.hidden_size,
                n_layers=config.n_layers,
                dropout_p=config.dropout,
            ),
        ]
        print("==== train.py building 2 language models end =====")

        print("==== train.py building 2 seq2seq models =====")
        models = [
            Seq2Seq(
                len(loader.src.vocab),
                config.word_vec_size,
                config.hidden_size,
                len(loader.tgt.vocab),
                n_layers=config.n_layers,
                dropout_p=config.dropout,
            ),
            Seq2Seq(
                len(loader.tgt.vocab),
                config.word_vec_size,
                config.hidden_size,
                len(loader.src.vocab),
                n_layers=config.n_layers,
                dropout_p=config.dropout,
            ),
        ]
        print("==== train.py building 2 seq2seq models end =====")

        loss_weights = [
            torch.ones(len(loader.tgt.vocab)),
            torch.ones(len(loader.src.vocab)),
        ]
        loss_weights[0][data_loader.PAD] = .0
        loss_weights[1][data_loader.PAD] = .0
        crits = [
            nn.NLLLoss(weight=loss_weights[0], reduction='none'),
            nn.NLLLoss(weight=loss_weights[1], reduction='none'),
        ]

        print(language_models)
        print(models)
        print(crits)

        if model_weight is not None:
            print("train.py - hit the 'if model_weight is not None' branch!")
            for model, w in zip(models + language_models, model_weight):
                model.load_state_dict(w)
                print("what exactly is this model?\n", model)

        if config.gpu_id >= 0:
            for lm, seq2seq, crit in zip(language_models, models, crits):
                lm.cuda(config.gpu_id)
                seq2seq.cuda(config.gpu_id)
                crit.cuda(config.gpu_id)

        for lm, crit in zip(language_models, crits):
            print("==== train.py for loop: training one LM at a time =====")
            optimizer = optim.Adam(lm.parameters())
            lm_trainer = LMTrainer(config)

            lm_trainer.train(
                lm, crit, optimizer,
                train_loader=loader.train_iter,
                valid_loader=loader.valid_iter,
                src_vocab=loader.src.vocab if lm.vocab_size == len(loader.src.vocab) else None,
                tgt_vocab=loader.tgt.vocab if lm.vocab_size == len(loader.tgt.vocab) else None,
                n_epochs=config.lm_n_epochs,
            )
            print("==== train.py for loop: training one LM at a time end =====")

        loader = DataLoader(
            config.train,
            config.valid,
            (config.lang[:12], config.lang[12:]),
            batch_size=config.batch_size,
            device=config.gpu_id,
            max_length=config.max_length,
            # dsl=config.dsl
        )

        from simple_nmt.dual_trainer import DualSupervisedTrainer as DSLTrainer

        dsl_trainer = DSLTrainer(config)

        optimizers = [
            optim.Adam(models[0].parameters()),
            optim.Adam(models[1].parameters()),
        ]

        if opt_weight is not None:
            for opt, w in zip(optimizers, opt_weight):
                opt.load_state_dict(w)

        dsl_trainer.train(
            models,
            language_models,
            crits,
            optimizers,
            train_loader=loader.train_iter,
            valid_loader=loader.valid_iter,
            vocabs=[loader.src.vocab, loader.tgt.vocab],
            n_epochs=config.n_epochs + config.dsl_n_epochs,
            lr_schedulers=None,
        )
    else:
        loader = DataLoader(
            config.train,
            config.valid,
            (config.lang[:12], config.lang[12:]),
            batch_size=config.batch_size,
            device=config.gpu_id,
            max_length=config.max_length,
            dsl=config.dsl
        )

        # from simple_nmt.trainer import MaximumLikelihoodEstimationTrainer as MLETrainer

        # Encoder's embedding layer input size
        input_size = len(loader.src.vocab)
        # Decoder's embedding layer input size and Generator's softmax layer output size
        output_size = len(loader.tgt.vocab)

        # Declare the model
        # if config.use_transformer:
        #     model = Transformer(
        #         input_size,
        #         config.hidden_size,
        #         output_size,
        #         n_splits=config.n_splits,
        #         n_enc_blocks=config.n_layers,
        #         n_dec_blocks=config.n_layers,
        #         dropout_p=config.dropout,
        #     )
        # else:
        model = Seq2Seq(
            input_size,
            config.word_vec_size,       # Word embedding vector size
            config.hidden_size,         # LSTM's hidden vector size
            output_size,
            n_layers=config.n_layers,   # number of layers in LSTM
            dropout_p=config.dropout    # dropout rate in LSTM
        )

        # Default weight for the loss equals 1, but we don't need a loss for the PAD token.
        # Thus, set the weight for PAD to zero.
        loss_weight = torch.ones(output_size)
        loss_weight[data_loader.PAD] = 0.
        # Instead of using Cross-Entropy loss,
        # we can use Negative Log-Likelihood (NLL) loss with log-probabilities.
        crit = nn.NLLLoss(weight=loss_weight, reduction='sum')

        print(model)
        print(crit)

        if model_weight is not None:
            model.load_state_dict(model_weight)

        # Pass models to the GPU device if necessary.
        if config.gpu_id >= 0:
            model.cuda(config.gpu_id)
            crit.cuda(config.gpu_id)

        if config.use_adam:
            if config.use_transformer:
                # optimizer = optim.Adam(model.parameters(), lr=config.hidden_size**(-.5), betas=(.9, .98))
                optimizer = optim.Adam(model.parameters(), lr=config.lr, betas=(.9, .98))
            else:  # case of RNN-based seq2seq.
                optimizer = optim.Adam(model.parameters(), lr=config.lr)
        else:
            optimizer = optim.SGD(model.parameters(), lr=config.lr)

        if opt_weight is not None and config.use_adam:
            optimizer.load_state_dict(opt_weight)

        if config.use_noam_decay:
            f = lambda step: min((step + 1)**(-.5),
                                 (step + 1) * config.lr_n_warmup_steps**(-1.5))
            lr_scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=f)
        else:
            if config.lr_step > 0:
                lr_scheduler = optim.lr_scheduler.MultiStepLR(
                    optimizer,
                    milestones=[i for i in range(max(0, config.lr_decay_start - 1),
                                                 (config.init_epoch - 1) + config.n_epochs,
                                                 config.lr_step)],
                    gamma=config.lr_gamma
                )

                for _ in range(config.init_epoch - 1):
                    lr_scheduler.step()
            else:
                lr_scheduler = None

        print(optimizer)
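
# A small, self-contained illustration (the warmup value below is assumed for the example, not
# taken from the config) of how the Noam-style lambda passed to LambdaLR above scales the base
# learning rate: it grows roughly linearly for the first n_warmup_steps scheduler steps and then
# decays proportionally to step ** -0.5.
n_warmup_steps = 4000  # assumed value, for illustration only

f = lambda step: min((step + 1)**(-.5), (step + 1) * n_warmup_steps**(-1.5))

for step in [0, 999, 3999, 15999]:
    print(step, round(f(step), 6))
# The multiplier peaks at step + 1 == n_warmup_steps (4000**-.5 ≈ 0.0158) and shrinks afterwards,
# so the learning rate actually used is base_lr * f(step).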
if saved_data:
    loader.load_vocab(saved_data['src_vocab'], saved_data['tgt_vocab'])
elif config.tgt_vocab_path:
    loader.load_target_vocab(pickle.load(open(config.tgt_vocab_path, 'rb')))
print(loader.isSrcPremise.vocab.itos, file=sys.stderr)

# Encoder's embedding layer input size
input_size = len(loader.src.vocab)
# Decoder's embedding layer input size and Generator's softmax layer output size
output_size = len(loader.tgt.vocab)

# Declare the model
model = Seq2Seq(
    input_size,
    config.word_vec_dim,        # Word embedding vector size
    config.hidden_size,         # LSTM's hidden vector size
    output_size,
    n_layers=config.n_layers,   # number of layers in LSTM
    dropout_p=config.dropout    # dropout rate in LSTM
)

# Default weight for the loss equals 1, but we don't need a loss for the PAD token.
# Thus, set the weight for PAD to zero.
loss_weight = torch.ones(output_size)
loss_weight[data_loader.PAD] = 0.
# Instead of using Cross-Entropy loss, we can use Negative Log-Likelihood (NLL) loss with log-probabilities.
criterion = nn.NLLLoss(weight=loss_weight, reduction='sum')  # reduction='sum' replaces the deprecated size_average=False

if not config.pretrain:
    assert config.reward_mode in ['nli', 'bleu', 'combined'], \
        "the reward mode should be one of ['nli', 'bleu', 'combined']"
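
# A minimal sketch (the PAD index and sizes here are made up for illustration) of why the PAD
# weight is zeroed above: with a per-class weight of 0, padded positions contribute nothing to
# the summed NLL loss, so only real tokens are trained on.
import torch
import torch.nn as nn

PAD = 1                       # assumed PAD index, for this example only
vocab_size = 5
loss_weight = torch.ones(vocab_size)
loss_weight[PAD] = 0.

crit = nn.NLLLoss(weight=loss_weight, reduction='sum')

log_probs = torch.log_softmax(torch.randn(4, vocab_size), dim=-1)  # 4 time-steps of log-probabilities
target = torch.tensor([2, 3, PAD, PAD])                            # last two positions are padding

loss_all = crit(log_probs, target)            # PAD positions add exactly 0 to the sum
loss_real = crit(log_probs[:2], target[:2])   # loss over the two real tokens only
assert torch.isclose(loss_all, loss_real)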
# if train_config.use_transformer:
#     model = Transformer(
#         input_size,
#         train_config.hidden_size,
#         output_size,
#         n_splits=train_config.n_splits,
#         n_enc_blocks=train_config.n_layers,
#         n_dec_blocks=train_config.n_layers,
#         dropout_p=train_config.dropout,
#     )
model = Seq2Seq(
    input_size,
    train_config.word_vec_size,
    train_config.hidden_size,
    output_size,
    n_layers=train_config.n_layers,
    dropout_p=train_config.dropout,
    # search=SingleBeamSearchSpace()
)
print("===== 18 model : =====", model)

if train_config.dsl:
    if not is_reverse:
        print("===== 19 if not is_reverse =====")
        model.load_state_dict(saved_data['model'][0])
    else:
        print("===== 20 if not is_reverse ELSE =====")
        model.load_state_dict(saved_data['model'][1])
else:
    print("===== 21 train_config.dsl ELSE =====")
import torch
import torch.nn as nn
import torch.optim as optim

from simple_nmt.encoder import Encoder
from simple_nmt.decoder import Decoder
from simple_nmt.seq2seq import Seq2Seq
from data_loader import DataLoader
# from train import
from hyperparams import Hyperparams


if __name__ == "__main__":
    hparams = Hyperparams()

    cuda = hparams.use_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if cuda else 'cpu')

    enc, enc_hidden = Encoder()
    dec, dec_hidden = Decoder()
    model = Seq2Seq(enc, dec)
    model.flatten_parameters()  # flatten the RNN weights into one contiguous block
    model = nn.DataParallel(model).to(device)

    optimizer = optim.Adam(model.module.parameters(), lr=hparams.lr)

    # 'criterion' is the conventional variable name for the loss function.
    # 'reduction' controls how per-token losses are aggregated: the default is 'mean',
    # but 'sum' is said to be faster, while 'mean' is the more standard choice.
    # 'ignore_index' excludes that index from the loss; PADDED positions need no loss.
    criterion = nn.CrossEntropyLoss(reduction='sum', ignore_index=PAD_token).to(device)

    #########
    # Data loading & vectorization
    # (load the input (Korean sentences) and target (English sentences) and turn them into
    #  vectors; still to be implemented (sentencepiece, embedding))
    #########
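
# The block above ends with a placeholder for the data loading & vectorization step. As a rough
# sketch only (the file paths, SentencePiece model names, and the helper below are assumptions,
# not part of this repository), subword-encoding a parallel corpus could look roughly like this:
import sentencepiece as spm

# Hypothetical, pre-trained SentencePiece models for each side of the corpus.
sp_ko = spm.SentencePieceProcessor(model_file='spm_ko.model')
sp_en = spm.SentencePieceProcessor(model_file='spm_en.model')

def load_pairs(src_path, tgt_path):
    # Read the parallel corpus line by line and encode each sentence into subword ids,
    # which can then be padded, batched, and fed to the embedding layers.
    with open(src_path, encoding='utf-8') as fs, open(tgt_path, encoding='utf-8') as ft:
        for src_line, tgt_line in zip(fs, ft):
            yield (sp_ko.encode(src_line.strip(), out_type=int),
                   sp_en.encode(tgt_line.strip(), out_type=int))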
_print_config(config)

# Load training and validation data set.
loader = DataLoader(config.train,
                    config.valid,
                    (config.lang[:2], config.lang[-2:]),
                    batch_size=config.batch_size,
                    device=config.gpu_id,
                    max_length=config.max_length,
                    dsl=config.dsl
                    )

if config.dsl:
    # In case the dual supervised training mode is turned on.
    # Because we must train both models at the same time, we need to declare both of them.
    models = [
        Seq2Seq(len(loader.src.vocab),
                config.word_vec_dim,
                config.hidden_size,
                len(loader.tgt.vocab),
                n_layers=config.n_layers,
                dropout_p=config.dropout),
        Seq2Seq(len(loader.tgt.vocab),
                config.word_vec_dim,
                config.hidden_size,
                len(loader.src.vocab),
                n_layers=config.n_layers,
                dropout_p=config.dropout)
    ]

    # Because we also need to get P(src) and P(tgt), we need language models consisting of LSTMs.
    language_models = [
        LanguageModel(len(loader.tgt.vocab),
                      config.word_vec_dim,
                      config.hidden_size,
                      n_layers=config.n_layers,
def main(config, model_weight=None, opt_weight=None):
    def print_config(config):
        import pprint
        pp = pprint.PrettyPrinter(indent=4)
        pp.pprint(vars(config))
    print_config(config)

    if config.dsl:
        loader = DataLoader(
            config.train,
            config.valid,
            (config.lang[:2], config.lang[-2:]),
            batch_size=config.lm_batch_size,
            device=config.gpu_id,
            max_length=config.max_length,
            dsl=config.dsl,
        )

        from simple_nmt.lm_trainer import LanguageModelTrainer as LMTrainer

        language_models = [
            LanguageModel(
                len(loader.tgt.vocab),
                config.word_vec_size,
                config.hidden_size,
                n_layers=config.n_layers,
                dropout_p=config.dropout,
            ),
            LanguageModel(
                len(loader.src.vocab),
                config.word_vec_size,
                config.hidden_size,
                n_layers=config.n_layers,
                dropout_p=config.dropout,
            ),
        ]

        models = [
            Seq2Seq(
                len(loader.src.vocab),
                config.word_vec_size,
                config.hidden_size,
                len(loader.tgt.vocab),
                n_layers=config.n_layers,
                dropout_p=config.dropout,
            ),
            Seq2Seq(
                len(loader.tgt.vocab),
                config.word_vec_size,
                config.hidden_size,
                len(loader.src.vocab),
                n_layers=config.n_layers,
                dropout_p=config.dropout,
            ),
        ]

        loss_weights = [
            torch.ones(len(loader.tgt.vocab)),
            torch.ones(len(loader.src.vocab)),
        ]
        loss_weights[0][data_loader.PAD] = .0
        loss_weights[1][data_loader.PAD] = .0
        crits = [
            nn.NLLLoss(weight=loss_weights[0], reduction='none'),
            nn.NLLLoss(weight=loss_weights[1], reduction='none'),
        ]

        print(language_models)
        print(models)
        print(crits)

        if model_weight is not None:
            for model, w in zip(models + language_models, model_weight):
                model.load_state_dict(w)

        if config.gpu_id >= 0:
            for lm, seq2seq, crit in zip(language_models, models, crits):
                lm.cuda(config.gpu_id)
                seq2seq.cuda(config.gpu_id)
                crit.cuda(config.gpu_id)

        for lm, crit in zip(language_models, crits):
            optimizer = optim.Adam(lm.parameters())
            lm_trainer = LMTrainer(config)

            lm_trainer.train(
                lm, crit, optimizer,
                train_loader=loader.train_iter,
                valid_loader=loader.valid_iter,
                src_vocab=loader.src.vocab if lm.vocab_size == len(loader.src.vocab) else None,
                tgt_vocab=loader.tgt.vocab if lm.vocab_size == len(loader.tgt.vocab) else None,
                n_epochs=config.lm_n_epochs,
            )

        loader = DataLoader(config.train,
                            config.valid,
                            (config.lang[:2], config.lang[-2:]),
                            batch_size=config.batch_size,
                            device=config.gpu_id,
                            max_length=config.max_length,
                            dsl=config.dsl
                            )

        from simple_nmt.dual_trainer import DualSupervisedTrainer as DSLTrainer

        dsl_trainer = DSLTrainer(config)

        optimizers = [
            optim.Adam(models[0].parameters()),
            optim.Adam(models[1].parameters()),
        ]

        if opt_weight is not None:
            for opt, w in zip(optimizers, opt_weight):
                opt.load_state_dict(w)

        dsl_trainer.train(
            models,
            language_models,
            crits,
            optimizers,
            train_loader=loader.train_iter,
            valid_loader=loader.valid_iter,
            vocabs=[loader.src.vocab, loader.tgt.vocab],
            n_epochs=config.n_epochs + config.dsl_n_epochs,
            lr_schedulers=None,
        )
    else:
        loader = DataLoader(config.train,
                            config.valid,
                            (config.lang[:2], config.lang[-2:]),
                            batch_size=config.batch_size,
                            device=config.gpu_id,
                            max_length=config.max_length,
                            dsl=config.dsl
                            )

        from simple_nmt.trainer import MaximumLikelihoodEstimationTrainer as MLETrainer

        # Encoder's embedding layer input size
        input_size = len(loader.src.vocab)
        # Decoder's embedding layer input size and Generator's softmax layer output size
        output_size = len(loader.tgt.vocab)

        # Declare the model
        if config.use_transformer:
            model = Transformer(
                input_size,
                config.hidden_size,
                output_size,
                n_splits=config.n_splits,
                n_enc_blocks=config.n_layers,
                n_dec_blocks=config.n_layers,
                dropout_p=config.dropout,
            )
        else:
            model = Seq2Seq(
                input_size,
                config.word_vec_size,       # Word embedding vector size
                config.hidden_size,         # LSTM's hidden vector size
                output_size,
                n_layers=config.n_layers,   # number of layers in LSTM
                dropout_p=config.dropout    # dropout rate in LSTM
            )

        # Default weight for the loss equals 1, but we don't need a loss for the PAD token.
        # Thus, set the weight for PAD to zero.
        loss_weight = torch.ones(output_size)
        loss_weight[data_loader.PAD] = 0.
        # Instead of using Cross-Entropy loss,
        # we can use Negative Log-Likelihood (NLL) loss with log-probabilities.
        crit = nn.NLLLoss(weight=loss_weight, reduction='sum')

        print(model)
        print(crit)

        if model_weight is not None:
            model.load_state_dict(model_weight)

        # Pass models to the GPU device if necessary.
        if config.gpu_id >= 0:
            model.cuda(config.gpu_id)
            crit.cuda(config.gpu_id)

        if config.use_adam:
            if config.use_transformer:
                optimizer = optim.Adam(model.parameters(), lr=config.lr, betas=(.9, .98))
            else:  # case of RNN-based seq2seq.
                optimizer = optim.Adam(model.parameters(), lr=config.lr)
        else:
            optimizer = optim.SGD(model.parameters(), lr=config.lr)

        if opt_weight is not None and config.use_adam:
            optimizer.load_state_dict(opt_weight)

        if config.lr_step > 0:
            lr_scheduler = optim.lr_scheduler.MultiStepLR(
                optimizer,
                milestones=[i for i in range(max(0, config.lr_decay_start - 1),
                                             config.n_epochs,
                                             config.lr_step)],
                gamma=config.lr_gamma
            )
        else:
            lr_scheduler = None

        print(optimizer)

        # Start training. This function may be equivalent to the 'fit' function in Keras.
        mle_trainer = MLETrainer(config)
        mle_trainer.train(
            model,
            crit,
            optimizer,
            train_loader=loader.train_iter,
            valid_loader=loader.valid_iter,
            src_vocab=loader.src.vocab,
            tgt_vocab=loader.tgt.vocab,
            n_epochs=config.n_epochs,
            lr_scheduler=lr_scheduler,
        )

        if config.rl_n_epochs > 0:
            optimizer = optim.SGD(model.parameters(), lr=config.rl_lr)

            from simple_nmt.rl_trainer import MinimumRiskTrainer
            mrt_trainer = MinimumRiskTrainer(config)

            mrt_trainer.train(
                model,
                crit,
                optimizer,
                train_loader=loader.train_iter,
                valid_loader=loader.valid_iter,
                src_vocab=loader.src.vocab,
                tgt_vocab=loader.tgt.vocab,
                n_epochs=config.rl_n_epochs,
            )
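
# The translation code in the next snippet restores everything from a dict-style checkpoint
# ('model' or 'models', 'src_vocab', 'tgt_vocab', plus the training config). As a rough sketch
# only (the exact key names and the 'config'/'opt' entries are assumptions inferred from that
# loading code, not a confirmed format), saving such a checkpoint could look like this:
import torch

def save_checkpoint(path, model, optimizer, config, src_vocab, tgt_vocab):
    torch.save({
        'model': model.state_dict(),   # or 'models': [m.state_dict() for m in models] in the DSL case
        'opt': optimizer.state_dict(),
        'config': config,
        'src_vocab': src_vocab,
        'tgt_vocab': tgt_vocab,
    }, path)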
else:
    # Load vocabularies from the model.
    src_vocab = saved_data['src_vocab']
    tgt_vocab = saved_data['tgt_vocab']

# Initialize the dataloader, but we don't need to read the training & test corpus.
# All we need is to load the vocabularies from the previously trained model.
loader = DataLoader()
loader.load_vocab(src_vocab, tgt_vocab)

input_size = len(loader.src.vocab)
output_size = len(loader.tgt.vocab)

# Declare the sequence-to-sequence model.
model = Seq2Seq(input_size,
                train_config.word_vec_dim,
                train_config.hidden_size,
                output_size,
                n_layers=train_config.n_layers,
                dropout_p=train_config.dropout)

if train_config.dsl:
    if not is_reverse:
        model.load_state_dict(saved_data['models'][0])
    else:
        model.load_state_dict(saved_data['models'][1])
else:
    # Load weight parameters from the trained model.
    model.load_state_dict(saved_data['model'])

# We need to turn on the evaluation mode, which turns off all dropouts.
model.eval()
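
# A tiny, self-contained check (not part of the repository) of what model.eval() changes:
# dropout is active in training mode and becomes the identity in evaluation mode.
import torch
import torch.nn as nn

drop = nn.Dropout(p=0.5)
x = torch.ones(1, 8)

drop.train()
print(drop(x))  # roughly half the entries are zeroed; survivors are scaled by 1 / (1 - p) = 2
drop.eval()
print(drop(x))  # identical to x: dropout is disabled in eval mode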