def train(self, model, data, num_epochs=5, resume=False, dev_data=None,
          optimizer=None, teacher_forcing_ratio=0):
    """ Run training for a given model.

    Args:
        model (seq2seq.models): model to run training on; if `resume=True`,
            it is overwritten by the model loaded from the latest checkpoint.
        data (seq2seq.dataset.dataset.Dataset): dataset object to train on
        num_epochs (int, optional): number of epochs to run (default 5)
        resume (bool, optional): resume training from the latest checkpoint
            (default False)
        dev_data (seq2seq.dataset.dataset.Dataset, optional): dev Dataset
            (default None)
        optimizer (seq2seq.optim.Optimizer, optional): optimizer for training
            (default: Optimizer(pytorch.optim.Adam, max_grad_norm=5))
        teacher_forcing_ratio (float, optional): teacher forcing ratio
            (default 0)
    Returns:
        model (seq2seq.models): trained model.
    """
    # If training is set to resume
    if resume:
        latest_checkpoint_path = Checkpoint.get_latest_checkpoint(self.expt_dir)
        resume_checkpoint = Checkpoint.load(latest_checkpoint_path)
        model = resume_checkpoint.model
        self.optimizer = resume_checkpoint.optimizer

        # A workaround to set the optimized parameters properly: rebuild the
        # optimizer around the restored model's parameters, reusing the saved
        # hyperparameters but dropping the stale 'params'/'initial_lr' entries
        resume_optim = self.optimizer.optimizer
        defaults = resume_optim.param_groups[0]
        defaults.pop('params', None)
        defaults.pop('initial_lr', None)
        self.optimizer.optimizer = resume_optim.__class__(model.parameters(),
                                                          **defaults)

        start_epoch = resume_checkpoint.epoch
        step = resume_checkpoint.step
    else:
        start_epoch = 1
        step = 0
        if optimizer is None:
            optimizer = Optimizer(optim.Adam(model.parameters()),
                                  max_grad_norm=5)
        self.optimizer = optimizer

    self.logger.info("Optimizer: %s, Scheduler: %s" %
                     (self.optimizer.optimizer, self.optimizer.scheduler))

    self._train_epoches(data, model, num_epochs, start_epoch, step,
                        dev_data=dev_data,
                        teacher_forcing_ratio=teacher_forcing_ratio)
    return model
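# Hedged usage sketch for the resumable train() above, in the IBM
# pytorch-seq2seq style. The SupervisedTrainer name, its constructor
# arguments, and the import path are assumptions drawn from that library's
# conventions rather than from this snippet itself.
#
# from seq2seq.trainer import SupervisedTrainer   # assumed import path
#
# trainer = SupervisedTrainer(loss=loss, batch_size=32,
#                             checkpoint_every=50, expt_dir='./experiment')
# # A first run trains from scratch; rerunning with resume=True restores the
# # model, optimizer, epoch and step from the newest checkpoint in expt_dir.
# model = trainer.train(model, train_data, num_epochs=5, dev_data=dev_data,
#                       resume=True, teacher_forcing_ratio=0.5)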
def test(self, args, model, dataloader, scheduler, num_epochs=5, resume=False,
         dev_data=None, optimizer=None, teacher_forcing_ratio=1.0,
         save_dir='runs/exp'):
    # Testing only makes sense against a trained checkpoint
    if resume:
        latest_checkpoint_path = Checkpoint.get_latest_checkpoint(self.model_dir)
        resume_checkpoint = Checkpoint.load(latest_checkpoint_path)
        model.load_state_dict(resume_checkpoint.model)
        self.optimizer = None
        self.args = args
        model.args = args
        start_epoch = resume_checkpoint.epoch
        step = resume_checkpoint.step
        print('Resume from ', latest_checkpoint_path)
        print('start_epoch : ', start_epoch)
        print('step : ', step)
        # Evaluation always starts from the beginning, regardless of the
        # counters stored in the checkpoint
        start_epoch = 1
        step = 0
    else:
        raise RuntimeError('Please resume from a checkpoint (resume=True)')

    self._test_epoches(dataloader, model, num_epochs, start_epoch, step,
                       dev_data=dev_data,
                       teacher_forcing_ratio=teacher_forcing_ratio,
                       clip=args.clip, save_dir=save_dir, args=args)
    return model
def test(self, args, model, dataloader, resume=False, save_dir='runs/exp'):
    # Testing only makes sense against a trained checkpoint
    if resume:
        latest_checkpoint_path = Checkpoint.get_latest_checkpoint(self.model_dir)
        resume_checkpoint = Checkpoint.load(latest_checkpoint_path)
        model.load_state_dict(resume_checkpoint.model)
        self.optimizer = None
        self.args = args
        model.args = args
        start_epoch = resume_checkpoint.epoch
        step = resume_checkpoint.step
        print('Resume from ', latest_checkpoint_path)
        print('start_epoch : ', start_epoch)
        print('step : ', step)
        # Evaluation always starts from the beginning, regardless of the
        # counters stored in the checkpoint
        start_epoch = 1
        step = 0
    else:
        raise RuntimeError('Please resume from a checkpoint (resume=True)')

    if args.mode == 'bleu_t1t2':
        print('Eval on bleu_t1&t2 !')
        self._test_epoches_t1t2(dataloader, model, step,
                                save_dir=save_dir, args=args)
    else:
        raise ValueError('Please choose t1 | t2 mode !')
    return model
def train(self, D, G, optimizer_D, optimizer_G, train, val=None,
          num_epoch=200, resume=False, opt=None):
    start_epoch = 0
    if resume:
        cp = Checkpoint.load(Checkpoint.get_latest_checkpoint('./experiment/gan'))
        self.model = cp.model
        start_epoch = cp.epoch + 1
    for epoch in range(start_epoch, num_epoch):
        logging.info('Epoch[%d] CycleGAN train' % epoch)
        train_iter, val_iter = torchtext.data.BucketIterator.splits(
            (train, val), batch_sizes=(1, 64), device=opt.device,
            sort_key=lambda x: len(x.real_a), repeat=False)
        self.train_epoch(D, G, optimizer_D, optimizer_G, train_iter)
def train(self, train, val=None, num_epoch=200, resume=False):
    start_epoch = 0
    if resume:
        cp = Checkpoint.load(Checkpoint.get_latest_checkpoint('./experiment/gan'))
        self.model = cp.model
        start_epoch = cp.epoch + 1
    for epoch in range(start_epoch, num_epoch):
        logging.info('Epoch[%d] CycleGAN train' % epoch)
        train_iter, val_iter = torchtext.data.BucketIterator.splits(
            (train, val), batch_sizes=(1, 64), device=self.device,
            sort_key=lambda x: len(x.real_a), repeat=False)
        self._train_epoch(train_iter)
        self.evaluate(val_iter)
        Checkpoint(model=self.model, optimizer=None, epoch=epoch, step=0,
                   input_vocab=None, output_vocab=None).save('./experiment/gan')
def test_get_latest_checkpoint(self, mock_listdir):
    mock_listdir.return_value = ['2017_05_22_09_47_26',
                                 '2017_05_22_09_47_31',
                                 '2017_05_23_10_47_29']
    latest_checkpoint = Checkpoint.get_latest_checkpoint(self.EXP_DIR)
    self.assertEqual(latest_checkpoint,
                     os.path.join(self.EXP_DIR,
                                  'checkpoints/2017_05_23_10_47_29'))
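# The test above pins down the contract: checkpoints live in a 'checkpoints'
# subdirectory and are named with zero-padded YYYY_MM_DD_HH_MM_SS timestamps,
# so lexicographic order equals chronological order and the newest entry
# sorts last. A minimal self-contained sketch of that behavior (an assumed
# reimplementation, not the library's actual source):

import os

def get_latest_checkpoint_sketch(experiment_path):
    # All checkpoints for an experiment live under <experiment>/checkpoints/
    checkpoints_path = os.path.join(experiment_path, 'checkpoints')
    # Reverse-sorting the timestamped names puts the newest checkpoint first
    all_times = sorted(os.listdir(checkpoints_path), reverse=True)
    return os.path.join(checkpoints_path, all_times[0])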
def load_checkpoint(checkpoint_name, expt_dir):
    if checkpoint_name is not None:
        checkpoint_path = os.path.join(expt_dir,
                                       Checkpoint.CHECKPOINT_DIR_NAME,
                                       checkpoint_name)
        logging.info("loading checkpoint from {}".format(checkpoint_path))
    else:
        checkpoint_path = Checkpoint.get_latest_checkpoint(expt_dir)
    return Checkpoint.load(checkpoint_path)
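# Hedged usage note for load_checkpoint(): an explicit name selects one
# specific checkpoint, while None falls back to the newest. The directory
# and timestamp below are illustrative values, not real paths.
#
# checkpoint = load_checkpoint('2017_05_23_10_47_29', './experiment')  # explicit
# checkpoint = load_checkpoint(None, './experiment')                   # latest
# model, input_vocab = checkpoint.model, checkpoint.input_vocab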
def train(self, encoder, decoder, data, num_epochs=5, resume=False,
          dev_data=None, optimizer=None, is_training=0):
    if resume:
        latest_checkpoint_path = Checkpoint.get_latest_checkpoint(self.expt_dir)
        resume_checkpoint = Checkpoint.load(latest_checkpoint_path)
        decoder = resume_checkpoint.model
        self.optimizer = resume_checkpoint.optimizer

        # A workaround to set the optimized parameters properly: rebuild the
        # optimizer around the restored decoder's parameters
        resume_optim = self.optimizer.optimizer
        defaults = resume_optim.param_groups[0]
        defaults.pop('params', None)
        defaults.pop('initial_lr', None)
        self.optimizer.optimizer = resume_optim.__class__(decoder.parameters(),
                                                          **defaults)

        start_epoch = resume_checkpoint.epoch
        step = resume_checkpoint.step
    else:
        start_epoch = 1
        step = 0
        if optimizer is None:
            optimizer = Optimizer(optim.Adam(decoder.parameters()),
                                  max_grad_norm=5)
        self.optimizer = optimizer

    self.logger.info("Optimizer: %s, Scheduler: %s" %
                     (self.optimizer.optimizer, self.optimizer.scheduler))

    self._train_epoches(data, encoder, decoder, num_epochs, start_epoch, step,
                        dev_data=dev_data, is_training=is_training)
    return decoder
def train(self, model, data, num_epochs=5, resume=False, dev_data=None,
          optimizer=None, teacher_forcing_ratio=0):
    """ Run training for a given model.

    Args:
        model (seq2seq.models): model to run training on; if `resume=True`,
            it is overwritten by the model loaded from the latest checkpoint.
        data (seq2seq.dataset.dataset.Dataset): dataset object to train on
        num_epochs (int, optional): number of epochs to run (default 5)
        resume (bool, optional): resume training from the latest checkpoint
            (default False)
        dev_data (seq2seq.dataset.dataset.Dataset, optional): dev Dataset
            (default None)
        optimizer (seq2seq.optim.Optimizer, optional): optimizer for training
            (default: Optimizer(pytorch.optim.Adam, max_grad_norm=5))
        teacher_forcing_ratio (float, optional): teacher forcing ratio
            (default 0)
    Returns:
        model (seq2seq.models): trained model.
    """
    # If training is set to resume
    if resume:
        latest_checkpoint_path = Checkpoint.get_latest_checkpoint(self.expt_dir)
        resume_checkpoint = Checkpoint.load(latest_checkpoint_path)
        model = resume_checkpoint.model
        self.optimizer = resume_checkpoint.optimizer

        # A workaround to set the optimized parameters properly
        resume_optim = self.optimizer.optimizer
        defaults = resume_optim.param_groups[0]
        defaults.pop('params', None)
        # 'initial_lr' is injected by LR schedulers and would be rejected as
        # an unexpected keyword by the optimizer constructor below
        defaults.pop('initial_lr', None)
        self.optimizer.optimizer = resume_optim.__class__(model.parameters(),
                                                          **defaults)

        start_epoch = resume_checkpoint.epoch
        step = resume_checkpoint.step
    else:
        start_epoch = 1
        step = 0
        if optimizer is None:
            optimizer = Optimizer(optim.Adam(model.parameters()),
                                  max_grad_norm=5)
        self.optimizer = optimizer

    self.logger.info("Optimizer: %s, Scheduler: %s" %
                     (self.optimizer.optimizer, self.optimizer.scheduler))

    self._train_epoches(data, model, num_epochs, start_epoch, step,
                        dev_data=dev_data,
                        teacher_forcing_ratio=teacher_forcing_ratio)
    return model
def train(self, args, model, dataloader, scheduler, num_epochs=5, resume=False,
          dev_data=None, optimizer=None, teacher_forcing_ratio=1.0,
          save_dir='runs/exp'):
    # If training is set to resume
    if resume:
        latest_checkpoint_path = Checkpoint.get_latest_checkpoint(self.model_dir)
        resume_checkpoint = Checkpoint.load(latest_checkpoint_path)
        model.load_state_dict(resume_checkpoint.model)
        self.optimizer = optimizer
        self.args = args
        model.args = args
        start_epoch = resume_checkpoint.epoch
        step = resume_checkpoint.step
        print('Resume from ', latest_checkpoint_path)
        print('start_epoch : ', start_epoch)
        print('step : ', step)
        # Rebuild the optimizer, then restore its saved state into the newly
        # created instance
        if args.adam:
            self.optimizer = torch.optim.Adam(model.parameters())
            self.optimizer.load_state_dict(resume_checkpoint.optimizer)
        elif args.sgd:
            self.optimizer = torch.optim.SGD(model.parameters(), lr=args.lr)
            self.optimizer.load_state_dict(resume_checkpoint.optimizer)
        self.scheduler = lr_scheduler.StepLR(self.optimizer, args.decay_steps,
                                             gamma=args.decay_factor)
        # Fast-forward the scheduler to the resumed step instead of calling
        # scheduler.step() in a loop
        self.scheduler._step_count = step
        for param_group in self.optimizer.param_groups:
            print('learning rate', param_group['lr'], step)
    else:
        start_epoch = 1
        step = 0
        if optimizer is None:
            optimizer = Optimizer(optim.Adam(model.parameters()),
                                  max_grad_norm=5)
        self.optimizer = optimizer
        self.scheduler = scheduler

    if args.only_sql:
        self._train_sql_epoches(dataloader, model, num_epochs, start_epoch,
                                step, dev_data=dev_data,
                                teacher_forcing_ratio=teacher_forcing_ratio,
                                clip=args.clip, save_dir=save_dir, args=args)
    else:
        self._train_epoches(dataloader, model, num_epochs, start_epoch, step,
                            dev_data=dev_data,
                            teacher_forcing_ratio=teacher_forcing_ratio,
                            clip=args.clip, save_dir=save_dir, args=args)
    return model
def _train_epoches(self, data, model, n_epochs, batch_size, resume,
                   dev_data=None, teacher_forcing_ratio=0):
    start = time.time()
    print_loss_total = 0  # Reset every print_every
    steps_per_epoch = data.num_batches(batch_size)
    total_steps = steps_per_epoch * n_epochs

    # If training is set to resume
    if resume:
        latest_checkpoint_path = Checkpoint.get_latest_checkpoint(self.expt_dir)
        resume_checkpoint = Checkpoint.load(latest_checkpoint_path)
        model = resume_checkpoint.model
        self.optimizer.set_parameters(model.parameters())
        self.optimizer.load_state_dict(resume_checkpoint.optimizer_state_dict)
        start_epoch = resume_checkpoint.epoch
        step = resume_checkpoint.step
    else:
        start_epoch = 1
        step = 0
        self.optimizer.set_parameters(model.parameters())

    for epoch in range(start_epoch, n_epochs + 1):
        data.shuffle(self.random_seed)
        batch_generator = data.make_batches(batch_size)
        # Consume batches already seen in a previous (resumed) run
        for _ in range((epoch - 1) * steps_per_epoch, step):
            next(batch_generator)

        model.train(True)
        for batch in batch_generator:
            step += 1
            input_variables = batch[0]
            target_variables = batch[1]
            loss = self._train_batch(input_variables, target_variables, model,
                                     teacher_forcing_ratio)

            # Record average loss
            print_loss_total += loss
            if step % self.print_every == 0:
                print_loss_avg = print_loss_total / self.print_every
                print_loss_total = 0
                log_msg = 'Time elapsed: %s, Progress: %d%%, Train %s: %.4f' % (
                    pretty_interval(start),
                    float(step) / total_steps * 100,
                    self.loss.name,
                    print_loss_avg)
                self.logger.info(log_msg)

            # Checkpoint
            if step % self.checkpoint_every == 0 or step == total_steps:
                Checkpoint(model=model,
                           optimizer_state_dict=self.optimizer.state_dict(),
                           epoch=epoch, step=step,
                           input_vocab=data.input_vocab,
                           output_vocab=data.output_vocab).save(self.expt_dir)

        log_msg = "Finished epoch {0}".format(epoch)
        if dev_data is not None:
            dev_loss = self.evaluator.evaluate(model, dev_data)
            self.optimizer.update(dev_loss, epoch)
            log_msg += ", Dev %s: %.4f" % (self.loss.name, dev_loss)
            model.train(mode=True)
        self.logger.info(log_msg)
def load_model(exp_path):
    cp = Checkpoint.load(Checkpoint.get_latest_checkpoint(exp_path))
    model = cp.model
    return model
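# Typical follow-up, sketched: wire the restored model into a predictor for
# decoding. Predictor and its predict() signature are assumptions based on
# the pytorch-seq2seq API, not shown in this snippet.
#
# model = load_model('./experiment')
# predictor = Predictor(model, input_vocab, output_vocab)
# print(predictor.predict("how are you".split()))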
                                 src_vocab=input_vocab,
                                 tgt_vocab=output_vocab)

# Prepare model
hidden_size = 128
encoder = EncoderRNN(input_vocab, dataset.src_max_len, hidden_size)
decoder = DecoderRNN(output_vocab, dataset.tgt_max_len, hidden_size,
                     dropout_p=0.2, use_attention=True)
seq2seq = Seq2seq(encoder, decoder)

if opt.resume:
    print("resuming training")
    latest_checkpoint = Checkpoint.get_latest_checkpoint(opt.expt_dir)
    seq2seq.load(latest_checkpoint)
else:
    # Small-uniform initialization for fresh seq2seq weights
    for param in seq2seq.parameters():
        param.data.uniform_(-0.08, 0.08)

# Prepare loss
weight = torch.ones(output_vocab.get_vocab_size())
mask = output_vocab.MASK_token_id
loss = Perplexity(weight, mask)
if torch.cuda.is_available():
    seq2seq.cuda()
    loss.cuda()

# train
parser = argparse.ArgumentParser()
parser.add_argument('--train_path', action='store', dest='train_path',
                    help='path to train data')
parser.add_argument('--test_path', action='store', dest='test_path',
                    help='path to test data')
parser.add_argument('--checkpoint', action='store', dest='checkpoint',
                    help='path to checkpoint')
opt = parser.parse_args()

latest_check_point = Checkpoint.get_latest_checkpoint(opt.checkpoint)
checkpoint = Checkpoint.load(latest_check_point)
input_vocab = checkpoint.input_vocab
output_vocab = checkpoint.output_vocab
model = checkpoint.model
optimizer = checkpoint.optimizer

# NLLLoss weighted uniformly over the vocabulary, ignoring the padding token
weight = torch.ones(len(output_vocab))
pad = output_vocab.stoi['<pad>']
loss = NLLLoss(weight, pad)
batch_size = 1
print(model)

train_file = opt.train_path
test_file = opt.test_path
set_num = get_set_num(train_file)
print('Initializing dataset')
train = torchtext.data.TabularDataset(
    path=opt.train_path, format='tsv',
    fields=[('src', src), ('tgt', tgt), ('beh', beh)],
    filter_pred=len_filter)
dev = torchtext.data.TabularDataset(
    path=opt.dev_path, format='tsv',
    fields=[('src', src), ('tgt', tgt)],
    filter_pred=len_filter)

if not os.path.exists(opt.ckpt_dir):
    os.makedirs(opt.ckpt_dir)

if opt.resume:
    latest_checkpoint_path = Checkpoint.get_latest_checkpoint(opt.ckpt_dir)
    resume_checkpoint = Checkpoint.load(latest_checkpoint_path)
    src.vocab = resume_checkpoint.input_vocab
    tgt.vocab = resume_checkpoint.output_vocab
else:
    print('Building vocab')
    # src.build_vocab(train, max_size=50000)
    # tgt.build_vocab(train, max_size=opt.vocab_size, vectors='glove.840B.300d')
    # Pick pretrained vectors whose dimensionality matches the hidden size
    if hidden_size == 300:
        vectors = 'glove.42B.300d'
    elif hidden_size == 100:
        vectors = 'glove.6B.100d'
    else:
        vectors = None
    tgt.build_vocab(train, max_size=vocab_size, vectors=vectors)
def train(self, model, data, round1=2, round2=10, norm_epochs=3,
          class_epochs=3, resume=False, dev_data=None, test_data=None,
          optimizer=None, teacher_forcing_ratio=0, lr=0.003):
    """ Run training for a given model.

    Args:
        model (seq2seq.models): model to run training on; if `resume=True`,
            it is overwritten by the model loaded from the latest checkpoint.
        data (seq2seq.dataset.dataset.Dataset): dataset object to train on
        norm_epochs (int, optional): number of epochs to run (default 3)
        resume (bool, optional): resume training from the latest checkpoint
            (default False)
        dev_data (seq2seq.dataset.dataset.Dataset, optional): dev Dataset
            (default None)
        optimizer (seq2seq.optim.Optimizer, optional): optimizer for training
            (default: Optimizer(pytorch.optim.Adam, max_grad_norm=5))
        teacher_forcing_ratio (float, optional): teacher forcing ratio
            (default 0)
    Returns:
        model (seq2seq.models): trained model.
    """
    # If training is set to resume
    if resume:
        latest_checkpoint_path = Checkpoint.get_latest_checkpoint(self.expt_dir)
        resume_checkpoint = Checkpoint.load(latest_checkpoint_path)
        model = resume_checkpoint.model
        self.optimizer = resume_checkpoint.optimizer

        # A workaround to set the optimized parameters properly
        resume_optim = self.optimizer.optimizer
        defaults = resume_optim.param_groups[0]
        defaults.pop('params', None)
        defaults.pop('initial_lr', None)
        self.optimizer.optimizer = resume_optim.__class__(model.parameters(),
                                                          **defaults)

        start_epoch = resume_checkpoint.epoch
        step = resume_checkpoint.step
    else:
        start_epoch = 1
        step = 0

    # Earlier experiments with per-module parameter groups, kept for reference:
    # norm_parameters = list(map(id, model.decoder.parameters()))
    # class_parameters = list(map(id, model.classification.parameters()))
    # base_params = filter(lambda p: id(p) not in class_parameters,
    #                      model.parameters())
    # ignored_params = list(map(id,
    #     model.encoder.elmo._scalar_mixes[0].parameters()))
    # base_params = filter(lambda p: id(p) not in ignored_params,
    #                      model.parameters())

    if optimizer is None:
        # optimizer = Optimizer(optim.Adam(
        #     [{'params': base_params},
        #      {'params': model.encoder.elmo._scalar_mixes[0].parameters(),
        #       'lr': 1e-2}],
        #     lr=lr, weight_decay=1e-4), max_grad_norm=5)
        optimizer = Optimizer(optim.Adam(model.parameters(), lr=lr,
                                         weight_decay=1e-4), max_grad_norm=5)
    self.optimizer = optimizer

    self.logger.info("Optimizer: %s, Scheduler: %s" %
                     (self.optimizer.optimizer, self.optimizer.scheduler))

    self.logger.info("{} rounds over the training data, {} epochs per round."
                     " Starting...".format(round1, norm_epochs))
    for i in range(round1):
        self.logger.info("Round: {}".format(i))
        self._pre_train_epochs(data[0], model, norm_epochs, 1, 0,
                               dev_data=dev_data[0],
                               teacher_forcing_ratio=teacher_forcing_ratio)
        self._train_epoches(data[1], model, class_epochs, 1, 0,
                            dev_data=dev_data[1], test_data=test_data,
                            teacher_forcing_ratio=teacher_forcing_ratio)

    self.logger.info("{} rounds over the training data, 1 epoch per round."
                     " Starting...".format(round2))
    for i in range(round2):
        self.logger.info("Round: {}".format(i))
        self._pre_train_epochs(data[0], model, 1, 1, 0,
                               dev_data=dev_data[0],
                               teacher_forcing_ratio=teacher_forcing_ratio)
        self._train_epoches(data[1], model, 1, 1, 0,
                            dev_data=dev_data[1], test_data=test_data,
                            teacher_forcing_ratio=teacher_forcing_ratio)

    self.logger.info('best_fb_f1: {}, best_tw_f1: {}'.format(
        self.best_fb_f1, self.best_tw_f1))
    return model