def setup_train(self, model_file_path=None):
    self.model = Model(model_file_path)

    params = list(self.model.encoder.parameters()) + list(self.model.decoder.parameters()) + \
        list(self.model.reduce_state.parameters())
    initial_lr = config.lr_coverage if config.is_coverage else config.lr
    self.optimizer = Adagrad(
        params,
        lr=initial_lr,
        initial_accumulator_value=config.adagrad_init_acc)
    # self.optimizer = Adam(params)

    start_iter, start_loss = 0, 0

    if model_file_path is not None:
        state = torch.load(model_file_path, map_location=lambda storage, location: storage)
        start_iter = state['iter']
        start_loss = state['current_loss']

        if not config.is_coverage:
            self.optimizer.load_state_dict(state['optimizer'])
            if use_cuda:
                for state in self.optimizer.state.values():
                    for k, v in state.items():
                        if torch.is_tensor(v):
                            state[k] = v.cuda()

    return start_iter, start_loss
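# Hedged counterpart sketch: the checkpoint fields read in setup_train above
# ('iter', 'current_loss', 'optimizer') imply a save routine along these lines.
# The function name and the model-state keys are assumptions, not taken from the source.
def save_checkpoint(self, iteration, running_avg_loss, path):
    state = {
        'iter': iteration,
        'current_loss': running_avg_loss,
        'optimizer': self.optimizer.state_dict(),
        'encoder_state_dict': self.model.encoder.state_dict(),
        'decoder_state_dict': self.model.decoder.state_dict(),
        'reduce_state_dict': self.model.reduce_state.state_dict(),
    }
    torch.save(state, path)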
def setup_train(self, model_file_path=None): """模型初始化或加载、初始化迭代次数、损失、优化器""" # 初始化模型 self.model = Model(model_file_path) # 模型参数的列表 params = list(self.model.encoder.parameters()) + list(self.model.decoder.parameters()) + \ list(self.model.reduce_state.parameters()) initial_lr = config.lr_coverage if config.is_coverage else config.lr # lr_coverage和lr二选一 # 定义优化器 self.optimizer = Adagrad(params, lr=initial_lr, initial_accumulator_value=config.adagrad_init_acc) # 初始化迭代次数和损失 start_iter, start_loss = 0, 0 # 如果传入的已存在的模型路径,加载模型继续训练 if model_file_path is not None: state = torch.load(model_file_path, map_location=lambda storage, location: storage) start_iter = state['iter'] start_loss = state['current_loss'] if not config.is_coverage: self.optimizer.load_state_dict(state['optimizer']) if USE_CUDA: for state in self.optimizer.state.values(): for k, v in state.items(): if torch.is_tensor(v): state[k] = v.to(DEVICE) return start_iter, start_loss
def __init__(self, *args, **kwargs):
    super(WeightedHolE, self).__init__()
    # self.add_hyperparam('rparam', kwargs.pop('rparam', 0.0))
    self.learning_rate = kwargs.get('lr', _DEF_LEARNING_RATE)
    entity_dim, _, relation_dim = args[0]
    embed_dim = args[1]
    self._max_epochs = kwargs.get('max_epochs', _DEF_MAX_EPOCHS)

    init_relations = kwargs.get('init_relations')
    if init_relations is not None:
        self.R = nn.Parameter(init_relations)
    else:
        self.R = nn.Parameter(torch.FloatTensor(relation_dim, embed_dim).uniform_(-.1, .1))
    self.R.my_name = 'R'
    self.R.grad = torch.zeros_like(self.R)

    pretrained_ent = kwargs.get('pretrained_entities')
    if pretrained_ent is not None:
        self.E = nn.Parameter(pretrained_ent)
    else:
        self.E = nn.Parameter(torch.FloatTensor(entity_dim, embed_dim).uniform_(-.1, .1))
    self.E.my_name = 'E'
    self.E.grad = torch.zeros_like(self.E)

    self.loss_function = nn.SoftMarginLoss(reduction='sum')
    self.optim = Adagrad(list(self.parameters()), lr=self.learning_rate)
def __init__(self, model, args):
    self.encoder = model.encoder
    self.decoder = model.decoder
    self.lr = {
        "encoder": args.encoder_learning_rate,
        "decoder": args.decoder_learning_rate,
    }
    self.warmup_steps = {
        "encoder": args.encoder_warmup_steps,
        "decoder": args.decoder_warmup_steps,
    }
    if args.optimizer == "adam":
        self.optimizers = {
            "encoder": Adam(model.encoder.parameters(), lr=self.lr["encoder"]),
            "decoder": Adam(model.decoder.parameters(), lr=self.lr["decoder"]),
        }
    elif args.optimizer == "adagrad":
        self.optimizers = {
            "encoder": Adagrad(model.encoder.parameters(), lr=self.lr["encoder"]),
            "decoder": Adagrad(model.decoder.parameters(), lr=self.lr["decoder"]),
        }
    else:
        raise NotImplementedError
    self._step = 0
def setup_train(self, model_file_path=None):
    self.model = Model(model_file_path)

    params = list(self.model.encoder.parameters()) + list(self.model.decoder.parameters()) + \
        list(self.model.reduce_state.parameters())
    # print("params : ", params)
    # print("params collection is completed....")
    initial_lr = config.lr_coverage if config.is_coverage else config.lr
    self.optimizer = Adagrad(
        params,
        lr=initial_lr,
        initial_accumulator_value=config.adagrad_init_acc)

    start_iter, start_loss = 0, 0

    # Load the state where training stopped earlier and use it to train further epochs
    if model_file_path is not None:
        state = torch.load(model_file_path, map_location=lambda storage, location: storage)
        start_iter = state['iter']
        start_loss = state['current_loss']

        if not config.is_coverage:
            self.optimizer.load_state_dict(state['optimizer'])
            # Move the optimizer state onto GPU/server-accessible tensors
            if use_cuda:
                for state in self.optimizer.state.values():
                    for k, v in state.items():
                        if torch.is_tensor(v):
                            state[k] = v.cuda()

    return start_iter, start_loss
def fit(self, data_loader, print_freq=1000, num_epochs=10):
    '''Fit to the data.

    Parameters
    ----------
    data_loader : DataLoader
        When enumerated, it returns an array-like object of shape (batch_size, length),
        where each element corresponds to a word index.
    print_freq : int
        how frequently to print the loss
    num_epochs : int
        the number of epochs
    '''
    def repackage_hidden(h):
        """Wraps hidden states in new Variables, to detach them from their history."""
        if type(h) == Variable:
            return Variable(h.data)
        else:
            return tuple(repackage_hidden(v) for v in h)

    if self.padding_idx is None:
        criterion = nn.CrossEntropyLoss()
    else:
        criterion = nn.CrossEntropyLoss(ignore_index=self.padding_idx)
    optimizer = Adagrad(self.parameters())

    i = 0
    running_loss = 0
    for epoch in range(num_epochs):
        for each_idx, each_batch in enumerate(data_loader):
            batch_var = Variable(each_batch, requires_grad=False)
            if self.use_gpu:
                batch_var = batch_var.cuda()

            optimizer.zero_grad()
            pred_batch = self.forward(batch_var[:, :-1]).contiguous()
            # shift the target by one step so the model predicts the next token
            tgt = batch_var[:, 1:].contiguous()

            loss = criterion(pred_batch.view(-1, self.vocab_size), tgt.view(-1))
            loss.backward()
            optimizer.step()
            self.init_hidden()

            # print statistics
            running_loss += loss.item()
            i += 1
            if i % print_freq == print_freq - 1:
                print('epoch: {}\t total examples: {}\t loss: {}'.format(
                    epoch + 1, (i + 1) * self.batch_size, running_loss / print_freq))
                running_loss = 0.0

    print('Finished Training')
def __fit(self, dataset: Tensor, model: VAEBase):
    it = DataLoader(dataset, batch_size=self.batch_size, shuffle=True)

    dur = self.noepochs * ceil(len(dataset) / self.batch_size)
    history = History(zeros(dur), zeros(dur), zeros(dur), zeros(dur), zeros(dur))

    hooks = CombinedHook()
    hooks.add(LossHook)
    hooks.add(RekonstruktHook, dataset[:10, :])
    hooks.add(LatentSamplerHook, self.nolatents)
    hooks.prehook(self, history)

    self.model = model
    self.noinputs = model.noinputs
    self.opt = Adagrad(self.model.parameters(), lr=0.01)  # See Section 5.

    for epoch in range(self.noepochs):
        for i, x in enumerate(it):
            self.opt.zero_grad()

            # Apply the model in the following steps:
            # (a) encode the datapoint into the latent space;
            # (b) sample points from the latent space;
            # (c) decode the sampled points from the latent space.
            mu, logsigma2 = self.model.encode(x)
            z = self.model.sample(mu, logsigma2)
            X = self.model.decode(z)

            # Estimate the KL-divergence and reconstruction error (RE).
            kl = self.model.kl(mu, logsigma2)
            re = self.model.re(x, X)

            # Do error backpropagation.
            loss = kl + re
            loss.backward()
            self.opt.step()

            # Aggregate runtime statistics.
            history.append(epoch=epoch, batch=i,
                           kl=float(kl / self.batch_size),
                           re=float(re / self.batch_size))

            if i % self.show_every == 0:
                hooks.hook(self, history)

    # Print status before exit.
    hooks.posthook(self, history)

    # Return itself for call chaining.
    return self
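# Hedged sketch, not part of the snippet above: it assumes VAEBase.kl computes the usual
# closed-form KL divergence between the diagonal Gaussian N(mu, sigma^2) and N(0, I),
# i.e. KL = -0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2). The helper name is illustrative.
import torch

def gaussian_kl(mu: torch.Tensor, logsigma2: torch.Tensor) -> torch.Tensor:
    return -0.5 * torch.sum(1.0 + logsigma2 - mu.pow(2) - logsigma2.exp())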
def __init__(self, params, lr=1e-2, lr_decay=0, weight_decay=0,
             initial_accumulator_value=0, grad_clip=0):
    Adagrad.__init__(self, params, lr=lr, lr_decay=lr_decay, weight_decay=weight_decay,
                     initial_accumulator_value=initial_accumulator_value)
    self.defaults['grad_clip'] = grad_clip
    self.param_groups[0].setdefault('grad_clip', grad_clip)
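# Hedged sketch of how a grad_clip setting like the one registered above could be used:
# clamp each gradient before deferring to the stock Adagrad update. The class name and
# the element-wise clamping strategy are assumptions, not taken from the snippet above.
from torch.optim import Adagrad

class ClippedAdagrad(Adagrad):
    def __init__(self, params, lr=1e-2, grad_clip=0, **kwargs):
        super().__init__(params, lr=lr, **kwargs)
        self.defaults['grad_clip'] = grad_clip
        for group in self.param_groups:
            group.setdefault('grad_clip', grad_clip)

    def step(self, closure=None):
        # Clip each parameter's gradient in place, then run the normal Adagrad update.
        for group in self.param_groups:
            clip = group.get('grad_clip', 0)
            if clip > 0:
                for p in group['params']:
                    if p.grad is not None:
                        p.grad.clamp_(-clip, clip)
        return super().step(closure)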
def create_optimiser(params, args):
    name = args.optim.lower()
    lr = args.lr
    wd = args.weight_decay
    if name == 'adam':
        b = [args.beta1, args.beta2]
        if wd > 0:
            from torch.optim import AdamW
            opt = AdamW(params, lr, betas=b, weight_decay=wd)
        else:
            from torch.optim import Adam
            opt = Adam(params, lr, betas=b)
    elif name == 'sgd':
        from torch.optim import SGD
        m = args.momentum
        opt = SGD(params, lr, momentum=m, nesterov=True, weight_decay=wd)
    elif name == 'rmsprop':
        from torch.optim import RMSprop
        m = args.momentum
        a = args.alpha
        opt = RMSprop(params, lr, momentum=m, alpha=a, weight_decay=wd)
    elif name == 'adagrad':
        from torch.optim import Adagrad
        ld = args.lr_decay
        opt = Adagrad(params, lr, lr_decay=ld, weight_decay=wd)
    else:
        raise ValueError('optim must be one of adam, sgd, rmsprop, adagrad')
    return opt
def _set_optimizer(self, lr, opt_conf):
    """Instantiate an optimizer according to what self._optimizer specifies."""
    if self._optimizer in adam:
        return Adam([{'params': self.model.parameters()}], lr=lr, **opt_conf)
    elif self._optimizer in sgd:
        return SGD([{'params': self.model.parameters()}], lr=lr, **opt_conf)
    elif self._optimizer in rmsprop:
        return RMSprop([{'params': self.model.parameters()}], lr=lr, **opt_conf)
    elif self._optimizer in adadelta:
        return Adadelta([{'params': self.model.parameters()}], lr=lr, **opt_conf)
    elif self._optimizer in adagrad:
        return Adagrad([{'params': self.model.parameters()}], lr=lr, **opt_conf)
    else:
        raise ValueError(f'optimizer={self._optimizer} is not supported')
def setup(self, config):
    model = Model(config)
    checkpoint = None
    if config.train_from != '':
        logging('Train from %s' % config.train_from)
        checkpoint = torch.load(config.train_from, map_location='cpu')
        model.load_state_dict(checkpoint['model'])
        self.step = checkpoint['step']
    self.model = model.to(device)
    self.optimizer = Adagrad(model.parameters(), lr=config.learning_rate,
                             initial_accumulator_value=config.initial_acc)
    if checkpoint is not None:
        self.optimizer.load_state_dict(checkpoint['optimizer'])
def setup_train(self, model_file_path=None, emb_v_path=None, emb_list_path=None, vocab=None, log=None):
    self.model = Model(model_file_path)
    if model_file_path is None:
        set_embedding(self.model, emb_v_path=emb_v_path, emb_list_path=emb_list_path,
                      vocab=self.vocab, use_cuda=use_cuda, log=log)

    params = list(self.model.encoder.parameters()) + list(self.model.decoder.parameters()) + \
        list(self.model.reduce_state.parameters())
    initial_lr = config.lr_coverage if config.is_coverage else config.lr
    if config.mode == 'MLE':
        self.optimizer = Adagrad(params, lr=0.15, initial_accumulator_value=0.1)
    else:
        self.optimizer = Adam(params, lr=initial_lr)

    start_iter, start_loss = 0, 0
    if model_file_path is not None:
        state = torch.load(model_file_path, map_location=lambda storage, location: storage)
        start_iter = state['iter']
        start_loss = state['current_loss']

    return start_iter, start_loss
def setup(self, config):
    self.model = Model(config).to(config['device'])
    self.optimizer = Adagrad(self.model.parameters(), lr=config['learning_rate'],
                             initial_accumulator_value=0.1)
    # self.optimizer = Adam(self.model.parameters(), lr=config['learning_rate'], betas=config['betas'])
    checkpoint = None
    # Note: between two calls to Counter.most_common(), elements with equal counts may come back in a different order!
    if config['train_from'] != '':
        logging('Train from %s' % config['train_from'])
        checkpoint = torch.load(config['train_from'], map_location='cpu')
        self.model.load_state_dict(checkpoint['model'])
        self.step = checkpoint['step']
        self.vocab = checkpoint['vocab']
        self.optimizer.load_state_dict(checkpoint['optimizer'])
def train(self, train_data, tester_val, tester_tst):
    head, tail, rela = train_data
    # useful information related to cache
    n_train = len(head)

    if self.args.optim == 'adam' or self.args.optim == 'Adam':
        self.optimizer = Adam(self.model.parameters(), lr=self.args.lr)
    elif self.args.optim == 'adagrad' or self.args.optim == 'Adagrad':
        self.optimizer = Adagrad(self.model.parameters(), lr=self.args.lr)
    else:
        self.optimizer = SGD(self.model.parameters(), lr=self.args.lr)
    scheduler = ExponentialLR(self.optimizer, self.args.decay_rate)

    n_epoch = self.args.n_epoch
    n_batch = self.args.n_batch
    best_mrr = 0
    best_str = ''
    # used for counting repeated triplets for margin based loss
    for epoch in range(n_epoch):
        start = time.time()
        self.epoch = epoch
        rand_idx = torch.randperm(n_train)
        head = head[rand_idx].cuda()
        tail = tail[rand_idx].cuda()
        rela = rela[rand_idx].cuda()

        epoch_loss = 0
        for h, t, r in batch_by_size(n_batch, head, tail, rela, n_sample=n_train):
            self.model.zero_grad()
            loss = self.model.forward(h, t, r)
            loss += self.args.lamb * self.model.regul
            loss.backward()
            self.optimizer.step()
            self.prox_operator()
            epoch_loss += loss.data.cpu().numpy()
        self.time_tot += time.time() - start
        scheduler.step()

        if (epoch + 1) % self.args.epoch_per_test == 0:
            # evaluate on the validation and test sets
            valid_mrr, valid_mr, valid_10 = tester_val()
            test_mrr, test_mr, test_10 = tester_tst()
            out_str = '%.4f\t\t%.4f\t%.4f\t%.4f\n' % (epoch + 1, test_mr, test_mrr, test_10)

            # keep the best performance info
            if valid_mrr > best_mrr:
                best_mrr = valid_mrr
                best_str = out_str

            if best_mrr < self.args.thres:
                print('\tearly stopped in Epoch:{}, best_mrr:{}'.format(epoch + 1, best_mrr), self.model.struct)
                return best_str

    return best_mrr, best_str
def fit(model, num_epochs, trainloader, valloader):
    criterion = binary_cross_entropy(input_size, target_size)
    optimizer = Adagrad(model.parameters(), lr=lr, lr_decay=lr / num_epochs)
    scheduler = OneCycleLR(optimizer, lr_range=(lr, 1.), num_steps=1000)

    print("epoch\ttrain loss\tvalid loss\taccuracy")
    for epoch in range(num_epochs):
        train_loss = train(trainloader, model, criterion, optimizer, scheduler)
        valid_loss, valid_acc = validate(valloader, model, criterion)
        print(f"{epoch}\t{train_loss:.5f}\t\t{valid_loss:.5f}\t\t{valid_acc:.3f}")
def build(self, params):
    from torch.optim import Adagrad

    return Adagrad(
        params,
        lr=self.lr,
        lr_decay=self.lr_decay,
        weight_decay=self.weight_decay,
        initial_accumulator_value=self.initial_accumulator_value,
        eps=self.eps)
def get_optimizer(optim_name, fixed_cnn, args):
    """Return the requested optimizer."""
    optimizers = {
        'sgd': SGD(params=adjust_weight_decay(fixed_cnn, args.l2_reg),
                   lr=args.lr, momentum=0.9, nesterov=True),
        'adam': Adam(params=adjust_weight_decay(fixed_cnn, args.l2_reg), lr=args.lr),
        'adagrad': Adagrad(params=adjust_weight_decay(fixed_cnn, args.l2_reg), lr=args.lr),
        'rmsprop': RMSprop(params=adjust_weight_decay(fixed_cnn, args.l2_reg), lr=args.lr),
    }
    return optimizers.get(optim_name)
def dispatch_optimizer(model, args):
    if args.optimizer == 'SGD':
        return SGD(model.parameters(), lr=args.learning_rate,
                   momentum=args.momentum, weight_decay=args.weight_decay)
    if args.optimizer == 'Adam':
        return Adam(model.parameters(), lr=args.learning_rate)
    if args.optimizer == 'AdamW':
        return AdamW(model.parameters(), lr=args.learning_rate)
    if args.optimizer == 'RMSprop':
        return RMSprop(model.parameters(), lr=args.learning_rate, weight_decay=args.weight_decay)
    if args.optimizer == 'Adagrad':
        return Adagrad(model.parameters(), lr=args.learning_rate, weight_decay=args.weight_decay)
    raise ValueError('Unknown optimizer: %s' % args.optimizer)
def configure_optimizers(self):
    args = self.config.optimizer.args
    lr = args.lr
    lr_init_accum = args.lr_init_accum
    params = self.parameters()
    optimizer = Adagrad(
        params=params,
        lr=lr,
        initial_accumulator_value=lr_init_accum,
    )
    return optimizer
def train_model(model, lr, epochs, train_loader, val_loader, patience):
    optimizer = Adagrad(model.parameters(), lr)
    criterion = nn.MSELoss()
    best_rmse = 100
    rounds_no_improve = 0
    for epoch in range(epochs):
        for users, items, x, y in train_loader:
            y_pred = model(users, items, x)
            loss = criterion(y_pred.reshape(-1), y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        logging.info('Last train loss: {0:.3f}'.format(
            loss.detach().cpu().numpy().tolist()))

        with torch.no_grad():
            errors = np.array([])
            for users, items, x, y in val_loader:
                y_pred = model(users, items, x)
                group_errors = (y_pred - y).reshape(-1).cpu().numpy()
                errors = np.concatenate([errors, group_errors])
            rmse = (errors ** 2).mean() ** 0.5
            logging.info('Test RMSE: {0:.3f}'.format(rmse))
            if rmse < best_rmse:
                best_rmse = rmse
                rounds_no_improve = 0
            else:
                rounds_no_improve += 1
            if rounds_no_improve >= patience:
                return model
    return model
def run_train():
    datainfo, vocabs = set_up_data()
    train_sampler = RandomSampler()
    criterion = SummLoss(config=config, padding_idx=vocabs.to_index(PAD_TOKEN))

    model = CGSum(config, vocab=vocabs)
    model.to(device)

    initial_lr = config.lr
    logger.info(f"learning rate = {initial_lr}")
    optimizer = Adagrad(filter(lambda p: p.requires_grad, model.parameters()),
                        lr=initial_lr,
                        initial_accumulator_value=config.adagrad_init_acc)

    train_loader = datainfo.datasets["train"]
    valid_loader = datainfo.datasets["dev"]

    callbacks = [
        TrainCallback(config, patience=10),
        FitlogCallback(),
        LRDecayCallback(optimizer.param_groups, steps=args.weight_decay_step)
    ]
    trainer = Trainer(model=model,
                      train_data=train_loader,
                      optimizer=optimizer,
                      loss=criterion,
                      batch_size=config.batch_size,
                      check_code_level=-1,
                      sampler=train_sampler,
                      n_epochs=config.n_epochs,
                      print_every=100,
                      dev_data=valid_loader,
                      update_every=args.update_every,
                      metrics=FastRougeMetric(pred='prediction',
                                              art_oovs='article_oovs',
                                              abstract_sentences='abstract_sentences',
                                              config=config,
                                              vocab=datainfo.vocabs["vocab"]),
                      metric_key="rouge-l-f",
                      validate_every=args.validate_every * args.update_every,
                      save_path=None,
                      callbacks=callbacks,
                      use_tqdm=True)

    logger.info("-" * 5 + "start training" + "-" * 5)
    traininfo = trainer.train(load_best_model=True)

    logger.info(' | end of Train | time: {:5.2f}s | '.format(traininfo["seconds"]))
    logger.info('[INFO] best eval model in epoch %d and iter %d',
                traininfo["best_epoch"], traininfo["best_step"])
def setup_train(self, model_file_path=None):
    self.model = Model(model_file_path)

    params = list(self.model.encoder.parameters()) + list(self.model.decoder.parameters()) + \
        list(self.model.reduce_state.parameters())
    print('sum of parameters: ' + str(sum([p.numel() for p in params if p.requires_grad])))
    initial_lr = config.lr_coverage if config.is_coverage else config.lr
    self.optimizer = Adagrad(
        params,
        lr=initial_lr,
        initial_accumulator_value=config.adagrad_init_acc)

    if config.is_mixed_precision_training:
        [self.model.encoder, self.model.decoder, self.model.reduce_state], self.optimizer = amp.initialize(
            [self.model.encoder, self.model.decoder, self.model.reduce_state],
            self.optimizer,
            loss_scale=1.0,
            opt_level="O2")

    start_iter, start_loss = 0, 0

    if model_file_path is not None:
        state = torch.load(model_file_path, map_location=lambda storage, location: storage)
        start_iter = state['iter']
        start_loss = state['current_loss']

        if not config.is_coverage:
            self.optimizer.load_state_dict(state['optimizer'])
            if use_cuda:
                for state in self.optimizer.state.values():
                    for k, v in state.items():
                        if torch.is_tensor(v):
                            state[k] = v.cuda()

    return start_iter, start_loss
def make_optimizer(params: Iterable[torch.nn.Parameter], is_emb: bool) -> Optimizer:
    params = list(params)
    if len(params) == 0:
        optimizer = DummyOptimizer()
    elif is_emb:
        optimizer = RowAdagrad(params, lr=config.lr)
    else:
        if config.relation_lr is not None:
            lr = config.relation_lr
        else:
            lr = config.lr
        optimizer = Adagrad(params, lr=lr)
    optimizer.share_memory()
    return optimizer
def __init__(self, model, args, train_dataset, eval_dataset, test_dataset, vocab, is_train=True):
    self.model = model  # .to(args.device)
    self.args = args
    self.train_dataset = train_dataset
    self.eval_dataset = eval_dataset
    self.test_dataset = test_dataset
    self.is_train = is_train
    self.vocab = vocab

    self.params = list(model.encoder.parameters()) + \
        list(model.decoder.parameters()) + list(model.reduce_state.parameters())
    initial_lr = args.lr_coverage if args.is_coverage else args.lr
    self.optimizer = Adagrad(
        self.params,
        lr=initial_lr,
        initial_accumulator_value=args.adagrad_init_acc)
def test_train_lcwa(self) -> None:
    """Test that LCWA training does not fail."""
    loop = LCWATrainingLoop(
        model=self.model,
        optimizer=Adagrad(params=self.model.get_grad_params(), lr=0.001),
        **(self.training_loop_kwargs or {}),
    )
    losses = self._safe_train_loop(
        loop,
        num_epochs=self.train_num_epochs,
        batch_size=self.train_batch_size,
        sampler='default',
    )
    self.assertIsInstance(losses, list)
def adagrad(parameters):
    # pick defaults
    if "lr_decay" not in parameters["optimizer"]:
        parameters["optimizer"]["lr_decay"] = 0
    if "eps" not in parameters["optimizer"]:
        parameters["optimizer"]["eps"] = 1e-6
    if "weight_decay" not in parameters["optimizer"]:
        parameters["optimizer"]["weight_decay"] = 0

    return Adagrad(
        parameters["model_parameters"],
        lr=parameters["learning_rate"],
        lr_decay=parameters["optimizer"]["lr_decay"],
        eps=parameters["optimizer"]["eps"],
        weight_decay=parameters["optimizer"]["weight_decay"],
    )
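# Usage sketch for the factory above; the Linear module and the values here are
# illustrative assumptions, not taken from the original code.
import torch

net = torch.nn.Linear(10, 2)
opt = adagrad({
    "model_parameters": net.parameters(),
    "learning_rate": 0.01,
    "optimizer": {},  # lr_decay / eps / weight_decay fall back to the defaults above
})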
def get_optimizer(net):
    if args.optimizer == 'sgd':
        optimizer = SGD(net.parameters(), lr=args.lr, momentum=args.momentum,
                        weight_decay=args.weight_decay)
    elif args.optimizer == 'nesterov':
        optimizer = SGD(net.parameters(), lr=args.lr, momentum=args.momentum,
                        weight_decay=args.weight_decay, nesterov=True)
    elif args.optimizer == 'adagrad':
        optimizer = Adagrad(net.parameters(), weight_decay=args.weight_decay)
    elif args.optimizer == 'adadelta':
        optimizer = Adadelta(net.parameters(), weight_decay=args.weight_decay)
    elif args.optimizer == 'adam':
        optimizer = Adam(net.parameters(), weight_decay=args.weight_decay)
    else:
        raise Exception('Invalid optimizer specified.')
    return optimizer
def testAccuracy_AsyncAdagrad(self):
    for sparse in (True, False):
        # testing that Adagrad = AsyncAdagrad with 1 process
        NE = 10000
        golden_model = nn.Embedding(NE, 100, sparse=sparse)
        test_model = nn.Embedding(NE, 100, sparse=sparse)
        test_model.load_state_dict(golden_model.state_dict())

        golden_optimizer = Adagrad(golden_model.parameters())
        self._stress_optimizer(golden_model, golden_optimizer, num_processes=1)

        test_optimizer = AsyncAdagrad(test_model.parameters())
        self._stress_optimizer(test_model, test_optimizer, num_processes=1)

        # This fails for Adagrad because it's not stable
        self.assertTensorEqual(golden_model.weight, test_model.weight)
def demo_pytorch_vae_mnist(hidden_sizes=[200, 200], latent_dim=5, distribution_type='bernoulli',
                           minibatch_size=20, checkpoints=100, n_epochs=20):
    cp = Checkpoints(checkpoints)
    model = VAEModel(
        encoder=make_mlp_encoder(visible_dim=784, hidden_sizes=hidden_sizes, latent_dim=latent_dim),
        decoder=make_mlp_decoder(latent_dim=latent_dim, hidden_sizes=hidden_sizes,
                                 visible_dim=784, dist_type=distribution_type),
        latent_dim=latent_dim,
    )
    # optimizer = Adam(params=model.parameters())
    # optimizer = RMSprop(params=model.parameters())
    # optimizer = Adamax(params=model.parameters())
    optimizer = Adagrad(params=model.parameters())
    # optimizer = SGD(lr=0.001, params=model.parameters())

    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=True, download=True,
                       transform=transforms.Compose([transforms.ToTensor()])),
        batch_size=minibatch_size, shuffle=True)

    for epoch in range(n_epochs):
        for batch_idx, (x, y) in enumerate(train_loader):
            epoch_pt = epoch + batch_idx / len(train_loader)
            optimizer.zero_grad()
            loss = -model.elbo(x.flatten(1)).sum()
            loss.backward()
            optimizer.step()
            rate = measure_global_rate('training')
            if cp():
                print(f'Mean Rate at Epoch {epoch_pt:.2g}: {rate:.3g}iter/s')
                z_samples = model.prior().sample((64,))
                x_dist = model.decode(z_samples)
                dbplot(x_dist.mean.reshape(-1, 28, 28), 'Sample Means',
                       title=f'Sample Means at epoch {epoch_pt:.2g}')
def run_train(config):
    train_dir, model_dir = initial_dir('train', config)
    config.train_path = train_dir
    config.model_path = model_dir
    print_config(config, train_dir)

    datainfo = set_up_data('train', config)
    train_sampler = BucketSampler(batch_size=config.batch_size, seq_len_field_name='enc_len')
    criterion = MyLoss(config=config, padding_idx=datainfo.vocabs["train"].to_index(PAD_TOKEN))

    model = Model(vocab=datainfo.vocabs["train"], config=config)
    params = list(model.encoder.parameters()) + list(model.decoder.parameters()) + \
        list(model.reduce_state.parameters())
    initial_lr = config.lr_coverage if config.is_coverage else config.lr
    optimizer = Adagrad(params, lr=initial_lr, initial_accumulator_value=config.adagrad_init_acc)

    train_loader = datainfo.datasets["train"]
    valid_loader = datainfo.datasets["dev"]
    summary_writer = tf.compat.v1.summary.FileWriter(train_dir)

    trainer = Trainer(model=model,
                      train_data=train_loader,
                      optimizer=optimizer,
                      loss=criterion,
                      batch_size=config.batch_size,
                      check_code_level=-1,
                      n_epochs=config.n_epochs,
                      print_every=100,
                      dev_data=valid_loader,
                      metrics=FastRougeMetric(pred='prediction',
                                              art_oovs='article_oovs',
                                              abstract_sentences='abstract_sentences',
                                              config=config,
                                              vocab=datainfo.vocabs["train"]),
                      metric_key="rouge-l-f",
                      validate_every=-1,
                      save_path=model_dir,
                      callbacks=[TrainCallback(config, summary_writer, patience=10)],
                      use_tqdm=False,
                      device=config.visible_gpu)

    logger.info("-" * 5 + "start training" + "-" * 5)
    traininfo = trainer.train(load_best_model=True)
    logger.info(' | end of Train | time: {:5.2f}s | '.format(traininfo["seconds"]))
    logger.info('[INFO] best eval model in epoch %d and iter %d',
                traininfo["best_epoch"], traininfo["best_step"])
    logger.info(traininfo["best_eval"])

    bestmodel_save_path = os.path.join(config.model_path, 'bestmodel.pkl')  # this is where checkpoints of best models are saved
    state = {
        'encoder_state_dict': model.encoder.state_dict(),
        'decoder_state_dict': model.decoder.state_dict(),
        'reduce_state_dict': model.reduce_state.state_dict()
    }
    # Wasn't model passed into the Trainer as an argument? How do changes to it inside the Trainer affect the object out here?
    torch.save(state, bestmodel_save_path)
    logger.info('[INFO] Saving eval best model to %s', bestmodel_save_path)