def train_bvae(BVAE, optim, train_data_generator, test_data_generator,
               check_point_dir, epoch_num, writer, output_file_path):
    # output_file_path is kept for API compatibility; the per-iteration
    # running-loss file logging was disabled upstream.
    for epoch in range(epoch_num):
        for i, batch in enumerate(train_data_generator):
            batch = batch.type(torch.cuda.FloatTensor)
            reconstruction, mean, std = BVAE(batch)
            loss = BVAE.compute_loss(batch, reconstruction, mean, std)
            writer.add_scalar("Loss/train", loss, epoch)
            optim.zero_grad()
            loss.backward()
            optim.step()
        # Checkpoint every 100 epochs.
        if epoch % 100 == 99:
            state = {
                'checkpoint_num': epoch,
                'state_dict': BVAE.state_dict(),
                'optimizer': optim.state_dict(),
            }
            save_path = os.path.join(check_point_dir, str(epoch + 1) + ".pt")
            torch.save(state, save_path)
        # Evaluate on the test set without building gradient graphs.
        with torch.no_grad():
            for i, batch in enumerate(test_data_generator):
                batch = batch.type(torch.cuda.FloatTensor)
                reconstruction, mean, std = BVAE(batch)
                loss = BVAE.compute_loss(batch, reconstruction, mean, std)
                writer.add_scalar("Loss/test", loss, epoch)
        writer.flush()

def update_model_state(self, optim):
    """
    :return: dictionary of model parameters to be saved
    """
    # Copy so that self.states is not mutated by the update below.
    return_dict = dict(self.states)
    return_dict.update({'state_dict': self.model.state_dict(),
                        'optimizer': optim.state_dict()})
    return return_dict

def save_model(self, model, model2, optim, optim2, epoch, batch_size):
    self.epoch = epoch
    self.batch_size = batch_size
    # Create the save directories if necessary.
    os.makedirs(self.args['save_path'], exist_ok=True)
    save_dir = os.path.join(self.args['save_path'], 'models/')
    os.makedirs(save_dir, exist_ok=True)
    path = os.path.join(save_dir, 'gnn.pt')
    data = {
        'model': model.state_dict(),
        'model2': model2.state_dict(),
        'optimizer': optim.state_dict(),
        'optimizer2': optim2.state_dict(),
        'epoch': epoch + 1,
        'loss': self.loss_train,
        'overlap': self.overlap_train
    }
    torch.save(data, path)
    # Also keep a per-epoch copy (the batch-size-dependent save frequency
    # was disabled upstream).
    torch.save(data, os.path.join(save_dir, 'gnn_epoch_{}.pt'.format(epoch)))
    print('Model Saved.')

def save_model(model, filename='trained_model', optimizers=None,
               savepoint=None, use_datetime=False, **kwargs):
    if optimizers is not None:
        if not isinstance(optimizers, list):
            optimizers = [optimizers]
    else:
        optimizers = []
    # Encode extra hyperparameters into the filename.
    for k in kwargs:
        filename += '_' + k + '_%f' % kwargs[k]
    if '.' not in filename:
        filename += '.pth'
    path = savepoint
    create_folder(path)
    path = join(path, filename)
    # Optimizer states are stored alongside the model parameters in one
    # flat dictionary, under 'optim_0', 'optim_1', ...
    save_dict = dict(model.state_dict())
    for i, optim in enumerate(optimizers):
        save_dict['optim_%i' % i] = optim.state_dict()
    torch.save(save_dict, path)

def save(name):
    torch.save(
        {
            'opt': optim.state_dict(),
            'opt_f': feature_optim.state_dict(),
            'net': combined_model.state_dict()
        }, name)

def save_model_best_acc(acc, best_acc, net, optim, epoch, save_path, filename):
    """Save a model and its optimizer if its accuracy beats the saved one.

    Args:
        acc (int): performance of the model to save
        best_acc (int): best performance of the previously saved model
        net (nn.Module): model to save
        optim (torch.optim): optimizer of the model to save
        epoch (int): number of epochs the model was trained for
        save_path (str): directory on disk to save the model to
        filename (str): filename on disk

    Returns:
        best_acc (int): the saved model's best performance
    """
    if acc > best_acc:
        print('Saving ...')
        state = {
            'net': net.state_dict(),
            'acc': acc,
            'epoch': epoch,
            'optim': optim.state_dict()
        }
        torch.save(state, os.path.join(save_path, filename))
        best_acc = acc
    return best_acc

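# A minimal restore sketch for the checkpoint layout written above. The
# function name is hypothetical, not part of the original; `net` and `optim`
# must already be constructed with matching architectures.
def load_model_best_acc(net, optim, save_path, filename):
    state = torch.load(os.path.join(save_path, filename))
    net.load_state_dict(state['net'])
    optim.load_state_dict(state['optim'])
    return state['acc'], state['epoch']
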
def save_model(model, optim, epoch, path):
    torch.save(
        {
            'epoch': epoch + 1,
            'state_dict': model.state_dict(),
            'optimizer': optim.state_dict()
        }, path)

def save_model(net, optim, epoch, path):
    state_dict = net.state_dict()
    torch.save({
        'epoch': epoch + 1,
        'state_dict': state_dict,
        'optimizer': optim.state_dict(),
    }, path)

def train():
    acc = 0
    for epoch in range(epoches):
        print("epoch:", epoch, "start")
        image, label = None, None
        for (image, label) in data_loader_train:
            res = lenet(image.cuda())
            loss = criterion(res, label.cuda())
            print("epoch:", epoch, " current acc=", acc,
                  " custom loss=", loss.item())
            loss_list.append(loss.item())
            optim.zero_grad()
            loss.backward()
            optim.step()
        # Accuracy on the last batch of the epoch.
        acc = batch_test(image, label)
        if epoch % 5 == 0:
            torch.save(
                {
                    'epoch': epoch + 1,
                    'state_dict': lenet.state_dict(),
                    'optimizer': optim.state_dict()
                }, PATH + str(epoch))
            loss_np = np.array(loss_list)
            np.save("loss.npy", loss_np)
            print(loss_np.shape)
        if epoch % val_every_epoche == 0:
            val_list.append(validation())
            np.save("val.npy", np.array(val_list))

def train(model, training_data, validation_data, test_data, optim,
          vocab_size, max_tensor_length):
    val_accrs = []
    if saved:
        val_accrs.append(savedModel['config']['maxacc'])
    test_accrs = []
    for i in range(n_epoch):
        start = time.time()
        train_accr, train_loss = train_epoch(model, training_data, optim)
        trainWriter.write(str(train_accr) + "\n")
        trainWriter.flush()
        lossWriter.write(str(train_loss) + '\n')
        lossWriter.flush()
        print('\n - (Training) accuracy: {accu:3.3f} %, '
              'elapse: {elapse:3.3f} min'.format(
                  accu=100 * train_accr, elapse=(time.time() - start) / 60))

        start = time.time()
        val_accr = eval_epoch(model, validation_data)
        validWriter.write(str(val_accr) + "\n")
        validWriter.flush()
        print('\n - (Validation) accuracy: {accu:3.3f} %, '
              'elapse: {elapse:3.3f} min'.format(
                  accu=100 * val_accr, elapse=(time.time() - start) / 60))
        val_accrs.append(val_accr)

        start = time.time()
        test_accr = test_epoch(model, test_data)
        testWriter.write(str(test_accr) + "\n")
        testWriter.flush()
        print('\n - (Test) accuracy: {accu:3.3f} %, '
              'elapse: {elapse:3.3f} min'.format(
                  accu=100 * test_accr, elapse=(time.time() - start) / 60))
        test_accrs.append(test_accr)

        model_state_dict = model.state_dict()
        config = {
            'max_src_seq_len': max_tensor_length,
            'vocab_size': vocab_size,
            'maxacc': max(val_accrs),
            'dropout': p_dropout
        }
        checkpoint = {
            'model': model_state_dict,
            'epoch': i,
            'optimizer': optim.state_dict(),
            'config': config
        }
        model_name = os.path.join(model_folder, "TypeModel.ckpt")
        # Save whenever the current epoch ties or beats the best validation
        # accuracy seen so far (val_accr was already appended above).
        if val_accr >= max(val_accrs):
            print("Save model at epoch ", i)
            torch.save(checkpoint, model_name)

def save_model(model, optim, epoch, loss, path):
    torch.save(
        {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optim.state_dict(),
            'loss': loss
        }, path)

def save_model(self, model, optim, epoch, best_score, model_path):
    """Save model to a local file."""
    torch.save({
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optim.state_dict(),
        'epoch': epoch,
        'best_score': best_score
    }, model_path)

def save_checkpoint(model: nn.Module, optim: optimizer.Optimizer,
                    epoch_id: int, step: int, best_score: float):
    torch.save({
        _MODEL_STATE_DICT: model.state_dict(),
        _OPTIMIZER_STATE_DICT: optim.state_dict(),
        _EPOCH: epoch_id,
        _STEP: step,
        _BEST_SCORE: best_score
    }, "./result/fr_en/checkpoint.tar")

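# Hypothetical counterpart to save_checkpoint above, assuming the same
# _MODEL_STATE_DICT/_OPTIMIZER_STATE_DICT/... key constants and hardcoded
# path; a sketch, not part of the original module.
def load_checkpoint(model: nn.Module, optim: optimizer.Optimizer):
    cpt = torch.load("./result/fr_en/checkpoint.tar")
    model.load_state_dict(cpt[_MODEL_STATE_DICT])
    optim.load_state_dict(cpt[_OPTIMIZER_STATE_DICT])
    return cpt[_EPOCH], cpt[_STEP], cpt[_BEST_SCORE]
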
def save_model(model, optim, logs, ckpt_dir, filename):
    file_path = os.path.join(ckpt_dir, filename)
    state = {'model': model.state_dict(),
             'optim': optim.state_dict(),
             'logs': tuple(logs),
             'steps': len(logs)}
    torch.save(state, file_path)

def save_branchyNet(self, path):
    dict_models_branches = {}
    dict_models_branches["model_main_state_dict"] = self.main.state_dict()
    for i, (model, optim) in enumerate(zip(self.models, self.optimizers), 1):
        dict_models_branches["model_branch_%s_state_dict" % i] = model.state_dict()
        dict_models_branches["optim_branch_%s_state_dict" % i] = optim.state_dict()
    torch.save(dict_models_branches, path)

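# Hypothetical inverse of save_branchyNet above: rebuild the per-branch
# states from the keyed dictionary. Assumes self.main, self.models and
# self.optimizers are already constructed with matching shapes; a sketch,
# not original code.
def load_branchyNet(self, path):
    dict_models_branches = torch.load(path)
    self.main.load_state_dict(dict_models_branches["model_main_state_dict"])
    for i, (model, optim) in enumerate(zip(self.models, self.optimizers), 1):
        model.load_state_dict(dict_models_branches["model_branch_%s_state_dict" % i])
        optim.load_state_dict(dict_models_branches["optim_branch_%s_state_dict" % i])
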
def save_state(net, optim):
    model_states = {'VAE_net': net.state_dict()}
    optim_states = {'VAE_optim': optim.state_dict()}
    states = {'model_states': model_states, 'optim_states': optim_states}
    file_path = "MI_estimator"
    with open(file_path, mode='wb+') as f:
        torch.save(states, f)

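# Loading counterpart for save_state above; a sketch (the function name is
# hypothetical), assuming torch.load can read the file written with
# torch.save to the same hardcoded path.
def load_state(net, optim, file_path="MI_estimator"):
    states = torch.load(file_path)
    net.load_state_dict(states['model_states']['VAE_net'])
    optim.load_state_dict(states['optim_states']['VAE_optim'])
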
def loop(data_loader, num_epochs=1000, save_every=1000, train_losses=None,
         test_losses=None, train_cnts=None, test_cnts=None, dummy=False):
    # Guard against the mutable-default-argument pitfall.
    train_losses = [] if train_losses is None else train_losses
    test_losses = [] if test_losses is None else test_losses
    train_cnts = [] if train_cnts is None else train_cnts
    test_cnts = [] if test_cnts is None else test_cnts
    print("starting training loop for data with %s batches" % data_loader.num_batches)
    st = time.time()
    if len(train_cnts):
        # Resume the counters from the last save.
        last_save = train_cnts[-1]
        cnt = train_cnts[-1]
    else:
        last_save = 0
        cnt = 0
    v_xn, v_yn = data_loader.validation_data()
    v_x = Variable(torch.FloatTensor(np.swapaxes(v_xn, 1, 0))).to(DEVICE)
    v_y = Variable(torch.FloatTensor(np.swapaxes(v_yn, 1, 0))).to(DEVICE)
    if dummy:
        print("WARNING DUMMY Validation")
        v_x, v_y = get_dummy_data(v_x, v_y)
    for e in range(num_epochs):
        ecnt = 0
        tst = round((time.time() - st) / 60., 0)
        if e > 0:
            print("starting epoch %s, %s mins, loss %s, seen %s, last save at %s"
                  % (e, tst, train_losses[-1], cnt, last_save))
        for b in range(data_loader.num_batches):
            x, y = data_loader.next_batch()
            x = Variable(torch.FloatTensor(np.swapaxes(x, 1, 0))).to(DEVICE)
            y = Variable(torch.FloatTensor(np.swapaxes(y, 1, 0))).to(DEVICE)
            if dummy:
                y_pred, loss = train(v_x, v_y, validation=False)
                print('DUMMY test loss', cnt, loss)
            else:
                y_pred, loss = train(x, y, validation=False)
            train_cnts.append(cnt)
            train_losses.append(loss)
            if cnt % 100:
                valy_pred, val_mean_loss = train(v_x, v_y, validation=True)
                test_losses.append(val_mean_loss)
                test_cnts.append(cnt)
            if cnt - last_save >= save_every:
                last_save = cnt
                print('epoch: {} saving after example {} train loss {} test loss {}'.format(
                    e, cnt, loss, val_mean_loss))
                state = {
                    'train_cnts': train_cnts,
                    'train_losses': train_losses,
                    'test_cnts': test_cnts,
                    'test_losses': test_losses,
                    'state_dict': lstm.state_dict(),
                    'optimizer': optim.state_dict(),
                }
                basename = os.path.join(savedir, '%s_%015d' % (model_save_name, cnt))
                plot_losses(train_cnts, train_losses, test_cnts, test_losses,
                            name=basename + '_loss.png')
                save_checkpoint(state, filename=basename + '.pkl')
            cnt += x.shape[1]
            ecnt += x.shape[1]

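# Sketch for resuming loop() from a checkpoint it wrote: the saved state
# carries the loss/count histories that loop() uses to restore its counters.
# Assumes save_checkpoint wraps torch.save, and that `lstm` and `optim` are
# the same module-level objects loop() saves; resume_loop is hypothetical.
def resume_loop(data_loader, checkpoint_path, num_epochs=1000, save_every=1000):
    state = torch.load(checkpoint_path)
    lstm.load_state_dict(state['state_dict'])
    optim.load_state_dict(state['optimizer'])
    loop(data_loader, num_epochs=num_epochs, save_every=save_every,
         train_losses=state['train_losses'], test_losses=state['test_losses'],
         train_cnts=state['train_cnts'], test_cnts=state['test_cnts'])
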
def save_branches(self, best_loss, best_acc, path):
    dict_models_branches = {}
    for i, (model, optim) in enumerate(zip(self.models, self.optimizers), 1):
        dict_models_branches["model_branch_%s_state_dict" % i] = model.state_dict()
        dict_models_branches["optim_branch_%s_state_dict" % i] = optim.state_dict()
    dict_models_branches["best_loss"] = best_loss
    dict_models_branches["best_acc"] = best_acc
    torch.save(dict_models_branches, path)

def save_model(epoch, model, optim, filename):
    head, tail = os.path.split(filename)
    if not os.path.exists(head):
        os.makedirs(head)
    torch.save(
        {
            'epoch': epoch,
            'model': model.state_dict(),
            'optimizer': optim.state_dict()
        }, filename)

def test(epoch):
    global best_prec1
    model.eval()
    loss = 0
    pred_y = []
    true_y = []
    correct = 0
    with torch.no_grad():
        for batch_idx, (data, target) in enumerate(target_loader):
            data, target = data.cuda(), target.cuda(non_blocking=True)
            data = data.unsqueeze(1)
            output = model(data)
            target = target.long()
            loss += criterion_cel(output, target).item()  # sum up batch loss
            # Index of the max log-probability.
            pred = output.max(1, keepdim=True)[1]
            for i in range(len(pred)):
                pred_y.append(pred[i].item())
                true_y.append(target[i].item())
            correct += pred.eq(target.view_as(pred)).sum().item()
    loss /= len(target_loader.dataset)
    utils.cal_acc(true_y, pred_y, NUM_CLASSES)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        loss, correct, len(target_loader.dataset),
        100. * correct / len(target_loader.dataset)))
    prec1 = 100. * correct / len(target_loader.dataset)
    is_best = prec1 > best_prec1
    best_prec1 = max(prec1, best_prec1)
    utils.save_checkpoint(
        {
            'epoch': epoch,
            'state_dict': model.state_dict(),
            'best_prec1': best_prec1,
            'optimizer': optimizer.state_dict(),
        }, is_best)
    if is_best:
        global best_gt_y
        global best_pred_y
        best_gt_y = true_y
        best_pred_y = pred_y

def save_checkpoint(cr, sr, optim, epoch):
    path = 'checkpoint/L{}_QF{}.pth'.format(PENALTY, QF)
    if not os.path.exists("checkpoint/"):
        os.makedirs("checkpoint/")
    torch.save(
        {
            'epoch': epoch,
            'cr': cr.state_dict(),
            'sr': sr.state_dict(),
            'optim': optim.state_dict()
        }, path)
    print("Checkpoint saved to {}".format(path))
    return path

def save_model(path, epoch, iteration, model, optim, optim_method,
               batch_size, mis_class):
    torch.save(
        {
            # Class name with underscores stripped, e.g. My_Net -> MyNet.
            "model_type": "".join(model.__class__.__name__.split("_")),
            'epoch': epoch,
            'iteration': iteration,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optim.state_dict(),
            'optim_method': optim_method,
            'batch_size': batch_size,
            'mis_class': mis_class
        }, path)

def save_checkpoint(args, model, optim, iter):
    cpt = {
        'iter': iter,
        'model_state': model.state_dict(),
        'optim_state': optim.state_dict()
    }
    cpt_name = get_checkpoint_name(args, iter)
    cpt_link = get_checkpoint_name(args, None)
    torch.save(cpt, cpt_name)
    # Repoint the "latest" symlink at the checkpoint just written.
    if os.path.exists(cpt_link):
        os.remove(cpt_link)
    os.symlink(os.path.abspath(cpt_name), cpt_link)

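# Because the symlink written above always points at the newest checkpoint,
# resuming does not need to know the iteration number. A sketch, assuming
# get_checkpoint_name(args, None) returns the link path as in save_checkpoint.
def load_latest_checkpoint(args, model, optim):
    cpt = torch.load(get_checkpoint_name(args, None))
    model.load_state_dict(cpt['model_state'])
    optim.load_state_dict(cpt['optim_state'])
    return cpt['iter']
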
def build_optim(_model, train_args, checkpoint=None):
    saved_optimizer_state_dict = None
    if checkpoint:
        # Reuse the optimizer pickled into the checkpoint.
        optim = checkpoint['optim']
        saved_optimizer_state_dict = optim.state_dict()
    else:
        optim = AdamW(_model.parameters(), lr=train_args.lr, eps=1e-8)
    if train_args.train_from is not None and saved_optimizer_state_dict is not None:
        optim.load_state_dict(saved_optimizer_state_dict)
    if train_args.device != 'cpu':
        # Move any optimizer state tensors onto the training device.
        for state in optim.state.values():
            for k, v in state.items():
                if torch.is_tensor(v):
                    state[k] = v.cuda(device=train_args.device)
    return optim

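# Hypothetical resume flow for build_optim above: load the checkpoint (whose
# 'optim' entry is a pickled optimizer, per the function) and rebuild.
# resume_optimizer is a sketch, not part of the original module.
def resume_optimizer(_model, train_args):
    checkpoint = None
    if train_args.train_from is not None:
        checkpoint = torch.load(train_args.train_from, map_location='cpu')
    return build_optim(_model, train_args, checkpoint)
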
def checkpoint(model, model_store_folder, epoch_num, model_name, period,
               frames, bestLoss, loss, auc, optim, scheduler=None):
    print('Saving checkpoints...')
    save = {
        'epoch': epoch_num,
        'state_dict': model.state_dict(),
        'period': period,
        'frames': frames,
        'best_loss': bestLoss,
        'loss': loss,
        'opt_dict': optim.state_dict(),
        'scheduler_dict': scheduler.state_dict() if scheduler is not None else None,
    }
    suffix_latest = '{}.pth'.format(model_name)
    torch.save(save, '{}/{}'.format(model_store_folder, suffix_latest))

def log_and_save_epoch(self, model, optim, epoch, loss):
    epoch = epoch + 1
    self.logger.info('####################')
    self.logger.info(f'COMPLETED EPOCH: {epoch}')
    self.logger.info('####################')
    # Log document weights - check for sparsity.
    doc_weights = model.doc_weights.weight
    proportions = F.softmax(doc_weights, dim=1)
    avg_s_score = np.mean([utils.get_sparsity_score(p) for p in proportions])
    self.logger.info(f'DOCUMENT PROPORTIONS:\n {proportions}')
    self.logger.info(f'AVERAGE SPARSITY SCORE: {avg_s_score}\n')
    self.writer.add_scalar('avg_doc_prop_sparsity_score', avg_s_score, epoch)
    _, max_indices = torch.max(proportions, dim=1)
    max_indices = list(max_indices.cpu().numpy())
    max_counter = Counter(max_indices)
    self.logger.info(f'MAXIMUM TOPICS AT INDICES, FREQUENCY: {max_counter}\n')
    self.logger.info(f'MOST FREQUENT MAX INDICES: {max_counter.most_common(10)}\n')
    if epoch % self.args.save_step == 0:
        # Visualize document embeddings.
        self.writer.add_embedding(
            model.get_doc_vectors(),
            global_step=epoch,
            tag=f'de_epoch_{epoch}',
        )
        # Save checkpoint.
        self.logger.info('Beginning to save checkpoint')
        self.saver.save_checkpoint({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optim.state_dict(),
            'loss': loss,
        })
        self.logger.info('Finished saving checkpoint')

def save_model(acc, net, optim, epoch, save_path, filename):
    """Save a model and its optimizer.

    Args:
        acc (int): performance of the model to save
        net (nn.Module): model to save
        optim (torch.optim): optimizer of the model to save
        epoch (int): number of epochs the model was trained for
        save_path (str): directory on disk to save the model to
        filename (str): filename on disk
    """
    print('Saving ...')
    state = {
        'net': net.state_dict(),
        'acc': acc,
        'epoch': epoch,
        'optim': optim.state_dict()
    }
    torch.save(state, os.path.join(save_path, filename))

def train_network(start_epoch, epochs, optim, model, train_loader, val_loader,
                  criterion, mixup, device, dtype, batch_size, log_interval,
                  csv_logger, save_path, claimed_acc1, claimed_acc5, best_test,
                  local_rank, child):
    my_range = range if child else trange
    for epoch in my_range(start_epoch, epochs + 1):
        if not isinstance(optim.scheduler, CyclicLR) and not isinstance(
                optim.scheduler, CosineLR):
            optim.scheduler_step()
        train_loss, train_accuracy1, train_accuracy5 = train(
            model, train_loader, mixup, epoch, optim, criterion, device,
            dtype, batch_size, log_interval, child)
        test_loss, test_accuracy1, test_accuracy5 = test(
            model, val_loader, criterion, device, dtype, child)
        csv_logger.write({
            'epoch': epoch + 1,
            'val_error1': 1 - test_accuracy1,
            'val_error5': 1 - test_accuracy5,
            'val_loss': test_loss,
            'train_error1': 1 - train_accuracy1,
            'train_error5': 1 - train_accuracy5,
            'train_loss': train_loss
        })
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'best_prec1': best_test,
                'optimizer': optim.state_dict()
            },
            test_accuracy1 > best_test,
            filepath=save_path,
            local_rank=local_rank)
        csv_logger.plot_progress(claimed_acc1=claimed_acc1,
                                 claimed_acc5=claimed_acc5)
        if test_accuracy1 > best_test:
            best_test = test_accuracy1
    csv_logger.write_text('Best accuracy is {:.2f}% top-1'.format(best_test * 100.))

def train_loop(self, num_batches=10):
    # This fragment assumes cnt, last_save, the loss/count histories,
    # v_x/v_y, lstm, optim, savedir and model_save_name are module-level
    # state shared with loop() above.
    ecnt = 0
    for b in range(self.data_loader.num_batches):
        xnp, ynp = self.data_loader.next_batch()
        x = Variable(torch.FloatTensor(xnp))
        y = Variable(torch.FloatTensor(ynp))
        y_pred, loss = train(x, y, validation=False)
        train_cnts.append(cnt)
        train_losses.append(loss)
        if cnt % 100:
            valy_pred, val_mean_loss = train(v_x, v_y, validation=True)
            test_losses.append(val_mean_loss)
            test_cnts.append(cnt)
        if cnt - last_save >= save_every:
            last_save = cnt
            print('epoch: {} saving after example {} train loss {} test loss {}'.format(
                e, cnt, loss, val_mean_loss))
            state = {
                'train_cnts': train_cnts,
                'train_losses': train_losses,
                'test_cnts': test_cnts,
                'test_losses': test_losses,
                'state_dict': lstm.state_dict(),
                'optimizer': optim.state_dict(),
            }
            basename = os.path.join(savedir, '%s_%015d' % (model_save_name, cnt))
            # Smooth the curves with a rolling average before plotting.
            n = 500
            plot_losses(rolling_average(train_cnts, n),
                        rolling_average(train_losses, n),
                        rolling_average(test_cnts, n),
                        rolling_average(test_losses, n),
                        name=basename + '_loss.png')
            save_checkpoint(state, filename=basename + '.pkl')
        cnt += x.shape[1]
        ecnt += x.shape[1]


loop(data_loader, save_every=save_every, num_epochs=args.num_epochs,
     train_losses=train_losses, test_losses=test_losses,
     train_cnts=train_cnts, test_cnts=test_cnts, dummy=args.dummy)

def save_checkpoint(acc, model, optim, epoch, index=False):
    # Save checkpoint.
    print('Saving..')
    # Unwrap DataParallel so the checkpoint loads on a single device.
    if isinstance(model, nn.DataParallel):
        model = model.module
    state = {
        'net': model.state_dict(),
        'optimizer': optim.state_dict(),
        'acc': acc,
        'epoch': epoch,
        'rng_state': torch.get_rng_state()
    }
    if index:
        ckpt_name = 'ckpt_epoch' + str(epoch) + '_' + str(SEED) + '.t7'
    else:
        ckpt_name = 'ckpt_' + str(SEED) + '.t7'
    ckpt_path = os.path.join(LOGDIR, ckpt_name)
    torch.save(state, ckpt_path)

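# Restoring the checkpoint above, including the RNG state so that shuffling
# and dropout resume deterministically. A sketch assuming the same
# LOGDIR/SEED globals; the function name is hypothetical.
def load_checkpoint(model, optim, epoch=None, index=False):
    if index:
        ckpt_name = 'ckpt_epoch' + str(epoch) + '_' + str(SEED) + '.t7'
    else:
        ckpt_name = 'ckpt_' + str(SEED) + '.t7'
    state = torch.load(os.path.join(LOGDIR, ckpt_name))
    model.load_state_dict(state['net'])
    optim.load_state_dict(state['optimizer'])
    torch.set_rng_state(state['rng_state'])
    return state['acc'], state['epoch']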