        # tail of the per-epoch loop: save generated samples and log to TensorBoard
        vutils.save_image(G(save_noise).detach(), fixed_p, normalize=True)
        num_info = {'Discriminator loss': torch.mean(torch.FloatTensor(D_losses)),
                    'Generator loss': torch.mean(torch.FloatTensor(G_losses))}
        fake_to_show = G(save_noise).detach()

        # TensorBoard logging
        writer.add_scalars('Loss', num_info, epoch)
        writer.add_image('Fake Samples', fake_to_show[0].cpu())
        train_hist['per_epoch_ptimes'].append(per_epoch_ptime)

        # evaluate FID every 30 epochs
        if epoch % 30 == 0:
            fid_score = fid_model.compute_fid(real_image, G_result)
            print("FID score", fid_score)
            writer.add_scalar('FID Score', fid_score, epoch)

    # after the training loop: report timing and persist the training history
    end_time = time.time()
    total_ptime = end_time - start_time
    train_hist['total_ptime'].append(total_ptime)

    print("Avg one epoch ptime: %.2f, total %d epochs ptime: %.2f"
          % (torch.mean(torch.FloatTensor(train_hist['per_epoch_ptimes'])), train_epoch, total_ptime))

    with open(report_dir + 'train_hist.pkl', 'wb') as f:
        pickle.dump(train_hist, f)
    show_train_hist(train_hist, save=True, path=report_dir + 'train_hist.png')
    writer.close()
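# The `writer` used above is presumably a torch.utils.tensorboard SummaryWriter.
# A minimal sketch of the assumed setup (the log directory name is illustrative):
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter(log_dir=report_dir + 'tensorboard/')  # assumed location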
def train(epochs, batch_size, lr, loss_fn, data_dir):
    param_cuda = torch.cuda.is_available()

    # check training starting time
    start_time = time.time()

    G, D, G_opt, D_opt = initialize(mean=0.0, std=0.02, lr=lr)

    # train_hist dict will store the losses of every epoch
    train_hist = {}
    train_hist['D_model_mean_losses'] = []
    train_hist['G_model_mean_losses'] = []
    train_hist['per_epoch_ptimes'] = []
    train_hist['total_ptime'] = []

    # folder for saving the images
    if not os.path.isdir('GAN_results'):
        os.mkdir('GAN_results')

    for epoch in range(epochs):
        # Run one epoch
        logging.info("Epoch {}/{}".format(epoch + 1, epochs))
        epoch_start_time = time.time()

        # one epoch of training over the whole dataset
        train_hist = epoch_train(G=G, D=D, G_opt=G_opt, D_opt=D_opt,
                                 batch_size=batch_size, lr=lr, loss_fn=loss_fn,
                                 data_dir=data_dir, train_hist=train_hist)

        epoch_end_time = time.time()
        per_epoch_ptime = epoch_end_time - epoch_start_time

        # print progress information for every epoch:
        print("iteration number " + str(epoch))
        print('[%d/%d] - ptime: %.2f, loss_d: %.3f, loss_g: %.3f'
              % ((epoch + 1), epochs, per_epoch_ptime,
                 torch.mean(torch.FloatTensor(train_hist['D_model_mean_losses'])),
                 torch.mean(torch.FloatTensor(train_hist['G_model_mean_losses']))))

        # Save weights
        utils.save_checkpoint({'epoch': epoch + 1,
                               'D_model_state_dict': D.state_dict(),
                               'G_model_state_dict': G.state_dict(),
                               'D_optim_dict': D_opt.state_dict(),
                               'G_optim_dict': G_opt.state_dict()},
                              is_best=False,
                              checkpoint='GAN_results/')

        # Generate and save pictures for every epoch:
        p = 'GAN_results/result_epoch_' + str(epoch + 1) + '.png'
        utils.show_result(param_cuda, G, (epoch + 1), p, save=True)

        # add epoch time to the training history
        train_hist['per_epoch_ptimes'].append(per_epoch_ptime)

    end_time = time.time()
    total_ptime = end_time - start_time
    train_hist['total_ptime'].append(total_ptime)

    print("Avg per epoch ptime: %.2f, total %d epochs ptime: %.2f"
          % (torch.mean(torch.FloatTensor(train_hist['per_epoch_ptimes'])), epochs, total_ptime))
    print("Training finish!... save learned parameters")

    # plot training history
    utils.show_train_hist(train_hist, save=True, path='GAN_results/_train_hist.png')
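# Example (sketch) of invoking train(); the hyper-parameter values and the BCE
# loss below are illustrative assumptions, not taken from the project configuration.
import torch.nn as nn

if __name__ == '__main__':
    train(epochs=20, batch_size=128, lr=0.0002,
          loss_fn=nn.BCELoss(), data_dir='data/')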
    # tail of the per-epoch loop: close the per-epoch result files, record the
    # epoch statistics and checkpoint the TensorFlow session
    f7.close()
    f8.close()
    f9.close()

    train_hist['D_losses'].append(np.mean(D_losses))
    train_hist['G_losses'].append(np.mean(G_losses))
    train_hist['per_epoch_ptimes'].append(per_epoch_ptime)

    print('Saving Model Epoch: ', epoch + 1)
    saver.save(sess, './model', write_meta_graph=False)
    print('Model Saved\n')

# after the training loop: report timing, save the history and build result GIFs
end_time = time.time()
total_ptime = end_time - start_time
train_hist['total_ptime'].append(total_ptime)

print('Avg per epoch ptime: %.2f, total %d epochs ptime: %.2f'
      % (np.mean(train_hist['per_epoch_ptimes']), train_epoch, total_ptime))
print("Training finish!... save training results")

with open(root + model + 'train_hist.pkl', 'wb') as f:
    pickle.dump(train_hist, f)

for i in range(10):
    images = []
    for e in range(train_epoch):
        img_name = root + 'Results/' + model + str(e + 1) + str(i) + '.png'
        images.append(imageio.imread(img_name))
    imageio.mimsave(root + model + 'generation_animation' + str(i) + '.gif', images, fps=5)

utils.show_train_hist(train_hist, save=True, path=root + model + 'train_hist.png')
sess.close()
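# show_train_hist is a project helper whose implementation is not shown here.
# The function below is a hypothetical minimal sketch of what such a loss-curve
# plot could look like; the real helper may differ.
import matplotlib.pyplot as plt

def show_train_hist_sketch(hist, save=False, path='train_hist.png', show=False):
    # plot the recorded discriminator and generator losses
    x = range(len(hist['D_losses']))
    plt.plot(x, hist['D_losses'], label='D_loss')
    plt.plot(x, hist['G_losses'], label='G_loss')
    plt.xlabel('Iteration')
    plt.ylabel('Loss')
    plt.legend(loc='upper right')
    plt.grid(True)
    plt.tight_layout()
    if save:
        plt.savefig(path)
    if show:
        plt.show()
    else:
        plt.close()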
def train_process(config):
    # directories
    root_dir = config.root_dir
    input_dir = os.path.join(root_dir, "data/")
    version = config.version
    root_dir = make_folder(root_dir, version)
    model_dir = make_folder(root_dir, "model/")
    report_dir = make_folder(root_dir, "report/")
    output_dir = make_folder(root_dir, "output/epoch/")
    res_dir = make_folder(root_dir, "res/")

    # data parameters
    inp_width, inp_height, inp_channels, train_split = config.inp_width, config.inp_height, config.inp_channels, config.train_split

    # model parameters
    lrG = config.lrG
    lrD = config.lrD
    beta1 = config.beta1
    beta2 = config.beta2
    L1_lambda = config.L1_lambda
    ngf = config.ngf
    ndf = config.ndf
    batch_size = config.batch_size    # assumed to come from config
    train_epoch = config.train_epoch  # assumed to come from config

    # dataset and train/validation split
    dataset = localImageDataset(root_dir, inp_width, inp_height, inp_channels)
    print("Length of dataset: ", len(dataset))
    train_size = int(train_split * len(dataset))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])
    train_dataloader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size,
                                                   shuffle=True, num_workers=4)
    num_batches = len(train_dataloader)
    val_dataloader = torch.utils.data.DataLoader(dataset=val_dataset, batch_size=batch_size,
                                                 shuffle=True, num_workers=4)

    # models, losses and optimizers
    #from model import generator, discriminator
    #import utils
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    G = generator(ngf)
    D = discriminator(ndf)
    BCE_loss = nn.BCELoss().to(device)
    L1_loss = nn.L1Loss().to(device)
    G_optimizer = optim.Adam(G.parameters(), lr=lrG, betas=(beta1, beta2))
    D_optimizer = optim.Adam(D.parameters(), lr=lrD, betas=(beta1, beta2))

    start_time = time.time()
    epoch_start = 0
    epoch_end = epoch_start + train_epoch

    # resume from an existing checkpoint if both model files are present
    if os.path.isfile(model_dir + 'generator_param.pkl') and os.path.isfile(model_dir + 'discriminator_param.pkl'):
        G_checkpoint = torch.load(model_dir + 'generator_param.pkl', map_location=device)
        D_checkpoint = torch.load(model_dir + 'discriminator_param.pkl', map_location=device)
        G.load_state_dict(G_checkpoint['model_state_dict'])
        D.load_state_dict(D_checkpoint['model_state_dict'])
        G.to(device)
        D.to(device)
        G.train()
        D.train()
        G_optimizer.load_state_dict(G_checkpoint['optimizer_state_dict'])
        D_optimizer.load_state_dict(D_checkpoint['optimizer_state_dict'])
        train_hist = G_checkpoint['train_hist']
        epoch_start = G_checkpoint['epoch']
        epoch_end = epoch_start + train_epoch
    else:
        print("Previous model not found. Restarting train process...")
        G.weight_init(mean=0.0, std=0.02)
        D.weight_init(mean=0.0, std=0.02)
        G.to(device)
        D.to(device)
        G.train()
        D.train()
        G_optimizer = optim.Adam(G.parameters(), lr=lrG, betas=(beta1, beta2))
        D_optimizer = optim.Adam(D.parameters(), lr=lrD, betas=(beta1, beta2))
        train_hist = {}
        train_hist['D_losses'] = []
        train_hist['G_losses'] = []
        train_hist['per_epoch_ptimes'] = []
        train_hist['total_ptime'] = []
        epoch_start = 0
        epoch_end = epoch_start + train_epoch

    for epoch in range(epoch_start, epoch_end):
        D_losses = []
        G_losses = []
        epoch_start_time = time.time()
        num_iter = 0
        for text_image, inp_image in train_dataloader:
            inp_image, text_image = Variable(inp_image.to(device)), Variable(text_image.to(device))

            # train discriminator on a real pair and on a generated pair
            D.zero_grad()
            D_result = D(inp_image, text_image).squeeze()
            D_real_loss = BCE_loss(D_result, Variable(torch.ones(D_result.size()).to(device)))
            G_result = G(inp_image)
            D_result = D(inp_image, G_result).squeeze()
            D_fake_loss = BCE_loss(D_result, Variable(torch.zeros(D_result.size()).to(device)))
            D_train_loss = (D_real_loss + D_fake_loss) * 0.5
            D_train_loss.backward()
            D_optimizer.step()
            train_hist['D_losses'].append(float(D_train_loss))
            D_losses.append(float(D_train_loss))

            # train generator: adversarial loss plus L1 reconstruction loss
            G.zero_grad()
            G_result = G(inp_image)
            D_result = D(inp_image, G_result).squeeze()
            G_train_loss = BCE_loss(D_result, Variable(torch.ones(D_result.size()).to(device))) \
                           + L1_lambda * L1_loss(G_result, text_image)
            G_train_loss.backward()
            G_optimizer.step()
            train_hist['G_losses'].append(float(G_train_loss))
            G_losses.append(float(G_train_loss))
            num_iter += 1

        # checkpoint both networks after every epoch
        torch.save({'epoch': epoch,
                    'model_state_dict': G.state_dict(),
                    'optimizer_state_dict': G_optimizer.state_dict(),
                    'train_hist': train_hist}, model_dir + 'generator_param.pkl')
        torch.save({'model_state_dict': D.state_dict(),
                    'optimizer_state_dict': D_optimizer.state_dict()}, model_dir + 'discriminator_param.pkl')

        epoch_end_time = time.time()
        per_epoch_ptime = epoch_end_time - epoch_start_time
        print('[%d/%d] - ptime: %.2f, loss_d: %.3f, loss_g: %.3f'
              % ((epoch + 1), train_epoch, per_epoch_ptime,
                 torch.mean(torch.FloatTensor(D_losses)), torch.mean(torch.FloatTensor(G_losses))))

        fixed_p = output_dir + str(epoch + 1) + '.png'
        #show_result(G, Variable(inp_image.to(device), volatile=True), text_image.cpu(), (epoch+1), save=True, path=fixed_p)
        train_hist['per_epoch_ptimes'].append(per_epoch_ptime)

    end_time = time.time()
    total_ptime = end_time - start_time
    train_hist['total_ptime'].append(total_ptime)

    print("Avg one epoch ptime: %.2f, total %d epochs ptime: %.2f"
          % (torch.mean(torch.FloatTensor(train_hist['per_epoch_ptimes'])), train_epoch, total_ptime))

    with open(report_dir + 'train_hist.pkl', 'wb') as f:
        pickle.dump(train_hist, f)
    show_train_hist(train_hist, save=True, path=report_dir + 'train_hist.png')
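# Example (sketch): train_process() reads everything from a config object. The
# SimpleNamespace below is a hypothetical stand-in for the project's real
# argparse/config object, with illustrative values only.
from types import SimpleNamespace

config = SimpleNamespace(
    root_dir='./runs/', version='v1/',
    inp_width=256, inp_height=256, inp_channels=3, train_split=0.9,
    lrG=0.0002, lrD=0.0002, beta1=0.5, beta2=0.999,
    L1_lambda=100, ngf=64, ndf=64,
    batch_size=4, train_epoch=100)

train_process(config)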
def train_and_evaluate(param_cuda, dataset, G_model, D_model, G_optimizer, D_optimizer,
                       loss_fn, train_loader, train_epoch, model_dir, restore_file=None):
    '''Train the model and evaluate every epoch'''
    # reload weights from restore_file if specified
    if restore_file is not None:
        restore_path = os.path.join(model_dir, restore_file + '.pth.tar')
        logging.info("Restoring parameters from {}".format(restore_path))
        utils.load_checkpoint(restore_path, D_model, G_model, D_optimizer, G_optimizer)

    # check training starting time
    start_time = time.time()

    # here we are going to save the losses of every epoch
    train_hist = {}
    train_hist['D_model_mean_losses'] = []
    train_hist['G_model_mean_losses'] = []
    train_hist['per_epoch_ptimes'] = []
    train_hist['total_ptime'] = []

    # folder for saving the images
    if not os.path.isdir(dataset + '_results'):
        os.mkdir(dataset + '_results')

    for epoch in range(train_epoch):
        # Run one epoch
        logging.info("Epoch {}/{}".format(epoch + 1, train_epoch))
        epoch_start_time = time.time()

        # compute number of batches in one epoch (one full pass over the training set)
        train_hist = train(G_model, D_model, G_optimizer, D_optimizer, loss_fn,
                           train_loader, param_cuda, train_hist)

        epoch_end_time = time.time()
        per_epoch_ptime = epoch_end_time - epoch_start_time

        # prints in every epoch:
        print("iteration number " + str(epoch))
        print('[%d/%d] - ptime: %.2f, loss_d: %.3f, loss_g: %.3f'
              % ((epoch + 1), train_epoch, per_epoch_ptime,
                 torch.mean(torch.FloatTensor(train_hist['D_model_mean_losses'])),
                 torch.mean(torch.FloatTensor(train_hist['G_model_mean_losses']))))

        # Save weights
        utils.save_checkpoint({'epoch': epoch + 1,
                               'D_model_state_dict': D_model.state_dict(),
                               'G_model_state_dict': G_model.state_dict(),
                               'D_optim_dict': D_optimizer.state_dict(),
                               'G_optim_dict': G_optimizer.state_dict()},
                              is_best=False,
                              checkpoint=dataset + '_results/')

        # save test pictures after every epoch:
        p = dataset + '_results/result_epoch_' + str(epoch + 1) + '.png'
        utils.show_result(param_cuda, G_model, (epoch + 1), p, save=True)

        # add epoch time to the training history
        train_hist['per_epoch_ptimes'].append(per_epoch_ptime)

    end_time = time.time()
    total_ptime = end_time - start_time
    train_hist['total_ptime'].append(total_ptime)

    print("Avg per epoch ptime: %.2f, total %d epochs ptime: %.2f"
          % (torch.mean(torch.FloatTensor(train_hist['per_epoch_ptimes'])), train_epoch, total_ptime))
    print("Training finish!... save learned parameters")

    # plot training history
    utils.show_train_hist(train_hist, save=True, path=dataset + '_results/_train_hist.png')
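# utils.save_checkpoint / utils.load_checkpoint are project helpers not shown in
# this section. The functions below are a hypothetical minimal sketch consistent
# with how they are called above; the real implementations may differ.
import os
import shutil
import torch

def save_checkpoint(state, is_best, checkpoint):
    # store the latest state dict; keep a separate copy when it is the best so far
    filepath = os.path.join(checkpoint, 'last.pth.tar')
    if not os.path.exists(checkpoint):
        os.makedirs(checkpoint)
    torch.save(state, filepath)
    if is_best:
        shutil.copyfile(filepath, os.path.join(checkpoint, 'best.pth.tar'))

def load_checkpoint(restore_path, D_model, G_model, D_optimizer=None, G_optimizer=None):
    # restore both networks (and optionally their optimizers) from one file
    checkpoint = torch.load(restore_path)
    D_model.load_state_dict(checkpoint['D_model_state_dict'])
    G_model.load_state_dict(checkpoint['G_model_state_dict'])
    if D_optimizer is not None:
        D_optimizer.load_state_dict(checkpoint['D_optim_dict'])
    if G_optimizer is not None:
        G_optimizer.load_state_dict(checkpoint['G_optim_dict'])
    return checkpoint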
def train_and_evaluate(model, train_dataloader, val_dataloader, optimizer, loss_fn, epochs, restore_file=None):
    total_batch_loss = []
    val_losses = []
    start_epoch = 0
    best_val_loss = float('inf')

    # reload weights from restore_file if specified
    if restore_file is not None:
        restore_path = os.path.join(checkpoint_dir, restore_file + '.pth.tar')
        logging.info("Restoring parameters from {}".format(restore_path))
        checkpoint = utils.load_checkpoint(restore_path, model, optimizer)
        start_epoch = checkpoint['epoch']
        best_val_loss = checkpoint.get('best_loss', float('inf'))

    for epoch in range(start_epoch, start_epoch + epochs):
        # Run one epoch
        logging.info("Epoch {}/{}".format(epoch + 1, start_epoch + epochs))

        # one full pass over the training set
        batch_loss, best_temp = train(model, optimizer, loss_fn, train_dataloader,
                                      val_dataloader, epoch, best_val_loss)
        best_val_loss = best_temp
        total_batch_loss += batch_loss

        # Evaluate MSE for one epoch on train and validation set
        train_MSE = evaluate(model, nn.MSELoss(), train_dataloader, device, dtype)
        val_MSE = evaluate(model, nn.MSELoss(), val_dataloader, device, dtype)

        # Evaluate L1 for one epoch on train and validation set
        train_L1 = evaluate(model, nn.L1Loss(), train_dataloader, device, dtype)
        val_L1 = evaluate(model, nn.L1Loss(), val_dataloader, device, dtype)

        # save training history in csv file:
        utils.save_history(epoch, train_MSE, val_MSE, train_L1, val_L1, results_dir)

        # print losses
        logging.info("- Train average MSE loss: " + str(train_MSE))
        logging.info("- Validation average MSE loss: " + str(val_MSE))
        val_losses.append(val_MSE)

        is_best = val_MSE <= best_val_loss

        # Save weights
        utils.save_checkpoint({'epoch': epoch + 1,
                               'state_dict': model.state_dict(),
                               'optim_dict': optimizer.state_dict()},
                              is_best=is_best,
                              checkpoint=checkpoint_dir)

        # If best_eval, best_save_path
        if is_best:
            logging.info("- Found new best validation loss")
            best_val_loss = val_MSE

            # Save best val loss in a text file in the checkpoint directory
            best_val_path = "val_loss.txt"
            utils.save_dict_to_txt(val_MSE, results_dir, best_val_path, epoch)
            utils.save_checkpoint({'epoch': epoch + 1,
                                   'state_dict': model.state_dict(),
                                   'optim_dict': optimizer.state_dict(),
                                   'best_loss': val_MSE},
                                  is_best=is_best,
                                  checkpoint=checkpoint_dir)

        ## plots of losses
        if epoch != 0 or restore_file is not None:
            epoch_train_losses = np.load(os.path.join(results_dir, "epoch_avg_trainloss.npy"))
            np.save(os.path.join(results_dir, "epoch_avg_trainloss"), epoch_train_losses)
            np.save(os.path.join(results_dir, "epoch_val_loss"), val_losses)

    utils.show_train_hist(total_batch_loss, results_dir, show=False, epoch_plot=False, save=True)
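# utils.save_history is another project helper not shown here. A hypothetical
# minimal sketch that appends one CSV row of metrics per epoch; the real helper
# may use a different file name or column order.
import csv
import os

def save_history(epoch, train_MSE, val_MSE, train_L1, val_L1, results_dir):
    # append one row per epoch, writing a header the first time the file is created
    path = os.path.join(results_dir, 'history.csv')
    write_header = not os.path.exists(path)
    with open(path, 'a', newline='') as f:
        writer = csv.writer(f)
        if write_header:
            writer.writerow(['epoch', 'train_MSE', 'val_MSE', 'train_L1', 'val_L1'])
        writer.writerow([epoch, train_MSE, val_MSE, train_L1, val_L1])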