def get_training_annotation(training_filepath, output_filepath, verbose=False):
    guess_num_lines = 1e6
    read_interval = 10000000
    num_joints = 21
    print_verbose("Training input file path: " + training_filepath, verbose)
    print_verbose("Testing if program can write to output: " + output_filepath, verbose)
    # test write access by dumping an empty list to the output file
    with open(output_filepath, 'wb') as f:
        pickle.dump([], f)
    joints = []
    image_names = []
    with open(training_filepath, 'r') as f:
        line = f.readline()
        curr_line_ix = 1
        tot_toc = 0
        while line:
            start = time.time()
            image_names.append(training_file_line_to_image_name(line))
            joints.append(training_file_line_to_numpy_array(line, num_joints))
            # periodically checkpoint the partial annotation to disk
            if curr_line_ix % read_interval == 0:
                with open(output_filepath + '.pkl', 'wb') as pf:
                    pickle.dump([image_names, joints], pf)
            line = f.readline()
            curr_line_ix += 1
            tot_toc = display_est_time_loop(tot_toc + (time.time() - start),
                                            curr_line_ix, guess_num_lines,
                                            prefix='Line: ' + str(curr_line_ix) + ' ')
    # save the full annotation ([image_names, joints]) to the output file
    with open(output_filepath, 'wb') as pf:
        pickle.dump([image_names, joints], pf)

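
# Usage sketch (hypothetical file paths; assumes this module's pickle/time imports and the
# helpers training_file_line_to_image_name / training_file_line_to_numpy_array are available):
#
#   get_training_annotation('annotations/training_joint_annotations.txt',
#                           'annotations/training_annotation.p',
#                           verbose=True)
#
#   # the output pickle holds [image_names, joints] and can be reloaded with:
#   with open('annotations/training_annotation.p', 'rb') as f:
#       image_names, joints = pickle.load(f)
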
def get_quant_results(model, valid_loader, results_filename='test_quant_results.p'):
    losses = []
    pixel_losses = []
    pixel_losses_sample = []
    curr_iter = 1
    iter_size = int(BATCH_SIZE / MAX_MEM_BATCH_SIZE)
    total_loss = 0
    curr_train_ix = 0
    tot_iter = min(MAX_N_VALID_BATCHES,
                   int(len(valid_loader) / int(BATCH_SIZE / MAX_MEM_BATCH_SIZE)))
    tot_toc = 0
    total_pixel_loss = 0
    total_pixel_loss_sample = 0
    results_dict = {}
    for batch_idx, (data, target) in enumerate(valid_loader):
        if curr_iter > MAX_N_VALID_BATCHES:
            break
        # start time counter
        start = time.time()
        curr_train_ix += 1
        # get data and target as cuda variables
        data, target = Variable(data).cuda(), Variable(target).cuda()
        # get model output
        output = model(data.cuda())
        # accumulate loss for sub-mini-batch
        loss = my_losses.calculate_loss(output, target, iter_size,
                                        model.WEIGHT_LOSS_INTERMED1,
                                        model.WEIGHT_LOSS_INTERMED2,
                                        model.WEIGHT_LOSS_INTERMED3,
                                        model.WEIGHT_LOSS_MAIN)
        loss.backward()
        total_loss += loss
        # accumulate pixel dist loss for sub-mini-batch
        total_pixel_loss = my_losses.accumulate_pixel_dist_loss(
            total_pixel_loss, output[-1], target, BATCH_SIZE)
        total_pixel_loss_sample = my_losses.accumulate_pixel_dist_loss_from_sample(
            total_pixel_loss_sample, output[-1], target, BATCH_SIZE)
        # get boolean variable stating whether a mini-batch has been completed
        minibatch_completed = curr_train_ix % int(BATCH_SIZE / MAX_MEM_BATCH_SIZE) == 0
        if minibatch_completed:
            # append loss
            losses.append(total_loss.data[0])
            # erase loss
            total_loss = 0
            # append dist loss
            pixel_losses.append(total_pixel_loss)
            # erase pixel dist loss
            total_pixel_loss = 0
            # append dist loss from sample
            pixel_losses_sample.append(total_pixel_loss_sample)
            # erase pixel dist loss from sample
            total_pixel_loss_sample = 0
            if curr_iter % LOG_INTERVAL == 0:
                if DEBUGGING_VISUALLY:
                    print("\nPixel loss: " + str(pixel_losses[-1]))
                    for idx in range(target.data.cpu().numpy().shape[0]):
                        debugger.show_target_and_output_to_image_info(data, target, output, idx)
                # print current validation statistics
                print("\nValidation set mean error (loss): " + str(np.mean(losses)))
                print("Validation set stddev error (loss): " + str(np.std(losses)))
                print("Validation set mean error (pixel loss): " + str(np.mean(pixel_losses)))
                print("Validation set stddev error (pixel loss): " + str(np.std(pixel_losses)))
                print("Validation set mean error (pixel loss from sample of output): " +
                      str(np.mean(pixel_losses_sample)))
                print("Validation set stddev error (pixel loss from sample of output): " +
                      str(np.std(pixel_losses_sample)))
                print("Saving validation results in file: " + results_filename)
                results_dict = {
                    'losses': losses,
                    'pixel_losses': pixel_losses,
                    'pixel_losses_sample': pixel_losses_sample,
                }
                if not results_filename == '':
                    with open(results_filename, 'wb') as handle:
                        pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
            tot_toc = display_est_time_loop(tot_toc + time.time() - start, curr_iter, tot_iter,
                                            prefix='Validation: ' + 'Iter #' + str(curr_iter) + "/" + str(tot_iter) +
                                                   ' - show info every ' + str(LOG_INTERVAL) + ' iter): ')
            curr_iter += 1
    if not results_filename == '':
        with open(results_filename, 'wb') as handle:
            pickle.dump(results_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
    return losses, pixel_losses, pixel_losses_sample

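
# Usage sketch (assumes `model` and `valid_loader` are built elsewhere, and that the module-level
# constants BATCH_SIZE, MAX_MEM_BATCH_SIZE, MAX_N_VALID_BATCHES, LOG_INTERVAL and the
# DEBUGGING_VISUALLY flag are defined; the results filename is just the default shown above):
#
#   losses, pixel_losses, pixel_losses_sample = get_quant_results(
#       model, valid_loader, results_filename='test_quant_results.p')
#   print('Mean loss: ' + str(np.mean(losses)))
#   print('Mean pixel loss: ' + str(np.mean(pixel_losses)))
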
def train(train_loader, model, optimizer, train_vars, control_vars, verbose=True):
    curr_epoch_iter = 1
    for batch_idx, (data, target) in enumerate(train_loader):
        control_vars['batch_idx'] = batch_idx
        if batch_idx < control_vars['iter_size']:
            print_verbose("\rPerforming first iteration; current mini-batch: " + str(batch_idx+1) + "/" +
                          str(control_vars['iter_size']), verbose, n_tabs=0, erase_line=True)
        # check if arrived at iter to start
        if control_vars['curr_epoch_iter'] < control_vars['start_iter_mod']:
            if batch_idx % control_vars['iter_size'] == 0:
                print_verbose("\rGoing through iterations to arrive at last one saved... " +
                              str(int(control_vars['curr_epoch_iter']*100.0/control_vars['start_iter_mod'])) + "% of " +
                              str(control_vars['start_iter_mod']) + " iterations (" +
                              str(control_vars['curr_epoch_iter']) + "/" + str(control_vars['start_iter_mod']) + ")",
                              verbose, n_tabs=0, erase_line=True)
                control_vars['curr_epoch_iter'] += 1
                control_vars['curr_iter'] += 1
                curr_epoch_iter += 1
            continue
        # save checkpoint after final iteration
        if control_vars['curr_iter'] == control_vars['num_iter']:
            print_verbose("\nReached final number of iterations: " + str(control_vars['num_iter']), verbose)
            print_verbose("\tSaving final model checkpoint...", verbose)
            final_model_dict = {
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'control_vars': control_vars,
                'train_vars': train_vars,
            }
            trainer.save_checkpoint(final_model_dict,
                                    filename=train_vars['checkpoint_filenamebase'] +
                                             'final' + str(control_vars['num_iter']) + '.pth.tar')
            control_vars['done_training'] = True
            break
        # start time counter
        start = time.time()
        # get data and target as cuda variables
        target_heatmaps, target_joints, _, target_prior = target
        data, target_heatmaps, target_prior = Variable(data), Variable(target_heatmaps), Variable(target_prior)
        if train_vars['use_cuda']:
            data = data.cuda()
            target_heatmaps = target_heatmaps.cuda()
            target_prior = target_prior.cuda()
        # visualize if debugging
        # get model output
        output = model(data)
        # accumulate loss for sub-mini-batch
        if train_vars['cross_entropy']:
            loss_func = my_losses.cross_entropy_loss_p_logq
        else:
            loss_func = my_losses.euclidean_loss
        loss, loss_prior = my_losses.calculate_loss_HALNet_prior(
            loss_func, output, target_heatmaps, target_prior, model.joint_ixs,
            model.WEIGHT_LOSS_INTERMED1, model.WEIGHT_LOSS_INTERMED2,
            model.WEIGHT_LOSS_INTERMED3, model.WEIGHT_LOSS_MAIN,
            control_vars['iter_size'])
        loss.backward()
        train_vars['total_loss'] += loss
        train_vars['total_loss_prior'] += loss_prior
        # accumulate pixel dist loss for sub-mini-batch
        train_vars['total_pixel_loss'] = my_losses.accumulate_pixel_dist_loss_multiple(
            train_vars['total_pixel_loss'], output[3], target_heatmaps, control_vars['batch_size'])
        if train_vars['cross_entropy']:
            train_vars['total_pixel_loss_sample'] = my_losses.accumulate_pixel_dist_loss_from_sample_multiple(
                train_vars['total_pixel_loss_sample'], output[3], target_heatmaps, control_vars['batch_size'])
        else:
            train_vars['total_pixel_loss_sample'] = [-1] * len(model.joint_ixs)
        # get boolean variable stating whether a mini-batch has been completed
        minibatch_completed = (batch_idx+1) % control_vars['iter_size'] == 0
        if minibatch_completed:
            # optimise for mini-batch
            optimizer.step()
            # clear optimiser
            optimizer.zero_grad()
            # append total loss
            train_vars['losses'].append(train_vars['total_loss'].data[0])
            # erase total loss
            total_loss = train_vars['total_loss'].data[0]
            train_vars['total_loss'] = 0
            # append total loss prior
            train_vars['losses_prior'].append(train_vars['total_loss_prior'].data[0])
            # erase total loss prior
            total_loss_prior = train_vars['total_loss_prior'].data[0]
            train_vars['total_loss_prior'] = 0
            # append dist loss
            train_vars['pixel_losses'].append(train_vars['total_pixel_loss'])
            # erase pixel dist loss
            train_vars['total_pixel_loss'] = [0] * len(model.joint_ixs)
            # append dist loss of sample from output
            train_vars['pixel_losses_sample'].append(train_vars['total_pixel_loss_sample'])
            # erase dist loss of sample from output
            train_vars['total_pixel_loss_sample'] = [0] * len(model.joint_ixs)
            # check if loss is better
            if train_vars['losses'][-1] < train_vars['best_loss']:
                train_vars['best_loss'] = train_vars['losses'][-1]
                print_verbose(" This is a best loss found so far: " + str(train_vars['losses'][-1]), verbose)
                train_vars['best_model_dict'] = {
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'control_vars': control_vars,
                    'train_vars': train_vars,
                }
            if train_vars['losses_prior'][-1] < train_vars['best_loss_prior']:
                train_vars['best_loss_prior'] = train_vars['losses_prior'][-1]
            # log checkpoint
            if control_vars['curr_iter'] % control_vars['log_interval'] == 0:
                trainer.print_log_info(model, optimizer, epoch, total_loss, train_vars, control_vars)
                msg = ''
                msg += print_verbose(
                    "-------------------------------------------------------------------------------------------",
                    verbose) + "\n"
                msg += print_verbose("Current loss (prior): " + str(total_loss_prior), verbose) + "\n"
                msg += print_verbose("Best loss (prior): " + str(train_vars['best_loss_prior']), verbose) + "\n"
                msg += print_verbose("Mean total loss (prior): " + str(np.mean(train_vars['losses_prior'])), verbose) + "\n"
                msg += print_verbose("Mean loss (prior) for last " + str(control_vars['log_interval']) +
                                     " iterations (average total loss): " +
                                     str(np.mean(train_vars['losses_prior'][-control_vars['log_interval']:])),
                                     verbose) + "\n"
                msg += print_verbose(
                    "-------------------------------------------------------------------------------------------",
                    verbose) + "\n"
                if not control_vars['output_filepath'] == '':
                    with open(control_vars['output_filepath'], 'a') as f:
                        f.write(msg + '\n')
            if control_vars['curr_iter'] % control_vars['log_interval_valid'] == 0:
                print_verbose("\nSaving model and checkpoint model for validation", verbose)
                checkpoint_model_dict = {
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'control_vars': control_vars,
                    'train_vars': train_vars,
                }
                trainer.save_checkpoint(checkpoint_model_dict,
                                        filename=train_vars['checkpoint_filenamebase'] + 'for_valid_' +
                                                 str(control_vars['curr_iter']) + '.pth.tar')
            # print time lapse
            prefix = 'Training (Epoch #' + str(epoch) + ' ' + str(control_vars['curr_epoch_iter']) + '/' +\
                     str(control_vars['tot_iter']) + ')' + ', (Batch ' + str(control_vars['batch_idx']+1) +\
                     '(' + str(control_vars['iter_size']) + ')' + '/' +\
                     str(control_vars['num_batches']) + ')' + ', (Iter #' + str(control_vars['curr_iter']) +\
                     '(' + str(control_vars['batch_size']) + ')' +\
                     ' - log every ' + str(control_vars['log_interval']) + ' iter): '
            control_vars['tot_toc'] = display_est_time_loop(control_vars['tot_toc'] + time.time() - start,
                                                            control_vars['curr_iter'], control_vars['num_iter'],
                                                            prefix=prefix)
            control_vars['curr_iter'] += 1
            control_vars['start_iter'] = control_vars['curr_iter'] + 1
            control_vars['curr_epoch_iter'] += 1
    return train_vars, control_vars

def validate(valid_loader, model, optimizer, valid_vars, control_vars, verbose=True):
    curr_epoch_iter = 1
    for batch_idx, (data, target) in enumerate(valid_loader):
        control_vars['batch_idx'] = batch_idx
        if batch_idx < control_vars['iter_size']:
            print_verbose("\rPerforming first iteration; current mini-batch: " + str(batch_idx + 1) + "/" +
                          str(control_vars['iter_size']), verbose, n_tabs=0, erase_line=True)
        # start time counter
        start = time.time()
        # get data and target as cuda variables
        target_heatmaps, target_joints, target_joints_z = target
        data, target_heatmaps = Variable(data), Variable(target_heatmaps)
        if valid_vars['use_cuda']:
            data = data.cuda()
            target_heatmaps = target_heatmaps.cuda()
        # visualize if debugging
        # get model output
        output = model(data)
        # accumulate loss for sub-mini-batch
        if valid_vars['cross_entropy']:
            loss_func = my_losses.cross_entropy_loss_p_logq
        else:
            loss_func = my_losses.euclidean_loss
        loss = my_losses.calculate_loss_HALNet(loss_func,
                                               output, target_heatmaps, model.joint_ixs,
                                               model.WEIGHT_LOSS_INTERMED1,
                                               model.WEIGHT_LOSS_INTERMED2,
                                               model.WEIGHT_LOSS_INTERMED3,
                                               model.WEIGHT_LOSS_MAIN,
                                               control_vars['iter_size'])
        if DEBUG_VISUALLY:
            for i in range(control_vars['max_mem_batch']):
                filenamebase_idx = (batch_idx * control_vars['max_mem_batch']) + i
                filenamebase = valid_loader.dataset.get_filenamebase(filenamebase_idx)
                fig = visualize.create_fig()
                #visualize.plot_joints_from_heatmaps(output[3][i].data.numpy(), fig=fig,
                #                                    title=filenamebase, data=data[i].data.numpy())
                #visualize.plot_image_and_heatmap(output[3][i][8].data.numpy(),
                #                                 data=data[i].data.numpy(),
                #                                 title=filenamebase)
                #visualize.savefig('/home/paulo/' + filenamebase.replace('/', '_') + '_heatmap')
                labels_colorspace = conv.heatmaps_to_joints_colorspace(output[3][i].data.numpy())
                data_crop, crop_coords, labels_heatmaps, labels_colorspace = \
                    converter.crop_image_get_labels(data[i].data.numpy(), labels_colorspace, range(21))
                visualize.plot_image(data_crop, title=filenamebase, fig=fig)
                visualize.plot_joints_from_colorspace(labels_colorspace, title=filenamebase, fig=fig, data=data_crop)
                #visualize.savefig('/home/paulo/' + filenamebase.replace('/', '_') + '_crop')
                visualize.show()
        #loss.backward()
        valid_vars['total_loss'] += loss
        # accumulate pixel dist loss for sub-mini-batch
        valid_vars['total_pixel_loss'] = my_losses.accumulate_pixel_dist_loss_multiple(
            valid_vars['total_pixel_loss'], output[3], target_heatmaps, control_vars['batch_size'])
        if valid_vars['cross_entropy']:
            valid_vars['total_pixel_loss_sample'] = my_losses.accumulate_pixel_dist_loss_from_sample_multiple(
                valid_vars['total_pixel_loss_sample'], output[3], target_heatmaps, control_vars['batch_size'])
        else:
            valid_vars['total_pixel_loss_sample'] = [-1] * len(model.joint_ixs)
        # get boolean variable stating whether a mini-batch has been completed
        minibatch_completed = (batch_idx+1) % control_vars['iter_size'] == 0
        if minibatch_completed:
            # append total loss
            valid_vars['losses'].append(valid_vars['total_loss'].item())
            # erase total loss
            total_loss = valid_vars['total_loss'].item()
            valid_vars['total_loss'] = 0
            # append dist loss
            valid_vars['pixel_losses'].append(valid_vars['total_pixel_loss'])
            # erase pixel dist loss
            valid_vars['total_pixel_loss'] = [0] * len(model.joint_ixs)
            # append dist loss of sample from output
            valid_vars['pixel_losses_sample'].append(valid_vars['total_pixel_loss_sample'])
            # erase dist loss of sample from output
            valid_vars['total_pixel_loss_sample'] = [0] * len(model.joint_ixs)
            # check if loss is better
            if valid_vars['losses'][-1] < valid_vars['best_loss']:
                valid_vars['best_loss'] = valid_vars['losses'][-1]
                #print_verbose(" This is a best loss found so far: " + str(valid_vars['losses'][-1]), verbose)
            # log checkpoint
            if control_vars['curr_iter'] % control_vars['log_interval'] == 0:
                trainer.print_log_info(model, optimizer, 1, total_loss, valid_vars, control_vars)
                model_dict = {
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'control_vars': control_vars,
                    'train_vars': valid_vars,
                }
                trainer.save_checkpoint(model_dict,
                                        filename=valid_vars['checkpoint_filenamebase'] +
                                                 str(control_vars['num_iter']) + '.pth.tar')
            # print time lapse
            prefix = 'Validating (Epoch #' + str(1) + ' ' + str(control_vars['curr_epoch_iter']) + '/' +\
                     str(control_vars['tot_iter']) + ')' + ', (Batch ' + str(control_vars['batch_idx']+1) +\
                     '(' + str(control_vars['iter_size']) + ')' + '/' +\
                     str(control_vars['num_batches']) + ')' + ', (Iter #' + str(control_vars['curr_iter']) +\
                     '(' + str(control_vars['batch_size']) + ')' +\
                     ' - log every ' + str(control_vars['log_interval']) + ' iter): '
            control_vars['tot_toc'] = display_est_time_loop(control_vars['tot_toc'] + time.time() - start,
                                                            control_vars['curr_iter'], control_vars['num_iter'],
                                                            prefix=prefix)
            control_vars['curr_iter'] += 1
            control_vars['start_iter'] = control_vars['curr_iter'] + 1
            control_vars['curr_epoch_iter'] += 1
    return valid_vars, control_vars

def train(train_loader, model, optimizer, train_vars):
    verbose = train_vars['verbose']
    for batch_idx, (data, target) in enumerate(train_loader):
        train_vars['batch_idx'] = batch_idx
        # print info about performing first iter
        if batch_idx < train_vars['iter_size']:
            print_verbose("\rPerforming first iteration; current mini-batch: " + str(batch_idx + 1) + "/" +
                          str(train_vars['iter_size']), verbose, n_tabs=0, erase_line=True)
        # check if arrived at iter to start
        arrived_curr_iter, train_vars = run_until_curr_iter(batch_idx, train_vars)
        if not arrived_curr_iter:
            continue
        # save checkpoint after final iteration
        if train_vars['curr_iter'] - 1 == train_vars['num_iter']:
            train_vars = trainer.save_final_checkpoint(train_vars, model, optimizer)
            break
        # start time counter
        start = time.time()
        # get data and target as torch Variables
        _, target_joints, target_heatmaps, target_joints_z = target
        # make target joints be relative
        target_joints = target_joints[:, 3:]
        data, target_heatmaps = Variable(data), Variable(target_heatmaps)
        if train_vars['use_cuda']:
            data = data.cuda()
            target_heatmaps = target_heatmaps.cuda()
            target_joints = target_joints.cuda()
            target_joints_z = target_joints_z.cuda()
        # get model output
        output = model(data)
        # accumulate loss for sub-mini-batch
        if train_vars['cross_entropy']:
            loss_func = my_losses.cross_entropy_loss_p_logq
        else:
            loss_func = my_losses.euclidean_loss
        weights_heatmaps_loss, weights_joints_loss = get_loss_weights(train_vars['curr_iter'])
        loss, loss_heatmaps, loss_joints = my_losses.calculate_loss_JORNet(
            loss_func, output, target_heatmaps, target_joints, train_vars['joint_ixs'],
            weights_heatmaps_loss, weights_joints_loss, train_vars['iter_size'])
        loss.backward()
        train_vars['total_loss'] += loss.item()
        train_vars['total_joints_loss'] += loss_joints.item()
        train_vars['total_heatmaps_loss'] += loss_heatmaps.item()
        # accumulate pixel dist loss for sub-mini-batch
        train_vars['total_pixel_loss'] = my_losses.accumulate_pixel_dist_loss_multiple(
            train_vars['total_pixel_loss'], output[3], target_heatmaps, train_vars['batch_size'])
        if train_vars['cross_entropy']:
            train_vars['total_pixel_loss_sample'] = my_losses.accumulate_pixel_dist_loss_from_sample_multiple(
                train_vars['total_pixel_loss_sample'], output[3], target_heatmaps, train_vars['batch_size'])
        else:
            train_vars['total_pixel_loss_sample'] = [-1] * len(model.joint_ixs)
        ''' For debugging training
        for i in range(train_vars['max_mem_batch']):
            filenamebase_idx = (batch_idx * train_vars['max_mem_batch']) + i
            filenamebase = train_loader.dataset.get_filenamebase(filenamebase_idx)
            visualize.plot_joints_from_heatmaps(target_heatmaps[i].data.cpu().numpy(),
                                                title='GT joints: ' + filenamebase,
                                                data=data[i].data.cpu().numpy())
            visualize.plot_joints_from_heatmaps(output[3][i].data.cpu().numpy(),
                                                title='Pred joints: ' + filenamebase,
                                                data=data[i].data.cpu().numpy())
            visualize.plot_image_and_heatmap(output[3][i][4].data.numpy(),
                                             data=data[i].data.numpy(),
                                             title='Thumb tip heatmap: ' + filenamebase)
            visualize.show()
        '''
        # get boolean variable stating whether a mini-batch has been completed
        minibatch_completed = (batch_idx + 1) % train_vars['iter_size'] == 0
        if minibatch_completed:
            # visualize
            # ax, fig = visualize.plot_3D_joints(target_joints[0])
            # visualize.plot_3D_joints(target_joints[1], ax=ax, fig=fig)
            if train_vars['curr_iter'] % train_vars['log_interval'] == 0:
                fig, ax = visualize.plot_3D_joints(target_joints[0])
                visualize.savefig('joints_GT_' + str(train_vars['curr_iter']) + '.png')
                #visualize.plot_3D_joints(target_joints[1], fig=fig, ax=ax, color_root='C7')
                #visualize.plot_3D_joints(output[7].data.cpu().numpy()[0], fig=fig, ax=ax, color_root='C7')
                visualize.plot_3D_joints(output[7].data.cpu().numpy()[0])
                visualize.savefig('joints_model_' + str(train_vars['curr_iter']) + '.png')
                #visualize.show()
                #visualize.savefig('joints_' + str(train_vars['curr_iter']) + '.png')
            # change learning rate to 0.01 after 45000 iterations
            optimizer = change_learning_rate(optimizer, 0.01, train_vars['curr_iter'])
            # optimise for mini-batch
            optimizer.step()
            # clear optimiser
            optimizer.zero_grad()
            # append total loss
            train_vars['losses'].append(train_vars['total_loss'])
            # erase total loss
            total_loss = train_vars['total_loss']
            train_vars['total_loss'] = 0
            # append total joints loss
            train_vars['losses_joints'].append(train_vars['total_joints_loss'])
            # erase total joints loss
            train_vars['total_joints_loss'] = 0
            # append total heatmaps loss
            train_vars['losses_heatmaps'].append(train_vars['total_heatmaps_loss'])
            # erase total heatmaps loss
            train_vars['total_heatmaps_loss'] = 0
            # append dist loss
            train_vars['pixel_losses'].append(train_vars['total_pixel_loss'])
            # erase pixel dist loss
            train_vars['total_pixel_loss'] = [0] * len(model.joint_ixs)
            # append dist loss of sample from output
            train_vars['pixel_losses_sample'].append(train_vars['total_pixel_loss_sample'])
            # erase dist loss of sample from output
            train_vars['total_pixel_loss_sample'] = [0] * len(model.joint_ixs)
            # check if loss is better
            if train_vars['losses'][-1] < train_vars['best_loss']:
                train_vars['best_loss'] = train_vars['losses'][-1]
                print_verbose(" This is a best loss found so far: " + str(train_vars['losses'][-1]), verbose)
                train_vars['best_model_dict'] = {
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'train_vars': train_vars
                }
            # log checkpoint
            if train_vars['curr_iter'] % train_vars['log_interval'] == 0:
                trainer.print_log_info(model, optimizer, epoch, total_loss, train_vars, train_vars)
                aa1 = target_joints[0].data.cpu().numpy()
                aa2 = output[7][0].data.cpu().numpy()
                output_joint_loss = np.sum(np.abs(aa1 - aa2)) / 63
                msg = ''
                msg += print_verbose(
                    "-------------------------------------------------------------------------------------------",
                    verbose) + "\n"
                msg += print_verbose('\tJoint Coord Avg Loss for first image of current mini-batch: ' +
                                     str(output_joint_loss) + '\n', train_vars['verbose'])
                msg += print_verbose(
                    "-------------------------------------------------------------------------------------------",
                    verbose) + "\n"
                if not train_vars['output_filepath'] == '':
                    with open(train_vars['output_filepath'], 'a') as f:
                        f.write(msg + '\n')
            if train_vars['curr_iter'] % train_vars['log_interval_valid'] == 0:
                print_verbose("\nSaving model and checkpoint model for validation", verbose)
                checkpoint_model_dict = {
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'train_vars': train_vars,
                }
                trainer.save_checkpoint(checkpoint_model_dict,
                                        filename=train_vars['checkpoint_filenamebase'] + 'for_valid_' +
                                                 str(train_vars['curr_iter']) + '.pth.tar')
            # print time lapse
            prefix = 'Training (Epoch #' + str(epoch) + ' ' + str(train_vars['curr_epoch_iter']) + '/' +\
                     str(train_vars['tot_iter']) + ')' + ', (Batch ' + str(train_vars['batch_idx']+1) +\
                     '(' + str(train_vars['iter_size']) + ')' + '/' +\
                     str(train_vars['num_batches']) + ')' + ', (Iter #' + str(train_vars['curr_iter']) +\
                     '(' + str(train_vars['batch_size']) + ')' +\
                     ' - log every ' + str(train_vars['log_interval']) + ' iter): '
            train_vars['tot_toc'] = display_est_time_loop(train_vars['tot_toc'] + time.time() - start,
                                                          train_vars['curr_iter'], train_vars['num_iter'],
                                                          prefix=prefix)
            train_vars['curr_iter'] += 1
            train_vars['start_iter'] = train_vars['curr_iter'] + 1
            train_vars['curr_epoch_iter'] += 1
    return train_vars

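
# Epoch-driver sketch (illustrative assumptions only: the train_vars keys are taken from the
# loop above, but the initial values, num_epochs and the module-level `epoch` variable that
# train() reads are placeholders, not the project's actual entry point):
#
#   train_vars = {
#       'verbose': True, 'use_cuda': True, 'cross_entropy': True,
#       'joint_ixs': model.joint_ixs,
#       'iter_size': int(BATCH_SIZE / MAX_MEM_BATCH_SIZE), 'batch_size': BATCH_SIZE,
#       'max_mem_batch': MAX_MEM_BATCH_SIZE, 'num_batches': len(train_loader),
#       'curr_iter': 1, 'start_iter': 1, 'num_iter': 100000,
#       'curr_epoch_iter': 1, 'tot_iter': len(train_loader), 'batch_idx': 0, 'tot_toc': 0,
#       'total_loss': 0, 'total_joints_loss': 0, 'total_heatmaps_loss': 0,
#       'total_pixel_loss': [0] * len(model.joint_ixs),
#       'total_pixel_loss_sample': [0] * len(model.joint_ixs),
#       'losses': [], 'losses_joints': [], 'losses_heatmaps': [],
#       'pixel_losses': [], 'pixel_losses_sample': [],
#       'best_loss': 1e10,
#       'log_interval': 10, 'log_interval_valid': 1000,
#       'checkpoint_filenamebase': 'checkpoint_train_', 'output_filepath': '',
#   }
#   for epoch in range(1, num_epochs + 1):
#       train_vars = train(train_loader, model, optimizer, train_vars)
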
def validate(valid_loader, model, optimizer, valid_vars, control_vars, verbose=True):
    curr_epoch_iter = 1
    for batch_idx, (data, target) in enumerate(valid_loader):
        control_vars['batch_idx'] = batch_idx
        if batch_idx < control_vars['iter_size']:
            print_verbose("\rPerforming first iteration; current mini-batch: " + str(batch_idx + 1) + "/" +
                          str(control_vars['iter_size']), verbose, n_tabs=0, erase_line=True)
        # start time counter
        start = time.time()
        # get data and target as cuda variables
        target_heatmaps, target_joints, target_handroot = target
        # make target joints be relative
        target_joints = target_joints[:, 3:]
        data, target_heatmaps = Variable(data), Variable(target_heatmaps)
        if valid_vars['use_cuda']:
            data = data.cuda()
            target_joints = target_joints.cuda()
            target_heatmaps = target_heatmaps.cuda()
            target_handroot = target_handroot.cuda()
        # visualize if debugging
        # get model output
        output = model(data)
        # accumulate loss for sub-mini-batch
        if model.cross_entropy:
            loss_func = my_losses.cross_entropy_loss_p_logq
        else:
            loss_func = my_losses.euclidean_loss
        weights_heatmaps_loss, weights_joints_loss = get_loss_weights(control_vars['curr_iter'])
        loss, loss_heatmaps, loss_joints = my_losses.calculate_loss_JORNet(
            loss_func, output, target_heatmaps, target_joints, valid_vars['joint_ixs'],
            weights_heatmaps_loss, weights_joints_loss, control_vars['iter_size'])
        #loss.backward()
        valid_vars['total_loss'] += loss
        valid_vars['total_joints_loss'] += loss_joints
        valid_vars['total_heatmaps_loss'] += loss_heatmaps
        # accumulate pixel dist loss for sub-mini-batch
        valid_vars['total_pixel_loss'] = my_losses.accumulate_pixel_dist_loss_multiple(
            valid_vars['total_pixel_loss'], output[3], target_heatmaps, control_vars['batch_size'])
        valid_vars['total_pixel_loss_sample'] = my_losses.accumulate_pixel_dist_loss_from_sample_multiple(
            valid_vars['total_pixel_loss_sample'], output[3], target_heatmaps, control_vars['batch_size'])
        # debug visualization of predictions for each image in the sub-mini-batch
        for i in range(control_vars['max_mem_batch']):
            filenamebase_idx = (batch_idx * control_vars['max_mem_batch']) + i
            filenamebase = valid_loader.dataset.get_filenamebase(filenamebase_idx)
            print('')
            print(filenamebase)
            visualize.plot_image(data[i].data.numpy())
            visualize.show()
            output_batch_numpy = output[7][i].data.cpu().numpy()
            print('\n-------------------------------')
            reshaped_out = output_batch_numpy.reshape((20, 3))
            for j in range(20):
                print('[{}, {}, {}],'.format(reshaped_out[j, 0], reshaped_out[j, 1], reshaped_out[j, 2]))
            print('-------------------------------')
            fig, ax = visualize.plot_3D_joints(target_joints[i])
            visualize.plot_3D_joints(output_batch_numpy, fig=fig, ax=ax, color='C6')
            visualize.title(filenamebase)
            visualize.show()
            # pad the 20 relative joints with a zero root to get a (21, 3) array
            temp = np.zeros((21, 3))
            output_batch_numpy_abs = output_batch_numpy.reshape((20, 3))
            temp[1:, :] = output_batch_numpy_abs
            output_batch_numpy_abs = temp
            output_joints_colorspace = camera.joints_depth2color(
                output_batch_numpy_abs,
                depth_intr_matrix=synthhands_handler.DEPTH_INTR_MTX,
                handroot=target_handroot[i].data.cpu().numpy())
            visualize.plot_3D_joints(output_joints_colorspace)
            visualize.show()
            aa1 = target_joints[i].data.cpu().numpy().reshape((20, 3))
            aa2 = output[7][i].data.cpu().numpy().reshape((20, 3))
            print('\n----------------------------------')
            print(np.sum(np.abs(aa1 - aa2)) / 60)
            print('----------------------------------')
        # get boolean variable stating whether a mini-batch has been completed
        minibatch_completed = (batch_idx + 1) % control_vars['iter_size'] == 0
        if minibatch_completed:
            # append total loss
            valid_vars['losses'].append(valid_vars['total_loss'].data[0])
            # erase total loss
            total_loss = valid_vars['total_loss'].data[0]
            valid_vars['total_loss'] = 0
            # append total joints loss
            valid_vars['losses_joints'].append(valid_vars['total_joints_loss'].data[0])
            # erase total joints loss
            valid_vars['total_joints_loss'] = 0
            # append total heatmaps loss
            valid_vars['losses_heatmaps'].append(valid_vars['total_heatmaps_loss'].data[0])
            # erase total heatmaps loss
            valid_vars['total_heatmaps_loss'] = 0
            # append dist loss
            valid_vars['pixel_losses'].append(valid_vars['total_pixel_loss'])
            # erase pixel dist loss
            valid_vars['total_pixel_loss'] = [0] * len(model.joint_ixs)
            # append dist loss of sample from output
            valid_vars['pixel_losses_sample'].append(valid_vars['total_pixel_loss_sample'])
            # erase dist loss of sample from output
            valid_vars['total_pixel_loss_sample'] = [0] * len(model.joint_ixs)
            # check if loss is better
            #if valid_vars['losses'][-1] < valid_vars['best_loss']:
            #    valid_vars['best_loss'] = valid_vars['losses'][-1]
            #    print_verbose(" This is a best loss found so far: " + str(valid_vars['losses'][-1]), verbose)
            # log checkpoint
            if control_vars['curr_iter'] % control_vars['log_interval'] == 0:
                trainer.print_log_info(model, optimizer, 1, total_loss, valid_vars, control_vars)
                model_dict = {
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'control_vars': control_vars,
                    'train_vars': valid_vars,
                }
                trainer.save_checkpoint(model_dict,
                                        filename=valid_vars['checkpoint_filenamebase'] +
                                                 str(control_vars['num_iter']) + '.pth.tar')
            # print time lapse
            prefix = 'Validating (Epoch #' + str(1) + ' ' + str(control_vars['curr_epoch_iter']) + '/' +\
                     str(control_vars['tot_iter']) + ')' + ', (Batch ' + str(control_vars['batch_idx']+1) +\
                     '(' + str(control_vars['iter_size']) + ')' + '/' +\
                     str(control_vars['num_batches']) + ')' + ', (Iter #' + str(control_vars['curr_iter']) +\
                     '(' + str(control_vars['batch_size']) + ')' +\
                     ' - log every ' + str(control_vars['log_interval']) + ' iter): '
            control_vars['tot_toc'] = display_est_time_loop(control_vars['tot_toc'] + time.time() - start,
                                                            control_vars['curr_iter'], control_vars['num_iter'],
                                                            prefix=prefix)
            control_vars['curr_iter'] += 1
            control_vars['start_iter'] = control_vars['curr_iter'] + 1
            control_vars['curr_epoch_iter'] += 1
    return valid_vars, control_vars

def validate(valid_loader, model, optimizer, valid_vars, control_vars, verbose=True):
    losses_main = []
    for batch_idx, (data, target) in enumerate(valid_loader):
        control_vars['batch_idx'] = batch_idx
        if batch_idx < control_vars['iter_size']:
            print_verbose("\rPerforming first iteration; current mini-batch: " + str(batch_idx + 1) + "/" +
                          str(control_vars['iter_size']), verbose, n_tabs=0, erase_line=True)
        # start time counter
        start = time.time()
        # get data and target as cuda variables
        target_heatmaps, target_joints, target_handroot = target
        # make target joints be relative
        target_joints = target_joints[:, 3:]
        data, target_heatmaps = Variable(data), Variable(target_heatmaps)
        if valid_vars['use_cuda']:
            data = data.cuda()
            target_joints = target_joints.cuda()
            target_heatmaps = target_heatmaps.cuda()
            target_handroot = target_handroot.cuda()
        # visualize if debugging
        # get model output
        output = model(data)
        # accumulate loss for sub-mini-batch
        if model.cross_entropy:
            loss_func = my_losses.cross_entropy_loss_p_logq
        else:
            loss_func = my_losses.euclidean_loss
        weights_heatmaps_loss, weights_joints_loss = get_loss_weights(control_vars['curr_iter'])
        loss, loss_heatmaps, loss_joints, loss_main = my_losses.calculate_loss_JORNet_for_valid(
            loss_func, output, target_heatmaps, target_joints, valid_vars['joint_ixs'],
            weights_heatmaps_loss, weights_joints_loss, control_vars['iter_size'])
        # average main joint loss per coordinate (63 = 21 joints x 3 coordinates)
        losses_main.append(loss_main.item() / 63.0)
        valid_vars['total_loss'] += loss
        valid_vars['total_joints_loss'] += loss_joints
        valid_vars['total_heatmaps_loss'] += loss_heatmaps
        # accumulate pixel dist loss for sub-mini-batch
        valid_vars['total_pixel_loss'] = my_losses.accumulate_pixel_dist_loss_multiple(
            valid_vars['total_pixel_loss'], output[3], target_heatmaps, control_vars['batch_size'])
        valid_vars['total_pixel_loss_sample'] = my_losses.accumulate_pixel_dist_loss_from_sample_multiple(
            valid_vars['total_pixel_loss_sample'], output[3], target_heatmaps, control_vars['batch_size'])
        # get boolean variable stating whether a mini-batch has been completed
        minibatch_completed = (batch_idx + 1) % control_vars['iter_size'] == 0
        if minibatch_completed:
            # append total loss
            valid_vars['losses'].append(valid_vars['total_loss'].item())
            # erase total loss
            total_loss = valid_vars['total_loss'].item()
            valid_vars['total_loss'] = 0
            # append total joints loss
            valid_vars['losses_joints'].append(valid_vars['total_joints_loss'].item())
            # erase total joints loss
            valid_vars['total_joints_loss'] = 0
            # append total heatmaps loss
            valid_vars['losses_heatmaps'].append(valid_vars['total_heatmaps_loss'].item())
            # erase total heatmaps loss
            valid_vars['total_heatmaps_loss'] = 0
            # append dist loss
            valid_vars['pixel_losses'].append(valid_vars['total_pixel_loss'])
            # erase pixel dist loss
            valid_vars['total_pixel_loss'] = [0] * len(model.joint_ixs)
            # append dist loss of sample from output
            valid_vars['pixel_losses_sample'].append(valid_vars['total_pixel_loss_sample'])
            # erase dist loss of sample from output
            valid_vars['total_pixel_loss_sample'] = [0] * len(model.joint_ixs)
            # check if loss is better
            #if valid_vars['losses'][-1] < valid_vars['best_loss']:
            #    valid_vars['best_loss'] = valid_vars['losses'][-1]
            #    print_verbose(" This is a best loss found so far: " + str(valid_vars['losses'][-1]), verbose)
            # log checkpoint
            if control_vars['curr_iter'] % control_vars['log_interval'] == 0:
                trainer.print_log_info(model, optimizer, 1, total_loss, valid_vars, control_vars,
                                       save_best=False, save_a_checkpoint=False)
                model_dict = {
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'control_vars': control_vars,
                    'train_vars': valid_vars,
                }
                #trainer.save_checkpoint(model_dict,
                #                        filename=valid_vars['checkpoint_filenamebase'] +
                #                                 str(control_vars['num_iter']) + '.pth.tar')
            # print time lapse
            prefix = 'Validating (Epoch #' + str(1) + ' ' + str(control_vars['curr_epoch_iter']) + '/' +\
                     str(control_vars['tot_iter']) + ')' + ', (Batch ' + str(control_vars['batch_idx']+1) +\
                     '(' + str(control_vars['iter_size']) + ')' + '/' +\
                     str(control_vars['num_batches']) + ')' + ', (Iter #' + str(control_vars['curr_iter']) +\
                     '(' + str(control_vars['batch_size']) + ')' +\
                     ' - log every ' + str(control_vars['log_interval']) + ' iter): '
            control_vars['tot_toc'] = display_est_time_loop(control_vars['tot_toc'] + time.time() - start,
                                                            control_vars['curr_iter'], control_vars['num_iter'],
                                                            prefix=prefix)
            control_vars['curr_iter'] += 1
            control_vars['start_iter'] = control_vars['curr_iter'] + 1
            control_vars['curr_epoch_iter'] += 1
    total_avg_loss = np.mean(losses_main)
    return valid_vars, control_vars, total_avg_loss

def train(train_loader, model, optimizer, train_vars):
    verbose = train_vars['verbose']
    for batch_idx, (data, target) in enumerate(train_loader):
        train_vars['batch_idx'] = batch_idx
        # print info about performing first iter
        if batch_idx < train_vars['iter_size']:
            print_verbose("\rPerforming first iteration; current mini-batch: " + str(batch_idx+1) + "/" +
                          str(train_vars['iter_size']), verbose, n_tabs=0, erase_line=True)
        # check if arrived at iter to start
        arrived_curr_iter, train_vars = run_until_curr_iter(batch_idx, train_vars)
        if not arrived_curr_iter:
            continue
        # save checkpoint after final iteration
        if train_vars['curr_iter'] - 1 == train_vars['num_iter']:
            train_vars = save_final_checkpoint(train_vars, model, optimizer)
            break
        # start time counter
        start = time.time()
        # get data and target as torch Variables
        _, target_joints, target_heatmaps, target_joints_z = target
        data, target_heatmaps = Variable(data), Variable(target_heatmaps)
        if train_vars['use_cuda']:
            data = data.cuda()
            target_heatmaps = target_heatmaps.cuda()
        # get model output
        output = model(data)
        # accumulate loss for sub-mini-batch
        if model.cross_entropy:
            loss_func = my_losses.cross_entropy_loss_p_logq
        else:
            loss_func = my_losses.euclidean_loss
        loss = my_losses.calculate_loss_HALNet(loss_func,
                                               output, target_heatmaps, model.joint_ixs,
                                               model.WEIGHT_LOSS_INTERMED1,
                                               model.WEIGHT_LOSS_INTERMED2,
                                               model.WEIGHT_LOSS_INTERMED3,
                                               model.WEIGHT_LOSS_MAIN,
                                               train_vars['iter_size'])
        loss.backward()
        train_vars['total_loss'] += loss
        # accumulate pixel dist loss for sub-mini-batch
        train_vars['total_pixel_loss'] = my_losses.accumulate_pixel_dist_loss_multiple(
            train_vars['total_pixel_loss'], output[3], target_heatmaps, train_vars['batch_size'])
        if train_vars['cross_entropy']:
            train_vars['total_pixel_loss_sample'] = my_losses.accumulate_pixel_dist_loss_from_sample_multiple(
                train_vars['total_pixel_loss_sample'], output[3], target_heatmaps, train_vars['batch_size'])
        else:
            train_vars['total_pixel_loss_sample'] = [-1] * len(model.joint_ixs)
        # get boolean variable stating whether a mini-batch has been completed
        minibatch_completed = (batch_idx+1) % train_vars['iter_size'] == 0
        if minibatch_completed:
            # optimise for mini-batch
            optimizer.step()
            # clear optimiser
            optimizer.zero_grad()
            # append total loss
            train_vars['losses'].append(train_vars['total_loss'].item())
            # erase total loss
            total_loss = train_vars['total_loss'].item()
            train_vars['total_loss'] = 0
            # append dist loss
            train_vars['pixel_losses'].append(train_vars['total_pixel_loss'])
            # erase pixel dist loss
            train_vars['total_pixel_loss'] = [0] * len(model.joint_ixs)
            # append dist loss of sample from output
            train_vars['pixel_losses_sample'].append(train_vars['total_pixel_loss_sample'])
            # erase dist loss of sample from output
            train_vars['total_pixel_loss_sample'] = [0] * len(model.joint_ixs)
            # check if loss is better
            if train_vars['losses'][-1] < train_vars['best_loss']:
                train_vars['best_loss'] = train_vars['losses'][-1]
                print_verbose(" This is a best loss found so far: " + str(train_vars['losses'][-1]), verbose)
                train_vars['best_model_dict'] = {
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'train_vars': train_vars
                }
            # log checkpoint
            if train_vars['curr_iter'] % train_vars['log_interval'] == 0:
                trainer.print_log_info(model, optimizer, epoch, total_loss, train_vars, train_vars)
            if train_vars['curr_iter'] % train_vars['log_interval_valid'] == 0:
                print_verbose("\nSaving model and checkpoint model for validation", verbose)
                checkpoint_model_dict = {
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'train_vars': train_vars,
                }
                trainer.save_checkpoint(checkpoint_model_dict,
                                        filename=train_vars['checkpoint_filenamebase'] + 'for_valid_' +
                                                 str(train_vars['curr_iter']) + '.pth.tar')
            # print time lapse
            prefix = 'Training (Epoch #' + str(epoch) + ' ' + str(train_vars['curr_epoch_iter']) + '/' +\
                     str(train_vars['tot_iter']) + ')' + ', (Batch ' + str(train_vars['batch_idx']+1) +\
                     '(' + str(train_vars['iter_size']) + ')' + '/' +\
                     str(train_vars['num_batches']) + ')' + ', (Iter #' + str(train_vars['curr_iter']) +\
                     '(' + str(train_vars['batch_size']) + ')' +\
                     ' - log every ' + str(train_vars['log_interval']) + ' iter): '
            train_vars['tot_toc'] = display_est_time_loop(train_vars['tot_toc'] + time.time() - start,
                                                          train_vars['curr_iter'], train_vars['num_iter'],
                                                          prefix=prefix)
            train_vars['curr_iter'] += 1
            train_vars['start_iter'] = train_vars['curr_iter'] + 1
            train_vars['curr_epoch_iter'] += 1
    return train_vars

def train(train_loader, model, optimizer, train_vars, control_vars, verbose=True):
    curr_epoch_iter = 1
    for batch_idx, (data, target) in enumerate(train_loader):
        control_vars['batch_idx'] = batch_idx
        if batch_idx < control_vars['iter_size']:
            print_verbose("\rPerforming first iteration; current mini-batch: " + str(batch_idx+1) + "/" +
                          str(control_vars['iter_size']), verbose, n_tabs=0, erase_line=True)
        # check if arrived at iter to start
        if control_vars['curr_epoch_iter'] < control_vars['start_iter_mod']:
            control_vars['curr_epoch_iter'] = control_vars['start_iter_mod']
            msg = ''
            if batch_idx % control_vars['iter_size'] == 0:
                msg += print_verbose("\rGoing through iterations to arrive at last one saved... " +
                                     str(int(control_vars['curr_epoch_iter']*100.0/control_vars['start_iter_mod'])) + "% of " +
                                     str(control_vars['start_iter_mod']) + " iterations (" +
                                     str(control_vars['curr_epoch_iter']) + "/" + str(control_vars['start_iter_mod']) + ")",
                                     verbose, n_tabs=0, erase_line=True)
                control_vars['curr_epoch_iter'] += 1
                control_vars['curr_iter'] += 1
                curr_epoch_iter += 1
            if not control_vars['output_filepath'] == '':
                with open(control_vars['output_filepath'], 'a') as f:
                    f.write(msg + '\n')
            continue
        # save checkpoint after final iteration
        if control_vars['curr_iter'] == control_vars['num_iter']:
            print_verbose("\nReached final number of iterations: " + str(control_vars['num_iter']), verbose)
            print_verbose("\tSaving final model checkpoint...", verbose)
            final_model_dict = {
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'control_vars': control_vars,
                'train_vars': train_vars,
            }
            trainer.save_checkpoint(final_model_dict,
                                    filename=train_vars['checkpoint_filenamebase'] +
                                             'final' + str(control_vars['num_iter']) + '.pth.tar')
            control_vars['done_training'] = True
            break
        # start time counter
        start = time.time()
        # get data and target as cuda variables
        target_heatmaps, target_joints, target_roothand = target
        data, target_heatmaps, target_joints, target_roothand = Variable(data), Variable(target_heatmaps),\
                                                                Variable(target_joints), Variable(target_roothand)
        if train_vars['use_cuda']:
            data = data.cuda()
            target_heatmaps = target_heatmaps.cuda()
            target_joints = target_joints.cuda()
        # get model output
        output = model(data)
        '''
        visualize.plot_joints_from_heatmaps(target_heatmaps[0, :, :, :].cpu().data.numpy(),
                                            title='', data=data[0].cpu().data.numpy())
        visualize.show()
        visualize.plot_image_and_heatmap(target_heatmaps[0][4].cpu().data.numpy(),
                                         data=data[0].cpu().data.numpy(),
                                         title='')
        visualize.show()
        visualize.plot_image_and_heatmap(output[3][0][4].cpu().data.numpy(),
                                         data=data[0].cpu().data.numpy(),
                                         title='')
        visualize.show()
        '''
        # accumulate loss for sub-mini-batch
        if train_vars['cross_entropy']:
            loss_func = my_losses.cross_entropy_loss_p_logq
        else:
            loss_func = my_losses.euclidean_loss
        weights_heatmaps_loss, weights_joints_loss = get_loss_weights(control_vars['curr_iter'])
        loss, loss_heatmaps, loss_joints = my_losses.calculate_loss_JORNet(
            loss_func, output, target_heatmaps, target_joints, train_vars['joint_ixs'],
            weights_heatmaps_loss, weights_joints_loss, control_vars['iter_size'])
        loss.backward()
        train_vars['total_loss'] += loss.data[0]
        train_vars['total_joints_loss'] += loss_joints.data[0]
        train_vars['total_heatmaps_loss'] += loss_heatmaps.data[0]
        # accumulate pixel dist loss for sub-mini-batch
        train_vars['total_pixel_loss'] = my_losses.accumulate_pixel_dist_loss_multiple(
            train_vars['total_pixel_loss'], output[3], target_heatmaps, control_vars['batch_size'])
        train_vars['total_pixel_loss_sample'] = my_losses.accumulate_pixel_dist_loss_from_sample_multiple(
            train_vars['total_pixel_loss_sample'], output[3], target_heatmaps, control_vars['batch_size'])
        # get boolean variable stating whether a mini-batch has been completed
        minibatch_completed = (batch_idx+1) % control_vars['iter_size'] == 0
        if minibatch_completed:
            # optimise for mini-batch
            optimizer.step()
            # clear optimiser
            optimizer.zero_grad()
            # append total loss
            train_vars['losses'].append(train_vars['total_loss'])
            # erase total loss
            total_loss = train_vars['total_loss']
            train_vars['total_loss'] = 0
            # append total joints loss
            train_vars['losses_joints'].append(train_vars['total_joints_loss'])
            # erase total joints loss
            train_vars['total_joints_loss'] = 0
            # append total heatmaps loss
            train_vars['losses_heatmaps'].append(train_vars['total_heatmaps_loss'])
            # erase total heatmaps loss
            train_vars['total_heatmaps_loss'] = 0
            # append dist loss
            train_vars['pixel_losses'].append(train_vars['total_pixel_loss'])
            # erase pixel dist loss
            train_vars['total_pixel_loss'] = [0] * len(model.joint_ixs)
            # append dist loss of sample from output
            train_vars['pixel_losses_sample'].append(train_vars['total_pixel_loss_sample'])
            # erase dist loss of sample from output
            train_vars['total_pixel_loss_sample'] = [0] * len(model.joint_ixs)
            # check if loss is better
            if train_vars['losses'][-1] < train_vars['best_loss']:
                train_vars['best_loss'] = train_vars['losses'][-1]
                print_verbose(" This is a best loss found so far: " + str(train_vars['losses'][-1]), verbose)
                train_vars['best_model_dict'] = {
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'control_vars': control_vars,
                    'train_vars': train_vars,
                }
            if train_vars['losses_joints'][-1] < train_vars['best_loss_joints']:
                train_vars['best_loss_joints'] = train_vars['losses_joints'][-1]
            if train_vars['losses_heatmaps'][-1] < train_vars['best_loss_heatmaps']:
                train_vars['best_loss_heatmaps'] = train_vars['losses_heatmaps'][-1]
            # log checkpoint
            if control_vars['curr_iter'] % control_vars['log_interval'] == 0:
                trainer.print_log_info(model, optimizer, epoch, total_loss, train_vars, control_vars)
                aa1 = target_joints[0].data.cpu().numpy()
                aa2 = output[7][0].data.cpu().numpy()
                output_joint_loss = np.sum(np.abs(aa1 - aa2)) / 63
                msg = ''
                msg += print_verbose(
                    "-------------------------------------------------------------------------------------------",
                    verbose) + "\n"
                msg += print_verbose('\tJoint Coord Avg Loss for first image of current mini-batch: ' +
                                     str(output_joint_loss) + '\n', control_vars['verbose'])
                msg += print_verbose(
                    "-------------------------------------------------------------------------------------------",
                    verbose) + "\n"
                if not control_vars['output_filepath'] == '':
                    with open(control_vars['output_filepath'], 'a') as f:
                        f.write(msg + '\n')
            if control_vars['curr_iter'] % control_vars['log_interval_valid'] == 0:
                print_verbose("\nSaving model and checkpoint model for validation", verbose)
                checkpoint_model_dict = {
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'control_vars': control_vars,
                    'train_vars': train_vars,
                }
                trainer.save_checkpoint(checkpoint_model_dict,
                                        filename=train_vars['checkpoint_filenamebase'] + 'for_valid_' +
                                                 str(control_vars['curr_iter']) + '.pth.tar')
            # print time lapse
            prefix = 'Training (Epoch #' + str(epoch) + ' ' + str(control_vars['curr_epoch_iter']) + '/' +\
                     str(control_vars['tot_iter']) + ')' + ', (Batch ' + str(control_vars['batch_idx']+1) +\
                     '(' + str(control_vars['iter_size']) + ')' + '/' +\
                     str(control_vars['num_batches']) + ')' + ', (Iter #' + str(control_vars['curr_iter']) +\
                     '(' + str(control_vars['batch_size']) + ')' +\
                     ' - log every ' + str(control_vars['log_interval']) + ' iter): '
            control_vars['tot_toc'] = display_est_time_loop(control_vars['tot_toc'] + time.time() - start,
                                                            control_vars['curr_iter'], control_vars['num_iter'],
                                                            prefix=prefix)
            control_vars['curr_iter'] += 1
            control_vars['start_iter'] = control_vars['curr_iter'] + 1
            control_vars['curr_epoch_iter'] += 1
    return train_vars, control_vars