def train(args, epoch, start_iteration, data_loader, model, optimizer, loss,
          logger, is_validate=False, offset=0):
    statistics = []
    total_loss = 0

    if is_validate:
        model.eval()
        title = 'Validating {} Epoch {}'.format(tools.gpumemusage(), epoch)
        args.validation_n_batches = np.inf if args.validation_n_batches < 0 else args.validation_n_batches
        progress = tqdm(tools.IteratorTimer(data_loader), ncols=100,
                        total=np.minimum(len(data_loader), args.validation_n_batches),
                        leave=True, position=offset, desc=title)
    else:
        model.train()
        title = 'Training {} Epoch {}'.format(tools.gpumemusage(), epoch)
        args.train_n_batches = np.inf if args.train_n_batches < 0 else args.train_n_batches
        progress = tqdm(tools.IteratorTimer(data_loader), ncols=120,
                        total=np.minimum(len(data_loader), args.train_n_batches),
                        smoothing=.9, miniters=1, leave=True, position=offset, desc=title)

    last_log_time = progress._time()
    for batch_idx, (data, target) in enumerate(progress):
        # `volatile` and the `async` keyword are gone from modern PyTorch;
        # disable autograd during validation and use non_blocking copies instead.
        data, target = [Variable(d) for d in data], [Variable(t) for t in target]
        if args.cuda:
            data = [d.cuda(non_blocking=True) for d in data]
            target = [t.cuda(non_blocking=True) for t in target]

        if not is_validate:
            optimizer.zero_grad()

        with torch.set_grad_enabled(not is_validate):
            output = model(data[0])
            loss_labels, loss_values = loss(output, target[0])

        loss_val = loss_values[0]
        total_loss += loss_val.item()
        loss_values = [v.item() for v in loss_values]

        assert not np.isnan(total_loss)

        if not is_validate and args.fp16:
            # fp16 path: gradients are computed on the half-precision model,
            # then copied (and unscaled) onto the fp32 master weights in
            # `param_copy`, which the optimizer is assumed to hold.
            loss_val.backward()
            if args.gradient_clip:
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.gradient_clip)
            params = list(model.parameters())
            for i in range(len(params)):
                param_copy[i].grad = params[i].grad.clone().type_as(params[i]).detach()
                param_copy[i].grad.mul_(1. / args.loss_scale)
            optimizer.step()
            for i in range(len(params)):
                params[i].data.copy_(param_copy[i].data)
        elif not is_validate:
            loss_val.backward()
            if args.gradient_clip:
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.gradient_clip)
            optimizer.step()

        # Update hyperparameters if needed
        global_iteration = start_iteration + batch_idx
        if not is_validate:
            tools.update_hyperparameter_schedule(args, epoch, global_iteration, optimizer)
            loss_labels.append('lr')
            loss_values.append(optimizer.param_groups[0]['lr'])

        loss_labels.append('load')
        loss_values.append(progress.iterable.last_duration)

        # Print out statistics
        statistics.append(loss_values)
        title = '{} {} Epoch {}'.format('Validating' if is_validate else 'Training',
                                        tools.gpumemusage(), epoch)
        progress.set_description(title + ' ' + tools.format_dictionary_of_losses(loss_labels, statistics[-1]))

        if ((global_iteration + 1) % args.log_frequency == 0 and not is_validate) or \
                (is_validate and batch_idx == args.validation_n_batches - 1):
            global_iteration = global_iteration if not is_validate else start_iteration
            logger.add_scalar('batch logs per second',
                              len(statistics) / (progress._time() - last_log_time),
                              global_iteration)
            last_log_time = progress._time()
            all_losses = np.array(statistics)
            for i, key in enumerate(loss_labels):
                logger.add_scalar('average batch ' + key, all_losses[:, i].mean(), global_iteration)
                logger.add_histogram(key, all_losses[:, i], global_iteration)
            # Reset summary
            statistics = []

        if is_validate and batch_idx == args.validation_n_batches:
            break
        if (not is_validate) and batch_idx == args.train_n_batches:
            break

    progress.close()
    return total_loss / float(batch_idx + 1), (batch_idx + 1)
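# The fp16 branch above reads `param_copy` from an enclosing scope, but it is
# never defined in this file. A minimal sketch of the fp32 master-weight setup
# such loops appear to assume (flownet2-style loss scaling; the helper name and
# the Adam default are illustrative, not the project's actual setup):

def make_fp16_master_params(model, args):
    """Build fp32 master copies of a half-precision model's parameters.

    The optimizer must be constructed over these copies: gradients computed on
    the fp16 model are cloned onto `param_copy`, unscaled by args.loss_scale,
    stepped, and the updated fp32 weights are copied back into the model.
    """
    param_copy = [p.clone().float().detach() for p in model.parameters()]
    for p in param_copy:
        p.requires_grad = True  # the optimizer updates the fp32 copies
    optimizer = torch.optim.Adam(param_copy, lr=args.lr)
    return param_copy, optimizer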
def inference(args, epoch, data_loader, model, loss, offset=0):
    model.eval()

    if args.save_flow or args.render_validation:
        flow_folder = "{}/{}.epoch-{}-flow-field".format(
            args.inference_dir, args.name.replace('/', '.'), epoch)
        if not os.path.exists(flow_folder):
            os.makedirs(flow_folder)

    gpu_mem = tools.gpumemusage()
    args.inference_n_batches = np.inf if args.inference_n_batches < 0 else args.inference_n_batches

    progress = tqdm(data_loader, ncols=100,
                    total=np.minimum(len(data_loader), args.inference_n_batches),
                    desc='Inferencing %s' % gpu_mem, leave=True, position=offset)

    statistics = []
    total_loss = 0
    for batch_idx, (data, target) in enumerate(progress):
        if args.cuda:
            data = [d.cuda(non_blocking=True) for d in data]
            target = [t.cuda(non_blocking=True) for t in target]
        # `volatile` is gone from modern PyTorch; use no_grad for inference.
        data, target = [Variable(d) for d in data], [Variable(t) for t in target]

        with torch.no_grad():
            output = [model(data[0])]
        if len(target) == 0:
            target = [output[0]]
        loss_labels, loss_values = loss(output[0], target[0])

        loss_val = loss_values[0]
        total_loss += loss_val.item()
        statistics.append([v.item() for v in loss_values])

        _pflow = output[0].data.cpu().numpy().transpose(0, 2, 3, 1)
        if args.save_flow or args.render_validation:
            for i in range(args.inference_batch_size):
                flow_utils.writeFlow(
                    join(flow_folder, '%06d.flo' % (batch_idx * args.inference_batch_size + i)),
                    _pflow[i])

        progress.set_description(
            'Inference {} Averages for Epoch {}: '.format(tools.gpumemusage(), epoch) +
            tools.format_dictionary_of_losses(loss_labels, np.array(statistics).mean(axis=0)))
        progress.update(1)

        if batch_idx == (args.inference_n_batches - 1):
            break

    progress.close()
    return
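# The inference paths in this file dump predictions with flow_utils.writeFlow.
# For reference, a minimal writer for the Middlebury .flo format consistent
# with how it is called here ((H, W, 2) float arrays); this is a sketch, not
# necessarily the project's own implementation:

def write_flo(filename, flow):
    """Write an (H, W, 2) float flow field as a Middlebury .flo file."""
    assert flow.ndim == 3 and flow.shape[2] == 2
    height, width = flow.shape[:2]
    with open(filename, 'wb') as f:
        np.float32(202021.25).tofile(f)  # magic number ("PIEH" as float32)
        np.int32(width).tofile(f)
        np.int32(height).tofile(f)
        flow.astype(np.float32).tofile(f)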
def inference(args, epoch, data_loader, model, offset=0):
    model.eval()

    if args.save_flow or args.render_validation:
        flow_folder = "{}/{}.epoch-{}-flow-field".format(
            args.inference_dir, args.name.replace('/', '.'), epoch)
        if not os.path.exists(flow_folder):
            os.makedirs(flow_folder)

    args.inference_n_batches = np.inf if args.inference_n_batches < 0 else args.inference_n_batches

    progress = tqdm(data_loader, ncols=100,
                    total=np.minimum(len(data_loader), args.inference_n_batches),
                    desc='Inferencing ', leave=True, position=offset)

    statistics = []
    total_loss = 0
    for batch_idx, (data, target) in enumerate(progress):
        if args.cuda:
            data = [d.cuda(non_blocking=True) for d in data]
            target = [t.cuda(non_blocking=True) for t in target]
        data, target = [Variable(d) for d in data], [Variable(t) for t in target]

        # When ground-truth flows are not available for the inference dataset,
        # the targets are set to all zeros; the "losses" are then just L1/L2
        # norms of the computed optical flows, depending on the loss norm used.
        with torch.no_grad():
            losses, output = model(data[0], target[0], inference=True)

        losses = [torch.mean(loss_value) for loss_value in losses]
        loss_val = losses[0]  # first loss is the one used for weight updates
        total_loss += loss_val.item()
        loss_values = [v.item() for v in losses]

        # Gather loss_labels here; returning them from the model directly hits
        # the recursion limit as DataParallel tries to gather them.
        loss_labels = list(model.module.loss.loss_labels)

        statistics.append(loss_values)

        if args.save_flow or args.render_validation:
            for i in range(args.inference_batch_size):
                _pflow = output[i].data.cpu().numpy().transpose(1, 2, 0)
                flow_utils.writeFlow(
                    join(flow_folder, '%06d.flo' % (batch_idx * args.inference_batch_size + i)),
                    _pflow)

        progress.set_description(
            'Inference Averages for Epoch {}: '.format(epoch) +
            tools.format_dictionary_of_losses(loss_labels, np.array(statistics).mean(axis=0)))
        progress.update(1)

        if batch_idx == (args.inference_n_batches - 1):
            break

    progress.close()
    return
def inference(args, epoch, data_loader, model, offset=0):
    model.eval()

    if args.save_flow or args.render_validation:
        flow_folder = "{}/inference/{}.epoch-{}-flow-field".format(
            args.save, args.name.replace('/', '.'), epoch)
        if not os.path.exists(flow_folder):
            os.makedirs(flow_folder)

    # visualization folder
    if args.inference_visualize:
        flow_vis_folder = "{}/inference/{}.epoch-{}-flow-vis".format(
            args.save, args.name.replace('/', '.'), epoch)
        if not os.path.exists(flow_vis_folder):
            os.makedirs(flow_vis_folder)

    args.inference_n_batches = np.inf if args.inference_n_batches < 0 else args.inference_n_batches

    progress = tqdm(data_loader, ncols=100,
                    total=np.minimum(len(data_loader), args.inference_n_batches),
                    desc='Inferencing ', leave=True, position=offset)

    statistics = []
    total_loss = 0
    for batch_idx, (data, target, city_name, video_name, image_list) in enumerate(progress):
        city_name = city_name[0]
        video_name = video_name[0]

        if args.cuda:
            data = [d.cuda(non_blocking=True) for d in data]
            target = [t.cuda(non_blocking=True) for t in target]
        data, target = [Variable(d) for d in data], [Variable(t) for t in target]

        # When ground-truth flows are not available for the inference dataset,
        # the targets are set to all zeros; the "losses" are then just L1/L2
        # norms of the computed optical flows, depending on the loss norm used.
        with torch.no_grad():
            losses, output = model(data[0], target[0], inference=True)

        losses = [torch.mean(loss_value) for loss_value in losses]
        loss_val = losses[0]  # first loss is the one used for weight updates
        total_loss += loss_val.item()
        loss_values = [v.item() for v in losses]

        # Gather loss_labels here; returning them from the model directly hits
        # the recursion limit as DataParallel tries to gather them.
        loss_labels = list(model.module.loss.loss_labels)

        statistics.append(loss_values)

        if args.save_flow or args.render_validation:
            for i in range(args.inference_batch_size):
                _pflow = output[i].data.cpu().numpy().transpose(1, 2, 0)

                # Save the two flow channels as separate grayscale images.
                # NOTE: scipy.misc.toimage was removed in newer SciPy releases;
                # this variant assumes an older SciPy (or a PIL replacement).
                img0 = scipy.misc.toimage(_pflow[:, :, 0])
                img1 = scipy.misc.toimage(_pflow[:, :, 1])
                frame_id = image_list[1][0].split('/')[-1]

                # `crop_image_path` is assumed to be defined at module scope.
                for direction in ('horizontal', 'vertical'):
                    out_dir = os.path.join(crop_image_path, direction, city_name, video_name)
                    if not os.path.exists(out_dir):
                        os.makedirs(out_dir)
                img0.save(os.path.join(crop_image_path, 'horizontal', city_name, video_name, frame_id))
                img1.save(os.path.join(crop_image_path, 'vertical', city_name, video_name, frame_id))

                # The .flo dump was disabled in this variant; write it only
                # when the visualization step needs a file to read.
                if args.inference_visualize:
                    flo_path = join(flow_folder, '%06d.flo' % (batch_idx * args.inference_batch_size + i))
                    flow_utils.writeFlow(flo_path, _pflow)
                    # You can comment out the plt block in visulize_flow_file()
                    # for real-time visualization.
                    flow_utils.visulize_flow_file(flo_path, flow_vis_folder)

        progress.set_description(
            'Inference Averages for Epoch {}: '.format(epoch) +
            tools.format_dictionary_of_losses(loss_labels, np.array(statistics).mean(axis=0)))
        progress.update(1)

        if batch_idx == (args.inference_n_batches - 1):
            break

    progress.close()
    return
def train(input_args, train_epoch, start_iteration, files_loader, model,
          model_optimizer, logger, is_validate=False, offset=0):
    statistics = []
    total_loss = 0

    if is_validate:
        model.eval()
        title = 'Validating Epoch {}'.format(train_epoch)
        input_args.validation_n_batches = np.inf if input_args.validation_n_batches < 0 else input_args.validation_n_batches
        file_progress = tqdm(tools.IteratorTimer(files_loader), ncols=100,
                             total=np.minimum(len(files_loader), input_args.validation_n_batches),
                             leave=True, position=offset, desc=title)
    else:
        model.train()
        title = 'Training Epoch {}'.format(train_epoch)
        input_args.train_n_batches = np.inf if input_args.train_n_batches < 0 else input_args.train_n_batches
        file_progress = tqdm(tools.IteratorTimer(files_loader), ncols=120,
                             total=np.minimum(len(files_loader), input_args.train_n_batches),
                             smoothing=.9, miniters=1, leave=True, position=offset, desc=title)

    last_log_time = file_progress._time()
    for batch_idx, data_file in enumerate(file_progress):
        # Build a per-video dataset/loader on the fly (JIT) for each file.
        # `gpuargs` is assumed to be defined at module scope (see sketch below).
        video_dataset = datasets_video.VideoFileDataJIT(input_args, data_file[0])
        video_loader = DataLoader(video_dataset,
                                  batch_size=input_args.effective_batch_size,
                                  shuffle=True, **gpuargs)
        global_iteration = start_iteration + batch_idx

        # note~ for debugging purposes
        # video_frame_progress = tqdm(tools.IteratorTimer(video_loader), ncols=120,
        #                             total=len(video_loader), smoothing=0.9, miniters=1,
        #                             leave=True, desc=data_file[0])

        for i_batch, (data, target) in enumerate(video_loader):
            data, target = [Variable(d) for d in data], [Variable(t) for t in target]
            if input_args.cuda and input_args.number_gpus == 1:
                data = [d.cuda(non_blocking=True) for d in data]
                target = [t.cuda(non_blocking=True) for t in target]

            if not is_validate:
                model_optimizer.zero_grad()

            losses = model(data[0], target[0])
            losses = [torch.mean(loss_value) for loss_value in losses]
            loss_val = losses[0]  # Collect first loss for weight update
            # Store plain floats so the logging below can average them directly.
            total_loss += loss_val.item()
            loss_values = [v.item() for v in losses]

            # Gather loss_labels here; returning them from the model directly
            # hits the recursion limit as DataParallel tries to gather them.
            loss_labels = list(model.module.loss.loss_labels)

            assert not np.isnan(total_loss)

            if not is_validate and input_args.fp16:
                # `param_copy` holds the fp32 master weights (see sketch above).
                loss_val.backward()
                if input_args.gradient_clip:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), input_args.gradient_clip)
                params = list(model.parameters())
                for i in range(len(params)):
                    param_copy[i].grad = params[i].grad.clone().type_as(params[i]).detach()
                    param_copy[i].grad.mul_(1. / input_args.loss_scale)
                model_optimizer.step()
                for i in range(len(params)):
                    params[i].data.copy_(param_copy[i].data)
            elif not is_validate:
                loss_val.backward()
                if input_args.gradient_clip:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), input_args.gradient_clip)
                model_optimizer.step()

            # Update hyperparameters if needed
            if not is_validate:
                tools.update_hyperparameter_schedule(input_args, train_epoch,
                                                     global_iteration, model_optimizer)
                loss_labels.append('lr')
                loss_values.append(model_optimizer.param_groups[0]['lr'])

            loss_labels.append('load')
            loss_values.append(file_progress.iterable.last_duration)

            # Print out statistics
            statistics.append(loss_values)
            title = '{} Epoch {}'.format('Validating' if is_validate else 'Training', train_epoch)
            file_progress.set_description(
                title + ' ' + tools.format_dictionary_of_losses(
                    tools.flatten_list(loss_labels), statistics[-1]))

        # Log once per file; global_iteration only advances per file here.
        if ((global_iteration + 1) % input_args.log_frequency == 0 and not is_validate) or \
                (is_validate and batch_idx == input_args.validation_n_batches - 1):
            global_iteration = global_iteration if not is_validate else start_iteration
            logger.add_scalar('batch logs per second',
                              len(statistics) / (file_progress._time() - last_log_time),
                              global_iteration)
            last_log_time = file_progress._time()
            all_losses = np.array(statistics)
            for i, key in enumerate(tools.flatten_list(loss_labels)):
                logger.add_scalar('average batch ' + str(key), all_losses[:, i].mean(), global_iteration)
                logger.add_histogram(str(key), all_losses[:, i], global_iteration)
            # Reset summary
            statistics = []

        if is_validate and batch_idx == input_args.validation_n_batches:
            break
        if (not is_validate) and batch_idx == input_args.train_n_batches:
            break

    file_progress.close()
    return total_loss / float(batch_idx + 1), (batch_idx + 1)
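# The per-video variant above uses a module-level `gpuargs` dict that is not
# defined in this file. A plausible definition, mirroring the DataLoader
# keyword dicts common in flownet2-style scripts (an assumption; adjust the
# worker count and pinning to the actual setup):

def make_loader_kwargs(input_args):
    # Assumed helper: DataLoader keyword dict used as `gpuargs` above.
    return {'num_workers': input_args.number_workers,
            'pin_memory': True,
            'drop_last': True} if input_args.cuda else {}

# gpuargs = make_loader_kwargs(input_args)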
def train(args, epoch, start_iteration, data_loader, model, optimizer, logger,
          is_validate=False, offset=0):
    statistics = []
    total_loss = 0

    if is_validate:
        model.eval()
        title = 'Validating Epoch {}'.format(epoch)
        args.validation_n_batches = len(data_loader) - 1 if args.validation_n_batches < 0 else args.validation_n_batches
        progress = tqdm(tools.IteratorTimer(data_loader), ncols=100,
                        total=np.minimum(len(data_loader), args.validation_n_batches),
                        leave=True, position=offset, desc=title)
    else:
        model.train()
        title = 'Training Epoch {}'.format(epoch)
        args.train_n_batches = len(data_loader) - 1 if args.train_n_batches < 0 else args.train_n_batches
        progress = tqdm(tools.IteratorTimer(data_loader), ncols=120,
                        total=np.minimum(len(data_loader), args.train_n_batches),
                        smoothing=.9, miniters=1, leave=True, position=offset, desc=title)

    last_log_time = progress._time()
    for batch_idx, (data, target) in enumerate(progress):
        data, target = [Variable(d) for d in data], [Variable(t) for t in target]
        if args.cuda and args.number_gpus == 1:
            data = [d.cuda(non_blocking=True) for d in data]
            target = [t.cuda(non_blocking=True) for t in target]

        if not is_validate:
            optimizer.zero_grad()

        with torch.set_grad_enabled(not is_validate):
            losses = model(data[0], target[0])
        losses = [torch.mean(loss_value) for loss_value in losses]
        # NOTE: unlike the sibling variants, this one optimizes the *second*
        # loss (losses[1]), not the first.
        loss_val = losses[1]
        total_loss += loss_val.item()
        loss_values = [v.item() for v in losses]

        # Gather loss_labels here; returning them from the model directly hits
        # the recursion limit as DataParallel tries to gather them.
        loss_labels = list(model.module.loss.loss_labels)

        assert not np.isnan(total_loss)

        if not is_validate and args.fp16:
            # `param_copy` holds the fp32 master weights (see sketch above).
            loss_val.backward()
            if args.gradient_clip:
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.gradient_clip)
            params = list(model.parameters())
            for i in range(len(params)):
                param_copy[i].grad = params[i].grad.clone().type_as(params[i]).detach()
                param_copy[i].grad.mul_(1. / args.loss_scale)
            optimizer.step()
            for i in range(len(params)):
                params[i].data.copy_(param_copy[i].data)
        elif not is_validate:
            loss_val.backward()
            if args.gradient_clip:
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.gradient_clip)
            optimizer.step()

        # Update hyperparameters if needed
        global_iteration = start_iteration + batch_idx
        if not is_validate:
            tools.update_hyperparameter_schedule(args, epoch, global_iteration, optimizer)
            loss_labels.append('lr')
            loss_values.append(optimizer.param_groups[0]['lr'])

        loss_labels.append('load')
        loss_values.append(progress.iterable.last_duration)

        # Print out statistics
        statistics.append(loss_values)
        title = '{} Epoch {}'.format('Validating' if is_validate else 'Training', epoch)
        # Some losses report nested label lists; unwrap the first level if so.
        labels = loss_labels[0] if isinstance(loss_labels[0], (list, tuple)) else loss_labels
        progress.set_description(title + ' ' + tools.format_dictionary_of_losses(labels, statistics[-1]))

        if ((global_iteration + 1) % args.log_frequency == 0 and not is_validate) or \
                (is_validate and batch_idx == args.validation_n_batches - 1):
            global_iteration = global_iteration if not is_validate else start_iteration
            logger.add_scalar('batch logs per second',
                              len(statistics) / (progress._time() - last_log_time),
                              global_iteration)
            last_log_time = progress._time()
            all_losses = np.array(statistics)
            for i, key in enumerate(labels):
                logger.add_scalar('average batch ' + str(key), all_losses[:, i].mean(), global_iteration)
                # logger.add_histogram(str(key), all_losses[:, i], global_iteration)

            if is_validate:
                # Log rendered flow, ground truth, and the input frame as images.
                with torch.no_grad():
                    _, output = model(data[0], target[0], inference=True)
                render_flow = output[0].data.cpu().numpy().transpose(1, 2, 0)
                ground_truth = target[0][0].data.cpu().numpy().transpose(1, 2, 0)
                render_img = torch.Tensor(tools.flow_to_image(render_flow).transpose(2, 0, 1)) / 255.0
                true_img = torch.Tensor(tools.flow_to_image(ground_truth).transpose(2, 0, 1)) / 255.0
                input_img = data[0][0, :, 0, :, :].data.cpu() / 255.0
                logger.add_image('renderimg', torchvision.utils.make_grid(render_img), global_iteration)
                logger.add_image('ground_truth', torchvision.utils.make_grid(true_img), global_iteration)
                logger.add_image('input_img', torchvision.utils.make_grid(input_img), global_iteration)

            # Reset summary
            statistics = []

        if is_validate and batch_idx == args.validation_n_batches:
            break
        if (not is_validate) and batch_idx == args.train_n_batches:
            break

    progress.close()
    return total_loss / float(batch_idx + 1), (batch_idx + 1)
def inference(args, epoch, data_loader, model, offset=0):
    model.eval()

    # if args.save_prefeat:
    #     feat_folder = 'prefeat_kitti_odo'
    #     if not os.path.exists(feat_folder): os.mkdir(feat_folder)
    #     feat_folder = 'prefeat_kitti_odo/{}'.format(args.model)
    #     if not os.path.exists(feat_folder): os.mkdir(feat_folder)
    #     for _seq in ['00', '01', '02', '04', '05', '06', '07', '08', '09', '10']:
    #         feat_folder = 'prefeat_kitti_odo/{}/{}'.format(args.model, _seq)
    #         if not os.path.exists(feat_folder): os.mkdir(feat_folder)

    if args.save_flow or args.render_validation:
        flow_folder = "{}/inference/{}.epoch-{}-flow-field".format(
            args.save, args.name.replace('/', '.'), epoch)
        if not os.path.exists(flow_folder):
            os.makedirs(flow_folder)

    # visualization folder
    if args.inference_visualize:
        flow_vis_folder = "{}/inference/{}.epoch-{}-flow-vis".format(
            args.save, args.name.replace('/', '.'), epoch)
        if not os.path.exists(flow_vis_folder):
            os.makedirs(flow_vis_folder)

    args.inference_n_batches = np.inf if args.inference_n_batches < 0 else args.inference_n_batches

    progress = tqdm(data_loader, ncols=100,
                    total=np.minimum(len(data_loader), args.inference_n_batches),
                    desc='Inferencing ', leave=True, position=offset)

    statistics = []
    total_loss = 0
    for batch_idx, (data, target, outname) in enumerate(progress):
        # outname: e.g. [['00-000000_000001']]
        # outname = outname[0][0]           # '00-000000_000001'
        # outseq = outname.split('-')[0]    # '00'
        if args.cuda:
            data = [d.cuda(non_blocking=True) for d in data]
            target = [t.cuda(non_blocking=True) for t in target]
        data, target = [Variable(d) for d in data], [Variable(t) for t in target]

        # When ground-truth flows are not available for the inference dataset,
        # the targets are set to all zeros; the "losses" are then just L1/L2
        # norms of the computed optical flows, depending on the loss norm used.
        with torch.no_grad():
            losses, output, out_feats = model(data[0], target[0], inference=True)

        losses = [torch.mean(loss_value) for loss_value in losses]
        loss_val = losses[0]  # first loss is the one used for weight updates
        total_loss += loss_val.item()
        loss_values = [v.item() for v in losses]

        # Gather loss_labels here; returning them from the model directly hits
        # the recursion limit as DataParallel tries to gather them.
        loss_labels = list(model.module.loss.loss_labels)

        statistics.append(loss_values)

        # if args.save_prefeat:
        #     torch.save(out_feats, 'prefeat_kitti_odo/{}/{}/{}.pt'.format(args.model, outseq, outname))

        if args.save_flow or args.render_validation:
            for i in range(args.inference_batch_size):
                _pflow = output[i].data.cpu().numpy().transpose(1, 2, 0)
                flow_utils.writeFlow(
                    join(flow_folder, '%06d.flo' % (batch_idx * args.inference_batch_size + i)),
                    _pflow)
                # You can comment out the plt block in visulize_flow_file()
                # for real-time visualization.
                if args.inference_visualize:
                    flow_utils.visulize_flow_file(
                        join(flow_folder, '%06d.flo' % (batch_idx * args.inference_batch_size + i)),
                        flow_vis_folder)

        progress.set_description(
            'Inference Averages for Epoch {}: '.format(epoch) +
            tools.format_dictionary_of_losses(loss_labels, np.array(statistics).mean(axis=0)))
        progress.update(1)

        if batch_idx == (args.inference_n_batches - 1):
            break

    progress.close()
    return
def train(args, epoch, start_iteration, data_loader, model, optimizer, logger, is_validate=False, offset=0):
    statistics = []
    total_loss = 0

    if is_validate:
        model.eval()
        title = "Validating Epoch {}".format(epoch)
        args.validation_n_batches = np.inf if args.validation_n_batches < 0 else args.validation_n_batches
        progress = tqdm(
            tools.IteratorTimer(data_loader),
            ncols=100,
            total=np.minimum(len(data_loader), args.validation_n_batches),
            leave=True,
            position=offset,
            desc=title,
        )
    else:
        model.train()
        title = "Training Epoch {}".format(epoch)
        args.train_n_batches = np.inf if args.train_n_batches < 0 else args.train_n_batches
        progress = tqdm(
            tools.IteratorTimer(data_loader),
            ncols=120,
            total=np.minimum(len(data_loader), args.train_n_batches),
            smoothing=0.9,
            miniters=1,
            leave=True,
            position=offset,
            desc=title,
        )

    last_log_time = progress._time()
    for batch_idx, (data, target) in enumerate(progress):
        data, target = [Variable(d) for d in data], [Variable(t) for t in target]
        if args.cuda and args.number_gpus == 1:
            data, target = [d.cuda(non_blocking=True) for d in data], [t.cuda(non_blocking=True) for t in target]

        if not is_validate:
            optimizer.zero_grad()

        losses = model(data[0], target[0])
        losses = [torch.mean(loss_value) for loss_value in losses]
        loss_val = losses[0]  # Collect first loss for weight update
        total_loss += loss_val.item()
        loss_values = [v.item() for v in losses]

        # Gather loss_labels here; returning them from the model directly hits
        # the recursion limit as DataParallel tries to gather them.
        loss_labels = list(model.module.loss.loss_labels)

        assert not np.isnan(total_loss)

        if not is_validate and args.fp16:
            # `param_copy` holds the fp32 master weights (see sketch above).
            loss_val.backward()
            if args.gradient_clip:
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.gradient_clip)
            params = list(model.parameters())
            for i in range(len(params)):
                param_copy[i].grad = params[i].grad.clone().type_as(params[i]).detach()
                param_copy[i].grad.mul_(1.0 / args.loss_scale)
            optimizer.step()
            for i in range(len(params)):
                params[i].data.copy_(param_copy[i].data)
        elif not is_validate:
            loss_val.backward()
            if args.gradient_clip:
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.gradient_clip)
            optimizer.step()

        # Update hyperparameters if needed
        global_iteration = start_iteration + batch_idx
        if not is_validate:
            tools.update_hyperparameter_schedule(args, epoch, global_iteration, optimizer)
            loss_labels.append("lr")
            loss_values.append(optimizer.param_groups[0]["lr"])

        loss_labels.append("load")
        loss_values.append(progress.iterable.last_duration)

        # Print out statistics
        statistics.append(loss_values)
        title = "{} Epoch {}".format("Validating" if is_validate else "Training", epoch)
        progress.set_description(title + " " + tools.format_dictionary_of_losses(loss_labels, statistics[-1]))

        if (((global_iteration + 1) % args.log_frequency) == 0 and not is_validate) or (
            is_validate and batch_idx == args.validation_n_batches - 1
        ):
            global_iteration = global_iteration if not is_validate else start_iteration
            logger.add_scalar(
                "batch logs per second", len(statistics) / (progress._time() - last_log_time), global_iteration
            )
            last_log_time = progress._time()
            all_losses = np.array(statistics)
            for i, key in enumerate(loss_labels):
                logger.add_scalar("average batch " + str(key), all_losses[:, i].mean(), global_iteration)
                logger.add_histogram(str(key), all_losses[:, i], global_iteration)
            # Reset Summary
            statistics = []

        if is_validate and (batch_idx == args.validation_n_batches):
            break
        if (not is_validate) and (batch_idx == args.train_n_batches):
            break

    progress.close()
    return total_loss / float(batch_idx + 1), (batch_idx + 1)
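# Every variant in this file reaches into `model.module.loss.loss_labels`, so
# `model` is assumed to be an nn.DataParallel wrapper around a module that
# computes its own loss. A minimal sketch of that contract (the ModelAndLoss
# name and forward signature follow the calls above, but this is illustrative):

import torch.nn as nn

class ModelAndLoss(nn.Module):
    def __init__(self, flownet, loss):
        super(ModelAndLoss, self).__init__()
        self.model = flownet
        self.loss = loss  # exposes .loss_labels, read via model.module.loss

    def forward(self, data, target, inference=False):
        output = self.model(data)
        losses = self.loss(output, target)
        # DataParallel gathers per-GPU losses; the train loops mean() them.
        return (losses, output) if inference else losses

# model = nn.DataParallel(ModelAndLoss(flownet, loss_fn),
#                         device_ids=list(range(args.number_gpus)))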
def inference(args, epoch, data_loader, model, offset=0):
    # Skip work that has already been done (one result .npy per args.name).
    results_dir = '/usr/xtmp/ct214/daml/vr_sickness/perspective_skyhouse_of_results/'
    if os.path.exists(join(results_dir, args.name + ".npy")) and \
            args.name != "right_eye_theta_0_phi_-82.5":
        print("seen ", join(results_dir, args.name))
        return

    model.eval()
    # Per-frame average (u, v) flow; 1800 frames per sequence are assumed.
    tosave = np.zeros((1800, 2))
    print("args name is", args.name)

    if args.save_flow or args.render_validation:
        flow_folder = "{}/inference/{}.epoch-{}-flow-field".format(
            args.save, args.name.replace('/', '.'), epoch)
        if not os.path.exists(flow_folder):
            os.makedirs(flow_folder)

    # visualization folder
    if args.inference_visualize:
        flow_vis_folder = "{}/inference/{}.epoch-{}-flow-vis".format(
            args.save, args.name.replace('/', '.'), epoch)
        if not os.path.exists(flow_vis_folder):
            os.makedirs(flow_vis_folder)

    args.inference_n_batches = np.inf if args.inference_n_batches < 0 else args.inference_n_batches

    progress = tqdm(data_loader, ncols=100,
                    total=np.minimum(len(data_loader), args.inference_n_batches),
                    desc='Inferencing ', leave=True, position=offset)

    statistics = []
    total_loss = 0
    for batch_idx, (data, target) in enumerate(progress):
        if args.cuda:
            data = [d.cuda(non_blocking=True) for d in data]
            target = [t.cuda(non_blocking=True) for t in target]
        data, target = [Variable(d) for d in data], [Variable(t) for t in target]

        # When ground-truth flows are not available for the inference dataset,
        # the targets are set to all zeros; the "losses" are then just L1/L2
        # norms of the computed optical flows, depending on the loss norm used.
        with torch.no_grad():
            losses, output = model(data[0], target[0], inference=True)

        losses = [torch.mean(loss_value) for loss_value in losses]
        loss_val = losses[0]  # first loss is the one used for weight updates
        total_loss += loss_val.item()
        loss_values = [v.item() for v in losses]

        # Gather loss_labels here; returning them from the model directly hits
        # the recursion limit as DataParallel tries to gather them.
        loss_labels = list(model.module.loss.loss_labels)

        statistics.append(loss_values)

        if args.save_flow or args.render_validation:
            for i in range(args.inference_batch_size):
                _pflow = output[i].data.cpu().numpy().transpose(1, 2, 0)
                # Store only the spatial average of each predicted flow field.
                avg_pflow = np.average(_pflow, axis=(0, 1))
                tosave[batch_idx * args.inference_batch_size + i] = avg_pflow
                # flow_utils.writeFlow(join(flow_folder, '%06d.flo' % (batch_idx * args.inference_batch_size + i)), _pflow)

        progress.set_description(
            'Inference Averages for Epoch {}: '.format(epoch) +
            tools.format_dictionary_of_losses(loss_labels, np.array(statistics).mean(axis=0)))
        progress.update(1)

        if batch_idx == (args.inference_n_batches - 1):
            break

    progress.close()
    np.save(join(results_dir, args.name), tosave)
    return
def inference(args, epoch, data_loader, model, offset=0):
    # Local imports hoisted out of the loop body.
    import json
    from PIL import Image

    model.eval()

    if args.save_flow or args.render_validation:
        flow_folder = "{}/inference/{}.epoch-{}-flow-field".format(
            args.save, args.name.replace('/', '.'), epoch)
        if not os.path.exists(flow_folder):
            os.makedirs(flow_folder)

    # visualization folder
    if args.inference_visualize:
        flow_vis_folder = "{}/inference/{}.epoch-{}-flow-vis".format(
            args.save, args.name.replace('/', '.'), epoch)
        if not os.path.exists(flow_vis_folder):
            os.makedirs(flow_vis_folder)

    if args.save_frames or args.save_inferenceLog:
        inference_folder = "{}/{}.epoch-{}".format(
            args.save, args.name.replace('/', '.'), epoch)
        if not os.path.exists(inference_folder):
            os.makedirs(inference_folder)

    args.inference_n_batches = np.inf if args.inference_n_batches < 0 else args.inference_n_batches

    progress = tqdm(data_loader, ncols=100,
                    total=np.minimum(len(data_loader), args.inference_n_batches),
                    desc='Inferencing ', leave=True, position=offset)

    print('[LOG] We assume that the "inference_batch_size" arg is always 1')
    if data_loader.dataset.ref_names is None:
        f_names = [f'{f_idx:06d}.png' for f_idx in range(len(data_loader))]
    else:
        f_names = data_loader.dataset.ref_names

    if args.save_inferenceLog:
        log_labels = ['filename'] + list(model.module.loss.loss_labels)
        log_dict = {l: {} for l in log_labels}
        for i in range(len(data_loader)):
            log_dict['filename'][i] = f_names[i]

    statistics = []
    total_loss = 0
    for batch_idx, (data, target) in enumerate(progress):
        if args.cuda:
            data = [d.cuda(non_blocking=True) for d in data]
            target = [t.cuda(non_blocking=True) for t in target]
        data, target = [Variable(d) for d in data], [Variable(t) for t in target]

        # When ground-truth flows are not available for the inference dataset,
        # the targets are set to all zeros; the "losses" are then just L1/L2
        # norms of the computed optical flows, depending on the loss norm used.
        with torch.no_grad():
            pred_losses, output = model(data[0], target[0], inference=True)

        losses = [torch.mean(loss_value) for loss_value in pred_losses]
        loss_val = losses[0]  # first loss is the one used for weight updates
        total_loss += loss_val.item()
        loss_values = [v.item() for v in losses]

        # Gather loss_labels here; returning them from the model directly hits
        # the recursion limit as DataParallel tries to gather them.
        loss_labels = list(model.module.loss.loss_labels)

        statistics.append(loss_values)

        if args.save_flow or args.render_validation:
            for i in range(args.inference_batch_size):
                _pflow = output[i].data.cpu().numpy().transpose(1, 2, 0)
                flow_utils.writeFlow(
                    join(flow_folder, '%06d.flo' % (batch_idx * args.inference_batch_size + i)),
                    _pflow)
                # You can comment out the plt block in visulize_flow_file()
                # for real-time visualization.
                if args.inference_visualize:
                    flow_utils.visulize_flow_file(
                        join(flow_folder, '%06d.flo' % (batch_idx * args.inference_batch_size + i)),
                        flow_vis_folder)

        if args.save_frames:
            _pframe = output[0].data.cpu().numpy().transpose(1, 2, 0)
            _pframe = _pframe.clip(min=0, max=255).astype(np.uint8)
            f_name = f_names[batch_idx]
            Image.fromarray(_pframe).save(f'{inference_folder}/{f_name}')

        if args.save_inferenceLog:
            for label, loss in zip(loss_labels, pred_losses):
                log_dict[label][batch_idx] = str(loss.cpu().numpy())

        progress.set_description(
            'Inference Averages for Epoch {}: '.format(epoch) +
            tools.format_dictionary_of_losses(loss_labels, np.array(statistics).mean(axis=0)))
        progress.update(1)

        if batch_idx == (args.inference_n_batches - 1):
            break

    progress.close()

    if args.save_inferenceLog:
        with open(f'{inference_folder}/log.json', 'w') as fp:
            json.dump(log_dict, fp, sort_keys=True, indent=4)
    return
def inference(args, epoch, data_loader, model, offset=0):
    model.eval()

    if args.save_flow or args.render_validation:
        flow_folder = "{}/inference/{}.epoch-{}-flow-field".format(
            args.save, args.name.replace('/', '.'), epoch)
        if not os.path.exists(flow_folder):
            os.makedirs(flow_folder)

    # visualization folder
    if args.inference_visualize:
        flow_vis_folder = "{}/inference/{}.epoch-{}-flow-vis".format(
            args.save, args.name.replace('/', '.'), epoch)
        if not os.path.exists(flow_vis_folder):
            os.makedirs(flow_vis_folder)

    args.inference_n_batches = np.inf if args.inference_n_batches < 0 else args.inference_n_batches

    progress = tqdm(data_loader, ncols=200,
                    total=np.minimum(len(data_loader), args.inference_n_batches),
                    desc='Inferencing ', leave=True, position=offset)

    statistics = []
    total_loss = 0
    for batch_idx, (data, target) in enumerate(progress):
        if args.cuda:
            data = [d.cuda(non_blocking=True) for d in data]
            target = [t.cuda(non_blocking=True) for t in target]
        data, target = [Variable(d) for d in data], [Variable(t) for t in target]

        # When ground-truth flows are not available for the inference dataset,
        # the targets are set to all zeros; the "losses" are then just L1/L2
        # norms of the computed optical flows, depending on the loss norm used.
        with torch.no_grad():
            losses, output = model(data[0], target[0], inference=True)

        losses = [torch.mean(loss_value) for loss_value in losses]
        loss_val = losses[0]  # first loss is the one used for weight updates
        total_loss += loss_val.item()
        loss_values = [v.item() for v in losses]

        # Gather loss_labels here; returning them from the model directly hits
        # the recursion limit as DataParallel tries to gather them.
        loss_labels = list(model.module.loss.loss_labels)

        statistics.append(loss_values)

        if args.save_flow or args.render_validation:
            for i in range(args.inference_batch_size):
                # Predictions hold several stacked flows: channels (2k, 2k+1)
                # are the k-th (u, v) pair.
                _pflow_all = output[i].data.cpu().numpy().transpose(1, 2, 0)
                _tflow_all = target[0][i].data.cpu().numpy()
                if len(_tflow_all.shape) == 4:
                    _tflow_all = _tflow_all.transpose(1, 2, 3, 0)
                elif len(_tflow_all.shape) == 3:
                    _tflow_all = _tflow_all.transpose(1, 2, 0)
                else:
                    raise ValueError('Unsupported dimensions of _tflow_all')

                for j in range(0, output.shape[1], 2):
                    _pflow = _pflow_all[:, :, j:j + 2]
                    if len(_tflow_all.shape) == 4:
                        _tflow = _tflow_all[int(j / 2), :, :, :]
                    elif len(_tflow_all.shape) == 3:
                        _tflow = _tflow_all
                    else:
                        raise ValueError('Unsupported dimensions of _tflow_all')

                    flow_filename_base = '%06d_%06d' % (
                        batch_idx * args.inference_batch_size + i, int(j / 2))
                    flow_utils.writeFlow(join(flow_folder, flow_filename_base) + '.flo', _pflow)
                    flow_utils.writeFlow(join(flow_folder, flow_filename_base + '_target.flo'), _tflow)

                    if args.inference_visualize:
                        results_image = visualize_results(_pflow, _tflow, data[0][i])
                        cv2.imwrite(join(flow_vis_folder, flow_filename_base + '_vis.png'),
                                    cv2.cvtColor(results_image, cv2.COLOR_RGB2BGR))

        progress.set_description(
            'Inference Averages for Epoch {}: '.format(epoch) +
            tools.format_dictionary_of_losses(loss_labels, np.array(statistics).mean(axis=0)))
        progress.update(1)

        if batch_idx == (args.inference_n_batches - 1):
            break

    progress.close()
    return
def train(args, epoch, start_iteration, data_loader, model, optimizer, logger,
          is_validate=False, offset=0):
    statistics = []
    all_gradient_norms = []
    total_loss = 0

    if is_validate:
        model.eval()
        title = 'Validating Epoch {}'.format(epoch)
        args.validation_n_batches = np.inf if args.validation_n_batches < 0 else args.validation_n_batches
        progress = tqdm(tools.IteratorTimer(data_loader), ncols=200,
                        total=np.minimum(len(data_loader), args.validation_n_batches),
                        leave=True, position=offset, desc=title)
    else:
        model.train()
        title = 'Training Epoch {}'.format(epoch)
        args.train_n_batches = np.inf if args.train_n_batches < 0 else args.train_n_batches
        progress = tqdm(tools.IteratorTimer(data_loader), ncols=200,
                        total=np.minimum(len(data_loader), args.train_n_batches),
                        smoothing=.9, miniters=1, leave=True, position=offset, desc=title)

    last_log_time = progress._time()
    for batch_idx, (data, target) in enumerate(progress):
        data, target = [Variable(d) for d in data], [Variable(t) for t in target]
        if args.cuda and args.number_gpus == 1:
            data = [d.cuda(non_blocking=True) for d in data]
            target = [t.cuda(non_blocking=True) for t in target]

        if not is_validate:
            optimizer.zero_grad()

        losses, flow = model(data[0], target[0])
        losses = [torch.mean(loss_value) for loss_value in losses]
        loss_val = losses[0]  # Collect first loss for weight update
        total_loss += loss_val.item()
        loss_values = [v.item() for v in losses]
        loss_labels = list(model.module.loss.loss_labels)

        assert not np.isnan(total_loss)

        if not is_validate and args.fp16:
            # `param_copy` holds the fp32 master weights (see sketch above).
            loss_val.backward()
            if args.gradient_clip:
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.gradient_clip)
            params = list(model.parameters())
            for i in range(len(params)):
                param_copy[i].grad = params[i].grad.clone().type_as(params[i]).detach()
                param_copy[i].grad.mul_(1. / args.loss_scale)
            optimizer.step()
            for i in range(len(params)):
                params[i].data.copy_(param_copy[i].data)
        elif not is_validate:
            loss_val.backward()
            if args.gradient_clip:
                # clip_grad_norm_ returns the pre-clip total norm; track it.
                gradient_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), args.gradient_clip)
                all_gradient_norms.append(gradient_norm)
            optimizer.step()

        # Update hyperparameters if needed
        global_iteration = start_iteration + batch_idx
        if not is_validate:
            tools.update_hyperparameter_schedule(args, epoch, global_iteration, optimizer)
            loss_labels.append('lr')
            loss_values.append(optimizer.param_groups[0]['lr'])

        loss_labels.append('load')
        loss_values.append(progress.iterable.last_duration)

        # Print out statistics
        statistics.append(loss_values)
        title = '{} Epoch {}'.format('Validating' if is_validate else 'Training', epoch)
        progress.set_description(title + ' ' + tools.format_dictionary_of_losses(loss_labels, statistics[-1]))

        if ((global_iteration + 1) % args.log_frequency == 0 and not is_validate) or \
                (is_validate and batch_idx == args.validation_n_batches - 1):
            global_iteration = global_iteration if not is_validate else start_iteration
            logger.add_scalar('batch logs per second',
                              len(statistics) / (progress._time() - last_log_time),
                              global_iteration)
            last_log_time = progress._time()
            all_losses = np.array(statistics)
            for i, key in enumerate(loss_labels):
                logger.add_scalar('average batch ' + str(key), all_losses[:, i].mean(), global_iteration)
                logger.add_histogram(str(key), all_losses[:, i], global_iteration)
            if args.gradient_clip:
                logger.add_scalar('average batch gradient_norm',
                                  np.array(all_gradient_norms).mean(), global_iteration)
                all_gradient_norms = []

            # The model returns multiscale flow; take the largest scale and the
            # first element of the batch for visualization.
            if args.multiframe or args.multiframe_two_output:
                flow = flow_utils.flow_postprocess(flow)[0][0]
                num_flows = len(args.frame_weights)
                flows_scaled = [cv2.resize(flow[:, :, i:i + 2], None, fx=4.0, fy=4.0)
                                for i in range(0, 2 * num_flows, 2)]
                target = target[0].detach().cpu().numpy()
                target_flow = np.transpose(target[0], (1, 2, 3, 0))
                results_images = [visualize_results(flows_scaled[i], target_flow[i],
                                                    data[0][0] if i == 0 else None)
                                  for i in range(0, num_flows)]
                for i in range(0, num_flows):
                    logger.add_image('flow{} and target'.format(i),
                                     ToTensor()(results_images[i]), global_iteration)
            else:
                flow = flow_utils.flow_postprocess(flow)[0][0]
                flow_scaled = cv2.resize(flow, None, fx=4.0, fy=4.0)
                target_flow = flow_utils.flow_postprocess(target)[0][0]
                results_image = visualize_results(flow_scaled, target_flow, data[0][0])
                logger.add_image('flow and target', ToTensor()(results_image), global_iteration)
            # logger.add_histogram('flow_values', flow[0], global_iteration)

            # Reset summary
            statistics = []

        if is_validate and batch_idx == args.validation_n_batches:
            break
        if (not is_validate) and batch_idx == args.train_n_batches:
            break

    progress.close()
    return total_loss / float(batch_idx + 1), (batch_idx + 1)
def train(args, epoch, start_iteration, data_loader, model, optimizer, logger,
          is_validate=False, offset=0):
    statistics = []
    total_loss = 0

    if is_validate:
        model.eval()
        title = 'Validating Epoch {}'.format(epoch)
        args.validation_n_batches = np.inf if args.validation_n_batches < 0 else args.validation_n_batches
        progress = tqdm(tools.IteratorTimer(data_loader), ncols=100,
                        total=np.minimum(len(data_loader), args.validation_n_batches),
                        leave=True, position=offset, desc=title)
    else:
        model.train()
        title = 'Training Epoch {}'.format(epoch)
        args.train_n_batches = np.inf if args.train_n_batches < 0 else args.train_n_batches
        progress = tqdm(tools.IteratorTimer(data_loader), ncols=120,
                        total=np.minimum(len(data_loader), args.train_n_batches),
                        smoothing=.9, miniters=1, leave=True, position=offset, desc=title)

    last_log_time = progress._time()
    for batch_idx, (data, target) in enumerate(progress):
        data, target = [Variable(d) for d in data], [Variable(t) for t in target]
        if args.cuda and args.number_gpus == 1:
            data = [d.cuda(non_blocking=True) for d in data]
            target = [t.cuda(non_blocking=True) for t in target]

        if not is_validate:
            optimizer.zero_grad()

        losses = model(data[0], target[0])
        losses = [torch.mean(loss_value) for loss_value in losses]  # mean over the batch
        loss_val = losses[0]  # first loss drives the weight update; the second is EPE
        total_loss += loss_val.item()
        loss_values = [v.item() for v in losses]

        # Gather loss_labels here; returning them from the model directly hits
        # the recursion limit as DataParallel tries to gather them.
        loss_labels = list(model.module.loss.loss_labels)

        assert not np.isnan(total_loss)

        if not is_validate and args.fp16:
            # `param_copy` holds the fp32 master weights (see sketch above).
            loss_val.backward()
            if args.gradient_clip:
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.gradient_clip)
            params = list(model.parameters())
            for i in range(len(params)):
                param_copy[i].grad = params[i].grad.clone().type_as(params[i]).detach()
                param_copy[i].grad.mul_(1. / args.loss_scale)
            optimizer.step()
            for i in range(len(params)):
                params[i].data.copy_(param_copy[i].data)
        elif not is_validate:
            loss_val.backward()
            if args.gradient_clip:
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.gradient_clip)
            optimizer.step()

        # Update hyperparameters if needed
        global_iteration = start_iteration + batch_idx
        if not is_validate:
            tools.update_hyperparameter_schedule(args, epoch, global_iteration, optimizer)
            loss_labels.append('lr')
            loss_values.append(optimizer.param_groups[0]['lr'])

        loss_labels.append('load')
        loss_values.append(progress.iterable.last_duration)

        # Print out statistics
        statistics.append(loss_values)
        title = '{} Epoch {}'.format('Validating' if is_validate else 'Training', epoch)
        progress.set_description(title + ' ' + tools.format_dictionary_of_losses(loss_labels, statistics[-1]))

        # args.log_frequency == 1 by default. For validation, log on the last
        # batch even when the loader is shorter than validation_n_batches.
        if ((global_iteration + 1) % args.log_frequency == 0 and not is_validate) or \
                (is_validate and batch_idx == min(args.validation_n_batches, len(data_loader) - 1)):
            global_iteration = global_iteration if not is_validate else start_iteration
            logger.add_scalar('batch logs per second',
                              len(statistics) / (progress._time() - last_log_time),
                              global_iteration)
            last_log_time = progress._time()
            all_losses = np.array(statistics)
            for i, key in enumerate(loss_labels):
                logger.add_scalar('average batch ' + str(key), all_losses[:, i].mean(), global_iteration)
                logger.add_histogram(str(key), all_losses[:, i], global_iteration)
            # Reset summary
            statistics = []

        if is_validate and batch_idx == args.validation_n_batches:
            break
        if (not is_validate) and batch_idx == args.train_n_batches:
            break

    progress.close()
    return total_loss / float(batch_idx + 1), (batch_idx + 1)
def train(args, epoch, start_iteration, data_loader, model, optimizer, logger,
          is_validate=False, offset=0):
    statistics = []
    total_loss = 0
    debug = False

    if is_validate:
        model.eval()
        title = 'Validating Epoch {}'.format(epoch)
        args.validation_n_batches = np.inf if args.validation_n_batches < 0 else args.validation_n_batches
        progress = tqdm(tools.IteratorTimer(data_loader), ncols=100,
                        total=np.minimum(len(data_loader), args.validation_n_batches),
                        leave=True, position=offset, desc=title)
    else:
        model.train()
        title = 'Training Epoch {}'.format(epoch)
        args.train_n_batches = np.inf if args.train_n_batches < 0 else args.train_n_batches
        progress = tqdm(tools.IteratorTimer(data_loader), ncols=120,
                        total=np.minimum(len(data_loader), args.train_n_batches),
                        smoothing=.9, miniters=1, leave=True, position=offset, desc=title)

    last_log_time = progress._time()
    # Each batch carries an extra `cdm` tensor alongside data and target.
    for batch_idx, (data, target, cdm) in enumerate(progress):
        data = [Variable(d) for d in data]
        target = [Variable(t) for t in target]
        cdm = [Variable(q) for q in cdm]
        if args.cuda and args.number_gpus == 1:
            data = [d.cuda(non_blocking=True) for d in data]
            target = [t.cuda(non_blocking=True) for t in target]
            cdm = [q.cuda(non_blocking=True) for q in cdm]

        if debug:
            print('data_0:', data[0])
            print('target_0:', target[0])
            print('cdm:', type(cdm))
            temp1 = cdm[0].data.cpu().numpy()
            print(np.max(temp1), temp1.shape)

        if not is_validate:
            optimizer.zero_grad()

        with torch.set_grad_enabled(not is_validate):
            losses = model(data[0], target[0])
        losses = [torch.mean(loss_value) for loss_value in losses]
        loss_val = losses[0]  # Collect first loss for weight update
        total_loss += loss_val.item()
        loss_values = [v.item() for v in losses]

        # Gather loss_labels here; returning them from the model directly hits
        # the recursion limit as DataParallel tries to gather them.
        loss_labels = list(model.module.loss.loss_labels)

        assert not np.isnan(total_loss)

        if not is_validate and args.fp16:
            # `param_copy` holds the fp32 master weights (see sketch above).
            loss_val.backward()
            if args.gradient_clip:
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.gradient_clip)
            params = list(model.parameters())
            for i in range(len(params)):
                param_copy[i].grad = params[i].grad.clone().type_as(params[i]).detach()
                param_copy[i].grad.mul_(1. / args.loss_scale)
            optimizer.step()
            for i in range(len(params)):
                params[i].data.copy_(param_copy[i].data)
        elif not is_validate:
            loss_val.backward()
            if args.gradient_clip:
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.gradient_clip)
            optimizer.step()

        # Update hyperparameters if needed
        global_iteration = start_iteration + batch_idx
        if not is_validate:
            tools.update_hyperparameter_schedule(args, epoch, global_iteration, optimizer)
            loss_labels.append('lr')
            loss_values.append(optimizer.param_groups[0]['lr'])

        loss_labels.append('load')
        loss_values.append(progress.iterable.last_duration)

        # Print out statistics
        statistics.append(loss_values)
        title = '{} Epoch {}'.format('Validating' if is_validate else 'Training', epoch)
        progress.set_description(title + ' ' + tools.format_dictionary_of_losses(loss_labels, statistics[-1]))

        if ((global_iteration + 1) % args.log_frequency == 0 and not is_validate) or \
                (is_validate and batch_idx == args.validation_n_batches - 1):
            global_iteration = global_iteration if not is_validate else start_iteration
            logger.add_scalar('batch logs per second',
                              len(statistics) / (progress._time() - last_log_time),
                              global_iteration)
            last_log_time = progress._time()
            all_losses = np.array(statistics)
            for i, key in enumerate(loss_labels):
                logger.add_scalar('average batch ' + str(key), all_losses[:, i].mean(), global_iteration)
                logger.add_histogram(str(key), all_losses[:, i], global_iteration)
            # Reset summary
            statistics = []

        if is_validate and batch_idx == args.validation_n_batches:
            break
        if (not is_validate) and batch_idx == args.train_n_batches:
            break

    progress.close()
    return total_loss / float(batch_idx + 1), (batch_idx + 1)
def train(args, epoch, start_iteration, data_loader, model, optimizer, scheduler,
          logger, is_validate=False, offset=0, max_flows_to_show=8):
    running_statistics = None  # Initialized when the first losses are collected
    all_losses = None          # Initialized when the first losses are collected
    total_loss = 0

    if is_validate:
        model.eval()
        title = 'Validating Epoch {}'.format(epoch)
        args.validation_n_batches = np.inf if args.validation_n_batches < 0 else args.validation_n_batches
        progress = tqdm(tools.IteratorTimer(data_loader), ncols=100,
                        total=np.minimum(len(data_loader), args.validation_n_batches),
                        leave=True, position=offset, desc=title)
    else:
        model.train()
        title = 'Training Epoch {}'.format(epoch)
        args.train_n_batches = np.inf if args.train_n_batches < 0 else args.train_n_batches
        progress = tqdm(tools.IteratorTimer(data_loader), ncols=120,
                        total=np.minimum(len(data_loader), args.train_n_batches),
                        smoothing=.9, miniters=1, leave=True, position=offset, desc=title)

    def convert_flow_to_image(flow_converter, flows_viz):
        # For each (target, prediction) pair: color-code both flows, then add a
        # max-normalized EPE image and an EPE image saturated at 5 px.
        imgs = []
        for flow_pair in flows_viz:
            for flow in flow_pair:
                flow = flow.numpy().transpose((1, 2, 0))
                img = flow_converter._flowToColor(flow)
                imgs.append(torch.from_numpy(img.transpose((2, 0, 1))))
            epe_img = torch.sqrt(torch.sum(torch.pow(flow_pair[0] - flow_pair[1], 2), dim=0))
            max_epe = torch.max(epe_img)
            if max_epe == 0:
                max_epe = torch.ones(1)
            normalized_epe_img = (255 * (epe_img / max_epe)).type(torch.uint8)
            imgs.append(torch.stack((normalized_epe_img,) * 3, dim=0))
            saturated_epe_img = torch.min(epe_img, 5.0 * torch.ones_like(epe_img))
            saturated_epe_img = (51 * saturated_epe_img).type(torch.uint8)
            imgs.append(torch.stack((saturated_epe_img,) * 3, dim=0))
        return imgs

    max_iters = min(
        len(data_loader),
        args.validation_n_batches if (is_validate and args.validation_n_batches > 0) else len(data_loader),
        args.train_n_batches if (not is_validate and args.train_n_batches > 0) else len(data_loader))

    if is_validate:
        flow_converter = f2i.Flow()
        collect_flow_interval = int(np.ceil(float(max_iters) / max_flows_to_show))
        flows_viz = []

    last_log_batch_idx = 0
    last_log_time = progress._time()
    for batch_idx, (data, target) in enumerate(progress):
        global_iteration = start_iteration + batch_idx

        data, target = [Variable(d) for d in data], [Variable(t) for t in target]
        if args.cuda and args.number_gpus == 1:
            data, target = [d.cuda() for d in data], [t.cuda() for t in target]

        if not is_validate:
            optimizer.zero_grad()

        losses, output = model(data[0], target[0], inference=True)
        losses = [torch.mean(loss_value) for loss_value in losses]
        loss_val = losses[0]  # Collect first loss for weight update
        total_loss += loss_val.item()
        loss_values = [v.item() for v in losses]

        if is_validate and batch_idx % collect_flow_interval == 0:
            flows_viz.append((target[0][0].detach().cpu(), output[0].detach().cpu()))
        if is_validate and args.validation_log_images and batch_idx == (max_iters - 1):
            imgs = convert_flow_to_image(flow_converter, flows_viz)
            imgs = torchvision_utils.make_grid(imgs, nrow=4, normalize=False, scale_each=False)
            logger.add_image('target/predicted flows', imgs, global_iteration)

        # Gather loss_labels here; returning them from the model directly hits
        # the recursion limit as DataParallel tries to gather them.
        loss_labels = list(model.module.loss.loss_labels)

        assert not np.isnan(total_loss)

        if not is_validate and args.fp16:
            # `param_copy` holds the fp32 master weights (see sketch above).
            loss_val.backward()
            if args.gradient_clip:
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.gradient_clip)
            params = list(model.parameters())
            for i in range(len(params)):
                param_copy[i].grad = params[i].grad.clone().type_as(params[i]).detach()
                param_copy[i].grad.mul_(1. / args.loss_scale)
            optimizer.step()
            for i in range(len(params)):
                params[i].data.copy_(param_copy[i].data)
        elif not is_validate:
            loss_val.backward()
            if args.gradient_clip:
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.gradient_clip)
            optimizer.step()

        # Update hyperparameters if needed
        if not is_validate:
            scheduler.step()
            loss_labels.append('lr')
            loss_values.append(optimizer.param_groups[0]['lr'])

        loss_labels.append('load')
        loss_values.append(progress.iterable.last_duration)

        # Keep running statistics instead of a growing per-batch list.
        if running_statistics is None:
            running_statistics = np.array(loss_values)
            all_losses = np.zeros((len(data_loader), len(loss_values)), np.float32)
        else:
            running_statistics += np.array(loss_values)
        all_losses[batch_idx] = loss_values.copy()

        title = '{} Epoch {}'.format('Validating' if is_validate else 'Training', epoch)
        progress.set_description(title + ' ' + tools.format_dictionary_of_losses(
            loss_labels, running_statistics / (batch_idx + 1)))

        if ((global_iteration + 1) % args.log_frequency == 0 and not is_validate) or \
                (batch_idx == max_iters - 1):
            global_iteration = global_iteration if not is_validate else start_iteration
            logger.add_scalar('batch logs per second',
                              (batch_idx - last_log_batch_idx) / (progress._time() - last_log_time),
                              global_iteration)
            last_log_time = progress._time()
            last_log_batch_idx = batch_idx
            for i, key in enumerate(loss_labels):
                logger.add_scalar('average batch ' + str(key),
                                  all_losses[:batch_idx + 1, i].mean(), global_iteration)
                logger.add_histogram(str(key), all_losses[:batch_idx + 1, i], global_iteration)

        if is_validate and batch_idx == args.validation_n_batches:
            break
        if (not is_validate) and batch_idx == args.train_n_batches:
            break

    progress.close()
    return total_loss / float(batch_idx + 1), (batch_idx + 1)
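# For completeness, a hedged sketch of a driver that wires one of the
# scheduler-free train() variants above to its validation pass. The flag names
# (total_epochs, validation_frequency) and the SummaryWriter logger are
# assumptions, not this file's actual entry point:

def main_loop(args, model, optimizer, train_loader, val_loader):
    from torch.utils.tensorboard import SummaryWriter
    logger = SummaryWriter(log_dir=args.save)

    best_err, global_iteration = float('inf'), 0
    for epoch in range(1, args.total_epochs + 1):
        _, n_batches = train(args, epoch, global_iteration, train_loader,
                             model, optimizer, logger)
        global_iteration += n_batches
        if epoch % args.validation_frequency == 0:
            val_loss, _ = train(args, epoch, global_iteration, val_loader,
                                model, optimizer, logger, is_validate=True)
            best_err = min(best_err, val_loss)
    logger.close()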