def main(args):
    if args.rank == 0:
        log.basicConfig(level=log.INFO)
        writer = SummaryWriter()
        writer.add_text('config', str(args))
    else:
        log.basicConfig(level=log.WARNING)
        writer = None

    torch.cuda.set_device(args.rank % args.world_size)
    torch.manual_seed(args.seed + args.rank)
    torch.cuda.manual_seed(args.seed + args.rank)
    torch.backends.cudnn.benchmark = True

    if args.world_size > 1:
        log.info('Initializing process group')
        dist.init_process_group(
            backend='nccl',
            init_method='tcp://' + args.ip + ':3567',
            world_size=args.world_size,
            rank=args.rank)
        log.info('Process group initialized')

    log.info('Initializing ' + args.loader + ' training dataloader...')
    train_loader, train_batches, sampler = get_loader(args, 'train')
    samples_per_epoch = train_batches * args.batchsize
    log.info('Dataloader initialized')

    model = VSRNet(args.frames, args.flownet_path, args.fp16)
    if args.fp16:
        network_to_half(model)
    model.cuda()
    model.train()

    # The optical-flow sub-network stays frozen during training.
    for param in model.FlowNetSD_network.parameters():
        param.requires_grad = False
    model_params = [p for p in model.parameters() if p.requires_grad]

    # lr=1 so that the LambdaLR multiplier below is the effective learning rate.
    optimizer = optim.Adam(model_params, lr=1, weight_decay=args.weight_decay)
    stepsize = 2 * train_batches
    clr_lambda = cyclic_learning_rate(args.min_lr, args.max_lr, stepsize)
    scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=[clr_lambda])

    if args.fp16:
        optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)

    if args.world_size > 1:
        model = DistributedDataParallel(model)

    # TRAINING
    total_iter = 0
    while total_iter * args.world_size < args.max_iter:
        epoch = floor(total_iter / train_batches)

        # Only needed when using the DistributedSampler.
        if args.world_size > 1 and args.loader == 'pytorch':
            sampler.set_epoch(epoch)

        model.train()
        total_epoch_loss = 0.0

        sample_timer = 0.0
        data_timer = 0.0
        compute_timer = 0.0

        iter_start = time.perf_counter()

        training_data_times = []
        training_start = datetime.datetime.now()

        # TRAINING EPOCH LOOP
        for i, inputs in enumerate(train_loader):
            training_stop = datetime.datetime.now()
            dataloading_time = training_stop - training_start
            training_data_times.append(dataloading_time.total_seconds() * 1000.0)

            if args.loader == 'DALI':
                inputs = inputs[0]["data"]
                # Probably not needed: DALI already returns GPU tensors.
                inputs = inputs.cuda(non_blocking=True)
            else:
                inputs = inputs.cuda(non_blocking=True)
            if args.fp16:
                inputs = inputs.half()

            if args.timing:
                torch.cuda.synchronize()
                data_end = time.perf_counter()

            optimizer.zero_grad()

            im_out = total_iter % args.image_freq == 0
            # writer.add_graph(model, inputs)
            loss = model(Variable(inputs), i, writer, im_out)
            total_epoch_loss += loss.item()

            if args.fp16:
                optimizer.backward(loss)
            else:
                loss.backward()
            optimizer.step()
            scheduler.step()

            if args.rank == 0:
                if args.timing:
                    torch.cuda.synchronize()
                    iter_end = time.perf_counter()
                    sample_timer += (iter_end - iter_start)
                    data_timer += (data_end - iter_start)
                    compute_timer += (iter_end - data_end)
                    torch.cuda.synchronize()
                    iter_start = time.perf_counter()
                writer.add_scalar('learning_rate', scheduler.get_lr()[0], total_iter)
                writer.add_scalar('train_loss', loss.item(), total_iter)
                log.info('Rank %d, Epoch %d, Iteration %d of %d, loss %.5f' %
                         (args.rank, epoch, i + 1, train_batches, loss.item()))
                if total_iter % 100 == 0:
                    print("Avg dataloading time: " +
                          str(sum(training_data_times) / len(training_data_times)) + "ms")

            total_iter += 1
            if total_iter > args.max_iter:
                break
            training_start = datetime.datetime.now()

        if args.rank == 0:
            if args.timing:
                sample_timer_avg = sample_timer / samples_per_epoch
                writer.add_scalar('sample_time', sample_timer_avg, total_iter)
                data_timer_avg = data_timer / samples_per_epoch
                writer.add_scalar('sample_data_time', data_timer_avg, total_iter)
                compute_timer_avg = compute_timer / samples_per_epoch
                writer.add_scalar('sample_compute_time', compute_timer_avg, total_iter)
            epoch_loss_avg = total_epoch_loss / train_batches
            log.info('Rank %d, epoch %d: %.5f' % (args.rank, epoch, epoch_loss_avg))

        ### VALIDATION
        log.info('Initializing ' + args.loader + ' validation dataloader...')
        val_loader, val_batches, sampler = get_loader(args, 'val')
        model.eval()
        total_loss = 0
        total_psnr = 0
        for i, inputs in enumerate(val_loader):
            if args.loader == 'DALI':
                inputs = inputs[0]["data"]
                # Probably not needed: DALI already returns GPU tensors.
                inputs = inputs.cuda(non_blocking=True)
            else:
                inputs = inputs.cuda(non_blocking=True)
            if args.fp16:
                inputs = inputs.half()
            log.info('Validation it %d of %d' % (i + 1, val_batches))
            loss, psnr = model(Variable(inputs), i, None)
            total_loss += loss.item()
            total_psnr += psnr.item()
        # Average over the number of validation batches actually processed.
        loss = total_loss / (i + 1)
        psnr = total_psnr / (i + 1)
        if args.rank == 0:
            writer.add_scalar('val_loss', loss, total_iter)
            writer.add_scalar('val_psnr', psnr, total_iter)
            log.info('Rank %d validation loss %.5f' % (args.rank, loss))
            log.info('Rank %d validation psnr %.5f' % (args.rank, psnr))
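
# The cyclic_learning_rate helper used above is defined elsewhere in the repo
# and is not shown here. As a hedged sketch only, a triangular cyclic schedule
# (Smith, 2017) that fits the LambdaLR usage above could look like the function
# below: because the optimizer is created with lr=1, the value returned by the
# lambda is the effective learning rate for that iteration. The name and the
# exact shape of the cycle are assumptions, not the repository's implementation.
def cyclic_learning_rate_sketch(min_lr, max_lr, stepsize):
    def lr_lambda(iteration):
        # Position within the current cycle: the LR ramps linearly from min_lr
        # up to max_lr over `stepsize` iterations, then back down.
        cycle = floor(1 + iteration / (2 * stepsize))
        x = abs(iteration / stepsize - 2 * cycle + 1)
        return min_lr + (max_lr - min_lr) * max(0.0, 1.0 - x)
    return lr_lambda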
def main(args):
    if args.rank == 0:
        log.basicConfig(level=log.INFO)
        writer = SummaryWriter()
        writer.add_text('config', str(args))
    else:
        log.basicConfig(level=log.WARNING)
        writer = None

    # Leftover DALI pipeline debugging code:
    # pipe = SimplePipeline(batch_size, 1, 0)
    # pipe.build()
    # pipe_out = pipe.run()
    # print(pipe_out)
    # print(pipe_out[0].at(0).as_shape())
    # for i in range(3):
    #     planar = np.array(pipe_out[0].at(0))[i, ...].squeeze().swapaxes(1, 2).swapaxes(0, 1)
    #     print(planar.shape)
    #     writer.add_image("Dali", planar, i)
    # pipe.build()
    # print(pipe.epoch_size())
    # # we have to extract the epoch size from a dict:
    # dali_iterator = pytorch.DALIGenericIterator(pipe, ["data"], list(pipe.epoch_size().values())[0])
    # print(dali_iterator)

    torch.cuda.set_device(args.rank % args.world_size)
    torch.manual_seed(args.seed + args.rank)
    torch.cuda.manual_seed(args.seed + args.rank)
    torch.backends.cudnn.benchmark = True

    # Single-process variant: the process group is intentionally not initialized.
    log.info('NOT initializing process group')
    # dist.init_process_group(
    #     backend='nccl',
    #     init_method='tcp://' + args.ip + ':3567',
    #     world_size=args.world_size,
    #     rank=args.rank)
    log.info('Process group NOT initialized')

    log.info('Initializing dataloader...')
    train_loader, train_batches, val_loader, val_batches, sampler = get_loader(args)
    samples_per_epoch = train_batches * args.batchsize
    log.info('Dataloader initialized')

    model = VSRNet(args.frames, args.flownet_path, args.fp16)
    if args.fp16:
        network_to_half(model)
    model.cuda()
    model.train()

    # The optical-flow sub-network stays frozen during training.
    for param in model.FlowNetSD_network.parameters():
        param.requires_grad = False
    model_params = [p for p in model.parameters() if p.requires_grad]

    # lr=1 so that the LambdaLR multiplier below is the effective learning rate.
    optimizer = optim.Adam(model_params, lr=1, weight_decay=args.weight_decay)
    # optimizer = optim.SGD(model_params, lr=1,
    #                       momentum=0.99, weight_decay=args.weight_decay)
    stepsize = 2 * train_batches
    clr_lambda = cyclic_learning_rate(args.min_lr, args.max_lr, stepsize)
    scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=[clr_lambda])

    if args.fp16:
        optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)

    # model = DistributedDataParallel(model)

    # BEGIN TRAINING
    total_iter = 0
    while total_iter * args.world_size < args.max_iter:
        epoch = floor(total_iter / train_batches)
        if args.loader == 'pytorch' and args.world_size > 1:
            sampler.set_epoch(epoch)

        model.train()
        total_epoch_loss = 0.0

        sample_timer = 0.0
        data_timer = 0.0
        compute_timer = 0.0

        iter_start = time.perf_counter()

        # TRAINING EPOCH LOOP
        for i, inputs in enumerate(train_loader):
            if args.loader == 'NVVL':
                inputs = inputs['input']
            elif args.loader == 'pytorch':
                inputs = inputs.cuda(non_blocking=True)
                if args.fp16:
                    inputs = inputs.half()
            elif args.loader == 'DALI':
                # Take the output of the 1st pipeline, then the 1st output of
                # the "data" category.
                inputs = inputs[0]
                inputs = inputs["data"][0]
                print("Before", inputs.size())
                # TODO - move this transformation (crop, axis transpose, cast
                # to float) into the pipeline. NFHWC -> NCFHW:
                np_tmp = inputs.numpy().swapaxes(3, 4).swapaxes(2, 3).swapaxes(1, 2).astype(np.float32)
                np_tmp = np.ascontiguousarray(np_tmp[:, :, :, 14:526, :])  # crop to 512x960
                inputs = torch.from_numpy(np_tmp)
                inputs = inputs.cuda(non_blocking=True)
                print("After", inputs.size())
                # cpu_tmp = inputs.cpu()
                # for k in range(args.frames):
                #     print("CC", cpu_tmp.numpy().shape)
                #     planar = cpu_tmp.numpy()[0, :, k, :, :].squeeze() / 255.0
                #     print("BBB", planar.shape)
                #     writer.add_image("Dali", planar, k)
            else:
                raise NotImplementedError

            print(inputs.size())

            if args.timing:
                torch.cuda.synchronize()
                data_end = time.perf_counter()

            optimizer.zero_grad()

            im_out = total_iter % args.image_freq == 0
            loss = model(Variable(inputs), i, writer, im_out)
            total_epoch_loss += loss.item()

            if args.fp16:
                optimizer.backward(loss)
            else:
                loss.backward()
            optimizer.step()
            scheduler.step()

            if args.rank == 0:
                if args.timing:
                    torch.cuda.synchronize()
                    iter_end = time.perf_counter()
                    sample_timer += (iter_end - iter_start)
                    data_timer += (data_end - iter_start)
                    compute_timer += (iter_end - data_end)
                    torch.cuda.synchronize()
                    iter_start = time.perf_counter()
                writer.add_scalar('learning_rate', scheduler.get_lr()[0], total_iter)
                writer.add_scalar('train_loss', loss.item(), total_iter)
                log.info('Rank %d, Epoch %d, Iteration %d of %d, loss %.5f' %
                         (0, epoch, i + 1, train_batches, loss.item()))

            total_iter += 1

        if args.rank == 0:
            if args.timing:
                sample_timer_avg = sample_timer / samples_per_epoch
                writer.add_scalar('sample_time', sample_timer_avg, total_iter)
                data_timer_avg = data_timer / samples_per_epoch
                writer.add_scalar('sample_data_time', data_timer_avg, total_iter)
                compute_timer_avg = compute_timer / samples_per_epoch
                writer.add_scalar('sample_compute_time', compute_timer_avg, total_iter)
            epoch_loss_avg = total_epoch_loss / train_batches
            log.info('Rank %d, epoch %d: %.5f' % (0, epoch, epoch_loss_avg))

        ### VALIDATION
        model.eval()
        total_loss = 0
        total_psnr = 0
        for i, inputs in enumerate(val_loader):
            if args.loader == 'NVVL':
                inputs = inputs['input']
            else:
                inputs = inputs.cuda(non_blocking=True)
                if args.fp16:
                    inputs = inputs.half()
            log.info('Validation it %d of %d' % (i + 1, val_batches))
            loss, psnr = model(Variable(inputs), i, None)
            total_loss += loss.item()
            total_psnr += psnr.item()
        # Average over the number of validation batches actually processed.
        loss = total_loss / (i + 1)
        psnr = total_psnr / (i + 1)
        if args.rank == 0:
            writer.add_scalar('val_loss', loss, total_iter)
            writer.add_scalar('val_psnr', psnr, total_iter)
            log.info('Rank %d validation loss %.5f' % (0, loss))
            log.info('Rank %d validation psnr %.5f' % (0, psnr))
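
# The DALI branch above round-trips through NumPy on the host to change the
# layout from NFHWC to NCFHW and crop the height to 512. As a hedged sketch
# only (assuming, as in the loop above, that the batch arrives on the CPU in
# NFHWC layout), the same transform can be expressed with torch ops; the
# function name is hypothetical and not part of the repository.
def nfhwc_to_ncfhw_sketch(frames):
    # (N, F, H, W, C) -> (N, C, F, H, W), cast to float32, crop H to 512
    # (indices 14:526, matching the crop in the training loop above).
    frames = frames.permute(0, 4, 1, 2, 3).float()
    frames = frames[:, :, :, 14:526, :].contiguous()
    return frames.cuda(non_blocking=True)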