def memory_checker(opt: FalkonOptions, extra_mem=0):
    """Yield options, then assert the enclosed work stayed within its memory budget.

    This is a generator with a single ``yield``.
    NOTE(review): it is presumably turned into a context manager by a
    decorator located outside this view -- confirm.

    Checking is active only when the relevant budget is finite, i.e.
    ``opt.max_cpu_mem`` when ``opt.use_cpu`` is set, ``opt.max_gpu_mem``
    otherwise.

    * GPU: peak statistics of every visible CUDA device are reset and the
      post-reset peak is stored as a per-device baseline. After the ``yield``
      the growth of the peak (minus ``extra_mem``) is compared with
      ``opt.max_gpu_mem`` for each device.
    * CPU: ``_cpu_used_mem(uss=True)`` is recorded as baseline and the yielded
      options get ``max_cpu_mem`` raised by that baseline. After the ``yield``
      the growth (minus ``extra_mem``) is compared with the budget the caller
      originally supplied. The previous code compared against the raised
      value, which made the check too lenient by the baseline amount.

    Args:
        opt: Options providing ``use_cpu``, ``max_cpu_mem`` and ``max_gpu_mem``.
        extra_mem: Amount subtracted from the measured usage before the
            comparison. The messages divide by ``2 ** 20`` and label the
            result "MB", so values are treated as bytes.

    Yields:
        ``opt`` itself, or on the CPU path a copy with ``max_cpu_mem``
        increased by the baseline usage.

    Raises:
        AssertionError: If the measured usage exceeds the budget.
    """
    is_cpu = opt.use_cpu
    # Remember the caller's budget: `opt` is rebound on the CPU path below.
    budget = opt.max_cpu_mem if is_cpu else opt.max_gpu_mem
    mem_check = budget < np.inf

    start_ram = None
    if mem_check and not is_cpu:
        start_ram = {}
        for dev in range(torch.cuda.device_count()):
            tcd.reset_peak_memory_stats(dev)
            # We have to work around buggy memory stats: sometimes reset
            # doesn't work as expected, so the post-reset peak is the baseline.
            start_ram[dev] = tcd.max_memory_allocated(dev)
    elif mem_check:
        start_ram = _cpu_used_mem(uss=True)
        opt = dataclasses.replace(opt, max_cpu_mem=budget + start_ram)

    yield opt

    # Check memory usage
    if not mem_check:
        return
    if not is_cpu:
        for dev in range(torch.cuda.device_count()):
            used_ram = tcd.max_memory_allocated(dev) - start_ram[dev] - extra_mem
            assert used_ram <= budget, \
                "DEV %d - Memory usage (%.2fMB) exceeds allowed usage (%.2fMB)" % \
                (dev, used_ram / 2 ** 20, budget / 2 ** 20)
    else:
        used_ram = _cpu_used_mem(uss=True) - start_ram - extra_mem
        assert used_ram <= budget, \
            "Memory usage (%.2fMB) exceeds allowed usage (%.2fMB)" % \
            (used_ram / 2 ** 20, budget / 2 ** 20)
def get_gpu_statistics(self):
    """Print peak and current allocated/cached memory of the current CUDA device.

    Four lines are written to stdout, each naming the device index returned
    by ``cuda.current_device()`` and a byte count.
    """
    device_index = cuda.current_device()
    # (message template, name of the ``cuda`` query function), in print order.
    report = (
        ("Max memory allocated on GPU %d: %d bytes", "max_memory_allocated"),
        ("Max memory cached on GPU %d: %d bytes", "max_memory_cached"),
        ("Current memory allocated on GPU %d: %d bytes", "memory_allocated"),
        ("Current memory cached on GPU %d: %d bytes", "memory_cached"),
    )
    for template, query_name in report:
        # Each statistic is queried right before its line is printed.
        value = getattr(cuda, query_name)(device_index)
        print(template % (device_index, value))
def get_memory_cost(fun, *samples):
    """Measure how much one call of ``fun(*samples)`` raises peak CUDA memory.

    Peak statistics of devices ``0 .. GPU_NUM - 1`` are reset, the summed peak
    allocation is read, ``fun`` is called once, and the summed peak is read
    again. cuDNN benchmark mode is switched off for the duration of the
    measurement (original note: conservative estimate to avoid running out of
    memory on the first call) and restored afterwards, even when ``fun``
    raises.

    Args:
        fun: Callable to measure.
        *samples: Positional arguments forwarded to ``fun``.

    Returns:
        The increase of the summed ``cuda.max_memory_allocated`` values,
        clamped to a minimum of 1.
    """

    def _summed_peak():
        return sum(cuda.max_memory_allocated(i) for i in range(GPU_NUM))

    # warm up
    # fun(*samples)

    saved_benchmark = torch.backends.cudnn.benchmark
    torch.backends.cudnn.benchmark = False
    try:
        for i in range(GPU_NUM):
            cuda.reset_peak_memory_stats(i)
        peak_before = _summed_peak()
        fun(*samples)
        peak_after = _summed_peak()
    finally:
        # Restore the global flag even if `fun` failed; otherwise benchmark
        # mode would stay disabled for the rest of the process.
        torch.backends.cudnn.benchmark = saved_benchmark

    return max(peak_after - peak_before, 1)
def train(train_loader, model, criterion, optimizer, epoch, args):
    """Run one pass over ``train_loader`` and report timing, loss and accuracy.

    For every batch the data-loading time is recorded, the batch is moved to
    ``args.gpu`` when that is not ``None``, the model output and loss are
    computed, top-1/top-5 accuracy is recorded, and one optimizer step is
    taken. Progress is displayed every 500 batches. After the last batch the
    peak CUDA memory is printed and the peak counter is reset.
    """
    batch_meter = AverageMeter('Time', ':6.3f')
    load_meter = AverageMeter('Data', ':6.3f')
    loss_meter = AverageMeter('Loss', ':.4e')
    top1_meter = AverageMeter('Acc@1', ':6.2f')
    top5_meter = AverageMeter('Acc@5', ':6.2f')
    progress = ProgressMeter(
        len(train_loader),
        [batch_meter, load_meter, loss_meter, top1_meter, top5_meter],
        prefix="Epoch: [{}]".format(epoch))

    # switch to train mode
    model.train()

    tick = time.time()
    for step, (images, target) in enumerate(train_loader):
        # time spent waiting for the loader
        load_meter.update(time.time() - tick)

        if args.gpu is not None:
            images = images.cuda(args.gpu, non_blocking=True)
            target = target.cuda(args.gpu, non_blocking=True)

        # forward pass
        output = model(images)
        loss = criterion(output, target)

        # bookkeeping: loss and accuracy, weighted by the batch size
        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        n_images = images.size(0)
        loss_meter.update(loss.item(), n_images)
        top1_meter.update(acc1[0], n_images)
        top5_meter.update(acc5[0], n_images)

        # backward pass and SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # wall-clock time of the whole step
        batch_meter.update(time.time() - tick)
        tick = time.time()

        if step % 500 == 0:
            progress.display(step)

    # gpu memory usage
    from torch.cuda import max_memory_allocated, reset_max_memory_allocated
    print("max gpu memory used = {0}".format(max_memory_allocated()))
    reset_max_memory_allocated()
def detect(opt):
    """Run a disparity network over the 'detect' split and save the results.

    Reads from ``opt``: ``model`` (checkpoint path), ``rp`` (output
    directory), ``filelist`` and ``filepath`` (dataset list and root),
    ``devices`` (comma-separated GPU ids), ``net`` (network name) and
    ``batchSize``.

    For every sample the prediction and the ground-truth disparity are
    multiplied by 256, cast to ``uint16`` and written with
    ``skimage.io.imsave`` into ``opt.rp``. Inference time and GPU/CPU memory
    statistics are printed periodically after a warm-up phase.

    The forward pass runs under ``torch.no_grad()``: the former
    ``Variable(..., volatile=True)`` wrapper no longer disables autograd in
    current PyTorch, so a graph was being recorded during inference.
    """
    model = opt.model
    result_path = opt.rp
    file_list = opt.filelist
    filepath = opt.filepath

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(result_path, exist_ok=True)

    devices = [int(item) for item in opt.devices.split(',')]

    # Build the network; constructor arguments depend on the architecture.
    if opt.net == "psmnet" or opt.net == "ganet":
        net = build_net(opt.net)(maxdisp=192)
    elif opt.net == "dispnetc":
        net = build_net(opt.net)(batchNorm=False, lastRelu=True, resBlock=False)
    else:
        net = build_net(opt.net)(batchNorm=False, lastRelu=True)
    net = torch.nn.DataParallel(net, device_ids=devices).cuda()

    model_data = torch.load(model)
    print(model_data.keys())
    # Checkpoints either wrap the weights under 'state_dict' or are the
    # state dict itself.
    if 'state_dict' in model_data.keys():
        net.load_state_dict(model_data['state_dict'])
    else:
        net.load_state_dict(model_data)

    num_of_parameters = count_parameters(net)
    print('Model: %s, # of parameters: %d' % (opt.net, num_of_parameters))

    net.eval()

    batch_size = int(opt.batchSize)
    test_dataset = DispDataset(txt_file=file_list, root_dir=filepath, phase='detect')
    test_loader = DataLoader(test_dataset, batch_size=batch_size,
                             shuffle=False, num_workers=1,
                             pin_memory=True)

    s = time.time()

    avg_time = []
    display = 100
    warmup = 10
    for i, sample_batched in enumerate(test_loader):
        # Left and right images are concatenated along dim 1.
        stereo_pair = torch.cat((sample_batched['img_left'], sample_batched['img_right']), 1)
        num_of_samples = stereo_pair.size(0)
        target = sample_batched['gt_disp'].cuda()
        stereo_pair = stereo_pair.cuda()

        timed = i > warmup  # the first batches are not timed
        if timed:
            ss = time.time()

        with torch.no_grad():
            if opt.net == "psmnet" or opt.net == "ganet":
                output = net(stereo_pair)
            elif opt.net == "dispnetc":
                output = net(stereo_pair)[0]
            else:
                output = net(stereo_pair)[-1]

        if timed:
            avg_time.append((time.time() - ss))
            if (i - warmup) % display == 0:
                print('Average inference time: %f' % np.mean(avg_time))
                mbytes = 1024. * 1024
                print('GPU memory usage memory_allocated: %d MBytes, max_memory_allocated: %d MBytes, memory_cached: %d MBytes, max_memory_cached: %d MBytes, CPU memory usage: %d MBytes' %
                      (ct.memory_allocated() / mbytes, ct.max_memory_allocated() / mbytes,
                       ct.memory_cached() / mbytes, ct.max_memory_cached() / mbytes,
                       process.memory_info().rss / mbytes))
                avg_time = []

        # Predictions above 192 are zeroed before rescaling.
        output[output > 192] = 0
        output = scale_disp(output, (output.size()[0], 540, 960))
        for j in range(num_of_samples):
            np_depth = output[j][0].data.cpu().numpy()
            gt_depth = target[j, 0, :, :].data.cpu().numpy()

            print('Batch[{}]: {}, average disp: {}'.format(i, j, np.mean(np_depth)))
            name_items = sample_batched['img_names'][0][j].split('/')
            base_name = '_'.join(name_items)  # for girl02 dataset

            # Prediction first, then ground truth with a "_gt" suffix.
            outputs = (
                (base_name, np_depth),
                (base_name.replace(".png", "_gt.png"), gt_depth),
            )
            for save_name, img in outputs:
                print('Name: {}'.format(save_name))
                print('')
                skimage.io.imsave(os.path.join(result_path, save_name),
                                  (img * 256).astype('uint16'))

    print('Evaluation time used: {}'.format(time.time() - s))
def main(config):
    """Create output directories, build or load the data loaders and run the solver.

    Steps, in order:

    1. Enable cuDNN benchmark mode and create ``log_dir``, ``model_save_dir``,
       ``sample_dir`` and ``result_dir``.
    2. For each dataset selected by ``config.dataset`` ('CelebA', 'RaFD' or
       'Both'), build the loader with ``get_loader`` or, when the matching
       ``*_data_loader_load_dir`` is non-empty, unpickle it from that path.
    3. Construct the ``Solver`` and optionally pickle the loaders to the
       ``*_data_loader_save_dir`` paths.
    4. Run training or testing according to ``config.mode``, printing peak
       reserved/allocated CUDA memory before and after training.
    """
    # For fast training.
    cudnn.benchmark = True

    # Create directories if not exist. exist_ok avoids the check-then-create
    # race of exists() + makedirs() when several processes start together.
    for directory in (config.log_dir, config.model_save_dir,
                      config.sample_dir, config.result_dir):
        os.makedirs(directory, exist_ok=True)

    # Data loader.
    celeba_loader = None
    rafd_loader = None

    # SECURITY NOTE: pickle.load executes arbitrary code from the file; the
    # *_data_loader_load_dir paths must only point to trusted files.
    if config.dataset in ['CelebA', 'Both']:
        if config.CelebA_data_loader_load_dir == "":
            celeba_loader = get_loader(config.celeba_image_dir, config.attr_path, config.selected_attrs,
                                       config.celeba_crop_size, config.image_size, config.batch_size,
                                       'CelebA', config.mode, config.num_workers)
        else:
            with open(config.CelebA_data_loader_load_dir, "rb") as f:
                celeba_loader = pickle.load(f)
    if config.dataset in ['RaFD', 'Both']:
        if config.RaFD_data_loader_load_dir == "":
            rafd_loader = get_loader(config.rafd_image_dir, None, None,
                                     config.rafd_crop_size, config.image_size, config.batch_size,
                                     'RaFD', config.mode, config.num_workers)
        else:
            with open(config.RaFD_data_loader_load_dir, "rb") as f:
                rafd_loader = pickle.load(f)

    # Solver for training and testing StarGAN.
    solver = Solver(celeba_loader, rafd_loader, config)

    # Optionally persist the loaders (a loader that was not built is saved
    # as None, exactly as before).
    if config.CelebA_data_loader_save_dir != "":
        with open(config.CelebA_data_loader_save_dir, "wb") as f:
            pickle.dump(celeba_loader, f)
    if config.RaFD_data_loader_save_dir != "":
        with open(config.RaFD_data_loader_save_dir, "wb") as f:
            pickle.dump(rafd_loader, f)

    print("mem-reserved:", max_memory_reserved())
    print("mem-allocated:", max_memory_allocated())

    if config.mode == 'train':
        print("mem-reserved:", max_memory_reserved())
        print("mem-allocated:", max_memory_allocated())
        if config.dataset in ['CelebA', 'RaFD']:
            solver.train()
        elif config.dataset in ['Both']:
            solver.train_multi()
        print("mem-reserved:", max_memory_reserved())
        print("mem-allocated:", max_memory_allocated())
    elif config.mode == 'test':
        if config.dataset in ['CelebA', 'RaFD']:
            solver.test()
        elif config.dataset in ['Both']:
            solver.test_multi()
def detect(opt):
    """Run a stereo network over the 'detect' split and save disparity maps.

    Reads from ``opt``: ``net`` (one of "psmnet", "ganet", "fadnet",
    "dispnetc"), ``model`` (checkpoint path), ``rp`` (output directory),
    ``filelist`` / ``filepath`` (dataset list and root), ``devices``
    (comma-separated GPU ids) and ``batchSize``.

    Each predicted disparity map is multiplied by 256, cast to ``uint16`` and
    written with ``skimage.io.imsave`` under a name derived from the input
    image path (".png" replaced by "_d.png"). Inference time and memory
    statistics are printed periodically after a warm-up phase.

    Raises:
        ValueError: If ``opt.net`` is not one of the supported names.
            Previously this surfaced later as an unbound-local error.
    """
    net_name = opt.net
    model = opt.model
    result_path = opt.rp
    file_list = opt.filelist
    filepath = opt.filepath

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(result_path, exist_ok=True)

    devices = [int(item) for item in opt.devices.split(',')]

    # build net according to the net name
    if net_name == "psmnet" or net_name == "ganet":
        net = build_net(net_name)(192)
    elif net_name in ["fadnet", "dispnetc"]:
        net = build_net(net_name)(batchNorm=False, lastRelu=True)
    else:
        raise ValueError('Unsupported network name: %r' % (net_name,))

    net = torch.nn.DataParallel(net, device_ids=devices).cuda()

    model_data = torch.load(model)
    print(model_data.keys())
    # Checkpoints either wrap the weights under 'state_dict' or are the
    # state dict itself.
    if 'state_dict' in model_data.keys():
        net.load_state_dict(model_data['state_dict'])
    else:
        net.load_state_dict(model_data)

    num_of_parameters = count_parameters(net)
    print('Model: %s, # of parameters: %d' % (net_name, num_of_parameters))

    net.eval()

    batch_size = int(opt.batchSize)
    test_dataset = StereoDataset(txt_file=file_list, root_dir=filepath, phase='detect')
    test_loader = DataLoader(test_dataset, batch_size=batch_size,
                             shuffle=False, num_workers=1,
                             pin_memory=True)

    s = time.time()

    avg_time = []
    display = 50
    warmup = 10
    for i, sample_batched in enumerate(test_loader):
        # Left and right images are concatenated along dim 1.
        stereo_pair = torch.cat(
            (sample_batched['img_left'], sample_batched['img_right']), 1)
        num_of_samples = stereo_pair.size(0)
        stereo_pair = stereo_pair.cuda()

        timed = i > warmup  # the first batches are not timed
        if timed:
            ss = time.time()

        with torch.no_grad():
            if net_name == "psmnet" or net_name == "ganet":
                output = net(stereo_pair)
                # add the dim that the slicing below indexes as dim 1
                output = output.unsqueeze(1)
            elif net_name == "dispnetc":
                output = net(stereo_pair)[0]
            else:
                output = net(stereo_pair)[-1]

        if timed:
            avg_time.append((time.time() - ss))
            if (i - warmup) % display == 0:
                print('Average inference time: %f' % np.mean(avg_time))
                mbytes = 1024. * 1024
                print('GPU memory usage memory_allocated: %d MBytes, max_memory_allocated: %d MBytes, memory_cached: %d MBytes, max_memory_cached: %d MBytes, CPU memory usage: %d MBytes' %
                      (ct.memory_allocated() / mbytes, ct.max_memory_allocated() / mbytes,
                       ct.memory_cached() / mbytes, ct.max_memory_cached() / mbytes,
                       process.memory_info().rss / mbytes))
                avg_time = []

        output = scale_disp(output, (output.size()[0], 540, 960))
        disp = output[:, 0, :, :]

        for j in range(num_of_samples):
            name_items = sample_batched['img_names'][0][j].split('/')

            # write disparity to file
            np_disp = disp[j].data.cpu().numpy()

            print('Batch[{}]: {}, average disp: {}({}-{}).'.format(
                i, j, np.mean(np_disp), np.min(np_disp), np.max(np_disp)))
            save_name = '_'.join(name_items).replace(
                ".png", "_d.png")  # for girl02 dataset
            print('Name: {}'.format(save_name))

            skimage.io.imsave(os.path.join(result_path, save_name),
                              (np_disp * 256).astype('uint16'))

    print('Evaluation time used: {}'.format(time.time() - s))
def get_memory_use():
    """Return a three-line text report of CUDA memory for the current device.

    The report contains the device name, then ``allocated:<current>/<peak>``
    and ``cached:<current>/<peak>``, each line terminated by a newline.
    Current values are queried for ``cuda.current_device()``; peak values are
    queried without an explicit device argument.
    """
    dev = cuda.current_device()
    report_lines = []
    report_lines.append(cuda.get_device_name(dev) + ':')
    report_lines.append('allocated:{}/{}'.format(
        cuda.memory_allocated(dev), cuda.max_memory_allocated()))
    report_lines.append('cached:{}/{}'.format(
        cuda.memory_cached(dev), cuda.max_memory_cached()))
    return '\n'.join(report_lines) + '\n'
' Batch Acc Loss Prec EPS Fwd EPS Back Alloc Cached %ModelAlloc %ModelCached' ) else: # Calc precision of recovery if not autograd: d = (Y - images).norm() / images.norm() else: d = torch.tensor([0]) print( ' %6d, %6.4f, %6.4f, %6.4e, %6.1f, %6.1f, %6.3f, %6.3f, %6.3f, %6.3f' % (i, np.mean(acc), loss.item(), d.item(), np.mean(eps_fwd), np.mean(eps_back), byte2mb(cuda.max_memory_allocated()), byte2mb(cuda.max_memory_cached()), model_size(net) / byte2mb(cuda.max_memory_allocated()), model_size(net) / byte2mb(cuda.max_memory_cached()))) acc = [] start_time = time.time() eps_back = [] eps_fwd = [] # cuda.reset_max_memory_allocated() # cuda.reset_max_memory_cached() # Val set acc = [] with torch.no_grad(): for i, (images, labels) in enumerate(val_loader):
def detect(opt):
    """Run a disparity and/or surface-normal network and save its outputs.

    Reads from ``opt``: ``net``, ``model`` (checkpoint path), ``rp`` (output
    directory), ``filelist`` / ``filepath`` (dataset list and root),
    ``devices`` (comma-separated GPU ids), ``batchSize`` and the switches
    ``disp_on`` / ``norm_on``.

    * With ``disp_on`` the predicted disparity is compared with the ground
      truth (smooth L1 over pixels whose target is below 192), printed, and
      written as ``uint16`` (value * 256) with the suffix "_d.png".
    * With ``norm_on`` the normal map is mapped by ``(n + 1) * 0.5`` and
      written through ``save_exr`` with the suffix "_n.exr".

    The forward pass runs under ``torch.no_grad()``: the former
    ``Variable(..., volatile=True)`` wrapper no longer disables autograd in
    current PyTorch, so a graph was being recorded during inference.

    NOTE(review): the nesting of a few statements was reconstructed from a
    whitespace-collapsed source (wrapping in DataParallel for every network,
    the trailing ``print('')`` once per sample) -- confirm against the
    original layout.
    """
    net_name = opt.net
    model = opt.model
    result_path = opt.rp
    file_list = opt.filelist
    filepath = opt.filepath

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(result_path, exist_ok=True)

    devices = [int(item) for item in opt.devices.split(',')]

    # build net according to the net name
    net = build_net(net_name)(batchNorm=False, lastRelu=True)
    if net_name not in ["dispnetcres", "dispnetc"]:
        # every other network additionally gets a focal length
        net.set_focal_length(1050.0, 1050.0)
    net = torch.nn.DataParallel(net, device_ids=devices).cuda()

    model_data = torch.load(model)
    print(model_data.keys())
    # Checkpoints either wrap the weights under 'state_dict' or are the
    # state dict itself.
    if 'state_dict' in model_data.keys():
        net.load_state_dict(model_data['state_dict'])
    else:
        net.load_state_dict(model_data)

    num_of_parameters = count_parameters(net)
    print('Model: %s, # of parameters: %d' % (net_name, num_of_parameters))

    net.eval()

    batch_size = int(opt.batchSize)
    test_dataset = SceneFlowDataset(txt_file=file_list, root_dir=filepath, phase='detect')
    test_loader = DataLoader(test_dataset, batch_size=batch_size,
                             shuffle=False, num_workers=1,
                             pin_memory=True)

    s = time.time()

    avg_time = []
    display = 100
    warmup = 10
    for i, sample_batched in enumerate(test_loader):
        # Left and right images are concatenated along dim 1.
        stereo_pair = torch.cat(
            (sample_batched['img_left'], sample_batched['img_right']), 1)
        if opt.disp_on:
            target_disp = sample_batched['gt_disp']
            target_disp = target_disp.cuda()
        if opt.norm_on:
            # Loaded and moved to the GPU as before; not used further below.
            target_norm = sample_batched['gt_norm']
            target_norm = target_norm.cuda()

        num_of_samples = stereo_pair.size(0)
        stereo_pair = stereo_pair.cuda()

        timed = i > warmup  # the first batches are not timed
        if timed:
            ss = time.time()

        with torch.no_grad():
            if opt.net == "psmnet" or opt.net == "ganet":
                output = net(stereo_pair)
            elif opt.net == "dispnetc":
                output = net(stereo_pair)[0]
            elif opt.net in ["dispnormnet", "dtonnet", "dnfusionnet"]:
                output = net(stereo_pair)
                disp = output[0]
                normal = output[1]
                # normal first, disparity last: matches the slicing below
                output = torch.cat((normal, disp), 1)
            else:
                output = net(stereo_pair)[-1]

        if timed:
            avg_time.append((time.time() - ss))
            if (i - warmup) % display == 0:
                print('Average inference time: %f' % np.mean(avg_time))
                mbytes = 1024. * 1024
                print('GPU memory usage memory_allocated: %d MBytes, max_memory_allocated: %d MBytes, memory_cached: %d MBytes, max_memory_cached: %d MBytes, CPU memory usage: %d MBytes' %
                      (ct.memory_allocated() / mbytes, ct.max_memory_allocated() / mbytes,
                       ct.memory_cached() / mbytes, ct.max_memory_cached() / mbytes,
                       process.memory_info().rss / mbytes))
                avg_time = []

        if opt.disp_on and not opt.norm_on:
            output = scale_disp(output, (output.size()[0], 540, 960))
            disp = output[:, 0, :, :]
        elif opt.disp_on and opt.norm_on:
            output = scale_norm(output, (output.size()[0], 4, 540, 960))
            disp = output[:, 3, :, :]
            normal = output[:, :3, :, :]
        print('disp shape:', disp.shape)

        for j in range(num_of_samples):
            name_items = sample_batched['img_names'][0][j].split('/')
            # write disparity to file
            if opt.disp_on:
                output_disp = disp[j]
                _target_disp = target_disp[j, 0]
                # only pixels whose ground truth is below 192 are scored
                target_valid = _target_disp < 192
                print('target size', _target_disp.size())
                print('output size', output_disp.size())
                epe = F.smooth_l1_loss(output_disp[target_valid],
                                       _target_disp[target_valid],
                                       size_average=True)
                print('EPE: {}'.format(epe))

                np_disp = disp[j].data.cpu().numpy()
                print('Batch[{}]: {}, average disp: {}({}-{}).'.format(
                    i, j, np.mean(np_disp), np.min(np_disp), np.max(np_disp)))
                save_name = '_'.join(name_items).replace(".png", "_d.png")
                print('Name: {}'.format(save_name))
                skimage.io.imsave(os.path.join(result_path, save_name),
                                  (np_disp * 256).astype('uint16'))

            if opt.norm_on:
                # shift and halve the values before export
                normal[j] = (normal[j] + 1.0) * 0.5
                np_normal = normal[j].data.cpu().numpy()
                save_name = '_'.join(name_items).replace('.png', '_n.exr')
                print('Name: {}'.format(save_name))
                save_exr(np_normal, '{}/{}'.format(result_path, save_name))

            print('')

    print('Evaluation time used: {}'.format(time.time() - s))
def train_multi(self):
    """Train StarGAN with multiple datasets.

    Alternates between the 'CelebA' and 'RaFD' loaders in every iteration.
    For each dataset a discriminator step is taken (real/fake, classification
    and gradient-penalty losses), and every ``self.n_critic`` iterations a
    generator step (adversarial, reconstruction and classification losses).
    Losses are logged every ``self.log_step`` iterations, sample images are
    saved every ``self.sample_step``, checkpoints every
    ``self.model_save_step``, and learning rates are decayed every
    ``self.lr_update_step`` during the final ``self.num_iters_decay``
    iterations. Peak reserved/allocated CUDA memory is printed at the start
    and at the end.

    When a loader is exhausted its iterator is recreated. Only
    ``StopIteration`` triggers this; other exceptions now propagate instead
    of being swallowed by a bare ``except``.
    """
    print("mem-reserved:", max_memory_reserved())
    print("mem-allocated:", max_memory_allocated())

    # Data iterators.
    celeba_iter = iter(self.celeba_loader)
    rafd_iter = iter(self.rafd_loader)

    # Fetch fixed inputs for debugging.
    x_fixed, c_org = next(celeba_iter)
    x_fixed = x_fixed.to(self.device)
    c_celeba_list = self.create_labels(c_org, self.c_dim, 'CelebA', self.selected_attrs)
    c_rafd_list = self.create_labels(c_org, self.c2_dim, 'RaFD')
    zero_celeba = torch.zeros(x_fixed.size(0), self.c_dim).to(self.device)           # Zero vector for CelebA.
    zero_rafd = torch.zeros(x_fixed.size(0), self.c2_dim).to(self.device)            # Zero vector for RaFD.
    mask_celeba = self.label2onehot(torch.zeros(x_fixed.size(0)), 2).to(self.device)  # Mask vector: [1, 0].
    mask_rafd = self.label2onehot(torch.ones(x_fixed.size(0)), 2).to(self.device)     # Mask vector: [0, 1].

    # Learning rate cache for decaying.
    g_lr = self.g_lr
    d_lr = self.d_lr

    # Start training from scratch or resume training.
    start_iters = 0
    if self.resume_iters:
        start_iters = self.resume_iters
        self.restore_model(self.resume_iters)

    # Start training.
    print('Start training...')
    start_time = time.time()
    for i in range(start_iters, self.num_iters):
        for dataset in ['CelebA', 'RaFD']:

            # =================================================================================== #
            #                             1. Preprocess input data                                #
            # =================================================================================== #

            # Fetch real images and labels.
            data_iter = celeba_iter if dataset == 'CelebA' else rafd_iter

            try:
                x_real, label_org = next(data_iter)
            except StopIteration:
                # The loader is exhausted: start a new pass over it.
                if dataset == 'CelebA':
                    celeba_iter = iter(self.celeba_loader)
                    x_real, label_org = next(celeba_iter)
                elif dataset == 'RaFD':
                    rafd_iter = iter(self.rafd_loader)
                    x_real, label_org = next(rafd_iter)

            # Generate target domain labels randomly.
            rand_idx = torch.randperm(label_org.size(0))
            label_trg = label_org[rand_idx]

            # Label layout: [CelebA part, RaFD part, 2-way dataset mask].
            if dataset == 'CelebA':
                c_org = label_org.clone()
                c_trg = label_trg.clone()
                zero = torch.zeros(x_real.size(0), self.c2_dim)
                mask = self.label2onehot(torch.zeros(x_real.size(0)), 2)
                c_org = torch.cat([c_org, zero, mask], dim=1)
                c_trg = torch.cat([c_trg, zero, mask], dim=1)
            elif dataset == 'RaFD':
                c_org = self.label2onehot(label_org, self.c2_dim)
                c_trg = self.label2onehot(label_trg, self.c2_dim)
                zero = torch.zeros(x_real.size(0), self.c_dim)
                mask = self.label2onehot(torch.ones(x_real.size(0)), 2)
                c_org = torch.cat([zero, c_org, mask], dim=1)
                c_trg = torch.cat([zero, c_trg, mask], dim=1)

            x_real = x_real.to(self.device)             # Input images.
            c_org = c_org.to(self.device)               # Original domain labels.
            c_trg = c_trg.to(self.device)               # Target domain labels.
            label_org = label_org.to(self.device)       # Labels for computing classification loss.
            label_trg = label_trg.to(self.device)       # Labels for computing classification loss.

            # =================================================================================== #
            #                             2. Train the discriminator                              #
            # =================================================================================== #

            # Compute loss with real images.
            out_src, out_cls = self.D(x_real)
            out_cls = out_cls[:, :self.c_dim] if dataset == 'CelebA' else out_cls[:, self.c_dim:]
            d_loss_real = - torch.mean(out_src)
            d_loss_cls = self.classification_loss(out_cls, label_org, dataset)

            # Compute loss with fake images.
            x_fake = self.G(x_real, c_trg)
            out_src, _ = self.D(x_fake.detach())
            d_loss_fake = torch.mean(out_src)

            # Compute loss for gradient penalty.
            alpha = torch.rand(x_real.size(0), 1, 1, 1).to(self.device)
            x_hat = (alpha * x_real.data + (1 - alpha) * x_fake.data).requires_grad_(True)
            out_src, _ = self.D(x_hat)
            d_loss_gp = self.gradient_penalty(out_src, x_hat)

            # Backward and optimize.
            d_loss = d_loss_real + d_loss_fake + self.lambda_cls * d_loss_cls + self.lambda_gp * d_loss_gp
            self.reset_grad()
            d_loss.backward()
            self.d_optimizer.step()

            # Logging.
            loss = {}
            loss['D/loss_real'] = d_loss_real.item()
            loss['D/loss_fake'] = d_loss_fake.item()
            loss['D/loss_cls'] = d_loss_cls.item()
            loss['D/loss_gp'] = d_loss_gp.item()

            # =================================================================================== #
            #                               3. Train the generator                                #
            # =================================================================================== #

            if (i+1) % self.n_critic == 0:
                # Original-to-target domain.
                x_fake = self.G(x_real, c_trg)
                out_src, out_cls = self.D(x_fake)
                out_cls = out_cls[:, :self.c_dim] if dataset == 'CelebA' else out_cls[:, self.c_dim:]
                g_loss_fake = - torch.mean(out_src)
                g_loss_cls = self.classification_loss(out_cls, label_trg, dataset)

                # Target-to-original domain.
                x_reconst = self.G(x_fake, c_org)
                g_loss_rec = torch.mean(torch.abs(x_real - x_reconst))

                # Backward and optimize.
                g_loss = g_loss_fake + self.lambda_rec * g_loss_rec + self.lambda_cls * g_loss_cls
                self.reset_grad()
                g_loss.backward()
                self.g_optimizer.step()

                # Logging.
                loss['G/loss_fake'] = g_loss_fake.item()
                loss['G/loss_rec'] = g_loss_rec.item()
                loss['G/loss_cls'] = g_loss_cls.item()

            # =================================================================================== #
            #                                 4. Miscellaneous                                    #
            # =================================================================================== #

            # Print out training info.
            if (i+1) % self.log_step == 0:
                et = time.time() - start_time
                et = str(datetime.timedelta(seconds=et))[:-7]
                log = "Elapsed [{}], Iteration [{}/{}], Dataset [{}]".format(et, i+1, self.num_iters, dataset)
                for tag, value in loss.items():
                    log += ", {}: {:.4f}".format(tag, value)
                print(log)

                if self.use_tensorboard:
                    for tag, value in loss.items():
                        self.logger.scalar_summary(tag, value, i+1)

        # Translate fixed images for debugging.
        if (i+1) % self.sample_step == 0:
            with torch.no_grad():
                x_fake_list = [x_fixed]
                for c_fixed in c_celeba_list:
                    c_trg = torch.cat([c_fixed, zero_rafd, mask_celeba], dim=1)
                    x_fake_list.append(self.G(x_fixed, c_trg))
                for c_fixed in c_rafd_list:
                    c_trg = torch.cat([zero_celeba, c_fixed, mask_rafd], dim=1)
                    x_fake_list.append(self.G(x_fixed, c_trg))
                x_concat = torch.cat(x_fake_list, dim=3)
                sample_path = os.path.join(self.sample_dir, '{}-images.jpg'.format(i+1))
                save_image(self.denorm(x_concat.data.cpu()), sample_path, nrow=1, padding=0)
                print('Saved real and fake images into {}...'.format(sample_path))

        # Save model checkpoints.
        if (i+1) % self.model_save_step == 0:
            G_path = os.path.join(self.model_save_dir, '{}-G.ckpt'.format(i+1))
            D_path = os.path.join(self.model_save_dir, '{}-D.ckpt'.format(i+1))
            torch.save(self.G.state_dict(), G_path)
            torch.save(self.D.state_dict(), D_path)
            print('Saved model checkpoints into {}...'.format(self.model_save_dir))

        # Decay learning rates.
        if (i+1) % self.lr_update_step == 0 and (i+1) > (self.num_iters - self.num_iters_decay):
            g_lr -= (self.g_lr / float(self.num_iters_decay))
            d_lr -= (self.d_lr / float(self.num_iters_decay))
            self.update_lr(g_lr, d_lr)
            print ('Decayed learning rates, g_lr: {}, d_lr: {}.'.format(g_lr, d_lr))

    print("mem-reserved:", max_memory_reserved())
    print("mem-allocated:", max_memory_allocated())
def train(self, num_of_iters=1, data=None, hidden=None):
    """Run ``num_of_iters`` forward/backward passes and update running statistics.

    At an epoch boundary (``train_iter`` is a positive multiple of
    ``num_batches_per_epoch``) the epoch statistics are logged, rank 0 runs
    ``self.test``, the per-epoch accumulators are reset, the checkpoint
    directory is created (the actual save call is disabled) and the epoch
    counter and sampler epoch are advanced.

    Only ``loss.backward()`` is called here; no optimizer step is taken in
    this method.

    Args:
        num_of_iters: Number of batches to process.
        data: Optional batch to use. When ``None`` a new batch is fetched
            from ``self.data_iter()`` for every iteration. Previously the
            first fetched batch was reused for all remaining iterations
            because it overwrote ``data``. When a batch is passed in, it is
            used for every iteration, as before.
        hidden: Hidden state for the 'lstm' network.

    Returns:
        ``(num_of_iters, hidden)`` when ``self.dnn == 'lstm'``, otherwise
        ``num_of_iters``.

    NOTE(review): the nesting of ``num_of_updates_during_comm += 1`` (after
    the loop) and ``torch.cuda.empty_cache()`` (inside the periodic logging
    branch) was reconstructed from a whitespace-collapsed source -- confirm.
    """
    self.loss = 0.0
    s = time.time()
    for i in range(num_of_iters):
        self.adjust_learning_rate(self.train_epoch, self.optimizer)

        if self.train_iter % self.num_batches_per_epoch == 0 and self.train_iter > 0:
            # ---- epoch boundary: report, evaluate, reset accumulators ----
            logger.info('train iter: %d, num_batches_per_epoch: %d',
                        self.train_iter, self.num_batches_per_epoch)
            logger.info(
                'Epoch %d, avg train acc: %f, lr: %f, avg loss: %f' %
                (self.train_iter // self.num_batches_per_epoch,
                 np.mean(self.train_acc_top1), self.lr,
                 self.avg_loss_per_epoch / self.num_batches_per_epoch))
            mean_s = np.mean(self.sparsities)
            if self.train_iter > 0 and np.isnan(mean_s):
                # Logger.warn is a deprecated alias of Logger.warning.
                logger.warning('NaN detected! sparsities: %s' % self.sparsities)
            logger.info(
                'Average Sparsity: %f, compression ratio: %f, communication size: %f',
                np.mean(self.sparsities), np.mean(self.compression_ratios),
                np.mean(self.communication_sizes))
            if self.rank == 0 and self.writer is not None:
                self.writer.add_scalar(
                    'cross_entropy',
                    self.avg_loss_per_epoch / self.num_batches_per_epoch,
                    self.train_epoch)
                self.writer.add_scalar('top-1 acc',
                                       np.mean(self.train_acc_top1),
                                       self.train_epoch)
            if self.rank == 0:
                self.test(self.train_epoch)
            self.sparsities = []
            self.compression_ratios = []
            self.communication_sizes = []
            self.train_acc_top1 = []
            self.epochs_info.append(self.avg_loss_per_epoch /
                                    self.num_batches_per_epoch)
            self.avg_loss_per_epoch = 0.0

            if self.train_iter > 0 and self.rank == 0:
                # Checkpoint writing is disabled below; the state is still
                # gathered and the target directory is still created.
                state = {
                    'iter': self.train_iter,
                    'epoch': self.train_epoch,
                    'state': self.get_model_state()
                }
                if self.prefix:
                    relative_path = './weights/%s/%s-n%d-bs%d-lr%.4f' % (
                        self.prefix, self.dnn, self.nworkers,
                        self.batch_size, self.base_lr)
                else:
                    relative_path = './weights/%s-n%d-bs%d-lr%.4f' % (
                        self.dnn, self.nworkers, self.batch_size,
                        self.base_lr)
                if settings.SPARSE:
                    relative_path += '-s%.5f' % self.sparsity
                utils.create_path(relative_path)
                filename = '%s-rank%d-epoch%d.pth' % (self.dnn, self.rank,
                                                      self.train_epoch)
                fn = os.path.join(relative_path, filename)
                #self.save_checkpoint(state, fn)
                #self.remove_dict(state)
            self.train_epoch += 1
            if self.train_sampler and (self.nworkers > 1):
                self.train_sampler.set_epoch(self.train_epoch)

        # ---- load one batch (timed as I/O) ----
        ss = time.time()
        # Fetch a new batch per iteration unless the caller supplied one.
        batch = self.data_iter() if data is None else data

        if self.dataset == 'an4':
            inputs, labels_cpu, input_percentages, target_sizes = batch
            input_sizes = input_percentages.mul_(int(inputs.size(3))).int()
        else:
            inputs, labels_cpu = batch
        if self.is_cuda:
            if self.dnn == 'lstm':
                inputs = Variable(inputs.transpose(0, 1).contiguous()).cuda()
                labels = Variable(labels_cpu.transpose(
                    0, 1).contiguous()).cuda()
            else:
                inputs, labels = inputs.cuda(
                    non_blocking=True), labels_cpu.cuda(non_blocking=True)
        else:
            labels = labels_cpu

        self.iotime += (time.time() - ss)

        # ---- forward + backward ----
        if self.dnn == 'lstman4':
            out, output_sizes = self.net(inputs, input_sizes)
            out = out.transpose(0, 1)  # TxNxH
            loss = self.criterion(out, labels_cpu, output_sizes,
                                  target_sizes)
            loss = loss / inputs.size(0)  # average the loss by minibatch
            loss.backward()
        elif self.dnn == 'lstm':
            hidden = lstmpy.repackage_hidden(hidden)
            outputs, hidden = self.net(inputs, hidden)
            tt = torch.squeeze(
                labels.view(-1, self.net.batch_size * self.net.num_steps))
            loss = self.criterion(outputs.view(-1, self.net.vocab_size), tt)
            loss.backward()
        else:
            outputs = self.net(inputs)
            loss = self.criterion(outputs, labels)
            loss.backward()
        loss_value = loss.item()

        # ---- statistics ----
        self.loss += loss_value
        self.avg_loss_per_epoch += loss_value

        if self.dnn not in ['lstm', 'lstman4']:
            acc1, = self.cal_accuracy(outputs, labels, topk=(1, ))
            self.train_acc_top1.append(acc1)

        self.train_iter += 1

    self.num_of_updates_during_comm += 1
    self.loss /= num_of_iters
    self.timer += time.time() - s
    display = 100
    if self.train_iter % display == 0:
        logger.info(
            '[%3d][%5d/%5d][rank:%d] loss: %.3f, average forward and backward time: %f, iotime: %f '
            % (self.train_epoch, self.train_iter,
               self.num_batches_per_epoch, self.rank, self.loss,
               self.timer / display, self.iotime / display))
        mbytes = 1024. * 1024
        logger.info(
            'GPU memory usage memory_allocated: %d MBytes, max_memory_allocated: %d MBytes, memory_cached: %d MBytes, max_memory_cached: %d MBytes, CPU memory usage: %d MBytes',
            ct.memory_allocated() / mbytes,
            ct.max_memory_allocated() / mbytes,
            ct.memory_cached() / mbytes,
            ct.max_memory_cached() / mbytes,
            process.memory_info().rss / mbytes)
        self.timer = 0.0
        self.iotime = 0.0
        if self.is_cuda:
            torch.cuda.empty_cache()

    if self.dnn == 'lstm':
        return num_of_iters, hidden
    return num_of_iters
def detect(opt):
    """Convert a stereo network with ``trt_transform`` and benchmark it.

    Reads from ``opt``: ``net`` (one of "psmnet", "ganet", "fadnet",
    "dispnetc", "mobilefadnet", "slightfadnet"), ``model`` (checkpoint path),
    ``rp`` (output directory), ``filelist`` / ``filepath`` (dataset list and
    root), ``devices`` (comma-separated GPU ids) and ``batchSize``.

    The network is loaded, converted, and the converted state dict is saved
    to ``models/mobilefadnet_trt.pth``. The first batch of the 'detect' split
    is then pushed through the converted network ``14 + warmup`` times while
    I/O, inference and post-processing times are printed. No disparity files
    are written by this variant.

    The final "avg iter" figure divides the time measured from the end of
    the warm-up by the number of timed iterations. Previously it divided by
    the total count including warm-up iterations, which understated the
    per-iteration time.

    Raises:
        ValueError: If ``opt.net`` is not a supported name or the data loader
            yields no batch. Previously both surfaced as unbound-local
            errors.
    """
    net_name = opt.net
    model = opt.model
    result_path = opt.rp
    file_list = opt.filelist
    filepath = opt.filepath

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(result_path, exist_ok=True)

    devices = [int(item) for item in opt.devices.split(',')]
    ngpu = len(devices)

    # build net according to the net name
    if net_name == "psmnet" or net_name == "ganet":
        net = build_net(net_name)(192)
    elif net_name in ["fadnet", "dispnetc", "mobilefadnet", "slightfadnet"]:
        net = build_net(net_name)(batchNorm=False, lastRelu=True)
    else:
        raise ValueError('Unsupported network name: %r' % (net_name,))

    if ngpu > 1:
        net = torch.nn.DataParallel(net, device_ids=devices)

    model_data = torch.load(model)
    print(model_data.keys())
    if 'state_dict' in model_data.keys():
        # Wrapped checkpoints go through the project helper for weights
        # trained with DataParallel.
        load_model_trained_with_DP(net, model_data['state_dict'])
    else:
        net.load_state_dict(model_data)

    num_of_parameters = count_parameters(net)
    print('Model: %s, # of parameters: %d' % (net_name, num_of_parameters))

    batch_size = int(opt.batchSize)
    test_dataset = StereoDataset(txt_file=file_list, root_dir=filepath, phase='detect')
    test_loader = DataLoader(test_dataset, batch_size=batch_size,
                             shuffle=False, num_workers=1,
                             pin_memory=True)

    net.eval()
    net = net.cuda()

    net_trt = trt_transform(net)

    torch.save(net_trt.state_dict(), 'models/mobilefadnet_trt.pth')

    avg_time = []
    display = 50
    warmup = 2

    # Only the first batch is used; it is replayed in the loop below.
    sample_batched = next(iter(test_loader), None)
    if sample_batched is None:
        raise ValueError('The data loader produced no batches to benchmark.')
    first_pair = torch.cat(
        (sample_batched['img_left'], sample_batched['img_right']), 1)
    print('input Shape: {}'.format(first_pair.size()))

    iterations = 14 + warmup
    for i in range(iterations):
        stime = time.time()

        # Rebuilt and re-uploaded every iteration so "IO time" covers it.
        stereo_pair = torch.cat(
            (sample_batched['img_left'], sample_batched['img_right']), 1)
        print('input Shape: {}'.format(stereo_pair.size()))
        stereo_pair = stereo_pair.cuda()
        iotime = time.time()
        print('[{}] IO time:{}'.format(i, iotime - stime))

        if i == warmup:
            # Start of the timed section (warm-up iterations are excluded).
            ss = time.time()

        with torch.no_grad():
            if opt.net == "psmnet" or opt.net == "ganet":
                output = net_trt(stereo_pair)
                output = output.unsqueeze(1)
            elif opt.net == "dispnetc":
                output = net_trt(stereo_pair)[0]
            else:
                output = net_trt(stereo_pair)[-1]
        itime = time.time()
        print('[{}] Inference time:{}'.format(i, itime - iotime))

        if i > warmup:
            avg_time.append((time.time() - ss))
            if (i - warmup) % display == 0:
                print('Average inference time: %f' % np.mean(avg_time))
                mbytes = 1024. * 1024
                print('GPU memory usage memory_allocated: %d MBytes, max_memory_allocated: %d MBytes, memory_cached: %d MBytes, max_memory_cached: %d MBytes, CPU memory usage: %d MBytes' %
                      (ct.memory_allocated() / mbytes, ct.max_memory_allocated() / mbytes,
                       ct.memory_cached() / mbytes, ct.max_memory_cached() / mbytes,
                       process.memory_info().rss / mbytes))
                avg_time = []

        print('[%d] output shape:' % i, output.size())
        ptime = time.time()
        print('[{}] Post-processing time:{}'.format(i, ptime - itime))

        print('Current batch time used:: {}'.format(time.time() - stime))

    elapsed = time.time() - ss
    timed_iterations = iterations - warmup
    print('Evaluation time used: {}, avg iter: {}'.format(
        elapsed, elapsed / timed_iterations))
# - torch.cuda.get_device_capability(device): returns the CUDA capability of the device

#%%
cuda.get_device_capability(0)

#%% [markdown]
# - torch.cuda.get_device_name(device): returns the name of the device

#%%
cuda.get_device_name(0)

#%% [markdown]
# - torch.cuda.max_memory_allocated(device): returns the maximum GPU memory used by tensors on the given device

#%%
cuda.max_memory_allocated(0)

#%%
# Allocate a tensor (on the GPU when one is available) before reading the peak again.
device = torch.device('cuda' if cuda.is_available() else 'cpu')
X = torch.randn(100, 100, device=device)
X.shape

#%%
cuda.max_memory_allocated(0)

#%% [markdown]
# - torch.cuda.max_memory_cached(device=None): returns the maximum GPU memory managed by the caching allocator for the given device

#%%
cuda.max_memory_cached(0)