def __init__(self, checkpoint_dir, model, optimizer, cfg):
    """Register the objects to checkpoint and ensure the output dir exists.

    Args:
        checkpoint_dir: directory where checkpoint files will be written.
        model: model saved under the key '<method>_model'.
        optimizer: optimizer saved under the key 'optimizer'.
        cfg: config dict; cfg['method'] prefixes the keys and cfg['model']
            is stored under '<method>_config'.
    """
    self.module_dict_params = {
        f"{cfg['method']}_model": model,
        # Fixed: was f"optimizer" — an f-string with no placeholders;
        # a plain string literal is the correct form.
        "optimizer": optimizer,
        f"{cfg['method']}_config": cfg['model'],
    }
    self.checkpoint_dir = checkpoint_dir
    utils.cond_mkdir(checkpoint_dir)
def train(validation=True):
    """Fit a 1D coordinate network to a fixed polynomial and log the run.

    Builds the dataset/dataloader, constructs the model from command-line
    options, writes the run configuration to the log directory, and hands
    off to the generic training loop.

    NOTE(review): `validation` is accepted but never used here — confirm
    whether downstream callers rely on passing it.
    """
    log_dir = os.path.join(opt.logging_root, opt.experiment_name)
    utils.cond_mkdir(log_dir)

    # Target function and its analytic integral for supervision.
    target_fn = dataio.polynomial_1
    target_integral = dataio.polynomial_1_integral
    dataset = dataio.Implicit1DWrapper(range=(-1, 2),
                                       fn=target_fn,
                                       integral_fn=target_integral,
                                       sampling_density=1000,
                                       train_every=250)
    loader = DataLoader(dataset,
                        shuffle=True,
                        batch_size=opt.batch_size,
                        pin_memory=True,
                        num_workers=0)

    # Positional encoding only for non-sine activations (4 fns: cos + sin).
    num_pe_functions = 0 if opt.activation == 'sine' else 4

    model = modules.CoordinateNet(nl=opt.activation,
                                  in_features=1,
                                  out_features=1,
                                  hidden_features=opt.hidden_features,
                                  num_hidden_layers=opt.hidden_layers,
                                  w0=opt.w0,
                                  use_grad=True,
                                  num_pe_fns=num_pe_functions,
                                  input_processing_fn=lambda x: x,
                                  grad_var='coords')
    model.cuda()

    # Loss and tensorboard summary hooks.
    loss_fn = loss_functions.function_mse
    summary_fn = partial(utils.write_simple_1D_function_summary, dataset)

    # Persist the run configuration next to the logs.
    p.write_config_file(opt, [os.path.join(log_dir, 'config.ini')])
    with open(os.path.join(log_dir, "params.txt"), "w") as out_file:
        out_file.write('\n'.join("%s: %s" % (key, value)
                                 for key, value in vars(opt).items()))

    # Plain-text summary of the model architecture.
    with open(os.path.join(log_dir, "model.txt"), "w") as out_file:
        out_file.write(str(model))

    training.train(model=model,
                   train_dataloader=loader,
                   epochs=opt.num_epochs,
                   lr=opt.lr,
                   steps_til_summary=opt.steps_til_summary,
                   epochs_til_checkpoint=opt.epochs_til_ckpt,
                   model_dir=log_dir,
                   loss_fn=loss_fn,
                   summary_fn=summary_fn)
def getTestMSE(dataloader, subdir):
    """Run the (module-level) model over a test dataloader, save prediction /
    ground-truth / sparse-input images, and return the per-sample MSEs.

    Relies on module-level globals: `model`, `root_path`, `image_resolution`,
    `to_uint8` — TODO confirm they are defined by the enclosing script.

    Args:
        dataloader: yields (model_input, gt) dicts; batch size is assumed 1
            (outputs are .squeeze()d) — TODO confirm against caller.
        subdir: subdirectory of root_path for predicted/sparse images.

    Returns:
        list of per-sample MSE values (floats in [0, 1] pixel space).
    """
    MSEs = []
    total_steps = 0
    utils.cond_mkdir(os.path.join(root_path, subdir))
    utils.cond_mkdir(os.path.join(root_path, 'ground_truth'))
    with tqdm(total=len(dataloader)) as pbar:
        for step, (model_input, gt) in enumerate(dataloader):
            # 'idx' arrives as a scalar; wrap it into a long tensor so the
            # .cuda() dict-comprehension below works uniformly.
            model_input['idx'] = torch.Tensor([model_input['idx']]).long()
            model_input = {
                key: value.cuda()
                for key, value in model_input.items()
            }
            gt = {key: value.cuda() for key, value in gt.items()}
            with torch.no_grad():
                model_output = model(model_input)
            # Model output is in [-1, 1]; map to [0, 1] for image export.
            out_img = dataio.lin2img(model_output['model_out'],
                                     image_resolution).squeeze().permute(
                                         1, 2, 0).detach().cpu().numpy()
            out_img += 1
            out_img /= 2.
            out_img = np.clip(out_img, 0., 1.)
            gt_img = dataio.lin2img(gt['img'],
                                    image_resolution).squeeze().permute(
                                        1, 2, 0).detach().cpu().numpy()
            gt_img += 1
            gt_img /= 2.
            gt_img = np.clip(gt_img, 0., 1.)
            # Sparse input image: pixels that are exactly 0 in all 3 channels
            # are treated as "unobserved" and painted white after rescaling.
            sparse_img = model_input['img_sparse'].squeeze().detach().cpu(
            ).permute(1, 2, 0).numpy()
            mask = np.sum((sparse_img == 0), axis=2) == 3
            sparse_img += 1
            sparse_img /= 2.
            sparse_img = np.clip(sparse_img, 0., 1.)
            sparse_img[mask, ...] = 1.
            imageio.imwrite(
                os.path.join(root_path, subdir,
                             str(total_steps) + '_sparse.png'),
                to_uint8(sparse_img))
            imageio.imwrite(
                os.path.join(root_path, subdir, str(total_steps) + '.png'),
                to_uint8(out_img))
            imageio.imwrite(
                os.path.join(root_path, 'ground_truth',
                             str(total_steps) + '.png'),
                to_uint8(gt_img))
            # MSE in [0,1] image space (after the clip above).
            MSE = np.mean((out_img - gt_img)**2)
            MSEs.append(MSE)
            pbar.update(1)
            total_steps += 1
    return MSEs
def main(cfg, num_workers):  # Shortened
    """Evaluate the best saved checkpoint on the test split.

    Loads 'model_best.pt' (if present), runs trainer.evaluate over the test
    loader, logs the selection metric, and writes the evaluation results
    to <out_dir>/eval_dict.yml.

    Args:
        cfg: nested config dict (uses cfg['training'][...]).
        num_workers: dataloader worker count.
    """
    out_dir = cfg['training']['out_dir']
    batch_size = cfg['training']['batch_size']
    utils.save_config(os.path.join(out_dir, 'config.yml'), cfg)
    model_selection_metric = cfg['training']['model_selection_metric']
    # +1 when a larger metric is better, -1 when smaller is better.
    model_selection_sign = 1 if cfg['training'][
        'model_selection_mode'] == 'maximize' else -1

    # Output directory
    utils.cond_mkdir(out_dir)

    # Dataset
    test_dataset = config.get_dataset('test', cfg)
    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=batch_size,
                                              num_workers=num_workers,
                                              shuffle=False)

    # Model (no optimizer: evaluation only)
    model = config.get_model(cfg)
    trainer = config.get_trainer(model, None, cfg)

    # Print model
    print(model)
    logger = logging.getLogger(__name__)
    logger.info(
        f'Total number of parameters: {sum(p.numel() for p in model.parameters())}'
    )

    ckp = checkpoints.CheckpointIO(out_dir, model, None, cfg)
    try:
        load_dict = ckp.load('model_best.pt')
        logger.info('Model loaded')
    # BUG FIX: was `except FileExistsError`, which a missing-checkpoint load
    # never raises — a failed load would crash instead of falling back.
    except FileNotFoundError:
        logger.info('Model NOT loaded')
        load_dict = dict()

    metric_val_best = load_dict.get('loss_val_best',
                                    -model_selection_sign * np.inf)
    logger.info(
        f'Current best validation metric ({model_selection_metric}): {metric_val_best:.6f}'
    )

    eval_dict = trainer.evaluate(test_loader)
    metric_val = eval_dict[model_selection_metric]
    logger.info(
        f'Validation metric ({model_selection_metric}): {metric_val:.8f}')

    eval_dict_path = os.path.join(out_dir, 'eval_dict.yml')
    with open(eval_dict_path, 'w') as f:
        # BUG FIX: was `yaml.dump(config, f)` — that serialized the `config`
        # module object, not the evaluation results.
        yaml.dump(eval_dict, f)
    print(f'Results saved in {eval_dict_path}')
def plot(stats_list):
    """Plot mean episode reward with a +/-1 std-dev band and save the figure.

    Args:
        stats_list: sequence of per-episode stats tuples where index 0 is the
            mean reward, 1 the std dev, 2 the win count, 3 the loss count.

    Saves the figure to './plots/plot' (no extension; matplotlib default
    format applies).
    """
    plt.figure()
    mean_r = np.array([s[0] for s in stats_list])
    std_r = np.array([s[1] for s in stats_list])
    wins = np.array([s[2] for s in stats_list])    # extracted, not plotted
    losses = np.array([s[3] for s in stats_list])  # extracted, not plotted

    x = np.arange(1, len(mean_r) + 1)
    plt.plot(x, mean_r)

    # Shaded +/-1 std-dev envelope around the mean curve.
    plt.fill_between(x,
                     mean_r - std_r,
                     mean_r + std_r,
                     color='grey',
                     alpha=.2,
                     label=r'$\pm$ 1 std. dev.')

    utils.cond_mkdir('./plots/')
    plt.savefig('./plots/plot')
# Build the three sub-networks of the pipeline: feature extraction,
# upsampling, and refinement.
featExNets = models.featExtractionNets()
upSamplingNets = models.upSamplingNets()
refineNets = models.refineNets()
if torch.cuda.is_available():
    featExNets = featExNets.cuda()
    upSamplingNets = upSamplingNets.cuda()
    refineNets = refineNets.cuda()

# Create Optimizer — one Adam per sub-network, all sharing the same lr
# (lr is a module-level variable — TODO confirm where it is defined).
opt_feature = torch.optim.Adam(featExNets.parameters(), lr=lr)
opt_upSampling = torch.optim.Adam(upSamplingNets.parameters(), lr=lr)
opt_refine = torch.optim.Adam(refineNets.parameters(), lr=lr)

# Create Logging dir
utils.cond_mkdir(opt.logging_root + '/kmeans')
utils.cond_mkdir(opt.logging_root + '/models')

# Save command-line parameters to log directory.
with open(opt.logging_root + '/params.txt', "w") as out_file:
    out_file.write('\n'.join(
        ["%s: %s" % (key, value) for key, value in vars(opt).items()]))

# Start Training
ori_psnr = 0
for epoch in range(max_epochs):
    # Decay all three optimizers' learning rates on the same schedule.
    utils.adjust_learning_rate(opt_feature, epoch, lr)
    utils.adjust_learning_rate(opt_upSampling, epoch, lr)
    utils.adjust_learning_rate(opt_refine, epoch, lr)
    avg_err, avg_psnr = 0, 0
    # NOTE(review): the rest of this training-loop body lies outside the
    # visible portion of this file.
def main(cfg, num_workers):  # Shortened
    """Full training loop: train with periodic logging, backup checkpoints,
    and validation-driven best-model selection.

    Resumes from 'model_best.pt' when available; exits (status 3) once
    max_iterations or max_epochs is reached.

    Args:
        cfg: nested config dict (uses cfg['training'][...]).
        num_workers: dataloader worker count.
    """
    out_dir = cfg['training']['out_dir']
    batch_size = cfg['training']['batch_size']
    backup_every = cfg['training']['backup_every']
    utils.save_config(os.path.join(out_dir, 'config.yml'), cfg)
    model_selection_metric = cfg['training']['model_selection_metric']
    # +1 when a larger metric is better, -1 when smaller is better.
    model_selection_sign = 1 if cfg['training'][
        'model_selection_mode'] == 'maximize' else -1

    # Output directory
    utils.cond_mkdir(out_dir)

    # Dataset
    train_dataset = config.get_dataset('train', cfg)
    val_dataset = config.get_dataset('val', cfg)
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=batch_size,
                                               num_workers=num_workers,
                                               shuffle=True)
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=batch_size,
                                             num_workers=num_workers,
                                             shuffle=False)

    # Model
    model = config.get_model(cfg)
    optimizer = optim.Adam(model.parameters(), lr=1e-4)
    trainer = config.get_trainer(model, optimizer, cfg)

    # Print model
    print(model)
    logger = logging.getLogger(__name__)
    logger.info(
        f'Total number of parameters: {sum(p.numel() for p in model.parameters())}'
    )

    # load pretrained model
    tb_logger = tensorboardX.SummaryWriter(os.path.join(out_dir, 'logs'))
    ckp = checkpoints.CheckpointIO(out_dir, model, optimizer, cfg)
    try:
        load_dict = ckp.load('model_best.pt')
        logger.info('Model loaded')
    # BUG FIX: was `except FileExistsError`, which a missing-checkpoint load
    # never raises — a fresh run (no checkpoint) would crash here.
    except FileNotFoundError:
        logger.info('Model NOT loaded')
        load_dict = dict()
    epoch_it = load_dict.get('epoch_it', -1)
    it = load_dict.get('it', -1)
    metric_val_best = load_dict.get('loss_val_best',
                                    -model_selection_sign * np.inf)
    logger.info(
        f'Current best validation metric ({model_selection_metric}): {metric_val_best:.6f}'
    )

    # Shortened
    print_every = cfg['training']['print_every']
    validate_every = cfg['training']['validate_every']
    max_iterations = cfg['training']['max_iterations']
    max_epochs = cfg['training']['max_epochs']

    while True:
        epoch_it += 1
        for batch in train_loader:
            it += 1
            loss_dict = trainer.train_step(batch)
            loss = loss_dict['total_loss']
            for k, v in loss_dict.items():
                tb_logger.add_scalar(f'train/{k}', v, it)

            # Print output
            if print_every > 0 and (it % print_every) == 0:
                logger.info(
                    f'[Epoch {epoch_it:02d}] it={it:03d}, loss={loss:.8f}')

            # Backup if necessary
            if backup_every > 0 and (it % backup_every) == 0:
                logger.info('Backup checkpoint')
                ckp.save(f'model_{it:d}.pt',
                         epoch_it=epoch_it,
                         it=it,
                         loss_val_best=metric_val_best)

            # Run validation
            if validate_every > 0 and (it % validate_every) == 0:
                eval_dict = trainer.evaluate(val_loader)
                print('eval_dict=\n', eval_dict)
                metric_val = eval_dict[model_selection_metric]
                logger.info(
                    f'Validation metric ({model_selection_metric}): {metric_val:.8f}'
                )
                for k, v in eval_dict.items():
                    tb_logger.add_scalar(f'val/{k}', v, it)
                if model_selection_sign * (metric_val - metric_val_best) > 0:
                    metric_val_best = metric_val
                    # FIX: closing parenthesis was missing in the message.
                    logger.info(f'New best model (loss {metric_val_best:.8f})')
                    ckp.save('model_best.pt',
                             epoch_it=epoch_it,
                             it=it,
                             loss_val_best=metric_val_best)

            if (0 < max_iterations <= it) or (0 < max_epochs <= epoch_it):
                # FIX: message said "iteration/epochs" but printed
                # (epoch_it/it); values now match the label order.
                logger.info(
                    f'Maximum iteration/epochs ({it}/{epoch_it}) reached. Exiting.'
                )
                ckp.save(f'model_{it:d}.pt',
                         epoch_it=epoch_it,
                         it=it,
                         loss_val_best=metric_val_best)
                exit(3)
def train_wchunks(models, train_dataloader, epochs, lr, steps_til_summary,
                  epochs_til_checkpoint, model_dir, loss_fn, summary_fn,
                  chunk_lists_from_batch_fn, val_dataloader=None,
                  double_precision=False, clip_grad=False, loss_schedules=None,
                  num_cuts=128, weight_decay=0.0, max_chunk_size=4096,
                  loss_start={}, resume_checkpoint={}):
    """Train a dict of models jointly, splitting each batch into chunks so
    large batches fit in GPU memory; gradients accumulate across chunks and
    all optimizers step once per batch.

    NOTE(review): `num_cuts`, `double_precision`, and `loss_schedules` are
    never used in this body — confirm whether they are dead parameters.
    NOTE(review): mutable default arguments (`loss_start={}`,
    `resume_checkpoint={}`) — safe only as long as callers never mutate them.

    Args:
        models: dict name -> model; each gets its own Adam optimizer/StepLR.
        chunk_lists_from_batch_fn: splits (model_input, meta, gt) into lists
            of chunks of at most max_chunk_size.
        loss_start: per-loss-name step threshold; before it the loss is
            scaled by (step/threshold)^2 to ramp it in gradually.
        resume_checkpoint: may carry per-model optimizer state dicts keyed by
            model name, plus 'total_steps' and 'epoch'.
    """
    optims = {
        key: torch.optim.Adam(lr=lr, params=model.parameters())
        for key, model in models.items()
    }
    schedulers = {
        key: torch.optim.lr_scheduler.StepLR(optim, step_size=8000, gamma=0.2)
        for key, optim in optims.items()
    }

    # load optimizer if supplied
    for key in models.keys():
        if key in resume_checkpoint:
            optims[key].load_state_dict(resume_checkpoint[key])
    # NOTE(review): schedulers are rebuilt here identically to above — the
    # first construction is redundant (but harmless).
    schedulers = {
        key: torch.optim.lr_scheduler.StepLR(optim, step_size=8000, gamma=0.2)
        for key, optim in optims.items()
    }

    # Ask before clobbering an existing run's summaries/checkpoints.
    if os.path.exists(os.path.join(model_dir, 'summaries')):
        val = input("The model directory %s exists. Overwrite? (y/n)" % model_dir)
        if val == 'y':
            if os.path.exists(os.path.join(model_dir, 'summaries')):
                shutil.rmtree(os.path.join(model_dir, 'summaries'))
            if os.path.exists(os.path.join(model_dir, 'checkpoints')):
                shutil.rmtree(os.path.join(model_dir, 'checkpoints'))
    os.makedirs(model_dir, exist_ok=True)

    summaries_dir = os.path.join(model_dir, 'summaries')
    utils.cond_mkdir(summaries_dir)
    checkpoints_dir = os.path.join(model_dir, 'checkpoints')
    utils.cond_mkdir(checkpoints_dir)
    writer = SummaryWriter(summaries_dir)

    total_steps = 0
    if 'total_steps' in resume_checkpoint:
        total_steps = resume_checkpoint['total_steps']
    start_epoch = 0
    if 'epoch' in resume_checkpoint:
        start_epoch = resume_checkpoint['epoch']
        # Fast-forward the (freshly created) schedulers to the resume epoch.
        for scheduler in schedulers.values():
            for i in range(start_epoch):
                scheduler.step()

    with tqdm(total=len(train_dataloader) * epochs) as pbar:
        pbar.update(total_steps)
        train_losses = []
        for epoch in range(start_epoch, epochs):
            # Periodic epoch checkpoint (skipped at epoch 0).
            if not epoch % epochs_til_checkpoint and epoch:
                for key, model in models.items():
                    torch.save(
                        model.state_dict(),
                        os.path.join(
                            checkpoints_dir,
                            'model_' + key + '_epoch_%04d.pth' % epoch))
                np.savetxt(
                    os.path.join(checkpoints_dir,
                                 'train_losses_epoch_%04d.txt' % epoch),
                    np.array(train_losses))
                for key, optim in optims.items():
                    torch.save(
                        {
                            'epoch': epoch,
                            'total_steps': total_steps,
                            'optimizer_state_dict': optim.state_dict()
                        },
                        os.path.join(
                            checkpoints_dir,
                            'optim_' + key + '_epoch_%04d.pth' % epoch))

            for step, (model_input, meta, gt, misc) in enumerate(train_dataloader):
                start_time = time.time()
                for optim in optims.values():
                    optim.zero_grad()

                # Split the batch into GPU-sized chunks.
                list_chunked_model_input, list_chunked_meta, list_chunked_gt = \
                    chunk_lists_from_batch_fn(model_input, meta, gt, max_chunk_size)
                num_chunks = len(list_chunked_gt)
                batch_avged_losses = {}
                batch_avged_tot_loss = 0.
                for chunk_idx, (chunked_model_input, chunked_meta, chunked_gt) \
                        in enumerate(zip(list_chunked_model_input,
                                         list_chunked_meta, list_chunked_gt)):
                    chunked_model_input = dict2cuda(chunked_model_input)
                    chunked_meta = dict2cuda(chunked_meta)
                    chunked_gt = dict2cuda(chunked_gt)

                    # forward pass through model
                    chunk_model_outputs = {
                        key: model(chunked_model_input)
                        for key, model in models.items()
                    }
                    losses = loss_fn(chunk_model_outputs, chunked_gt,
                                     dataloader=train_dataloader)

                    # loss from forward pass
                    train_loss = 0.
                    for loss_name, loss in losses.items():
                        # slowly apply loss if less than start iter
                        if loss_name in loss_start:
                            if total_steps < loss_start[loss_name]:
                                loss = (total_steps / loss_start[loss_name])**2 * loss
                        single_loss = loss.mean()
                        # Divide by num_chunks so the batch-level gradient is
                        # a mean over chunks, not a sum.
                        train_loss += single_loss / num_chunks
                        batch_avged_tot_loss += float(single_loss / num_chunks)
                        if loss_name in batch_avged_losses:
                            batch_avged_losses[
                                loss_name] += single_loss / num_chunks
                        else:
                            batch_avged_losses.update(
                                {loss_name: single_loss / num_chunks})
                    if weight_decay > 0:
                        for model in models.values():
                            train_loss += weight_decay * weight_decay_loss(model)
                    # Backward per chunk; gradients accumulate until the
                    # single optimizer step below.
                    train_loss.backward()

                for loss_name, loss in batch_avged_losses.items():
                    writer.add_scalar(loss_name, loss, total_steps)
                train_losses.append(batch_avged_tot_loss)
                writer.add_scalar("total_train_loss", batch_avged_tot_loss,
                                  total_steps)

                if clip_grad:
                    for model in models.values():
                        if isinstance(clip_grad, bool):
                            # clip_grad=True uses a fixed max-norm of 0.1.
                            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                           max_norm=0.1)
                        else:
                            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                           max_norm=clip_grad)

                for optim in optims.values():
                    optim.step()

                # Periodic "current" checkpoints + summaries.
                if not total_steps % steps_til_summary:
                    for key, model in models.items():
                        torch.save(
                            model.state_dict(),
                            os.path.join(checkpoints_dir,
                                         'model_' + key + '_current.pth'))
                    for key, optim in optims.items():
                        torch.save(
                            {
                                'epoch': epoch,
                                'total_steps': total_steps,
                                'optimizer_state_dict': optim.state_dict()
                            },
                            os.path.join(checkpoints_dir,
                                         'optim_' + key + '_current.pth'))
                    summary_fn(models, train_dataloader, val_dataloader,
                               loss_fn, optims, meta, gt, misc, writer,
                               total_steps)
                pbar.update(1)

                if not total_steps % steps_til_summary:
                    # train_loss here is the last chunk's loss, not the
                    # batch average — TODO confirm intent.
                    tqdm.write(
                        "Epoch %d, Total loss %0.6f, iteration time %0.6f"
                        % (epoch, train_loss, time.time() - start_time))
                total_steps += 1

            # LR schedule steps once per epoch.
            for scheduler in schedulers.values():
                scheduler.step()

    # Final checkpoints after all epochs.
    for key, model in models.items():
        torch.save(
            model.state_dict(),
            os.path.join(checkpoints_dir, 'model_' + key + '_final.pth'))
    np.savetxt(os.path.join(checkpoints_dir, 'train_losses_final.txt'),
               np.array(train_losses))
def train(model, train_dataloader, epochs, lr, steps_til_summary,
          epochs_til_checkpoint, model_dir, loss_fn, pruning_fn, summary_fn,
          double_precision=False, clip_grad=False, loss_schedules=None,
          resume_checkpoint={}, objs_to_save={}, epochs_til_pruning=4):
    """Standard single-model training loop with periodic checkpointing,
    pruning, and tensorboard summaries.

    NOTE(review): mutable default arguments (`resume_checkpoint={}`,
    `objs_to_save={}`) — safe only if callers never mutate them.

    Args:
        pruning_fn: called as pruning_fn(model, dataset) every
            epochs_til_pruning epochs (skipped at epoch 0).
        loss_schedules: optional dict loss_name -> fn(step) giving a
            per-step weight for that loss term.
        resume_checkpoint: may carry 'optimizer_state_dict', 'total_steps',
            'epoch' to resume a run.
        objs_to_save: extra entries merged into every optim checkpoint
            (e.g. a quadtree state).
    """
    optim = torch.optim.Adam(lr=lr, params=model.parameters())

    # load optimizer if supplied
    if 'optimizer_state_dict' in resume_checkpoint:
        optim.load_state_dict(resume_checkpoint['optimizer_state_dict'])
        # Override the restored lr with the one requested for this run.
        for g in optim.param_groups:
            g['lr'] = lr

    # Ask before clobbering an existing run's summaries/checkpoints.
    if os.path.exists(os.path.join(model_dir, 'summaries')):
        val = input("The model directory %s exists. Overwrite? (y/n)" % model_dir)
        if val == 'y':
            if os.path.exists(os.path.join(model_dir, 'summaries')):
                shutil.rmtree(os.path.join(model_dir, 'summaries'))
            if os.path.exists(os.path.join(model_dir, 'checkpoints')):
                shutil.rmtree(os.path.join(model_dir, 'checkpoints'))
    os.makedirs(model_dir, exist_ok=True)

    summaries_dir = os.path.join(model_dir, 'summaries')
    utils.cond_mkdir(summaries_dir)
    checkpoints_dir = os.path.join(model_dir, 'checkpoints')
    utils.cond_mkdir(checkpoints_dir)
    writer = SummaryWriter(summaries_dir)

    total_steps = 0
    if 'total_steps' in resume_checkpoint:
        total_steps = resume_checkpoint['total_steps']
    start_epoch = 0
    if 'epoch' in resume_checkpoint:
        start_epoch = resume_checkpoint['epoch']

    with tqdm(total=len(train_dataloader) * epochs) as pbar:
        pbar.update(total_steps)
        train_losses = []
        for epoch in range(start_epoch, epochs):
            # Periodic checkpoint (skipped at epoch 0); files are keyed by
            # total_steps, not epoch.
            if not epoch % epochs_til_checkpoint and epoch:
                torch.save(model.state_dict(),
                           os.path.join(checkpoints_dir,
                                        'model_%06d.pth' % total_steps))
                np.savetxt(os.path.join(checkpoints_dir,
                                        'train_losses_%06d.txt' % total_steps),
                           np.array(train_losses))
                save_dict = {'epoch': epoch,
                             'total_steps': total_steps,
                             'optimizer_state_dict': optim.state_dict()}
                save_dict.update(objs_to_save)
                torch.save(save_dict,
                           os.path.join(checkpoints_dir,
                                        'optim_%06d.pth' % total_steps))

            # prune
            if not epoch % epochs_til_pruning and epoch:
                pruning_fn(model, train_dataloader.dataset)

            # Disable retiling on the epoch right before the next pruning
            # pass; enable it otherwise.
            if not (epoch + 1) % epochs_til_pruning:
                retile = False
            else:
                retile = True

            for step, (model_input, gt) in enumerate(train_dataloader):
                start_time = time.time()

                # Move tensor values to GPU; pass non-tensor values through.
                tmp = {}
                for key, value in model_input.items():
                    if isinstance(value, torch.Tensor):
                        tmp.update({key: value.cuda()})
                    else:
                        tmp.update({key: value})
                model_input = tmp
                tmp = {}
                for key, value in gt.items():
                    if isinstance(value, torch.Tensor):
                        tmp.update({key: value.cuda()})
                    else:
                        tmp.update({key: value})
                gt = tmp

                if double_precision:
                    model_input = {key: value.double()
                                   for key, value in model_input.items()}
                    gt = {key: value.double() for key, value in gt.items()}

                model_output = model(model_input)
                losses = loss_fn(model_output, gt, total_steps, retile=retile)

                train_loss = 0.
                for loss_name, loss in losses.items():
                    single_loss = loss.mean()
                    if loss_schedules is not None and loss_name in loss_schedules:
                        writer.add_scalar(loss_name + "_weight",
                                          loss_schedules[loss_name](total_steps),
                                          total_steps)
                        single_loss *= loss_schedules[loss_name](total_steps)
                    writer.add_scalar(loss_name, single_loss, total_steps)
                    train_loss += single_loss

                train_losses.append(train_loss.item())
                writer.add_scalar("total_train_loss", train_loss, total_steps)

                optim.zero_grad()
                train_loss.backward()
                if clip_grad:
                    if isinstance(clip_grad, bool):
                        # clip_grad=True uses a fixed max-norm of 1.
                        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                       max_norm=1.)
                    else:
                        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                       max_norm=clip_grad)
                optim.step()

                pbar.update(1)
                if not total_steps % steps_til_summary:
                    tqdm.write(
                        "Epoch %d, Total loss %0.6f, iteration time %0.6f"
                        % (epoch, train_loss, time.time() - start_time))
                    summary_fn(model, model_input, gt, model_output, writer,
                               total_steps)
                total_steps += 1

            # after epoch
            tqdm.write("Epoch %d, Total loss %0.6f, iteration time %0.6f"
                       % (epoch, train_loss, time.time() - start_time))

    # save model at end of epoch
    torch.save(model.state_dict(),
               os.path.join(checkpoints_dir,
                            'model_final_%06d.pth' % total_steps))
    np.savetxt(os.path.join(checkpoints_dir,
                            'train_losses_final_%06d.txt' % total_steps),
               np.array(train_losses))
    save_dict = {'epoch': epoch,
                 'total_steps': total_steps,
                 'optimizer_state_dict': optim.state_dict()}
    save_dict.update(objs_to_save)
    torch.save(save_dict,
               os.path.join(checkpoints_dir,
                            'optim_final_%06d.pth' % total_steps))
def main():
    """Entry point: build the image dataset, the adaptive-patch dataset
    wrapper and the model, optionally resume from a checkpoint, then either
    evaluate (opt.eval) or train.

    Reads everything from the module-level `opt` namespace.
    """
    # Select the source image.
    if opt.dataset == 'camera':
        img_dataset = dataio.Camera()
    elif opt.dataset == 'pluto':
        pluto_url = "https://upload.wikimedia.org/wikipedia/commons/e/ef/Pluto_in_True_Color_-_High-Res.jpg"
        img_dataset = dataio.ImageFile('../data/pluto.jpg',
                                       url=pluto_url,
                                       grayscale=opt.grayscale)
    elif opt.dataset == 'tokyo':
        img_dataset = dataio.ImageFile('../data/tokyo.tif',
                                       grayscale=opt.grayscale)
    elif opt.dataset == 'mars':
        img_dataset = dataio.ImageFile('../data/mars.tif',
                                       grayscale=opt.grayscale)
    # NOTE(review): an unknown opt.dataset leaves img_dataset undefined and
    # raises NameError below — consider an explicit else/raise.

    # A single patch-size value is replicated to 3 dims:
    # (feature depth, height, width).
    if len(opt.patch_size) == 1:
        opt.patch_size = 3*opt.patch_size

    # set up dataset
    coord_dataset = dataio.Patch2DWrapperMultiscaleAdaptive(
        img_dataset,
        sidelength=opt.res,
        patch_size=opt.patch_size[1:],
        jitter=True,
        num_workers=opt.num_workers,
        length=opt.steps_til_tiling,
        scale_init=opt.scale_init,
        max_patches=opt.max_patches)
    # Convert an iteration budget into an epoch count.
    opt.num_epochs = opt.num_iters // coord_dataset.__len__()
    image_resolution = (opt.res, opt.res)

    dataloader = DataLoader(coord_dataset,
                            shuffle=False,
                            batch_size=1,
                            pin_memory=True,
                            num_workers=opt.num_workers)

    if opt.resume is not None:
        # opt.resume = (checkpoint directory, iteration number).
        # NOTE(review): `iter` shadows the builtin.
        path, iter = opt.resume
        iter = int(iter)
        assert(os.path.isdir(path))
        assert opt.config is not None, 'Specify config file'

    # Define the model.
    if opt.grayscale:
        out_features = 1
    else:
        out_features = 3
    if opt.model_type == 'multiscale':
        model = modules.ImplicitAdaptivePatchNet(
            in_features=3,
            out_features=out_features,
            num_hidden_layers=opt.hidden_layers,
            hidden_features=opt.hidden_features,
            feature_grid_size=(opt.patch_size[0], opt.patch_size[1],
                               opt.patch_size[2]),
            sidelength=opt.res,
            num_encoding_functions=10,
            patch_size=opt.patch_size[1:])
    elif opt.model_type == 'siren':
        model = modules.ImplicitNet(opt.res,
                                    in_features=2,
                                    out_features=out_features,
                                    num_hidden_layers=4,
                                    hidden_features=1536,
                                    mode='siren',
                                    w0=opt.w0)
    elif opt.model_type == 'pe':
        model = modules.ImplicitNet(opt.res,
                                    in_features=2,
                                    out_features=out_features,
                                    num_hidden_layers=4,
                                    hidden_features=1536,
                                    mode='pe')
    else:
        raise NotImplementedError('Only model types multiscale, siren, and pe are implemented')
    model.cuda()

    # print number of model parameters
    model_parameters = filter(lambda p: p.requires_grad, model.parameters())
    params = sum([np.prod(p.size()) for p in model_parameters])
    print(f'Num. Parameters: {params}')

    # Define the loss
    loss_fn = partial(loss_functions.image_mse,
                      tiling_every=opt.steps_til_tiling,
                      dataset=coord_dataset,
                      model_type=opt.model_type)
    summary_fn = partial(utils.write_image_patch_multiscale_summary,
                         image_resolution,
                         opt.patch_size[1:],
                         coord_dataset,
                         model_type=opt.model_type,
                         skip=opt.skip_logging)

    # Define the pruning function
    pruning_fn = partial(pruning_functions.no_pruning, pruning_every=1)

    # if we are resuming from a saved checkpoint
    if opt.resume is not None:
        print('Loading checkpoints')
        model_dict = torch.load(path + '/checkpoints/' + f'model_{iter:06d}.pth')
        model.load_state_dict(model_dict)

        # load optimizers
        try:
            resume_checkpoint = {}
            optim_dict = torch.load(path + '/checkpoints/' + f'optim_{iter:06d}.pth')
            # Force the requested lr onto the restored optimizer state.
            for g in optim_dict['optimizer_state_dict']['param_groups']:
                g['lr'] = opt.lr
            resume_checkpoint['optimizer_state_dict'] = optim_dict['optimizer_state_dict']
            resume_checkpoint['total_steps'] = optim_dict['total_steps']
            resume_checkpoint['epoch'] = optim_dict['epoch']

            # initialize model state_dict
            print('Initializing models')
            coord_dataset.quadtree.__load__(optim_dict['quadtree'])
            coord_dataset.synchronize()
        except FileNotFoundError:
            print('Unable to load optimizer checkpoints')
    else:
        resume_checkpoint = {}

    if opt.eval:
        run_eval(model, coord_dataset)
    else:
        # Save command-line parameters log directory.
        root_path = os.path.join(opt.logging_root, opt.experiment_name)
        utils.cond_mkdir(root_path)
        p.write_config_file(opt, [os.path.join(root_path, 'config.ini')])

        # Save text summary of model into log directory.
        with open(os.path.join(root_path, "model.txt"), "w") as out_file:
            out_file.write(str(model))

        # Persist the quadtree in every optimizer checkpoint so eval/resume
        # can rebuild the tiling.
        objs_to_save = {'quadtree': coord_dataset.quadtree}

        training.train(model=model,
                       train_dataloader=dataloader,
                       epochs=opt.num_epochs,
                       lr=opt.lr,
                       steps_til_summary=opt.steps_til_summary,
                       epochs_til_checkpoint=opt.epochs_til_ckpt,
                       model_dir=root_path,
                       loss_fn=loss_fn,
                       pruning_fn=pruning_fn,
                       summary_fn=summary_fn,
                       objs_to_save=objs_to_save,
                       resume_checkpoint=resume_checkpoint)
def run_eval(model, coord_dataset):
    """Evaluate every saved checkpoint: reconstruct the full image from
    patch predictions, compute PSNR/SSIM, and save images, tiling plots,
    and a metrics dict under <config dir>/eval.

    Args:
        model: network whose weights are overwritten per checkpoint.
        coord_dataset: adaptive-patch dataset; its quadtree is restored from
            each optim checkpoint before evaluation.
    """
    # get checkpoint directory
    checkpoint_dir = os.path.join(os.path.dirname(opt.config), 'checkpoints')

    # make eval directory
    eval_dir = os.path.join(os.path.dirname(opt.config), 'eval')
    utils.cond_mkdir(eval_dir)

    # get model & optim files (reverse-sorted so newest names come first;
    # model/optim lists stay aligned because they share the numbering)
    model_files = sorted([f for f in os.listdir(checkpoint_dir)
                          if re.search(r'model_[0-9]+.pth', f)], reverse=True)
    optim_files = sorted([f for f in os.listdir(checkpoint_dir)
                          if re.search(r'optim_[0-9]+.pth', f)], reverse=True)

    # extract iterations
    iters = [int(re.search(r'[0-9]+', f)[0]) for f in model_files]

    # append beginning of path
    model_files = [os.path.join(checkpoint_dir, f) for f in model_files]
    optim_files = [os.path.join(checkpoint_dir, f) for f in optim_files]

    # iterate through model and optim files
    metrics = {}
    saved_gt = False
    for curr_iter, model_path, optim_path in zip(tqdm(iters), model_files, optim_files):
        # load model and optimizer files
        print('Loading models')
        model_dict = torch.load(model_path)
        optim_dict = torch.load(optim_path)

        # initialize model state_dict
        print('Initializing models')
        model.load_state_dict(model_dict)
        coord_dataset.quadtree.__load__(optim_dict['quadtree'])
        coord_dataset.synchronize()

        # save image and calculate psnr: eval mode yields the full uniform
        # sample grid rather than the training patches.
        coord_dataset.toggle_eval()
        model_input, gt = coord_dataset[0]
        coord_dataset.toggle_eval()

        # convert to cuda and add batch dimension
        # NOTE(review): despite the comment, tensors are moved to .cpu()
        # here — confirm whether .cuda() was intended.
        tmp = {}
        for key, value in model_input.items():
            if isinstance(value, torch.Tensor):
                tmp.update({key: value[None, ...].cpu()})
            else:
                tmp.update({key: value})
        model_input = tmp
        tmp = {}
        for key, value in gt.items():
            if isinstance(value, torch.Tensor):
                tmp.update({key: value[None, ...].cpu()})
            else:
                tmp.update({key: value})
        gt = tmp

        # run the model on uniform samples
        print('Running forward pass')
        n_channels = gt['img'].shape[-1]
        start = time()
        with torch.no_grad():
            pred_img = utils.process_batch_in_chunks(
                model_input, model, max_chunk_size=512)['model_out']['output']
        torch.cuda.synchronize()
        print(f'Model: {time() - start:.02f}')

        # get pixel idx for each coordinate: map [-1, 1] coords to integer
        # row/col indices in the full-resolution image.
        start = time()
        coords = model_input['fine_abs_coords'].detach().cpu().numpy()
        pixel_idx = np.zeros_like(coords).astype(np.int32)
        pixel_idx[..., 0] = np.round((coords[..., 0] + 1.)/2. * (coord_dataset.sidelength[0]-1)).astype(np.int32)
        pixel_idx[..., 1] = np.round((coords[..., 1] + 1.)/2. * (coord_dataset.sidelength[1]-1)).astype(np.int32)
        pixel_idx = pixel_idx.reshape(-1, 2)

        # assign predicted image values into a new array
        # need to use numpy since it supports index assignment
        pred_img = pred_img.detach().cpu().numpy().reshape(-1, n_channels)
        display_pred = np.zeros((*coord_dataset.sidelength, n_channels))
        display_pred[[pixel_idx[:, 0]], [pixel_idx[:, 1]]] = pred_img
        display_pred = torch.tensor(display_pred)[None, ...]
        display_pred = display_pred.permute(0, 3, 1, 2)

        # Ground truth only needs to be scattered once; it is identical for
        # every checkpoint.
        if not saved_gt:
            gt_img = gt['img'].detach().cpu().numpy().reshape(-1, n_channels)
            display_gt = np.zeros((*coord_dataset.sidelength, n_channels))
            display_gt[[pixel_idx[:, 0]], [pixel_idx[:, 1]]] = gt_img
            display_gt = torch.tensor(display_gt)[None, ...]
            display_gt = display_gt.permute(0, 3, 1, 2)
        print(f'Reshape: {time() - start:.02f}')

        # record metrics
        start = time()
        psnr, ssim = get_metrics(display_pred, display_gt)
        metrics.update({curr_iter: {'psnr': psnr, 'ssim': ssim}})
        print(f'Metrics: {time() - start:.02f}')
        print(f'Iter: {curr_iter}, PSNR: {psnr:.02f}')

        # save images: map [-1, 1] back to [0, 255] uint8 RGB.
        pred_out = np.clip((display_pred.squeeze().numpy()/2.) + 0.5,
                           a_min=0., a_max=1.).transpose(1, 2, 0)*255
        pred_out = pred_out.astype(np.uint8)
        pred_fname = os.path.join(eval_dir, f'pred_{curr_iter:06d}.png')
        print('Saving image')
        cv2.imwrite(pred_fname, cv2.cvtColor(pred_out, cv2.COLOR_RGB2BGR))

        if not saved_gt:
            print('Saving gt')
            gt_out = np.clip((display_gt.squeeze().numpy()/2.) + 0.5,
                             a_min=0., a_max=1.).transpose(1, 2, 0)*255
            gt_out = gt_out.astype(np.uint8)
            gt_fname = os.path.join(eval_dir, 'gt.png')
            cv2.imwrite(gt_fname, cv2.cvtColor(gt_out, cv2.COLOR_RGB2BGR))
            saved_gt = True

        # save tiling (quadtree visualization) for this checkpoint
        tiling_fname = os.path.join(eval_dir, f'tiling_{curr_iter:06d}.pdf')
        coord_dataset.quadtree.draw()
        plt.savefig(tiling_fname)

    # save metrics — file is named after the last processed iteration but
    # contains the metrics of all checkpoints.
    metric_fname = os.path.join(eval_dir, f'metrics_{curr_iter:06d}.npy')
    np.save(metric_fname, metrics)
def train(model, train_dataloader, epochs, lr, steps_til_summary,
          epochs_til_checkpoint, model_dir, loss_fn, summary_fn,
          val_dataloader=None, double_precision=False, clip_grad=False,
          use_lbfgs=False, loss_schedules=None):
    """Meta-learning (MAML-style) training loop: per batch, adapt the model
    on each task's train split, evaluate the adapted params on the test
    split, and step the outer optimizer on the mean outer loss.

    NOTE(review): the inner adapt() call is fed the whole-batch
    model_input/gt for every task rather than a per-task slice — confirm
    whether per-task indexing was intended.
    """
    optim = torch.optim.Adam(lr=lr, params=model.parameters())

    if os.path.exists(model_dir):
        val = input("The model directory %s exists. Overwrite? (y/n)" % model_dir)
        #val = 'y'
        if val == 'y':
            shutil.rmtree(model_dir)
    os.makedirs(model_dir)

    summaries_dir = os.path.join(model_dir, 'summaries')
    utils.cond_mkdir(summaries_dir)
    checkpoints_dir = os.path.join(model_dir, 'checkpoints')
    utils.cond_mkdir(checkpoints_dir)
    writer = SummaryWriter(summaries_dir)

    total_steps = 0
    with tqdm(total=len(train_dataloader) * epochs) as pbar:
        train_losses = []
        for epoch in range(epochs):
            # Periodic checkpoint (skipped at epoch 0).
            if not epoch % epochs_til_checkpoint and epoch:
                torch.save(
                    model.state_dict(),
                    os.path.join(checkpoints_dir,
                                 'model_epoch_%04d.pth' % epoch))
                np.savetxt(
                    os.path.join(checkpoints_dir,
                                 'train_losses_epoch_%04d.txt' % epoch),
                    np.array(train_losses))

            for step, batch in enumerate(train_dataloader):
                start_time = time.time()
                model_input, gt = convert_metadata(batch['train'])
                test_model_input, test_gt = convert_metadata(batch['test'])
                _, train_targets = batch['train']
                _, test_targets = batch['test']
                num_tasks = test_targets.size(0)
                num_adaptation_steps = 10
                step_size = 0.001
                results = {
                    'num_tasks': num_tasks,
                    'inner_losses': np.zeros(
                        (num_adaptation_steps, num_tasks), dtype=np.float32),
                    'outer_losses': np.zeros((num_tasks, ), dtype=np.float32),
                    'mean_outer_loss': 0.
                }
                mean_outer_loss = torch.tensor(0.).cuda()

                # BUG FIX: zero the outer-loop gradients each step — the
                # original never called zero_grad (it was commented out
                # below), so gradients accumulated across iterations.
                optim.zero_grad()

                for task_num in range(train_targets.shape[0]):
                    # Inner loop: adapt parameters on the task's train data.
                    params, adaptation_results = adapt(
                        model,
                        loss_fn,
                        model_input,
                        gt,
                        num_adaptation_steps=num_adaptation_steps,
                        step_size=step_size,
                        writer=writer,
                        summary_fn=summary_fn)
                    results['inner_losses'][:, task_num] = adaptation_results[
                        'inner_losses']

                    # Outer loss: adapted params on the task's test data.
                    test_model_output = model(test_model_input,
                                              test=True,
                                              params=params)
                    outer_loss = loss_fn(test_model_output, test_gt)
                    img_loss = outer_loss['img_loss']
                    results['outer_losses'][task_num] = img_loss
                    mean_outer_loss += img_loss

                # BUG FIX: was div_(task_num) — that divides by the LAST
                # loop index (N-1), not the task count, and divides by zero
                # when there is a single task.
                mean_outer_loss.div_(train_targets.shape[0])
                results['mean_outer_loss'] = mean_outer_loss.item()

                writer.add_scalar('mean_outer_loss',
                                  results['mean_outer_loss'], step)
                mean_outer_loss.backward()
                optim.step()

                ###################
                # Aggregate and log the last task's outer loss components.
                train_loss = 0.0
                for loss_name, loss in outer_loss.items():
                    single_loss = loss.mean()
                    writer.add_scalar(loss_name, single_loss, total_steps)
                    train_loss += single_loss
                train_losses.append(train_loss.item())

                if not total_steps % steps_til_summary:
                    torch.save(
                        model.state_dict(),
                        os.path.join(checkpoints_dir, 'model_current.pth'))
                    summary_fn(model,
                               test_model_input,
                               test_gt,
                               test_model_output,
                               writer,
                               total_steps,
                               inner=False)

                pbar.update(1)
                if not total_steps % steps_til_summary:
                    tqdm.write(
                        "Epoch %d, Total loss %0.6f, iteration time %0.6f"
                        % (epoch, train_loss, time.time() - start_time))
                total_steps += 1

        torch.save(model.state_dict(),
                   os.path.join(checkpoints_dir, 'model_final.pth'))
        np.savetxt(os.path.join(checkpoints_dir, 'train_losses_final.txt'),
                   np.array(train_losses))
def getTestMSE(dataloader, subdir):
    """Run the (module-level) model over a test dataloader, save prediction /
    ground-truth / sparse-sample images, and return per-sample MSE and PSNR.

    Relies on module-level globals: `model`, `root_path`, `image_resolution`,
    `to_uint8` — TODO confirm they are defined by the enclosing script.

    Returns:
        (MSEs, PSNRs): parallel lists of per-sample metrics.
    """
    MSEs = []
    PSNRs = []
    total_steps = 0
    utils.cond_mkdir(os.path.join(root_path, subdir))
    utils.cond_mkdir(os.path.join(root_path, 'ground_truth'))
    with tqdm(total=len(dataloader)) as pbar:
        for step, (model_input, gt) in enumerate(dataloader):
            # 'idx' arrives as a scalar; wrap it into a long tensor so the
            # .cuda() dict-comprehension below works uniformly.
            model_input['idx'] = torch.Tensor([model_input['idx']]).long()
            model_input = {
                key: value.cuda()
                for key, value in model_input.items()
            }
            gt = {key: value.cuda() for key, value in gt.items()}
            with torch.no_grad():
                model_output = model(model_input)
            # Model output is in [-1, 1]; map to [0, 1] for image export.
            out_img = dataio.lin2img(model_output['model_out'],
                                     image_resolution).squeeze().permute(
                                         1, 2, 0).detach().cpu().numpy()
            out_img += 1
            out_img /= 2.
            out_img = np.clip(out_img, 0., 1.)
            gt_img = dataio.lin2img(gt['img'],
                                    image_resolution).squeeze().permute(
                                        1, 2, 0).detach().cpu().numpy()
            gt_img += 1
            gt_img /= 2.
            gt_img = np.clip(gt_img, 0., 1.)
            # Rasterize the sparse coordinate samples onto a white canvas.
            # NOTE(review): the hard-coded 31 assumes a 32x32 image — it
            # should presumably be image_resolution[i] - 1; confirm.
            sparse_img = np.ones((image_resolution[0], image_resolution[1], 3))
            coords_sub = model_input['coords_sub'].squeeze().detach().cpu(
            ).numpy()
            rgb_sub = model_input['img_sub'].squeeze().detach().cpu().numpy()
            for index in range(0, coords_sub.shape[0]):
                r = int(round((coords_sub[index][0] + 1) / 2 * 31))
                c = int(round((coords_sub[index][1] + 1) / 2 * 31))
                sparse_img[r, c, :] = np.clip((rgb_sub[index, :] + 1) / 2,
                                              0., 1.)
            imageio.imwrite(
                os.path.join(root_path, subdir,
                             str(total_steps) + '_sparse.png'),
                to_uint8(sparse_img))
            imageio.imwrite(
                os.path.join(root_path, subdir, str(total_steps) + '.png'),
                to_uint8(out_img))
            imageio.imwrite(
                os.path.join(root_path, 'ground_truth',
                             str(total_steps) + '.png'),
                to_uint8(gt_img))
            MSE = np.mean((out_img - gt_img)**2)
            MSEs.append(MSE)
            # NOTE(review): skimage.measure.compare_psnr was removed in
            # scikit-image 0.18; modern releases use
            # skimage.metrics.peak_signal_noise_ratio.
            PSNR = skimage.measure.compare_psnr(out_img, gt_img, data_range=1)
            PSNRs.append(PSNR)
            pbar.update(1)
            total_steps += 1
    return MSEs, PSNRs
def train_with_signed_distance(model, train_dataloader, val_dataloader, epochs,
                               lr, steps_til_summary, epochs_til_checkpoint,
                               model_dir, supervision='dense'):
    """Train `model` to regress signed distances with Adam.

    Args:
        model: network exposing `legacy_forward(**inputs) -> (pred_sd, z)`.
        train_dataloader: yields (model_input, gt) dict pairs.
        val_dataloader: optional meta-batch loader for validation, or None.
        epochs, lr: number of epochs and Adam learning rate.
        steps_til_summary: steps between summary/validation passes.
        epochs_til_checkpoint: epochs between periodic checkpoints.
        model_dir: output directory for summaries and checkpoints.
        supervision: 'dense' (supervise all coords) or 'levelset'
            (supervise only the level-set samples).
    """
    assert (supervision in ['levelset', 'dense'])
    optim = torch.optim.Adam(lr=lr, params=model.parameters())

    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    summaries_dir = os.path.join(model_dir, 'summaries')
    utils.cond_mkdir(summaries_dir)
    checkpoints_dir = os.path.join(model_dir, 'checkpoints')
    utils.cond_mkdir(checkpoints_dir)
    writer = SummaryWriter(summaries_dir)

    total_steps = 0
    with tqdm(total=len(train_dataloader) * epochs) as pbar:
        train_losses = []
        for epoch in range(epochs):
            for step, (model_input, gt) in enumerate(train_dataloader):
                start_time = time.time()
                model_input = {key: value.cuda() for key, value in model_input.items()}
                gt = {key: value.cuda() for key, value in gt.items()}

                if supervision == 'levelset':
                    # Supervise only on the level-set samples: swap them in as
                    # the network's input coordinates.
                    model_input_level_set = model_input.copy()
                    model_input_level_set['coords'] = model_input_level_set['level_set']
                    model_input_level_set = {key: value.cuda() for key, value in model_input_level_set.items()}
                    pred_sd, z = model.legacy_forward(**model_input_level_set)
                    # L2 penalty on the latent code z regularizes the embedding.
                    loss = modules.sdf_loss(pred_sd, gt['ls_sds']) + torch.mean(z ** 2)
                elif supervision == 'dense':
                    # Use standard coords
                    pred_sd, z = model.legacy_forward(**model_input)
                    loss = modules.sdf_loss(pred_sd, gt['sds']) + torch.mean(z ** 2)

                train_losses.append(loss.item())
                writer.add_scalar("train_loss", loss, total_steps)

                optim.zero_grad()
                loss.backward()
                optim.step()
                pbar.update(1)

                if not total_steps % steps_til_summary:
                    corrected_loss = utils.evaluate_model(model, train_dataloader)
                    writer.add_scalar('corrected_loss', corrected_loss, total_steps)
                    pred_sd, z = model.legacy_forward(**model_input)
                    tqdm.write("Epoch %d, Total loss %0.6f, iteration time %0.6f"
                               % (epoch, corrected_loss, time.time() - start_time))
                    utils.write_summaries(pred_sd, model_input, gt, writer,
                                          total_steps, 'train_')

                    if val_dataloader is not None:
                        print("Running validation set...")
                        model.eval()
                        with torch.no_grad():
                            val_losses = []
                            for meta_batch in val_dataloader:
                                # NOTE(review): unlike the training path this
                                # calls model(...) instead of legacy_forward --
                                # confirm the forward returns a single tensor.
                                pred_sd = model(meta_batch)
                                val_loss = modules.sdf_loss(
                                    pred_sd, meta_batch['test'][1].cuda())
                                # BUGFIX: store python floats; np.mean() on a
                                # list of CUDA tensors raises.
                                val_losses.append(val_loss.item())
                            writer.add_scalar("val_loss", np.mean(val_losses),
                                              total_steps)
                            utils.write_meta_summaries(pred_sd, meta_batch,
                                                       writer, total_steps,
                                                       'val_')
                        model.train()

                total_steps += 1

            if not epoch % epochs_til_checkpoint and epoch:
                # BUGFIX: periodic checkpoints previously overwrote
                # 'model_final.pth'; use epoch-numbered names (like the
                # sibling train() variants) so intermediate states survive.
                torch.save(model.state_dict(),
                           os.path.join(checkpoints_dir,
                                        'model_epoch_%04d.pth' % epoch))
                np.savetxt(os.path.join(checkpoints_dir,
                                        'train_losses_epoch_%04d.txt' % epoch),
                           np.array(train_losses))

    torch.save(model.state_dict(),
               os.path.join(checkpoints_dir, 'model_final.pth'))
    np.savetxt(os.path.join(checkpoints_dir, 'train_losses_final.txt'),
               np.array(train_losses))
def train_with_signed_distance_meta(model, train_dataloader, val_dataloader,
                                    epochs, lr, steps_til_summary,
                                    epochs_til_checkpoint, model_dir):
    """Meta-train `model` on signed-distance meta-batches.

    Each meta-batch supplies its supervision target in `meta_batch['test'][1]`.
    Checkpoints and loss logs are written under `model_dir`.
    """
    optim = torch.optim.Adam(lr=lr, params=model.parameters())

    # Optionally wipe an existing experiment directory before starting.
    if os.path.exists(model_dir):
        val = input("The model directory %s exists. Overwrite? (y/n)" % model_dir)
        if val == 'y':
            shutil.rmtree(model_dir)
    os.makedirs(model_dir)

    summaries_dir = os.path.join(model_dir, 'summaries')
    utils.cond_mkdir(summaries_dir)
    checkpoints_dir = os.path.join(model_dir, 'checkpoints')
    utils.cond_mkdir(checkpoints_dir)

    num_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f'\n\nTraining model with {num_parameters} parameters\n\n')

    writer = SummaryWriter(summaries_dir)

    total_steps = 0
    with tqdm(total=len(train_dataloader) * epochs) as pbar:
        train_losses = []
        for epoch in range(epochs):
            for step, meta_batch in enumerate(train_dataloader):
                start_time = time.time()
                pred_sd, _ = model(meta_batch)
                loss = modules.sdf_loss(pred_sd, meta_batch['test'][1].cuda())
                # BUGFIX: store floats rather than CUDA tensors; np.array /
                # np.savetxt on a list of tensors fails at checkpoint time.
                train_losses.append(loss.item())
                writer.add_scalar("train_loss", loss, total_steps)

                optim.zero_grad()
                loss.backward()
                optim.step()
                pbar.update(1)

                tqdm.write("Epoch %d, Total loss %0.6f, iteration time %0.6f"
                           % (epoch, loss, time.time() - start_time))

                if not total_steps % steps_til_summary:
                    utils.write_meta_summaries(pred_sd, meta_batch, writer,
                                               total_steps, 'train_')

                    print("Running validation set...")
                    model.eval()
                    with torch.no_grad():
                        val_losses = []
                        for val_idx, meta_batch in enumerate(val_dataloader):
                            pred_sd, _ = model(meta_batch)
                            val_loss = modules.sdf_loss(
                                pred_sd, meta_batch['test'][1].cuda())
                            val_losses.append(val_loss.cpu().numpy())
                            # Summarize only the first validation batch.
                            if not val_idx:
                                utils.write_meta_summaries(pred_sd, meta_batch,
                                                           writer, total_steps,
                                                           'val_')
                        mean_val_loss = np.mean(val_losses)
                        writer.add_scalar("val_loss", mean_val_loss, total_steps)
                        # BUGFIX: previously printed the last *training* loss
                        # under the "Validation loss" label.
                        tqdm.write("Validation loss %0.6e" % mean_val_loss)
                    model.train()

                total_steps += 1

            if not epoch % epochs_til_checkpoint:
                # NOTE(review): also fires at epoch 0 (untrained snapshot);
                # sibling train() variants guard with `and epoch`. Kept as-is
                # for output compatibility.
                torch.save(model.state_dict(),
                           os.path.join(checkpoints_dir,
                                        'epoch_%03d.pth' % epoch))
                np.savetxt(os.path.join(checkpoints_dir,
                                        'train_losses_epoch_%03d.txt' % epoch),
                           np.array(train_losses))

    torch.save(model.state_dict(),
               os.path.join(checkpoints_dir, 'model_final.pth'))
    np.savetxt(os.path.join(checkpoints_dir, 'train_losses_final.txt'),
               np.array(train_losses))
def train(model, train_dataloader, epochs, lr, steps_til_summary,
          epochs_til_checkpoint, model_dir, loss_fn, summary_fn,
          val_dataloader=None, double_precision=False, clip_grad=False,
          use_lbfgs=False, loss_schedules=None):
    """Generic training loop over (model_input, gt) dict batches.

    `loss_fn(model_output, gt)` returns a dict of named loss tensors that are
    mean-reduced, optionally reweighted by `loss_schedules`, and summed.

    Args:
        clip_grad: False to disable, True for max_norm=1., or a float max norm.
        use_lbfgs: use LBFGS (with closure) instead of Adam.
        loss_schedules: optional dict mapping loss name -> fn(step) -> weight.
    """
    optim = torch.optim.Adam(lr=lr, params=model.parameters())

    # copy settings from Raissi et al. (2019) and here
    # https://github.com/maziarraissi/PINNs
    if use_lbfgs:
        optim = torch.optim.LBFGS(lr=lr, params=model.parameters(),
                                  max_iter=50000, max_eval=50000,
                                  history_size=50,
                                  line_search_fn='strong_wolfe')

    if os.path.exists(model_dir):
        # val = input("The model directory %s exists. Overwrite? (y/n)" % model_dir)
        val = 'y'
        if val == 'y':
            shutil.rmtree(model_dir)
    os.makedirs(model_dir)

    summaries_dir = os.path.join(model_dir, 'summaries')
    utils.cond_mkdir(summaries_dir)
    checkpoints_dir = os.path.join(model_dir, 'checkpoints')
    utils.cond_mkdir(checkpoints_dir)

    writer = SummaryWriter(summaries_dir)

    total_steps = 0
    with tqdm(total=len(train_dataloader) * epochs) as pbar:
        train_losses = []
        for epoch in range(epochs):
            if not epoch % epochs_til_checkpoint and epoch:
                torch.save(
                    model.state_dict(),
                    os.path.join(checkpoints_dir,
                                 'model_epoch_%04d.pth' % epoch))
                np.savetxt(
                    os.path.join(checkpoints_dir,
                                 'train_losses_epoch_%04d.txt' % epoch),
                    np.array(train_losses))

            for step, (model_input, gt) in enumerate(train_dataloader):
                start_time = time.time()
                model_input = {
                    key: value.cuda()
                    for key, value in model_input.items()
                }
                gt = {key: value.cuda() for key, value in gt.items()}

                # BUGFIX: double_precision was accepted but ignored; honor it
                # like the sibling train() variant does.
                if double_precision:
                    model_input = {key: value.double()
                                   for key, value in model_input.items()}
                    gt = {key: value.double() for key, value in gt.items()}

                # BUGFIX: with use_lbfgs the original never called
                # optim.step(closure), so LBFGS never updated the model.
                if use_lbfgs:
                    def closure():
                        optim.zero_grad()
                        model_output = model(model_input)
                        losses = loss_fn(model_output, gt)
                        train_loss = 0.
                        for loss_name, loss in losses.items():
                            train_loss += loss.mean()
                        train_loss.backward()
                        return train_loss
                    optim.step(closure)

                model_output = model(model_input)
                losses = loss_fn(model_output, gt)

                train_loss = 0.
                for loss_name, loss in losses.items():
                    single_loss = loss.mean()
                    if loss_schedules is not None and loss_name in loss_schedules:
                        writer.add_scalar(
                            loss_name + "_weight",
                            loss_schedules[loss_name](total_steps), total_steps)
                        single_loss *= loss_schedules[loss_name](total_steps)
                    writer.add_scalar(loss_name, single_loss, total_steps)
                    train_loss += single_loss

                train_losses.append(train_loss.item())
                writer.add_scalar("total_train_loss", train_loss, total_steps)

                if not total_steps % steps_til_summary:
                    torch.save(
                        model.state_dict(),
                        os.path.join(checkpoints_dir, 'model_current.pth'))
                    summary_fn(model, model_input, gt, model_output, writer,
                               total_steps)

                if not use_lbfgs:
                    optim.zero_grad()
                    train_loss.backward()
                    # BUGFIX: clip_grad was accepted but never applied.
                    if clip_grad:
                        if isinstance(clip_grad, bool):
                            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                           max_norm=1.)
                        else:
                            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                           max_norm=clip_grad)
                    optim.step()

                pbar.update(1)

                if not total_steps % steps_til_summary:
                    tqdm.write(
                        "Epoch %d, Total loss %0.6f, iteration time %0.6f"
                        % (epoch, train_loss, time.time() - start_time))

                    if val_dataloader is not None:
                        print("Running validation set...")
                        model.eval()
                        with torch.no_grad():
                            val_losses = []
                            for (model_input, gt) in val_dataloader:
                                # BUGFIX: validation batches come off the
                                # loader on the CPU; move them like the
                                # training path does.
                                model_input = {key: value.cuda()
                                               for key, value in model_input.items()}
                                gt = {key: value.cuda()
                                      for key, value in gt.items()}
                                model_output = model(model_input)
                                val_loss = loss_fn(model_output, gt)
                                # BUGFIX: loss_fn returns a dict; reduce it to
                                # one float so np.mean() below works.
                                val_losses.append(
                                    sum(l.mean().item()
                                        for l in val_loss.values()))
                            writer.add_scalar("val_loss", np.mean(val_losses),
                                              total_steps)
                        model.train()

                total_steps += 1

    torch.save(model.state_dict(),
               os.path.join(checkpoints_dir, 'model_final.pth'))
    np.savetxt(os.path.join(checkpoints_dir, 'train_losses_final.txt'),
               np.array(train_losses))
def train(model, train_dataloader, epochs, lr, steps_til_summary,
          epochs_til_checkpoint, model_dir, loss_fn, summary_fn=None,
          val_dataloader=None, double_precision=False, clip_grad=False,
          use_lbfgs=False, loss_schedules=None, validation_fn=None,
          start_epoch=0):
    """Generic training loop with resumable epoch checkpoints.

    Checkpoints store model + optimizer state + epoch so training can be
    resumed exactly via `start_epoch` (saving the optimizer state keeps Adam
    moments consistent across a restart). Runs on GPU when available, else CPU.

    Args:
        validation_fn: optional fn(model, checkpoints_dir, epoch) run at each
            checkpoint epoch.
        start_epoch: >0 resumes from 'model_epoch_%04d.pth' % start_epoch.
        Other args mirror the sibling train() variants.
    """
    optim = torch.optim.Adam(lr=lr, params=model.parameters())

    # copy settings from Raissi et al. (2019) and here
    # https://github.com/maziarraissi/PINNs
    if use_lbfgs:
        optim = torch.optim.LBFGS(lr=lr, params=model.parameters(),
                                  max_iter=50000, max_eval=50000,
                                  history_size=50,
                                  line_search_fn='strong_wolfe')

    # Load the checkpoint if required
    if start_epoch > 0:
        # Load the model and start training from that point onwards
        model_path = os.path.join(model_dir, 'checkpoints',
                                  'model_epoch_%04d.pth' % start_epoch)
        checkpoint = torch.load(model_path)
        model.load_state_dict(checkpoint['model'])
        model.train()
        optim.load_state_dict(checkpoint['optimizer'])
        # Allow the caller to override the stored learning rate.
        optim.param_groups[0]['lr'] = lr
        assert (start_epoch == checkpoint['epoch'])
    else:
        # Start training from scratch
        if os.path.exists(model_dir):
            val = input("The model directory %s exists. Overwrite? (y/n)"
                        % model_dir)
            if val == 'y':
                shutil.rmtree(model_dir)
        os.makedirs(model_dir)

    summaries_dir = os.path.join(model_dir, 'summaries')
    utils.cond_mkdir(summaries_dir)
    checkpoints_dir = os.path.join(model_dir, 'checkpoints')
    utils.cond_mkdir(checkpoints_dir)

    writer = SummaryWriter(summaries_dir)

    total_steps = 0
    with tqdm(total=len(train_dataloader) * epochs) as pbar:
        train_losses = []
        for epoch in range(start_epoch, epochs):
            if not epoch % epochs_til_checkpoint and epoch:
                # Saving the optimizer state is important to produce
                # consistent results on resume.
                checkpoint = {
                    'epoch': epoch,
                    'model': model.state_dict(),
                    'optimizer': optim.state_dict()
                }
                torch.save(
                    checkpoint,
                    os.path.join(checkpoints_dir,
                                 'model_epoch_%04d.pth' % epoch))
                np.savetxt(
                    os.path.join(checkpoints_dir,
                                 'train_losses_epoch_%04d.txt' % epoch),
                    np.array(train_losses))
                if validation_fn is not None:
                    validation_fn(model, checkpoints_dir, epoch)

            for step, (model_input, gt) in enumerate(train_dataloader):
                start_time = time.time()

                # Move batches to whatever device the model runs on.
                if torch.cuda.is_available():
                    model_input = {
                        key: value.cuda()
                        for key, value in model_input.items()
                    }
                    gt = {key: value.cuda() for key, value in gt.items()}
                else:
                    model_input = {
                        key: value.cpu()
                        for key, value in model_input.items()
                    }
                    gt = {key: value.cpu() for key, value in gt.items()}

                if double_precision:
                    model_input = {
                        key: value.double()
                        for key, value in model_input.items()
                    }
                    gt = {key: value.double() for key, value in gt.items()}

                if use_lbfgs:
                    def closure():
                        optim.zero_grad()
                        model_output = model(model_input)
                        losses = loss_fn(model_output, gt)
                        train_loss = 0.
                        for loss_name, loss in losses.items():
                            train_loss += loss.mean()
                        train_loss.backward()
                        return train_loss
                    optim.step(closure)

                # Forward again (after the LBFGS step, if any) for logging.
                model_output = model(model_input)
                losses = loss_fn(model_output, gt)

                train_loss = 0.
                for loss_name, loss in losses.items():
                    single_loss = loss.mean()
                    if loss_schedules is not None and loss_name in loss_schedules:
                        writer.add_scalar(
                            loss_name + "_weight",
                            loss_schedules[loss_name](total_steps), total_steps)
                        single_loss *= loss_schedules[loss_name](total_steps)
                    writer.add_scalar(loss_name, single_loss, total_steps)
                    train_loss += single_loss

                train_losses.append(train_loss.item())
                writer.add_scalar("total_train_loss", train_loss, total_steps)

                if not total_steps % steps_til_summary:
                    torch.save(
                        model.state_dict(),
                        os.path.join(checkpoints_dir, 'model_current.pth'))
                    # NOTE(review): per-step summary writing is deliberately
                    # disabled in this variant:
                    # summary_fn(model, model_input, gt, model_output, writer, total_steps)

                if not use_lbfgs:
                    optim.zero_grad()
                    train_loss.backward()
                    if clip_grad:
                        if isinstance(clip_grad, bool):
                            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                           max_norm=1.)
                        else:
                            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                           max_norm=clip_grad)
                    optim.step()

                pbar.update(1)

                if not total_steps % steps_til_summary:
                    tqdm.write(
                        "Epoch %d, Total loss %0.6f, iteration time %0.6f"
                        % (epoch, train_loss, time.time() - start_time))

                    if val_dataloader is not None:
                        print("Running validation set...")
                        model.eval()
                        with torch.no_grad():
                            val_losses = []
                            for (model_input, gt) in val_dataloader:
                                # BUGFIX: move validation batches to the
                                # model's device (the loader yields CPU
                                # tensors).
                                if torch.cuda.is_available():
                                    model_input = {
                                        key: value.cuda()
                                        for key, value in model_input.items()
                                    }
                                    gt = {key: value.cuda()
                                          for key, value in gt.items()}
                                model_output = model(model_input)
                                val_loss = loss_fn(model_output, gt)
                                # BUGFIX: loss_fn returns a dict; reduce it to
                                # one float so np.mean() below works.
                                val_losses.append(
                                    sum(l.mean().item()
                                        for l in val_loss.values()))
                            writer.add_scalar("val_loss", np.mean(val_losses),
                                              total_steps)
                        model.train()

                total_steps += 1

    torch.save(model.state_dict(),
               os.path.join(checkpoints_dir, 'model_final.pth'))
    np.savetxt(os.path.join(checkpoints_dir, 'train_losses_final.txt'),
               np.array(train_losses))
def _plot_reward(episode, avg, std, title, out_path):
    # One reward curve with a +/- 1 std-dev band; figure is closed after
    # saving so repeated evaluations don't accumulate open figures.
    plt.figure()
    plt.plot(episode, avg)
    reward_upper = avg + std
    reward_lower = avg - std
    plt.fill_between(episode, reward_lower, reward_upper, color='grey',
                     alpha=.2, label=r'$\pm$ 1 std. dev.')
    plt.xlabel('Evaluation #')
    plt.ylabel('Reward')
    plt.title(title)
    plt.savefig(out_path)
    plt.close()


def _plot_wins(episode, num_wins, title, out_path):
    # Win-percentage curve, normalized by the number of evaluation episodes.
    plt.figure()
    plt.plot(episode, 100 * num_wins / env.config.EVAL_EPISODE)
    plt.xlabel('Evaluation #')
    plt.ylabel('Win (%)')
    plt.title(title)
    plt.savefig(out_path)
    plt.close()


def plot(stats_list):
    """Plot per-player/team average rewards (with std-dev bands) and win
    percentages over successive evaluations into ./plots_2_agent/.

    Args:
        stats_list: sequence of 9-tuples per evaluation:
            (avg_r1, std_r1, avg_r2, std_r2, avg_team, std_team,
             wins_1, wins_2, wins_team).
    """
    avg_reward_1 = np.array([stats[0] for stats in stats_list])
    std_reward_1 = np.array([stats[1] for stats in stats_list])
    avg_reward_2 = np.array([stats[2] for stats in stats_list])
    std_reward_2 = np.array([stats[3] for stats in stats_list])
    avg_reward_team = np.array([stats[4] for stats in stats_list])
    std_reward_team = np.array([stats[5] for stats in stats_list])
    num_wins_1 = np.array([stats[6] for stats in stats_list])
    num_wins_2 = np.array([stats[7] for stats in stats_list])
    num_wins_team = np.array([stats[8] for stats in stats_list])

    episode = np.arange(1, len(avg_reward_1) + 1)
    utils.cond_mkdir('./plots_2_agent/')

    # Deduplicated via helpers; titles and output paths are unchanged.
    _plot_reward(episode, avg_reward_1, std_reward_1,
                 'Average Reward of Player 1', './plots_2_agent/reward_1')
    _plot_reward(episode, avg_reward_2, std_reward_2,
                 'Average Reward of Player 2', './plots_2_agent/reward_2')
    _plot_reward(episode, avg_reward_team, std_reward_team,
                 'Average Reward of Team', './plots_2_agent/reward_team')

    _plot_wins(episode, num_wins_1, 'Win % of Player 1',
               './plots_2_agent/wins_1')
    _plot_wins(episode, num_wins_2, 'Win % of Player 2',
               './plots_2_agent/wins_2')
    _plot_wins(episode, num_wins_team, 'Win % of Team',
               './plots_2_agent/wins_team')
memory.push( torch.tensor(state, device=device).unsqueeze(0), torch.tensor(action, device=device), torch.tensor(next_state, device=device).unsqueeze(0), torch.tensor(reward, device=device)) # print('THIS HAPPENS') # Perform one step of the optimization (on the target network) optimize_model(input_stack, env) if done: break env.render() # Update the target network, copying all weights and biases in Tron_DQN if e % env.config.TARGET_UPDATE_FREQUENCY == 0: target_net.load_state_dict(policy_net.state_dict()) if e % env.config.MODEL_EVAL_FREQUENCY == 0: stats_list.append(evaluate(policy_net)) plot(stats_list) if e % env.config.MODEL_SAVE_FREQUENCY == 0: print('Saving model') utils.cond_mkdir('./models/') torch.save(policy_net, os.path.join('./models/', 'episode_%d.pth' % (e))) print('Complete') env.render() plot(stats_list) # env.close()
def main():
    """Set up the adaptive multiscale occupancy experiment and launch
    training -- or, with --export, export a mesh from a loaded checkpoint."""
    log_dir = os.path.join(opt.logging_root, opt.experiment_name)
    utils.cond_mkdir(log_dir)

    # Point cloud wrapped into adaptive multiscale 3D octant blocks.
    point_cloud = dataio.OccupancyDataset(opt.pc_filepath)
    block_dataset = dataio.Block3DWrapperMultiscaleAdaptive(
        point_cloud,
        sidelength=opt.res,
        octant_size=opt.octant_size,
        jitter=True,
        max_octants=opt.max_octants,
        num_workers=opt.num_workers,
        length=opt.steps_til_tiling,
        scale_init=opt.scale_init)

    # 3 spatial coordinates + 1 scale input; single occupancy output.
    model = modules.ImplicitAdaptiveOctantNet(
        in_features=3 + 1,
        out_features=1,
        num_hidden_layers=opt.hidden_layers,
        hidden_features=opt.hidden_features,
        feature_grid_size=feature_grid_size,
        octant_size=opt.octant_size)
    model.cuda()

    # Resume state is empty unless a checkpoint was requested.
    resume_checkpoint = (load_from_checkpoint(opt.load, model, block_dataset)
                         if opt.load is not None else {})

    if opt.export:
        assert opt.load is not None, 'Need to specify which model to export with --load'
        export_mesh(model, block_dataset, opt.upsample, opt.mc_threshold)
        return

    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"\n\nTrainable Parameters: {trainable}\n\n")

    loader = DataLoader(block_dataset,
                        shuffle=False,
                        batch_size=1,
                        pin_memory=True,
                        num_workers=opt.num_workers)

    # Loss / summary / pruning callbacks, all bound to this dataset.
    loss_fn = partial(loss_functions.occupancy_bce,
                      tiling_every=opt.steps_til_tiling,
                      dataset=block_dataset)
    summary_fn = partial(utils.write_occupancy_multiscale_summary,
                         (opt.res, opt.res, opt.res),
                         block_dataset,
                         output_mrc=f'{opt.experiment_name}.mrc',
                         skip=opt.skip_logging)
    pruning_fn = partial(pruning_functions.pruning_occupancy,
                         threshold=opt.pruning_threshold)

    # Persist the run configuration and a text dump of the model.
    p.write_config_file(opt, [os.path.join(log_dir, 'config.ini')])
    with open(os.path.join(log_dir, "model.txt"), "w") as out_file:
        out_file.write(str(model))

    objs_to_save = {'octtree': block_dataset.octtree}

    training.train(model=model,
                   train_dataloader=loader,
                   epochs=opt.num_epochs,
                   lr=opt.lr,
                   steps_til_summary=opt.steps_til_summary,
                   epochs_til_checkpoint=opt.epochs_til_ckpt,
                   model_dir=log_dir,
                   loss_fn=loss_fn,
                   summary_fn=summary_fn,
                   objs_to_save=objs_to_save,
                   pruning_fn=pruning_fn,
                   epochs_til_pruning=opt.epochs_til_pruning,
                   resume_checkpoint=resume_checkpoint)
def train(model, train_dataloader, epochs, lr, steps_til_summary,
          epochs_til_checkpoint, model_dir, loss_fn, summary_fn,
          prefix_model_dir='', val_dataloader=None, double_precision=False,
          clip_grad=False, use_lbfgs=False, loss_schedules=None, params=None):
    """Generic training loop (amsgrad Adam variant).

    Args mirror the sibling train() variants; additionally `params` optionally
    restricts optimization to a parameter subset, and `prefix_model_dir` nests
    summaries/checkpoints one level below `model_dir`. Non-tensor entries of
    model_input are passed through to the model untouched.
    """
    if params is None:
        optim = torch.optim.Adam(lr=lr, params=model.parameters(),
                                 amsgrad=True)
    else:
        optim = torch.optim.Adam(lr=lr, params=params, amsgrad=True)

    if use_lbfgs:
        optim = torch.optim.LBFGS(lr=lr, params=model.parameters(),
                                  max_iter=50000, max_eval=50000,
                                  history_size=50,
                                  line_search_fn='strong_wolfe')

    # IDIOM: replaces the original `if exists: pass / else: makedirs`.
    os.makedirs(model_dir, exist_ok=True)

    model_dir_postfixed = os.path.join(model_dir, prefix_model_dir)

    summaries_dir = os.path.join(model_dir_postfixed, 'summaries')
    utils.cond_mkdir(summaries_dir)
    checkpoints_dir = os.path.join(model_dir_postfixed, 'checkpoints')
    utils.cond_mkdir(checkpoints_dir)

    writer = SummaryWriter(summaries_dir)

    total_steps = 0
    with tqdm(total=len(train_dataloader) * epochs) as pbar:
        train_losses = []
        for epoch in range(epochs):
            if not epoch % epochs_til_checkpoint and epoch:
                torch.save(
                    model.state_dict(),
                    os.path.join(checkpoints_dir,
                                 'model_epoch_%04d.pth' % epoch))
                np.savetxt(
                    os.path.join(checkpoints_dir,
                                 'train_losses_epoch_%04d.txt' % epoch),
                    np.array(train_losses))

            for step, (model_input, gt) in enumerate(train_dataloader):
                start_time = time.time()

                # Move tensor inputs to the GPU; keep non-tensor entries as-is.
                tmp = {}
                for key, value in model_input.items():
                    if isinstance(value, torch.Tensor):
                        tmp.update({key: value.cuda()})
                    else:
                        tmp.update({key: value})
                model_input = tmp
                gt = {key: value.cuda() for key, value in gt.items()}

                if double_precision:
                    model_input = {
                        key: value.double()
                        for key, value in model_input.items()
                    }
                    gt = {key: value.double() for key, value in gt.items()}

                if use_lbfgs:
                    def closure():
                        optim.zero_grad()
                        model_output = model(model_input)
                        losses = loss_fn(model_output, gt)
                        train_loss = 0.
                        for loss_name, loss in losses.items():
                            train_loss += loss.mean()
                        train_loss.backward()
                        return train_loss
                    optim.step(closure)

                model_output = model(model_input)
                losses = loss_fn(model_output, gt)

                train_loss = 0.
                for loss_name, loss in losses.items():
                    single_loss = loss.mean()
                    if loss_schedules is not None and loss_name in loss_schedules:
                        # Log and apply the step-dependent loss weight.
                        writer.add_scalar(
                            loss_name + "_weight",
                            loss_schedules[loss_name](total_steps), total_steps)
                        single_loss *= loss_schedules[loss_name](total_steps)
                    writer.add_scalar(loss_name, single_loss, total_steps)
                    train_loss += single_loss

                train_losses.append(train_loss.item())
                writer.add_scalar("total_train_loss", train_loss, total_steps)

                if not total_steps % steps_til_summary:
                    torch.save(
                        model.state_dict(),
                        os.path.join(checkpoints_dir, 'model_current.pth'))
                    summary_fn(model, model_input, gt, model_output, writer,
                               total_steps)

                if not use_lbfgs:
                    optim.zero_grad()
                    train_loss.backward()
                    if clip_grad:
                        if isinstance(clip_grad, bool):
                            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                           max_norm=1.)
                        else:
                            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                           max_norm=clip_grad)
                    optim.step()

                pbar.update(1)

                if not total_steps % steps_til_summary:
                    # summary_fn(model_input, gt, model_output, writer, total_steps)
                    tqdm.write(
                        "Epoch %d, Total loss %0.6f, iteration time %0.6f"
                        % (epoch, train_loss, time.time() - start_time))

                    if val_dataloader is not None:
                        print("Running validation set...")
                        model.eval()
                        with torch.no_grad():
                            val_losses = []
                            for (model_input, gt) in val_dataloader:
                                # BUGFIX: move validation batches to the GPU
                                # (the loader yields CPU tensors), mirroring
                                # the tensor-only transfer above.
                                model_input = {
                                    key: (value.cuda()
                                          if isinstance(value, torch.Tensor)
                                          else value)
                                    for key, value in model_input.items()
                                }
                                gt = {key: value.cuda()
                                      for key, value in gt.items()}
                                model_output = model(model_input)
                                val_loss = loss_fn(model_output, gt)
                                # BUGFIX: loss_fn returns a dict; reduce it to
                                # one float so np.mean() below works.
                                val_losses.append(
                                    sum(l.mean().item()
                                        for l in val_loss.values()))
                            writer.add_scalar("val_loss", np.mean(val_losses),
                                              total_steps)
                        model.train()

                total_steps += 1

    torch.save(model.state_dict(),
               os.path.join(checkpoints_dir, 'model_final.pth'))
    np.savetxt(os.path.join(checkpoints_dir, 'train_losses_final.txt'),
               np.array(train_losses))
def train(validation=True):
    """NeRF-style training entry point.

    Builds the train (and optionally validation) multiview datasets, the
    sigma and rgb radiance networks -- optionally resuming both models and
    their optimizer states from `opt.resume` -- then hands off to
    training.train_wchunks. Reads configuration from the module-level `opt`.
    """
    root_path = os.path.join(opt.logging_root, opt.experiment_name)
    utils.cond_mkdir(root_path)

    ''' Training dataset '''
    if opt.dataset == 'deepvoxels':
        dataset = dataio.DeepVoxelDataset(opt.dv_dataset_path,
                                          mode='train',
                                          resize_to=2 * (opt.img_size, ))
        use_ndc = False
    elif opt.dataset == 'llff':
        dataset = dataio.LLFFDataset(opt.llff_dataset_path, mode='train')
        # NOTE(review): presumably LLFF scenes are forward-facing, hence
        # normalized device coordinates -- confirm against the ray sampler.
        use_ndc = True
    elif opt.dataset == 'blender':
        dataset = dataio.NerfBlenderDataset(
            opt.nerf_dataset_path,
            splits=['train'
                    ],  # which split to load: either 'train', 'val', 'test'
            mode='train',  # which split to train on (must be in splits)
            resize_to=2 * (opt.img_size, ),
            ref_rot=None,
            d_rot=None)
        use_ndc = False
    else:
        raise NotImplementedError('dataset not implemented')

    # Wrap the image dataset into a 6D (ray sample + orientation) coordinate
    # dataset.
    coords_dataset = dataio.Implicit6DMultiviewDataWrapper(
        dataset,
        dataset.get_img_shape(),
        dataset.get_camera_params(),
        samples_per_ray=opt.samples_per_ray,
        samples_per_view=opt.samples_per_view,
        use_ndc=use_ndc)

    ''' Validation dataset '''
    if validation:
        if opt.dataset == 'deepvoxels':
            val_dataset = dataio.DeepVoxelDataset(opt.dv_dataset_path,
                                                  mode='val',
                                                  idcs=dataset.val_idcs,
                                                  resize_to=2 * (opt.img_size, ))
        elif opt.dataset == 'llff':
            val_dataset = dataio.LLFFDataset(opt.llff_dataset_path, mode='val')
        elif opt.dataset == 'blender':
            val_dataset = dataio.NerfBlenderDataset(
                opt.nerf_dataset_path,
                splits=[
                    'val'
                ],  # which split to load: either 'train', 'val', 'test'
                mode='val',  # which split to train on (must be in splits)
                resize_to=2 * (opt.img_size, ),
                ref_rot=None,
                d_rot=None)

        # Validation renders full images: samples_per_view covers every pixel.
        val_coords_dataset = dataio.Implicit6DMultiviewDataWrapper(
            val_dataset,
            val_dataset.get_img_shape(),
            val_dataset.get_camera_params(),
            samples_per_ray=opt.samples_per_ray,
            samples_per_view=np.prod(val_dataset.get_img_shape()[:2]),
            num_workers=opt.num_workers,
            sobol_ray_sampling=opt.use_sobol_ray_sampling,
            use_ndc=use_ndc)

    ''' Dataloaders'''
    dataloader = DataLoader(
        coords_dataset,
        shuffle=True,
        batch_size=opt.batch_size,  # num of views in a batch
        pin_memory=True,
        num_workers=opt.num_workers)

    if validation:
        val_dataloader = DataLoader(val_coords_dataset,
                                    shuffle=True,
                                    batch_size=1,
                                    pin_memory=True,
                                    num_workers=opt.num_workers)
    else:
        val_dataloader = None

    # get model paths
    if opt.resume is not None:
        path, epoch = opt.resume
        epoch = int(epoch)
        assert (os.path.isdir(path))
        assert opt.config is not None, 'Specify config file'

    # Optional learned sampling of depths along each ray.
    if opt.use_sampler:
        cam_params = dataset.get_camera_params()
        sampler = modules.SamplingNet(Nt=opt.samples_per_ray,
                                      ncuts=opt.num_cuts,
                                      sampling_interval=(cam_params['near'],
                                                         cam_params['far']))
    else:
        sampler = None

    # Positional-encoding frequency counts per input.
    add_pe_ray_samples = 10  # 10 cos + sin
    add_pe_orientations = 4  # 4 cos + sin
    nl_types = opt.activation

    # Density (sigma) network: 1 output channel.
    model_sigma = modules.RadianceNet(
        out_features=1,
        hidden_layers=opt.hidden_layers,
        hidden_features=opt.hidden_features,
        nl=nl_types,
        use_grad=opt.use_grad,
        input_name=['ray_samples', 'ray_orientations'],
        input_processing_fn=modules.input_processing_fn,
        input_pe_params={
            'ray_samples': add_pe_ray_samples,
            'ray_orientations': add_pe_orientations
        },
        sampler=sampler,
        normalize_pe=opt.normalize_pe)
    model_sigma.cuda()

    # Color (rgb) network: 3 output channels, otherwise identical config.
    model_rgb = modules.RadianceNet(
        out_features=3,
        hidden_layers=opt.hidden_layers,
        hidden_features=opt.hidden_features,
        nl=nl_types,
        use_grad=opt.use_grad,
        input_name=['ray_samples', 'ray_orientations'],
        input_processing_fn=modules.input_processing_fn,
        input_pe_params={
            'ray_samples': add_pe_ray_samples,
            'ray_orientations': add_pe_orientations
        },
        sampler=sampler,
        normalize_pe=opt.normalize_pe)
    model_rgb.cuda()

    if opt.resume is not None:
        # epoch <= 0 selects the rolling 'current' checkpoints instead of an
        # epoch-numbered pair.
        if (epoch > 0):
            model_path_sigma = path + '/checkpoints/' + f'model_sigma_epoch_{epoch:04d}.pth'
            model_path_rgb = path + '/checkpoints/' + f'model_rgb_epoch_{epoch:04d}.pth'
        else:
            model_path_sigma = path + '/checkpoints/' + 'model_sigma_current.pth'
            model_path_rgb = path + '/checkpoints/' + 'model_rgb_current.pth'

        print('Loading checkpoints')
        ckpt_dict = torch.load(model_path_sigma)
        state_dict = translate_saved_weights(ckpt_dict, model_sigma)
        model_sigma.load_state_dict(state_dict, strict=True)
        ckpt_dict = torch.load(model_path_rgb)
        state_dict = translate_saved_weights(ckpt_dict, model_rgb)
        model_rgb.load_state_dict(state_dict, strict=True)

        # load optimizers
        try:
            if (epoch > 0):
                optim_path_sigma = path + '/checkpoints/' + f'optim_sigma_epoch_{epoch:04d}.pth'
                optim_path_rgb = path + '/checkpoints/' + f'optim_rgb_epoch_{epoch:04d}.pth'
            else:
                optim_path_sigma = path + '/checkpoints/' + 'optim_sigma_current.pth'
                optim_path_rgb = path + '/checkpoints/' + 'optim_rgb_current.pth'

            resume_checkpoint = {}
            sigma_ckpt = torch.load(optim_path_sigma)
            # Override the stored learning rate with the current CLI value.
            for g in sigma_ckpt['optimizer_state_dict']['param_groups']:
                g['lr'] = opt.lr
            resume_checkpoint['sigma'] = sigma_ckpt['optimizer_state_dict']
            rgb_ckpt = torch.load(optim_path_rgb)
            for g in rgb_ckpt['optimizer_state_dict']['param_groups']:
                g['lr'] = opt.lr
            resume_checkpoint['rgb'] = rgb_ckpt['optimizer_state_dict']
            resume_checkpoint['total_steps'] = rgb_ckpt['total_steps']
            resume_checkpoint['epoch'] = rgb_ckpt['epoch']
        except FileNotFoundError:
            # Model weights loaded but optimizer state missing: training
            # resumes with fresh optimizers.
            print('Unable to load optimizer checkpoints')
    else:
        resume_checkpoint = {}

    models = {'sigma': model_sigma, 'rgb': model_rgb}

    # Define the loss
    loss_fn = partial(loss_functions.tomo_radiance_sigma_rgb_loss,
                      use_piecewise_model=opt.use_piecewise_model,
                      num_cuts=opt.num_cuts)
    summary_fn = partial(utils.write_tomo_radiance_summary,
                         chunk_size_eval=opt.chunk_size_eval,
                         num_views_to_disp_at_training=1,
                         use_piecewise_model=opt.use_piecewise_model,
                         num_cuts=opt.num_cuts,
                         use_coarse_fine=False)
    chunk_lists_from_batch_fn = dataio.chunk_lists_from_batch_reduce_to_raysamples_fn

    # Save command-line parameters log directory.
    p.write_config_file(opt, [os.path.join(root_path, 'config.ini')])
    with open(os.path.join(root_path, "params.txt"), "w") as out_file:
        out_file.write('\n'.join(
            ["%s: %s" % (key, value) for key, value in vars(opt).items()]))

    # Save text summary of model into log directory.
    with open(os.path.join(root_path, "model.txt"), "w") as out_file:
        for model_name, model in models.items():
            out_file.write(model_name)
            out_file.write(str(model))

    training.train_wchunks(models=models,
                           train_dataloader=dataloader,
                           epochs=opt.num_epochs,
                           lr=opt.lr,
                           steps_til_summary=opt.steps_til_summary,
                           epochs_til_checkpoint=opt.epochs_til_ckpt,
                           model_dir=root_path,
                           loss_fn=loss_fn,
                           summary_fn=summary_fn,
                           val_dataloader=val_dataloader,
                           chunk_lists_from_batch_fn=chunk_lists_from_batch_fn,
                           max_chunk_size=opt.chunk_size_train,
                           num_cuts=opt.num_cuts,
                           clip_grad=opt.clip_grad,
                           resume_checkpoint=resume_checkpoint)
# NOTE(review): fragment -- the `p.add_argument(...` call this help string
# belongs to begins above this chunk.
help='Options are "mlp" or "nerf"')
p.add_argument('--resolution', type=int, default=1600)

opt = p.parse_args()


class SDFDecoder(torch.nn.Module):
    """Wraps a trained SingleBVPNet checkpoint (from opt.checkpoint_path) as
    a simple coords -> SDF decoder for marching-cubes meshing."""

    def __init__(self):
        super().__init__()
        # Define the model.
        if opt.mode == 'mlp':
            self.model = modules.SingleBVPNet(type=opt.model_type,
                                              final_layer_factor=1,
                                              in_features=3)
        elif opt.mode == 'nerf':
            self.model = modules.SingleBVPNet(type='relu',
                                              mode='nerf',
                                              final_layer_factor=1,
                                              in_features=3)
        #print(datetime.datetime.now())
        self.model.load_state_dict(torch.load(opt.checkpoint_path))
        #print(datetime.datetime.now())
        self.model.cuda()

    def forward(self, coords):
        # Return only the decoded SDF values for the queried coordinates.
        model_in = {'coords': coords}
        return self.model(model_in)['model_out']


sdf_decoder = SDFDecoder()

root_path = os.path.join(opt.logging_root, opt.experiment_name)
utils.cond_mkdir(root_path)

# Run marching cubes at the requested resolution and write the mesh under
# <root_path>/test.
sdf_meshing.create_mesh(sdf_decoder,
                        os.path.join(root_path, 'test'),
                        N=opt.resolution)
def train(model, train_dataloader, epochs, lr, steps_til_summary,
          epochs_til_checkpoint, model_dir, loss_schedules=None, is_train=True,
          **kwargs):
    """Train a latent-code-conditioned SDF model, or -- with is_train=False --
    optimize a single latent embedding against a frozen trained model.

    kwargs must include 'point_cloud_path', 'num_instances' and 'batch_size'
    (printed below); the full kwargs dict is forwarded to the model's forward
    pass during training.
    """
    print('Training Info:')
    print('data_path:\t\t', kwargs['point_cloud_path'])
    print('num_instances:\t\t', kwargs['num_instances'])
    print('batch_size:\t\t', kwargs['batch_size'])
    print('epochs:\t\t\t', epochs)
    print('learning rate:\t\t', lr)
    for key in kwargs:
        if 'loss' in key:
            print(key + ':\t', kwargs[key])

    if is_train:
        optim = torch.optim.Adam(lr=lr, params=model.parameters())
    else:
        # Evaluation stage: freeze the network and optimize only a latent
        # code, initialized from the embedding of index 0.
        embedding = model.latent_codes(torch.zeros(1).long().cuda()).clone(
        ).detach()  # initialization for evaluation stage
        embedding.requires_grad = True
        optim = torch.optim.Adam(lr=lr, params=[embedding])

    if not os.path.isdir(model_dir):
        os.makedirs(model_dir)

    summaries_dir = os.path.join(model_dir, 'summaries')
    utils.cond_mkdir(summaries_dir)
    checkpoints_dir = os.path.join(model_dir, 'checkpoints')
    utils.cond_mkdir(checkpoints_dir)

    writer = SummaryWriter(summaries_dir)

    total_steps = 0
    with tqdm(total=len(train_dataloader) * epochs) as pbar:
        train_losses = []
        for epoch in range(epochs):
            # Periodic checkpoint: model weights when training, the optimized
            # embedding when evaluating.
            if not epoch % epochs_til_checkpoint and epoch:
                if is_train:
                    # NOTE(review): `model.module` implies the model is wrapped
                    # in DataParallel/DistributedDataParallel -- confirm.
                    torch.save(
                        model.module.state_dict(),
                        os.path.join(checkpoints_dir,
                                     'model_epoch_%04d.pth' % epoch))
                else:
                    embed_save = embedding.detach().squeeze().cpu().numpy()
                    np.savetxt(
                        os.path.join(checkpoints_dir,
                                     'embedding_epoch_%04d.txt' % epoch),
                        embed_save)
                np.savetxt(
                    os.path.join(checkpoints_dir,
                                 'train_losses_epoch_%04d.txt' % epoch),
                    np.array(train_losses))

            for step, (model_input, gt) in enumerate(train_dataloader):
                start_time = time.time()
                model_input = {
                    key: value.cuda()
                    for key, value in model_input.items()
                }
                gt = {key: value.cuda() for key, value in gt.items()}

                if is_train:
                    # The model computes its own loss dict during training.
                    losses = model(model_input, gt, **kwargs)
                else:
                    # Only the embedding receives gradients here.
                    losses = model.embedding(embedding, model_input, gt)

                train_loss = 0.
                for loss_name, loss in losses.items():
                    single_loss = loss.mean()
                    if loss_schedules is not None and loss_name in loss_schedules:
                        # Log and apply the step-dependent loss weight.
                        writer.add_scalar(
                            loss_name + "_weight",
                            loss_schedules[loss_name](total_steps), total_steps)
                        single_loss *= loss_schedules[loss_name](total_steps)
                    writer.add_scalar(loss_name, single_loss, total_steps)
                    train_loss += single_loss

                train_losses.append(train_loss.item())
                writer.add_scalar("total_train_loss", train_loss, total_steps)

                if not total_steps % steps_til_summary:
                    if is_train:
                        torch.save(
                            model.module.state_dict(),
                            os.path.join(checkpoints_dir, 'model_current.pth'))

                optim.zero_grad()
                train_loss.backward()
                optim.step()
                pbar.update(1)

                if not total_steps % steps_til_summary:
                    tqdm.write(
                        "Epoch %d, Total loss %0.6f, iteration time %0.6f"
                        % (epoch, train_loss, time.time() - start_time))

                total_steps += 1

    if is_train:
        # NOTE(review): .cpu() here moves the live model to the CPU as a side
        # effect of the final save.
        torch.save(model.module.cpu().state_dict(),
                   os.path.join(checkpoints_dir, 'model_final.pth'))
    else:
        embed_save = embedding.detach().squeeze().cpu().numpy()
        # NOTE(review): the final embedding is saved under the last epoch's
        # filename rather than a '_final' name -- confirm this is intended.
        np.savetxt(
            os.path.join(checkpoints_dir,
                         'embedding_epoch_%04d.txt' % epoch), embed_save)
        # Mesh the optimized embedding via marching cubes at the zero level set.
        sdf_meshing.create_mesh(model,
                                os.path.join(checkpoints_dir, 'test'),
                                embedding=embedding,
                                N=256,
                                level=0,
                                get_color=False)

    np.savetxt(os.path.join(checkpoints_dir, 'train_losses_final.txt'),
               np.array(train_losses))