def train_and_evaluate(model, train_data, val_data, optimizer, loss_fn, metrics,
                       params, model_dir, restore_file=None):
    """Train the model and evaluate every epoch.

    Args:
        model: (torch.nn.Module) the neural network
        train_data: (dict) training data with keys 'data' and 'labels'
        val_data: (dict) validation data with keys 'data' and 'labels'
        optimizer: (torch.optim) optimizer for parameters of model
        loss_fn: a function that takes batch_output and batch_labels and
            computes the loss for the batch
        metrics: (dict) a dictionary of functions that compute a metric using
            the output and labels of each batch
        params: (Params) hyperparameters
        model_dir: (string) directory containing config, weights and log
        restore_file: (string) optional - name of file to restore from
            (without its extension .pth.tar)
    """
    # reload weights from restore_file if specified
    if restore_file is not None:
        restore_path = os.path.join(model_dir, restore_file + '.pth.tar')
        logging.info("Restoring parameters from {}".format(restore_path))
        utils.load_checkpoint(restore_path, model, optimizer)

    best_val_acc = 0.0

    for epoch in range(params.num_epochs):
        # Run one epoch
        logging.info("Epoch {}/{}".format(epoch + 1, params.num_epochs))

        # compute number of batches in one epoch (one full pass over the training set)
        num_steps = (params.train_size + 1) // params.batch_size
        train_data_iterator = data_loader.data_iterator(train_data, params, shuffle=True)
        train(model, optimizer, loss_fn, train_data_iterator, metrics, params, num_steps)

        # Evaluate for one epoch on the validation set
        num_steps = (params.val_size + 1) // params.batch_size
        val_data_iterator = data_loader.data_iterator(val_data, params, shuffle=False)
        val_metrics = evaluate(model, loss_fn, val_data_iterator, metrics, params, num_steps)

        val_acc = val_metrics['accuracy']
        is_best = val_acc >= best_val_acc

        # Save weights
        utils.save_checkpoint({'epoch': epoch + 1,
                               'state_dict': model.state_dict(),
                               'optim_dict': optimizer.state_dict()},
                              is_best=is_best,
                              checkpoint=model_dir)

        # If this is the best model so far, also save its metrics separately
        if is_best:
            logging.info("- Found new best accuracy")
            best_val_acc = val_acc

            # Save best val metrics in a json file in the model directory
            best_json_path = os.path.join(model_dir, "metrics_val_best_weights.json")
            utils.save_dict_to_json(val_metrics, best_json_path)

        # Save latest val metrics in a json file in the model directory
        last_json_path = os.path.join(model_dir, "metrics_val_last_weights.json")
        utils.save_dict_to_json(val_metrics, last_json_path)
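# --- Illustrative wiring sketch (assumption, not part of the original script) ---
# Shows how train_and_evaluate above might be driven from a training entry
# point.  `utils.Params`, `net.Net`, `net.loss_fn`, `net.metrics`, `DataLoader`
# and the `args` namespace are assumed to follow the interfaces used in the
# evaluation snippet below; `params.learning_rate` is a hypothetical field.
if __name__ == '__main__':
    params = utils.Params(os.path.join(args.model_dir, 'params.json'))
    params.cuda = torch.cuda.is_available()

    # Load the train/val splits and record their sizes for num_steps
    data_loader = DataLoader(args.data_dir, params)
    data = data_loader.load_data(['train', 'val'], args.data_dir)
    train_data, val_data = data['train'], data['val']
    params.train_size = train_data['size']
    params.val_size = val_data['size']

    # Model, optimizer, and the training/evaluation loop defined above
    model = net.Net(params).cuda() if params.cuda else net.Net(params)
    optimizer = optim.Adam(model.parameters(), lr=params.learning_rate)
    train_and_evaluate(model, train_data, val_data, optimizer, net.loss_fn,
                       net.metrics, params, args.model_dir, args.restore_file)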
# load data
data_loader = DataLoader(args.data_dir, params)
data = data_loader.load_data(['test'], args.data_dir)
test_data = data['test']

# specify the test set size
params.test_size = test_data['size']
test_data_iterator = data_loader.data_iterator(test_data, params)

logging.info("- done.")

# Define the model
model = net.Net(params).cuda() if params.cuda else net.Net(params)

loss_fn = net.loss_fn
metrics = net.metrics

logging.info("Starting evaluation")

# Reload weights from the saved file
utils.load_checkpoint(
    os.path.join(args.model_dir, args.restore_file + '.pth.tar'), model)

# Evaluate
num_steps = (params.test_size + 1) // params.batch_size
test_metrics = evaluate(model, loss_fn, test_data_iterator, metrics, params, num_steps)
save_path = os.path.join(args.model_dir,
                         "metrics_test_{}.json".format(args.restore_file))
utils.save_dict_to_json(test_metrics, save_path)
def main(*args, **kwargs):
    # ---------------------------------
    # Loading the config
    # ---------------------------------
    config_module = importlib.import_module('configs.' + sys.argv[1])
    args = config_module.args
    print(args)

    # ---------------------------------
    # General settings
    # ---------------------------------
    device = 'cuda'
    torch.manual_seed(args.rng_seed)
    torch.cuda.manual_seed(args.rng_seed)
    torch.cuda.manual_seed_all(args.rng_seed)
    np.random.seed(args.rng_seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    assert args.train_type in ['baseline', 'finetune']
    assert args.save_opt in ['best', 'last']

    # ---------------------------------
    # Dataset settings
    # ---------------------------------
    image_size = args.image_size
    batch_size = args.batch_size
    padding = args.padding
    transform_name = args.transform_name

    # ---------------------------------
    # Optimizer and Scheduler settings
    # ---------------------------------
    param_types = args.param_types
    max_epoch = args.max_epoch
    optimizer_infos = args.optimizer_infos
    scheduler_infos = args.scheduler_infos

    # ---------------------------------
    # Backbone settings
    # ---------------------------------
    backbone_info = build_backbone_info(args.backbone, 'standard', image_size)

    # ---------------------------------
    # Method settings
    # ---------------------------------
    experiment_dir = 'CHECKPOINTS/Individual/{}/{}/{}'.format(
        args.exp_name, args.backbone, args.dataset)

    if args.pretrain != '':
        assert args.train_type != 'baseline', 'Cannot use pretrain in baseline train_type'
        print('Load from the pretrained model!')
        model, _ = load_checkpoint(args.pretrain)
    else:
        assert args.train_type != 'finetune', 'Cannot use finetune train_type without pretrain'
        model = ClassificationNet(backbone_info, args.num_classes)

    # ---------------------------------
    # Build the parallel model
    # ---------------------------------
    model = nn.DataParallel(model.to(device))

    # ---------------------------------
    # Run trainval or evaluate
    # ---------------------------------
    # Build the train and validation dataloaders
    train_loader, val_loader = build_imagedataloaders(
        'trainval', os.path.join(args.exp_name, args.dataset), transform_name,
        image_size, batch_size, padding, args.save_opt, args.workers)

    # Get the checkpoint directory name
    inner_chkpt = args.train_type + args.chkpt_postfix
    checkpoint_dir = os.path.join(experiment_dir, inner_chkpt)

    # Get the optimizers and schedulers
    optimizers = build_optimizers(model.module, param_types, optimizer_infos)
    schedulers = build_schedulers(optimizers, scheduler_infos)

    # Run training and validation loops
    run_trainval(model, args.train_type, args.dataset, max_epoch, device,
                 checkpoint_dir, train_loader, val_loader, optimizers,
                 schedulers, args.save_opt)

    return
def main(*args, **kwargs):
    # ---------------------------------
    # Loading the config
    # ---------------------------------
    config_module = importlib.import_module('configs.' + sys.argv[1])
    args = config_module.args
    print(args)

    # ---------------------------------
    # General settings
    # ---------------------------------
    device = 'cuda'
    torch.manual_seed(args.rng_seed)
    torch.cuda.manual_seed(args.rng_seed)
    torch.cuda.manual_seed_all(args.rng_seed)
    np.random.seed(args.rng_seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # ---------------------------------
    # Dataset settings
    # ---------------------------------
    image_size = args.image_size
    batch_size = args.batch_size
    padding = args.padding
    transform_name = args.transform_name

    # ---------------------------------
    # Backbone settings
    # ---------------------------------
    backbone_info = build_backbone_info(args.backbone, 'cond', image_size)

    # ---------------------------------
    # Method settings
    # ---------------------------------
    experiment_dir = 'CHECKPOINTS/Continual/{}/{}'.format(
        args.exp_name, args.backbone)
    output_path = 'CHECKPOINTS/Continual/{}/{}/RESULTS_WITHOUT_BOUNDARY.json'.format(
        args.exp_name, args.backbone)

    # ---------------------------------
    # Run evaluation
    # ---------------------------------
    task_dir = find_task_dir_by_idx(experiment_dir, args.final_task_idx)
    chkpt_dir = os.path.join(experiment_dir, task_dir, 'finetune')
    model, manager = load_checkpoint(chkpt_dir)

    # ---------------------------------
    # Random initialization strategy
    # ---------------------------------
    task_dirs = filter(lambda x: x.split('_', 1)[0][:4] == 'Task',
                       os.listdir(experiment_dir))
    task_dirs = sorted(list(task_dirs), key=lambda x: int(x.split('_', 1)[0][4:]))
    num_tasks = len(task_dirs)
    num_total_classes = args.num_classes * num_tasks
    task_class_ids = np.split(np.arange(num_total_classes), num_tasks)

    for index, task_class_idx in enumerate(task_class_ids):
        manager.load_task_exclusive_params(model, index + 1)
        org_cls_state_dict = model.classifier.state_dict()
        model.build_classification_head(num_total_classes)
        new_cls_state_dict = model.classifier.state_dict()
        for name, org_param in org_cls_state_dict.items():
            new_param = new_cls_state_dict[name]
            cls_loc = torch.from_numpy(task_class_idx).long()
            new_param.index_copy_(0, cls_loc, org_param)
        manager.save_task_exclusive_params(model, index + 1)

    # ---------------------------------
    # Run evaluation without boundary
    # ---------------------------------
    task_accs, rough_accs = [], []
    total_corrects = 0
    total_examples = 0
    for dataset_idx, task_dir in enumerate(task_dirs):
        dataset = task_dir.split('_', 1)[1]
        print('Current Dataset: {}'.format(dataset))
        test_loader = build_imagedataloaders(
            'evaluate', os.path.join(args.exp_name, dataset), transform_name,
            image_size, batch_size, padding, args.save_opt, args.workers)
        test_iter = test_loader()
        num_iters = len(test_loader)

        with torch.no_grad():
            # Inference using all tasks
            task_output_list = []
            task_labels_list = []
            for task_idx in range(1, num_tasks + 1):
                manager.load_task_exclusive_params(model, task_idx)
                model.to(device)
                model.eval()
                output_list = []
                labels_list = []
                for batch_idx, batch_data in enumerate(test_iter):
                    sys.stdout.write('Task {}: {}/{} ..... \r'.format(
                        task_idx, batch_idx + 1, num_iters))
                    sys.stdout.flush()
                    images, labels = batch_data
                    images = images.to(device)
                    labels = labels.to(device) + dataset_idx * args.num_classes
                    output = model(images)
                    output_list.append(output.cpu().numpy())
                    labels_list.append(labels.cpu().numpy())
                task_output_list.append(np.concatenate(output_list, 0))
                task_labels_list.append(np.concatenate(labels_list, 0))
                print()

            # Decide final predictions
            argmax_probs = np.argmax(np.concatenate(task_output_list, 1), 1)
            num_rough = np.sum((argmax_probs // num_total_classes) == dataset_idx)
            predis = argmax_probs % num_total_classes
            labels = task_labels_list[-1]
            num_corrects = np.sum(predis == labels)
            num_examples = labels.shape[0]
            task_accs.append(num_corrects / num_examples)
            rough_accs.append(num_rough / num_examples)
            total_corrects += num_corrects
            total_examples += num_examples

    content = {}
    for index, task_acc in enumerate(task_accs):
        print('Task {} Acc: {:.4f}, ({:.4f})'.format(index + 1, task_acc,
                                                     rough_accs[index]))
    content['Task_Acc'] = [round(x, 2) for x in task_accs]
    content['Rough_Acc'] = [round(x, 2) for x in rough_accs]

    final_acc = total_corrects / total_examples
    print('Final Acc: {:.4f}'.format(final_acc))
    content['Final_Acc'] = round(final_acc, 2)

    with open(output_path, 'w') as f:
        json.dump(content, f)

    return
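# --- Illustrative sketch (assumption, not part of the original script) ---
# Toy example of the boundary-free decision rule used above: per-task logits
# (each over all num_total_classes classes) are concatenated along the class
# axis, a single argmax picks both the task head and the class, and integer
# division / modulo recover the winning task index and the class prediction.
import numpy as np

num_tasks, num_total_classes = 3, 6                       # toy sizes
task_logits = [np.random.randn(4, num_total_classes)     # one block per task head
               for _ in range(num_tasks)]
flat = np.concatenate(task_logits, axis=1)               # shape (4, num_tasks * num_total_classes)
argmax_flat = np.argmax(flat, axis=1)
picked_task = argmax_flat // num_total_classes           # which head won ("rough" accuracy)
picked_class = argmax_flat % num_total_classes           # final class prediction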
random.seed(args.seed)
torch.manual_seed(args.seed)

# Create a directory to store weights
if not os.path.exists(args.output_dir):
    os.makedirs(args.output_dir)

# Create model
if args.model == 'WDSR-B':
    model = WDSR_B(args).to(device)
else:
    model = WDSR_A(args).to(device)
print_information(model, args)
model = load_weights(model, load_checkpoint(args.checkpoint_file)['state_dict'])

# Define loss function and optimizer
criterion = nn.L1Loss()
optimizer = optim.Adam(filter(lambda x: x.requires_grad, model.parameters()), lr=args.lr)

# Prepare dataset
# train_dataset = DIV2K(args, train=True)
# valid_dataset = DIV2K(args, train=False)
train_dataset = SRDataset("train")
valid_dataset = SRDataset("valid")
train_dataloader = DataLoader(dataset=train_dataset,
def main(*args, **kwargs):
    # ---------------------------------
    # Loading the config
    # ---------------------------------
    config_module = importlib.import_module('configs.' + sys.argv[1])
    args = config_module.args
    print(args)

    # ---------------------------------
    # General settings
    # ---------------------------------
    device = 'cuda'
    torch.manual_seed(args.rng_seed)
    torch.cuda.manual_seed(args.rng_seed)
    torch.cuda.manual_seed_all(args.rng_seed)
    np.random.seed(args.rng_seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # ---------------------------------
    # Dataset settings
    # ---------------------------------
    image_size = args.image_size
    batch_size = args.batch_size
    padding = args.padding
    transform_name = args.transform_name

    # ---------------------------------
    # Backbone settings
    # ---------------------------------
    backbone_info = build_backbone_info(args.backbone, 'cond', image_size)

    # ---------------------------------
    # Method settings
    # ---------------------------------
    experiment_dir = 'CHECKPOINTS/Continual/{}/{}'.format(
        args.exp_name, args.backbone)
    output_path = 'CHECKPOINTS/Continual/{}/{}/RESULTS_WITH_BOUNDARY.json'.format(
        args.exp_name, args.backbone)

    # ---------------------------------
    # Run evaluation
    # ---------------------------------
    task_dir = find_task_dir_by_idx(experiment_dir, args.final_task_idx)
    chkpt_dir = os.path.join(experiment_dir, task_dir, 'finetune')
    model, manager = load_checkpoint(chkpt_dir)
    manager.load_task_exclusive_params(model, args.task_idx)
    model = nn.DataParallel(model.to(device))

    test_loader = build_imagedataloaders(
        'evaluate', os.path.join(args.exp_name, args.dataset), transform_name,
        image_size, batch_size, padding, args.save_opt, args.workers)

    val_loss, val_acc = test_epoch(model, device, test_loader, -1)

    if os.path.exists(output_path):
        with open(output_path, 'r') as f:
            content = json.load(f)
    else:
        content = {}
    content['Task{}_{}'.format(args.task_idx, args.dataset)] = round(val_acc, 2)
    with open(output_path, 'w') as f:
        json.dump(content, f)

    return
            t.update(lr.shape[0])

    print('DIV2K (val) PSNR: {:.4f} dB'.format(psnr.avg))


if __name__ == '__main__':
    # Define specific options and parse arguments
    parser.add_argument('--dataset-dir', type=str, required=True,
                        help='DIV2K Dataset Root Directory')
    parser.add_argument('--checkpoint-file', type=str, required=True)
    parser.add_argument('--self-ensemble', action='store_true')
    args = parser.parse_args()

    # Set cuDNN auto-tuner and get device
    cudnn.benchmark = True
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # Create model
    if args.model == 'WDSR-B':
        model = WDSR_B(args).to(device)
    else:
        model = WDSR_A(args).to(device)

    # Load weights
    model = load_weights(model, load_checkpoint(args.checkpoint_file)['state_dict'])

    # Prepare dataset
    dataset = DIV2K(args, train=False)
    dataloader = DataLoader(dataset=dataset, batch_size=1)

    test(dataset, dataloader, model, device, args)
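# --- Illustrative sketch (assumption, not part of the original script) ---
# The `psnr` accumulator referenced in the test() fragment above behaves like
# a running-average meter (`psnr.avg` prints the mean over the validation set).
# A minimal implementation of such a meter could look like this; the original
# repository may use its own helper class instead.
class AverageMeter:
    """Tracks a running sum and average of a scalar metric."""

    def __init__(self):
        self.sum = 0.0
        self.count = 0

    def update(self, value, n=1):
        # `value` is the metric for a batch of `n` samples
        self.sum += value * n
        self.count += n

    @property
    def avg(self):
        return self.sum / max(self.count, 1)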
def main(*args, **kwargs):
    # ---------------------------------
    # Loading the config
    # ---------------------------------
    config_module = importlib.import_module('configs.' + sys.argv[1])
    args = config_module.args
    print(args)

    # ---------------------------------
    # General settings
    # ---------------------------------
    device = 'cuda'
    torch.manual_seed(args.rng_seed)
    torch.cuda.manual_seed(args.rng_seed)
    torch.cuda.manual_seed_all(args.rng_seed)
    np.random.seed(args.rng_seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    assert args.save_opt in ['best', 'last']

    # ---------------------------------
    # Dataset settings
    # ---------------------------------
    image_size = args.image_size
    batch_size = args.batch_size
    padding = args.padding
    transform_name = args.transform_name

    # ---------------------------------
    # Optimizer and Scheduler settings
    # ---------------------------------
    param_types = args.param_types
    max_epoch = args.max_epoch
    optimizer_infos = args.optimizer_infos
    scheduler_infos = args.scheduler_infos

    # ---------------------------------
    # Backbone settings
    # ---------------------------------
    backbone_info = build_backbone_info(args.backbone, 'cond', image_size)

    # ---------------------------------
    # Method settings
    # ---------------------------------
    experiment_dir = 'CHECKPOINTS/Continual/{}/{}'.format(
        args.exp_name, args.backbone)

    if args.task_idx == 1:
        # Convert the scratch model with standard conv to cond conv
        source_chkpt_dir = 'CHECKPOINTS/Individual/{}/{}/{}/baseline'.format(
            args.exp_name, args.backbone, args.dataset)
        target_chkpt_dir = os.path.join(
            experiment_dir, 'Task{}_{}'.format(args.task_idx, args.dataset),
            'finetune')
        convert_standardconv_to_condconv(source_chkpt_dir, target_chkpt_dir,
                                         args.task_idx, args.dataset)
        return  # No training needed after the conversion
    else:
        # Load the model from the previous task
        prev_task_dir = find_task_dir_by_idx(experiment_dir, args.task_idx - 1)
        prev_chkpt_dir = os.path.join(experiment_dir, prev_task_dir, 'finetune')
        model, manager = load_checkpoint(prev_chkpt_dir)
        manager.rebuild_structure_with_expansion(
            model, args.task_idx, num_classes=args.num_classes,
            zero_init_expand=args.zero_init_expand)

    # ---------------------------------
    # Build the parallel model
    # ---------------------------------
    model = nn.DataParallel(model.to(device))

    # ---------------------------------
    # Run trainval or evaluate
    # ---------------------------------
    # Build the train and validation dataloaders
    train_loader, val_loader = build_imagedataloaders(
        'trainval', os.path.join(args.exp_name, args.dataset), transform_name,
        image_size, batch_size, padding, args.save_opt, args.workers)

    # Get the checkpoint directory name
    checkpoint_dir = os.path.join(
        experiment_dir, 'Task{}_{}'.format(args.task_idx, args.dataset),
        'finetune')

    # Get the optimizers and schedulers
    optimizers = build_optimizers(model.module, param_types, optimizer_infos,
                                  manager=manager, task_idx=args.task_idx)
    schedulers = build_schedulers(optimizers, scheduler_infos)

    # Run the training and validation loops
    run_trainval(model, manager, args.task_idx, args.dataset, max_epoch, device,
                 checkpoint_dir, train_loader, val_loader, optimizers,
                 schedulers, args.save_opt)

    return
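# --- Illustrative config sketch (assumption, not part of the original repo) ---
# The main() functions above read `configs.<name>.args` after being invoked as
# e.g. `python <entry_script>.py <config_name>` (the entry-script name is not
# shown in this excerpt).  A config module could expose a namespace like the
# one below, covering the fields read by the continual-training main() above.
# All concrete values here are hypothetical placeholders.
from types import SimpleNamespace

args = SimpleNamespace(
    rng_seed=0,
    exp_name='example_exp',                            # hypothetical
    backbone='resnet18',                               # hypothetical
    dataset='example_dataset',                         # hypothetical
    task_idx=1,
    num_classes=10,
    zero_init_expand=True,
    image_size=224,                                    # hypothetical
    batch_size=64,                                     # hypothetical
    padding=4,                                         # hypothetical
    transform_name='default',                          # hypothetical
    param_types=['backbone', 'classifier'],            # hypothetical
    max_epoch=100,                                     # hypothetical
    optimizer_infos=[{'name': 'SGD', 'lr': 0.1}],      # hypothetical
    scheduler_infos=[{'name': 'cosine'}],              # hypothetical
    save_opt='best',
    workers=4,
)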