Example #1
def train_and_evaluate(model, train_data, val_data, optimizer, loss_fn, metrics, params, model_dir, restore_file=None):
    """Train the model and evaluate every epoch.

    Args:
        model: (torch.nn.Module) the neural network
        train_data: (dict) training data with keys 'data' and 'labels'
        val_data: (dict) validation data with keys 'data' and 'labels'
        optimizer: (torch.optim) optimizer for parameters of model
        loss_fn: a function that takes batch_output and batch_labels and computes the loss for the batch
        metrics: (dict) a dictionary of functions that compute a metric using the output and labels of each batch
        params: (Params) hyperparameters
        model_dir: (string) directory containing config, weights and log
        restore_file: (string) optional; name of the file to restore from (without the .pth.tar extension)
    """
    # reload weights from restore_file if specified
    if restore_file is not None:
        restore_path = os.path.join(model_dir, restore_file + '.pth.tar')
        logging.info("Restoring parameters from {}".format(restore_path))
        utils.load_checkpoint(restore_path, model, optimizer)
        
    best_val_acc = 0.0

    for epoch in range(params.num_epochs):
        # Run one epoch
        logging.info("Epoch {}/{}".format(epoch + 1, params.num_epochs))

        # compute number of batches in one epoch (one full pass over the training set)
        num_steps = (params.train_size + 1) // params.batch_size
        train_data_iterator = data_loader.data_iterator(train_data, params, shuffle=True)
        train(model, optimizer, loss_fn, train_data_iterator, metrics, params, num_steps)
            
        # Evaluate for one epoch on validation set
        num_steps = (params.val_size + 1) // params.batch_size
        val_data_iterator = data_loader.data_iterator(val_data, params, shuffle=False)
        val_metrics = evaluate(model, loss_fn, val_data_iterator, metrics, params, num_steps)
        
        val_acc = val_metrics['accuracy']
        is_best = val_acc >= best_val_acc

        # Save weights
        utils.save_checkpoint({'epoch': epoch + 1,
                               'state_dict': model.state_dict(),
                               'optim_dict': optimizer.state_dict()}, 
                               is_best=is_best,
                               checkpoint=model_dir)
            
        # If this is the best model so far, save its metrics separately
        if is_best:
            logging.info("- Found new best accuracy")
            best_val_acc = val_acc
            
            # Save best val metrics in a json file in the model directory
            best_json_path = os.path.join(model_dir, "metrics_val_best_weights.json")
            utils.save_dict_to_json(val_metrics, best_json_path)

        # Save latest val metrics in a json file in the model directory
        last_json_path = os.path.join(model_dir, "metrics_val_last_weights.json")
        utils.save_dict_to_json(val_metrics, last_json_path)
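utils.save_checkpoint and utils.load_checkpoint are not shown in this excerpt. A minimal sketch consistent with how they are called above (the 'state_dict'/'optim_dict' keys, the is_best flag, the .pth.tar suffix) could look like the following; the last.pth.tar/best.pth.tar file names are assumptions, not taken from the original utils module:

import os
import shutil
import torch

def save_checkpoint(state, is_best, checkpoint):
    # Save `state` under the checkpoint directory; keep a copy of the best model.
    os.makedirs(checkpoint, exist_ok=True)
    filepath = os.path.join(checkpoint, 'last.pth.tar')
    torch.save(state, filepath)
    if is_best:
        shutil.copyfile(filepath, os.path.join(checkpoint, 'best.pth.tar'))

def load_checkpoint(checkpoint_path, model, optimizer=None):
    # Restore model (and optionally optimizer) state saved by save_checkpoint.
    checkpoint = torch.load(checkpoint_path)
    model.load_state_dict(checkpoint['state_dict'])
    if optimizer is not None:
        optimizer.load_state_dict(checkpoint['optim_dict'])
    return checkpoint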
Example #2
    # load data
    data_loader = DataLoader(args.data_dir, params)
    data = data_loader.load_data(['test'], args.data_dir)
    test_data = data['test']

    # specify the test set size
    params.test_size = test_data['size']
    test_data_iterator = data_loader.data_iterator(test_data, params)

    logging.info("- done.")

    # Define the model
    model = net.Net(params).cuda() if params.cuda else net.Net(params)

    loss_fn = net.loss_fn
    metrics = net.metrics

    logging.info("Starting evaluation")

    # Reload weights from the saved file
    utils.load_checkpoint(
        os.path.join(args.model_dir, args.restore_file + '.pth.tar'), model)

    # Evaluate
    num_steps = (params.test_size + 1) // params.batch_size
    test_metrics = evaluate(model, loss_fn, test_data_iterator, metrics,
                            params, num_steps)
    save_path = os.path.join(args.model_dir,
                             "metrics_test_{}.json".format(args.restore_file))
    utils.save_dict_to_json(test_metrics, save_path)
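net.metrics is also outside this excerpt. From the way it is used (a dict of functions applied to each batch's outputs and labels, with val_metrics['accuracy'] read in Example #1), a minimal sketch might be the following; it assumes evaluate hands the functions NumPy arrays of (batch, num_classes) scores and integer labels:

import numpy as np

def accuracy(outputs, labels):
    # Fraction of samples whose argmax score matches the label.
    preds = np.argmax(outputs, axis=1)
    return float(np.mean(preds == labels))

# The 'accuracy' key is required by train_and_evaluate in Example #1,
# which reads val_metrics['accuracy'] to track the best model.
metrics = {'accuracy': accuracy}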
Example #3
def main(*args, **kwargs):

    # ---------------------------------
    # Loading the config
    # ---------------------------------
    config_module = importlib.import_module('configs.' + sys.argv[1])
    args = config_module.args
    print(args)

    # ---------------------------------
    # General settings
    # ---------------------------------
    device = 'cuda'
    torch.manual_seed(args.rng_seed)
    torch.cuda.manual_seed(args.rng_seed)
    torch.cuda.manual_seed_all(args.rng_seed)
    np.random.seed(args.rng_seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    assert (args.train_type in ['baseline', 'finetune'])
    assert (args.save_opt in ['best', 'last'])

    # ---------------------------------
    # Dataset settings
    # ---------------------------------
    image_size = args.image_size
    batch_size = args.batch_size
    padding = args.padding
    transform_name = args.transform_name

    # ---------------------------------
    # Optimizer and Scheduler settings
    # ---------------------------------
    param_types = args.param_types
    max_epoch = args.max_epoch
    optimizer_infos = args.optimizer_infos
    scheduler_infos = args.scheduler_infos

    # ---------------------------------
    # Backbone settings
    # ---------------------------------
    backbone_info = build_backbone_info(args.backbone, 'standard', image_size)

    # ---------------------------------
    # Method settings
    # ---------------------------------
    experiment_dir = 'CHECKPOINTS/Individual/{}/{}/{}'.format(
        args.exp_name, args.backbone, args.dataset)

    if args.pretrain != '':
        assert (args.train_type !=
                'baseline'), 'Cannot use pretrain in baseline train_type'
        print('Load from the pretrained model!')
        model, _ = load_checkpoint(args.pretrain)

    else:
        assert (args.train_type !=
                'finetune'), 'Cannot use finetune train_type without pretrain'
        model = ClassificationNet(backbone_info, args.num_classes)

    # ---------------------------------
    # Build the parallel model
    # ---------------------------------
    model = nn.DataParallel(model.to(device))

    # ---------------------------------
    # Run trainval or evaluate
    # ---------------------------------
    # Build the train and validation dataloaders
    train_loader, val_loader = build_imagedataloaders(
        'trainval', os.path.join(args.exp_name, args.dataset), transform_name,
        image_size, batch_size, padding, args.save_opt, args.workers)

    # Get the checkpoint directory name
    inner_chkpt = args.train_type + args.chkpt_postfix
    checkpoint_dir = os.path.join(experiment_dir, inner_chkpt)

    # Get the optimizers and schedulers
    optimizers = build_optimizers(model.module, param_types, optimizer_infos)
    schedulers = build_schedulers(optimizers, scheduler_infos)

    # Run training and validation loops
    run_trainval(model, args.train_type, args.dataset, max_epoch, device,
                 checkpoint_dir, train_loader, val_loader, optimizers,
                 schedulers, args.save_opt)
    return
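The script pulls every hyperparameter from configs.<name>, loaded with importlib from the first command-line argument; the config module only has to expose an attribute-style args object. A hypothetical config module, with every value a placeholder and the structure of optimizer_infos/scheduler_infos an assumption:

# configs/example_baseline.py  (hypothetical module)
from types import SimpleNamespace

args = SimpleNamespace(
    rng_seed=0,
    train_type='baseline',        # must be 'baseline' or 'finetune'
    save_opt='best',              # must be 'best' or 'last'
    image_size=32,
    batch_size=128,
    padding=4,
    transform_name='standard',
    param_types=['backbone', 'classifier'],
    max_epoch=100,
    optimizer_infos=[{'type': 'SGD', 'lr': 0.1}],   # structure assumed
    scheduler_infos=[{'type': 'cosine'}],           # structure assumed
    backbone='resnet18',
    exp_name='demo',
    dataset='cifar100',
    num_classes=100,
    pretrain='',                  # non-empty only for train_type='finetune'
    chkpt_postfix='',
    workers=4,
)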
Example #4
def main(*args, **kwargs):

    # ---------------------------------
    # Loading the config
    # ---------------------------------
    config_module = importlib.import_module('configs.' + sys.argv[1])
    args = config_module.args
    print(args)

    # ---------------------------------
    # General settings
    # ---------------------------------
    device = 'cuda'
    torch.manual_seed(args.rng_seed)
    torch.cuda.manual_seed(args.rng_seed)
    torch.cuda.manual_seed_all(args.rng_seed)
    np.random.seed(args.rng_seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # ---------------------------------
    # Dataset settings
    # ---------------------------------
    image_size = args.image_size
    batch_size = args.batch_size
    padding = args.padding
    transform_name = args.transform_name

    # ---------------------------------
    # Backbone settings
    # ---------------------------------
    backbone_info = build_backbone_info(args.backbone, 'cond', image_size)

    # ---------------------------------
    # Method settings
    # ---------------------------------
    experiment_dir = 'CHECKPOINTS/Continual/{}/{}'.format(
        args.exp_name, args.backbone)
    output_path = 'CHECKPOINTS/Continual/{}/{}/RESULTS_WITHOUT_BOUNDARY.json'.format(
        args.exp_name, args.backbone)

    # ---------------------------------
    # Run evaluation
    # ---------------------------------
    task_dir = find_task_dir_by_idx(experiment_dir, args.final_task_idx)
    chkpt_dir = os.path.join(experiment_dir, task_dir, 'finetune')
    model, manager = load_checkpoint(chkpt_dir)

    # ---------------------------------
    # Random initialization strategy
    # ---------------------------------
    task_dirs = filter(lambda x: x.split('_', 1)[0][:4] == 'Task',
                       os.listdir(experiment_dir))
    task_dirs = sorted(list(task_dirs),
                       key=lambda x: int(x.split('_', 1)[0][4:]))
    num_tasks = len(task_dirs)
    num_total_classes = args.num_classes * num_tasks
    task_class_ids = np.split(np.arange(num_total_classes), num_tasks)

    for index, task_class_idx in enumerate(task_class_ids):
        manager.load_task_exclusive_params(model, index + 1)
        org_cls_state_dict = model.classifier.state_dict()
        model.build_classification_head(num_total_classes)
        new_cls_state_dict = model.classifier.state_dict()

        for name, org_param in org_cls_state_dict.items():
            new_param = new_cls_state_dict[name]
            cls_loc = torch.from_numpy(task_class_idx).long()
            new_param.index_copy_(0, cls_loc, org_param)

        manager.save_task_exclusive_params(model, index + 1)

    # ---------------------------------
    # Run evaluation without boundary
    # ---------------------------------
    task_accs, rough_accs = [], []
    total_corrects = 0
    total_examples = 0
    for dataset_idx, task_dir in enumerate(task_dirs):
        dataset = task_dir.split('_', 1)[1]
        print('Current Dataset: {}'.format(dataset))

        test_loader = build_imagedataloaders(
            'evaluate', os.path.join(args.exp_name, dataset), transform_name,
            image_size, batch_size, padding, args.save_opt, args.workers)
        test_iter = test_loader()
        num_iters = len(test_loader)

        with torch.no_grad():

            # Inference using all tasks
            task_output_list = []
            task_labels_list = []
            for task_idx in range(1, num_tasks + 1):
                manager.load_task_exclusive_params(model, task_idx)
                model.to(device)
                model.eval()

                output_list = []
                labels_list = []
                for batch_idx, batch_data in enumerate(test_iter):
                    sys.stdout.write('Task {}: {}/{}   ..... \r'.format(
                        task_idx, batch_idx + 1, num_iters))
                    sys.stdout.flush()
                    images, labels = batch_data
                    images = images.to(device)
                    labels = labels.to(device) + dataset_idx * args.num_classes
                    output = model(images)
                    output_list.append(output.cpu().numpy())
                    labels_list.append(labels.cpu().numpy())

                task_output_list.append(np.concatenate(output_list, 0))
                task_labels_list.append(np.concatenate(labels_list, 0))
                print()

            # Decide final predictions
            argmax_probs = np.argmax(np.concatenate(task_output_list, 1), 1)
            num_rough = np.sum((argmax_probs //
                                num_total_classes) == dataset_idx)
            preds = argmax_probs % num_total_classes
            labels = task_labels_list[-1]
            num_corrects = np.sum(preds == labels)
            num_examples = labels.shape[0]
            task_accs.append(num_corrects / num_examples)
            rough_accs.append(num_rough / num_examples)
            total_corrects += num_corrects
            total_examples += num_examples

    content = {}
    for index, task_acc in enumerate(task_accs):
        print('Task {} Acc: {:.4f}, ({:.4f})'.format(index + 1, task_acc,
                                                     rough_accs[index]))
    content['Task_Acc'] = [round(x, 2) for x in task_accs]
    content['Rough_Acc'] = [round(x, 2) for x in rough_accs]

    final_acc = total_corrects / total_examples
    print('Final Acc: {:.4f}'.format(final_acc))
    content['Final_Acc'] = round(final_acc, 2)

    with open(output_path, 'w') as f:
        json.dump(content, f)
    return
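The index_copy_ loop above plants each task's original classifier rows into that task's slice of the expanded head. A self-contained toy version of one such copy, with 2 tasks of 3 classes each and a hypothetical feature width of 8:

import numpy as np
import torch

num_tasks, num_classes = 2, 3
num_total_classes = num_tasks * num_classes
task_class_ids = np.split(np.arange(num_total_classes), num_tasks)
# task_class_ids == [array([0, 1, 2]), array([3, 4, 5])]

old_weight = torch.randn(num_classes, 8)        # task-2 head before expansion
new_weight = torch.zeros(num_total_classes, 8)  # expanded 6-way head

# Copy task 2's rows into slots 3..5 of the expanded head.
cls_loc = torch.from_numpy(task_class_ids[1]).long()
new_weight.index_copy_(0, cls_loc, old_weight)
assert torch.equal(new_weight[3:6], old_weight)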
Example #5
    random.seed(args.seed)
    torch.manual_seed(args.seed)

    # Create a directory to store weights
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Create model
    if args.model == 'WDSR-B':
        model = WDSR_B(args).to(device)
    else:
        model = WDSR_A(args).to(device)

    print_information(model, args)
    model = load_weights(model,
                         load_checkpoint(args.checkpoint_file)['state_dict'])

    # Define loss function and optimizer
    criterion = nn.L1Loss()
    optimizer = optim.Adam(filter(lambda x: x.requires_grad,
                                  model.parameters()),
                           lr=args.lr)

    # Prepare dataset
    # train_dataset = DIV2K(args, train=True)
    # valid_dataset = DIV2K(args, train=False)

    train_dataset = SRDataset("train")
    valid_dataset = SRDataset("valid")

    train_dataloader = DataLoader(dataset=train_dataset,
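The filter(lambda x: x.requires_grad, ...) construction above keeps frozen weights out of the optimizer's parameter groups, which matters when parts of a pretrained model are frozen before fine-tuning. A self-contained toy illustration (the two-layer model is a stand-in, not WDSR):

import torch.nn as nn
import torch.optim as optim

model = nn.Sequential(nn.Linear(4, 4), nn.Linear(4, 2))
for param in model[0].parameters():   # freeze the first layer
    param.requires_grad = False

# Adam only ever sees the trainable parameters.
optimizer = optim.Adam(filter(lambda x: x.requires_grad, model.parameters()),
                       lr=1e-3)
assert all(p.requires_grad
           for group in optimizer.param_groups
           for p in group['params'])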
Example #6
def main(*args, **kwargs):

    # ---------------------------------
    # Loading the config
    # ---------------------------------
    config_module = importlib.import_module('configs.' + sys.argv[1])
    args = config_module.args
    print(args)

    # ---------------------------------
    # General settings
    # ---------------------------------
    device = 'cuda'
    torch.manual_seed(args.rng_seed)
    torch.cuda.manual_seed(args.rng_seed)
    torch.cuda.manual_seed_all(args.rng_seed)
    np.random.seed(args.rng_seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # ---------------------------------
    # Dataset settings
    # ---------------------------------
    image_size = args.image_size
    batch_size = args.batch_size
    padding = args.padding
    transform_name = args.transform_name

    # ---------------------------------
    # Backbone settings
    # ---------------------------------
    backbone_info = build_backbone_info(args.backbone, 'cond', image_size)

    # ---------------------------------
    # Method settings
    # ---------------------------------
    experiment_dir = 'CHECKPOINTS/Continual/{}/{}'.format(
        args.exp_name, args.backbone)
    output_path = 'CHECKPOINTS/Continual/{}/{}/RESULTS_WITH_BOUNDARY.json'.format(
        args.exp_name, args.backbone)

    # ---------------------------------
    # Run evaluation
    # ---------------------------------
    task_dir = find_task_dir_by_idx(experiment_dir, args.final_task_idx)
    chkpt_dir = os.path.join(experiment_dir, task_dir, 'finetune')
    model, manager = load_checkpoint(chkpt_dir)
    manager.load_task_exclusive_params(model, args.task_idx)

    model = nn.DataParallel(model.to(device))

    test_loader = build_imagedataloaders(
        'evaluate', os.path.join(args.exp_name, args.dataset), transform_name,
        image_size, batch_size, padding, args.save_opt, args.workers)

    val_loss, val_acc = test_epoch(model, device, test_loader, -1)

    if os.path.exists(output_path):
        with open(output_path, 'r') as f:
            content = json.load(f)
    else:
        content = {}

    content['Task{}_{}'.format(args.task_idx,
                               args.dataset)] = round(val_acc, 2)

    with open(output_path, 'w') as f:
        json.dump(content, f)
    return
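The tail of the script merges a single accuracy into a shared JSON file, so separate per-task invocations accumulate into one results dict instead of overwriting each other. The same read-modify-write pattern factored into a helper (the function name is not from the original):

import json
import os

def update_results(output_path, key, value):
    # Load existing results if any, add one entry, write everything back.
    if os.path.exists(output_path):
        with open(output_path, 'r') as f:
            content = json.load(f)
    else:
        content = {}
    content[key] = value
    with open(output_path, 'w') as f:
        json.dump(content, f)

# e.g. update_results(output_path, 'Task1_cifar100', 0.91)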
Example #7
            t.update(lr.shape[0])

    print('DIV2K (val) PSNR: {:.4f} dB'.format(psnr.avg))


if __name__ == '__main__':
    # Define specific options and parse arguments
    parser.add_argument('--dataset-dir', type=str, required=True, help='DIV2K Dataset Root Directory')
    parser.add_argument('--checkpoint-file', type=str, required=True)
    parser.add_argument('--self-ensemble', action='store_true')
    args = parser.parse_args()

    # Set cuDNN auto-tuner and get device
    cudnn.benchmark = True
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # Create model
    if args.model == 'WDSR-B':
        model = WDSR_B(args).to(device)
    else:
        model = WDSR_A(args).to(device)

    # Load weights
    model = load_weights(model, load_checkpoint(args.checkpoint_file)['state_dict'])

    # Prepare dataset
    dataset = DIV2K(args, train=False)
    dataloader = DataLoader(dataset=dataset, batch_size=1)

    test(dataset, dataloader, model, device, args)
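How the --self-ensemble flag is consumed is outside this excerpt; in super-resolution codebases it usually enables geometric self-ensemble, i.e. averaging the model's outputs over the eight flip/rotation variants of each input. A sketch under that assumption, for NCHW tensors:

import torch

def forward_self_ensemble(model, lr):
    # Average predictions over the 8 flip/rotation variants of the input.
    outputs = []
    for k in range(4):                      # 0/90/180/270 degree rotations
        for flip in (False, True):
            x = torch.rot90(lr, k, dims=(2, 3))
            if flip:
                x = torch.flip(x, dims=(3,))
            y = model(x)
            if flip:                        # undo the transforms on the output
                y = torch.flip(y, dims=(3,))
            y = torch.rot90(y, -k, dims=(2, 3))
            outputs.append(y)
    return torch.stack(outputs).mean(dim=0)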
Example #8
def main(*args, **kwargs):

    # ---------------------------------
    # Loading the config
    # ---------------------------------
    config_module = importlib.import_module('configs.' + sys.argv[1])
    args = config_module.args
    print(args)

    # ---------------------------------
    # General settings
    # ---------------------------------
    device = 'cuda'
    torch.manual_seed(args.rng_seed)
    torch.cuda.manual_seed(args.rng_seed)
    torch.cuda.manual_seed_all(args.rng_seed)
    np.random.seed(args.rng_seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    assert (args.save_opt in ['best', 'last'])

    # ---------------------------------
    # Dataset settings
    # ---------------------------------
    image_size = args.image_size
    batch_size = args.batch_size
    padding = args.padding
    transform_name = args.transform_name

    # ---------------------------------
    # Optimizer and Scheduler settings
    # ---------------------------------
    param_types = args.param_types
    max_epoch = args.max_epoch
    optimizer_infos = args.optimizer_infos
    scheduler_infos = args.scheduler_infos

    # ---------------------------------
    # Backbone settings
    # ---------------------------------
    backbone_info = build_backbone_info(args.backbone, 'cond', image_size)

    # ---------------------------------
    # Method settings
    # ---------------------------------
    experiment_dir = 'CHECKPOINTS/Continual/{}/{}'.format(
        args.exp_name, args.backbone)

    if args.task_idx == 1:

        # Convert the scratch model with standard conv to cond conv
        source_chkpt_dir = 'CHECKPOINTS/Individual/{}/{}/{}/baseline'.format(
            args.exp_name, args.backbone, args.dataset)
        target_chkpt_dir = os.path.join(
            experiment_dir, 'Task{}_{}'.format(args.task_idx, args.dataset),
            'finetune')
        convert_standardconv_to_condconv(source_chkpt_dir, target_chkpt_dir,
                                         args.task_idx, args.dataset)
        return  # No training needed after conversion

    else:

        # Load the model from the previous task
        prev_task_dir = find_task_dir_by_idx(experiment_dir, args.task_idx - 1)
        prev_chkpt_dir = os.path.join(experiment_dir, prev_task_dir,
                                      'finetune')
        model, manager = load_checkpoint(prev_chkpt_dir)
        manager.rebuild_structure_with_expansion(
            model,
            args.task_idx,
            num_classes=args.num_classes,
            zero_init_expand=args.zero_init_expand)

    # ---------------------------------
    # Build the parallel model
    # ---------------------------------
    model = nn.DataParallel(model.to(device))

    # ---------------------------------
    # Run trainval or evaluate
    # ---------------------------------
    # Build the train and validation dataloaders
    train_loader, val_loader = build_imagedataloaders(
        'trainval', os.path.join(args.exp_name, args.dataset), transform_name,
        image_size, batch_size, padding, args.save_opt, args.workers)

    # Get the checkpoint directory name
    checkpoint_dir = os.path.join(
        experiment_dir, 'Task{}_{}'.format(args.task_idx, args.dataset),
        'finetune')

    # Get the optimizers and schedulers
    optimizers = build_optimizers(model.module,
                                  param_types,
                                  optimizer_infos,
                                  manager=manager,
                                  task_idx=args.task_idx)
    schedulers = build_schedulers(optimizers, scheduler_infos)

    # Run the training and validation loops
    run_trainval(model, manager, args.task_idx, args.dataset, max_epoch,
                 device, checkpoint_dir, train_loader, val_loader, optimizers,
                 schedulers, args.save_opt)
    return
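find_task_dir_by_idx is shared by Examples #4, #6, and this one but never shown. Given the 'Task{idx}_{dataset}' directory naming used above, a plausible reconstruction (not the original helper) is:

import os

def find_task_dir_by_idx(experiment_dir, task_idx):
    # Directories are named 'Task{idx}_{dataset}'; return the one matching idx.
    for name in os.listdir(experiment_dir):
        head = name.split('_', 1)[0]
        if head.startswith('Task') and head[4:].isdigit() \
                and int(head[4:]) == task_idx:
            return name
    raise FileNotFoundError('No Task{}_* directory under {}'.format(
        task_idx, experiment_dir))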