Ejemplo n.º 1
0
def main():
    """Run a multi-phase segmentation training session and build submissions.

    Parses the command line, loads training ids, depths, images and masks,
    splits them into a training and a validation part and then trains the
    model in up to three phases: warmup, main and five cosine-annealing
    cycles that each write their own snapshot. The session config, the
    checkpoints and the training history are written to the experiment
    directory, and ``generate_model_submission`` is finally called for the
    best checkpoint of the main phase and for every snapshot.

    Raises:
        ValueError: if fine-tuning and encoder freezing are both requested.
        RuntimeError: if the train or validation ids are not sorted.
    """
    parser = U.get_argparser()
    args = parser.parse_args()

    # Validate option combinations before any expensive work (data loading,
    # model creation on the GPU, directory creation) is done.
    if args.fine_tune and args.freeze_encoder > 0:
        raise ValueError(
            'Incompatible options --fine-tune and --freeze-encoder')

    U.set_manual_seed(args.seed)

    # vars() returns the namespace's own __dict__, so keys added to
    # train_session_args later on are also visible as attributes of args.
    train_session_args = vars(args)
    train_session = U.get_random_name()
    current_time = datetime.now().strftime('%b%d_%H_%M')
    prefix = f'{current_time}_{args.model}_{args.prepare}_{args.augmentation}_{train_session}'
    if args.fold is not None:
        prefix += f'_fold_{args.stratify}_{args.fold}'

    log_dir = os.path.join('runs', prefix)
    exp_dir = os.path.join('experiments', args.model, args.prepare,
                           args.augmentation, prefix)
    os.makedirs(exp_dir, exist_ok=True)

    train_ids = D.all_train_ids()
    depths = D.read_depths(train_ids)
    images = D.read_train_images(train_ids)
    masks = D.read_train_masks(train_ids)

    if args.fix_masks:
        masks, changed_ids = D.fix_masks(masks, train_ids)
        # Keep a record of which samples had their masks altered.
        with open(os.path.join(exp_dir, 'fixed_masks.txt'), 'w') as f:
            for sample_id in changed_ids:
                f.write(sample_id)
                f.write('\n')
        print(f'Fixed {len(changed_ids)} masks')

    if args.fold is not None:
        train_indexes, test_indexes = D.get_train_test_split_for_fold(
            args.stratify, args.fold, train_ids)
    else:
        # shuffle=False keeps the original order of the ids, which the
        # sortedness checks below rely on.
        train_indexes, test_indexes = train_test_split(
            np.arange(len(train_ids)),
            shuffle=False,
            random_state=args.split_seed,
            test_size=0.2)

    ids_train, ids_test = train_ids[train_indexes], train_ids[test_indexes]
    img_train, img_test = images[train_indexes], images[test_indexes]
    mask_train, mask_test = masks[train_indexes], masks[test_indexes]
    depth_train, depth_test = depths[train_indexes], depths[test_indexes]

    # Here we can exclude some images from training, but keep in validation
    train_mask = D.drop_some(img_train,
                             mask_train,
                             drop_black=True,
                             drop_vstrips=args.drop_vstrips,
                             drop_few=args.drop_few)
    ids_train = ids_train[train_mask]
    img_train = img_train[train_mask]
    mask_train = mask_train[train_mask]
    depth_train = depth_train[train_mask]

    if not is_sorted(ids_train):
        raise RuntimeError("ids_train is not sorted")
    if not is_sorted(ids_test):
        raise RuntimeError("ids_test is not sorted")

    prepare_fn = D.get_prepare_fn(args.prepare, **train_session_args)

    # This line valid if we apply prepare_fn first and then do augmentation
    target_size = prepare_fn.target_size if prepare_fn is not None else D.ORIGINAL_SIZE

    build_augmentation_fn = D.AUGMENTATION_MODES[args.augmentation]
    aug = build_augmentation_fn(target_size, border_mode=args.border_mode)

    # Both pipelines start with the optional prepare transform; only the
    # training pipeline additionally applies augmentation.
    train_transform_list = []
    valid_transform_list = []
    if prepare_fn is not None:
        train_transform_list.append(prepare_fn.t_forward)
        valid_transform_list.append(prepare_fn.t_forward)

    train_transform_list.append(aug)

    trainset = D.ImageAndMaskDataset(ids_train,
                                     img_train,
                                     mask_train,
                                     depth_train,
                                     augment=A.Compose(train_transform_list))

    validset = D.ImageAndMaskDataset(ids_test,
                                     img_test,
                                     mask_test,
                                     depth_test,
                                     augment=A.Compose(valid_transform_list))

    trainloader = DataLoader(trainset,
                             batch_size=args.batch_size,
                             num_workers=args.workers,
                             pin_memory=True,
                             drop_last=True,
                             shuffle=True)

    validloader = DataLoader(validset,
                             batch_size=args.batch_size,
                             pin_memory=True,
                             drop_last=False,
                             shuffle=False)

    # Save train/val split for future use
    train_session_args.update({
        'train_set': list(ids_train),
        'valid_set': list(ids_test)
    })

    # Declare variables we will use during training
    start_epoch = 0
    train_history = pd.DataFrame()

    target_metric = args.target_metric
    target_metric_mode = 'max'
    best_metric_val = 0
    best_lb_checkpoint = os.path.join(exp_dir, f'{prefix}_{target_metric}.pth')

    model = U.get_model(args.model,
                        num_classes=args.num_classes,
                        num_channels=trainset.channels(),
                        abn=args.abn,
                        use_dropout=not args.no_dropout,
                        pretrained=not args.no_pretrain).cuda()

    print('Train set size :', len(ids_train), 'batch size',
          trainloader.batch_size)
    print('Valid set size :', len(ids_test), 'batch size',
          validloader.batch_size)
    print('Tile transform :', prepare_fn if prepare_fn is not None else "None")
    print('Model          :', args.model, count_parameters(model))
    print('Augmentations  :', args.augmentation, args.border_mode)
    print('Input channels :', trainset.channels())
    print('Output classes :', args.num_classes)
    print('Optimizer      :', args.optimizer, 'wd', args.weight_decay)
    print('Use of dropout :', not args.no_dropout)
    print('Train session  :', train_session)
    print('Freeze encoder :', args.freeze_encoder)
    print('Seed           :', args.seed, args.split_seed)
    print('Restart every  :', args.restart_every)
    print('Fold           :', args.fold, args.stratify)
    print('Fine-tune      :', args.fine_tune)
    print('ABN Mode       :', args.abn)
    print('Fix masks      :', args.fix_masks)

    if args.resume:
        fname = U.auto_file(args.resume)
        # NOTE(review): best_score is only reported; best_metric_val stays 0,
        # so this session writes its own best checkpoint under the new prefix.
        start_epoch, train_history, best_score = U.restore_checkpoint(
            fname, model)
        print(train_history)
        print('Resuming training from epoch', start_epoch, ' and score',
              best_score, args.resume)

    writer = SummaryWriter(log_dir)
    writer.add_text('train/params',
                    '```' + json.dumps(train_session_args, indent=2) + '```',
                    0)

    config_fname = os.path.join(exp_dir, f'{train_session}.json')
    with open(config_fname, 'w') as f:
        f.write(json.dumps(train_session_args, indent=2))

    # Relative weights of the loss terms, keyed like the per-phase loss dicts.
    weights = {
        'mask': 1.0,
        'class': 0.05,
        'dsv': 0.1,
    }

    bce = U.get_loss('bce')
    bce_lovasz = U.get_loss('bce_lovasz')
    bce_jaccard = U.get_loss('bce_jaccard')

    # Only the 'mask' loss changes between phases.
    losses = {
        'warmup': {
            'mask': bce,
            'class': bce,
            'dsv': bce,
        },
        'main': {
            'mask': bce_jaccard,
            'class': bce,
            'dsv': bce,
        },
        'annealing': {
            'mask': bce_lovasz,
            'class': bce,
            'dsv': bce,
        }
    }

    epochs = {'warmup': 50, 'main': 250, 'annealing': 50}

    if args.fast:
        # Fast mode: a single epoch per phase.
        epochs = dict.fromkeys(epochs, 1)

    learning_rates = {
        'warmup': args.learning_rate,
        'main': 1e-3,
        'annealing': 1e-2
    }

    # Warmup phase
    if epochs['warmup']:
        print(torch.cuda.max_memory_allocated(),
              torch.cuda.max_memory_cached())
        trainable_parameters = filter(lambda p: p.requires_grad,
                                      model.parameters())
        optimizer = U.get_optimizer(args.optimizer,
                                    trainable_parameters,
                                    learning_rates['warmup'],
                                    weight_decay=args.weight_decay)
        scheduler = None  # StepLR(optimizer, gamma=0.5, step_size=50)

        train_history, best_metric_val, start_epoch = train(
            model,
            losses['warmup'],
            weights,
            optimizer,
            scheduler,
            trainloader,
            validloader,
            writer,
            start_epoch,
            epochs=epochs['warmup'],
            early_stopping=args.early_stopping,
            train_history=train_history,
            experiment_dir=exp_dir,
            target_metric=target_metric,
            best_metric_val=best_metric_val,
            target_metric_mode=target_metric_mode,
            checkpoint_filename=best_lb_checkpoint)
        U.save_checkpoint(os.path.join(exp_dir, f'{prefix}_warmup.pth'),
                          model,
                          start_epoch,
                          train_history,
                          metric_name=target_metric,
                          metric_score=best_metric_val)

        # Drop the phase's optimizer state before the next phase allocates
        # its own.
        del trainable_parameters, optimizer, scheduler
        torch.cuda.empty_cache()
        torch.cuda.synchronize()

        print('Finished warmup phase. Main train loop.')

    # Main training phase
    print(torch.cuda.max_memory_allocated(), torch.cuda.max_memory_cached())
    trainable_parameters = filter(lambda p: p.requires_grad,
                                  model.parameters())
    optimizer = U.get_optimizer(args.optimizer,
                                trainable_parameters,
                                learning_rates['main'],
                                weight_decay=args.weight_decay)
    scheduler = ReduceLROnPlateau(optimizer,
                                  mode='max',
                                  patience=50,
                                  factor=0.5,
                                  min_lr=1e-5)

    train_history, best_metric_val, start_epoch = train(
        model,
        losses['main'],
        weights,
        optimizer,
        scheduler,
        trainloader,
        validloader,
        writer,
        start_epoch,
        epochs=epochs['main'],
        early_stopping=args.early_stopping,
        train_history=train_history,
        experiment_dir=exp_dir,
        target_metric=target_metric,
        best_metric_val=best_metric_val,
        target_metric_mode=target_metric_mode,
        checkpoint_filename=best_lb_checkpoint)
    del trainable_parameters, optimizer, scheduler
    torch.cuda.empty_cache()
    torch.cuda.synchronize()

    # Files that submissions are generated from at the very end.
    snapshots = [best_lb_checkpoint]

    U.save_checkpoint(os.path.join(exp_dir, f'{prefix}_main.pth'),
                      model,
                      start_epoch,
                      train_history,
                      metric_name=target_metric,
                      metric_score=best_metric_val)

    print('Finished train phase.')

    # Cosine annealing
    if epochs['annealing']:

        for snapshot in range(5):
            print(f'Starting annealing phase {snapshot}')
            print(torch.cuda.max_memory_allocated(),
                  torch.cuda.max_memory_cached())
            # Every cycle starts from a fresh SGD optimizer and a fresh
            # cosine schedule.
            trainable_parameters = filter(lambda p: p.requires_grad,
                                          model.parameters())
            optimizer = U.get_optimizer('sgd',
                                        trainable_parameters,
                                        learning_rates['annealing'],
                                        weight_decay=args.weight_decay)
            scheduler = CosineAnnealingLR(optimizer,
                                          epochs['annealing'],
                                          eta_min=1e-7)

            snapshot_name = os.path.join(
                exp_dir, f'{prefix}_{target_metric}_snapshot_{snapshot}.pth')
            snapshots.append(snapshot_name)
            # best_metric_val=0 restarts the best-metric tracking for every
            # cycle, so each cycle writes to its own snapshot file.
            train_history, best_metric_val, start_epoch = train(
                model,
                losses['annealing'],
                weights,
                optimizer,
                scheduler,
                trainloader,
                validloader,
                writer,
                start_epoch,
                epochs=epochs['annealing'],
                early_stopping=args.early_stopping,
                train_history=train_history,
                experiment_dir=exp_dir,
                target_metric=target_metric,
                best_metric_val=0,
                target_metric_mode=target_metric_mode,
                checkpoint_filename=snapshot_name)
            del trainable_parameters, optimizer, scheduler
            torch.cuda.empty_cache()
            torch.cuda.synchronize()

    print('Training finished')
    train_history.to_csv(os.path.join(exp_dir, 'train_history.csv'),
                         index=False)

    for snapshot_file in snapshots:
        generate_model_submission(snapshot_file,
                                  config_fname,
                                  mine_on_val=True)
Ejemplo n.º 2
0
def main():
    """Train a segmentation model in a single loop and build a submission.

    Parses the command line, loads the (optionally filtered) training ids
    with their depths, images and masks, splits them into a training and a
    validation part and runs ``args.epochs`` epochs. The optimizer (and the
    LR scheduler, if one is configured) is rebuilt on the first epoch, every
    ``args.restart_every`` epochs and at the epoch where the encoder gets
    unfrozen. The best checkpoint according to ``args.target_metric`` is
    saved and finally passed to ``generate_model_submission``.

    Raises:
        ValueError: if fine-tuning and encoder freezing are both requested.
        RuntimeError: if the train or validation ids are not sorted.
    """
    parser = U.get_argparser()
    args = parser.parse_args()

    # Validate option combinations before any expensive work (data loading,
    # model creation on the GPU, directory creation) is done.
    if args.fine_tune and args.freeze_encoder > 0:
        raise ValueError(
            'Incompatible options --fine-tune and --freeze-encoder')

    U.set_manual_seed(args.seed)

    # vars() returns the namespace's own __dict__, so keys added to
    # train_session_args later on are also visible as attributes of args.
    train_session_args = vars(args)
    train_session = U.get_random_name()
    current_time = datetime.now().strftime('%b%d_%H_%M')
    prefix = f'{current_time}_{args.model}_{args.prepare}_{args.augmentation}_{train_session}'
    if args.fold is not None:
        prefix += f'_fold_{args.stratify}_{args.fold}'

    log_dir = os.path.join('runs', prefix)
    exp_dir = os.path.join('experiments', args.model, args.prepare,
                           args.augmentation, prefix)
    os.makedirs(exp_dir, exist_ok=True)

    train_ids = D.get_train_ids(drop_black=True,
                                drop_vstrips=args.drop_vstrips,
                                drop_empty=args.drop_empty,
                                drop_few=args.drop_few,
                                fast=args.fast)
    depths = D.read_depths(train_ids)
    images = D.read_train_images(train_ids)
    masks = D.read_train_masks(train_ids)

    if args.fix_masks:
        masks, changed_ids = D.fix_masks(masks, train_ids)
        # Keep a record of which samples had their masks altered.
        with open(os.path.join(exp_dir, 'fixed_masks.txt'), 'w') as f:
            for sample_id in changed_ids:
                f.write(sample_id)
                f.write('\n')
        print(f'Fixed {len(changed_ids)} masks')

    if args.fold is not None:
        train_indexes, test_indexes = D.get_train_test_split_for_fold(
            args.stratify, args.fold, train_ids)
    else:
        # shuffle=False keeps the original order of the ids, which the
        # sortedness checks below rely on.
        train_indexes, test_indexes = train_test_split(
            np.arange(len(train_ids)),
            shuffle=False,
            random_state=args.split_seed,
            test_size=0.2)

    ids_train, ids_test = train_ids[train_indexes], train_ids[test_indexes]
    if not is_sorted(ids_train):
        raise RuntimeError("ids_train is not sorted")
    if not is_sorted(ids_test):
        raise RuntimeError("ids_test is not sorted")

    img_train, img_test = images[train_indexes], images[test_indexes]
    mask_train, mask_test = masks[train_indexes], masks[test_indexes]
    depth_train, depth_test = depths[train_indexes], depths[test_indexes]

    prepare_fn = D.get_prepare_fn(args.prepare, **train_session_args)

    # This line valid if we apply prepare_fn first and then do augmentation
    target_size = prepare_fn.target_size if prepare_fn is not None else D.ORIGINAL_SIZE

    # prepare_fn may be None (see the guards around it), in which case there
    # is no backward transform to hand to process_epoch.
    # NOTE(review): assumes process_epoch accepts mask_postprocess=None - confirm.
    mask_postprocess = prepare_fn.backward if prepare_fn is not None else None

    build_augmentation_fn = D.AUGMENTATION_MODES[args.augmentation]
    aug = build_augmentation_fn(target_size, border_mode=args.border_mode)

    # Both pipelines start with the optional prepare transform; only the
    # training pipeline additionally applies augmentation.
    train_transform_list = []
    valid_transform_list = []
    if prepare_fn is not None:
        train_transform_list.append(prepare_fn.t_forward)
        valid_transform_list.append(prepare_fn.t_forward)

    train_transform_list.append(aug)

    trainset = D.ImageAndMaskDataset(ids_train,
                                     img_train,
                                     mask_train,
                                     depth_train,
                                     augment=A.Compose(train_transform_list))

    validset = D.ImageAndMaskDataset(ids_test,
                                     img_test,
                                     mask_test,
                                     depth_test,
                                     augment=A.Compose(valid_transform_list))

    trainloader = DataLoader(trainset,
                             batch_size=args.batch_size,
                             num_workers=args.workers,
                             pin_memory=True,
                             drop_last=True,
                             shuffle=True)

    validloader = DataLoader(validset,
                             batch_size=args.batch_size,
                             pin_memory=True,
                             drop_last=False,
                             shuffle=False)

    # Save train/val split for future use
    train_session_args.update({
        'train_set': list(ids_train),
        'valid_set': list(ids_test)
    })

    # Declare variables we will use during training
    start_epoch = 0
    train_history = pd.DataFrame()
    scheduler = None
    optimizer = None

    target_metric = args.target_metric
    target_metric_mode = 'max'
    best_metric_val = 0
    best_lb_checkpoint = os.path.join(exp_dir, f'{prefix}_{target_metric}.pth')

    model = U.get_model(args.model,
                        num_classes=args.num_classes,
                        num_channels=trainset.channels(),
                        abn=args.abn,
                        use_dropout=not args.no_dropout,
                        pretrained=not args.no_pretrain).cuda()

    # Report the number of samples (len of a DataLoader would be the number
    # of batches, which does not match the "set size" label).
    print('Train set size :', len(ids_train), 'batch size',
          trainloader.batch_size)
    print('Valid set size :', len(ids_test), 'batch size',
          validloader.batch_size)
    print('Tile transform :', prepare_fn if prepare_fn is not None else "None")
    print('Model          :', args.model, count_parameters(model))
    print('Augmentations  :', args.augmentation, args.border_mode)
    print('Input channels :', trainset.channels())
    print('Output classes :', args.num_classes)
    print('Criterion      :', args.loss)
    print('Optimizer      :', args.optimizer, args.learning_rate,
          args.weight_decay)
    print('Use of dropout :', not args.no_dropout)
    print('Train session  :', train_session)
    print('Freeze encoder :', args.freeze_encoder)
    print('Seed           :', args.seed, args.split_seed)
    print('Restart every  :', args.restart_every)
    print('Fold           :', args.fold, args.stratify)
    print('Fine-tune      :', args.fine_tune)
    print('ABN Mode       :', args.abn)
    print('Fix masks      :', args.fix_masks)

    if args.resume:
        fname = U.auto_file(args.resume)
        # NOTE(review): best_score is only reported; best_metric_val stays 0,
        # so this session writes its own best checkpoint under the new prefix.
        start_epoch, train_history, best_score = U.restore_checkpoint(
            fname, model)
        print(train_history)
        print('Resuming training from epoch', start_epoch, ' and score',
              best_score, args.resume)

    segmentation_loss = U.get_loss(args.loss)

    writer = SummaryWriter(log_dir)
    writer.add_text('train/params',
                    '```' + json.dumps(train_session_args, indent=2) + '```',
                    0)

    config_fname = os.path.join(exp_dir, f'{train_session}.json')
    with open(config_fname, 'w') as f:
        f.write(json.dumps(train_session_args, indent=2))

    # Start training loop
    no_improvement_epochs = 0

    for epoch in range(start_epoch, start_epoch + args.epochs):
        # On Epoch begin
        if U.should_quit(exp_dir) or (
                args.early_stopping is not None
                and no_improvement_epochs > args.early_stopping):
            break

        epochs_trained = epoch - start_epoch
        # Rebuild the optimizer on the first epoch, on every periodic
        # restart and at the epoch where the encoder becomes trainable
        # (the set of trainable parameters changes there).
        should_restart_optimizer = (
            (args.restart_every > 0
             and epochs_trained % args.restart_every == 0)
            or epochs_trained == args.freeze_encoder
            or optimizer is None)

        if should_restart_optimizer:
            del optimizer
            if args.fine_tune:
                model.set_fine_tune(args.fine_tune)
            else:
                model.set_encoder_training_enabled(
                    epochs_trained >= args.freeze_encoder)

            trainable_parameters = filter(lambda p: p.requires_grad,
                                          model.parameters())
            optimizer = U.get_optimizer(args.optimizer,
                                        trainable_parameters,
                                        args.learning_rate,
                                        weight_decay=args.weight_decay)

            print('Restarting optimizer state', epoch, count_parameters(model))

            if args.lr_scheduler:
                scheduler = U.get_lr_scheduler(args.lr_scheduler, optimizer,
                                               args.epochs)

        # Epoch-driven schedulers are stepped up front; ReduceLROnPlateau
        # needs the validation metric and is stepped after the epoch.
        if scheduler is not None and not isinstance(scheduler,
                                                    ReduceLROnPlateau):
            scheduler.step(epochs_trained)

        U.log_learning_rate(writer, optimizer, epoch)

        # Epoch
        train_metrics = process_epoch(model,
                                      segmentation_loss,
                                      optimizer,
                                      trainloader,
                                      epoch,
                                      True,
                                      writer,
                                      mask_postprocess=mask_postprocess)
        valid_metrics = process_epoch(model,
                                      segmentation_loss,
                                      None,
                                      validloader,
                                      epoch,
                                      False,
                                      writer,
                                      mask_postprocess=mask_postprocess)

        all_metrics = {}
        all_metrics.update(train_metrics)
        all_metrics.update(valid_metrics)

        # On Epoch End
        summary = {
            'epoch': [int(epoch)],
            'lr': [float(optimizer.param_groups[0]['lr'])]
        }
        for k, v in all_metrics.items():
            summary[k] = [v]

        # DataFrame.append was removed in pandas 2.0; concat is the
        # equivalent that works across pandas versions.
        train_history = pd.concat(
            [train_history, pd.DataFrame.from_dict(summary)],
            ignore_index=True)
        print(epoch, summary)

        if isinstance(scheduler, ReduceLROnPlateau):
            scheduler.step(all_metrics[target_metric], epochs_trained)

        if U.is_better(all_metrics[target_metric], best_metric_val,
                       target_metric_mode):
            best_metric_val = all_metrics[target_metric]
            U.save_checkpoint(best_lb_checkpoint,
                              model,
                              epoch,
                              train_history,
                              metric_name=target_metric,
                              metric_score=best_metric_val)
            print('Checkpoint saved', epoch, best_metric_val,
                  best_lb_checkpoint)
            no_improvement_epochs = 0
        else:
            no_improvement_epochs += 1

    print('Training finished')

    generate_model_submission(best_lb_checkpoint,
                              config_fname,
                              mine_on_val=True)
Ejemplo n.º 3
0
    # Build the environment model and move it to the target device; its
    # string representation is stored on the config object.
    net_em = models.environment_model.EnvironmentModel(obs_shape, act_n,
                                                       config)
    # net_em.load_state_dict(torch.load(config.EM_FILE_NAME, map_location=lambda storage, loc: storage))
    net_em = net_em.to(device)
    config.EM_NET = str(net_em)

    # Build the I2A network from the environment model and the policy
    # network (net_policy is defined outside the visible fragment).
    net_i2a = i2a_model_no_LSTM.I2A_FC(obs_shape, act_n, net_em, net_policy,
                                       config).to(device)
    config.I2A_NET = str(net_i2a)
    config.ROLLOUT_ENCODER = str(net_i2a.encoder)
    #    net_i2a.load_state_dict(torch.load("saves/03_i2a_test/best_pong_-018.667_1300.dat", map_location=lambda storage, loc: storage))
    #     print(net_policy)
    #     print(net_em)
    print(net_i2a)
    print("em param count: ", common.count_parameters(net_em))
    print("net_policy param count: ", common.count_parameters(net_policy))
    print("ia policy param count: ", common.count_parameters(net_i2a))

    # Run one forward pass on the first environment's initial observation.
    # NOTE(review): `res` is not used in the visible lines - presumably a
    # smoke test of the network; confirm against the rest of the function.
    obs = envs[0].reset()
    obs_v = ptan.agent.default_states_preprocessor([obs]).to(device)
    res = net_i2a(obs_v)

    optimizer = optim.RMSprop(net_i2a.parameters(),
                              lr=config.LEARNING_RATE,
                              eps=1e-5)
    # NOTE(review): policy_opt is not passed to the trainer below; check
    # whether it is used later in the function.
    policy_opt = optim.Adam(net_policy.parameters(), lr=config.POLICY_LR)

    trainer = lib.trainer.A2CTrainer(envs, test_env, net_i2a, optimizer,
                                     device, config)
Ejemplo n.º 4
0
def main():
    """Train a segmentation model from the command line.

    Parses the arguments, builds the model, loss, optimizer, metrics and
    data loaders, optionally restores a previous state, and runs the
    train/validate loop up to ``args.epochs``. After every epoch a snapshot
    of the latest state is written; whenever the validation loss improves a
    separate checkpoint is written as well. The training history is saved
    as CSV in the experiment directory at the end.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument('-g',
                        '--grayscale',
                        action='store_true',
                        help='Whether to use grayscale image instead of RGB')
    parser.add_argument('-m',
                        '--model',
                        required=True,
                        type=str,
                        help='Name of the model')
    parser.add_argument('-p', '--patch-size', type=int, default=224)
    parser.add_argument('-b',
                        '--batch-size',
                        type=int,
                        default=1,
                        help='Batch Size during training, e.g. -b 64')
    parser.add_argument('-lr',
                        '--learning-rate',
                        type=float,
                        default=1e-3,
                        help='Initial learning rate')
    parser.add_argument('-l',
                        '--loss',
                        type=str,
                        default='bce',
                        help='Target loss')
    parser.add_argument('-o',
                        '--optimizer',
                        default='SGD',
                        help='Name of the optimizer')
    parser.add_argument('-e',
                        '--epochs',
                        type=int,
                        default=100,
                        help='Epoch to run')
    # The dataset name is used to build the experiment path below, so a
    # missing value would fail in os.path.join; report it as a usage error.
    parser.add_argument('-d',
                        '--dataset',
                        required=True,
                        type=str,
                        help='Name of the dataset to use for training.')
    parser.add_argument('-dd',
                        '--data-dir',
                        type=str,
                        default='data',
                        help='Root directory where datasets are located.')
    parser.add_argument('-s',
                        '--steps',
                        type=int,
                        default=128,
                        help='Steps per epoch')
    parser.add_argument('-x',
                        '--experiment',
                        type=str,
                        help='Name of the experiment')
    parser.add_argument('-w',
                        '--workers',
                        default=0,
                        type=int,
                        help='Num workers')
    parser.add_argument('-r', '--resume', action='store_true')
    parser.add_argument('-mem', '--memory', action='store_true')
    parser.add_argument('-sgdr', action='store_true')

    args = parser.parse_args()
    cudnn.benchmark = True

    # Derive a default experiment name from the main settings.
    if args.experiment is None:
        args.experiment = '%s_%s_%d_%s_%s' % (
            args.dataset, args.model, args.patch_size,
            'gray' if args.grayscale else 'rgb', args.loss)

    experiment_dir = os.path.join('experiments', args.dataset, args.loss,
                                  args.experiment)
    os.makedirs(experiment_dir, exist_ok=True)

    writer = SummaryWriter(comment='_' + args.experiment)

    # Record the command-line arguments used for this run.
    with open(os.path.join(experiment_dir, 'arguments.txt'), 'w') as f:
        f.write(' '.join(sys.argv[1:]))

    model = get_model(args.model,
                      patch_size=args.patch_size,
                      num_channels=1 if args.grayscale else 3)

    # Write model graph
    dummy_input = torch.autograd.Variable(
        torch.rand((args.batch_size, 1 if args.grayscale else 3,
                    args.patch_size, args.patch_size)))
    writer.add_graph(model, dummy_input)

    model = model.cuda()
    loss = get_loss(args.loss).cuda()
    optimizer = get_optimizer(args.optimizer, model.parameters(),
                              args.learning_rate)
    metrics = {
        'iou': JaccardScore().cuda(),
        'accuracy': PixelAccuracy().cuda()
    }

    trainset, validset, num_classes = get_dataset(args.dataset,
                                                  args.data_dir,
                                                  grayscale=args.grayscale,
                                                  patch_size=args.patch_size,
                                                  keep_in_mem=args.memory)
    print('Train set size', len(trainset))
    print('Valid set size', len(validset))
    print('Model         ', args.model)
    print('Parameters    ', count_parameters(model))

    trainloader = DataLoader(trainset,
                             batch_size=args.batch_size,
                             shuffle=True,
                             num_workers=args.workers,
                             pin_memory=True,
                             drop_last=True)
    # Keep the last partial batch so that every validation sample
    # contributes to the validation loss and metrics.
    validloader = DataLoader(validset,
                             batch_size=args.batch_size,
                             shuffle=False,
                             num_workers=args.workers,
                             pin_memory=True,
                             drop_last=False)

    start_epoch = 0
    best_loss = np.inf
    train_history = pd.DataFrame()

    # Checkpoint is train result of epoch with best loss
    checkpoint_filename = os.path.join(experiment_dir,
                                       f'{args.model}_checkpoint.pth')

    # Snapshot is train result of last epoch
    snapshot_filename = os.path.join(experiment_dir,
                                     f'{args.model}_snapshot.pth')

    if args.resume:
        # NOTE(review): resumes from the best-loss checkpoint rather than
        # from the last-epoch snapshot - confirm that this is intended.
        start_epoch, train_history, best_loss = restore_snapshot(
            model, optimizer, checkpoint_filename)
        print('Resuming training from epoch', start_epoch, ' and loss',
              best_loss)
        print(train_history)

    scheduler = None
    if args.sgdr:
        scheduler = CosineAnnealingLR(optimizer, T_max=10, eta_min=1e-8)

    for epoch in range(start_epoch, args.epochs):

        if scheduler is not None:
            scheduler.step(epoch)
            # Read the rates the scheduler has just applied directly from
            # the optimizer; this is valid on every PyTorch version, whereas
            # get_lr() is not reliable when called outside of step().
            lrs = [group['lr'] for group in optimizer.param_groups]
            if len(lrs) > 1:
                # add_scalars builds its tags by string concatenation, so
                # the keys have to be strings.
                writer.add_scalars(
                    'train/lr',
                    {str(index): lr for index, lr in enumerate(lrs)},
                    global_step=epoch)
            else:
                writer.add_scalar('train/lr', lrs[0], global_step=epoch)

        train_loss, train_scores = train(model,
                                         loss,
                                         optimizer,
                                         trainloader,
                                         epoch,
                                         metrics,
                                         summary_writer=writer)
        valid_loss, valid_scores = validate(model,
                                            loss,
                                            validloader,
                                            epoch,
                                            metrics,
                                            summary_writer=writer)

        # One-row summary of this epoch; validation scores get a val_ prefix.
        summary = {
            'epoch': [epoch],
            'loss': [train_loss.avg],
            'val_loss': [valid_loss.avg]
        }

        for key, value in train_scores.items():
            summary[key] = [value.avg]

        for key, value in valid_scores.items():
            summary['val_' + key] = [value.avg]

        # DataFrame.append was removed in pandas 2.0; concat is the
        # equivalent that works across pandas versions.
        train_history = pd.concat(
            [train_history, pd.DataFrame.from_dict(summary)],
            ignore_index=True)

        print(epoch, summary)

        if valid_loss.avg < best_loss:
            save_snapshot(model, optimizer, valid_loss.avg, epoch,
                          train_history, checkpoint_filename)
            best_loss = valid_loss.avg
            print('Checkpoint saved', epoch, best_loss)

        save_snapshot(model, optimizer, valid_loss.avg, epoch, train_history,
                      snapshot_filename)

    print('Training is finished...')

    # NOTE(review): when resuming, the restored history rows are appended to
    # the existing CSV together with the new ones - check for duplicates.
    train_history.to_csv(os.path.join(experiment_dir,
                                      args.experiment + '.csv'),
                         index=False,
                         mode='a' if args.resume else 'w',
                         header=not args.resume)
Ejemplo n.º 5
0
def main():
    """Train an ``Afterburner`` module on top of a frozen, pre-trained head model.

    Parses command-line arguments, builds the train/validation data loaders,
    restores the head model weights from ``linknet34_checkpoint.pth``, freezes
    them, and optimizes only the afterburner parameters. After every epoch the
    train/validation loss and metrics are appended to a history frame, and a
    checkpoint is written whenever the validation loss improves. The history
    is written to ``<experiment>.csv`` inside the experiment directory when
    training finishes.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument('-g',
                        '--grayscale',
                        action='store_true',
                        help='Whether to use grayscale image instead of RGB')
    parser.add_argument('-m',
                        '--model',
                        required=True,
                        type=str,
                        help='Name of the model')
    parser.add_argument('-p', '--patch-size', type=int, default=224)
    parser.add_argument('-b',
                        '--batch-size',
                        type=int,
                        default=1,
                        help='Batch Size during training, e.g. -b 64')
    parser.add_argument('-lr',
                        '--learning-rate',
                        type=float,
                        default=1e-3,
                        help='Initial learning rate')
    parser.add_argument('-l',
                        '--loss',
                        type=str,
                        default='bce',
                        help='Target loss')
    parser.add_argument('-o',
                        '--optimizer',
                        default='SGD',
                        help='Name of the optimizer')
    parser.add_argument('-e',
                        '--epochs',
                        type=int,
                        default=100,
                        help='Epoch to run')
    # The dataset name is used as a path component below, so a missing value
    # would otherwise surface as an opaque TypeError from os.path.join.
    parser.add_argument('-d',
                        '--dataset',
                        type=str,
                        required=True,
                        help='Name of the dataset to use for training.')
    parser.add_argument('-dd',
                        '--data-dir',
                        type=str,
                        default='data',
                        help='Root directory where datasets are located.')
    parser.add_argument('-s',
                        '--steps',
                        type=int,
                        default=128,
                        help='Steps per epoch')
    parser.add_argument('-x',
                        '--experiment',
                        type=str,
                        help='Name of the experiment')
    parser.add_argument('-w',
                        '--workers',
                        default=0,
                        type=int,
                        help='Num workers')
    parser.add_argument('-r', '--resume', action='store_true')
    parser.add_argument('-mem', '--memory', action='store_true')

    args = parser.parse_args()
    cudnn.benchmark = True

    # Derive a descriptive experiment name when none was given explicitly.
    if args.experiment is None:
        args.experiment = 'torch_%s_%s_afterburn_%d_%s_%s' % (
            args.dataset, args.model, args.patch_size,
            'gray' if args.grayscale else 'rgb', args.loss)

    experiment_dir = os.path.join('experiments', args.dataset, args.loss,
                                  args.experiment)
    os.makedirs(experiment_dir, exist_ok=True)

    writer = SummaryWriter(comment=args.experiment)

    # Keep the exact command line next to the results for reproducibility.
    with open(os.path.join(experiment_dir, 'arguments.txt'), 'w') as f:
        f.write(' '.join(sys.argv[1:]))

    trainset, validset, num_classes = TT.get_dataset(
        args.dataset,
        args.data_dir,
        grayscale=args.grayscale,
        patch_size=args.patch_size,
        keep_in_mem=args.memory)
    print('Train set size', len(trainset))
    print('Valid set size', len(validset))

    trainloader = DataLoader(trainset,
                             batch_size=args.batch_size,
                             shuffle=True,
                             num_workers=args.workers,
                             pin_memory=True,
                             drop_last=True)
    # NOTE(review): drop_last=True also discards the final partial validation
    # batch, so some validation samples may never be scored - confirm intent.
    validloader = DataLoader(validset,
                             batch_size=args.batch_size,
                             shuffle=False,
                             num_workers=args.workers,
                             pin_memory=True,
                             drop_last=True)

    head_model = TT.get_model(args.model,
                              patch_size=args.patch_size,
                              num_channels=1 if args.grayscale else 3).cuda()
    TT.restore_snapshot(head_model, None,
                        auto_file('linknet34_checkpoint.pth'))

    # Freeze model training
    for param in head_model.parameters():
        param.requires_grad = False

    # Only the afterburner is handed to the optimizer; the head stays fixed.
    afterburner = Afterburner()
    model = nn.Sequential(head_model, nn.Sigmoid(), afterburner).cuda()
    optimizer = TT.get_optimizer(args.optimizer, afterburner.parameters(),
                                 args.learning_rate)

    loss = TT.get_loss(args.loss).cuda()
    metrics = {
        'iou': JaccardScore().cuda(),
        'accuracy': PixelAccuracy().cuda()
    }

    start_epoch = 0
    best_loss = np.inf
    train_history = pd.DataFrame()

    checkpoint_filename = os.path.join(experiment_dir,
                                       f'{args.model}_checkpoint.pth')
    if args.resume:
        start_epoch, train_history, best_loss = restore_snapshot(
            model, optimizer, checkpoint_filename)
        print('Resuming training from epoch', start_epoch, ' and loss',
              best_loss)
        print(train_history)

    print('Head       :', count_parameters(head_model))
    print('Afterburner:', count_parameters(afterburner))

    for epoch in range(start_epoch, args.epochs):
        train_loss, train_scores = train(model,
                                         loss,
                                         optimizer,
                                         trainloader,
                                         epoch,
                                         metrics,
                                         summary_writer=writer)
        valid_loss, valid_scores = validate(model,
                                            loss,
                                            validloader,
                                            epoch,
                                            metrics,
                                            summary_writer=writer)

        # One-row summary of this epoch; values are wrapped in lists so that
        # DataFrame.from_dict produces a single row.
        summary = {
            'epoch': [epoch],
            'loss': [train_loss.avg],
            'val_loss': [valid_loss.avg]
        }

        for key, value in train_scores.items():
            summary[key] = [value.avg]

        for key, value in valid_scores.items():
            summary['val_' + key] = [value.avg]

        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # supported way to add rows and behaves the same here.
        train_history = pd.concat(
            [train_history, pd.DataFrame.from_dict(summary)],
            ignore_index=True)

        print(epoch, summary)

        # Keep only the best model according to validation loss.
        if valid_loss.avg < best_loss:
            save_snapshot(model, optimizer, valid_loss.avg, epoch,
                          train_history, checkpoint_filename)
            best_loss = valid_loss.avg
            print('Checkpoint saved', epoch, best_loss)

    print('Training is finished...')

    # NOTE(review): on resume the restored history is appended to the existing
    # CSV without a header; if the snapshot already holds the earlier epochs
    # those rows will be duplicated in the file - verify against
    # restore_snapshot.
    train_history.to_csv(os.path.join(experiment_dir,
                                      args.experiment + '.csv'),
                         index=False,
                         mode='a' if args.resume else 'w',
                         header=not args.resume)

    # Flush pending TensorBoard events and release the event file.
    writer.close()
Ejemplo n.º 6
0
    device = torch.device(config.DEVICE)
    print(config.REPLACEMENT)
    print(type(config.REPLACEMENT))
    writer = SummaryWriter(comment="_a2c_" + config.build_name_for_writer())
    saves_path = writer.logdir

    #envs used for sampling tuples of experience
    envs = [
        common.makeCustomizedGridEnv(config) for _ in range(config.NUM_ENVS)
    ]
    #env used to test the avg reward produced by the current best net
    test_env = common.makeCustomizedGridEnv(config)

    net = common.getNet(device, config)
    print(common.count_parameters(net))
    config.A2CNET = str(net)

    #sets seed on torch operations and on all environments
    common.set_seed(seed=config.SEED, envs=envs)
    common.set_seed(seed=config.SEED, envs=[test_env])

    optimizer = optim.Adam(net.parameters(), lr=config.LEARNING_RATE, eps=1e-5)

    trainer = lib.trainer.A2CTrainer(envs, test_env, net, optimizer, device,
                                     config)

    epoch = 0
    total_steps = 0
    best_reward = None
    ts_start = time.time()
Ejemplo n.º 7
0
    def maybe_drop(self, x, p=0.5):
        """Apply dropout with probability ``p`` to ``x`` if dropout is enabled.

        When ``self.use_dropout`` is falsy the input is returned untouched;
        otherwise dropout is applied, active only while ``self.training`` is
        set.
        """
        if not self.use_dropout:
            return x
        return F.dropout(x, p, training=self.training)

    def set_fine_tune(self, fine_tune_enabled):
        """Toggle gradients of ``conv1``..``conv5`` for fine-tuning.

        The parameters of these five layers get ``requires_grad`` set to the
        negation of ``fine_tune_enabled``: enabling fine-tune mode freezes
        them, disabling it makes them trainable again.
        """
        trainable = not fine_tune_enabled
        encoder_stages = (self.conv1, self.conv2, self.conv3, self.conv4,
                          self.conv5)
        for stage in encoder_stages:
            for weight in stage.parameters():
                weight.requires_grad = trainable

    def set_encoder_training_enabled(self, enabled):
        """Enable or disable gradients for ``conv2``..``conv5``.

        ``conv1`` is deliberately left out: the first layer is trainable
        since a 1-channel image is used instead of a 3-channel one.
        """
        flag = bool(enabled)
        for stage in (self.conv2, self.conv3, self.conv4, self.conv5):
            for weight in stage.parameters():
                weight.requires_grad = flag


if __name__ == '__main__':
    # Smoke test: build the network in eval mode, report its parameter count
    # and push one random batch through it to check the output mask size.
    net = TernausNetOC(num_classes=1, num_channels=1).eval()
    print(count_parameters(net))

    batch = {
        'image': torch.rand((4, 1, 128, 128)),
        'depth': torch.rand((4)),
    }
    prediction = net(batch)
    print(prediction['mask'].size())
Ejemplo n.º 8
0
    net.load_state_dict(
        torch.load(config.A2C_FILE_NAME,
                   map_location=lambda storage, loc: storage))
    net = net.to(device)
    config.A2CNET = str(net)

    #    net = common.AtariA2C(envs[0].observation_space.shape, envs[0].action_space.n)
    net_em = models.environment_model.EnvironmentModel(
        envs[0].observation_space.shape, envs[0].action_space.n,
        config).to(device)
    #    net_em.load_state_dict(torch.load("/home/valy/OneDrive/experiments/repl/9_22/Jan19_20-40-19_valy_em_22_9_True/best_1.4249e-06_195121.dat", map_location=lambda storage, loc: storage))
    config.EM_NET = str(net_em)

    print(net)
    print(net_em)
    print("em param count: " + str(common.count_parameters(net_em)))

    # sets seed on torch operations and on all environments
    common.set_seed(seed=config.SEED, envs=envs)

    optimizer = optim.Adam(net_em.parameters(), lr=config.LEARNING_RATE)

    epoch = 0
    best_loss = np.inf
    desc = ""
    pbar = trange(config.EM_STEPS, desc='', leave=True)
    progress = iter(pbar)

    with ptan.common.utils.TBMeanTracker(
            writer, batch_size=config.BATCH_SIZE) as tb_tracker:
        #obtain batch transitions from the a2c model free agent (st, at, st+1, r)
Ejemplo n.º 9
0
            encoder_outs.append(before_pool)

        for i, module in enumerate(self.up_convs):
            before_pool = encoder_outs[-(i + 2)]
            x = module(before_pool, x)

        # No softmax is used. This means you need to use
        # nn.CrossEntropyLoss is your training script,
        # as this module includes a softmax already.
        x = self.drop_final(x)
        x = self.conv_final(x)
        return x

    def set_fine_tune(self, fine_tune_enabled):
        """Accept the fine-tune flag and do nothing (no-op for this model)."""
        return None

    def set_encoder_training_enabled(self, enabled):
        """Accept the encoder-training flag and do nothing (no-op here)."""
        return None


if __name__ == "__main__":
    # Smoke test: run one random single-channel 128x128 image through a
    # depth-5 UNet in eval mode and print the output size and parameter count.
    model = UNet(num_classes=1,
                 num_channels=1,
                 depth=5,
                 merge_mode='concat')
    model = model.eval()
    sample = torch.rand((1, 1, 128, 128))
    print(model(sample).size())
    print(count_parameters(model))
Ejemplo n.º 10
0
def resnext50(**kwargs):
    """Build a ``ResNeXt`` with block counts ``[3, 4, 6, 3]``.

    All keyword arguments are forwarded to the ``ResNeXt`` constructor.
    """
    blocks_per_stage = [3, 4, 6, 3]
    return ResNeXt(blocks_per_stage, **kwargs)


def resnext101(pretrained=True, input_3x3=True, abn_block=ABN, **kwargs):
    """Build a ``ResNeXt`` with block counts ``[3, 4, 23, 3]`` and 1000 classes.

    Args:
        pretrained: When true (and ``input_3x3`` is also true), load weights
            from ``pretrain/resnext101_ipabn_lr_512.pth.tar``.
        input_3x3: Forwarded to ``ResNeXt``. Pretrained weights are loaded
            only when this is true.
        abn_block: Forwarded to ``ResNeXt``.
        **kwargs: Additional keyword arguments forwarded to ``ResNeXt``.

    Returns:
        The constructed model, with pretrained weights loaded when requested
        and applicable.
    """
    model = ResNeXt([3, 4, 23, 3],
                    input_3x3=input_3x3,
                    abn_block=abn_block,
                    classes=1000,
                    **kwargs)
    if pretrained and input_3x3:
        # Load onto the CPU so a checkpoint saved from a GPU can be restored
        # on any host; load_state_dict copies into the model's own tensors.
        checkpoint = torch.load(
            os.path.join('pretrain', 'resnext101_ipabn_lr_512.pth.tar'),
            map_location='cpu')
        state_dict = checkpoint['state_dict']

        prefix = 'module.'
        new_state_dict = OrderedDict()
        for k, v in state_dict.items():
            # Strip the `module.` prefix only where it is actually present,
            # instead of blindly cutting the first seven characters.
            name = k[len(prefix):] if k.startswith(prefix) else k
            new_state_dict[name] = v
        model.load_state_dict(new_state_dict)
    return model


def resnext152(**kwargs):
    """Build a ``ResNeXt`` with block counts ``[3, 8, 36, 3]``.

    All keyword arguments are forwarded to the ``ResNeXt`` constructor.
    """
    blocks_per_stage = [3, 8, 36, 3]
    return ResNeXt(blocks_per_stage, **kwargs)


if __name__ == '__main__':
    # Print the parameter count of each ResNeXt variant, smallest first.
    for build in (resnext50, resnext101, resnext152):
        print(count_parameters(build()))