Esempio n. 1
0
def test_rle_encode_decode():
    """Round-trip thresholded train predictions through a submission file.

    The binarized predictions are written to a gzipped submission CSV,
    decoded again, and both versions are scored against the true masks
    with ``do_kaggle_metric``; the first two returned values must match.
    """
    csv_path = 'test_rle_encode_decode.csv.gz'

    predictions_file = auto_file(
        'Oct10_20_28_dpn_128_medium_wonderful_goldberg_val_lb.pth_train_predictions.npz'
    )
    predictions = np.load(predictions_file)

    train_ids = D.all_train_ids()
    true_masks = D.read_train_masks(train_ids)

    # Stack the per-sample predictions in train_ids order and binarize at 0.45.
    stacked = np.array([predictions[sample_id] for sample_id in train_ids])
    pred_masks = (stacked > 0.45).astype(np.uint8)

    # Encode to a submission file ...
    submission = create_submission(train_ids, pred_masks)
    submission.to_csv(csv_path, compression='gzip', index=False)

    # ... and decode it back, keyed by sample id.
    decoded_ids, decoded_list = decode_submission(csv_path)
    mask_by_id = dict(zip(decoded_ids, decoded_list))
    assert set(decoded_ids) == set(train_ids)

    # Re-order the decoded masks so they line up with train_ids.
    decoded_masks = np.array([mask_by_id[sample_id] for sample_id in train_ids])

    precision_before, result_before, _ = do_kaggle_metric(pred_masks, true_masks)
    precision_after, result_after, _ = do_kaggle_metric(decoded_masks, true_masks)

    assert np.array_equal(precision_before, precision_after)
    assert np.array_equal(result_before, result_after)
    print(np.mean(precision_before), np.mean(precision_after))
Esempio n. 2
0
def test_pixel_acc():
    """Feed stored train predictions and true masks to ``M.PixelAccuracy`` and print its value."""
    predictions_path = 'experiments/Sep14_18_14_ternaus_netv3_naughty_roentgen/Sep14_18_14_ternaus_netv3_naughty_roentgen_best_lb.pth_train_predictions.npz'

    ids = D.all_train_ids()
    y_true = D.read_train_masks(ids)

    # Stack predictions in the same order as the ids / true masks.
    archive = np.load(predictions_path)
    y_pred = np.array([archive[sample_id] for sample_id in ids])

    metric = M.PixelAccuracy()
    metric.update(torch.from_numpy(y_pred), torch.from_numpy(y_true))
    print(metric.value())
Esempio n. 3
0
def test_get_selection_mask():
    """Select the train images flagged as ``vstrips`` and print how many there are."""
    train_ids = D.all_train_ids()
    train_images = D.read_train_images(train_ids)

    # find_problematic_masks returns seven groups; only the fourth (vstrips)
    # is used here.
    _, _, _, vstrips, _, _, _ = D.find_problematic_masks(
        train_ids, few_pixels_threshold=8)

    selection = D.get_selection_mask(train_ids, vstrips)
    print(len(train_images[selection]))
Esempio n. 4
0
def test_kaggle_metric():
    """Score the true masks against themselves and print the mean precision."""
    train_ids = D.all_train_ids()
    ground_truth = D.read_train_masks(train_ids)

    # The true masks are passed as both arguments, with 0.5 as the third one.
    scores = do_kaggle_metric(ground_truth, ground_truth, 0.5)
    precision, _result, _threshold = scores
    print(np.mean(precision))
Esempio n. 5
0
def get_test_dataset(dataset: str, prepare, test_or_train='test'):
    """Build a mask-less ``D.ImageAndMaskDataset`` over the test or train images.

    Args:
        dataset: Dataset flavour name. It does not influence the returned
            dataset; the parameter is kept so existing callers keep working.
        prepare: Passed through as ``prepare_fn`` to the dataset.
        test_or_train: ``'test'`` loads the test ids and images; any other
            value loads the train ids and images.

    Returns:
        A ``D.ImageAndMaskDataset`` built from the ids, images and depths,
        with ``None`` as its third positional argument.
    """
    if test_or_train == 'test':
        ids = D.all_test_ids()
        images = D.read_test_images(ids)
    else:
        ids = D.all_train_ids()
        images = D.read_train_images(ids)

    depths = D.read_depths(ids)

    return D.ImageAndMaskDataset(ids, images, None, depths,
                                 prepare_fn=prepare)
Esempio n. 6
0
def test_fix_masks():
    """Run ``D.fix_masks`` on the train masks and dump before/after images.

    For every changed id, the old and new masks are placed side by side,
    upscaled 2x with nearest-neighbour interpolation and written as a PNG
    into ``test/out/test_fix_masks``.
    """
    train_ids = D.all_train_ids()
    masks = D.read_train_masks(train_ids)
    new_masks, changed_ids = D.fix_masks(masks, train_ids)
    print(len(changed_ids))

    out_dir = 'test/out/test_fix_masks'
    os.makedirs(out_dir, exist_ok=True)

    changed = D.get_selection_mask(train_ids, changed_ids)
    before_after = zip(changed_ids, masks[changed], new_masks[changed])

    for sample_id, before, after in before_after:
        # Old mask on the left, fixed mask on the right.
        side_by_side = np.concatenate((before, after), 1)
        height, width = side_by_side.shape[0], side_by_side.shape[1]
        enlarged = cv2.resize(side_by_side, (width * 2, height * 2),
                              interpolation=cv2.INTER_NEAREST)
        # Multiply by 255 before writing the image.
        cv2.imwrite(os.path.join(out_dir, f'{sample_id}.png'), enlarged * 255)
Esempio n. 7
0
def extract_oof_predictions(model) -> dict:
    """Extract the out-of-fold (validation) predictions of a trained model.

    The experiment directory is taken from the location of the model's test
    predictions file. The first ``.json`` file in that directory (sorted by
    name) is read as the training config, and its ``stratify`` and ``fold``
    entries are used to recompute the train/validation split. The train
    predictions of the validation ids are saved as
    ``{model}_oof_predictions.npz`` in the experiment directory.

    Args:
        model: Prefix of the prediction files; ``_test_predictions.npz`` and
            ``_train_predictions.npz`` are appended to it.

    Returns:
        Dict mapping each validation id to its prediction.

    Raises:
        IndexError: If the experiment directory contains no ``.json`` file.
    """
    test_predictions = auto_file(f'{model}_test_predictions.npz')
    train_predictions = auto_file(f'{model}_train_predictions.npz')

    experiment_dir = os.path.dirname(test_predictions)

    json_files = [fname for fname in sorted(os.listdir(experiment_dir))
                  if os.path.splitext(fname)[1] == '.json']
    json_config = auto_file(json_files[0])

    # Close the config file deterministically instead of leaking the handle.
    with open(json_config) as config_file:
        config = json.load(config_file)
    stratify = config['stratify']
    fold = config['fold']

    train_ids = D.all_train_ids()
    train_indexes, test_indexes = D.get_train_test_split_for_fold(stratify, fold, train_ids)
    valid_ids = train_ids[test_indexes]

    # The archive is only needed while the validation arrays are read out.
    with np.load(train_predictions) as archive:
        valid_predictions = np.array([archive[sample_id] for sample_id in valid_ids])
    oof_predictions = dict(zip(valid_ids, valid_predictions))

    np.savez_compressed(os.path.join(experiment_dir, f'{model}_oof_predictions.npz'), **oof_predictions)
    return oof_predictions
Esempio n. 8
0
def main():
    """Print mean/std statistics of the train and test images.

    Statistics are printed via ``compute_mean_std`` on the image folders and
    then recomputed with NumPy on the loaded images scaled by 1/255. Finally
    the train images are standardized in place and their mean/std printed.
    """
    scale = float(1. / 255.)
    train_dir = 'data/train/images'
    test_dir = 'data/test/images'

    # File-based statistics: train, test, then both together.
    print(compute_mean_std(find_in_dir(train_dir)))
    print(compute_mean_std(find_in_dir(test_dir)))
    print(compute_mean_std(find_in_dir(train_dir) + (find_in_dir(test_dir))))

    # Array-based statistics on the scaled images.
    train = scale * D.read_train_images(D.all_train_ids())
    test = scale * D.read_test_images(D.all_test_ids())
    print(train.mean(), train.std())
    print(test.mean(), test.std())
    combined = np.concatenate([train, test], axis=0)
    print(combined.mean(), combined.std())

    # Standardize the train images in place with their own mean and std.
    train_mean, train_std = train.mean(), train.std()
    train -= train_mean
    train /= train_std
    print(train.mean(), train.std())
Esempio n. 9
0
def make_cv_submit(inputs, prefix, output_dir='submits'):
    """Create ensemble submissions with a threshold mined on OOF predictions.

    Args:
        inputs: Model prefixes; ``_test_predictions.npz`` and
            ``_oof_predictions.npz`` are appended to locate the files.
        prefix: Prefix for the names of the written submission files.
        output_dir: Directory the submissions are written to (created if
            it does not exist).

    Two gzipped CSV files are written: the arithmetic-mean ensemble
    binarized at the mined threshold, and the same masks after
    ``morphology_postprocess``.
    """
    os.makedirs(output_dir, exist_ok=True)

    test_predictions = [auto_file(f'{model}_test_predictions.npz') for model in inputs]
    oof_predictions = [auto_file(f'{model}_oof_predictions.npz') for model in inputs]

    train_ids = D.all_train_ids()
    true_masks = D.read_train_masks(train_ids)
    test_ids = D.all_test_ids()

    # Mine the threshold on the merged out-of-fold predictions.
    merged_oof = merge_oof(oof_predictions, train_ids)
    thresholds, scores = threshold_mining(merged_oof, true_masks, min_threshold=0.1, max_threshold=0.9, step=0.001)

    best = np.argmax(scores)
    threshold = float(thresholds[best])
    lb_score = float(scores[best])
    print('Threshold', threshold, 'CV score', lb_score)

    # Arithmetic-mean ensemble of the test predictions, binarized.
    averaged = ensemble(test_predictions, test_ids, averaging=ArithmeticMean)
    binary_masks = averaged > threshold

    submit_file = f'{prefix}_a_mean_CV_{lb_score:.4f}_TH{threshold:.4f}.csv.gz'
    submit_path = os.path.join(output_dir, submit_file)
    create_submission(test_ids, binary_masks).to_csv(submit_path, compression='gzip', index=False)
    print('Saved submission', submit_file)

    postprocess = morphology_postprocess
    if postprocess is None:
        return

    # Post-process each mask together with its source image.
    test_images = D.read_test_images(test_ids)
    cleaned_masks = np.array([
        postprocess(image, mask)
        for image, mask in zip(test_images, binary_masks)
    ])

    submit_file = f'{prefix}_a_mean_PPC_CV_{lb_score:.4f}_TH{threshold:.4f}.csv.gz'
    submit_path = os.path.join(output_dir, submit_file)
    create_submission(test_ids, cleaned_masks).to_csv(submit_path, compression='gzip', index=False)
    print('Saved submission', submit_file)
Esempio n. 10
0
def test_folds_coverage():
    """Plot coverage histograms of the train/validation parts of each fold.

    Coverage is the number of non-zero pixels in each train mask. The fold
    assignment comes from ``D.get_folds_vector('coverage', ...)`` with 10
    folds; the left axis shows the training part of every fold and the right
    axis the validation part.
    """
    train_ids = D.all_train_ids()
    depths = D.read_depths(train_ids)
    images = D.read_train_images(train_ids)
    masks = D.read_train_masks(train_ids)

    n_folds = 10
    # Builtin ``int`` replaces the ``np.int`` alias, which was removed from
    # NumPy; the resulting dtype is the same.
    coverage = np.array([cv2.countNonZero(x) for x in masks], dtype=int)
    folds_d = D.get_folds_vector('coverage',
                                 images,
                                 masks,
                                 depths,
                                 n_folds=n_folds)

    f, ax = plt.subplots(1, 2)

    for fold in range(n_folds):
        train = coverage[folds_d != fold]
        val = coverage[folds_d == fold]

        ax[0].hist(train, label=f'Fold {fold}')
        ax[1].hist(val, label=f'Fold {fold}')

    f.show()
Esempio n. 11
0
def main():
    """Run a full training session and generate submissions for its snapshots.

    Steps, in order:
      1. Parse command-line arguments and seed the RNGs.
      2. Build the session prefix and create the experiment directory.
      3. Load train ids, depths, images and masks (optionally fixing masks).
      4. Split into train/validation parts and drop some training samples.
      5. Build datasets, data loaders and the model (moved to CUDA).
      6. Train in three phases: warmup, main, and five annealing rounds.
      7. Save the training history and generate a submission per snapshot.

    Raises:
        RuntimeError: If the train or validation ids are not sorted.
        ValueError: If both ``fine_tune`` and ``freeze_encoder > 0`` are set.
    """
    parser = U.get_argparser()
    args = parser.parse_args()
    U.set_manual_seed(args.seed)

    # vars() returns the namespace's own dict; it is later extended with the
    # train/valid split and dumped to JSON as the session config.
    train_session_args = vars(args)
    train_session = U.get_random_name()
    current_time = datetime.now().strftime('%b%d_%H_%M')
    prefix = f'{current_time}_{args.model}_{args.prepare}_{args.augmentation}_{train_session}'
    if args.fold is not None:
        prefix += f'_fold_{args.stratify}_{args.fold}'

    log_dir = os.path.join('runs', prefix)
    exp_dir = os.path.join('experiments', args.model, args.prepare,
                           args.augmentation, prefix)
    os.makedirs(exp_dir, exist_ok=True)

    train_ids = D.all_train_ids()
    depths = D.read_depths(train_ids)
    images = D.read_train_images(train_ids)
    masks = D.read_train_masks(train_ids)

    if args.fix_masks:
        # Replace the masks with the fixed ones and record which ids changed.
        masks, changed_ids = D.fix_masks(masks, train_ids)
        with open(os.path.join(exp_dir, 'fixed_masks.txt'), 'w') as f:
            for sample_id in changed_ids:
                f.write(sample_id)
                f.write('\n')
        print(f'Fixed {len(changed_ids)} masks')

    if args.fold is not None:
        train_indexes, test_indexes = D.get_train_test_split_for_fold(
            args.stratify, args.fold, train_ids)
    else:
        # No fold given: hold out 20% of the samples without shuffling.
        # NOTE(review): with shuffle=False, random_state presumably has no
        # effect on the split — confirm against the train_test_split docs.
        train_indexes, test_indexes = train_test_split(
            np.arange(len(train_ids)),
            shuffle=False,
            random_state=args.split_seed,
            test_size=0.2)

    ids_train, ids_test = train_ids[train_indexes], train_ids[test_indexes]
    img_train, img_test = images[train_indexes], images[test_indexes]
    mask_train, mask_test = masks[train_indexes], masks[test_indexes]
    depth_train, depth_test = depths[train_indexes], depths[test_indexes]

    # Here we can exclude some images from training, but keep in validation
    train_mask = D.drop_some(img_train,
                             mask_train,
                             drop_black=True,
                             drop_vstrips=args.drop_vstrips,
                             drop_few=args.drop_few)
    ids_train = ids_train[train_mask]
    img_train = img_train[train_mask]
    mask_train = mask_train[train_mask]
    depth_train = depth_train[train_mask]

    # Both id arrays are required to be sorted; abort otherwise.
    if not is_sorted(ids_train):
        raise RuntimeError("ids_train is not sorted")
    if not is_sorted(ids_test):
        raise RuntimeError("ids_test_sorted is not sorted")

    prepare_fn = D.get_prepare_fn(args.prepare, **train_session_args)

    # This line valid if we apply prepare_fn first and then do augmentation
    target_size = prepare_fn.target_size if prepare_fn is not None else D.ORIGINAL_SIZE
    # target_size = D.ORIGINAL_SIZE

    build_augmentation_fn = D.AUGMENTATION_MODES[args.augmentation]
    aug = build_augmentation_fn(target_size, border_mode=args.border_mode)

    # prepare_fn (if any) is applied to both sets; the augmentation is only
    # appended to the training transforms.
    train_transform_list = []
    valid_transform_list = []
    if prepare_fn is not None:
        train_transform_list.append(prepare_fn.t_forward)
        valid_transform_list.append(prepare_fn.t_forward)

    train_transform_list.append(aug)

    trainset = D.ImageAndMaskDataset(ids_train,
                                     img_train,
                                     mask_train,
                                     depth_train,
                                     augment=A.Compose(train_transform_list))

    validset = D.ImageAndMaskDataset(ids_test,
                                     img_test,
                                     mask_test,
                                     depth_test,
                                     augment=A.Compose(valid_transform_list))

    trainloader = DataLoader(trainset,
                             batch_size=args.batch_size,
                             num_workers=args.workers,
                             pin_memory=True,
                             drop_last=True,
                             shuffle=True)

    # Unlike trainloader, no num_workers is passed for validation, and the
    # last incomplete batch is kept.
    validloader = DataLoader(validset,
                             batch_size=args.batch_size,
                             pin_memory=True,
                             drop_last=False,
                             shuffle=False)

    # Save train/val split for future use
    train_session_args.update({
        'train_set': list(ids_train),
        'valid_set': list(ids_test)
    })

    # Declare variables we will use during training
    start_epoch = 0
    train_history = pd.DataFrame()

    target_metric = args.target_metric
    target_metric_mode = 'max'
    best_metric_val = 0
    best_lb_checkpoint = os.path.join(exp_dir, f'{prefix}_{target_metric}.pth')

    model = U.get_model(args.model,
                        num_classes=args.num_classes,
                        num_channels=trainset.channels(),
                        abn=args.abn,
                        use_dropout=not args.no_dropout,
                        pretrained=not args.no_pretrain).cuda()

    # Print a summary of the session configuration.
    print('Train set size :', len(ids_train), 'batch size',
          trainloader.batch_size)
    print('Valid set size :', len(ids_test), 'batch size',
          validloader.batch_size)
    print('Tile transform :', prepare_fn if prepare_fn is not None else "None")
    print('Model          :', args.model, count_parameters(model))
    print('Augmentations  :', args.augmentation, args.border_mode)
    print('Input channels :', trainset.channels())
    print('Output classes :', args.num_classes)
    print('Optimizer      :', args.optimizer, 'wd', args.weight_decay)
    print('Use of dropout :', not args.no_dropout)
    print('Train session  :', train_session)
    print('Freeze encoder :', args.freeze_encoder)
    print('Seed           :', args.seed, args.split_seed)
    print('Restart every  :', args.restart_every)
    print('Fold           :', args.fold, args.stratify)
    print('Fine-tune      :', args.fine_tune)
    print('ABN Mode       :', args.abn)
    print('Fix masks      :', args.fix_masks)

    if args.resume:
        # NOTE(review): best_score restored here is only printed;
        # best_metric_val keeps its initial value of 0 — confirm intended.
        fname = U.auto_file(args.resume)
        start_epoch, train_history, best_score = U.restore_checkpoint(
            fname, model)
        print(train_history)
        print('Resuming training from epoch', start_epoch, ' and score',
              best_score, args.resume)

    # NOTE(review): '--fune-tune' in the message below looks like a typo for
    # the fine-tune option — confirm the actual flag name before changing it.
    if args.fine_tune and args.freeze_encoder > 0:
        raise ValueError(
            'Incompatible options --fune-tune and --freeze-encoder')

    writer = SummaryWriter(log_dir)
    writer.add_text('train/params',
                    '```' + json.dumps(train_session_args, indent=2) + '```',
                    0)

    # Persist the session config (including the train/valid split) as JSON.
    config_fname = os.path.join(exp_dir, f'{train_session}.json')
    with open(config_fname, 'w') as f:
        f.write(json.dumps(train_session_args, indent=2))

    # Weights keyed by 'mask', 'class' and 'dsv'; passed to train() together
    # with the per-phase losses that use the same keys.
    weights = {
        'mask': 1.0,
        'class': 0.05,
        'dsv': 0.1,
    }

    bce = U.get_loss('bce')
    bce_lovasz = U.get_loss('bce_lovasz')
    bce_jaccard = U.get_loss('bce_jaccard')

    # Only the 'mask' loss differs between the phases; 'class' and 'dsv'
    # always use bce.
    losses = {
        'warmup': {
            'mask': bce,
            'class': bce,
            'dsv': bce,
        },
        'main': {
            'mask': bce_jaccard,
            'class': bce,
            'dsv': bce,
        },
        'annealing': {
            'mask': bce_lovasz,
            'class': bce,
            'dsv': bce,
        }
    }

    # Number of epochs per phase.
    epochs = {'warmup': 50, 'main': 250, 'annealing': 50}

    if args.fast:
        # Fast mode: run a single epoch in every phase.
        for key in epochs.keys():
            epochs[key] = 1

    learning_rates = {
        'warmup': args.learning_rate,
        'main': 1e-3,
        'annealing': 1e-2
    }

    # Warmup phase
    if epochs['warmup']:
        print(torch.cuda.max_memory_allocated(),
              torch.cuda.max_memory_cached())
        # Optimize only the parameters that currently require gradients.
        trainable_parameters = filter(lambda p: p.requires_grad,
                                      model.parameters())
        optimizer = U.get_optimizer(args.optimizer,
                                    trainable_parameters,
                                    learning_rates['warmup'],
                                    weight_decay=args.weight_decay)
        scheduler = None  # StepLR(optimizer, gamma=0.5, step_size=50)

        train_history, best_metric_val, start_epoch = train(
            model,
            losses['warmup'],
            weights,
            optimizer,
            scheduler,
            trainloader,
            validloader,
            writer,
            start_epoch,
            epochs=epochs['warmup'],
            early_stopping=args.early_stopping,
            train_history=train_history,
            experiment_dir=exp_dir,
            target_metric=target_metric,
            best_metric_val=best_metric_val,
            target_metric_mode=target_metric_mode,
            checkpoint_filename=best_lb_checkpoint)
        U.save_checkpoint(os.path.join(exp_dir, f'{prefix}_warmup.pth'),
                          model,
                          start_epoch,
                          train_history,
                          metric_name=target_metric,
                          metric_score=best_metric_val)

        # Drop the optimizer state and release cached GPU memory before the
        # next phase builds a new optimizer.
        del trainable_parameters, optimizer, scheduler
        torch.cuda.empty_cache()
        torch.cuda.synchronize()

        print('Finished warmup phase. Main train loop.')

    # Main training phase
    print(torch.cuda.max_memory_allocated(), torch.cuda.max_memory_cached())
    trainable_parameters = filter(lambda p: p.requires_grad,
                                  model.parameters())
    optimizer = U.get_optimizer(args.optimizer,
                                trainable_parameters,
                                learning_rates['main'],
                                weight_decay=args.weight_decay)
    # Halve the learning rate after 50 epochs without improvement of the
    # monitored (maximized) value, down to 1e-5.
    scheduler = ReduceLROnPlateau(optimizer,
                                  mode='max',
                                  patience=50,
                                  factor=0.5,
                                  min_lr=1e-5)

    train_history, best_metric_val, start_epoch = train(
        model,
        losses['main'],
        weights,
        optimizer,
        scheduler,
        trainloader,
        validloader,
        writer,
        start_epoch,
        epochs=epochs['main'],
        early_stopping=args.early_stopping,
        train_history=train_history,
        experiment_dir=exp_dir,
        target_metric=target_metric,
        best_metric_val=best_metric_val,
        target_metric_mode=target_metric_mode,
        checkpoint_filename=best_lb_checkpoint)
    del trainable_parameters, optimizer, scheduler
    torch.cuda.empty_cache()
    torch.cuda.synchronize()
    # The best checkpoint of the warmup/main phases is the first snapshot;
    # each annealing round below appends its own.
    snapshots = [best_lb_checkpoint]

    U.save_checkpoint(os.path.join(exp_dir, f'{prefix}_main.pth'),
                      model,
                      start_epoch,
                      train_history,
                      metric_name=target_metric,
                      metric_score=best_metric_val)

    print('Finished train phase.')

    # Cosine annealing
    if epochs['annealing']:

        # Five annealing rounds, each with a fresh SGD optimizer and cosine
        # schedule, and each writing to its own snapshot file.
        for snapshot in range(5):
            print(f'Starting annealing phase {snapshot}')
            print(torch.cuda.max_memory_allocated(),
                  torch.cuda.max_memory_cached())
            # model.set_fine_tune(True)
            trainable_parameters = filter(lambda p: p.requires_grad,
                                          model.parameters())
            optimizer = U.get_optimizer('sgd',
                                        trainable_parameters,
                                        learning_rates['annealing'],
                                        weight_decay=args.weight_decay)
            scheduler = CosineAnnealingLR(optimizer,
                                          epochs['annealing'],
                                          eta_min=1e-7)

            snapshot_name = os.path.join(
                exp_dir, f'{prefix}_{target_metric}_snapshot_{snapshot}.pth')
            snapshots.append(snapshot_name)
            # best_metric_val=0 is passed so every round starts its
            # best-metric tracking from zero.
            train_history, best_metric_val, start_epoch = train(
                model,
                losses['annealing'],
                weights,
                optimizer,
                scheduler,
                trainloader,
                validloader,
                writer,
                start_epoch,
                epochs=epochs['annealing'],
                early_stopping=args.early_stopping,
                train_history=train_history,
                experiment_dir=exp_dir,
                target_metric=target_metric,
                best_metric_val=0,
                target_metric_mode=target_metric_mode,
                checkpoint_filename=snapshot_name)
            del trainable_parameters, optimizer, scheduler
            torch.cuda.empty_cache()
            torch.cuda.synchronize()

    print('Training finished')
    train_history.to_csv(os.path.join(exp_dir, 'train_history.csv'),
                         index=False)

    # Generate a submission for every collected snapshot.
    for snapshot_file in snapshots:
        generate_model_submission(snapshot_file,
                                  config_fname,
                                  mine_on_val=True)
Esempio n. 12
0
def test_map():
    """Run ``M.threshold_mining`` on the true masks against themselves and print the result."""
    ids = D.all_train_ids()
    ground_truth = D.read_train_masks(ids)
    mined = M.threshold_mining(ground_truth, ground_truth)
    print(mined)
def test_inspect_train_predictions():
    """Inspect stored train predictions of one model with a few plots.

    The test:
      * converts the stored test and train predictions to images,
      * mines the threshold on all train predictions and plots the curve,
      * mines it again on the validation subset listed under ``valid_set``
        in the session config and plots that curve,
      * scores all train predictions at the validation-mined threshold and
        scatter-plots the first value returned by ``do_kaggle_metric``
        against the number of non-zero pixels of each true mask.
    """
    train_ids = D.all_train_ids()
    train_images = D.read_train_images(train_ids)
    train_masks = D.read_train_masks(train_ids)
    print(train_ids.shape, train_images.shape, train_masks.shape)

    CONFIG = auto_file('wonderful_goldberg.json')
    WEIGHT_TRAIN = auto_file(
        'Oct10_20_28_dpn_128_medium_wonderful_goldberg_val_lb.pth_train_predictions.npz'
    )
    WEIGHT_TEST = auto_file(
        'Oct10_20_28_dpn_128_medium_wonderful_goldberg_val_lb.pth_test_predictions.npz'
    )

    convert_predictions_to_images(WEIGHT_TEST,
                                  os.path.join('test', 'test_predictions'))
    convert_predictions_to_images(WEIGHT_TRAIN,
                                  os.path.join('test', 'train_predictions'))

    train_predictions = auto_file(WEIGHT_TRAIN)
    train_predictions = np.load(train_predictions)

    # Stack the per-sample predictions in train_ids order.
    train_predictions = np.array(
        [train_predictions[sample_id] for sample_id in train_ids])
    print(train_predictions.shape)

    # Threshold curve over all train samples.
    threshold, lb_score = threshold_mining(train_predictions,
                                           train_masks,
                                           min_threshold=0.15,
                                           max_threshold=0.85,
                                           step=0.005)

    plt.figure()
    plt.plot(threshold, lb_score)
    plt.tight_layout()

    i = np.argmax(lb_score)
    best_threshold, best_lb_score = float(threshold[i]), float(lb_score[i])
    print(best_threshold, best_lb_score)

    config_file = auto_file(CONFIG)

    # Close the config file deterministically instead of leaking the handle.
    with open(config_file) as f:
        config = json.load(f)

    # Threshold curve restricted to the validation samples of the session.
    valid_ids = np.array(config['valid_set'])
    valid_mask = D.get_selection_mask(train_ids, valid_ids)
    val_threshold, val_lb_score = threshold_mining(
        train_predictions[valid_mask],
        train_masks[valid_mask],
        min_threshold=0.15,
        max_threshold=0.85,
        step=0.005)

    plt.figure()
    plt.plot(val_threshold, val_lb_score)
    plt.tight_layout()
    plt.show()

    val_i = np.argmax(val_lb_score)
    val_th = val_threshold[val_i]
    print(val_threshold[val_i], val_lb_score[val_i])

    # Score every train sample at the validation-mined threshold.
    precision, _, _ = do_kaggle_metric(train_predictions,
                                       train_masks, val_th)

    # zip() stops at the shorter input, so cap the pixel counts at the number
    # of precision values, exactly as the paired loop did.
    x = list(precision)
    y = [cv2.countNonZero(true_mask)
         for _, true_mask in zip(precision, train_masks)]

    plt.figure()
    plt.scatter(x, y)
    plt.tight_layout()
    plt.show()