def main():
    parser = argparse.ArgumentParser()
    arg = parser.add_argument
    arg('root', help='checkpoint root')
    arg('--batch-size', type=int, default=32)
    arg('--patch-size', type=int, default=256)
    arg('--n-epochs', type=int, default=100)
    arg('--lr', type=float, default=0.0001)
    arg('--workers', type=int, default=2)
    arg('--fold', type=int, default=1)
    arg('--n-folds', type=int, default=5)
    arg('--stratified', action='store_true')
    arg('--mode',
        choices=[
            'train', 'validation', 'predict_valid', 'predict_test',
            'predict_all_valid'
        ],
        default='train')
    arg('--clean', action='store_true')
    arg('--epoch-size', type=int)
    arg('--limit', type=int, help='Use only N images for train/valid')
    arg('--min-scale', type=float, default=1)
    arg('--max-scale', type=float, default=1)
    arg('--test-scale', type=float, default=0.5)
    args = parser.parse_args()

    coords = utils.load_coords()
    train_paths, valid_paths = utils.train_valid_split(args, coords)
    root = Path(args.root)
    model = SSPD()
    model = utils.cuda(model)
    criterion = SSPDLoss()

    if args.mode == 'train':
        kwargs = dict(min_scale=args.min_scale, max_scale=args.max_scale)
        train_loader = utils.make_loader(
            PointDataset, args, train_paths, coords, **kwargs)
        valid_loader = utils.make_loader(
            PointDataset, args, valid_paths, coords,
            deterministic=True, **kwargs)
        if root.exists() and args.clean:
            shutil.rmtree(str(root))
        root.mkdir(exist_ok=True)
        root.joinpath('params.json').write_text(
            json.dumps(vars(args), indent=True, sort_keys=True))
        utils.train(args,
                    model,
                    criterion,
                    train_loader=train_loader,
                    valid_loader=valid_loader,
                    save_predictions=save_predictions)
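A note on the params.json dump above: it records every hyperparameter of the run inside the checkpoint directory. Below is a minimal sketch (not part of the original example) of reading those values back; the directory name runs/sspd-fold1 is a hypothetical placeholder.

# Minimal sketch: read back the hyperparameters that main() writes to
# <root>/params.json; the checkpoint directory name is hypothetical.
import json
from pathlib import Path

root = Path('runs/sspd-fold1')
params = json.loads(root.joinpath('params.json').read_text())
print(params['lr'], params['fold'], params['mode'])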
Example #2
def load_all_features(root: Path, only_valid: bool, args) -> Dict[str, np.ndarray]:
    features_path = root.joinpath('features.npz')  # type: Path
    coords = utils.load_coords()
    pred_paths = list(root.glob('*-pred.npy'))
    get_id = lambda p: int(p.name.split('-')[0])
    if only_valid:
        valid_ids = {int(p.stem) for p in utils.labeled_paths()}
        pred_paths = [p for p in pred_paths if get_id(p) in valid_ids]
    if args.limit:
        pred_paths = pred_paths[:args.limit]
    if not args.new_features and features_path.exists():
        print('Loading features...')
        data = dict(np.load(str(features_path)))
        clf_features_path = root.joinpath('clf_features.npz')
        if clf_features_path.exists():
            clf_features = np.load(str(clf_features_path))['xs']
            data['xs'] = np.concatenate([data['xs'], clf_features], axis=2)
            for i in range(clf_features.shape[2]):
                feature_name = 'clf-{}'.format(i)
                ALL_FEATURE_NAMES.append(feature_name)
                FEATURE_NAMES.append(feature_name)
        print('done.')
        ids = [get_id(p) for p in pred_paths]
        assert set(ids) == set(data['ids'][0])
        return data
    print('{} total'.format(len(pred_paths)))
    data = {k: [[] for _ in range(utils.N_CLASSES)]
            for k in ['ids', 'scales', 'xs', 'ys']}
    blob_data = {k: [[] for _ in range(utils.N_CLASSES)]
                 for k in ['blobs', 'blob_ids']}
    with multiprocessing.pool.Pool(processes=24) as pool:
        for id, scale, xs, ys, blobs, blob_ids in tqdm.tqdm(
                pool.imap(partial(load_xs_ys, coords=coords), pred_paths, chunksize=2),
                total=len(pred_paths)):
            for cls in range(utils.N_CLASSES):
                data['ids'][cls].extend([id] * len(ys[cls]))
                data['scales'][cls].extend([scale] * len(ys[cls]))
                data['xs'][cls].extend(xs[cls])
                data['ys'][cls].extend(ys[cls])
                blob_data['blobs'][cls].append((id, scale, blobs[cls]))
                blob_data['blob_ids'][cls].extend(blob_ids[cls])
    data = {k: np.array(v, dtype=np.int32 if k in {'ids', 'ys'} else np.float32)
            for k, v in data.items()}
    with features_path.open('wb') as f:
        np.savez(f, **data)
    with root.joinpath('blobs.pkl').open('wb') as f:
        pickle.dump(blob_data, f)
    return data
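The cached artifacts written at the end of load_all_features can be inspected on their own. A minimal sketch, assuming a run directory that already contains the features.npz and blobs.pkl files produced above (the directory name is hypothetical):

# Minimal sketch: load the arrays and blob metadata cached by load_all_features.
import pickle
import numpy as np
from pathlib import Path

root = Path('runs/unet-fold1')  # hypothetical checkpoint root
data = dict(np.load(str(root / 'features.npz')))
print(sorted(data))  # ['ids', 'scales', 'xs', 'ys'], one row per class
with (root / 'blobs.pkl').open('rb') as f:
    blob_data = pickle.load(f)  # per-class lists under 'blobs' and 'blob_ids'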
Example #3
def __init__(self, fundus_dir, vessel_dir, od_label_path, fovea_label_path,
             batch_size):
    self.fundus, self.vessel, self.coords = utils.load_coords(
        fundus_dir, vessel_dir, od_label_path, fovea_label_path)
    self.is_train = False
    super(ValidationBatchFetcher, self).__init__(batch_size)
Example #4
def main():
    parser = argparse.ArgumentParser()
    arg = parser.add_argument
    arg('root', help='checkpoint root')
    arg('--batch-size', type=int, default=32)
    arg('--patch-size', type=int, default=256)
    arg('--n-epochs', type=int, default=100)
    arg('--lr', type=float, default=0.0001)
    arg('--workers', type=int, default=2)
    arg('--fold', type=int, default=1)
    arg('--bg-weight', type=float, default=1.0, help='background weight')
    arg('--dice-weight', type=float, default=0.0)
    arg('--n-folds', type=int, default=5)
    arg('--stratified', action='store_true')
    arg('--mode',
        choices=[
            'train', 'valid', 'predict_valid', 'predict_test',
            'predict_all_valid'
        ],
        default='train')
    arg('--model-path',
        help='path to model file to use for validation/prediction')
    arg('--clean', action='store_true')
    arg('--epoch-size', type=int)
    arg('--limit', type=int, help='Use only N images for train/valid')
    arg('--min-scale', type=float, default=1)
    arg('--max-scale', type=float, default=1)
    arg('--test-scale', type=float, default=0.5)
    arg('--oversample',
        type=float,
        default=0.0,
        help='sample near lion with given probability')
    arg('--with-head', action='store_true')
    arg('--pred-oddity',
        type=int,
        help='set to 0/1 to predict even/odd images')
    args = parser.parse_args()

    coords = utils.load_coords()
    train_paths, valid_paths = utils.train_valid_split(args)
    root = Path(args.root)
    model = UNetWithHead() if args.with_head else UNet()
    model = utils.cuda(model)
    criterion = Loss(dice_weight=args.dice_weight, bg_weight=args.bg_weight)
    loader_kwargs = dict(
        min_scale=args.min_scale,
        max_scale=args.max_scale,
        downscale=args.with_head,
    )
    if args.mode == 'train':
        train_loader = utils.make_loader(
            SegmentationDataset, args, train_paths, coords,
            oversample=args.oversample, **loader_kwargs)
        valid_loader = utils.make_loader(
            SegmentationDataset, args, valid_paths, coords,
            deterministic=True, **loader_kwargs)
        if root.exists() and args.clean:
            shutil.rmtree(str(root))  # remove dir tree
        root.mkdir(exist_ok=True)
        root.joinpath('params.json').write_text(
            json.dumps(vars(args), indent=True, sort_keys=True))
        utils.train(args,
                    model,
                    criterion,
                    train_loader=train_loader,
                    valid_loader=valid_loader,
                    save_predictions=save_predictions)
    elif args.mode == 'valid':
        utils.load_best_model(model, root, args.model_path)
        valid_loader = utils.make_loader(SegmentationDataset,
                                         args,
                                         valid_paths,
                                         coords,
                                         deterministic=True,
                                         **loader_kwargs)
        utils.validation(model, criterion,
                         tqdm.tqdm(valid_loader, desc='Validation'))
    else:
        utils.load_best_model(model, root, args.model_path)
        if args.mode in {'predict_valid', 'predict_all_valid'}:
            if args.mode == 'predict_all_valid':
                # include all paths we did not train on (makes sense only with --limit)
                valid_paths = list(
                    set(valid_paths)
                    | (set(utils.labeled_paths()) - set(train_paths)))
            predict(model,
                    valid_paths,
                    out_path=root,
                    patch_size=args.patch_size,
                    batch_size=args.batch_size,
                    min_scale=args.min_scale,
                    max_scale=args.max_scale,
                    downsampled=args.with_head)
        elif args.mode == 'predict_test':
            out_path = root.joinpath('test')
            out_path.mkdir(exist_ok=True)
            predicted = {p.stem.split('-')[0] for p in out_path.glob('*.npy')}
            test_paths = [
                p for p in utils.DATA_ROOT.joinpath('Test').glob('*.png')
                if p.stem not in predicted
            ]
            if args.pred_oddity is not None:
                assert args.pred_oddity in {0, 1}
                test_paths = [
                    p for p in test_paths
                    if int(p.stem) % 2 == args.pred_oddity
                ]
            predict(model,
                    test_paths,
                    out_path,
                    patch_size=args.patch_size,
                    batch_size=args.batch_size,
                    test_scale=args.test_scale,
                    is_test=True,
                    downsampled=args.with_head)
        else:
            parser.error('Unexpected mode {}'.format(args.mode))
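The --pred-oddity flag above splits test-time prediction across two runs by the parity of the numeric file stem. A self-contained sketch of that filter, with hypothetical file names:

# Minimal sketch of the --pred-oddity filter: keep only images whose numeric
# stem matches the requested parity (0 = even, 1 = odd).
from pathlib import Path

test_paths = [Path('Test/10.png'), Path('Test/11.png'), Path('Test/12.png')]
pred_oddity = 1  # as set by --pred-oddity 1
odd_paths = [p for p in test_paths if int(p.stem) % 2 == pred_oddity]
print(odd_paths)  # only Test/11.png remains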
Example #5
def main():
    parser = argparse.ArgumentParser()
    arg = parser.add_argument
    arg('root', help='checkpoint root')
    arg('out_path', help='path to UNet features', type=Path)
    arg('--batch-size', type=int, default=32)
    arg('--patch-size', type=int, default=160)
    arg('--offset', type=int, default=6)
    arg('--n-epochs', type=int, default=100)
    arg('--lr', type=float, default=0.0001)
    arg('--workers', type=int, default=2)
    arg('--fold', type=int, default=1)
    arg('--n-folds', type=int, default=5)
    arg('--stratified', action='store_true')
    arg('--mode',
        choices=[
            'train', 'valid', 'predict_valid', 'predict_test',
            'predict_all_valid'
        ],
        default='train')
    arg('--model-path',
        help='path to model file to use for validation/prediction')
    arg('--clean', action='store_true')
    arg('--epoch-size', type=int)
    arg('--limit', type=int, help='Use only N images for train/valid')
    arg('--min-scale', type=float, default=1)
    arg('--max-scale', type=float, default=1)
    arg('--test-scale', type=float, default=0.5)
    arg('--pred-oddity',
        type=int,
        help='set to 0/1 to predict even/odd images')
    args = parser.parse_args()

    coords = utils.load_coords()
    train_paths, valid_paths = utils.train_valid_split(args, coords)
    root = Path(args.root)
    model = VGGModel(args.patch_size)
    model = utils.cuda(model)
    criterion = nn.CrossEntropyLoss()
    loader_kwargs = dict(min_scale=args.min_scale,
                         max_scale=args.max_scale,
                         offset=args.offset)
    if args.mode == 'train':
        train_loader = utils.make_loader(
            ClassificationDataset, args, train_paths, coords, **loader_kwargs)
        valid_loader = utils.make_loader(
            ClassificationDataset, args, valid_paths, coords,
            deterministic=True, **loader_kwargs)
        if root.exists() and args.clean:
            shutil.rmtree(str(root))
        root.mkdir(exist_ok=True)
        root.joinpath('params.json').write_text(
            json.dumps(vars(args), indent=True, sort_keys=True,
                       default=str))  # out_path is a Path, not JSON-serializable
        utils.train(
            args,
            model,
            criterion,
            train_loader=train_loader,
            valid_loader=valid_loader,
            save_predictions=save_predictions,
            is_classification=True,
            make_optimizer=lambda lr: SGD(
                [{'params': model.features.parameters(), 'lr': lr},
                 {'params': model.classifier.parameters(), 'lr': lr}],
                nesterov=True,
                momentum=0.9),
        )
    elif args.mode == 'valid':
        utils.load_best_model(model, root, args.model_path)
        valid_loader = utils.make_loader(ClassificationDataset,
                                         args,
                                         valid_paths,
                                         coords,
                                         deterministic=True,
                                         **loader_kwargs)
        utils.validation(model,
                         criterion,
                         tqdm.tqdm(valid_loader, desc='Validation'),
                         is_classification=True)
    else:
        utils.load_best_model(model, root, args.model_path)
        if args.mode in {'predict_valid', 'predict_all_valid'}:
            if args.mode == 'predict_all_valid':
                # include all paths we did not train on (makes sense only with --limit)
                valid_paths = list(
                    set(valid_paths)
                    | (set(utils.labeled_paths()) - set(train_paths)))
            predict(model,
                    valid_paths,
                    out_path=args.out_path,
                    patch_size=args.patch_size,
                    batch_size=args.batch_size,
                    min_scale=args.min_scale,
                    max_scale=args.max_scale)
        elif args.mode == 'predict_test':
            assert False  # FIXME - use out_path too
            out_path = root.joinpath('test')
            out_path.mkdir(exist_ok=True)
            predicted = {p.stem.split('-')[0] for p in out_path.glob('*.npy')}
            test_paths = [
                p for p in utils.DATA_ROOT.joinpath('Test').glob('*.jpg')
                if p.stem not in predicted
            ]
            if args.pred_oddity is not None:
                assert args.pred_oddity in {0, 1}
                test_paths = [
                    p for p in test_paths
                    if int(p.stem) % 2 == args.pred_oddity
                ]
            predict(model,
                    test_paths,
                    out_path,
                    patch_size=args.patch_size,
                    batch_size=args.batch_size,
                    test_scale=args.test_scale,
                    is_test=True)
        else:
            parser.error('Unexpected mode {}'.format(args.mode))
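The make_optimizer lambda in the example above builds a torch SGD optimizer with one parameter group for the feature extractor and one for the classifier head. A minimal, self-contained sketch of that construction, with a toy model standing in for VGGModel:

# Minimal sketch of per-parameter-group SGD, mirroring make_optimizer above;
# the toy two-layer model is a stand-in for VGGModel's features/classifier.
from torch import nn
from torch.optim import SGD

model = nn.Sequential(nn.Linear(8, 16), nn.ReLU(), nn.Linear(16, 2))
features, classifier = model[0], model[2]
lr = 1e-4
optimizer = SGD(
    [{'params': features.parameters(), 'lr': lr},
     {'params': classifier.parameters(), 'lr': lr}],
    nesterov=True,
    momentum=0.9)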
Example #6
def train(data, *regs,
          save_to=None, concat_features=False, explain=False):
    coords = utils.load_coords()
    concated_xs = np.concatenate(data['xs'], axis=1)
    all_rmse, all_patch_rmse, all_baselines = [], [], []
    regs_name = ', '.join(type(reg).__name__ for reg in regs)
    fitted_regs = []
    expl_by_cls = defaultdict(list)
    for cls in range(utils.N_CLASSES):
        ids = data['ids'][cls]
        scales = data['scales'][cls]
        ys = data['ys'][cls]
        xs = input_features(concated_xs if concat_features else data['xs'][cls])
        # indices = np.array(sorted(range(len(ids)), key=lambda i: (scales[i], ids[i])))
        # ids, xs, ys = ids[indices], xs[indices], ys[indices]
        pred, fitted = train_predict(regs, xs, ys, ids)
        ys_by_id = []
        unique_ids = sorted(set(ids))
        pred_by_id = get_pred_by_id(ids, pred, unique_ids)
        for img_id in unique_ids:
            try:
                ys_by_id.append((coords.loc[[img_id]].cls == cls).sum())
            except KeyError:
                ys_by_id.append(0)
        pred_by_id = round_prediction(pred_by_id)
        patch_rmse = np.sqrt(metrics.mean_squared_error(ys, pred))
        rmse = np.sqrt(metrics.mean_squared_error(ys_by_id, pred_by_id))
        baseline_rmse = np.sqrt(metrics.mean_squared_error(
            cross_val_predict(DummyRegressor(), [[0]] * len(ys_by_id), ys_by_id, cv=5),
            ys_by_id))
        print('cls {}, patch mean {:.3f}, patch RMSE {:.3f}, '
              'image mean {:.2f}, image RMSE {:.2f}, baseline RMSE {:.2f}'
              .format(cls, np.mean(ys), patch_rmse,
                      np.mean(ys_by_id), rmse, baseline_rmse))
        all_rmse.append(rmse)
        all_patch_rmse.append(patch_rmse)
        all_baselines.append(baseline_rmse)
        if save_to:
            fitted_regs.append(fitted)
        if explain:
            for reg in fitted:
                expl = eli5.explain_weights(reg, feature_names=FEATURE_NAMES)
                expl_by_cls[cls].append(expl)
                print(type(reg).__name__, format_as_text(
                    expl, show=('method', 'targets', 'feature_importances')))
    print('{} with {} features: mean patch RMSE {:.3f}, mean image RMSE {:.2f}, '
          'mean baseline RMSE {:.2f}'
          .format(regs_name, ', '.join(FEATURE_NAMES),
                  np.mean(all_patch_rmse), np.mean(all_rmse),
                  np.mean(all_baselines)))
    if save_to:
        joblib.dump(fitted_regs, save_to)
        print('Saved to', save_to)

    if explain:
        dfs = []
        for cls, expls in expl_by_cls.items():
            for expl in expls:
                df = eli5.format_as_dataframe(expl)
                df['cls'] = cls
                df['estimator'] = expl.estimator.split('(')[0]
                dfs.append(df)
        df = pd.concat(dfs)
        df.reset_index(inplace=True)
        df['feature'] = df['index']
        del df['index']
        df = df[['feature', 'cls', 'estimator', 'std', 'weight']]
        df.to_csv('feature_importances.csv', index=None)
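For reference, the baseline RMSE printed above comes from scoring a mean-predicting DummyRegressor with 5-fold cross_val_predict against the per-image counts. A self-contained sketch with toy counts:

# Minimal sketch of the baseline RMSE computation: a DummyRegressor
# evaluated with 5-fold cross_val_predict on toy per-image counts.
import numpy as np
from sklearn import metrics
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import cross_val_predict

ys_by_id = np.array([3, 0, 7, 2, 5, 1, 4, 0, 6, 2])  # toy per-image counts
baseline_pred = cross_val_predict(
    DummyRegressor(), [[0]] * len(ys_by_id), ys_by_id, cv=5)
print(np.sqrt(metrics.mean_squared_error(ys_by_id, baseline_pred)))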