Example #1
def run_server(prediction_fn, gt_fn):
    submission = load_gt(prediction_fn, rle_key='EncodedPixels')
    gt = load_gt(gt_fn)

    def compute_score(key):
        yt = DicomDataset.rles_to_mask(gt[key], merge_masks=True)
        yp = DicomDataset.rles_to_mask(submission[key], merge_masks=True)
        return score(yt, yp)

    keys = list(submission)

    with ThreadPoolExecutor(1) as e:
        scores = list(tqdm(e.map(compute_score, keys), total=len(keys)))

    empty_score = np.sum([s[0] for s in scores if s[1] == 'empty'])
    num_empty = sum(1 for s in scores if s[1] == 'empty')
    num_empty_pred = sum(1 for s in scores if s[-1] == 'empty')
    num_non_empty_pred = sum(1 for s in scores if s[-1] == 'non-empty')
    non_empty_score = np.sum([s[0] for s in scores if s[1] == 'non-empty'])
    num_non_empty = len(scores) - num_empty
    final_score = np.sum([s[0] for s in scores]) / len(scores)

    print("[GT: %5d | P: %5d] %012s %.4f | %.4f" %
          (num_empty, num_empty_pred, 'Empty: ', empty_score / num_empty,
           empty_score / len(scores)))
    print("[GT: %5d | P: %5d] %012s %.4f | %.4f" %
          (num_non_empty, num_non_empty_pred, 'Non-Empty: ',
           non_empty_score / num_non_empty, non_empty_score / len(scores)))
    print("[%5d] Final: %.4f" % (len(scores), final_score))
    return final_score
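The bookkeeping above assumes `score(yt, yp)` returns a tuple `(dice, gt_label, pred_label)` with labels in {'empty', 'non-empty'} (so `s[1]` is the ground-truth label and `s[-1]` the prediction label). The actual implementation is not shown in these examples; a minimal sketch matching that convention:

import numpy as np

def score(yt, yp):
    # Hypothetical helper matching the tuple convention used above.
    gt_label = 'empty' if yt.sum() == 0 else 'non-empty'
    pred_label = 'empty' if yp.sum() == 0 else 'non-empty'
    if gt_label == 'empty':
        # Convention guess: an empty ground truth scores 1.0 only when
        # the prediction is empty as well.
        return (1.0 if pred_label == 'empty' else 0.0), gt_label, pred_label
    inter = np.logical_and(yt > 0, yp > 0).sum()
    dice = 2.0 * inter / float(yt.sum() + yp.sum())
    return dice, gt_label, pred_label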
Example #2
def main(config):
    seed_all()
    train_image_fns = sorted(glob(os.path.join(config.train_dir, '*/*/*.dcm')))
    test_image_fns = sorted(glob(os.path.join(config.test_dir, '*/*/*.dcm')))

    # assert len(train_image_fns) == 10712
    # assert len(test_image_fns) == 1377

    gt = load_gt(config.train_rle)
    # create folds
    np.random.shuffle(train_image_fns)

    folds = np.arange(len(train_image_fns)) % config.num_folds
    val_image_fns = [
        fn for k, fn in enumerate(train_image_fns) if folds[k] == config.fold
    ]
    train_image_fns = [
        fn for k, fn in enumerate(train_image_fns) if folds[k] != config.fold
    ]
    # remove not-used files:
    # https://www.kaggle.com/c/siim-acr-pneumothorax-segmentation/discussion/98478#latest-572385  # noqa
    train_image_fns = [
        fn for fn in train_image_fns if DicomDataset.fn_to_id(fn) in gt
    ]
    val_image_fns = [
        fn for fn in val_image_fns if DicomDataset.fn_to_id(fn) in gt
    ]

    print("VAL: ", len(val_image_fns), val_image_fns[0])
    print("TRAIN: ", len(train_image_fns), train_image_fns[0])
    if config.submit_val:
        test_image_fns = val_image_fns

    test_ds = DicomDataset(test_image_fns,
                           gt_rles=gt,
                           height=config.height,
                           width=config.height)
    test_ds.cache()
    test_loader = td.DataLoader(test_ds,
                                batch_size=config.batch_size,
                                shuffle=False,
                                num_workers=0,
                                pin_memory=False,
                                drop_last=False)

    model = FPNSegmentation(config.slug)
    print("Loading: %s" % config.weights)
    r = model.load_state_dict(th.load(config.weight))
    from IPython import embed
    embed()
    model = model.to(config.device).float()
    # model = apex.amp.initialize(model, opt_level="O1")
    model.eval()
    sub = create_submission(model,
                            test_loader,
                            config,
                            pred_zip=config.pred_zip,
                            tta=False)
    sub.to_csv(config.submission_fn, index=False)
    print("Wrote to %s" % config.submission_fn)
Example #3
def main(config):
    seed_all()
    train_image_fns = sorted(glob(os.path.join(config.train_dir, '*.jpg')))
    test_image_fns = sorted(glob(os.path.join(config.test_dir, '*.jpg')))

    gt, label_to_int = load_gt(config.train_csv)
    int_to_label = {v: k for k, v in label_to_int.items()}
    # create folds
    np.random.shuffle(train_image_fns)

    folds = np.arange(len(train_image_fns)) % config.num_folds
    val_image_fns = [
        fn for k, fn in enumerate(train_image_fns) if folds[k] == config.fold
    ]
    train_image_fns = [
        fn for k, fn in enumerate(train_image_fns) if folds[k] != config.fold
    ]

    # TODO: drop empty images <- is this helpful?
    train_image_fns = [
        fn for fn in train_image_fns if KuzushijiDataset.fn_to_id(fn) in gt
    ]
    val_image_fns = [
        fn for fn in val_image_fns if KuzushijiDataset.fn_to_id(fn) in gt
    ]

    print("VAL: ", len(val_image_fns), val_image_fns[123])
    print("TRAIN: ", len(train_image_fns), train_image_fns[456])
    if config.submit_val:
        test_image_fns = val_image_fns

    test_ds = MultiScaleInferenceKuzushijiDataset(test_image_fns, 1536, 1536,
                                                  config.scales)

    test_loader = td.DataLoader(test_ds,
                                batch_size=config.batch_size,
                                shuffle=False,
                                num_workers=0,
                                pin_memory=False,
                                drop_last=False)

    model = FPNSegmentation(config.slug, pretrained=False)
    print("Loading: %s" % config.weight)
    model.load_state_dict(th.load(config.weight))
    model = model.to(config.device)
    # model = apex.amp.initialize(model, opt_level="O1")
    model.eval()
    sub = create_submission(model,
                            test_loader,
                            int_to_label,
                            config,
                            pred_zip=config.pred_zip,
                            tta=config.tta)
    sub.to_csv(config.submission_fn, index=False)
    print("Wrote to %s" % config.submission_fn)
Example #4
def main(config):
    os.makedirs('cache', exist_ok=True)
    os.makedirs(config.logdir, exist_ok=True)
    print("Logging to: %s" % config.logdir)
    if not os.path.exists(config.train_dir):
        print("KERNEL ENV")
        config.train_dicom_dir = '../input/siim-train-test/siim/dicom-images-train'
        config.test_dicom_dir = '../input/siim-train-test/siim/dicom-images-test'

        config.train_dir = '../input/l2-images/l2-images/l2-images-train'
        config.test_dir = '../input/l2-images/l2-images/l2-images-test'

        config.sample_submission = '../input/siim-acr-pneumothorax-segmentation/' \
            'sample_submission.csv'
        config.train_rle = '../input/siim-train-test/siim/train-rle.csv'

    train_image_fns = sorted(glob(os.path.join(config.train_dir, '*.png')))
    test_image_fns = sorted(glob(os.path.join(config.test_dir, '*.png')))

    assert len(train_image_fns) == 10675, len(train_image_fns)
    assert len(test_image_fns) in (1372, 1377), len(test_image_fns)

    gt = load_gt(config.train_rle)
    # create folds
    if not config.stratify:
        # random folds
        np.random.shuffle(train_image_fns)
    else:
        # folds stratified by mask size
        train_mask_sizes = [
            L2DicomDataset.rles_to_mask(gt[L2DicomDataset.fn_to_id(fn)]).sum()
            for fn in tqdm(train_image_fns)
        ]
        sorted_inds = sorted(range(len(train_image_fns)),
                             key=lambda k: train_mask_sizes[k])
        train_image_fns = [train_image_fns[k] for k in sorted_inds]

    folds = np.arange(len(train_image_fns)) % config.num_folds
    val_image_fns = [
        fn for k, fn in enumerate(train_image_fns) if folds[k] == config.fold
    ]
    train_image_fns = [
        fn for k, fn in enumerate(train_image_fns) if folds[k] != config.fold
    ]
    # remove not-used files:
    # https://www.kaggle.com/c/siim-acr-pneumothorax-segmentation/discussion/98478#latest-572385  # noqa
    train_image_fns = [
        fn for fn in train_image_fns if L2DicomDataset.fn_to_id(fn) in gt
    ]
    val_image_fns = [
        fn for fn in val_image_fns if L2DicomDataset.fn_to_id(fn) in gt
    ]

    if config.drop_empty:
        # remove empty masks from training data
        non_empty_gt = {k: v for k, v in gt.items() if v[0] != ' -1'}
        train_image_fns = [
            fn for fn in train_image_fns
            if L2DicomDataset.fn_to_id(fn) in non_empty_gt
        ]
        print("[Non-EMPTY] TRAIN: ", len(train_image_fns),
              os.path.basename(train_image_fns[0]))

    print("VAL: ", len(val_image_fns), os.path.basename(val_image_fns[0]))
    print("TRAIN: ", len(train_image_fns),
          os.path.basename(train_image_fns[0]))

    train_ds = L2DicomDataset(train_image_fns,
                              gt_rles=gt,
                              height=config.height,
                              width=config.height,
                              to_ram=True,
                              augment=True,
                              write_cache=not config.is_kernel,
                              train_dicom_dir=config.train_dicom_dir,
                              test_dicom_dir=config.test_dicom_dir)
    val_ds = L2DicomDataset(val_image_fns,
                            gt_rles=gt,
                            height=config.height,
                            width=config.height,
                            to_ram=True,
                            write_cache=not config.is_kernel,
                            train_dicom_dir=config.train_dicom_dir,
                            test_dicom_dir=config.test_dicom_dir)

    val_loader = data.DataLoader(val_ds,
                                 batch_size=config.batch_size,
                                 shuffle=False,
                                 num_workers=config.num_workers,
                                 pin_memory=config.pin,
                                 drop_last=False)

    model = FPNSegmentation(config.slug, num_input_channels=2)
    if config.weight is not None:
        model.load_state_dict(th.load(config.weight))
    model = model.to(config.device)

    optimizer = th.optim.Adam(model.parameters(),
                              lr=config.lr,
                              weight_decay=config.weight_decay)

    if config.apex:
        model, optimizer = apex.amp.initialize(model,
                                               optimizer,
                                               opt_level="O1",
                                               verbosity=0)

    updates_per_epoch = len(train_ds) // config.batch_size
    num_updates = int(config.epochs * updates_per_epoch)
    scheduler = WarmupLinearSchedule(warmup=config.warmup, t_total=num_updates)

    # training loop
    smooth = 0.1
    best_dice = 0.0
    best_fn = None
    global_step = 0
    for epoch in range(config.epochs):
        smooth_loss = None
        smooth_accuracy = None
        model.train()
        train_loader = data.DataLoader(train_ds,
                                       batch_size=config.batch_size,
                                       shuffle=True,
                                       num_workers=config.num_workers,
                                       pin_memory=config.pin,
                                       drop_last=True)
        progress = tqdm(total=len(train_ds), smoothing=0.01)
        for i, (X, y_true) in enumerate(train_loader):
            X = X.to(config.device)
            y_true = y_true.to(config.device)
            y_pred = model(X)
            loss = siim_loss(y_true, y_pred, weights=None)
            if config.apex:
                with apex.amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            lr_this_step = None
            if (i + 1) % config.accumulation_step == 0:
                optimizer.step()
                optimizer.zero_grad()
                lr_this_step = config.lr * scheduler.get_lr(
                    global_step, config.warmup)
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr_this_step
                global_step += 1

            smooth_loss = loss.item() if smooth_loss is None else \
                smooth * loss.item() + (1. - smooth) * smooth_loss
            # print((y_true >= 0.5).sum().item())
            accuracy = th.mean(
                ((y_pred >= 0.5) == (y_true >= 0.5)).to(th.float)).item()
            smooth_accuracy = accuracy if smooth_accuracy is None else \
                smooth * accuracy + (1. - smooth) * smooth_accuracy
            progress.set_postfix(
                loss='%.4f' % smooth_loss,
                accuracy='%.4f' % (smooth_accuracy),
                lr='%.6f' %
                (config.lr if lr_this_step is None else lr_this_step))
            progress.update(len(X))

        # validation loop
        model.eval()
        thresholds = np.arange(0.1, 0.7, 0.1)
        dice_coeffs = [[] for _ in range(len(thresholds))]
        progress = tqdm(enumerate(val_loader), total=len(val_loader))
        with th.no_grad():
            for i, (X, y_trues) in progress:
                X = X.to(config.device)
                y_trues = y_trues.to(config.device)
                y_preds = model(X)
                for yt, yp in zip(y_trues, y_preds):
                    yt = (yt.squeeze().cpu().numpy() >= 0.5).astype('uint8')
                    yp = yp.squeeze().cpu().numpy()
                    for dind, threshold in enumerate(thresholds):
                        yp_ = (yp >= threshold).astype(np.uint8)
                        sc = score(yt, yp_)
                        dice_coeffs[dind].append(sc)

        best_threshold_ind = -1
        dice_coeff = -1
        for dind, threshold in enumerate(thresholds):
            dc = np.mean(
                [x[0] for x in dice_coeffs[dind] if x[1] == 'non-empty'])
            # progress.write("Dice @%.2f: %.4f" % (threshold, dc))
            if dc > dice_coeff:
                dice_coeff = dc
                best_threshold_ind = dind

        dice_coeffs = dice_coeffs[best_threshold_ind]
        num_empty = sum(1 for x in dice_coeffs if x[1] == 'empty')
        num_total = len(dice_coeffs)
        num_non_empty = num_total - num_empty
        empty_sum = np.sum([d[0] for d in dice_coeffs if d[1] == 'empty'])
        non_empty_sum = np.sum(
            [d[0] for d in dice_coeffs if d[1] == 'non-empty'])
        dice_coeff_empty = empty_sum / num_empty
        dice_coeff_non_empty = non_empty_sum / num_non_empty
        progress.write(
            '[Empty: %d]: %.3f | %.3f, [Non-Empty: %d]: %.3f | %.3f' %
            (num_empty, dice_coeff_empty, empty_sum / num_total, num_non_empty,
             dice_coeff_non_empty, non_empty_sum / num_total))
        dice_coeff = float(dice_coeff)
        summary_str = 'f%02d-ep-%04d-val_dice-%.4f@%.2f' % (
            config.fold, epoch, dice_coeff, thresholds[best_threshold_ind])
        progress.write(summary_str)
        if dice_coeff > best_dice:
            weight_fn = os.path.join(config.logdir, summary_str + '.pth')
            th.save(model.state_dict(), weight_fn)
            best_dice = dice_coeff
            best_fn = weight_fn
            fns = sorted(
                glob(os.path.join(config.logdir, 'f%02d-*.pth' % config.fold)))
            for fn in fns[:-config.n_keep]:
                os.remove(fn)

    # create submission
    test_ds = L2DicomDataset(test_image_fns,
                             height=config.height,
                             width=config.height,
                             write_cache=not config.is_kernel,
                             train_dicom_dir=config.train_dicom_dir,
                             test_dicom_dir=config.test_dicom_dir)
    test_loader = data.DataLoader(test_ds,
                                  batch_size=config.batch_size,
                                  shuffle=False,
                                  num_workers=0,
                                  pin_memory=False,
                                  drop_last=False)
    if best_fn is not None:
        model.load_state_dict(th.load(best_fn))
    model.eval()
    sub = create_submission(model,
                            test_loader,
                            test_image_fns,
                            config,
                            pred_zip=config.pred_zip)
    sub.to_csv(config.submission_fn, index=False)
    print("Wrote to: %s" % config.submission_fn)

    # create val submission
    val_fn = config.submission_fn.replace('.csv', '_VAL.csv')
    model.eval()
    sub = create_submission(model,
                            val_loader,
                            val_image_fns,
                            config,
                            pred_zip=config.pred_zip.replace(
                                '.zip', '_VAL.zip'))
    sub.to_csv(val_fn, index=False)
    print("Wrote to: %s" % val_fn)
Example #5
def main(config):
    seed_all()
    os.makedirs('cache', exist_ok=True)
    os.makedirs(config.logdir, exist_ok=True)
    print("Logging to: %s" % config.logdir)
    src_files = sorted(glob('*.py'))
    for src_fn in src_files:
        dst_fn = os.path.join(config.logdir, src_fn)
        copyfile(src_fn, dst_fn)

    train_image_fns = sorted(glob(os.path.join(config.train_dir, '*.jpg')))
    test_image_fns = sorted(glob(os.path.join(config.test_dir, '*.jpg')))

    assert len(train_image_fns) == 3881
    assert len(test_image_fns) == 4150

    gt, label_to_int = load_gt(config.train_rle)
    int_to_label = {v: k for k, v in label_to_int.items()}
    # create folds
    np.random.shuffle(train_image_fns)

    if config.subset > 0:
        train_image_fns = train_image_fns[:config.subset]

    folds = np.arange(len(train_image_fns)) % config.num_folds
    val_image_fns = [
        fn for k, fn in enumerate(train_image_fns) if folds[k] == config.fold
    ]
    train_image_fns = [
        fn for k, fn in enumerate(train_image_fns) if folds[k] != config.fold
    ]

    if config.add_val:
        print("Training on validation set")
        train_image_fns = train_image_fns + val_image_fns[:]

    print(len(val_image_fns), len(train_image_fns))

    # TODO: drop empty images <- is this helpful?
    train_image_fns = [
        fn for fn in train_image_fns if KuzushijiDataset.fn_to_id(fn) in gt
    ]
    val_image_fns = [
        fn for fn in val_image_fns if KuzushijiDataset.fn_to_id(fn) in gt
    ]

    print("VAL: ", len(val_image_fns), val_image_fns[123])
    print("TRAIN: ", len(train_image_fns), train_image_fns[456])

    train_ds = KuzushijiDataset(train_image_fns,
                                gt_boxes=gt,
                                label_to_int=label_to_int,
                                augment=True)
    val_ds = KuzushijiDataset(val_image_fns,
                              gt_boxes=gt,
                              label_to_int=label_to_int)

    if config.cache:
        train_ds.cache()
        val_ds.cache()

    val_loader = data.DataLoader(val_ds,
                                 batch_size=config.batch_size // 8,
                                 shuffle=False,
                                 num_workers=config.num_workers,
                                 pin_memory=config.pin,
                                 drop_last=False)

    model = FPNSegmentation(config.slug)
    if config.weight is not None:
        print("Loading: %s" % config.weight)
        model.load_state_dict(th.load(config.weight))
    model = model.to(config.device)

    no_decay = ['mean', 'std', 'bias'] + ['.bn%d.' % i for i in range(100)]
    grouped_parameters = [{
        'params': [],
        'weight_decay': config.weight_decay
    }, {
        'params': [],
        'weight_decay': 0.0
    }]
    for n, p in model.named_parameters():
        if not any(nd in n for nd in no_decay):
            # print("Decay: %s" % n)
            grouped_parameters[0]['params'].append(p)
        else:
            # print("No Decay: %s" % n)
            grouped_parameters[1]['params'].append(p)
    optimizer = AdamW(grouped_parameters, lr=config.lr)

    if config.apex:
        model, optimizer = apex.amp.initialize(model,
                                               optimizer,
                                               opt_level="O1",
                                               verbosity=0)

    updates_per_epoch = len(train_ds) // config.batch_size
    num_updates = int(config.epochs * updates_per_epoch)
    scheduler = WarmupLinearSchedule(warmup=config.warmup, t_total=num_updates)

    # training loop
    smooth = 0.1
    best_acc = 0.0
    best_fn = None
    global_step = 0
    for epoch in range(1, config.epochs + 1):
        smooth_loss = None
        smooth_accuracy = None
        model.train()
        train_loader = data.DataLoader(train_ds,
                                       batch_size=config.batch_size,
                                       shuffle=True,
                                       num_workers=config.num_workers,
                                       pin_memory=config.pin,
                                       drop_last=True)
        progress = tqdm(total=len(train_ds), smoothing=0.01)
        for i, (X, fns, hm, centers, classes) in enumerate(train_loader):
            X = X.to(config.device).float()
            hm = hm.to(config.device)
            centers = centers.to(config.device)
            classes = classes.to(config.device)
            hm_pred, classes_pred = model(X, centers=centers)
            loss = kuzushiji_loss(hm, centers, classes, hm_pred,
                                  classes_pred)
            if config.apex:
                with apex.amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            lr_this_step = None
            if (i + 1) % config.accumulation_step == 0:
                optimizer.step()
                optimizer.zero_grad()
                lr_this_step = config.lr * scheduler.get_lr(
                    global_step, config.warmup)
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr_this_step
                global_step += 1

            smooth_loss = loss.item() if smooth_loss is None else \
                smooth * loss.item() + (1. - smooth) * smooth_loss
            accuracy = th.mean(
                ((th.sigmoid(hm_pred) >= 0.5) == (hm == 1)).to(
                    th.float)).item()
            smooth_accuracy = accuracy if smooth_accuracy is None else \
                smooth * accuracy + (1. - smooth) * smooth_accuracy
            progress.set_postfix(
                ep='%d/%d' % (epoch, config.epochs),
                loss='%.4f' % smooth_loss,
                accuracy='%.4f' % smooth_accuracy,
                lr='%.6f' %
                (config.lr if lr_this_step is None else lr_this_step))
            progress.update(len(X))

        # validate only on epochs 1, 10, 20, 30, 40, 50 and after epoch 65
        if epoch not in [10, 20, 30, 40, 50]:
            if 1 < epoch <= 65:
                continue

        # validation loop
        model.eval()
        progress = tqdm(enumerate(val_loader), total=len(val_loader))
        hm_correct, classes_correct = 0, 0
        num_hm, num_classes = 0, 0
        with th.no_grad():
            for i, (X, fns, hm, centers, classes) in progress:
                X = X.to(config.device).float()
                hm = hm.to(config.device)
                centers = centers.to(config.device)
                classes = classes.to(config.device)
                hm_pred, classes_pred = model(X)
                hm_pred = th.sigmoid(hm_pred)
                classes_pred = th.nn.functional.softmax(classes_pred, 1)
                # PyTorch >= 1.2 supports `bool` tensors
                hm_true = hm.bool() if hasattr(hm, 'bool') else hm
                hm_correct += (hm_true == (hm_pred >=
                                           0.5)).float().sum().item()
                num_hm += np.prod(hm.shape)
                num_samples = len(X)
                for sample_ind in range(num_samples):
                    center_mask = centers[sample_ind, :, 0] != -1
                    per_image_letters = center_mask.sum().item()
                    if per_image_letters == 0:
                        continue
                    num_classes += per_image_letters
                    centers_per_img = centers[sample_ind][center_mask]
                    classes_per_img = classes[sample_ind][center_mask]
                    classes_per_img_pred = classes_pred[
                        sample_ind][:, centers_per_img[:, 1],
                                    centers_per_img[:, 0]].argmax(0)
                    classes_correct += (
                        classes_per_img_pred == classes_per_img).sum().item()

        val_hm_acc = hm_correct / num_hm
        val_classes_acc = classes_correct / num_classes
        summary_str = 'f%02d-ep-%04d-val_hm_acc-%.4f-val_classes_acc-%.4f' % (
            config.fold, epoch, val_hm_acc, val_classes_acc)

        progress.write(summary_str)
        if val_classes_acc >= best_acc:
            weight_fn = os.path.join(config.logdir, summary_str + '.pth')
            progress.write("New best: %s" % weight_fn)
            th.save(model.state_dict(), weight_fn)
            best_acc = val_classes_acc
            best_fn = weight_fn
            fns = sorted(
                glob(os.path.join(config.logdir, 'f%02d-*.pth' % config.fold)))
            for fn in fns[:-config.n_keep]:
                os.remove(fn)

    # create submission
    test_ds = KuzushijiDataset(test_image_fns)
    test_loader = data.DataLoader(test_ds,
                                  batch_size=config.batch_size // 8,
                                  shuffle=False,
                                  num_workers=config.num_workers,
                                  pin_memory=False,
                                  drop_last=False)
    if best_fn is not None:
        model.load_state_dict(th.load(best_fn))
    model.eval()
    sub = create_submission(model,
                            test_loader,
                            int_to_label,
                            config,
                            pred_zip=config.pred_zip)
    sub.to_csv(config.submission_fn, index=False)
    print("Wrote to: %s" % config.submission_fn)

    # create val submission
    val_fn = config.submission_fn.replace('.csv', '_VAL.csv')
    model.eval()
    sub = create_submission(model,
                            val_loader,
                            int_to_label,
                            config,
                            pred_zip=config.pred_zip.replace(
                                '.zip', '_VAL.zip'))
    sub.to_csv(val_fn, index=False)
    print("Wrote to: %s" % val_fn)
Example #6
def main(config):
  seed_all()
  os.makedirs('cache', exist_ok=True)
  os.makedirs(config.logdir, exist_ok=True)
  print("Logging to: %s" % config.logdir)
  src_files = sorted(glob('*.py'))
  for src_fn in src_files:
    dst_fn = os.path.join(config.logdir, src_fn)
    copyfile(src_fn, dst_fn)

  train_image_fns = sorted(glob(os.path.join(config.train_dir, '*/*/*.dcm')))
  test_image_fns = sorted(glob(os.path.join(config.test_dir, '*/*/*.dcm')))

  # assert len(train_image_fns) == 10712
  # assert len(test_image_fns) == 1377

  gt = load_gt(config.train_rle)
  # create folds
  np.random.shuffle(train_image_fns)

  if config.subset > 0:
    train_image_fns = train_image_fns[:config.subset]

  folds = np.arange(len(train_image_fns)) % config.num_folds
  val_image_fns = [fn for k, fn in enumerate(train_image_fns)
      if folds[k] == config.fold]
  train_image_fns = [fn for k, fn in enumerate(train_image_fns)
      if folds[k] != config.fold]
  # remove not-used files:
  # https://www.kaggle.com/c/siim-acr-pneumothorax-segmentation/discussion/98478#latest-572385  # noqa
  train_image_fns = [fn for fn in train_image_fns
      if DicomDataset.fn_to_id(fn) in gt]
  val_image_fns = [fn for fn in val_image_fns
      if DicomDataset.fn_to_id(fn) in gt]

  print("VAL: ", len(val_image_fns), os.path.basename(val_image_fns[0]))
  print("TRAIN: ", len(train_image_fns), os.path.basename(train_image_fns[0]))

  train_ds = DicomDataset(train_image_fns, gt_rles=gt, augment=True)
  val_ds = DicomDataset(val_image_fns, gt_rles=gt)

  if config.cache:
    train_ds.cache()
    val_ds.cache()

  val_loader = data.DataLoader(val_ds, batch_size=config.batch_size,
                               shuffle=False, num_workers=config.num_workers,
                               pin_memory=config.pin, drop_last=False)

  model = FPNSegmentation(config.slug, ema=config.ema)
  if config.weight is not None:
    print("Loading: %s" % config.weight)
    model.load_state_dict(th.load(config.weight))
  model = model.to(config.device)

  no_decay = ['mean', 'std', 'bias'] + ['.bn%d.' % i for i in range(100)]
  grouped_parameters = [{'params': [], 'weight_decay': config.weight_decay},
      {'params': [], 'weight_decay': 0.0}]
  for n, p in model.named_parameters():
    if not any(nd in n for nd in no_decay):
      print("Decay: %s" % n)
      grouped_parameters[0]['params'].append(p)
    else:
      print("No Decay: %s" % n)
      grouped_parameters[1]['params'].append(p)
  optimizer = AdamW(grouped_parameters, lr=config.lr)

  if config.apex:
    model, optimizer = apex.amp.initialize(model, optimizer, opt_level="O1",
                                           verbosity=0)

  updates_per_epoch = len(train_ds) // config.batch_size
  num_updates = int(config.epochs * updates_per_epoch)
  scheduler = WarmupLinearSchedule(warmup=config.warmup, t_total=num_updates)

  # training loop
  smooth = 0.1
  best_dice = 0.0
  best_fn = None
  global_step = 0
  for epoch in range(1, config.epochs + 1):
    smooth_loss = None
    smooth_accuracy = None
    model.train()
    train_loader = data.DataLoader(train_ds, batch_size=config.batch_size,
                                   shuffle=True, num_workers=config.num_workers,
                                   pin_memory=config.pin, drop_last=True)
    progress = tqdm(total=len(train_ds), smoothing=0.01)
    for i, (X, _, y_true) in enumerate(train_loader):
      X = X.to(config.device).float()
      y_true = y_true.to(config.device)
      y_pred = model(X)
      loss = siim_loss(y_true, y_pred, weights=None)
      if config.apex:
        with apex.amp.scale_loss(loss, optimizer) as scaled_loss:
          scaled_loss.backward()
      else:
        loss.backward()

      lr_this_step = None
      if (i + 1) % config.accumulation_step == 0:
        optimizer.step()
        optimizer.zero_grad()
        lr_this_step = config.lr * scheduler.get_lr(global_step, config.warmup)
        for param_group in optimizer.param_groups:
          param_group['lr'] = lr_this_step
        global_step += 1

      smooth_loss = loss.item() if smooth_loss is None else \
          smooth * loss.item() + (1. - smooth) * smooth_loss
      # print((y_true >= 0.5).sum().item())
      accuracy = th.mean(((y_pred >= 0.5) == (y_true == 1)).to(
          th.float)).item()
      smooth_accuracy = accuracy if smooth_accuracy is None else \
          smooth * accuracy + (1. - smooth) * smooth_accuracy
      progress.set_postfix(ep='%d/%d' % (epoch, config.epochs),
            loss='%.4f' % smooth_loss, accuracy='%.4f' %
            (smooth_accuracy), lr='%.6f' % (config.lr if lr_this_step is None
              else lr_this_step))
      progress.update(len(X))

    # skip validation during the first 12 epochs
    if epoch <= 12:
      continue
    # validation loop
    model.eval()
    thresholds = [0.1, 0.2]
    dice_coeffs = [[] for _ in range(len(thresholds))]
    progress = tqdm(enumerate(val_loader), total=len(val_loader))
    with th.no_grad():
      for i, (X, _, y_trues) in progress:
        X = X.to(config.device).float()
        y_trues = y_trues.to(config.device)
        y_preds = model(X)
        y_preds_flip = th.flip(model(th.flip(X, (-1, ))), (-1, ))
        y_preds = 0.5 * (y_preds + y_preds_flip)

        y_trues = y_trues.cpu().numpy()
        y_preds = y_preds.cpu().numpy()
        for yt, yp in zip(y_trues, y_preds):
          yt = (yt.squeeze() >= 0.5).astype('uint8')
          yp = yp.squeeze()
          for dind, threshold in enumerate(thresholds):
            yp_ = (yp >= threshold).astype(np.uint8)
            sc = score(yt, yp_)
            dice_coeffs[dind].append(sc)

    best_threshold_ind = -1
    dice_coeff = -1
    for dind, threshold in enumerate(thresholds):
      dc = np.mean([x[0] for x in dice_coeffs[dind] if x[1] == 'non-empty'])
      # progress.write("Dice @%.2f: %.4f" % (threshold, dc))
      if dc > dice_coeff:
        dice_coeff = dc
        best_threshold_ind = dind

    dice_coeffs = dice_coeffs[best_threshold_ind]
    num_empty = sum(1 for x in dice_coeffs if x[1] == 'empty')
    num_total = len(dice_coeffs)
    num_non_empty = num_total - num_empty
    empty_sum = np.sum([d[0] for d in dice_coeffs if d[1] == 'empty'])
    non_empty_sum = np.sum([d[0] for d in dice_coeffs if d[1] == 'non-empty'])
    dice_coeff_empty = empty_sum / num_empty
    dice_coeff_non_empty = non_empty_sum / num_non_empty
    progress.write('[Empty: %d]: %.3f | %.3f, [Non-Empty: %d]: %.3f | %.3f' % (
        num_empty, dice_coeff_empty, empty_sum / num_total,
        num_non_empty, dice_coeff_non_empty, non_empty_sum / num_total))
    dice_coeff = float(dice_coeff)
    summary_str = 'f%02d-ep-%04d-val_dice-%.4f@%.2f' % (config.fold, epoch,
        dice_coeff, thresholds[best_threshold_ind])
    progress.write(summary_str)
    if dice_coeff > best_dice:
      weight_fn = os.path.join(config.logdir, summary_str + '.pth')
      th.save(model.state_dict(), weight_fn)
      best_dice = dice_coeff
      best_fn = weight_fn
      fns = sorted(glob(os.path.join(config.logdir, 'f%02d-*.pth' %
          config.fold)))
      for fn in fns[:-config.n_keep]:
        os.remove(fn)

  # create submission
  test_ds = DicomDataset(test_image_fns)
  test_loader = data.DataLoader(test_ds, batch_size=config.batch_size,
                               shuffle=False, num_workers=0,
                               pin_memory=False, drop_last=False)
  if best_fn is not None:
    model.load_state_dict(th.load(best_fn))
  model.eval()
  sub = create_submission(model, test_loader, config, pred_zip=config.pred_zip)
  sub.to_csv(config.submission_fn, index=False)
  print("Wrote to: %s" % config.submission_fn)

  # create val submission
  val_fn = config.submission_fn.replace('.csv', '_VAL.csv')
  model.eval()
  sub = create_submission(model, val_loader, config,
      pred_zip=config.pred_zip.replace('.zip', '_VAL.zip'))
  sub.to_csv(val_fn, index=False)
  print("Wrote to: %s" % val_fn)
Example #7
import os
import pickle
from glob import glob

import numpy as np
import pandas as pd
from tqdm import tqdm

from data import DicomDataset, load_gt

oof_fns = [
    'Both_SEG_logdir_073_f00/Both_SEG_sub_73_f00_VAL.csv',
    'Both_SEG_logdir_073_f01/Both_SEG_sub_73_f01_VAL.csv',
    'Both_SEG_logdir_073_f02/Both_SEG_sub_73_f02_VAL.csv',
    'Both_SEG_logdir_073_f03/Both_SEG_sub_73_f03_VAL.csv',
    'Both_SEG_logdir_073_f04/Both_SEG_sub_73_f04_VAL.csv',
]

with open('nih_ptx_hashes.p', 'rb') as f:
    nih = pickle.load(f)
    ptx_hashes = set(nih.values())

with open('current_ptx_hashes.p', 'rb') as f:
    hh = pickle.load(f)

preds = [pd.read_csv(fn) for fn in oof_fns]
gts = load_gt('train-rle.csv')
np.random.seed(123)
pred = pd.concat(preds)
pred = {k: v for k, v in zip(pred['ImageId'], pred['EncodedPixels'])}
train_image_fns = sorted(glob(os.path.join('dicom-images-train', '*/*/*.dcm')))
np.random.shuffle(train_image_fns)
num_fp, num_fp_in_ptx, num_fp_not_in_ptx = 0, 0, 0
image_ids, rles = [], []
num_missing = 0
for ind, fn in tqdm(enumerate(train_image_fns), total=len(train_image_fns)):
    img_id = DicomDataset.fn_to_id(fn)
    try:
        p = pred[img_id]
    except KeyError:
        num_missing += 1
        print(img_id)
Example #8
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
tqdm.monitor_interval = 0  # noqa
from data import DicomDataset, load_mask_counts, load_gt

sub_fn = 'x_ensemble/CLF_ADJUST_Both_ENS_0024.csv'
out_fn = os.path.join(os.path.dirname(sub_fn),
                      'ADJUST_V2_' + os.path.basename(sub_fn))
sub = load_gt(sub_fn, rle_key='EncodedPixels')
val = '_VAL' in sub_fn
if val:
    ids = load_mask_counts('train-rle.csv')
else:
    ids = load_mask_counts('sample_submission.csv')
adjusted_sub = {'ImageId': [], 'EncodedPixels': []}

num_removed = 0
num_added = 0
num_missed = 0
for image_id in tqdm(sub):
    rles = sub[image_id]
    num_masks = ids.get(image_id, 1)
    masks = DicomDataset.rles_to_mask(rles, merge_masks=False)
    num_pred = masks.max()
    if num_pred > num_masks:
        sizes = np.float32([(masks == i).sum()
                            for i in range(1, num_pred + 1)])
        inds = np.argsort(-sizes)[:num_masks]
        inds = [range(1, num_pred + 1)[ind] for ind in inds]
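The adjustment script relies on `DicomDataset.rles_to_mask` with `merge_masks=False` labelling each RLE as its own integer instance, so `masks.max()` is the number of predicted masks. A sketch of a standard Kaggle-style decoder under that assumption; the competition's exact encoding (e.g. relative vs. absolute run starts) may differ:

import numpy as np

def rles_to_mask(rles, height=1024, width=1024, merge_masks=True):
    mask = np.zeros(height * width, dtype=np.uint8)
    for inst, rle in enumerate(rles, start=1):
        if rle.strip() in ('', '-1'):
            continue  # empty-image marker
        vals = list(map(int, rle.split()))
        starts, lengths = vals[0::2], vals[1::2]
        for s, length in zip(starts, lengths):
            mask[s:s + length] = 1 if merge_masks else inst
    # masks are conventionally stored column-major (Fortran order)
    return mask.reshape((height, width), order='F')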
Example #9
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--fn', type=str)
    parser.add_argument('--show-empty', action='store_true')
    parser.add_argument('--seed', type=int, default=32)
    parser.add_argument('--height', type=int, default=1024)

    args = parser.parse_args()
    sub = pd.read_csv(args.fn)
    np.random.seed(args.seed)
    if ' EncodedPixels' in sub.columns:
        sub['EncodedPixels'] = sub[' EncodedPixels']
        sub = sub[['ImageId', 'EncodedPixels']]

    sub['EncodedPixels'] = sub['EncodedPixels'].apply(
        lambda x: '-1' if x == ' -1' else x)
    gt = load_gt('train-rle.csv')
    pred_gt = load_gt('sub_8730.csv', rle_key='EncodedPixels')
    for k, v in pred_gt.items():
        gt[k] = v
    train_fns = sorted(glob('dicom-images-train/*/*/*.dcm'))
    test_fns = sorted(glob('dicom-images-test/*/*/*.dcm'))
    all_fns = train_fns + test_fns
    id_to_fn = {DicomDataset.fn_to_id(fn): fn for fn in all_fns}
    sub_ = defaultdict(list)
    for iid, rle in zip(sub['ImageId'], sub['EncodedPixels']):
        sub_[iid].append(rle)
    sub = sub_
    num_mask = sum(1 for k, v in sub.items() if v[0] != '-1')
    num_one_mask = sum(1 for k, v in sub.items()
                       if v[0] != '-1' and len(v) == 1)
    num_more_mask = sum(1 for k, v in sub.items()
                        if v[0] != '-1' and len(v) >= 2)
    print("%d of %d have a mask" % (num_mask, len(sub)))
    print("%d have 1, %d 2 or more" % (num_one_mask, num_more_mask))
    img_ids = sorted(sub.keys())
    np.random.shuffle(img_ids)
    for img_id in img_ids:
        img_fn = id_to_fn[img_id]
        rles = sub[img_id]
        if not args.show_empty:
            if rles[0] == '-1':
                continue
        print("%d masks" % len(rles))
        dcm = pydicom.dcmread(img_fn)
        view = dcm.ViewPosition
        print(view)
        img = dcm.pixel_array
        mask = DicomDataset.rles_to_mask(rles, merge_masks=False)
        if args.height != 1024:
            img = cv2.resize(img, (args.height, args.height),
                             interpolation=cv2.INTER_NEAREST)
            mask = cv2.resize(mask, (args.height, args.height),
                              interpolation=cv2.INTER_NEAREST)

        gt_mask = None
        if img_id in gt:
            gt_rles = gt[img_id]
            gt_mask = DicomDataset.rles_to_mask(gt_rles, merge_masks=False)
            gt_mask = cv2.resize(gt_mask, (args.height, args.height),
                                 interpolation=cv2.INTER_NEAREST)

            if gt_mask.max() == 0:
                continue
        # for j in range(0, 512, 16):
        #   img[:, j] = 255
        #   img[j, :] = 255
        #   mask[:, j] = mask.max()
        #   mask[j, :] = mask.max()

        nc = 2 if gt_mask is None else 3
        plt.subplot(1, nc, 1)
        plt.title(os.path.splitext(img_id)[-1])
        plt.imshow(img, cmap='bone')
        plt.axis('off')
        plt.subplot(1, nc, 2)
        plt.title('PRED: ' + str(mask.max()))
        plt.imshow(mask, cmap='bone', alpha=0.4)
        plt.axis('off')
        if gt_mask is not None:
            vis = np.dstack([img.copy()] * 3)
            vis[gt_mask > 0] = (0, 255, 0)
            vis[mask > 0] = 0.3 * vis[mask > 0] + 0.7 * np.float32([255, 0, 0])
            plt.subplot(1, nc, 3)
            plt.title('%sGT: ' % ('PRED-' if img_id in pred_gt else 'GT-') +
                      str(gt_mask.max()))
            plt.imshow(vis, cmap='bone')

            plt.axis('off')
        plt.show()