def create_submission(model, loader, config, pred_zip=None, tta=True):
  # guard the existence check: pred_zip may be None
  if pred_zip is not None and os.path.exists(pred_zip):
    os.remove(pred_zip)
  sub = []
  model.eval()
  with th.no_grad():
    for ret in tqdm(loader):
      X, fns = ret[:2]
      X = X.to(config.device).float()
      # that squeeze index is important! (in case of X.shape[0] == 1)
      y_pred = model(X).squeeze(1).cpu().numpy()
      if tta:
        y_pred_flip = th.flip(model(th.flip(X, (-1, ))),
                              (-1, )).squeeze(1).cpu().numpy()
        y_pred = 0.5 * (y_pred + y_pred_flip)
      for j in range(len(y_pred)):
        img_id = DicomDataset.fn_to_id(fns[j])
        yp = y_pred[j]
        if pred_zip is not None:
          pred_fn = img_id + '.png'
          yp_img = np.uint8(yp * 255)
          img_bytes = cv2.imencode('.png', yp_img)[1].tobytes()
          with ZipFile(pred_zip, 'a') as f:
            f.writestr(pred_fn, img_bytes)
        assert yp.shape == (1024, 1024), yp.shape
        # classify
        clf_mask = (yp >= config.p_clf).astype('uint8')
        if clf_mask.sum() == 0:
          sub.append((img_id, '-1'))
          continue
        # segment
        mask = (yp >= config.p_seg).astype('uint8')
        assert mask.shape == (1024, 1024), mask.shape
        if mask.sum() == 0:
          sub.append((img_id, '-1'))
          continue
        rle = DicomDataset.mask_to_rle(mask * 255)
        sub.append((img_id, rle))
  image_ids = [s[0] for s in sub]
  encoded_pixels = [s[1] for s in sub]
  sub = pd.DataFrame({'ImageId': image_ids,
                      'EncodedPixels': encoded_pixels})
  return sub
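# `DicomDataset.mask_to_rle` lives in data.py, which is not part of these
# excerpts. A minimal sketch of the encoding it presumably implements,
# modeled on the competition's reference kernel: the mask is scanned in a
# transposed (column-major) order and runs of foreground (255) are stored as
# relative-start/length pairs -- which also explains the `mask * 255` at
# every call site. The name and exact traversal order are assumptions.
import numpy as np


def mask_to_rle_sketch(mask255):
  rle = []
  last, run_start, run_len, pos = 0, -1, 0, 0
  for v in mask255.T.reshape(-1):  # transposed scan, as in the reference
    if v != last:
      if v == 255:                 # a run begins
        run_start, run_len = pos, 1
      else:                        # a run ends: emit relative start + length
        rle += [str(run_start), str(run_len)]
        run_start, run_len, pos = -1, 0, 0
    elif run_start > -1:
      run_len += 1
    last = v
    pos += 1
  if run_start > -1:               # flush a run touching the last pixel
    rle += [str(run_start), str(run_len)]
  return ' '.join(rle)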
def main(config):
  seed_all()
  train_image_fns = sorted(glob(os.path.join(config.train_dir, '*/*/*.dcm')))
  test_image_fns = sorted(glob(os.path.join(config.test_dir, '*/*/*.dcm')))
  # assert len(train_image_fns) == 10712
  # assert len(test_image_fns) == 1377
  gt = load_gt(config.train_rle)
  # create folds
  np.random.shuffle(train_image_fns)
  folds = np.arange(len(train_image_fns)) % config.num_folds
  val_image_fns = [fn for k, fn in enumerate(train_image_fns)
                   if folds[k] == config.fold]
  train_image_fns = [fn for k, fn in enumerate(train_image_fns)
                     if folds[k] != config.fold]
  # remove not-used files:
  # https://www.kaggle.com/c/siim-acr-pneumothorax-segmentation/discussion/98478#latest-572385  # noqa
  train_image_fns = [fn for fn in train_image_fns
                     if DicomDataset.fn_to_id(fn) in gt]
  val_image_fns = [fn for fn in val_image_fns
                   if DicomDataset.fn_to_id(fn) in gt]
  print("VAL: ", len(val_image_fns), val_image_fns[0])
  print("TRAIN: ", len(train_image_fns), train_image_fns[0])
  if config.submit_val:
    test_image_fns = val_image_fns
  test_ds = DicomDataset(test_image_fns, gt_rles=gt, height=config.height,
                         width=config.height)
  test_ds.cache()
  test_loader = td.DataLoader(test_ds, batch_size=config.batch_size,
                              shuffle=False, num_workers=0,
                              pin_memory=False, drop_last=False)
  model = FPNSegmentation(config.slug)
  print("Loading: %s" % config.weight)
  model.load_state_dict(th.load(config.weight))
  model = model.to(config.device).float()
  # model = apex.amp.initialize(model, opt_level="O1")
  model.eval()
  sub = create_submission(model, test_loader, config,
                          pred_zip=config.pred_zip, tta=False)
  sub.to_csv(config.submission_fn, index=False)
  print("Wrote to: %s" % config.submission_fn)
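# `load_gt` is imported from elsewhere in the repo. A minimal sketch of its
# apparent contract, inferred from the call sites: it maps ImageId to a list
# of RLE strings, and the official train CSV uses a leading-space
# ' EncodedPixels' column. The default value of `rle_key` and the strip are
# assumptions.
from collections import defaultdict

import pandas as pd


def load_gt_sketch(csv_fn, rle_key=' EncodedPixels'):
  df = pd.read_csv(csv_fn)
  gt = defaultdict(list)
  for img_id, rle in zip(df['ImageId'], df[rle_key]):
    gt[img_id].append(str(rle).strip())
  return gt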
import os
from collections import defaultdict

import numpy as np
import pandas as pd
from tqdm import tqdm

from data import DicomDataset

sub_fn = 'Both_SEG_logdir_037_f04/Both_SEG_logdir_037_f04/Both_SEG_sub_37_f04_VAL.csv'  # noqa
out_fn = os.path.join(os.path.dirname(sub_fn),
                      'MERGE_V3_' + os.path.basename(sub_fn))
sub = pd.read_csv(sub_fn)
sub_ = defaultdict(list)
for img_id, rle in zip(sub['ImageId'], sub['EncodedPixels']):
  sub_[img_id].append(rle)
sub = sub_
image_ids, sub_rles = [], []
for img_id, rles in tqdm(sub.items()):
  mask = np.zeros((1024, 1024), dtype='uint8')
  for rle in rles:
    if '-1' in rle:
      continue
    m = DicomDataset.rles_to_mask([rle])
    mask[m == 1] = 1
  image_ids.append(img_id)
  sub_rles.append(DicomDataset.mask_to_rle(mask * 255))
sub = pd.DataFrame({'ImageId': image_ids, 'EncodedPixels': sub_rles})
sub.to_csv(out_fn, index=False, columns=['ImageId', 'EncodedPixels'])
print("Wrote to: %s" % out_fn)
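# `DicomDataset.rles_to_mask` is the matching decoder from data.py (not
# shown). A plausible inverse of the encoder sketched earlier, assuming the
# same relative, transposed run-length layout; the `merge_masks` semantics
# (binary when merged, labels 1..N otherwise) are inferred from call sites
# such as `masks.max()` counting masks. Signature and defaults are
# assumptions.
import numpy as np


def rles_to_mask_sketch(rles, height=1024, width=1024, merge_masks=True):
  mask = np.zeros(height * width, dtype='uint8')
  for label, rle in enumerate(rles, 1):
    if '-1' in rle:
      continue
    counts = np.asarray(rle.split(), dtype=np.int64)
    pos = 0
    for start, length in zip(counts[0::2], counts[1::2]):
      pos += start  # starts are stored relative to the previous run's end
      mask[pos:pos + length] = 1 if merge_masks else label
      pos += length
  return mask.reshape(width, height).T  # undo the transposed scan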
def main(config):
  seed_all()
  os.makedirs('cache', exist_ok=True)
  os.makedirs(config.logdir, exist_ok=True)
  print("Logging to: %s" % config.logdir)
  src_files = sorted(glob('*.py'))
  for src_fn in src_files:
    dst_fn = os.path.join(config.logdir, src_fn)
    copyfile(src_fn, dst_fn)
  train_image_fns = sorted(glob(os.path.join(config.train_dir, '*/*/*.dcm')))
  test_image_fns = sorted(glob(os.path.join(config.test_dir, '*/*/*.dcm')))
  # assert len(train_image_fns) == 10712
  # assert len(test_image_fns) == 1377
  gt = load_gt(config.train_rle)
  # create folds
  np.random.shuffle(train_image_fns)
  if config.subset > 0:
    train_image_fns = train_image_fns[:config.subset]
  folds = np.arange(len(train_image_fns)) % config.num_folds
  val_image_fns = [fn for k, fn in enumerate(train_image_fns)
                   if folds[k] == config.fold]
  train_image_fns = [fn for k, fn in enumerate(train_image_fns)
                     if folds[k] != config.fold]
  # remove not-used files:
  # https://www.kaggle.com/c/siim-acr-pneumothorax-segmentation/discussion/98478#latest-572385  # noqa
  train_image_fns = [fn for fn in train_image_fns
                     if DicomDataset.fn_to_id(fn) in gt]
  val_image_fns = [fn for fn in val_image_fns
                   if DicomDataset.fn_to_id(fn) in gt]
  print("VAL: ", len(val_image_fns), os.path.basename(val_image_fns[0]))
  print("TRAIN: ", len(train_image_fns), os.path.basename(train_image_fns[0]))
  train_ds = DicomDataset(train_image_fns, gt_rles=gt, augment=True)
  val_ds = DicomDataset(val_image_fns, gt_rles=gt)
  if config.cache:
    train_ds.cache()
    val_ds.cache()
  val_loader = data.DataLoader(val_ds, batch_size=config.batch_size,
                               shuffle=False, num_workers=config.num_workers,
                               pin_memory=config.pin, drop_last=False)
  model = FPNSegmentation(config.slug, ema=config.ema)
  if config.weight is not None:
    print("Loading: %s" % config.weight)
    model.load_state_dict(th.load(config.weight))
  model = model.to(config.device)
  no_decay = ['mean', 'std', 'bias'] + ['.bn%d.' % i for i in range(100)]
  grouped_parameters = [
      {'params': [], 'weight_decay': config.weight_decay},
      {'params': [], 'weight_decay': 0.0}
  ]
  for n, p in model.named_parameters():
    if not any(nd in n for nd in no_decay):
      print("Decay: %s" % n)
      grouped_parameters[0]['params'].append(p)
    else:
      print("No Decay: %s" % n)
      grouped_parameters[1]['params'].append(p)
  optimizer = AdamW(grouped_parameters, lr=config.lr)
  if config.apex:
    model, optimizer = apex.amp.initialize(model, optimizer, opt_level="O1",
                                           verbosity=0)
  updates_per_epoch = len(train_ds) // config.batch_size
  num_updates = int(config.epochs * updates_per_epoch)
  scheduler = WarmupLinearSchedule(warmup=config.warmup, t_total=num_updates)

  # training loop
  smooth = 0.1
  best_dice = 0.0
  best_fn = None
  global_step = 0
  for epoch in range(1, config.epochs + 1):
    smooth_loss = None
    smooth_accuracy = None
    model.train()
    train_loader = data.DataLoader(train_ds, batch_size=config.batch_size,
                                   shuffle=True,
                                   num_workers=config.num_workers,
                                   pin_memory=config.pin, drop_last=True)
    progress = tqdm(total=len(train_ds), smoothing=0.01)
    for i, (X, _, y_true) in enumerate(train_loader):
      X = X.to(config.device).float()
      y_true = y_true.to(config.device)
      y_pred = model(X)
      loss = siim_loss(y_true, y_pred, weights=None)
      if config.apex:
        with apex.amp.scale_loss(loss, optimizer) as scaled_loss:
          scaled_loss.backward()
      else:
        loss.backward()
      lr_this_step = None
      if (i + 1) % config.accumulation_step == 0:
        optimizer.step()
        optimizer.zero_grad()
        lr_this_step = config.lr * scheduler.get_lr(global_step,
                                                    config.warmup)
        for param_group in optimizer.param_groups:
          param_group['lr'] = lr_this_step
        global_step += 1
      smooth_loss = loss.item() if smooth_loss is None else \
          smooth * loss.item() + (1. - smooth) * smooth_loss
      # print((y_true >= 0.5).sum().item())
      accuracy = th.mean(((y_pred >= 0.5) == (y_true == 1)).to(
          th.float)).item()
      smooth_accuracy = accuracy if smooth_accuracy is None else \
          smooth * accuracy + (1. - smooth) * smooth_accuracy
      progress.set_postfix(
          ep='%d/%d' % (epoch, config.epochs),
          loss='%.4f' % smooth_loss,
          accuracy='%.4f' % smooth_accuracy,
          lr='%.6f' % (config.lr if lr_this_step is None else lr_this_step))
      progress.update(len(X))
    if epoch <= 12:
      continue

    # validation loop
    model.eval()
    thresholds = [0.1, 0.2]
    dice_coeffs = [[] for _ in range(len(thresholds))]
    progress = tqdm(enumerate(val_loader), total=len(val_loader))
    with th.no_grad():
      for i, (X, _, y_trues) in progress:
        X = X.to(config.device).float()
        y_trues = y_trues.to(config.device)
        y_preds = model(X)
        y_preds_flip = th.flip(model(th.flip(X, (-1, ))), (-1, ))
        y_preds = 0.5 * (y_preds + y_preds_flip)
        y_trues = y_trues.cpu().numpy()
        y_preds = y_preds.cpu().numpy()
        for yt, yp in zip(y_trues, y_preds):
          yt = (yt.squeeze() >= 0.5).astype('uint8')
          yp = yp.squeeze()
          for dind, threshold in enumerate(thresholds):
            yp_ = (yp >= threshold).astype(np.uint8)
            sc = score(yt, yp_)
            dice_coeffs[dind].append(sc)
    best_threshold_ind = -1
    dice_coeff = -1
    for dind, threshold in enumerate(thresholds):
      dc = np.mean([x[0] for x in dice_coeffs[dind] if x[1] == 'non-empty'])
      # progress.write("Dice @%.2f: %.4f" % (threshold, dc))
      if dc > dice_coeff:
        dice_coeff = dc
        best_threshold_ind = dind
    dice_coeffs = dice_coeffs[best_threshold_ind]
    num_empty = sum(1 for x in dice_coeffs if x[1] == 'empty')
    num_total = len(dice_coeffs)
    num_non_empty = num_total - num_empty
    empty_sum = np.sum([d[0] for d in dice_coeffs if d[1] == 'empty'])
    non_empty_sum = np.sum([d[0] for d in dice_coeffs
                            if d[1] == 'non-empty'])
    dice_coeff_empty = empty_sum / num_empty
    dice_coeff_non_empty = non_empty_sum / num_non_empty
    progress.write('[Empty: %d]: %.3f | %.3f, [Non-Empty: %d]: %.3f | %.3f' % (
        num_empty, dice_coeff_empty, empty_sum / num_total,
        num_non_empty, dice_coeff_non_empty, non_empty_sum / num_total))
    dice_coeff = float(dice_coeff)
    summary_str = 'f%02d-ep-%04d-val_dice-%.4f@%.2f' % (
        config.fold, epoch, dice_coeff, thresholds[best_threshold_ind])
    progress.write(summary_str)
    if dice_coeff > best_dice:
      weight_fn = os.path.join(config.logdir, summary_str + '.pth')
      th.save(model.state_dict(), weight_fn)
      best_dice = dice_coeff
      best_fn = weight_fn
      fns = sorted(glob(os.path.join(config.logdir,
                                     'f%02d-*.pth' % config.fold)))
      for fn in fns[:-config.n_keep]:
        os.remove(fn)

  # create submission
  test_ds = DicomDataset(test_image_fns)
  test_loader = data.DataLoader(test_ds, batch_size=config.batch_size,
                                shuffle=False, num_workers=0,
                                pin_memory=False, drop_last=False)
  if best_fn is not None:
    model.load_state_dict(th.load(best_fn))
  model.eval()
  sub = create_submission(model, test_loader, config,
                          pred_zip=config.pred_zip)
  sub.to_csv(config.submission_fn, index=False)
  print("Wrote to: %s" % config.submission_fn)

  # create val submission
  val_fn = config.submission_fn.replace('.csv', '_VAL.csv')
  model.eval()
  sub = create_submission(model, val_loader, config,
                          pred_zip=config.pred_zip.replace('.zip', '_VAL.zip'))
  sub.to_csv(val_fn, index=False)
  print("Wrote to: %s" % val_fn)
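# Neither `score` nor `WarmupLinearSchedule` is defined in these excerpts;
# two minimal sketches of their apparent contracts, inferred from the call
# sites above. Names, signatures and the exact decay shape are assumptions.
import numpy as np


# The validation loop consumes a (dice, kind) tuple, where `kind` tags
# whether the *ground truth* is empty -- matching the competition metric,
# which awards 1.0 for an empty ground truth only when the prediction is
# empty too.
def score_sketch(y_true, y_pred):
  if y_true.sum() == 0:
    return (1.0 if y_pred.sum() == 0 else 0.0), 'empty'
  intersection = np.logical_and(y_true, y_pred).sum()
  return 2.0 * intersection / (y_true.sum() + y_pred.sum()), 'non-empty'


# `scheduler.get_lr(global_step, warmup)` is used as a multiplier on
# `config.lr`, suggesting a BERT-style linear warmup followed by linear
# decay over `t_total` updates:
class WarmupLinearScheduleSketch:
  def __init__(self, warmup, t_total):
    self.warmup = warmup
    self.t_total = t_total

  def get_lr(self, step, warmup=None):
    warmup = self.warmup if warmup is None else warmup
    x = step / self.t_total
    if x < warmup:
      return x / warmup
    return max(0.0, (1.0 - x) / (1.0 - warmup))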
  image_id = os.path.splitext(png)[0]
  p_ensemble = 0.0
  for handle in handels:
    with handle.open(png) as f:
      img = cv2.imdecode(np.frombuffer(f.read(), 'uint8'), 0)
    p = np.float32(img) / 255
    p_ensemble += p / len(fns)
  # classify
  p_reduced = reduction_fn(p_ensemble)
  if p_reduced < p_clf:
    image_ids.append(image_id)
    rles.append('-1')
    num_clf_empty += 1
    continue
  # segment
  mask = (p_ensemble > p_seg).astype('uint8')
  if mask.sum() == 0:
    image_ids.append(image_id)
    rles.append('-1')
    continue
  image_ids.append(image_id)
  rles.append(DicomDataset.mask_to_rle(mask * 255))

sub = pd.DataFrame({'ImageId': image_ids, 'EncodedPixels': rles})
sub.to_csv(out_fn, columns=['ImageId', 'EncodedPixels'], index=False)
print("Wrote to: %s" % out_fn)
print("Empty (clf): ", num_clf_empty)
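# `handels`, `fns`, `reduction_fn` and both thresholds are defined above
# this excerpt. Given that `reduction_fn(p_ensemble)` is compared against
# the scalar `p_clf`, it presumably collapses the averaged probability map
# to a single confidence value; a plausible stand-in (pure assumption):
def reduction_fn_sketch(p_ensemble):
  return p_ensemble.max()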
        train_image_fns_PA, train_image_fns_AP,
        test_image_fns_PA, test_image_fns_AP], f)
else:
  with open(cache_fn, 'rb') as f:
    train_image_fns_PA, train_image_fns_AP, test_image_fns_PA, \
        test_image_fns_AP = pickle.load(f)

gt = pd.read_csv('train-rle.csv')
# collapse multi-RLE rows to one row per image (keeps the first RLE)
gt = gt.groupby('ImageId', as_index=False).agg(lambda x: x.iloc[0])
train_image_fns_PA, train_image_fns_AP, test_image_fns_PA, \
    test_image_fns_AP = map(set, (train_image_fns_PA, train_image_fns_AP,
                                  test_image_fns_PA, test_image_fns_AP))
train_image_ids_PA, train_image_ids_AP, test_image_ids_PA, \
    test_image_ids_AP = map(
        lambda x: [DicomDataset.fn_to_id(ele) for ele in x],
        (train_image_fns_PA, train_image_fns_AP,
         test_image_fns_PA, test_image_fns_AP))
gt['View'] = gt['ImageId'].apply(
    lambda x: 'PA' if x in train_image_ids_PA else 'AP')
print(gt.groupby('View')[' EncodedPixels'].agg(lambda x: (x == ' -1').mean()))
print("MASK SAMPLES: ", gt.groupby('View')[' EncodedPixels'].agg(
    lambda x: (x != ' -1').sum()))
print("TRAIN: ", gt.groupby('View')[' EncodedPixels'].agg(lambda x: len(x)))
print("TEST PA: %d, AP: %d" % (len(test_image_fns_PA),
                               len(test_image_fns_AP)))
print("Train Fraction: %.3f" % (len(train_image_ids_AP) /
                                len(train_image_ids_PA)))
print("Test Fraction: %.3f" %
with open('current_ptx_hashes.p', 'rb') as f:
  hh = pickle.load(f)
preds = [pd.read_csv(fn) for fn in oof_fns]
gts = load_gt('train-rle.csv')
np.random.seed(123)
pred = pd.concat(preds)
pred = {k: v for k, v in zip(pred['ImageId'], pred['EncodedPixels'])}
train_image_fns = sorted(glob(os.path.join('dicom-images-train',
                                           '*/*/*.dcm')))
np.random.shuffle(train_image_fns)
num_fp, num_fp_in_ptx, num_fp_not_in_ptx = 0, 0, 0
image_ids, rles = [], []
num_missing = 0
for ind, fn in tqdm(enumerate(train_image_fns), total=len(train_image_fns)):
  img_id = DicomDataset.fn_to_id(fn)
  try:
    p = pred[img_id]
  except KeyError:
    num_missing += 1
    print(img_id)
    continue
  gt = gts[img_id]
  image_ids.append(img_id)
  rles.append(p)
  if len(gt) == 0:
    continue
  if '-1' not in p and '-1' in gt[0]:
    num_fp += 1
    is_in = False
    if hh[ind] in ptx_hashes:
import pandas as pd
import cv2
from matplotlib import pyplot as plt  # noqa

from data import DicomDataset

fn = 'submission.csv'
splitted_fn = 'SPLIT_' + fn
sub = pd.read_csv(fn)
img_ids, rles = [], []
for img_id, rle in zip(sub['ImageId'], sub['EncodedPixels']):
  if '-1' in rle:
    img_ids.append(img_id)
    rles.append(rle)
    continue
  mask = DicomDataset.rles_to_mask([rle])
  _, labels = cv2.connectedComponents(mask.astype('uint8'), connectivity=8)
  num_pred_masks = labels.max()
  for label_index in range(1, num_pred_masks + 1):
    m = (labels == label_index)
    rle = DicomDataset.mask_to_rle(m * 255)
    img_ids.append(img_id)
    rles.append(rle)

splitted_sub = pd.DataFrame({'ImageId': img_ids, 'EncodedPixels': rles})
splitted_sub.to_csv(splitted_fn, index=False,
                    columns=['ImageId', 'EncodedPixels'])
print("Wrote to: %s" % splitted_fn)
def compute_score(key):
  yt = DicomDataset.rles_to_mask(gt[key], merge_masks=True)
  yp = DicomDataset.rles_to_mask(submission[key], merge_masks=True)
  return score(yt, yp)
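# `gt` and `submission` are read from module scope here, presumably so the
# function can be mapped over image ids in parallel. A hypothetical driver
# (pool size, key order and the (dice, kind) return shape are assumptions):
from multiprocessing import Pool

if __name__ == '__main__':
  with Pool(8) as pool:
    results = pool.map(compute_score, sorted(gt))
  print("Mean dice: %.4f" % np.mean([r[0] for r in results]))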
                      'ADJUST_V2_' + os.path.basename(sub_fn))
sub = load_gt(sub_fn, rle_key='EncodedPixels')
val = '_VAL' in sub_fn
if val:
  ids = load_mask_counts('train-rle.csv')
else:
  ids = load_mask_counts('sample_submission.csv')
adjusted_sub = {'ImageId': [], 'EncodedPixels': []}
num_removed = 0
num_added = 0
num_missed = 0
for image_id in tqdm(sub):
  rles = sub[image_id]
  num_masks = ids.get(image_id, 1)
  masks = DicomDataset.rles_to_mask(rles, merge_masks=False)
  num_pred = masks.max()
  if num_pred > num_masks:
    sizes = np.float32([(masks == i).sum() for i in range(1, num_pred + 1)])
    inds = np.argsort(-sizes)[:num_masks]
    inds = [range(1, num_pred + 1)[ind] for ind in inds]
    num_removed += len(sizes) - len(inds)
    rles = []
    for ind in inds:
      rles.append(
          DicomDataset.mask_to_rle((masks == ind).astype('uint8') * 255))
  elif num_masks > num_pred and num_masks >= 2:
    if num_pred >= 1:
      sizes = np.float32([(masks == i).sum()
ptx_hashes = {v: k for k, v in nih.items()}
num_found = 0
sub_fn = 'x_ensemble/CLF_ADJUST_Both_ENS_V3_0049.csv'
sub = pd.read_csv(sub_fn)
sub = {k: v for k, v in zip(sub['ImageId'], sub['EncodedPixels'])}
offset = len(train_image_fns)
num_match = 0
num_not_in_nih_ptx = 0
num_ptx = 0
out_fn = sub_fn.replace('_V3', '_V4')
nih_ids, nih_rles = [], []
for k, h in tqdm(enumerate(hh[offset:]), total=len(hh[offset:])):
  fn = all_fns[k + offset]
  img_id = DicomDataset.fn_to_id(fn)
  p = sub[img_id]
  nih_ids.append(img_id)
  if h in ptx_hashes:
    nih_rles.append(p)
    num_ptx += 1
    if '-1' not in p:
      num_match += 1
      print("%s -> %s" % (fn, ptx_hashes[h]))
      if plot:
        img = pydicom.read_file(fn).pixel_array
        plt.imshow(img, cmap='bone')
        plt.show()
  else:
    if '-1' not in p:
def main():
  parser = argparse.ArgumentParser()
  parser.add_argument('--fn', type=str)
  parser.add_argument('--show-empty', action='store_true')
  parser.add_argument('--seed', type=int, default=32)
  parser.add_argument('--height', type=int, default=1024)
  args = parser.parse_args()
  sub = pd.read_csv(args.fn)
  np.random.seed(args.seed)
  if ' EncodedPixels' in sub.columns:
    sub['EncodedPixels'] = sub[' EncodedPixels']
    sub = sub[['ImageId', 'EncodedPixels']]
  sub['EncodedPixels'] = sub['EncodedPixels'].apply(
      lambda x: x if x != ' -1' else '-1')
  gt = load_gt('train-rle.csv')
  pred_gt = load_gt('sub_8730.csv', rle_key='EncodedPixels')
  for k, v in pred_gt.items():
    gt[k] = v
  train_fns = sorted(glob('dicom-images-train/*/*/*.dcm'))
  test_fns = sorted(glob('dicom-images-test/*/*/*.dcm'))
  all_fns = train_fns + test_fns
  id_to_fn = {DicomDataset.fn_to_id(fn): fn for fn in all_fns}
  sub_ = defaultdict(list)
  for iid, rle in zip(sub['ImageId'], sub['EncodedPixels']):
    sub_[iid].append(rle)
  sub = sub_
  num_mask = sum(1 for k, v in sub.items() if v[0] != '-1')
  num_one_mask = sum(1 for k, v in sub.items()
                     if v[0] != '-1' and len(v) == 1)
  num_more_mask = sum(1 for k, v in sub.items()
                      if v[0] != '-1' and len(v) >= 2)
  print("%d of %d have a mask" % (num_mask, len(sub)))
  print("%d have 1, %d 2 or more" % (num_one_mask, num_more_mask))
  img_ids = sorted(sub.keys())
  np.random.shuffle(img_ids)
  for img_id in img_ids:
    img_fn = id_to_fn[img_id]
    rles = sub[img_id]
    if not args.show_empty:
      if rles[0] == '-1':
        continue
    print("%d masks" % len(rles))
    dcm = pydicom.dcmread(img_fn)
    view = dcm.ViewPosition
    print(view)
    img = dcm.pixel_array
    mask = DicomDataset.rles_to_mask(rles, merge_masks=False)
    if args.height != 1024:
      img = cv2.resize(img, (args.height, args.height),
                       interpolation=cv2.INTER_NEAREST)
      mask = cv2.resize(mask, (args.height, args.height),
                        interpolation=cv2.INTER_NEAREST)
    gt_mask = None
    if img_id in gt:
      gt_rles = gt[img_id]
      gt_mask = DicomDataset.rles_to_mask(gt_rles, merge_masks=False)
      gt_mask = cv2.resize(gt_mask, (args.height, args.height),
                           interpolation=cv2.INTER_NEAREST)
      if gt_mask.max() == 0:
        continue
    # for j in range(0, 512, 16):
    #   img[:, j] = 255
    #   img[j, :] = 255
    #   mask[:, j] = mask.max()
    #   mask[j, :] = mask.max()
    nc = 2 if gt_mask is None else 3
    plt.subplot(1, nc, 1)
    plt.title(os.path.splitext(img_id)[-1])
    plt.imshow(img, cmap='bone')
    plt.axis('off')
    plt.subplot(1, nc, 2)
    plt.title('PRED: ' + str(mask.max()))
    plt.imshow(mask, cmap='bone', alpha=0.4)
    plt.axis('off')
    if gt_mask is not None:
      vis = np.dstack([img.copy()] * 3)
      vis[gt_mask > 0] = (0, 255, 0)
      vis[mask > 0] = 0.3 * vis[mask > 0] + 0.7 * np.float32([255, 0, 0])
      plt.subplot(1, nc, 3)
      plt.title('%sGT: ' % ('PRED-' if img_id in pred_gt else 'GT-') +
                str(gt_mask.max()))
      plt.imshow(vis, cmap='bone')
      plt.axis('off')
    plt.show()
    mask = cv2.resize(mask, (1024, 1024), interpolation=cv2.INTER_NEAREST)
  if passed:
    current_p_seg = p_seg
    while mask.sum() == 0 and current_p_seg > 0:
      current_p_seg -= 0.1
      print("Lowering p_seg to: %.2f" % current_p_seg)
      mask = (p_ensemble > current_p_seg).astype('uint8')
    if current_p_seg <= 0:
      still_not_passed += 1
      print("Still not passed ... ")
  if mask.sum() == 0:
    image_ids.append(image_id)
    rles.append('-1')
    continue
  labels = cv2.connectedComponents(mask, connectivity=8)[1]
  num_pred_masks = labels.max()
  for ind in range(1, num_pred_masks + 1):
    m = (labels == ind).astype('uint8')
    rle = DicomDataset.mask_to_rle(m * 255)
    image_ids.append(image_id)
    rles.append(rle)

sub = pd.DataFrame({'ImageId': image_ids, 'EncodedPixels': rles})
sub.to_csv(out_fn, columns=['ImageId', 'EncodedPixels'], index=False)
print("Wrote to: %s" % out_fn)
if clf_adjust:
  print("%d (could) have passed classifier, %d did" % (
      num_passed, num_passed - still_not_passed))