def main():
    """Build per-8x8-block change masks between Cover and stego images.

    For every cover image, decodes the BGR pixels from the stored DCT (.npz)
    files of the cover and the three stego variants (JMiPOD, JUNIWARD, UERD),
    marks each 8x8 JPEG block that differs from the cover, and saves the
    boolean masks as .png files next to the originals. The cover mask is the
    union of the three per-method masks.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-dd", "--data-dir", type=str, default=os.environ.get("KAGGLE_2020_ALASKA2"))
    args = parser.parse_args()
    data_dir = args.data_dir

    cover = fs.find_images_in_dir(os.path.join(data_dir, "Cover"))
    jimi = fs.find_images_in_dir(os.path.join(data_dir, "JMiPOD"))
    juni = fs.find_images_in_dir(os.path.join(data_dir, "JUNIWARD"))
    uerd = fs.find_images_in_dir(os.path.join(data_dir, "UERD"))

    for cover_fname, jimi_fname, juni_fname, uerd_fname in zip(tqdm(cover), jimi, juni, uerd):
        # BUGFIX (shadowing): the original rebound the filename lists
        # (`cover = decode_bgr_from_dct(...)`) inside the loop; iteration only
        # survived because zip() already held references. Use distinct names.
        cover_img = decode_bgr_from_dct(fs.change_extension(cover_fname, ".npz"))
        jimi_img = decode_bgr_from_dct(fs.change_extension(jimi_fname, ".npz"))
        juni_img = decode_bgr_from_dct(fs.change_extension(juni_fname, ".npz"))
        uerd_img = decode_bgr_from_dct(fs.change_extension(uerd_fname, ".npz"))

        # A block is "changed" if any pixel of any channel differs within it.
        jimi_mask = block8_sum(np.abs(cover_img - jimi_img).sum(axis=2)) > 0
        juni_mask = block8_sum(np.abs(cover_img - juni_img).sum(axis=2)) > 0
        uerd_mask = block8_sum(np.abs(cover_img - uerd_img).sum(axis=2)) > 0
        cover_mask = jimi_mask | juni_mask | uerd_mask

        cv2.imwrite(fs.change_extension(cover_fname, ".png"), cover_mask * 255)
        cv2.imwrite(fs.change_extension(jimi_fname, ".png"), jimi_mask * 255)
        cv2.imwrite(fs.change_extension(juni_fname, ".png"), juni_mask * 255)
        cv2.imwrite(fs.change_extension(uerd_fname, ".png"), uerd_mask * 255)
def main():
    """Compute and print per-channel mean/std over all ALASKA2 image folders."""
    parser = argparse.ArgumentParser()
    parser.add_argument("-dd", "--data-dir", type=str, default=os.environ.get("KAGGLE_2020_ALASKA2"))
    args = parser.parse_args()

    # Statistics are accumulated over cover and all three stego variants.
    dataset = []
    for folder in ("Cover", "JMiPOD", "JUNIWARD", "UERD"):
        dataset += fs.find_images_in_dir(os.path.join(args.data_dir, folder))

    mean, std = compute_mean_std(tqdm(dataset))
    print(mean.size())
    print(std.size())
    print("Mean", np.array2string(to_numpy(mean), precision=2, separator=",", max_line_width=119))
    print("Std ", np.array2string(to_numpy(std), precision=2, separator=",", max_line_width=119))
def main():
    """Assign cyclic fold indices to Cover images and write them to folds.csv.

    Images are first sorted by file size so that every fold receives a
    comparable size distribution.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-dd", "--data-dir", type=str, default=os.environ.get("KAGGLE_2020_ALASKA2"))
    args = parser.parse_args()

    cover_images = np.array(fs.find_images_in_dir(os.path.join(args.data_dir, "Cover")))
    file_sizes = np.array([os.stat(fname).st_size for fname in cover_images])
    cover_images = cover_images[np.argsort(file_sizes)]

    num_folds = 4
    # Cyclic assignment 0,1,2,3,0,1,... over the size-sorted images.
    folds_lut = np.arange(len(cover_images)) % num_folds

    df = pd.DataFrame.from_dict({
        INPUT_IMAGE_ID_KEY: [os.path.basename(x) for x in cover_images],
        INPUT_FOLD_KEY: folds_lut,
    })
    df.to_csv("folds.csv", index=False)
def main():
    """Collect JPEG quality factor and quantization tables for every Test image.

    Reads the pre-extracted DCT .npz next to each image and writes one CSV row
    per image with its id, estimated quality, both quantization matrices and
    the file size.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-dd", "--data-dir", type=str, default=os.environ.get("KAGGLE_2020_ALASKA2"))
    args = parser.parse_args()

    dataset = fs.find_images_in_dir(os.path.join(args.data_dir, "Test"))

    rows = defaultdict(list)
    for image_fname in tqdm(dataset):
        dct_data = np.load(fs.change_extension(image_fname, ".npz"))
        qm0 = dct_data["qm0"]
        qm1 = dct_data["qm1"]

        rows["image_id"].append(os.path.basename(image_fname))
        # quality_factror_from_qm: project helper (name typo is upstream)
        rows["quality"].append(quality_factror_from_qm(qm0))
        rows["qm0"].append(qm0.flatten().tolist())
        rows["qm1"].append(qm1.flatten().tolist())
        rows["file_size"].append(os.stat(image_fname).st_size)

    pd.DataFrame.from_dict(rows).to_csv("test_dataset_qf_qt.csv", index=False)
def main():
    """Compute per-image embedding statistics for Cover JPEGs in parallel.

    Runs `compute_statistics` over every .jpg in <data-dir>/Cover using a
    process pool and concatenates the per-image results into
    analyze_embeddings.csv.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-dd", "--data-dir", type=str, default=os.environ.get("KAGGLE_2020_ALASKA2"))
    args = parser.parse_args()

    cover_images = [
        x for x in fs.find_images_in_dir(os.path.join(args.data_dir, "Cover"))
        if x.endswith(".jpg")  # idiomatic form of str.endswith(x, ".jpg")
    ]

    results_df = []
    # BUGFIX: the original never closed the pool; the context manager
    # guarantees worker processes are torn down.
    with Pool(6) as pool:
        for df in tqdm(pool.imap(compute_statistics, cover_images), total=len(cover_images)):
            results_df.append(df)

    results_df = pd.concat([pd.DataFrame.from_dict(x) for x in results_df])
    results_df.to_csv("analyze_embeddings.csv", index=False)
def convert_dir(input_dir, output_dir, image_size=768, workers=32):
    """Preprocess every image found in `input_dir` into `output_dir`.

    :param input_dir: Directory scanned for source images
    :param output_dir: Destination directory (created if missing)
    :param image_size: Target size forwarded to `preprocess`
    :param workers: Number of worker processes in the pool
    """
    os.makedirs(output_dir, exist_ok=True)
    images = fs.find_images_in_dir(input_dir)
    worker_fn = partial(preprocess, output_dir=output_dir, image_size=image_size)
    with Pool(workers) as pool:
        # Drain the iterator for its side effects; tqdm shows progress.
        for _ in tqdm(pool.imap_unordered(worker_fn, images), total=len(images)):
            pass
def main():
    """Train the CannyModel edge detector on ILSVRC2013 DET validation images.

    Uses a 90/10 train/valid split, Adam + MultiStepLR, focal loss, and a
    Catalyst SupervisedRunner with Jaccard as the main metric.
    """
    images_dir = 'c:\\datasets\\ILSVRC2013_DET_val'

    canny_cnn = maybe_cuda(CannyModel())
    optimizer = Adam(canny_cnn.parameters(), lr=1e-4)

    images = find_images_in_dir(images_dir)
    train_images, valid_images = train_test_split(images, test_size=0.1, random_state=1234)

    num_workers = 6
    num_epochs = 100
    batch_size = 16

    # Debug switch for a quick smoke test on a tiny subset
    # (was an opaque dead `if False:` block).
    fast_dev_run = False
    if fast_dev_run:
        train_images = train_images[:batch_size * 4]
        valid_images = valid_images[:batch_size * 4]

    train_loader = DataLoader(EdgesDataset(train_images),
                              batch_size=batch_size,
                              num_workers=num_workers,
                              shuffle=True,
                              drop_last=True,
                              pin_memory=True)
    valid_loader = DataLoader(EdgesDataset(valid_images),
                              batch_size=batch_size,
                              num_workers=num_workers,
                              pin_memory=True)

    loaders = collections.OrderedDict()
    loaders["train"] = train_loader
    loaders["valid"] = valid_loader

    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[10, 20, 40], gamma=0.3)

    runner = SupervisedRunner()
    runner.train(
        model=canny_cnn,
        criterion=FocalLoss(),
        optimizer=optimizer,
        scheduler=scheduler,
        callbacks=[
            JaccardCallback(),
            ShowPolarBatchesCallback(visualize_canny_predictions, metric='jaccard', minimize=False),
            # Stop when Jaccard fails to improve by 0.01 for 5 epochs
            EarlyStoppingCallback(patience=5, min_delta=0.01, metric='jaccard', minimize=False),
        ],
        loaders=loaders,
        logdir='logs',
        num_epochs=num_epochs,
        verbose=True,
        main_metric='jaccard',
        minimize_metric=False
    )
def cut_test_dataset_in_patches(data_dir, tile_size, tile_step, image_margin):
    """Slice every test image into overlapping tiles and index them.

    Tiles are written to <data_dir>/test_tiles/images by `split_image`.

    :return: DataFrame with columns 'image' (tile path) and 'image_id'
        (id of the source image each tile came from)
    """
    test_images = fs.find_images_in_dir(os.path.join(data_dir, "test", "images"))
    tiles_dir = os.path.join(data_dir, "test_tiles", "images")

    records = defaultdict(list)
    for image_fname in tqdm(test_images, total=len(test_images), desc="test_imgs"):
        tiles = split_image(image_fname, tiles_dir, tile_size, tile_step, image_margin)
        records["image"].extend(tiles)
        records["image_id"].extend([fs.id_from_fname(image_fname)] * len(tiles))

    return pd.DataFrame.from_dict(records)
def get_pseudolabeling_dataset(data_dir: str,
                               include_masks: bool,
                               image_size=(224, 224),
                               augmentation=None,
                               need_weight_mask=False):
    """Build a pseudo-labeling dataset from the pre-cut test tiles.

    :param data_dir: Dataset root containing test_tiles/{images,masks}
    :param include_masks: If True, pair each tile with its pseudo-label mask
        (masks dir is created if missing; mask files may not exist yet)
    :param image_size: Crop size used by the augmentation pipeline
    :param augmentation: One of 'hard'/'medium'/'light'; anything else
        results in normalization only
    :param need_weight_mask: Forwarded to InriaImageMaskDataset
    :return: InriaImageMaskDataset over the test tiles
    """
    images = fs.find_images_in_dir(os.path.join(data_dir, "test_tiles", "images"))

    masks_dir = os.path.join(data_dir, "test_tiles", "masks")
    os.makedirs(masks_dir, exist_ok=True)
    masks = [
        os.path.join(masks_dir, fs.id_from_fname(image_fname) + ".png")
        for image_fname in images
    ]

    # Dispatch table replaces the if/elif chain; also fixes the `transfrom` typo.
    augmentations = {
        "hard": hard_augmentations,
        "medium": medium_augmentations,
        "light": light_augmentations,
    }
    if augmentation in augmentations:
        transform = A.Compose(
            [crop_transform(image_size, input_size=768), augmentations[augmentation]()])
    else:
        transform = A.Normalize()

    return InriaImageMaskDataset(
        images,
        masks if include_masks else None,
        transform=transform,
        image_loader=read_inria_image,
        mask_loader=read_inria_mask_with_pseudolabel,
        need_weight_mask=need_weight_mask,
    )
def main():
    """Extract DCT coefficients (via jpegio) for ALASKA2 images in parallel.

    By default processes every folder (Cover, the three stego variants, Test);
    --folder restricts to a single folder and --part 0/1 processes only the
    first or second half of the file list (for splitting work across machines).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-dd", "--data-dir", type=str, default=os.environ.get("KAGGLE_2020_ALASKA2"))
    parser.add_argument("-od", "--output-dir", type=str, default=os.environ.get("KAGGLE_2020_ALASKA2"))
    parser.add_argument("-f", "--folder", type=str, default=None)
    parser.add_argument("-p", "--part", type=int, default=None)
    args = parser.parse_args()
    data_dir = args.data_dir

    if args.folder is None:
        original_images = []
        for folder in ("Cover", "JMiPOD", "JUNIWARD", "UERD", "Test"):
            original_images += fs.find_images_in_dir(os.path.join(data_dir, folder))
    else:
        original_images = fs.find_images_in_dir(os.path.join(data_dir, args.folder))

    if args.part is not None:
        half = len(original_images) // 2
        if args.part == 0:
            original_images = original_images[:half]
            print("First half")
        else:
            original_images = original_images[half:]
            print("Second half")

    print(original_images[0])

    os.makedirs(args.output_dir, exist_ok=True)
    process_fn = partial(extract_and_save_dct_jpegio, output_dir=args.output_dir)
    with Pool(16) as wp:
        for _ in tqdm(wp.imap_unordered(process_fn, original_images), total=len(original_images)):
            pass
def get_xview2_extra_dataset(
        data_dir: str,
        image_size=(224, 224),
        augmentation="hard",
        need_weight_mask=False,
        fast=False) -> Tuple[Dataset, WeightedRandomSampler]:
    """
    Create additional train dataset using xView2 dataset.

    Only pre-disaster images/masks (filenames containing "_pre_") from the
    `train` and `tier3` splits are used.

    :param data_dir: xView2 dataset directory
    :param fast: Fast training mode. Uses only the first 128 images/masks per
        split and caps the number of sampled crops at 128
    :param image_size: Size of image crops during training & validation
    :param need_weight_mask: If True, adds 'edge' target mask
    :param augmentation: Type of image augmentations to use:
        'hard'/'medium'/'light'/'safe'; any other value disables augmentations
    :return: (trainset, train_sampler). NOTE(review): the original docstring
        claimed (train_loader, valid_loader); the code actually returns the
        dataset plus a WeightedRandomSampler — and the sampler slot is None
        when fast=True.
    """
    if augmentation == "hard":
        train_transform = hard_augmentations()
    elif augmentation == "medium":
        train_transform = medium_augmentations()
    elif augmentation == "light":
        train_transform = light_augmentations()
    elif augmentation == "safe":
        train_transform = safe_augmentations()
    else:
        train_transform = []

    def is_pre_image(fname):
        # xView2 names files *_pre_* / *_post_*; keep pre-disaster only.
        return "_pre_" in fname

    train1_img = list(
        filter(
            is_pre_image,
            fs.find_images_in_dir(os.path.join(data_dir, "train", "images"))))
    train1_msk = list(
        filter(is_pre_image,
               fs.find_images_in_dir(os.path.join(data_dir, "train", "masks"))))
    train2_img = list(
        filter(
            is_pre_image,
            fs.find_images_in_dir(os.path.join(data_dir, "tier3", "images"))))
    train2_msk = list(
        filter(is_pre_image,
               fs.find_images_in_dir(os.path.join(data_dir, "tier3", "masks"))))

    if fast:
        train1_img = train1_img[:128]
        train1_msk = train1_msk[:128]
        train2_img = train2_img[:128]
        train2_msk = train2_msk[:128]

    # Crop first, then apply the augmentation pipeline selected above.
    # NOTE(review): when no augmentation matched, `train_transform` is a plain
    # list here — presumably A.Compose accepts that; verify against the
    # albumentations version in use.
    train_transform = A.Compose(
        [crop_transform_xview2(image_size, input_size=1024), train_transform])

    trainset = InriaImageMaskDataset(
        image_filenames=train1_img + train2_img,
        mask_filenames=train1_msk + train2_msk,
        transform=train_transform,
        mask_loader=read_xview_mask,
        need_weight_mask=need_weight_mask,
    )

    # Oversample so one epoch covers roughly every crop of each 1024x1024 image.
    num_train_samples = int(
        len(trainset) * (1024 * 1024) / (image_size[0] * image_size[1]))
    crops_in_image = (1024 * 1024) / (image_size[0] * image_size[1])
    if fast:
        num_train_samples = 128

    # Uniform weights scaled by crops-per-image; sampler draws with replacement.
    train_sampler = WeightedRandomSampler(
        torch.ones(len(trainset)) * crops_in_image, num_train_samples)

    return trainset, None if fast else train_sampler
def get_datasets(
        data_dir='data',
        image_size=(512, 512),
        augmentation='medium',
        preprocessing=None,
        use_aptos2019=True,
        use_aptos2019_test_pl1=False,
        use_aptos2015_pl1=False,
        use_aptos2015=False,
        use_aptos2015_test_private=False,
        use_idrid=False,
        use_messidor=False,
        use_messidor2_pl1=False,
        use_unsupervised=False,
        target_dtype=int,
        random_state=42,
        coarse_grading=False,
        fold=None,
        folds=4) -> Tuple[RetinopathyDataset, RetinopathyDataset, List]:
    """Assemble train/valid retinopathy datasets from the selected sources.

    Each enabled source is split into train/valid folds and merged into a
    single train and a single valid set. Pseudo-labeled sources drop rows
    whose diagnosis is the -100 "unconfident" marker. With use_unsupervised,
    extra unlabeled images (target UNLABELED_CLASS) are appended to the train
    set and a RetinopathyDatasetV2 is returned for training.

    :param coarse_grading: Collapse grades {1,2,3} into a single class
    :param fold: Fold index for validation (None = default split)
    :param folds: Total number of folds
    :return: (train_dataset, valid_dataset, trainset_sizes) where
        trainset_sizes holds the number of training samples contributed by
        each source, in the order they were added
    """
    assert use_aptos2019 or use_aptos2015 or use_aptos2015_test_private or use_idrid or use_messidor
    assert not (use_aptos2015 and use_aptos2015_pl1)

    trainset_sizes = []
    data_split = [], [], [], []

    aptos2019_dir = os.path.join(data_dir, 'aptos-2019')
    aptos2015_dir = os.path.join(data_dir, 'aptos-2015')

    if use_aptos2019:
        x, y = get_aptos2019_train(aptos2019_dir)
        split = split_train_valid(x, y, fold=fold, folds=folds, random_state=random_state)
        data_split = append_train_test(data_split, split)
        # BUGFIX: record the *count*; original appended the array itself
        trainset_sizes.append(len(split[0]))

    if use_aptos2015_pl1:
        # Training part of aptos2015 with round-1 pseudolabels; -100 marks
        # unconfident labels which are dropped.
        aptos2015_train_pseudolabel_round_1 = pd.read_csv(
            os.path.join(aptos2015_dir, 'aptos2015_train_pseudolabel_round_1.csv'))
        aptos2015_train_pseudolabel_round_1 = aptos2015_train_pseudolabel_round_1[
            aptos2015_train_pseudolabel_round_1['diagnosis'] != -100]
        x = np.array(aptos2015_train_pseudolabel_round_1['id_code'].apply(
            lambda x: os.path.join(aptos2015_dir, 'train_images_768', f'{x}.jpeg')))
        y = np.array(aptos2015_train_pseudolabel_round_1['diagnosis'], dtype=int)

        # For training part of aptos2015 - add it conventionally
        split = split_train_valid(x, y, fold=fold, folds=folds, random_state=random_state)
        data_split = append_train_test(data_split, split)
        trainset_sizes.append(len(split[0]))

        # For public test validation data add only unhealthy samples to train set
        aptos2015_test_public_pl1 = pd.read_csv(
            os.path.join(aptos2015_dir, 'aptos2015_test_public_pseudolabel_round_1.csv'))
        aptos2015_test_public_pl1 = aptos2015_test_public_pl1[
            aptos2015_test_public_pl1['diagnosis'] != -100]
        x = np.array(aptos2015_test_public_pl1['id_code'].apply(
            lambda x: os.path.join(aptos2015_dir, 'test_images_768', f'{x}.jpeg')))
        y = np.array(aptos2015_test_public_pl1['diagnosis'], dtype=int)

        # For pseudolabeled data, we add only one fold of it to clear training data.
        # From the test set keep only unhealthy samples in the train part.
        train_x, valid_x, train_y, valid_y = split_train_valid(
            x, y, fold=fold, folds=folds, random_state=random_state)
        train_x = train_x[train_y > 0]
        train_y = train_y[train_y > 0]
        split = train_x, valid_x, train_y, valid_y
        data_split = append_train_test(data_split, split)
        # BUGFIX: original appended train_x[0] (the first filename), not the count
        trainset_sizes.append(len(train_x))

        # Add Aptos2015 private test to validation set entirely
        aptos2015_test_private_pl1 = pd.read_csv(
            os.path.join(aptos2015_dir, 'aptos2015_test_private_pseudolabel_round_1.csv'))
        aptos2015_test_private_pl1 = aptos2015_test_private_pl1[
            aptos2015_test_private_pl1['diagnosis'] != -100]
        x = np.array(aptos2015_test_private_pl1['id_code'].apply(
            lambda x: os.path.join(aptos2015_dir, 'test_images_768', f'{x}.jpeg')))
        y = np.array(aptos2015_test_private_pl1['diagnosis'], dtype=int)
        # From test set add only unhealthy
        x = x[y > 0]
        y = y[y > 0]
        data_split = append_train_test(data_split, ([], x, [], y))

    if use_messidor2_pl1:
        messidor2_dir = os.path.join(data_dir, 'messidor_2')
        messidor2_pseudolabel_round_1 = pd.read_csv(
            os.path.join(messidor2_dir, 'train_labels_pseudolabel_round_1.csv'))
        confident_labels_mask = messidor2_pseudolabel_round_1['diagnosis'] != -100
        messidor2_pseudolabel_round_1 = messidor2_pseudolabel_round_1[confident_labels_mask]
        x = np.array(messidor2_pseudolabel_round_1['id_code'].apply(
            lambda x: os.path.join(messidor2_dir, 'train_images_768', f'{x}.png')))
        y = np.array(messidor2_pseudolabel_round_1['diagnosis'], dtype=int)
        split = split_train_valid(x, y, fold=fold, folds=folds, random_state=random_state)
        data_split = append_train_test(data_split, split)
        trainset_sizes.append(len(split[0]))

    if use_aptos2015:
        x, y = get_aptos2015_train(aptos2015_dir, healthy_eye_fraction=0.2)
        split = split_train_valid(x, y, fold=fold, folds=folds, random_state=random_state)
        data_split = append_train_test(data_split, split)
        trainset_sizes.append(len(split[0]))

    if use_aptos2015_test_private:
        x, y = get_aptos2015_test_private(aptos2015_dir, healthy_eye_fraction=0.2)
        split = split_train_valid(x, y, fold=fold, folds=folds, random_state=random_state)
        data_split = append_train_test(data_split, split)
        trainset_sizes.append(len(split[0]))

    if use_idrid:
        x, y = get_idrid_train(os.path.join(data_dir, 'idrid'))
        split = split_train_valid(x, y, fold=fold, folds=folds, random_state=random_state)
        data_split = append_train_test(data_split, split)
        trainset_sizes.append(len(split[0]))

    if use_messidor:
        x, y = get_messidor(os.path.join(data_dir, 'messidor'), include_grade_3=False)
        split = split_train_valid(x, y, fold=fold, folds=folds, random_state=random_state)
        data_split = append_train_test(data_split, split)
        trainset_sizes.append(len(split[0]))

    # NOTE(review): split_train_valid yields (train_x, valid_x, train_y, valid_y)
    # while data_split is unpacked as (train_x, train_y, valid_x, valid_y);
    # presumably append_train_test reorders — verify against its definition.
    train_x, train_y, valid_x, valid_y = data_split

    if use_idrid:
        # Regardless of used datasets let's use some data from validation (holdout)
        data_idrid_test = get_idrid_test(os.path.join(data_dir, 'idrid'))
        valid_x.extend(data_idrid_test[0])
        valid_y.extend(data_idrid_test[1])

    if use_aptos2015:
        data_aptos15_public = get_aptos2015_test_public(aptos2015_dir, healthy_eye_fraction=0.1)
        valid_x.extend(data_aptos15_public[0])
        valid_y.extend(data_aptos15_public[1])

    train_transform = get_train_transform(image_size,
                                          augmentation=augmentation,
                                          preprocessing=preprocessing,
                                          crop_black=False)
    valid_transform = get_test_transform(image_size,
                                         preprocessing=preprocessing,
                                         crop_black=False)

    if coarse_grading:
        assert not use_unsupervised
        # Collapse grades: 0 = healthy, {1,2,3} -> 1, 4 -> 2
        coarse_grading_map = np.array([0, 1, 1, 1, 2])
        train_y = coarse_grading_map[np.array(train_y)]
        valid_y = coarse_grading_map[np.array(valid_y)]

    print('Train', count_targets(train_y), "Valid", count_targets(valid_y))

    if use_unsupervised:
        aptos2019, _ = get_aptos2019_test(aptos2019_dir)
        print('Adding', len(aptos2019), 'unlabeled samples from aptos2019 (test)')

        diaretdb0_v_1_1 = fs.find_images_in_dir(
            os.path.join(data_dir, 'diaretdb0_v_1_1', 'train_images_768'))
        print('Adding', len(diaretdb0_v_1_1), 'unlabeled samples from diaretdb0_v_1_1')

        diaretdb1_v_1_1 = fs.find_images_in_dir(
            os.path.join(data_dir, 'diaretdb1_v_1_1', 'train_images_768'))
        print('Adding', len(diaretdb1_v_1_1), 'unlabeled samples from diaretdb1_v_1_1')

        origa1 = fs.find_images_in_dir(os.path.join(data_dir, 'origa', 'glaucoma_768'))
        print('Adding', len(origa1), 'unlabeled samples from origa1')

        origa2 = fs.find_images_in_dir(os.path.join(data_dir, 'origa', 'sanas_768'))
        print('Adding', len(origa2), 'unlabeled samples from origa2')

        stare = fs.find_images_in_dir(os.path.join(data_dir, 'stare', 'train_images_768'))
        print('Adding', len(stare), 'unlabeled samples from stare')

        unlabeled_samples = diaretdb0_v_1_1 + diaretdb1_v_1_1 + stare + origa1 + origa2 + aptos2019.tolist()

        if not use_messidor:
            messidor = fs.find_images_in_dir(os.path.join(data_dir, 'messidor', 'train_images_768'))
            unlabeled_samples += messidor
            print('Adding', len(messidor), 'unlabeled samples from Messidor')

        if not use_aptos2015:
            dataset_dir = os.path.join(data_dir, 'aptos-2015')
            x, y = get_aptos2015_train(dataset_dir, healthy_eye_fraction=0.1)
            unlabeled_samples += x.tolist()
            print('Adding', len(x), 'unlabeled samples from Aptos 2015')

        if not use_aptos2015_test_private:
            dataset_dir = os.path.join(data_dir, 'aptos-2015')
            x, y = get_aptos2015_test_private(dataset_dir, healthy_eye_fraction=0.1)
            unlabeled_samples += x.tolist()
            print('Adding', len(x), 'unlabeled samples from Aptos 2015 Test (Private)')

        unlabeled_targets = [UNLABELED_CLASS] * len(unlabeled_samples)
        print('Using', len(unlabeled_samples), 'unlabeled samples')

        train_x.extend(unlabeled_samples)
        train_y.extend(unlabeled_targets)

        train_ds = RetinopathyDatasetV2(train_x, train_y,
                                        transform=train_transform,
                                        normalize=valid_transform,
                                        dtype=target_dtype)
        trainset_sizes.append(len(unlabeled_samples))
    else:
        train_ds = RetinopathyDataset(train_x, train_y,
                                      transform=train_transform,
                                      dtype=target_dtype)

    valid_ds = RetinopathyDataset(valid_x, valid_y,
                                  transform=valid_transform,
                                  dtype=target_dtype)
    return train_ds, valid_ds, trainset_sizes
def main():
    """Print YCbCr mean/std statistics over the first 500 ALASKA2 cover images."""
    cover_images = fs.find_images_in_dir("/home/bloodaxe/datasets/ALASKA2/Cover")[:500]
    print("YCbCr", compute_mean_std(tqdm(cover_images)))
import numpy as np
import cv2
from pytorch_toolbelt.utils import fs
from tqdm import tqdm


def compute_mean_std(dataset, read_image=cv2.imread):
    """Compute per-channel mean and standard deviation over a set of images.

    Per-image channel means and variances are averaged across the dataset
    (https://stats.stackexchange.com/questions/25848/how-to-sum-a-standard-deviation);
    pixel values are scaled to [0, 1] by dividing by 255.

    :param dataset: Iterable of image filenames
    :param read_image: Callable mapping a filename to an HxWx3 image array
    :return: (mean, std) — two float64 arrays of 3 per-channel values
    :raises ValueError: If the dataset yields no images
    """
    one_over_255 = float(1.0 / 255.0)
    global_mean = np.zeros(3, dtype=np.float64)
    global_var = np.zeros(3, dtype=np.float64)
    n_items = 0

    for image_fname in dataset:
        x = read_image(image_fname) * one_over_255
        mean, stddev = cv2.meanStdDev(x)
        global_mean += np.squeeze(mean)
        global_var += np.squeeze(stddev) ** 2
        n_items += 1

    # BUGFIX: an empty dataset used to raise an opaque ZeroDivisionError
    if n_items == 0:
        raise ValueError("dataset is empty; cannot compute statistics")

    return global_mean / n_items, np.sqrt(global_var / n_items)


# BUGFIX: the dataset scan and print used to run at import time as a
# module-level side effect; guard it so importing compute_mean_std is safe.
if __name__ == "__main__":
    dataset = fs.find_images_in_dir("d:\\datasets\\ALASKA2\\Cover")
    print(compute_mean_std(tqdm(dataset)))