def do_camelyon16():
    """Build the active-learning splits for the camelyon16 dataset.

    The train/valid/test sets are already provided.

    :return:
    """
    # Seed everything before touching the dataset so the run is
    # reproducible.
    reproducibility.init_seed()

    ds = constants.CAM16
    announce_msg("Processing dataset: {}".format(ds))

    split_cfg = {
        "baseurl": get_rootpath_2_dataset(Dict2Obj({'dataset': ds})),
        "dataset": ds,
        "fold_folder": "folds/{}".format(ds),
        "img_extension": "jpg",
        "path_encoding": "folds/{}/encoding-origine.yaml".format(ds)
    }

    # Masks are already binary for this dataset; no conversion needed.
    # create_bin_mask_Oxford_flowers_102(Dict2Obj(split_cfg))

    # Re-seed so the split procedure itself is deterministic.
    reproducibility.init_seed()
    al_split_camelyon16(Dict2Obj(split_cfg))
def do_Oxford_flowers_102():
    """Build the active-learning splits for Oxford-flowers-102.

    The train/valid/test sets are already provided.

    :return:
    """
    # Seed everything before touching the dataset so the run is
    # reproducible.
    reproducibility.init_seed()

    announce_msg("Processing dataset: {}".format(constants.OXF))

    split_cfg = {
        "baseurl": get_rootpath_2_dataset(
            Dict2Obj({'dataset': constants.OXF})),
        "dataset": "Oxford-flowers-102",
        "fold_folder": "folds/Oxford-flowers-102",
        "img_extension": "jpg",
        "path_encoding": "folds/Oxford-flowers-102/encoding-origine.yaml"
    }

    # Masks are already binary for this dataset; no conversion needed.
    # create_bin_mask_Oxford_flowers_102(Dict2Obj(split_cfg))

    # Re-seed so the split procedure itself is deterministic.
    reproducibility.init_seed()
    al_split_Oxford_flowers_102(Dict2Obj(split_cfg))
    get_stats(Dict2Obj(split_cfg), split=0, fold=0, subset='train')
def do_glas():
    """Build the active-learning splits for the GlaS dataset.

    :return:
    """
    # Seed everything before touching the dataset so the run is
    # reproducible.
    reproducibility.init_seed()

    announce_msg("Processing dataset: {}".format(constants.GLAS))

    split_cfg = {
        "baseurl": get_rootpath_2_dataset(
            Dict2Obj({'dataset': constants.GLAS})),
        "folding": {"vl": 20},  # 80% for train, 20% for validation.
        "dataset": "glas",
        "fold_folder": "folds/glas",
        "img_extension": "bmp",
        # nbr_splits: how many times the k-fold procedure is repeated
        # over the available train samples.
        "nbr_splits": 1
    }
    split_cfg["nbr_folds"] = math.ceil(100. / split_cfg["folding"]["vl"])

    # Re-seed so the split procedure itself is deterministic.
    reproducibility.init_seed()
    al_split_glas(Dict2Obj(split_cfg))
    get_stats(Dict2Obj(split_cfg), split=0, fold=0, subset='train')
def get_stats(args, split, fold, subset):
    """Compute and log image-size statistics for one dataset subset.

    Opens every image listed in the csv file of (dataset, split, fold,
    subset), records its width/height, writes min/max sizes to a log
    file, and saves a histogram figure of the sizes under
    ``args.fold_folder``.

    :param args: object with attributes ``fold_folder`` and ``dataset``
        (plus whatever ``get_rootpath_2_dataset`` expects).
    :param split: int, split id.
    :param fold: int, fold id.
    :param subset: str, subset name (e.g. 'train').
    """
    if not os.path.isdir(args.fold_folder):
        os.makedirs(args.fold_folder)

    tag = "ds-{}-s-{}-f-{}-subset-{}".format(
        args.dataset, split, fold, subset)

    announce_msg("Going to check {}".format(args.dataset.upper()))

    relative_fold_path = join(args.fold_folder,
                              "split_{}".format(split),
                              "fold_{}".format(fold))
    subset_csv = join(relative_fold_path,
                      "{}_s_{}_f_{}.csv".format(subset, split, fold))
    rootpath = get_rootpath_2_dataset(args)
    samples = csv_loader(subset_csv, rootpath)
    if not samples:
        # Fail with a clear message instead of min()/max() on an empty
        # sequence further below.
        raise ValueError("No samples found in {}".format(subset_csv))

    lh, lw = [], []
    for el in samples:
        img = Image.open(el[1], 'r').convert('RGB')
        w, h = img.size
        lh.append(h)
        lw.append(w)

    # Context manager: the log file is closed even if show_msg/plotting
    # raises (the original leaked the handle on the error path).
    with open(join(args.fold_folder,
                   "log-stats-ds-{}.txt".format(tag)), 'w') as log:
        msg = "min h {}, \t max h {}".format(min(lh), max(lh))
        show_msg(msg, log)
        msg = "min w {}, \t max w {}".format(min(lw), max(lw))
        show_msg(msg, log)

        fig, axes = plt.subplots(nrows=1, ncols=2)
        axes[0].hist(lh)
        axes[0].set_title('Heights')
        axes[1].hist(lw)
        axes[1].set_title('Widths')
        fig.tight_layout()
        plt.savefig(join(args.fold_folder,
                         "size-stats-{}.png".format(tag)))
        # Release the figure; otherwise repeated calls accumulate open
        # matplotlib figures and leak memory.
        plt.close(fig)
def clear_rootpath(samples, args):
    """Strip the host rootpath from sample paths to be host independent.

    RETURNS A COPY OF THE SAMPLES, UPDATED (no in-place modification).

    :param samples: list of samples where each sample is a list.
        See format of samples for datasets.
        # 0. id: float, a unique id of the sample within the entire
        # dataset.
        # 1. path_img: str, path to the image.
        # 2. path_mask: str or None, path to the mask if there is any.
        # Otherwise, None.
        # 3. label: int, the class label of the sample.
        # 4. tag: int in {0, 1, 2} where: 0: the samples belongs to the
        # supervised set (L). 1: The sample belongs to the unsupervised
        # set (U). 2: The sample belongs to the set of newly labeled
        # samples (L'). This sample came from U and was labeled
        # following a specific mechanism.
    :param args: object. args of main.py.
    :return: a COPY list of samples (not inplace modification).
    """
    spls = deepcopy(samples)
    rootpath = get_rootpath_2_dataset(args)

    for sm in spls:
        # Indices 1 (image path) and 2 (mask path, possibly None).
        for j in (1, 2):
            pth = sm[j]
            if not pth:  # None: no mask for this sample.
                continue
            # Only strip the rootpath when it is an actual PREFIX.
            # The original used str.replace(), which would also delete
            # accidental occurrences of the rootpath in the middle of
            # the string.
            if pth.startswith(rootpath):
                pth = pth[len(rootpath):]
            if pth.startswith(os.path.sep):  # drop the leading '/'.
                pth = pth[1:]
            sm[j] = pth

    return spls
def do_Caltech_UCSD_Birds_200_2011():
    """Build the active-learning splits for Caltech-UCSD-Birds-200-2011.

    :return:
    """
    # Seed everything before touching the dataset so the run is
    # reproducible.
    reproducibility.init_seed()

    announce_msg("Processing dataset: {}".format(constants.CUB))

    split_cfg = {
        "baseurl": get_rootpath_2_dataset(
            Dict2Obj({'dataset': constants.CUB})),
        "folding": {"vl": 20},  # 80% for train, 20% for validation.
        # NOTE(review): "bmp" looks odd for CUB (jpg is used for the
        # other photographic datasets) — confirm against the data.
        "dataset": "Caltech-UCSD-Birds-200-2011",
        "fold_folder": "folds/Caltech-UCSD-Birds-200-2011",
        "img_extension": "bmp",
        # How many times to perform the k-folds over the available
        # train samples.
        "nbr_splits": 1,
        "path_encoding":
            "folds/Caltech-UCSD-Birds-200-2011/encoding-origine.yaml",
        # Keep only a few random classes. None => use the entire
        # dataset.
        "nbr_classes": None
    }
    split_cfg["nbr_folds"] = math.ceil(100. / split_cfg["folding"]["vl"])

    # Re-seed so the split procedure itself is deterministic.
    reproducibility.init_seed()
    al_split_Caltech_UCSD_Birds_200_2011(Dict2Obj(split_cfg))
    get_stats(Dict2Obj(split_cfg), split=0, fold=0, subset='train')
"valid_s_" + str(args.split) + "_f_" + str(args.fold) + ".csv") test_csv = join( relative_fold_path, "test_s_" + str(args.split) + "_f_" + str(args.fold) + ".csv") # Check if the csv files exist. If not, raise an error. if not all([ os.path.isfile(train_csv), os.path.isfile(valid_csv), os.path.isfile(test_csv) ]): raise ValueError("Missing *.cvs files ({}[{}], {}[{}], {}[{}])".format( train_csv, os.path.isfile(train_csv), valid_csv, os.path.isfile(valid_csv), test_csv, os.path.isfile(test_csv))) rootpath = get_rootpath_2_dataset(args) train_samples = csv_loader(train_csv, rootpath) valid_samples = csv_loader(valid_csv, rootpath) test_samples = csv_loader(test_csv, rootpath) # Just for debug to go fast. if DEBUG_MODE and (args.dataset == "Caltech-UCSD-Birds-200-2011"): reproducibility.force_seed(int(os.environ["MYSEED"])) warnings.warn("YOU ARE IN DEBUG MODE!!!!") train_samples = random.sample(train_samples, 100) valid_samples = random.sample(valid_samples, 5) test_samples = test_samples[:20] reproducibility.force_seed(int(os.environ["MYSEED"])) if DEBUG_MODE and (args.dataset == "Oxford-flowers-102"):
def get_init_sup_samples(args,
                         sampler,
                         COMMON,
                         train_samples,
                         OUTD
                         ):
    """
    Get the initial fully-supervised training data for the current
    active-learning round.

    :param args: object. args of main.py (dataset, al_type, al_it,
        task, ...).
    :param sampler: object providing sample_init_random_samples().
    :param COMMON: str, folder shared across rounds holding the
        per-round 'train_{i}.csv' selections.
    :param train_samples: list of samples (see clear_rootpath for the
        per-sample format).
    :param OUTD: str, output folder of this round (gets a copy of the
        round-0 csv).
    :return: (train_samples, previous_pairs, previous_errors).
    """
    previous_pairs = dict()
    previous_errors = False

    # drop normal samples and keep metastatic if: 1. dataset=CAM16. 2.
    # al_type != AL_WSL.
    cnd_drop_n = (args.dataset == constants.CAM16)
    cnd_drop_n &= (args.al_type != constants.AL_WSL)

    # round 0: draw the initial random labeled set and persist it.
    cnd = (args.al_type not in [constants.AL_FULL_SUP, constants.AL_WSL])
    cnd &= (args.al_it == 0)
    if cnd:
        # deterministic function with respect to the original seed.
        set_default_seed()
        train_samples = sampler.sample_init_random_samples(train_samples)
        set_default_seed()
        # store on disc: remove the rootpath from files to be
        # host-independent. store relative paths not absolute.
        base_f = 'train_{}.csv'.format(args.al_it)
        al_outf = join(COMMON, base_f)
        csv_writer(clear_rootpath(train_samples, args),
                   al_outf
                   )
        shutil.copyfile(al_outf, join(OUTD, base_f))

    # round > 0: combine all the samples of the previous al rounds
    # and the selected samples for this round.
    cnd = (args.al_type not in [constants.AL_FULL_SUP, constants.AL_WSL])
    cnd &= (args.al_it > 0)
    if cnd:
        # 'train_{i}.csv' contains the selected samples at round i.
        lfiles = [join(
            COMMON, 'train_{}.csv'.format(t)) for t in range(args.al_it + 1)]

        if (args.al_type == constants.AL_LP) and (args.task == constants.SEG):
            # load previous pairs:
            # previous pairs are pairs that have been pseudo-labeled in the
            # previous al round. they are ready to be used as
            # pseudo-segmented samples. no statistical constraints will be
            # applied on them.
            fz = join(COMMON, 'train_pairs_{}.pkl'.format(args.al_it - 1))
            with open(fz, 'rb') as fp:
                previous_pairs = pkl.load(fp)

        train_samples = []
        rootpath = get_rootpath_2_dataset(args)

        for fx in lfiles:
            # load using the current host-root-path.
            train_samples.extend(csv_loader(fx,
                                            rootpath,
                                            drop_normal=cnd_drop_n
                                            )
                                 )

        # Force: set all the samples in train_samples to L.
        # (index 4 is the supervision tag; constants.L marks a sample
        # as belonging to the labeled set.)
        for tt in range(len(train_samples)):
            train_samples[tt][4] = constants.L

        # ============== block to delete =======================================
        # in the case we skipped previous rounds because we restart the
        # code, if we are in cc and use node, the paths will not match
        # since they are built upon the job id. so, we need to change it.
        # NOTE(review): the original indentation was lost; this block is
        # assumed to live inside the round > 0 branch — confirm.
        if "CC_CLUSTER" in os.environ.keys():
            for i in range(len(train_samples)):
                front = os.sep.join(train_samples[i][1].split(os.sep)[:3])
                cnd = (front != os.environ["SLURM_TMPDIR"])
                if cnd:
                    # update the image input path
                    train_samples[i][1] = train_samples[i][1].replace(
                        front, os.environ["SLURM_TMPDIR"]
                    )
                    if args.task == constants.SEG:
                        # update the mask path
                        train_samples[i][2] = train_samples[i][2].replace(
                            front, os.environ["SLURM_TMPDIR"]
                        )
                    previous_errors = True

        # TODO: remove the above block. no longer necessary.
        # since we use relative paths in the node, we shouldn't have
        # mismatching paths when restarting the code.
        assert not previous_errors, "ERROR."
        # ======================================================================

    # Deterministically shuffle the combined training set.
    set_default_seed()
    for i in range(100):
        random.shuffle(train_samples)
    set_default_seed()

    return train_samples, previous_pairs, previous_errors