Example #1
0
def do_camelyon16():
    """
    Prepare the camelyon16 dataset.
    The train/valid/test sets are already provided.

    :return:
    """
    # ===============
    # Reproducibility
    # ===============

    # ===========================

    reproducibility.init_seed()

    # ===========================

    dataset_name = constants.CAM16
    announce_msg("Processing dataset: {}".format(dataset_name))
    config = {
        "baseurl": get_rootpath_2_dataset(Dict2Obj({'dataset': dataset_name})),
        "dataset": dataset_name,
        "fold_folder": "folds/{}".format(dataset_name),
        "img_extension": "jpg",
        "path_encoding": "folds/{}/encoding-origine.yaml".format(dataset_name)
    }
    # Masks were already converted into binary masks.
    # create_bin_mask_Oxford_flowers_102(Dict2Obj(config))
    reproducibility.init_seed()
    al_split_camelyon16(Dict2Obj(config))
Example #2
0
def do_Oxford_flowers_102():
    """
    Prepare the Oxford-flowers-102 dataset.
    The train/valid/test sets are already provided.

    :return:
    """
    # ===============
    # Reproducibility
    # ===============

    # ===========================

    reproducibility.init_seed()

    # ===========================

    announce_msg("Processing dataset: {}".format(constants.OXF))
    fold_dir = "folds/Oxford-flowers-102"
    config = {
        "baseurl": get_rootpath_2_dataset(Dict2Obj({'dataset':
                                                    constants.OXF})),
        "dataset": "Oxford-flowers-102",
        "fold_folder": fold_dir,
        "img_extension": "jpg",
        "path_encoding": fold_dir + "/encoding-origine.yaml"
    }
    # Masks were already converted into binary masks.
    # create_bin_mask_Oxford_flowers_102(Dict2Obj(config))
    reproducibility.init_seed()
    al_split_Oxford_flowers_102(Dict2Obj(config))
    get_stats(Dict2Obj(config), split=0, fold=0, subset='train')
Example #3
0
def do_glas():
    """
    Prepare the GlaS dataset.

    :return:
    """
    # ===============
    # Reproducibility
    # ===============
    reproducibility.init_seed()

    announce_msg("Processing dataset: {}".format(constants.GLAS))

    val_pct = 20  # percentage of train data held out for validation.
    config = {
        "baseurl": get_rootpath_2_dataset(Dict2Obj({'dataset':
                                                    constants.GLAS})),
        "folding": {"vl": val_pct},  # 80% for train, 20% for validation.
        "dataset": "glas",
        "fold_folder": "folds/glas",
        "img_extension": "bmp",
        # nbr_splits: number of times the k-fold is repeated over the
        # available train samples.
        "nbr_splits": 1,
        # nbr_folds: derived from the validation percentage.
        "nbr_folds": math.ceil(100. / val_pct)
    }

    reproducibility.init_seed()
    al_split_glas(Dict2Obj(config))
    get_stats(Dict2Obj(config), split=0, fold=0, subset='train')
Example #4
0
def get_stats(args, split, fold, subset):
    """
    Get some stats on the image sizes of specific dataset, split, fold.

    Logs the min/max height and width of every image listed in the subset
    csv, and saves histograms of heights and widths as a png.

    :param args: object with (at least) the attributes `fold_folder` and
        `dataset`; also passed to `get_rootpath_2_dataset`.
    :param split: int, split index.
    :param fold: int, fold index.
    :param subset: str, subset name (e.g. 'train').
    :return: None. Writes a log file and a figure into `args.fold_folder`.
    """
    if not os.path.isdir(args.fold_folder):
        os.makedirs(args.fold_folder)

    tag = "ds-{}-s-{}-f-{}-subset-{}".format(args.dataset, split, fold, subset)
    announce_msg("Going to check {}".format(args.dataset.upper()))

    relative_fold_path = join(args.fold_folder, "split_{}".format(split),
                              "fold_{}".format(fold))

    subset_csv = join(relative_fold_path,
                      "{}_s_{}_f_{}.csv".format(subset, split, fold))
    rootpath = get_rootpath_2_dataset(args)
    samples = csv_loader(subset_csv, rootpath)

    # Collect the height/width of every image in the subset.
    lh, lw = [], []
    for el in samples:
        img = Image.open(el[1], 'r').convert('RGB')
        w, h = img.size
        lh.append(h)
        lw.append(w)

    # `with` guarantees the log file is closed even if plotting raises
    # (the original `log.close()` was skipped on any exception).
    with open(join(args.fold_folder, "log-stats-ds-{}.txt".format(tag)),
              'w') as log:
        msg = "min h {}, \t max h {}".format(min(lh), max(lh))
        show_msg(msg, log)
        msg = "min w {}, \t max w {}".format(min(lw), max(lw))
        show_msg(msg, log)

        fig, axes = plt.subplots(nrows=1, ncols=2)
        axes[0].hist(lh)
        axes[0].set_title('Heights')
        axes[1].hist(lw)
        axes[1].set_title('Widths')
        fig.tight_layout()
        plt.savefig(join(args.fold_folder, "size-stats-{}.png".format(tag)))
        # Close the figure so repeated calls do not accumulate open figures.
        plt.close(fig)
def clear_rootpath(samples, args):
    """
    Remove the rootpath from the samples (img, mask) to be host independent.
    RETURNS A COPY OF THE SAMPLES UPDATED.

    Bugfix: the previous implementation used `str.replace`, which removes
    EVERY occurrence of the rootpath anywhere inside the path; only the
    leading prefix is intended to be stripped.

    :param samples: list of samples where each sample is a list. See format
    of samples for datasets.
        # 0. id: float, a unique id of the sample within the entire dataset.
        # 1. path_img: str, path to the image.
        # 2. path_mask: str or None, path to the mask if there is any.
        # Otherwise, None.
        # 3. label: int, the class label of the sample.
        # 4. tag: int in {0, 1, 2} where: 0: the samples belongs to the
        # supervised set (L). 1: The  sample belongs to the unsupervised set
        # (U). 2: The sample belongs to the set of newly labeled samples (
        # L'). This sample came from U and was labeling following a specific
        # mechanism.
    :param args: object. args of main.py.
    :return: a COPY list of samples (not inplace modification)
    """
    spls = deepcopy(samples)
    rootpath = get_rootpath_2_dataset(args)

    def _strip_root(pth):
        """Strip the leading rootpath (and any following separator)."""
        if pth.startswith(rootpath):
            pth = pth[len(rootpath):]
        if pth.startswith(os.path.sep):  # remove leading /
            pth = pth[1:]
        return pth

    for sm in spls:
        if sm[1]:  # image path: not None.
            sm[1] = _strip_root(sm[1])
        if sm[2]:  # mask path: may be None.
            sm[2] = _strip_root(sm[2])

    return spls
Example #6
0
def do_Caltech_UCSD_Birds_200_2011():
    """
    Prepare the Caltech-UCSD-Birds-200-2011 dataset.

    :return:
    """
    # ===============
    # Reproducibility
    # ===============

    # ===========================

    reproducibility.init_seed()

    # ===========================

    announce_msg("Processing dataset: {}".format(constants.CUB))

    fold_dir = "folds/Caltech-UCSD-Birds-200-2011"
    val_pct = 20  # percentage of train data held out for validation.
    config = {
        "baseurl": get_rootpath_2_dataset(Dict2Obj({'dataset':
                                                    constants.CUB})),
        "folding": {"vl": val_pct},  # 80% for train, 20% for validation.
        "dataset": "Caltech-UCSD-Birds-200-2011",
        "fold_folder": fold_dir,
        "img_extension": "bmp",
        # Number of times the k-fold is repeated over the available
        # train samples.
        "nbr_splits": 1,
        "path_encoding": fold_dir + "/encoding-origine.yaml",
        # If an int, keep only that many random classes; None uses the
        # entire dataset.
        "nbr_classes": None,
        # Derived: number of folds implied by the validation percentage.
        "nbr_folds": math.ceil(100. / val_pct)
    }
    reproducibility.init_seed()
    al_split_Caltech_UCSD_Birds_200_2011(Dict2Obj(config))
    get_stats(Dict2Obj(config), split=0, fold=0, subset='train')
Example #7
0
        "valid_s_" + str(args.split) + "_f_" + str(args.fold) + ".csv")
    test_csv = join(
        relative_fold_path,
        "test_s_" + str(args.split) + "_f_" + str(args.fold) + ".csv")

    # Check if the csv files exist. If not, raise an error.
    if not all([
            os.path.isfile(train_csv),
            os.path.isfile(valid_csv),
            os.path.isfile(test_csv)
    ]):
        raise ValueError("Missing *.cvs files ({}[{}], {}[{}], {}[{}])".format(
            train_csv, os.path.isfile(train_csv), valid_csv,
            os.path.isfile(valid_csv), test_csv, os.path.isfile(test_csv)))

    rootpath = get_rootpath_2_dataset(args)

    train_samples = csv_loader(train_csv, rootpath)
    valid_samples = csv_loader(valid_csv, rootpath)
    test_samples = csv_loader(test_csv, rootpath)

    # Just for debug to go fast.
    if DEBUG_MODE and (args.dataset == "Caltech-UCSD-Birds-200-2011"):
        reproducibility.force_seed(int(os.environ["MYSEED"]))
        warnings.warn("YOU ARE IN DEBUG MODE!!!!")
        train_samples = random.sample(train_samples, 100)
        valid_samples = random.sample(valid_samples, 5)
        test_samples = test_samples[:20]
        reproducibility.force_seed(int(os.environ["MYSEED"]))

    if DEBUG_MODE and (args.dataset == "Oxford-flowers-102"):
def get_init_sup_samples(args,
                         sampler,
                         COMMON,
                         train_samples,
                         OUTD
                         ):
    """
    Get the initial full supervised data.

    At active-learning round 0, sub-samples an initial labeled set from
    `train_samples` and stores it (with host-independent relative paths)
    as 'train_0.csv' in both COMMON and OUTD. At rounds > 0, rebuilds the
    labeled set by concatenating the 'train_{i}.csv' files of all rounds
    up to and including the current one, then shuffles it.

    :param args: object. args of main.py; the attributes read here are
        `dataset`, `al_type`, `al_it`, and `task`.
    :param sampler: object exposing `sample_init_random_samples()`; used
        only at round 0.
    :param COMMON: str. Folder shared across AL rounds holding the
        per-round 'train_{i}.csv' (and 'train_pairs_{i}.pkl') files.
    :param train_samples: list of samples (see dataset sample format) to
        sub-sample from at round 0; ignored for rounds > 0.
    :param OUTD: str. Output folder of the current round; receives a copy
        of the round-0 csv.
    :return: tuple (train_samples, previous_pairs, previous_errors) where
        `previous_pairs` is the dict of pseudo-labeled pairs loaded from
        the previous round (only for AL_LP + SEG, else empty) and
        `previous_errors` flags path mismatches repaired in the CC-cluster
        block below.
    """
    previous_pairs = dict()
    previous_errors = False

    # drop normal samples and keep metastatic if: 1. dataset=CAM16. 2.
    # al_type != AL_WSL.
    cnd_drop_n = (args.dataset == constants.CAM16)
    cnd_drop_n &= (args.al_type != constants.AL_WSL)

    # round 0
    cnd = (args.al_type not in [constants.AL_FULL_SUP, constants.AL_WSL])
    cnd &= (args.al_it == 0)

    if  cnd:
        # deterministic function with respect to the original seed.
        set_default_seed()
        train_samples = sampler.sample_init_random_samples(train_samples)
        set_default_seed()
        # store on disc: remove the rootpath from files to be host-independent.
        # store relative paths not absolute.
        base_f = 'train_{}.csv'.format(args.al_it)
        al_outf = join(COMMON, base_f)
        csv_writer(clear_rootpath(train_samples, args),
                   al_outf
                   )
        # keep a per-round copy next to this round's outputs as well.
        shutil.copyfile(al_outf, join(OUTD, base_f))

    # round > 0: combine all the samples of the previous al rounds
    # and the selected samples for this round.
    cnd = (args.al_type not in [constants.AL_FULL_SUP, constants.AL_WSL])
    cnd &= (args.al_it > 0)
    if cnd:
        # 'train_{i}.csv' contains the selected samples at round i.
        lfiles = [join(
            COMMON, 'train_{}.csv'.format(t)) for t in range(args.al_it + 1)]

        if (args.al_type == constants.AL_LP) and (args.task == constants.SEG):
            # load previous pairs:
            # previous pairs are pairs that have been pseudo-labeled in the
            # previous al round. they are ready to be used as
            # pseudo-segmented samples. no statistical constraints will be
            # applied on them.
            fz = join(COMMON, 'train_pairs_{}.pkl'.format(args.al_it - 1))
            with open(fz, 'rb') as fp:
                previous_pairs = pkl.load(fp)

        # rebuild the labeled pool from scratch, resolving the stored
        # relative paths against this host's root path.
        train_samples = []
        rootpath = get_rootpath_2_dataset(args)
        for fx in lfiles:
            # load using the current host-root-path.
            train_samples.extend(csv_loader(fx,
                                            rootpath,
                                            drop_normal=cnd_drop_n
                                            )
                                 )

        # Force: set all the samples in train_samples to L.
        # (index 4 is the sample tag; constants.L marks it supervised.)
        for tt in range(len(train_samples)):
            train_samples[tt][4] = constants.L

        # ============== block to delete =======================================
        # in the case we skipped previous rounds because we restart the
        # code, if we are in cc and use node, the paths will not match
        # since they are built upon the job id. so, we need to change it.
        if "CC_CLUSTER" in os.environ.keys():
            for i in range(len(train_samples)):
                # first 3 path components encode the (job-specific) prefix.
                front = os.sep.join(train_samples[i][1].split(os.sep)[:3])
                cnd = (front != os.environ["SLURM_TMPDIR"])
                if cnd:
                    # update the image input path
                    train_samples[i][1] = train_samples[i][1].replace(
                        front, os.environ["SLURM_TMPDIR"]
                    )

                    if args.task == constants.SEG:
                        # update the mask path
                        train_samples[i][2] = train_samples[i][2].replace(
                            front, os.environ["SLURM_TMPDIR"]
                        )

                    previous_errors = True

            # TODO: remove the above block. no longer necessary.
            # since we use relative paths in the node, we shouldn't have
            # mismatching paths when restarting the code.
            assert not previous_errors, "ERROR."
        # ======================================================================

        # shuffle deterministically (seed is reset before and after).
        set_default_seed()
        for i in range(100):
            random.shuffle(train_samples)
        set_default_seed()

    return train_samples, previous_pairs, previous_errors