Example #1
    def update_locals(_pickled_cache, locls):
        if isinstance(_pickled_cache, dict):
            it_consumes(
                locls.__setitem__(k, v) for k, v in iteritems(_pickled_cache))
        else:
            raise TypeError("_pickled_cache isn't a dict")
        # elif key is not None: locls[key] = _pickled_cache

        return _pickled_cache
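
A hedged usage sketch, mirroring the call in Example #7; `it_consumes` is assumed to drain an iterator (e.g. deque(it, maxlen=0)) and `iteritems` to be six.iteritems:

pickled_cache = {"tbl": {}, "sas_tbl": {}}
# NOTE: writes to locals() only reliably propagate at module scope; inside
# a function this mostly just returns the cache (see the comment in
# Example #7 about values not reaching locals()).
update_locals(pickled_cache, locals())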
Example #2
def construct_worst_per_image(series, root_directory, new_df):
    def g(val):
        fname = construct_filename(root_directory, val.image_position,
                                   val.folder_name)

        if pd.isnull(val.choice):
            return val

        new_idx = categories.index(val.choice)

        try:
            cur_fname = new_df.get(fname, None)
        except TypeError:
            cur_fname = None

        if construct_worst_per_image.t > 0:
            construct_worst_per_image.t -= 1
            just = 35
            print(
                "val:".ljust(just),
                "{!r}\n".format(val),
                "val.choice:".ljust(just),
                "{!r}\n".format(val.choice),
                "fname:".ljust(just),
                "{!r}\n".format(fname),
                "new_df[fname]:".ljust(just),
                "{!r}\n".format(cur_fname),
                sep="",
            )
            print(
                "categories.index(val.choice):".ljust(just),
                "{!r}\n".format(new_idx),
                sep="",
            )
            print(
                "categories.index(new_df[fname]):".ljust(just),
                "{!r}\n".format(cur_fname if cur_fname is None else categories.
                                index(cur_fname)),
                sep="",
            )

        cur_idx = (cur_fname if cur_fname is None or pd.isnull(cur_fname) else
                   categories.index(cur_fname))

        new_df[fname] = (val.choice if any((
            fname not in new_df,
            pd.isnull(cur_fname),
            pd.isnull(cur_idx),
            cur_idx is None or cur_idx < new_idx,
        )) else new_df[fname])

        return val

    it_consumes(map(g, series.values))
    return series
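
`construct_worst_per_image.t` is a debug counter assumed to be initialised at module level, matching `tier_syms.t` in Example #8; a sketch of the call site from Example #5:

construct_worst_per_image.t = 0  # set > 0 to enable the debug prints

filename2cat = pd.Series()
df.apply(construct_worst_per_image, args=(root_directory, filename2cat))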
Example #3
    def partition_symlink(series):
        def g(filename_category):
            filename, category = filename_category

            if pd.isnull(filename):
                return category

            symlinks.append((filename, category))
            return category

        it_consumes(map(g, series.items()))
        return series
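
This is the nested helper from Example #8; it closes over a `symlinks` list in the enclosing scope. A minimal sketch of that context, assuming `df` columns map filenames to categories:

symlinks = []  # (filename, category) pairs collected by g
df.apply(partition_symlink)
print(len(symlinks), "pairs queued for symlinking")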
Example #4
from os import makedirs, path, symlink
from shutil import rmtree

# `it_consumes` and `imap` are assumed from the project's helpers, e.g.
# it_consumes = lambda it: deque(it, maxlen=0) and imap = six.moves.map.


def make_symlinks(dest_dir, filenames, clean_dir=False):
    if path.isdir(dest_dir):
        if clean_dir:
            rmtree(dest_dir)
            makedirs(dest_dir)  # no goto :(
    else:
        makedirs(dest_dir)

    it_consumes(
        imap(
            lambda fname: symlink(fname,
                                  path.join(dest_dir, path.basename(fname))),
            filenames,
        ))
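
A hypothetical invocation; the paths are illustrative only:

make_symlinks(
    "train",
    ("/data/scans/img_001.png", "/data/scans/img_002.png"),
    clean_dir=True,  # wipe and recreate ./train first
)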
Example #5
def main(
    root_directory, manual_dir
):  # type: (str, str or None) -> (str, pd.DataFrame, pd.Series, pd.DataFrame)
    ensure_is_dir(root_directory)

    if manual_dir is not None:
        # manual_dir may be None; it is defaulted further below, so only
        # create the directories when a location was explicitly provided
        it_consumes(
            map(
                mkdir,
                filterfalse(path.isdir,
                            (path.dirname(manual_dir), manual_dir))))

    paths = "Fundus Photographs for AI", "DR SPOC Dataset", "DR SPOC Photo Dataset"
    if path.basename(root_directory) == paths[0]:
        root_directory = path.dirname(root_directory)
    elif path.basename(root_directory) == paths[1]:
        root_directory = path.dirname(path.dirname(root_directory))
    elif path.basename(root_directory) == paths[2]:
        root_directory = path.dirname(
            path.dirname(path.dirname(root_directory)))

    levels = list(reversed(paths))
    if root_directory.endswith(path.join(*paths)):
        for _ in range(len(levels)):
            root_directory = path.dirname(root_directory)
    prev = path.join(root_directory, levels.pop())
    while len(levels):
        ensure_is_dir(prev)
        prev = path.join(prev, levels.pop())
    del levels, prev

    db_df = handle_db(root_directory=root_directory)
    df = handle_spreadsheet(root_directory=root_directory)
    filename2cat = pd.Series()
    df.apply(construct_worst_per_image, args=(root_directory, filename2cat))

    combined_df = combine_spreadsheet_db(db_df=db_df,
                                         filename2cat=filename2cat)

    # combined_df.apply(partition_symlink, 1)

    if manual_dir is None or (
            path.realpath(manual_dir) == path.realpath(root_directory)
            and "{sep}symlinked_datasets{sep}".format(sep=path.sep)
            not in manual_dir):
        manual_dir = path.join(root_directory, "symlinked_datasets")

    symbolically_link(manual_dir, combined_df)

    return root_directory, df, filename2cat, combined_df
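
A hedged sketch of driving `main`; the dataset path is illustrative, and any known level of the directory hierarchy may be passed:

root, df, filename2cat, combined_df = main(
    "/data/Fundus Photographs for AI",
    manual_dir=None,  # defaults to <root>/symlinked_datasets
)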
Example #6
def combine_spreadsheet_db(
        filename2cat,
        db_df):  # type: (pd.Series, pd.DataFrame) -> pd.DataFrame
    def g(idx_val):
        idx, val = idx_val
        assert isinstance(val, str), "Got type {!r} containing {!r}".format(
            type(val), val)
        if g.t > 0:
            g.t -= 1
            print("val:", val, "\n", "idx:", idx, "\n", sep="")
        if idx in g.db_df.index:
            if g.tt > 0:
                g.tt -= 1
                # print('categories.index({!r}):'.format(val), categories.index(val), '\n',
                #      'categories.index({!r}):'.format(g.db_df.loc[idx].category),
                #      categories.index(g.db_df.loc[idx].category), '\n'
                #      )
            if categories.index(val) < categories.index(
                    g.db_df.loc[idx].category):
                # assign via .loc[idx, "category"]; attribute assignment on
                # the row copy returned by .loc[idx] can be silently dropped
                g.db_df.loc[idx, "category"] = val
                g.changed_cond += 1
                if g.tt > 0:
                    print(
                        "db_df.loc[{!r}].category is now".format(idx),
                        g.db_df.loc[idx].category,
                    )
            g.changed += 1
        else:
            if g.tt > 0:
                print("{!r} not found in {!r}".format(idx, g.db_df.index))
            # NB: DataFrame.append was removed in pandas 2.0; newer pandas
            # would use pd.concat with a one-row DataFrame here instead
            g.db_df = g.db_df.append(pd.Series({"category": val}, name=idx))
            # db_df[idx] = val
        return val

    g.t = 0
    g.tt = 0
    g.changed_cond = 0
    g.changed = 0
    g.db_df = db_df

    it_consumes(map(g, filename2cat.items()))

    # display(HTML(g.db_df.to_html()))
    assert len(g.db_df.index) == 1574, "Actually got {:d}".format(
        len(g.db_df.index))

    return g.db_df
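
A minimal sketch of the merge, assuming `db_df` is indexed by filename with a `category` column and `filename2cat` comes from Example #2:

combined_df = combine_spreadsheet_db(filename2cat=filename2cat, db_df=db_df)
assert "category" in combined_df.columns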
Example #7
def _vanilla_stats(skip_save=True):
    global pickled_cache
    cache.update_locals(pickled_cache, locals())

    tbl = pickled_cache["tbl"]  # type: dict
    assert len(tbl.keys()) > 0
    sas_tbl = pickled_cache["sas_tbl"]  # type: dict
    id2ideyefname = pickled_cache["id2ideyefname"]

    if "oags1" not in pickled_cache or not len(pickled_cache["oags1"]):
        pickled_cache["oags1"] = oags1 = tuple(v.rec.IDNUM
                                               for v in itervalues(tbl)
                                               if v.rec.oag1)
    else:
        oags1 = pickled_cache[
            "oags1"]  # Weird, this doesn't get into locals() from `update_locals`

    if "loags1" not in pickled_cache or not len(pickled_cache["loags1"]):
        pickled_cache["loag1"] = loag1 = tuple(v.rec.IDNUM
                                               for v in itervalues(tbl)
                                               if v.rec.loag1)
        pickled_cache["roag1"] = roag1 = tuple(v.rec.IDNUM
                                               for v in itervalues(tbl)
                                               if v.rec.roag1)
    else:
        loag1 = pickled_cache["loag1"]
        roag1 = pickled_cache["roag1"]

    if "no_oags1" not in pickled_cache or not len(pickled_cache["no_oags1"]):
        pickled_cache["no_oags1"] = no_oags1 = tuple(v.rec.IDNUM
                                                     for v in itervalues(tbl)
                                                     if not v.rec.oag1)

    if "_vanilla_stats" not in pickled_cache or not len(
            pickled_cache["_vanilla_stats"]):
        pickled_cache["_vanilla_stats"] = vanilla_stats = "\n".join(
            "{0}{1}".format(*t) for t in (
                ("# total:".ljust(just), len(tbl)),
                ("# with oag1:".ljust(just), len(oags1)),
                ("# with roag1:".ljust(just), len(roag1)),
                ("# with loag1:".ljust(just), len(loag1)),
                (
                    "# with oag1 and roag1 and loag1:".ljust(just),
                    sum(1 for v in itervalues(tbl)
                        if v.rec.oag1 and v.rec.roag1 and v.rec.loag1),
                ),
                (
                    "# with oag1 and roag1 and loag1 and glaucoma4:".ljust(
                        just),
                    sum(1 for v in itervalues(tbl) if v.rec.oag1
                        and v.rec.roag1 and v.rec.loag1 and v.rec.glaucoma4),
                ),
                ("# len(sas_tbl) == len(tbl):".ljust(just),
                 len(sas_tbl) == len(tbl)),
            ))
        skip_save or cache.save(pickled_cache)

    it_consumes(imap(logger.debug,
                     pickled_cache["_vanilla_stats"].split("\n")))
    logger.debug("oags1:".ljust(just) + "{}".format(oags1))
    logger.debug("loag1:".ljust(just) + "{}".format(loag1))

    if "loags_id2fname" not in pickled_cache or not len(
            pickled_cache["loags_id2fname"]):
        # Map each IDNUM in `dataset` to the tuple of filenames for the
        # given eye: expand ids to their ideyefname records, keep only the
        # requested eye, flatten to (id, fname) pairs, group by id, then
        # zip the alternating ids and fname-tuples into a dict.
        id2fname = lambda dataset, eye: (lambda l: dict(izip(l[::2], l[
            1::2])))(tuple(
                chain.from_iterable(
                    imap(
                        lambda idnum_group: (
                            idnum_group[0],
                            tuple(imap(itemgetter(1), idnum_group[1])),
                        ),
                        groupby(
                            chain.from_iterable(
                                imap(
                                    lambda ideyefnames: tuple(
                                        imap(
                                            lambda ideyefname: (
                                                ideyefname.id,
                                                ideyefname.fname,
                                            ),
                                            ideyefnames,
                                        )),
                                    imap(
                                        lambda ideyefnames: ifilter(
                                            lambda ideyefname: ideyefname.eye
                                            == eye,
                                            ideyefnames,
                                        ),
                                        imap(
                                            lambda idnum: id2ideyefname[idnum],
                                            dataset),
                                    ),
                                )),
                            key=itemgetter(0),
                        ),
                    ))))

        pickled_cache["loags_id2fname"] = loags_id2fname = id2fname(
            dataset=loag1, eye="L")
        pickled_cache["roags_id2fname"] = roags_id2fname = id2fname(
            dataset=roag1, eye="R")
    else:
        loags_id2fname = pickled_cache["loags_id2fname"]
        roags_id2fname = pickled_cache["roags_id2fname"]
    # pp(loags_id2fname)

    logger.debug("generated_types.T0._fields:".ljust(just) +
                 "{}".format(generated_types.T0._fields))
    return pickled_cache
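
A hedged sketch; `pickled_cache` is the module-level dict merged by `cache.update_locals` (Example #1), and skip_save=False persists any recomputed entries via cache.save:

pickled_cache = _vanilla_stats(skip_save=False)
logger.debug("cached keys:".ljust(just) + "{}".format(sorted(pickled_cache)))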
Example #8
def symbolically_link(symlink_dir,
                      df):  # type: (str, pd.DataFrame) -> pd.DataFrame
    if symbolically_link.t > 0:
        symbolically_link.t -= 1
        print("symbolically_link::symlink_dir:".ljust(just),
              "{!r}".format(symlink_dir))
    vc = df.apply(pd.value_counts)

    # 75% in train
    # 12.5% in test
    # 12.5% in validation
    target_counts = pd.DataFrame({
        "train":
        pd.Series({
            idx: np.uint16(np.floor(np.multiply(vc.loc[idx].category, 0.75)))
            for idx in vc.category.index
        }),
        "test":
        pd.Series({
            idx: np.uint16(np.floor(np.multiply(vc.loc[idx].category, 0.125)))
            for idx in vc.category.index
        }),
    })
    target_counts["valid"] = pd.Series({
        idx: vc.loc[idx].category - sum(
            (lambda ser: (ser.train, ser.test))(target_counts.loc[idx]))
        for idx in vc.category.index
    })

    symlinks = []

    if not path.isdir(symlink_dir):
        makedirs(symlink_dir)

    def partition_symlink(series):
        def g(filename_category):
            filename, category = filename_category

            if pd.isnull(filename):
                return category

            symlinks.append((filename, category))
            return category

        it_consumes(map(g, series.items()))
        return series

    partition_symlink.t = 0

    df.apply(partition_symlink)

    _used = set()

    # Deduplicate on the first element, keeping first-seen order; the pairs
    # are (filename, category) despite the src/dst names, and
    # `_used.add(src) or True` marks src as seen mid-comprehension.
    _uniq_syms = tuple((src, dst) for src, dst in symlinks
                       if src not in _used and (_used.add(src) or True))

    random_list = get_or_generate_and_store_random_list(
        len(_uniq_syms),
        path.join(path.dirname(path.dirname(__file__)), "_data", ".cache",
                  "dr_spoc_rand.pkl"),
    )
    uniq_syms = tuple(_uniq_syms[i] for i in random_list)
    assert len(uniq_syms) == len(_uniq_syms)

    target_counts_cp = target_counts.copy()

    def get_next_tier(index):  # type: (str) -> str
        for column in target_counts.columns:
            if target_counts.loc[index, column] > 0:
                # .loc[index, column] avoids chained indexing, whose
                # in-place decrement can be lost on a copy
                target_counts.loc[index, column] -= 1
                return column
        # a StopIteration raised here would be swallowed by the map() in
        # it_consumes and silently truncate the run; fail loudly instead
        raise RuntimeError("No more {!r}".format(index))

    def tier_syms(filename_category):
        filename, category = (filename_category if isinstance(
            filename_category, tuple) else
                              (filename_category,
                               df.loc[filename_category].category))

        current_tier = get_next_tier(category)
        this_filename = "_".join(
            (path.basename(path.dirname(filename)), path.basename(filename)))

        all_labels_dir = path.join(
            symlink_dir,
            dr_spoc_datasets[dr_spoc_datasets.index("dr_spoc")],
            current_tier,
            category,
        )

        grad_and_no_grad_dir = path.join(
            symlink_dir,
            dr_spoc_datasets[dr_spoc_datasets.index(
                "dr_spoc_grad_and_no_grad")],
            current_tier,
        )

        no_no_grad_dir = path.join(
            symlink_dir,
            dr_spoc_datasets[dr_spoc_datasets.index("dr_spoc_no_no_grad")],
            current_tier,
            category,
        )

        if not path.isdir(all_labels_dir):
            makedirs(all_labels_dir)

        all_labels_dst = path.join(all_labels_dir, this_filename)

        try:
            symlink(filename, all_labels_dst, target_is_directory=False)
        except FileExistsError:
            tier_syms.FileExistsError += 1

        with suppress(FileExistsError):
            label = category if category == "No gradable image" else "gradable"
            grad_and_no_grad_dir = path.join(grad_and_no_grad_dir, label)
            if not path.isdir(grad_and_no_grad_dir):
                makedirs(grad_and_no_grad_dir)
            grad_and_no_grad_dst = path.join(grad_and_no_grad_dir,
                                             this_filename)
            symlink(filename, grad_and_no_grad_dst, target_is_directory=False)
            if label != "No gradable image":
                if not path.isdir(no_no_grad_dir):
                    makedirs(no_no_grad_dir)
                no_no_grad_dir_dst = path.join(no_no_grad_dir, this_filename)
                symlink(filename,
                        no_no_grad_dir_dst,
                        target_is_directory=False)

        if tier_syms.t > 0:
            tier_syms.t -= 1
            print("filename: {!r}\ncategory: {!r}\n".format(
                filename, category),
                  sep="")

    tier_syms.t = 0
    tier_syms.FileExistsError = 0

    print("symlink_dir:".ljust(20), "{!r}".format(symlink_dir), sep="")

    it_consumes(map(tier_syms, uniq_syms))

    # only meaningful after the run: either no collisions (fresh dir) or
    # every one of the 1573 files was already linked
    assert tier_syms.FileExistsError in (0, 1573)

    return target_counts_cp
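
`symbolically_link.t` is a debug counter assumed to be initialised at module level like the others; a sketch of the call from Example #5:

symbolically_link.t = 0

target_counts = symbolically_link(
    path.join(root_directory, "symlinked_datasets"), combined_df)
print(target_counts)  # pre-decrement train/test/valid quotas per category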