Example #1
import logging
import os
import shlex
import subprocess

logger = logging.getLogger(__name__)


def extract_archive_file(archive_fn, im_dir):
    if not os.path.exists(im_dir) or not os.listdir(im_dir):
        # Dataset is not deployed. Deploy it.
        archive_fns = archive_fn
        # A dataset may be composed of several tgz files, or only one.
        # If one, make it into a list to make the code later more general
        if not isinstance(archive_fns, list):
            archive_fns = [archive_fns]
        logger.info("Extracting datasets {} to local machine at {}".format(
            archive_fns, im_dir))
        if not os.path.exists(im_dir):
            os.makedirs(im_dir, exist_ok=True)

        for archive_fn in archive_fns:
            # Extract the tgz file directly into the target directory,
            # without precopy.
            # Note that the tgz file contains a root directory that
            # we do not want, hence the strip-components=1
            commandUnpack = ("tar -mxzf {src_file} -C {tgt_dir} "
                             "--strip-components=1").format(
                                 src_file=archive_fn, tgt_dir=im_dir)

            assert not subprocess.call(
                shlex.split(commandUnpack)), "Failed to unpack"
            logger.info("Extracted {}".format(archive_fn))
Example #2
import json
import os


def convert_coco_text_to_coco_detection_json(source_json,
                                             target_json,
                                             set_type=None,
                                             min_img_size=100,
                                             text_cat_id=1):
    """
    This function converts a COCOText style JSON to a COCODetection style
    JSON.
    For COCOText see: https://vision.cornell.edu/se3/coco-text-2/
    For COCODetection see: http://cocodataset.org/#overview
    """
    with open(source_json, "r") as f:
        coco_text_json = json.load(f)

    coco_text_json["annotations"] = list(coco_text_json["anns"].values())
    coco_text_json["images"] = list(coco_text_json["imgs"].values())
    if set_type is not None:
        # COCO Text style JSONs often mix test, train, and val sets.
        # We need to make sure we only use the data type we want.
        coco_text_json["images"] = [
            x for x in coco_text_json["images"] if x["set"] == set_type
        ]
    coco_text_json["categories"] = [{"name": "text", "id": text_cat_id}]
    del coco_text_json["cats"]
    del coco_text_json["imgs"]
    del coco_text_json["anns"]
    for ann in coco_text_json["annotations"]:
        ann["category_id"] = text_cat_id
        ann["iscrowd"] = 0
        # Don't evaluate the model on illegible words
        if set_type == "val" and ann["legibility"] != "legible":
            ann["ignore"] = True
    # Some datasets seem to have extremely small images which break downstream
    # operations. If min_img_size is set, we can remove these.
    coco_text_json["images"] = [
        x for x in coco_text_json["images"]
        if x["height"] >= min_img_size and x["width"] >= min_img_size
    ]
    # Remap image_ids if necessary
    if isinstance(coco_text_json["images"][0]["id"], str):
        image_id_remap = {
            x["id"]: id_no
            for (id_no, x) in enumerate(coco_text_json["images"])
        }
        for x in coco_text_json["images"]:
            x["id"] = image_id_remap[x["id"]]
        for x in coco_text_json["annotations"]:
            if x["image_id"] in image_id_remap:
                x["image_id"] = image_id_remap[x["image_id"]]

    os.makedirs(os.path.dirname(target_json), exist_ok=True)
    with open(target_json, "w") as f:
        json.dump(coco_text_json, f)

    return coco_text_json
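A minimal usage sketch for the converter; the file paths below are hypothetical. `set_type` selects which split to keep when the source COCOText JSON mixes train, val, and test images, and the converted dictionary is both written to `target_json` and returned:

    # Hypothetical paths for illustration only.
    converted = convert_coco_text_to_coco_detection_json(
        source_json="/datasets/cocotext.v2.json",
        target_json="/tmp/coco_text/coco_text_val_detection.json",
        set_type="val",
    )
    print(len(converted["images"]), len(converted["annotations"]))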