Example #1
def create_test_data(data_dir, data_types):
    """performs moving samples from several datasets to test dataset"""
    destination_data_type = "test2017"
    destination_dir = get_imgs_dir_filename(data_dir, destination_data_type)
    # create test img dir if does not exist
    if not os.path.exists(destination_dir):
        os.makedirs(destination_dir)
    destination_json_filename = get_annotations_json_filename(
        data_dir, destination_data_type, prefix="cleaned")

    destination_samples = {"imgs": [], "anns": []}
    for data_type in data_types.values():
        json_filename = get_annotations_json_filename(data_dir,
                                                      data_type,
                                                      prefix="cleaned")
        splitted_samples = get_splitted_samples(json_filename)
        destination_samples["imgs"].extend(
            splitted_samples["destination"]["imgs"])
        destination_samples["anns"].extend(
            splitted_samples["destination"]["anns"])

        json_data = get_json_data(splitted_samples["origin"])
        save_json(json_data, json_filename)

        origin_dir = get_imgs_dir_filename(data_dir, data_type)
        move_images(origin_dir, destination_dir,
                    splitted_samples["destination"]["imgs"])

    json_data = get_json_data(destination_samples)
    save_json(json_data, destination_json_filename)
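The examples in this listing rely on a few small helpers that are not shown. A minimal sketch of what they might look like, assuming a COCO-style layout (annotation JSONs under data_dir/annotations, one image directory per split) and an optional filename prefix for the cleaned files; the exact names, layout, and JSON keys in the original project may differ:

import os
import json
import shutil


def get_annotations_json_filename(data_dir, data_type, prefix=None):
    # e.g. data_dir/annotations/cleaned_captions_train2017.json (layout assumed)
    name = "captions_{}.json".format(data_type)
    if prefix:
        name = "{}_{}".format(prefix, name)
    return os.path.join(data_dir, "annotations", name)


def get_captions_json_filename(data_dir, data_type, prefix=None):
    # captions-only JSON produced in Example #7 (name assumed)
    name = "only_captions_{}.json".format(data_type)
    if prefix:
        name = "{}_{}".format(prefix, name)
    return os.path.join(data_dir, "annotations", name)


def get_imgs_dir_filename(data_dir, data_type):
    # images of a split are assumed to live in data_dir/<data_type>/
    return os.path.join(data_dir, data_type)


def get_json_data(samples):
    # repackage {"imgs": [...], "anns": [...]} into a minimal COCO-style dict,
    # so the saved file can be reloaded with pycocotools' COCO class
    return {"images": samples["imgs"], "annotations": samples["anns"]}


def save_json(json_data, json_filename):
    with open(json_filename, "w") as file:
        json.dump(json_data, file)


def move_images(origin_dir, destination_dir, imgs):
    # move the selected image files from their origin split to the test split
    for img in imgs:
        shutil.move(os.path.join(origin_dir, img["file_name"]),
                    os.path.join(destination_dir, img["file_name"]))

get_splitted_samples, which decides which samples stay in the origin split and which move to the test split, is not sketched here.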
Example #2
def create_cleaned_annotations(data_dir, data_types):
    """perform cleaning and saving of annotations"""
    for data_type in data_types.values():
        old_json_filename = get_annotations_json_filename(data_dir, data_type)
        imgs_dir = get_imgs_dir_filename(data_dir, data_type)
        edited_json = get_edited_json(old_json_filename, imgs_dir)
        new_json_filename = get_annotations_json_filename(
            data_dir, data_type, "cleaned")
        with open(new_json_filename, "w") as file:
            json.dump(edited_json, file)
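get_edited_json is not shown in this listing. One plausible reading, sketched here as an assumption rather than the project's actual cleaning rule, is that it drops images whose files are missing from the split directory, together with their annotations:

import os
import json


def get_edited_json(old_json_filename, imgs_dir):
    # hypothetical cleaning step: keep only images present on disk and the
    # annotations that refer to them
    with open(old_json_filename) as file:
        data = json.load(file)
    present = {img["id"] for img in data["images"]
               if os.path.exists(os.path.join(imgs_dir, img["file_name"]))}
    data["images"] = [img for img in data["images"] if img["id"] in present]
    data["annotations"] = [ann for ann in data["annotations"]
                           if ann["image_id"] in present]
    return data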
Example #3
def check_dataset(data_dir, data_type, captions_per_image):
    """check dataset for correctness"""
    hdf5_filename = get_hdf5_filename(data_dir, data_type)
    json_filename = get_annotations_json_filename(data_dir,
                                                  data_type,
                                                  prefix="cleaned")
    coco = COCO(json_filename)
    img_ids = coco.getImgIds()
    # number of images according to the cleaned annotations
    print(len(img_ids))
    captions_json_filename = get_captions_json_filename(data_dir,
                                                        data_type,
                                                        prefix="cleaned")

    with open(captions_json_filename) as cap_file:
        captions = json.load(cap_file)["anns"]

    # pick five random captions and map each back to its image index
    random_indices = np.random.choice(range(len(captions)), 5)
    rand_caps = [captions[idx] for idx in random_indices]
    random_img_indices = [idx // captions_per_image for idx in random_indices]
    with h5py.File(hdf5_filename, "r") as hdf5_file:
        images = hdf5_file["images"]
        # number of images stored in the HDF5 file
        print(len(images))

        rand_images = [images[idx] for idx in random_img_indices]

    for i, img in enumerate(rand_images):
        img = img.transpose((1, 2, 0))
        plt.imshow(img)
        plt.title(rand_caps[i]["caption"])
        plt.show()
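The idx // captions_per_image mapping only works if the captions JSON stores exactly captions_per_image annotations per image (as produced in Example #5), in the same image order used when writing the HDF5 file. A quick consistency check, reusing the names from the example above:

with h5py.File(hdf5_filename, "r") as hdf5_file:
    num_images = len(hdf5_file["images"])
# every stored image should have exactly captions_per_image captions
assert len(captions) == num_images * captions_per_image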
Example #4
def create_image_hdf5_datasets(data_dir, data_types, captions_per_image):
    """create hdf5 dataset for image directory"""
    for data_type in data_types.values():
        json_filename = get_annotations_json_filename(data_dir,
                                                      data_type,
                                                      prefix="cleaned")
        coco = COCO(json_filename)
        img_ids = coco.getImgIds()
        hdf5_filename = get_hdf5_filename(data_dir, data_type)

        with h5py.File(hdf5_filename, "a") as hdf5_file:
            # Make a note of the number of captions we are sampling per image
            hdf5_file.attrs["captions_per_image"] = captions_per_image

            # Create dataset inside HDF5 file to store images
            dataset_shape = (len(img_ids), 3, 256, 256)
            images = hdf5_file.create_dataset("images",
                                              shape=dataset_shape,
                                              dtype="uint8",
                                              compression="gzip",
                                              compression_opts=9)

            for i, img_id in enumerate(img_ids):
                img_filename = coco.imgs[img_id]["file_name"]
                impath = get_img_filename(data_dir, data_type, img_filename)
                img = get_transformed_img(impath)
                # Save image to HDF5 file
                images[i] = img
                if i % 1000 == 0:
                    print("{}-th iteration is written".format(i))

        print("{} is written".format(hdf5_filename))
Example #5
def create_trimmed_annotations(data_dir, data_types, captions_per_image):
    """limit captions per image and remove images from json filename"""
    for data_type in data_types.values():
        json_filename = get_annotations_json_filename(data_dir,
                                                      data_type,
                                                      prefix="cleaned")
        samples = get_list_trimmed_annotations(json_filename,
                                               captions_per_image)
        json_data = get_json_data(samples)
        save_json(json_data, json_filename)
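get_list_trimmed_annotations is not shown either. A common way to enforce a fixed number of captions per image is to sample captions_per_image annotations for each image and pad by resampling when an image has fewer; the sketch below assumes that behaviour and the same "imgs"/"anns" structure used in Example #1:

import random
from pycocotools.coco import COCO


def get_list_trimmed_annotations(json_filename, captions_per_image):
    # hypothetical trimming: keep exactly captions_per_image captions per image
    coco = COCO(json_filename)
    samples = {"imgs": [], "anns": []}
    for img_id in coco.getImgIds():
        anns = coco.imgToAnns[img_id]
        if len(anns) >= captions_per_image:
            anns = random.sample(anns, captions_per_image)
        else:
            anns = anns + random.choices(anns, k=captions_per_image - len(anns))
        samples["imgs"].append(coco.imgs[img_id])
        samples["anns"].extend(anns)
    return samples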
Example #6
def get_image_paths(data_dir, data_type):
    """extract paths for images in right order"""
    json_filename = get_annotations_json_filename(data_dir,
                                                  data_type,
                                                  prefix="cleaned")
    coco = COCO(json_filename)
    img_ids = coco.getImgIds()
    img_paths = []
    for img_id in img_ids:
        img_filename = coco.imgs[img_id]["file_name"]
        impath = get_img_filename(data_dir, data_type, img_filename)
        img_paths.append(impath)
    return img_paths
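Because the paths follow coco.getImgIds() order, img_paths[i] corresponds to images[i] in the HDF5 file written in Example #4, which makes it easy to pair a raw file with its stored tensor, for example:

img_paths = get_image_paths(data_dir, data_type)
with h5py.File(get_hdf5_filename(data_dir, data_type), "r") as hdf5_file:
    print(img_paths[0], hdf5_file["images"][0].shape)  # same image: path and tensor shape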
Example #7
def create_only_captions_json(data_dir, data_types):
    """create json that only contains info about captions"""
    for data_type in data_types.values():
        ann_json_filename = get_annotations_json_filename(data_dir,
                                                          data_type,
                                                          prefix="cleaned")
        coco = COCO(ann_json_filename)
        json_data = {}
        json_data["anns"] = list(coco.anns.values())
        caption_json_filename = get_captions_json_filename(data_dir,
                                                           data_type,
                                                           prefix="cleaned")
        save_json(json_data, caption_json_filename)
Example #8
def create_image_hdf5_datasets_in_parallel_mpi(data_dir, data_types,
                                               captions_per_image):
    """create hdf5 dataset for image directory"""
    comm = MPI.COMM_WORLD
    num_processes = comm.size
    # The process ID (integer 0-3 for 4-process run)
    rank = comm.rank
    for data_type in data_types.values():
        json_filename = get_annotations_json_filename(data_dir,
                                                      data_type,
                                                      prefix="cleaned")
        coco = COCO(json_filename)
        img_ids = coco.getImgIds()

        hdf5_filename = get_hdf5_filename(data_dir, data_type)

        with h5py.File(hdf5_filename, "a", driver='mpio',
                       comm=comm) as hdf5_file:
            # Make a note of the number of captions we are sampling per image
            hdf5_file.attrs["captions_per_image"] = captions_per_image

            # Create dataset inside HDF5 file to store images
            dataset_shape = (len(img_ids), 3, 256, 256)
            images = hdf5_file.create_dataset("images",
                                              shape=dataset_shape,
                                              dtype="uint8")

            for i, img_id in enumerate(img_ids):
                if i % num_processes == rank:
                    img_filename = coco.imgs[img_id]["file_name"]
                    impath = get_img_filename(data_dir, data_type,
                                              img_filename)
                    img = get_transformed_img(impath)
                    # Save image to HDF5 file
                    images[i] = img
                    if i % 1000 == 0:
                        print("{}-th iteration is written".format(i))

        if rank == 0:
            print("{} is written".format(hdf5_filename))