import json
import os

import h5py
import matplotlib.pyplot as plt
import numpy as np
from mpi4py import MPI
from pycocotools.coco import COCO


def create_test_data(data_dir, data_types):
    """Move a split of samples from each dataset into a common test dataset."""
    destination_data_type = "test2017"
    destination_dir = get_imgs_dir_filename(data_dir, destination_data_type)
    # Create the test image directory if it does not exist
    if not os.path.exists(destination_dir):
        os.makedirs(destination_dir)
    destination_json_filename = get_annotations_json_filename(
        data_dir, destination_data_type, prefix="cleaned")
    destination_samples = {"imgs": [], "anns": []}
    for data_type in data_types.values():
        json_filename = get_annotations_json_filename(data_dir, data_type,
                                                      prefix="cleaned")
        splitted_samples = get_splitted_samples(json_filename)
        # Collect the test portion of this split
        destination_samples["imgs"].extend(
            splitted_samples["destination"]["imgs"])
        destination_samples["anns"].extend(
            splitted_samples["destination"]["anns"])
        # Save the remaining samples back to the original annotations file
        json_data = get_json_data(splitted_samples["origin"])
        save_json(json_data, json_filename)
        # Move the test images out of the original image directory
        origin_dir = get_imgs_dir_filename(data_dir, data_type)
        move_images(origin_dir, destination_dir,
                    splitted_samples["destination"]["imgs"])
    json_data = get_json_data(destination_samples)
    save_json(json_data, destination_json_filename)
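# get_splitted_samples is defined elsewhere in the repo. Based on how it is
# used above, it returns a dict with "origin" and "destination" parts, each
# holding "imgs" and "anns" lists. A minimal sketch of that contract follows;
# the "_sketch" name and the split fraction are assumptions for illustration.
def get_splitted_samples_sketch(json_filename, test_fraction=0.1):
    """Hypothetical sketch: split a cleaned annotations file into an origin
    (train/val) part and a destination (test) part."""
    coco = COCO(json_filename)
    img_ids = coco.getImgIds()
    n_test = int(len(img_ids) * test_fraction)  # assumed split size
    test_ids = set(np.random.choice(img_ids, n_test, replace=False).tolist())
    samples = {"origin": {"imgs": [], "anns": []},
               "destination": {"imgs": [], "anns": []}}
    for img_id in img_ids:
        key = "destination" if img_id in test_ids else "origin"
        samples[key]["imgs"].append(coco.imgs[img_id])
        samples[key]["anns"].extend(coco.imgToAnns[img_id])
    return samples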
def create_cleaned_annotations(data_dir, data_types):
    """Clean the annotations for each data split and save them under a
    "cleaned" prefix."""
    for data_type in data_types.values():
        old_json_filename = get_annotations_json_filename(data_dir, data_type)
        imgs_dir = get_imgs_dir_filename(data_dir, data_type)
        edited_json = get_edited_json(old_json_filename, imgs_dir)
        new_json_filename = get_annotations_json_filename(
            data_dir, data_type, prefix="cleaned")
        with open(new_json_filename, "w") as file:
            json.dump(edited_json, file)
def check_dataset(data_dir, data_type, captions_per_image):
    """Visually spot-check the HDF5 image dataset against its captions."""
    hdf5_filename = get_hdf5_filename(data_dir, data_type)
    json_filename = get_annotations_json_filename(data_dir, data_type,
                                                  prefix="cleaned")
    coco = COCO(json_filename)
    img_ids = coco.getImgIds()
    print("number of images in annotations: {}".format(len(img_ids)))
    captions_json_filename = get_captions_json_filename(data_dir, data_type,
                                                        prefix="cleaned")
    with open(captions_json_filename) as cap_file:
        captions = json.load(cap_file)["anns"]
    # Pick a few random captions to display against their images
    random_indices = np.random.choice(len(captions), 5, replace=False)
    rand_caps = [captions[idx] for idx in random_indices]
    # Captions are stored in groups of captions_per_image per image, so e.g.
    # with captions_per_image = 5, caption indices 0-4 all map to image 0
    random_img_indices = [idx // captions_per_image for idx in random_indices]
    with h5py.File(hdf5_filename, "r") as hdf5_file:
        images = hdf5_file["images"]
        print("number of images in HDF5 file: {}".format(len(images)))
        rand_images = [images[idx] for idx in random_img_indices]
    for i, img in enumerate(rand_images):
        # Images are stored channels-first (C, H, W); matplotlib expects (H, W, C)
        img = img.transpose((1, 2, 0))
        plt.imshow(img)
        plt.title(rand_caps[i]["caption"])
        plt.show()
def create_image_hdf5_datasets(data_dir, data_types, captions_per_image):
    """Create an HDF5 dataset of images for each data split."""
    for data_type in data_types.values():
        json_filename = get_annotations_json_filename(data_dir, data_type,
                                                      prefix="cleaned")
        coco = COCO(json_filename)
        img_ids = coco.getImgIds()
        hdf5_filename = get_hdf5_filename(data_dir, data_type)
        with h5py.File(hdf5_filename, "a") as hdf5_file:
            # Make a note of the number of captions we are sampling per image
            hdf5_file.attrs["captions_per_image"] = captions_per_image
            # Create dataset inside HDF5 file to store images
            dataset_shape = (len(img_ids), 3, 256, 256)
            images = hdf5_file.create_dataset("images", shape=dataset_shape,
                                              dtype="uint8",
                                              compression="gzip",
                                              compression_opts=9)
            for i, img_id in enumerate(img_ids):
                img_filename = coco.imgs[img_id]["file_name"]
                impath = get_img_filename(data_dir, data_type, img_filename)
                img = get_transformed_img(impath)
                # Save image to HDF5 file
                images[i] = img
                if i % 1000 == 0:
                    print("{}-th iteration is written".format(i))
        print("{} is written".format(hdf5_filename))
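# get_transformed_img is defined elsewhere in the repo. Given the dataset
# shape (3, 256, 256), the uint8 dtype, and the (1, 2, 0) transpose used for
# display in check_dataset, it presumably resizes each image to 256x256 and
# returns a channels-first uint8 array. A minimal sketch of that behaviour
# follows, assuming PIL is available; the "_sketch" body is an assumption,
# not the repo's actual implementation.
def get_transformed_img_sketch(impath):
    """Hypothetical sketch: load an image as a (3, 256, 256) uint8 array."""
    from PIL import Image  # assumed dependency

    img = Image.open(impath).convert("RGB")  # force 3 channels
    img = img.resize((256, 256))
    # PIL gives (H, W, C); the HDF5 dataset stores (C, H, W)
    return np.asarray(img, dtype="uint8").transpose((2, 0, 1))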
def create_trimmed_annotations(data_dir, data_types, captions_per_image):
    """Limit each image to captions_per_image captions and remove the excess
    entries from the cleaned JSON file."""
    for data_type in data_types.values():
        json_filename = get_annotations_json_filename(data_dir, data_type,
                                                      prefix="cleaned")
        samples = get_list_trimmed_annotations(json_filename,
                                               captions_per_image)
        json_data = get_json_data(samples)
        save_json(json_data, json_filename)
def get_image_paths(data_dir, data_type):
    """Extract image paths in the same order as the COCO image IDs."""
    json_filename = get_annotations_json_filename(data_dir, data_type,
                                                  prefix="cleaned")
    coco = COCO(json_filename)
    img_ids = coco.getImgIds()
    img_paths = []
    for img_id in img_ids:
        img_filename = coco.imgs[img_id]["file_name"]
        impath = get_img_filename(data_dir, data_type, img_filename)
        img_paths.append(impath)
    return img_paths
def create_only_captions_json(data_dir, data_types):
    """Create a JSON file that only contains info about captions."""
    for data_type in data_types.values():
        ann_json_filename = get_annotations_json_filename(data_dir, data_type,
                                                          prefix="cleaned")
        coco = COCO(ann_json_filename)
        json_data = {"anns": list(coco.anns.values())}
        caption_json_filename = get_captions_json_filename(data_dir, data_type,
                                                           prefix="cleaned")
        save_json(json_data, caption_json_filename)
def create_image_hdf5_datasets_in_parallel_mpi(data_dir, data_types,
                                               captions_per_image):
    """MPI-parallel variant of create_image_hdf5_datasets."""
    comm = MPI.COMM_WORLD
    num_processes = comm.size
    # The process ID (integer 0-3 for a 4-process run)
    rank = comm.rank
    for data_type in data_types.values():
        json_filename = get_annotations_json_filename(data_dir, data_type,
                                                      prefix="cleaned")
        coco = COCO(json_filename)
        img_ids = coco.getImgIds()
        hdf5_filename = get_hdf5_filename(data_dir, data_type)
        with h5py.File(hdf5_filename, "a", driver="mpio",
                       comm=comm) as hdf5_file:
            # Make a note of the number of captions we are sampling per image
            hdf5_file.attrs["captions_per_image"] = captions_per_image
            # Create dataset inside HDF5 file to store images. Compression is
            # omitted here: parallel HDF5 does not support writing through
            # compression filters.
            dataset_shape = (len(img_ids), 3, 256, 256)
            images = hdf5_file.create_dataset("images", shape=dataset_shape,
                                              dtype="uint8")
            # Stride the images across ranks: rank r handles indices
            # r, r + num_processes, r + 2 * num_processes, ...
            for i, img_id in enumerate(img_ids):
                if i % num_processes == rank:
                    img_filename = coco.imgs[img_id]["file_name"]
                    impath = get_img_filename(data_dir, data_type,
                                              img_filename)
                    img = get_transformed_img(impath)
                    # Save image to HDF5 file
                    images[i] = img
                    if i % 1000 == 0:
                        print("{}-th iteration is written".format(i))
        if rank == 0:
            print("{} is written".format(hdf5_filename))
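# A hypothetical driver showing the intended order of the pipeline steps
# above. The data_dir path, the data_types mapping, and captions_per_image
# are assumptions based on the standard COCO 2017 layout (the code above
# hard-codes "test2017"); adjust them to the actual repo configuration.
if __name__ == "__main__":
    data_dir = "data/coco"  # assumed dataset root
    data_types = {"train": "train2017", "val": "val2017"}  # assumed splits
    captions_per_image = 5  # assumed value

    create_cleaned_annotations(data_dir, data_types)
    create_test_data(data_dir, data_types)
    # Include the new test split in the remaining steps
    data_types["test"] = "test2017"
    create_trimmed_annotations(data_dir, data_types, captions_per_image)
    create_only_captions_json(data_dir, data_types)
    create_image_hdf5_datasets(data_dir, data_types, captions_per_image)
    # Alternatively, build the HDF5 files with several MPI ranks, e.g.:
    #   mpirun -n 4 python this_script.py
    # create_image_hdf5_datasets_in_parallel_mpi(data_dir, data_types,
    #                                            captions_per_image)
    for data_type in data_types.values():
        check_dataset(data_dir, data_type, captions_per_image)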