def main(image_directory: str, annotation_directory: str, output_path_training_split: str,
         output_path_validation_split: str, output_path_test_split: str, label_map_path: str,
         number_of_shards: int, included_classes: List[str]):
    """Export images and JSON annotations as sharded train/validation/test TFRecords.

    Args:
        image_directory: Directory searched recursively for ``*.jpg`` and ``*.png`` files.
        annotation_directory: Directory searched recursively for ``*.json`` files;
            paths containing ``dataset.json`` are ignored.
        output_path_training_split: Base path of the training shards.
        output_path_validation_split: Base path of the validation shards.
        output_path_test_split: Base path of the test shards.
        label_map_path: Path passed to ``label_map_util.get_label_map_dict``.
        number_of_shards: Number of shards opened for each of the three splits.
        included_classes: Forwarded to ``annotations_to_tf_example_list``.
    """
    # Create the parent directory of every output, not only the training one.
    # A bare filename has an empty dirname, which os.makedirs rejects.
    for output_path in (output_path_training_split,
                        output_path_validation_split,
                        output_path_test_split):
        output_directory = os.path.dirname(output_path)
        if output_directory:
            os.makedirs(output_directory, exist_ok=True)

    label_map_dict = label_map_util.get_label_map_dict(label_map_path)

    all_jpg_image_paths = glob(f"{image_directory}/**/*.jpg", recursive=True)
    all_png_image_paths = glob(f"{image_directory}/**/*.png", recursive=True)
    all_image_paths = all_jpg_image_paths + all_png_image_paths
    all_annotation_paths = glob(f"{annotation_directory}/**/*.json", recursive=True)

    # Filter out the dataset.json files, which are complete dataset annotations
    all_annotation_paths = [a for a in all_annotation_paths if "dataset.json" not in a]

    training_sample_indices, validation_sample_indices, test_sample_indices = \
        get_training_validation_test_indices(all_image_paths)

    # Sorting both lists is what pairs each image with its annotation below.
    all_annotation_paths = sorted(all_annotation_paths)
    all_image_paths = sorted(all_image_paths)

    if len(all_image_paths) != len(all_annotation_paths):
        print("Not every image has annotations")

    for annotation_path, image_path in zip(all_annotation_paths, all_image_paths):
        if os.path.splitext(os.path.basename(image_path))[0] not in annotation_path:
            print("Invalid annotations detected: {0}, {1}".format(image_path, annotation_path))

    print(f"Exporting\n"
          f"- {len(training_sample_indices)} training samples\n"
          f"- {len(validation_sample_indices)} validation samples\n"
          f"- {len(test_sample_indices)} test samples")

    with contextlib2.ExitStack() as tf_record_close_stack:
        training_tf_records = tf_record_creation_util.open_sharded_output_tfrecords(
            tf_record_close_stack, output_path_training_split, number_of_shards)
        validation_tf_records = tf_record_creation_util.open_sharded_output_tfrecords(
            tf_record_close_stack, output_path_validation_split, number_of_shards)
        test_tf_records = tf_record_creation_util.open_sharded_output_tfrecords(
            tf_record_close_stack, output_path_test_split, number_of_shards)

        index = 0
        for tf_example in annotations_to_tf_example_list(all_image_paths, all_annotation_paths,
                                                         label_map_dict, included_classes):
            shard_index = index % number_of_shards
            # NOTE(review): index is incremented before the membership tests, so
            # the tests see 1..N while the shard is picked from 0..N-1. If
            # get_training_validation_test_indices returns 0-based indices, the
            # first sample is never written - confirm against that helper.
            index += 1

            if index in training_sample_indices:
                training_tf_records[shard_index].write(tf_example.SerializeToString())
            elif index in validation_sample_indices:
                validation_tf_records[shard_index].write(tf_example.SerializeToString())
            elif index in test_sample_indices:
                test_tf_records[shard_index].write(tf_example.SerializeToString())
def create_tf_record(output_filename, num_shards, label_map_dict, annotations_dir, image_dir, examples):
    """Creates a TFRecord file from examples.

    Args:
      output_filename: Path to where output file is saved.
      num_shards: Number of shards for output file.
      label_map_dict: The label map dictionary.
      annotations_dir: Directory where the ``<example>.png`` mask files are stored.
      image_dir: Directory where the ``<example>.jpg`` image files are stored.
      examples: Examples to parse and save to tf record.
    """
    with contextlib2.ExitStack() as tf_record_close_stack:
        output_tfrecords = tf_record_creation_util.open_sharded_output_tfrecords(
            tf_record_close_stack, output_filename, num_shards)
        for idx, example in enumerate(examples):
            if idx % 100 == 0:
                logging.info('On image %d of %d', idx, len(examples))
            mask_path = os.path.join(annotations_dir, example + '.png')
            image_path = os.path.join(image_dir, example + '.jpg')
            try:
                tf_example = dict_to_tf_example(example, mask_path, label_map_dict, image_path)
                if tf_example:
                    # Round-robin over the shards by example position.
                    shard_idx = idx % num_shards
                    output_tfrecords[shard_idx].write(
                        tf_example.SerializeToString())
                    print("done")
            except ValueError:
                # The handler used to log `xml_path`, which is never defined in
                # this function, so a bad example raised NameError instead.
                logging.warning('Invalid example: %s, ignoring.', example)
def main(image_dir, csv_input, output_path, num_shards=1):
    """Convert the annotations in a CSV file into sharded TFRecord files.

    Rows are grouped by ``'filename'`` via ``split``; each group is converted
    with ``create_tf_example`` and written round-robin across the shards.

    Args:
      image_dir: Passed to ``create_tf_example`` as the image path.
      csv_input: CSV file read with ``pd.read_csv``.
      output_path: Base path of the sharded output.
      num_shards: Number of output shards.
    """
    annotations = pd.read_csv(csv_input)
    groups = split(annotations, 'filename')

    with contextlib2.ExitStack() as close_stack:
        shard_writers = tf_record_creation_util.open_sharded_output_tfrecords(
            close_stack, output_path, num_shards)
        for position, group in enumerate(groups):
            tf_example = create_tf_example(group, image_dir)
            shard_writers[position % num_shards].write(
                tf_example.SerializeToString())
            converted = position + 1
            if converted % 100 == 0:
                # Print progress once for every 100 converted examples.
                print(converted)

    print('Successfully created the TFRecords: {}'.format(output_path))
def write_tf_record_shard(self, path_tf_record, path_yaml, dir_yaml_data, num_shards):
    """Convert the examples listed in a YAML file into sharded TFRecord files.

    Args:
      path_tf_record: Base path of the sharded output.
      path_yaml: YAML file holding the list of examples.
      dir_yaml_data: Path whose parent directory the example file names are
        resolved against.
      num_shards: Number of output shards.

    Raises:
      ValueError: If ``self.input_type`` is neither the Bosch nor the Sloth type.
    """
    is_bosch = self.input_type == self.INPUT_TYPES[IDX_BOSCH]
    is_sloth = self.input_type == self.INPUT_TYPES[IDX_SLOTH]
    # The original computed this key but then always read 'path'.
    key_file = 'filename' if (is_sloth and not is_bosch) else 'path'

    with contextlib2.ExitStack() as tf_record_close_stack:
        output_tfrecords = tf_record_creation_util.open_sharded_output_tfrecords(
            tf_record_close_stack, path_tf_record, num_shards)

        # NOTE(review): yaml.load can construct arbitrary Python objects and
        # recent PyYAML releases require an explicit Loader; switch to
        # yaml.safe_load if the annotation files are plain data - confirm.
        with open(path_yaml, 'rb') as yaml_file:
            examples = yaml.load(yaml_file.read())

        count = 0
        for example in examples:
            print("example:", example)

            # Fall back to 'path' so inputs that worked before still work.
            filename = example[key_file] if key_file in example else example['path']
            filename = os.path.abspath(
                os.path.join(os.path.dirname(dir_yaml_data), filename))

            if not os.path.exists(filename):
                print(filename, " does not exist.")
                continue

            count = count + 1
            # MAX_COUNT_DATA == 0 means "no limit".
            if MAX_COUNT_DATA != 0 and count > MAX_COUNT_DATA:
                break

            if is_bosch:
                tf_example = self.create_tf_example_bosch(example, filename)
            elif is_sloth:
                tf_example = self.create_tf_example_sloth(example, filename)
            else:
                # Previously tf_example stayed None and the write below failed
                # with an unrelated AttributeError.
                raise ValueError(
                    'Unsupported input type: {}'.format(self.input_type))

            output_shard_index = count % num_shards
            output_tfrecords[output_shard_index].write(tf_example.SerializeToString())
def convert_to_tf_records(images_data: List[ImageData], label_map: Dict[str, int], filepath: Union[str, Path],
                          num_workers: int = 1, num_shards: int = 2, max_pictures_per_worker: int = 1000,
                          use_thumbnail: Tuple[int, int] = None):
    """Convert image data to tf records and write them to sharded files.

    Args:
        images_data: Items converted one by one with ``tf_record_from_image_data``.
        label_map: Forwarded to ``tf_record_from_image_data``.
        filepath: Base path of the sharded output.
        num_workers: ``n_jobs`` of the ``Parallel`` pool.
        num_shards: Number of output shards.
        max_pictures_per_worker: Maximum number of items converted per
            ``Parallel`` batch.
        use_thumbnail: Forwarded to ``tf_record_from_image_data``.

    Raises:
        ValueError: If ``max_pictures_per_worker`` is smaller than 1.
    """
    logger.info('Create tf_records from data.')
    if max_pictures_per_worker < 1:
        raise ValueError('max_pictures_per_worker must be at least 1')

    # np.array_split(data, n) produces n chunks, not chunks of n items, so the
    # old code made max_pictures_per_worker (mostly tiny or empty) batches.
    # Plain slicing yields batches of at most that size and keeps the items
    # as-is instead of coercing them into a numpy array.
    data_chunks = [images_data[start:start + max_pictures_per_worker]
                   for start in range(0, len(images_data), max_pictures_per_worker)]

    with contextlib2.ExitStack() as tf_record_close_stack:
        output_tfrecords = creation_util.open_sharded_output_tfrecords(
            exit_stack=tf_record_close_stack,
            base_path=filepath,
            num_shards=num_shards)
        # Running counter across all chunks: restarting at 0 for every chunk
        # over-filled the low-numbered shards.
        written = 0
        for data_chunk in tqdm(data_chunks):
            tf_records = Parallel(n_jobs=num_workers)(
                delayed(tf_record_from_image_data)(image_data=image_data,
                                                   label_map=label_map,
                                                   use_thumbnail=use_thumbnail)
                for image_data in data_chunk)
            for tf_record in tf_records:
                output_shard_index = written % num_shards
                output_tfrecords[output_shard_index].write(
                    tf_record.SerializeToString())
                written += 1

    logger.info(
        f"tf_records saved to {filepath}-?????-of-{str(num_shards).zfill(5)}.")
def create_tf_record(output_filename, num_shards, label_map_dict, image_dir, examples):
    """Creates a TFRecord file from examples.

    Args:
      output_filename: Path to where output file is saved.
      num_shards: Number of shards for output file.
      label_map_dict: The label map dictionary.
      image_dir: Directory where image files are stored.
      examples: Examples to parse and save to tf record.
    """
    with contextlib2.ExitStack() as tf_record_close_stack:
        output_tfrecords = tf_record_creation_util.open_sharded_output_tfrecords(
            tf_record_close_stack, output_filename, num_shards)
        for idx, example in enumerate(examples):
            if idx % 100 == 0:
                logging.info('On image %d of %d', idx, len(examples))
            try:
                tf_example = dict_to_tf_example(example, label_map_dict, image_dir)
                if tf_example:
                    # Round-robin over the shards by example position.
                    shard_idx = idx % num_shards
                    output_tfrecords[shard_idx].write(
                        tf_example.SerializeToString())
            except ValueError:
                # `xml_path` does not exist in this function; logging it turned
                # every invalid example into a NameError.
                logging.warning('Invalid example: %s, ignoring.', example)
def main(_):
    """Write the Person/Car images of the train and test sets to sharded TFRecords.

    For each dataset directory every ``Person/*.jpg`` and ``Car/*.jpg`` file is
    turned into an image-info object (labelled by its parent folder name),
    converted with ``create_tf_example`` and spread over 50 shards.
    """
    class_ids = dict(Person=1, Car=2)
    dataset_dirs = [
        Path('./object_detection/smart_spy/dataset/train'),
        Path('./object_detection/smart_spy/dataset/test'),
    ]

    for dataset_dir in dataset_dirs:
        shard_count = 50
        record_base = f"./object_detection/smart_spy/dataset/tfrecords/{dataset_dir.name}/tf.record"

        image_infos = []
        for image_path in chain(dataset_dir.glob("Person/*.jpg"), dataset_dir.glob("Car/*.jpg")):
            # The parent folder name doubles as the class name.
            image_infos.append(
                to_image_info(image_path, image_path.parent.name, class_ids[image_path.parent.name]))

        with contextlib2.ExitStack() as close_stack:
            writers = tf_record_creation_util.open_sharded_output_tfrecords(
                close_stack, record_base, shard_count)
            for position, image_info in enumerate(image_infos):
                tf_example = create_tf_example(image_info)
                writers[position % shard_count].write(tf_example.SerializeToString())
def main(_):
    """Convert the CSV annotations named by the flags into 100 TFRecord shards.

    Reads ``FLAGS.csv_input``, groups the rows by ``'filename'`` via ``split``
    and writes one tf.Example per group, round-robin, to shards based on
    ``FLAGS.output_path``.
    """
    # The original also opened tf.python_io.TFRecordWriter(FLAGS.output_path)
    # here; it was never written to or closed, leaking a handle and leaving an
    # empty file next to the shards. Only the sharded writers are used now.
    path = os.path.join(FLAGS.image_dir)
    examples = pd.read_csv(FLAGS.csv_input)
    grouped = split(examples, 'filename')

    num_shards = 100
    with contextlib2.ExitStack() as tf_record_close_stack:
        output_tfrecords = tf_record_creation_util.open_sharded_output_tfrecords(
            tf_record_close_stack, FLAGS.output_path, num_shards)
        for index, example in enumerate(grouped):
            tf_example = create_tf_example(example, path)
            output_shard_index = index % num_shards
            output_tfrecords[output_shard_index].write(
                tf_example.SerializeToString())
def create_tf_record(image_dir_path, annotations_dir_path, tfrecord_dir_path, label_map_dict, images_filename):
    """Write one tf.Example per image name into sharded TFRecord files.

    Args:
      image_dir_path: Directory holding the ``<name>.jpg`` images.
      annotations_dir_path: Directory containing the ``FLAGS.masks_dir`` and
        ``FLAGS.xmls_dir`` sub-directories.
      tfrecord_dir_path: Directory the ``FLAGS.tfrecord_filename`` shards go to.
      label_map_dict: The label map dictionary.
      images_filename: Image names (without extension) to convert.
    """
    masks_root = os.path.join(annotations_dir_path, FLAGS.masks_dir)
    xmls_root = os.path.join(annotations_dir_path, FLAGS.xmls_dir)

    with contextlib2.ExitStack() as close_stack:
        record_base = os.path.join(tfrecord_dir_path, FLAGS.tfrecord_filename)
        writers = tf_record_creation_util.open_sharded_output_tfrecords(
            close_stack, record_base, FLAGS.num_shards)

        for position, name in enumerate(images_filename):
            if position % 100 == 0:
                logging.info('On image %d of %d', position, len(images_filename))

            image_path = os.path.join(image_dir_path, name + '.jpg')
            mask_path = os.path.join(masks_root, name + '.png')
            xml_path = os.path.join(xmls_root, name + '.xml')

            try:
                tf_example = image_to_tf_data(image_path, mask_path, xml_path,
                                              label_map_dict, name)
                if not tf_example:
                    continue
                # Spread the examples round-robin over the shards.
                writers[position % FLAGS.num_shards].write(
                    tf_example.SerializeToString())
                logging.info('done')
            except ValueError:
                logging.warning('Invalid example: %s, ignoring.', image_path)
def create_tf_record_from_images_list(images, annotations_index, dataset_base_dir, category_index,
                                      original_category_index, output_path):
    """Write the given images and their annotations to sharded TFRecord files.

    The shard count is ``1 + len(images) // FLAGS.images_per_shard``. Images
    missing from ``annotations_index`` get an empty list stored there. Images
    for which ``create_tf_example`` returns no example are counted and skipped.

    Args:
      images: Image dicts, each with an ``'id'`` key.
      annotations_index: Dict from image id to a list of annotations.
      dataset_base_dir: Forwarded to ``create_tf_example``.
      category_index: Forwarded to ``create_tf_example``.
      original_category_index: Forwarded to ``create_tf_example``.
      output_path: Base path of the sharded output.
    """
    shard_count = 1 + (len(images) // FLAGS.images_per_shard)
    skipped_annotations = 0
    skipped_empty_annotations = 0
    skipped_images = 0

    with contextlib2.ExitStack() as close_stack:
        writers = tf_record_creation_util.open_sharded_output_tfrecords(
            close_stack, output_path, shard_count)

        for position, image in enumerate(images):
            # Registers an empty annotation list for unseen images.
            image_annotations = annotations_index.setdefault(image['id'], [])
            tf_example, annot_skipped, empty_annot_skipped = create_tf_example(
                image, dataset_base_dir, image_annotations, category_index,
                original_category_index)
            skipped_annotations += annot_skipped
            skipped_empty_annotations += empty_annot_skipped

            if tf_example is None:
                skipped_images += 1
                continue
            writers[position % shard_count].write(tf_example.SerializeToString())

    tf.compat.v1.logging.info('Finished writing, skipped %d bboxes.', skipped_annotations)
    tf.compat.v1.logging.info('Skipped %d bboxes on empty images.', skipped_empty_annotations)
    tf.compat.v1.logging.info('%d images not found.', skipped_images)
def run(images_path, description_file, output_path, no_bbox=False):
    """Convert the examples of a description file into TFRecord output.

    Args:
      images_path: Forwarded to the example parser.
      description_file: Text file the parser reads examples from.
      output_path: Output file (``no_bbox``) or base path of the 10 shards.
      no_bbox: If True, parse with ``parse_test_example`` into a single
        record file; otherwise parse with ``parse_example`` into 10 shards.
    """
    i = 0
    # The description file used to be opened and never closed.
    with open(description_file) as f:
        if no_bbox:
            writer = tf.python_io.TFRecordWriter(output_path)
            try:
                while True:
                    try:
                        tf_example = parse_test_example(f, images_path)
                        writer.write(tf_example.SerializeToString())
                        i += 1
                    except IOError:
                        # IOError ends the loop; presumably the parser raises
                        # it at end of input - confirm against the parser.
                        break
            finally:
                # Close the writer even when parsing fails unexpectedly.
                writer.close()
        else:
            num_shards = 10
            print("Processing {}".format(images_path))
            with contextlib2.ExitStack() as tf_record_close_stack:
                output_tfrecords = tf_record_creation_util.open_sharded_output_tfrecords(
                    tf_record_close_stack, output_path, num_shards)
                while True:
                    try:
                        tf_example = parse_example(f, images_path)
                        output_shard_index = i % num_shards
                        output_tfrecords[output_shard_index].write(
                            tf_example.SerializeToString())
                        i += 1
                    except IOError:
                        break
    print("Correctly created record for {} images\n".format(i))
def _create_tf_record_from_bdd_annotations(annotations_file, image_dir, output_path, num_shards):
    """Convert a BDD annotation JSON file into sharded tf.Record files.

    Args:
      annotations_file: JSON file containing bounding box annotations.
      image_dir: Directory containing the image files.
      output_path: Base path of the output tf.Record shards.
      num_shards: Number of output file shards.
    """
    with contextlib2.ExitStack() as close_stack, \
            tf.gfile.GFile(annotations_file, 'r') as annotations_fid:
        writers = tf_record_creation_util.open_sharded_output_tfrecords(
            close_stack, output_path, num_shards)
        frames = json.load(annotations_fid)
        category_index = label_map_util.create_category_index_from_labelmap(
            'bdd_label_map.pbtxt', use_display_name=False)
        # Only used as the denominator of the progress message below.
        image_files = os.listdir(image_dir)

        skipped_total = 0
        for position, frame in enumerate(frames):
            if position % 100 == 0:
                tf.logging.info('On image {} of {}'.format(
                    position + 1, len(image_files)))
            tf_example, skipped = create_tf_example(frame, image_dir, category_index)
            skipped_total += skipped
            writers[position % num_shards].write(tf_example.SerializeToString())

        tf.logging.info('Finished writing, skipped {} annotations.'.format(
            skipped_total))
def _create_tf_record_from_coco_annotations(annotations_file, image_dir, output_path, include_masks, num_shards):
    """Convert a COCO-style annotation JSON file into sharded tf.Record files.

    Args:
      annotations_file: JSON file with ``'images'``, ``'categories'`` and,
        optionally, ``'annotations'`` entries.
      image_dir: Forwarded to ``create_tf_example``.
      output_path: Base path of the output tf.Record shards.
      include_masks: Forwarded to ``create_tf_example``.
      num_shards: Number of output file shards.
    """
    with contextlib2.ExitStack() as tf_record_close_stack, tf.gfile.GFile(
            annotations_file, 'r') as fid:
        output_tfrecords = tf_record_creation_util.open_sharded_output_tfrecords(
            tf_record_close_stack, output_path, num_shards)
        groundtruth_data = json.load(fid)
        images = groundtruth_data['images']
        category_index = label_map_util.create_category_index(
            groundtruth_data['categories'])

        # Group the annotations by the image they belong to.
        annotations_index = {}
        if 'annotations' in groundtruth_data:
            for annotation in groundtruth_data['annotations']:
                image_id = annotation['image_id']
                if image_id not in annotations_index:
                    annotations_index[image_id] = []
                annotations_index[image_id].append(annotation)

        for idx, image in enumerate(images):
            # Images without any annotation (or files without an 'annotations'
            # section) used to raise KeyError here; treat them as empty.
            annotations_list = annotations_index.get(image['id'], [])
            tf_example = create_tf_example(image, annotations_list, image_dir,
                                           category_index, include_masks)
            shard_idx = idx % num_shards
            output_tfrecords[shard_idx].write(tf_example.SerializeToString())
def process_list(path, output_path, num_shards, min_size, do_write=True):
    """Parse a face annotation list and optionally write it to sharded TFRecords.

    The list is read as repeated records: an image path line, a box-count line,
    then one line per box whose first four integers form the box. A record with
    a count of zero is still followed by one line, which is skipped.

    Updates the module globals ``total_faces`` (every box seen) and
    ``num_filtered`` (boxes rejected by ``min_size``).

    Args:
      path: Annotation list file.
      output_path: Base path of the sharded output (used when ``do_write``).
      num_shards: Number of output shards (used when ``do_write``).
      min_size: Boxes are kept only if both their third and fourth values
        exceed this; 0 keeps every box.
      do_write: If True write the TFRecords, otherwise return the examples.

    Returns:
      ``None`` when ``do_write`` is True, else a list of
      ``(image_path, boxes)`` tuples.
    """
    global total_faces
    global num_filtered

    # Read through a context manager; the handle used to be left open.
    with open(path) as list_file:
        content = [line.strip() for line in list_file]

    examples = []
    i = 0
    while i < len(content):
        # A separate name keeps the `path` parameter from being overwritten.
        image_path = os.path.join(r'X:\wider-face\WIDER_all\images', content[i])
        num_boxes = int(content[i + 1])
        first_box = i + 2

        boxes = []
        for box_line in range(first_box, first_box + num_boxes):
            total_faces += 1
            box = [int(a) for a in content[box_line].split()[:4]]
            if (min_size == 0) or (box[2] > min_size and box[3] > min_size):
                boxes += [box]
            else:
                num_filtered += 1

        # A record always occupies at least one line after the count, even
        # when the count is zero.
        i = first_box + max(num_boxes, 1)
        examples += [(image_path, boxes)]

    if do_write:
        with contextlib2.ExitStack() as tf_record_close_stack:
            output_tfrecords = tf_record_creation_util.open_sharded_output_tfrecords(
                tf_record_close_stack, output_path, num_shards)
            for index, example in tqdm(enumerate(examples), total=len(examples)):
                tf_example = create_tf_example(example)
                output_shard_index = index % num_shards
                output_tfrecords[output_shard_index].write(
                    tf_example.SerializeToString())
    else:
        return examples
def create_tf_record(output_filename, num_shards, label_map_dict, annotations_dir, image_dir, examples):
    """Create sharded TFRecord files from XML-annotated examples.

    Args:
      output_filename: Base path of the sharded output.
      num_shards: Number of output shards.
      label_map_dict: The label map dictionary.
      annotations_dir: Directory whose ``xmls`` sub-directory holds
        ``<example>.xml``.
      image_dir: Forwarded to ``dict_to_tf_example``.
      examples: Example names; examples without an XML file are skipped.
    """
    with contextlib2.ExitStack() as close_stack:
        writers = tf_record_creation_util.open_sharded_output_tfrecords(
            close_stack, output_filename, num_shards)

        for position, name in enumerate(examples):
            if position % 100 == 0:
                logging.info('On image %d of %d', position, len(examples))

            xml_path = os.path.join(annotations_dir, 'xmls', name + '.xml')
            if not os.path.exists(xml_path):
                logging.warning('Could not find %s, ignoring example.', xml_path)
                continue

            with tf.gfile.GFile(xml_path, 'r') as xml_file:
                xml_text = xml_file.read()
            annotation = dataset_util.recursive_parse_xml_to_dict(
                etree.fromstring(xml_text))['annotation']

            try:
                tf_example = dict_to_tf_example(annotation, label_map_dict, image_dir)
                if not tf_example:
                    continue
                # Round-robin over the shards by example position.
                writers[position % num_shards].write(tf_example.SerializeToString())
            except ValueError:
                logging.warning('Invalid example: %s, ignoring.', xml_path)
def main(_):
    """Re-shard the records under ``FLAGS.path_to_records`` into new files.

    Serialized examples are pulled from ``read_records`` one at a time and
    written to ``FLAGS.num_shards`` shards based on ``FLAGS.output_file``
    until the reader raises ``tf.errors.OutOfRangeError``.
    """
    source_files = list_records(FLAGS.path_to_records)
    logging.debug("Number of records to be processed: {}".format(
        len(source_files)))
    next_example = read_records(source_files, shuffle=FLAGS.shuffle)

    # To maximize file I/O throughput, split the training data into pieces.
    with contextlib2.ExitStack() as close_stack:
        writers = tf_record_creation_util.open_sharded_output_tfrecords(
            close_stack, FLAGS.output_file, FLAGS.num_shards)
        with tf.Session() as session:
            processed = 0
            exhausted = False
            while not exhausted:
                try:
                    serialized = session.run(next_example)
                    # The counter is advanced before the shard is chosen, so
                    # the first sample lands in shard 1 % num_shards.
                    processed += 1
                    writers[processed % FLAGS.num_shards].write(serialized)
                    logging.debug("Samples processed: {}".format(processed))
                except tf.errors.OutOfRangeError:
                    exhausted = True
def create_tf_record(output_filename, num_shards, label_map_dict, annotations_dir, image_dir, examples,
                     faces_only=True, mask_type='png'):
    """Create sharded TFRecord files from JSON-annotated examples.

    Args:
      output_filename: Base path of the sharded output.
      num_shards: Number of output shards.
      label_map_dict: The label map dictionary.
      annotations_dir: Directory holding one ``.json`` file per example, named
        after the example with its last dot-separated part removed.
      image_dir: Forwarded to ``dict_to_tf_example``.
      examples: Example file names to parse and save.
      faces_only: Forwarded to ``dict_to_tf_example``.
      mask_type: Forwarded to ``dict_to_tf_example``.
    """
    with contextlib2.ExitStack() as close_stack:
        writers = tf_record_creation_util.open_sharded_output_tfrecords(
            close_stack, output_filename, num_shards)

        for position, example in enumerate(examples):
            if position % 100 == 0:
                logging.info('On image %d of %d', position, len(examples))

            # Drop the final dot-separated component (the file extension).
            stem = '.'.join(example.split('.')[:-1])
            json_file = os.path.join(annotations_dir, stem + '.json')
            with open(json_file) as annotation_file:
                json_data = json.load(annotation_file)

            try:
                tf_example = dict_to_tf_example(json_data,
                                                label_map_dict,
                                                image_dir,
                                                example,
                                                faces_only=faces_only,
                                                mask_type=mask_type)
                if not tf_example:
                    continue
                writers[position % num_shards].write(
                    tf_example.SerializeToString())
            except ValueError:
                logging.warning('Invalid example: %s, ignoring.', json_file)
def _create_tf_record_from_wfs_annotations(annotations_file, image_root_dir, output_path, include_masks, num_shards):
    """Convert a COCO-style annotation file into sharded tf.Record files.

    Each image is looked up in a sub-directory of ``image_root_dir`` derived,
    via ``get_directory_name``, from the category name of its first annotation.

    Args:
      annotations_file: JSON file with ``'images'``, ``'categories'`` and,
        optionally, ``'annotations'`` entries.
      image_root_dir: Root directory containing the per-category folders.
      output_path: Base path of the output tf.Record shards.
      include_masks: Forwarded to ``create_tf_example``.
      num_shards: Number of output file shards.
    """
    with contextlib2.ExitStack() as tf_record_close_stack, tf.gfile.GFile(
            annotations_file, 'r') as fid:
        output_tfrecords = tf_record_creation_util.open_sharded_output_tfrecords(
            tf_record_close_stack, output_path, num_shards)
        groundtruth_data = json.load(fid)
        images = groundtruth_data['images']
        category_index = label_map_util.create_category_index(
            groundtruth_data['categories'])

        annotations_index = {}
        if 'annotations' in groundtruth_data:
            tf.logging.info(
                'Found groundtruth annotations. Building annotations index.')
            for annotation in groundtruth_data['annotations']:
                image_id = annotation['image_id']
                if image_id not in annotations_index:
                    annotations_index[image_id] = []
                annotations_index[image_id].append(annotation)

        missing_annotation_count = 0
        for image in images:
            image_id = image['id']
            if image_id not in annotations_index:
                missing_annotation_count += 1
                annotations_index[image_id] = []
        tf.logging.info('%d images are missing annotations.',
                        missing_annotation_count)

        total_num_annotations_skipped = 0
        not_found_images = 0
        for idx, image in enumerate(images):
            annotations_list = annotations_index[image['id']]
            if not annotations_list:
                # The image folder is derived from the first annotation, so an
                # image given an empty list above cannot be located. Indexing
                # [0] used to raise IndexError and abort the conversion.
                tf.logging.warning(
                    'Image %s has no annotations, skipping.', image['id'])
                continue

            category_name = category_index[
                annotations_list[0]['category_id']]['name']
            category_folder = get_directory_name(category_name)
            image_dir = os.path.join(image_root_dir, category_folder)

            result, tf_example, num_annotations_skipped = create_tf_example(
                image, annotations_list, image_dir, category_index, include_masks)
            if result is None:
                not_found_images += 1
                continue

            total_num_annotations_skipped += num_annotations_skipped
            shard_idx = idx % num_shards
            output_tfrecords[shard_idx].write(tf_example.SerializeToString())

        print("\n")
        tf.logging.info('Finished writing, skipped %d annotations.',
                        total_num_annotations_skipped)
def create_tf_record(
    output_filename,
    num_shards,
    label_map_dict,
    annotations_dir,
    image_dir,
    examples,
    faces_only=True,
    mask_type="png",
):
    """Create sharded TFRecord files from XML- and mask-annotated examples.

    Args:
      output_filename: Base path of the sharded output.
      num_shards: Number of output shards.
      label_map_dict: The label map dictionary.
      annotations_dir: Directory with ``xmls/<example>.xml`` and
        ``trimaps/<example>.png`` files.
      image_dir: Forwarded to ``dict_to_tf_example``.
      examples: Example names; examples without an XML file are skipped.
      faces_only: Forwarded to ``dict_to_tf_example``.
      mask_type: Forwarded to ``dict_to_tf_example``.
    """
    xml_root = os.path.join(annotations_dir, "xmls")
    mask_root = os.path.join(annotations_dir, "trimaps")

    with contextlib2.ExitStack() as close_stack:
        writers = tf_record_creation_util.open_sharded_output_tfrecords(
            close_stack, output_filename, num_shards)

        for position, name in enumerate(examples):
            if position % 100 == 0:
                logging.info("On image %d of %d", position, len(examples))

            xml_path = os.path.join(xml_root, name + ".xml")
            mask_path = os.path.join(mask_root, name + ".png")
            if not os.path.exists(xml_path):
                logging.warning("Could not find %s, ignoring example.", xml_path)
                continue

            with tf.gfile.GFile(xml_path, "r") as xml_file:
                xml_text = xml_file.read()
            parsed = dataset_util.recursive_parse_xml_to_dict(etree.fromstring(xml_text))
            annotation = parsed["annotation"]

            try:
                tf_example = dict_to_tf_example(
                    annotation,
                    mask_path,
                    label_map_dict,
                    image_dir,
                    faces_only=faces_only,
                    mask_type=mask_type,
                )
                if not tf_example:
                    continue
                # Round-robin over the shards by example position.
                writers[position % num_shards].write(
                    tf_example.SerializeToString())
            except ValueError:
                logging.warning("Invalid example: %s, ignoring.", xml_path)
def write_records(examples, output_path, num_shards):
    """Convert every example and write it round-robin to sharded TFRecords.

    Args:
      examples: Sized collection of inputs for ``create_tf_example``.
      output_path: Base path of the sharded output.
      num_shards: Number of output shards.
    """
    with contextlib2.ExitStack() as close_stack:
        writers = tf_record_creation_util.open_sharded_output_tfrecords(
            close_stack, output_path, num_shards)
        progress = tqdm(enumerate(examples), total=len(examples))
        for position, item in progress:
            tf_example = create_tf_example(item)
            writers[position % num_shards].write(tf_example.SerializeToString())
def encode_to_tfr_record(test_feature, out_tfr_file, num_shards=100):
    """Convert every feature and write it round-robin to sharded TFRecords.

    Args:
      test_feature: Iterable of inputs for ``create_tf_example``.
      out_tfr_file: Base path of the sharded output.
      num_shards: Number of output shards. Defaults to 100, the value that
        used to be hard-coded, so existing two-argument calls are unchanged.
    """
    with contextlib2.ExitStack() as tf_record_close_stack:
        output_tfrecords = tf_record_creation_util.open_sharded_output_tfrecords(
            tf_record_close_stack, out_tfr_file, num_shards)
        for index, example in enumerate(test_feature):
            tf_example = create_tf_example(example)
            output_shard_index = index % num_shards
            output_tfrecords[output_shard_index].write(
                tf_example.SerializeToString())
def _create_tf_record_from_coco_annotations(annotations_file, image_dir, output_path, include_masks, num_shards):
    """Loads a COCO annotation json file and converts it to tf.Record shards.

    Images for which ``create_tf_example`` reports ``is_empty`` are not written.

    Args:
      annotations_file: JSON file containing bounding box annotations.
      image_dir: Directory containing the image files.
      output_path: Base path of the output tf.Record shards.
      include_masks: Forwarded to ``create_tf_example``.
      num_shards: Number of output file shards.
    """
    with contextlib2.ExitStack() as close_stack, \
            tf.gfile.GFile(annotations_file, 'r') as annotations_fid:
        writers = tf_record_creation_util.open_sharded_output_tfrecords(
            close_stack, output_path, num_shards)
        groundtruth_data = json.load(annotations_fid)
        images = groundtruth_data['images']
        category_index = label_map_util.create_category_index(
            groundtruth_data['categories'])

        # Group the annotations by the image they belong to.
        annotations_index = {}
        if 'annotations' in groundtruth_data:
            tf.logging.info(
                'Found groundtruth annotations. Building annotations index.')
            for annotation in groundtruth_data['annotations']:
                annotations_index.setdefault(
                    annotation['image_id'], []).append(annotation)

        # Give every image an entry so the lookup below cannot fail.
        missing_annotation_count = 0
        for image in images:
            if image['id'] in annotations_index:
                continue
            missing_annotation_count += 1
            annotations_index[image['id']] = []
        tf.logging.info('%d images are missing annotations.',
                        missing_annotation_count)

        skipped_total = 0
        for position, image in enumerate(images):
            if position % 100 == 0:
                tf.logging.info('On image %d of %d', position, len(images))
            _, tf_example, skipped, is_empty = create_tf_example(
                image, annotations_index[image['id']], image_dir,
                category_index, include_masks)
            skipped_total += skipped
            if is_empty:
                continue
            writers[position % num_shards].write(
                tf_example.SerializeToString())

        tf.logging.info('Finished writing, skipped %d annotations.',
                        skipped_total)
def main(_):
    """Convert the box (and optional label) annotation CSVs to sharded TFRecords.

    The box annotations, the ids of all ``*.jpg`` files in
    ``FLAGS.input_images_directory`` and, when given, the image-level label
    annotations are concatenated and grouped by ``'ImageID'``. Each group is
    converted together with its encoded image and written to the shard chosen
    by the image id, read as a hexadecimal number, modulo ``FLAGS.num_shards``.

    Raises:
      ValueError: If one of the required flags is not set.
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    for required_flag in ('input_box_annotations_csv',
                          'input_images_directory',
                          'input_label_map',
                          'output_tf_record_path_prefix'):
        if not getattr(FLAGS, required_flag):
            raise ValueError('Flag --{} is required'.format(required_flag))

    label_map = label_map_util.get_label_map_dict(FLAGS.input_label_map)
    box_annotations = pd.read_csv(FLAGS.input_box_annotations_csv)

    label_annotations = None
    if FLAGS.input_image_label_annotations_csv:
        label_annotations = pd.read_csv(
            FLAGS.input_image_label_annotations_csv)
        label_annotations.rename(
            columns={'Confidence': 'ConfidenceImageLabel'}, inplace=True)

    image_paths = tf.gfile.Glob(
        os.path.join(FLAGS.input_images_directory, '*.jpg'))
    image_ids = pd.DataFrame({
        'ImageID': [
            os.path.splitext(os.path.basename(image_path))[0]
            for image_path in image_paths
        ]
    })
    all_annotations = pd.concat(
        [box_annotations, image_ids, label_annotations])

    tf.logging.log(tf.logging.INFO, 'Found %d images...', len(image_ids))

    with contextlib2.ExitStack() as close_stack:
        writers = tf_record_creation_util.open_sharded_output_tfrecords(
            close_stack, FLAGS.output_tf_record_path_prefix,
            FLAGS.num_shards)

        grouped = all_annotations.groupby('ImageID')
        for counter, (image_id, image_annotations) in enumerate(grouped):
            tf.logging.log_every_n(tf.logging.INFO, 'Processed %d images...', 1000,
                                   counter)

            # In OID image file names are formed by appending ".jpg" to the image ID.
            image_path = os.path.join(FLAGS.input_images_directory,
                                      image_id + '.jpg')
            with tf.gfile.Open(image_path) as image_file:
                encoded_image = image_file.read()

            tf_example = oid_tfrecord_creation.tf_example_from_annotations_data_frame(
                image_annotations, label_map, encoded_image)
            if not tf_example:
                continue
            writers[int(image_id, 16) % FLAGS.num_shards].write(
                tf_example.SerializeToString())
def _create_tf_record_from_rsna_set(parsed, pids, num_shards, record_name):
    """Write one tf.Example per patient id to sharded TFRecord files.

    Args:
      parsed: Mapping from patient id to a dict with ``'label'`` and
        ``'boxes'`` entries.
      pids: Patient ids to export.
      num_shards: Number of output shards.
      record_name: Output file name, placed under ``FLAGS.output_dir``.
    """
    with contextlib2.ExitStack() as close_stack:
        record_base = os.path.join(FLAGS.output_dir, record_name)
        writers = tf_record_creation_util.open_sharded_output_tfrecords(
            close_stack, record_base, num_shards)
        for position, pid in enumerate(pids):
            entry = parsed[pid]
            tf_example = create_tf_example(
                FLAGS.dicom_dir, pid, entry['label'], entry['boxes'])
            writers[position % num_shards].write(tf_example.SerializeToString())
def shard_tf(output_filebase, examples, num_shards=10):
    """Convert every example and write it round-robin to sharded TFRecords.

    Progress is printed on a single, carriage-return-terminated console line.

    Args:
      output_filebase: Base path of the sharded output.
      examples: Sized collection of inputs for ``create_tf_example``.
      num_shards: Number of output shards.
    """
    total = len(examples)
    with contextlib2.ExitStack() as close_stack:
        writers = tf_record_creation_util.open_sharded_output_tfrecords(
            close_stack, output_filebase, num_shards)
        for position, item in enumerate(examples):
            print("Creating example {}/{}".format(position, total), end="\r")
            tf_example = create_tf_example(item)
            writers[position % num_shards].write(tf_example.SerializeToString())
def create_tf_record(output_filename, file_pars):
    """Write the given (data, label) pairs to a single-shard TFRecord file.

    Args:
      output_filename: Base path of the output; exactly one shard is opened.
      file_pars: Iterable of ``(data, label)`` pairs for ``dict_to_tf_example``.
    """
    with contextlib2.ExitStack() as tf_record_close_stack:
        output_tfrecords = tf_record_creation_util.open_sharded_output_tfrecords(
            tf_record_close_stack, output_filename, 1)
        for index, (data, label) in enumerate(file_pars):
            try:
                tf_example = dict_to_tf_example(data, label)
                if tf_example:
                    output_tfrecords[0].write(tf_example.SerializeToString())
            except ValueError:
                # The handler used to log `xml_path`, which is not defined in
                # this function; identify the pair by position and label.
                logging.warning('Invalid example: %s, ignoring.',
                                '#{} (label {})'.format(index, label))
def create_tf_record(output_filename, num_shards, label_map_dict, annotations_dir, image_dir, examples,
                     faces_only=True, mask_type='png'):
    """Creates sharded TFRecord files from XML- and mask-annotated examples.

    Args:
      output_filename: Base path of the sharded output.
      num_shards: Number of output shards.
      label_map_dict: The label map dictionary.
      annotations_dir: Directory with ``xmls/<example>.xml`` and
        ``trimaps/<example>.png`` files.
      image_dir: Forwarded to ``dict_to_tf_example``.
      examples: Example names; examples without an XML file are skipped.
      faces_only: Forwarded to ``dict_to_tf_example``.
      mask_type: Forwarded to ``dict_to_tf_example``.
    """
    with contextlib2.ExitStack() as close_stack:
        shard_writers = tf_record_creation_util.open_sharded_output_tfrecords(
            close_stack, output_filename, num_shards)

        for example_index, example_name in enumerate(examples):
            if example_index % 100 == 0:
                logging.info('On image %d of %d', example_index, len(examples))

            xml_path = os.path.join(annotations_dir, 'xmls', example_name + '.xml')
            mask_path = os.path.join(annotations_dir, 'trimaps', example_name + '.png')

            if os.path.exists(xml_path):
                with tf.gfile.GFile(xml_path, 'r') as xml_file:
                    xml_source = xml_file.read()
                xml_tree = etree.fromstring(xml_source)
                annotation = dataset_util.recursive_parse_xml_to_dict(xml_tree)['annotation']

                try:
                    tf_example = dict_to_tf_example(
                        annotation,
                        mask_path,
                        label_map_dict,
                        image_dir,
                        faces_only=faces_only,
                        mask_type=mask_type)
                    if tf_example:
                        # Round-robin over the shards by example position.
                        shard_writers[example_index % num_shards].write(
                            tf_example.SerializeToString())
                except ValueError:
                    logging.warning('Invalid example: %s, ignoring.', xml_path)
            else:
                logging.warning('Could not find %s, ignoring example.', xml_path)
def create_tf_record(output_filename,
                     num_shards,
                     label_map_dict,
                     annotations_dir,
                     image_dir,
                     examples,
                     mask_type='png'):
    """Converts annotated example files into a sharded TFRecord file.

    Each entry of `examples` is a file name; its extension is stripped to
    obtain the example name. The XML annotation under
    `<annotations_dir>/xmls` is parsed and passed, together with the trimap
    path under `<annotations_dir>/trimaps`, to dict_to_tf_example. Examples
    without an XML file, or whose conversion raises ValueError, are logged
    and skipped.

    Args:
      output_filename: Path to where output file is saved.
      num_shards: Number of shards for output file.
      label_map_dict: The label map dictionary.
      annotations_dir: Directory where annotation files are stored.
      image_dir: Directory where image files are stored.
      examples: Examples to parse and save to tf record.
      mask_type: 'numerical' or 'png'. 'png' is recommended because it leads
        to smaller file sizes.
    """
    with contextlib2.ExitStack() as tf_record_close_stack:
        shard_writers = tf_record_creation_util.open_sharded_output_tfrecords(
            tf_record_close_stack, output_filename, num_shards)

        for position, file_name in enumerate(examples):
            if not position % 100:
                logging.info('On image %d of %d', position, len(examples))

            stem, _ = os.path.splitext(file_name)
            annotation_file = os.path.join(
                annotations_dir, 'xmls', stem + '.xml')
            trimap_file = os.path.join(
                annotations_dir, 'trimaps', stem + '.png')

            if os.path.exists(annotation_file):
                with tf.gfile.GFile(annotation_file, 'r') as fid:
                    raw_xml = fid.read()
                root = etree.fromstring(raw_xml)
                parsed = dataset_util.recursive_parse_xml_to_dict(root)
                data = parsed['annotation']

                print('xml_path:', annotation_file)

                try:
                    tf_example = dict_to_tf_example(
                        data,
                        trimap_file,
                        label_map_dict,
                        image_dir,
                        mask_type=mask_type)
                    if tf_example:
                        # Round-robin on the example's position so shards
                        # stay balanced.
                        writer = shard_writers[position % num_shards]
                        writer.write(tf_example.SerializeToString())
                except ValueError:
                    logging.warning(
                        'Invalid example: %s, ignoring.', annotation_file)
            else:
                logging.warning(
                    'Could not find %s, ignoring example.', annotation_file)
def test_sharded_tfrecord_writes(self):
    """Writes one record into each of 10 shards and reads every shard back."""
    base_path = os.path.join(tf.test.get_temp_dir(), 'test.tfrec')

    with contextlib2.ExitStack() as tf_record_close_stack:
        shard_writers = tf_record_creation_util.open_sharded_output_tfrecords(
            tf_record_close_stack, base_path, 10)
        for shard in range(10):
            payload = 'test_{}'.format(shard)
            shard_writers[shard].write(payload)

    # Read back only after the ExitStack has closed every writer.
    for shard in range(10):
        shard_path = '{}-{:05d}-of-00010'.format(base_path, shard)
        expected = ['test_{}'.format(shard)]
        found = list(tf.python_io.tf_record_iterator(shard_path))
        self.assertAllEqual(found, expected)
def _create_tf_record_from_coco_annotations(
        annotations_file, image_dir, output_path, include_masks, num_shards):
    """Converts a COCO annotation JSON file into sharded tf.Record files.

    Annotations are grouped by their 'image_id'. Images without any
    annotation are counted, logged, and still written with an empty
    annotation list.

    Args:
      annotations_file: JSON file containing bounding box annotations.
      image_dir: Directory containing the image files.
      output_path: Path to output tf.Record file.
      include_masks: Whether to include instance segmentations masks
        (PNG encoded) in the result.
      num_shards: number of output file shards.
    """
    with contextlib2.ExitStack() as tf_record_close_stack, \
            tf.gfile.GFile(annotations_file, 'r') as fid:
        shard_writers = tf_record_creation_util.open_sharded_output_tfrecords(
            tf_record_close_stack, output_path, num_shards)

        groundtruth_data = json.load(fid)
        images = groundtruth_data['images']
        category_index = label_map_util.create_category_index(
            groundtruth_data['categories'])

        # Map image id -> list of that image's annotations.
        annotations_index = {}
        if 'annotations' in groundtruth_data:
            tf.logging.info(
                'Found groundtruth annotations. Building annotations index.')
            for entry in groundtruth_data['annotations']:
                annotations_index.setdefault(
                    entry['image_id'], []).append(entry)

        # Give every unannotated image an empty list so the lookup in the
        # write loop below cannot fail.
        missing_annotation_count = 0
        for image in images:
            if image['id'] in annotations_index:
                continue
            missing_annotation_count += 1
            annotations_index[image['id']] = []
        tf.logging.info('%d images are missing annotations.',
                        missing_annotation_count)

        skipped_total = 0
        for position, image in enumerate(images):
            if not position % 100:
                tf.logging.info('On image %d of %d', position, len(images))
            _, tf_example, skipped_here = create_tf_example(
                image, annotations_index[image['id']], image_dir,
                category_index, include_masks)
            skipped_total += skipped_here
            writer = shard_writers[position % num_shards]
            writer.write(tf_example.SerializeToString())
        tf.logging.info('Finished writing, skipped %d annotations.',
                        skipped_total)
def gen_tfrecord(panda_df, output_path, num_shards=10,
                 image_dir="./images/raw"):
    """Creates a sharded TFRecord of the given dataframe at `output_path`.

    The dataframe is grouped by its 'filename' column via split(); each group
    is converted with create_tf_example and written round-robin across the
    shards.

    Args:
      panda_df: Dataframe passed to split() for grouping by 'filename'.
      output_path: Path to where the output shards are saved.
      num_shards: Number of shards for the output file.
      image_dir: Second argument handed to create_tf_example for every group.
        Defaults to "./images/raw".
    """
    with contextlib2.ExitStack() as tf_record_close_stack:
        writer = tf_record_creation_util.open_sharded_output_tfrecords(
            tf_record_close_stack, output_path, num_shards)
        grouped = split(panda_df, 'filename')
        for idx, group in enumerate(grouped):
            if idx % 100 == 0:
                print(f"On image {idx} of {len(grouped)}")
            tf_example = create_tf_example(group, image_dir)
            shard_idx = idx % num_shards
            writer[shard_idx].write(tf_example.SerializeToString())
        print("Successfully creates the TFRecords: {}".format(output_path))
def create_tf_record(output_filename,
                     num_shards,
                     label_map_dict,
                     annotations_dir,
                     image_dir,
                     examples,
                     use_alt_names):
    """Converts annotated examples into a sharded TFRecord file.

    For each example name, the XML annotation under `<annotations_dir>/xmls`
    is parsed and passed to dict_to_tf_example. Examples without an XML
    file, or whose conversion raises ValueError, are logged and skipped.

    Args:
      output_filename: Path to where output file is saved.
      num_shards: Number of shards for output file.
      label_map_dict: The label map dictionary.
      annotations_dir: Directory where annotation files are stored.
      image_dir: Directory where image files are stored.
      examples: Examples to parse and save to tf record.
      use_alt_names: use alternative class name mapping.
    """
    with contextlib2.ExitStack() as tf_record_close_stack:
        shard_writers = tf_record_creation_util.open_sharded_output_tfrecords(
            tf_record_close_stack, output_filename, num_shards)

        for position, name in enumerate(examples):
            if not position % 10:
                logger.info('On image %d of %d', position, len(examples))

            annotation_file = os.path.join(
                annotations_dir, 'xmls', name + '.xml')

            if os.path.exists(annotation_file):
                with tf.io.gfile.GFile(annotation_file, 'r') as fid:
                    raw_xml = fid.read()
                root = etree.fromstring(raw_xml)
                parsed = dataset_util.recursive_parse_xml_to_dict(root)
                data = parsed['annotation']

                try:
                    tf_example = dict_to_tf_example(
                        data=data,
                        label_map_dict=label_map_dict,
                        image_subdirectory=image_dir,
                        use_alt_names=use_alt_names)
                    if tf_example:
                        # Round-robin on the example's position so shards
                        # stay balanced.
                        writer = shard_writers[position % num_shards]
                        writer.write(tf_example.SerializeToString())
                except ValueError:
                    logger.warning(
                        'Invalid example: %s, ignoring.', annotation_file)
            else:
                logger.warning(
                    'Could not find %s, ignoring example.', annotation_file)
def main(_):
    """Builds sharded TFRecords from box/label annotation CSVs and images.

    Reads the box annotation CSV (and the optional image-label CSV), groups
    all rows by 'ImageID', loads `<ImageID>.jpg` from the images directory
    for each group, and writes the resulting tf.Example to one of
    FLAGS.num_shards output files.

    Args:
      _: Unused positional argument (presumably argv from the app runner;
        verify against the caller).

    Raises:
      ValueError: If one of the required flags is unset.
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    required_flags = [
        'input_box_annotations_csv', 'input_images_directory',
        'input_label_map', 'output_tf_record_path_prefix'
    ]
    for flag_name in required_flags:
        if not getattr(FLAGS, flag_name):
            raise ValueError('Flag --{} is required'.format(flag_name))

    label_map = label_map_util.get_label_map_dict(FLAGS.input_label_map)
    all_box_annotations = pd.read_csv(FLAGS.input_box_annotations_csv)
    if FLAGS.input_image_label_annotations_csv:
        all_label_annotations = pd.read_csv(
            FLAGS.input_image_label_annotations_csv)
        all_label_annotations.rename(
            columns={'Confidence': 'ConfidenceImageLabel'}, inplace=True)
    else:
        all_label_annotations = None

    all_images = tf.gfile.Glob(
        os.path.join(FLAGS.input_images_directory, '*.jpg'))
    all_image_ids = [
        os.path.splitext(os.path.basename(v))[0] for v in all_images
    ]
    all_image_ids = pd.DataFrame({'ImageID': all_image_ids})
    # pd.concat silently drops None entries, so an absent label CSV is simply
    # left out of the combined frame.
    all_annotations = pd.concat(
        [all_box_annotations, all_image_ids, all_label_annotations])

    tf.logging.log(tf.logging.INFO, 'Found %d images...', len(all_image_ids))

    with contextlib2.ExitStack() as tf_record_close_stack:
        output_tfrecords = tf_record_creation_util.open_sharded_output_tfrecords(
            tf_record_close_stack, FLAGS.output_tf_record_path_prefix,
            FLAGS.num_shards)

        for counter, image_data in enumerate(
                all_annotations.groupby('ImageID')):
            tf.logging.log_every_n(tf.logging.INFO, 'Processed %d images...',
                                   1000, counter)

            image_id, image_annotations = image_data
            # In OID image file names are formed by appending ".jpg" to the
            # image ID.
            image_path = os.path.join(FLAGS.input_images_directory,
                                      image_id + '.jpg')
            # Binary mode: a text-mode read would try to decode the JPEG
            # bytes as text and fail under Python 3.
            with tf.gfile.Open(image_path, 'rb') as image_file:
                encoded_image = image_file.read()

            tf_example = (
                oid_tfrecord_creation.tf_example_from_annotations_data_frame(
                    image_annotations, label_map, encoded_image))
            if tf_example:
                # The image ID is parsed as a hexadecimal number to pick the
                # shard.
                shard_idx = int(image_id, 16) % FLAGS.num_shards
                output_tfrecords[shard_idx].write(
                    tf_example.SerializeToString())
def create_tf_record(output_filename, num_shards, examples):
    """Serializes annotated JPEG images into a sharded TFRecord file.

    Image files are looked up under `read_bucket` and their boxes are taken
    from `annotations[example]`; both names are defined outside this
    function. Examples whose image file does not exist are skipped.

    Args:
      output_filename: Path to where output file is saved.
      num_shards: Number of shards for output file.
      examples: Image file names, joined onto `read_bucket`.

    Raises:
      ValueError: If an existing image file is not in JPEG format.
    """
    with contextlib2.ExitStack() as tf_record_close_stack:
        shard_writers = tf_record_creation_util.open_sharded_output_tfrecords(
            tf_record_close_stack, output_filename, num_shards)

        for position, example in enumerate(examples):
            image_file = os.path.join(read_bucket, example)
            if not os.path.isfile(image_file):
                continue

            with tf.gfile.GFile(image_file, 'rb') as fid:
                encoded_jpg = fid.read()
            image = PIL.Image.open(io.BytesIO(encoded_jpg))
            if image.format != 'JPEG':
                raise ValueError('Image format not JPEG')
            key = hashlib.sha256(encoded_jpg).hexdigest()
            width, height = image.size

            xmins, xmaxs, ymins, ymaxs = [], [], [], []
            classes_text = []  # 'coke', 'pepsi', 'coke'...
            classes = []  # 1, 2, 1...
            difficult_obj, truncated, poses = [], [], []

            for box in annotations[example]:
                xmins.append(box['x'])
                xmaxs.append(box['x2'])
                ymins.append(box['y'])
                ymaxs.append(box['y2'])
                classes_text.append(box['label'].encode('utf8'))
                # Temporary: every box gets class id 1 until labels are
                # mapped to actual ids.
                classes.append(1)
                difficult_obj.append(0)
                truncated.append(0)
                poses.append(''.encode('utf8'))

            try:
                feature_dict = {
                    'image/height':
                        dataset_util.int64_feature(height),
                    'image/width':
                        dataset_util.int64_feature(width),
                    'image/filename':
                        dataset_util.bytes_feature(example.encode('utf8')),
                    'image/source_id':
                        dataset_util.bytes_feature(example.encode('utf8')),
                    'image/key/sha256':
                        dataset_util.bytes_feature(key.encode('utf8')),
                    'image/encoded':
                        dataset_util.bytes_feature(encoded_jpg),
                    'image/format':
                        dataset_util.bytes_feature('jpeg'.encode('utf8')),
                    'image/object/bbox/xmin':
                        dataset_util.float_list_feature(xmins),
                    'image/object/bbox/xmax':
                        dataset_util.float_list_feature(xmaxs),
                    'image/object/bbox/ymin':
                        dataset_util.float_list_feature(ymins),
                    'image/object/bbox/ymax':
                        dataset_util.float_list_feature(ymaxs),
                    'image/object/class/text':
                        dataset_util.bytes_list_feature(classes_text),
                    'image/object/class/label':
                        dataset_util.int64_list_feature(classes),
                    'image/object/difficult':
                        dataset_util.int64_list_feature(difficult_obj),
                    'image/object/truncated':
                        dataset_util.int64_list_feature(truncated),
                    'image/object/view':
                        dataset_util.bytes_list_feature(poses),
                }
                tf_example = tf.train.Example(
                    features=tf.train.Features(feature=feature_dict))
                if tf_example:
                    # Round-robin on the example's position so shards stay
                    # balanced.
                    writer = shard_writers[position % num_shards]
                    writer.write(tf_example.SerializeToString())
            except ValueError:
                print('Invalid example, ignoring.')