def _make_serialized_tf_sequence_example(self):
  num_frames = 4
  image_height = 20
  image_width = 30
  image_source_ids = [str(i) for i in range(num_frames)]
  encoded_images = self._make_random_serialized_jpeg_images(
      num_frames, image_height, image_width)
  sequence_example_serialized = seq_example_util.make_sequence_example(
      dataset_name='video_dataset',
      video_id='video',
      encoded_images=encoded_images,
      image_height=image_height,
      image_width=image_width,
      image_source_ids=image_source_ids,
      image_format='JPEG',
      is_annotated=[[1], [1], [1], [1]],
      bboxes=[
          [[]],  # Frame 0.
          [[0., 0., 1., 1.]],  # Frame 1.
          [[0., 0., 1., 1.], [0.1, 0.1, 0.2, 0.2]],  # Frame 2.
          [[]],  # Frame 3.
      ],
      label_strings=[
          [],  # Frame 0.
          ['Abyssinian'],  # Frame 1.
          ['Abyssinian', 'american_bulldog'],  # Frame 2.
          [],  # Frame 3.
      ]).SerializeToString()
  return sequence_example_serialized
def create_tf_record_sequence_example(self):
  path = os.path.join(self.get_temp_dir(), 'seq_tfrecord')
  writer = tf.python_io.TFRecordWriter(path)
  num_frames = 4
  image_height = 4
  image_width = 5
  image_source_ids = [str(i) for i in range(num_frames)]
  with self.test_session():
    encoded_images = self._make_random_serialized_jpeg_images(
        num_frames, image_height, image_width)
    sequence_example_serialized = seq_example_util.make_sequence_example(
        dataset_name='video_dataset',
        video_id='video',
        encoded_images=encoded_images,
        image_height=image_height,
        image_width=image_width,
        image_source_ids=image_source_ids,
        image_format='JPEG',
        is_annotated=[[1], [1], [1], [1]],
        bboxes=[
            [[]],  # Frame 0.
            [[0., 0., 1., 1.]],  # Frame 1.
            [[0., 0., 1., 1.], [0.1, 0.1, 0.2, 0.2]],  # Frame 2.
            [[]],  # Frame 3.
        ],
        label_strings=[
            [],  # Frame 0.
            ['Abyssinian'],  # Frame 1.
            ['Abyssinian', 'american_bulldog'],  # Frame 2.
            [],  # Frame 3.
        ]).SerializeToString()
    writer.write(sequence_example_serialized)
  writer.close()
  return path
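
# A minimal sketch (not part of the original tests) of how the TFRecord
# written above could be read back for inspection. It assumes the same
# TF 1.x-style API family as the writer above; `path` is the value
# returned by create_tf_record_sequence_example.
def _read_back_sequence_examples(path):
  """Yields tf.train.SequenceExample protos parsed from a TFRecord file."""
  for serialized in tf.python_io.tf_record_iterator(path):
    seq_example = tf.train.SequenceExample()
    seq_example.ParseFromString(serialized)
    yield seq_example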
def graph_fn():
  label_map_proto_file = os.path.join(self.get_temp_dir(), 'labelmap.pbtxt')
  self._create_label_map(label_map_proto_file)
  decoder = tf_sequence_example_decoder.TfSequenceExampleDecoder(
      label_map_proto_file=label_map_proto_file)
  sequence_example_serialized = seq_example_util.make_sequence_example(
      dataset_name='video_dataset',
      video_id='video',
      encoded_images=encoded_images,
      image_height=image_height,
      image_width=image_width,
      image_format='JPEG',
      image_source_ids=[str(i) for i in range(num_frames)],
      is_annotated=[[1], [1], [1], [1]],
      bboxes=[
          [[0., 0., 1., 1.]],  # Frame 0.
          [[0.2, 0.2, 1., 1.], [0., 0., 1., 1.]],  # Frame 1.
          [[0., 0., 1., 1.],  # Frame 2.
           [0.1, 0.1, 0.2, 0.2]],
          [[]],  # Frame 3.
      ],
      label_strings=[
          ['fox'],  # Frame 0. Fox will be filtered out.
          ['fox', 'dog'],  # Frame 1. Fox will be filtered out.
          ['dog', 'cat'],  # Frame 2.
          [],  # Frame 3.
      ]).SerializeToString()
  example_string_tensor = tf.convert_to_tensor(sequence_example_serialized)
  return decoder.decode(example_string_tensor)
def graph_fn():
  sequence_example_serialized = seq_example_util.make_sequence_example(
      dataset_name='video_dataset',
      video_id='video',
      encoded_images=encoded_images,
      image_height=image_height,
      image_width=image_width,
      image_format='JPEG',
      image_source_ids=[str(i) for i in range(num_frames)],
      bboxes=[
          [[]],  # Frame 0.
          [[]],  # Frame 1.
          [[]],  # Frame 2.
          [[]],  # Frame 3.
      ],
      label_strings=[
          [],  # Frame 0.
          [],  # Frame 1.
          [],  # Frame 2.
          [],  # Frame 3.
      ]).SerializeToString()
  example_string_tensor = tf.convert_to_tensor(sequence_example_serialized)
  label_map_proto_file = os.path.join(self.get_temp_dir(), 'labelmap.pbtxt')
  self._create_label_map(label_map_proto_file)
  decoder = tf_sequence_example_decoder.TfSequenceExampleDecoder(
      label_map_proto_file=label_map_proto_file)
  return decoder.decode(example_string_tensor)
def test_make_unlabeled_example(self):
  num_frames = 5
  image_height = 100
  image_width = 200
  dataset_name = b'unlabeled_dataset'
  video_id = b'video_000'
  images = tf.cast(tf.random.uniform(
      [num_frames, image_height, image_width, 3],
      maxval=256,
      dtype=tf.int32), dtype=tf.uint8)
  image_source_ids = [str(idx) for idx in range(num_frames)]
  images_list = tf.unstack(images, axis=0)
  encoded_images_list = [tf.io.encode_jpeg(image) for image in images_list]
  encoded_images = self.materialize_tensors(encoded_images_list)
  seq_example = seq_example_util.make_sequence_example(
      dataset_name=dataset_name,
      video_id=video_id,
      encoded_images=encoded_images,
      image_height=image_height,
      image_width=image_width,
      image_format='JPEG',
      image_source_ids=image_source_ids)

  context_feature_dict = seq_example.context.feature
  self.assertEqual(
      dataset_name,
      context_feature_dict['example/dataset_name'].bytes_list.value[0])
  self.assertEqual(
      0,
      context_feature_dict['clip/start/timestamp'].int64_list.value[0])
  self.assertEqual(
      num_frames - 1,
      context_feature_dict['clip/end/timestamp'].int64_list.value[0])
  self.assertEqual(
      num_frames,
      context_feature_dict['clip/frames'].int64_list.value[0])
  self.assertEqual(
      3,
      context_feature_dict['image/channels'].int64_list.value[0])
  self.assertEqual(
      b'JPEG',
      context_feature_dict['image/format'].bytes_list.value[0])
  self.assertEqual(
      image_height,
      context_feature_dict['image/height'].int64_list.value[0])
  self.assertEqual(
      image_width,
      context_feature_dict['image/width'].int64_list.value[0])
  self.assertEqual(
      video_id,
      context_feature_dict['clip/media_id'].bytes_list.value[0])

  seq_feature_dict = seq_example.feature_lists.feature_list
  self.assertLen(
      seq_feature_dict['image/encoded'].feature[:], num_frames)
  timestamps = [
      feature.int64_list.value[0]
      for feature in seq_feature_dict['image/timestamp'].feature]
  self.assertAllEqual(list(range(num_frames)), timestamps)
  source_ids = [
      feature.bytes_list.value[0]
      for feature in seq_feature_dict['image/source_id'].feature]
  self.assertAllEqual(
      [six.ensure_binary(str(idx)) for idx in range(num_frames)],
      source_ids)
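
# A minimal sketch (not part of the original tests) showing how a serialized
# SequenceExample produced by make_sequence_example could be parsed back into
# tensors. The feature keys mirror the ones asserted on above; the decoder
# actually used by the detection pipeline is TfSequenceExampleDecoder (see
# graph_fn above).
def _parse_sequence_example(serialized):
  """Parses a serialized SequenceExample into (context, sequence) tensors."""
  context_features = {
      'clip/media_id': tf.io.FixedLenFeature([], dtype=tf.string),
      'clip/frames': tf.io.FixedLenFeature([], dtype=tf.int64),
  }
  sequence_features = {
      'image/encoded': tf.io.FixedLenSequenceFeature([], dtype=tf.string),
      'image/timestamp': tf.io.FixedLenSequenceFeature([], dtype=tf.int64),
  }
  return tf.io.parse_single_sequence_example(
      serialized,
      context_features=context_features,
      sequence_features=sequence_features)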
def test_make_labeled_example_with_predictions(self):
  num_frames = 2
  image_height = 100
  image_width = 200
  dataset_name = b'unlabeled_dataset'
  video_id = b'video_000'
  images = tf.cast(tf.random.uniform(
      [num_frames, image_height, image_width, 3],
      maxval=256,
      dtype=tf.int32), dtype=tf.uint8)
  images_list = tf.unstack(images, axis=0)
  encoded_images_list = [tf.io.encode_jpeg(image) for image in images_list]
  encoded_images = self.materialize_tensors(encoded_images_list)
  bboxes = [
      np.array([[0., 0., 0.75, 0.75],
                [0., 0., 1., 1.]], dtype=np.float32),
      np.array([[0., 0.25, 0.5, 0.75]], dtype=np.float32)
  ]
  label_strings = [
      np.array(['cat', 'frog']),
      np.array(['cat'])
  ]
  detection_bboxes = [
      np.array([[0., 0., 0.75, 0.75]], dtype=np.float32),
      np.zeros([0, 4], dtype=np.float32)
  ]
  detection_classes = [
      np.array([5], dtype=np.int64),
      np.array([], dtype=np.int64)
  ]
  detection_scores = [
      np.array([0.9], dtype=np.float32),
      np.array([], dtype=np.float32)
  ]
  seq_example = seq_example_util.make_sequence_example(
      dataset_name=dataset_name,
      video_id=video_id,
      encoded_images=encoded_images,
      image_height=image_height,
      image_width=image_width,
      bboxes=bboxes,
      label_strings=label_strings,
      detection_bboxes=detection_bboxes,
      detection_classes=detection_classes,
      detection_scores=detection_scores)

  context_feature_dict = seq_example.context.feature
  self.assertEqual(
      dataset_name,
      context_feature_dict['example/dataset_name'].bytes_list.value[0])
  self.assertEqual(
      0,
      context_feature_dict['clip/start/timestamp'].int64_list.value[0])
  self.assertEqual(
      1,
      context_feature_dict['clip/end/timestamp'].int64_list.value[0])
  self.assertEqual(
      num_frames,
      context_feature_dict['clip/frames'].int64_list.value[0])

  seq_feature_dict = seq_example.feature_lists.feature_list
  self.assertLen(
      seq_feature_dict['image/encoded'].feature[:], num_frames)
  actual_timestamps = [
      feature.int64_list.value[0]
      for feature in seq_feature_dict['image/timestamp'].feature]
  self.assertAllEqual([0, 1], actual_timestamps)

  # Frame 0.
  self.assertAllEqual(
      1,
      seq_feature_dict['region/is_annotated'].feature[0].int64_list.value[0])
  self.assertAllClose(
      [0., 0.],
      seq_feature_dict['region/bbox/ymin'].feature[0].float_list.value[:])
  self.assertAllClose(
      [0., 0.],
      seq_feature_dict['region/bbox/xmin'].feature[0].float_list.value[:])
  self.assertAllClose(
      [0.75, 1.],
      seq_feature_dict['region/bbox/ymax'].feature[0].float_list.value[:])
  self.assertAllClose(
      [0.75, 1.],
      seq_feature_dict['region/bbox/xmax'].feature[0].float_list.value[:])
  self.assertAllEqual(
      [b'cat', b'frog'],
      seq_feature_dict['region/label/string'].feature[0].bytes_list.value[:])
  self.assertAllClose(
      [0.],
      seq_feature_dict[
          'predicted/region/bbox/ymin'].feature[0].float_list.value[:])
  self.assertAllClose(
      [0.],
      seq_feature_dict[
          'predicted/region/bbox/xmin'].feature[0].float_list.value[:])
  self.assertAllClose(
      [0.75],
      seq_feature_dict[
          'predicted/region/bbox/ymax'].feature[0].float_list.value[:])
  self.assertAllClose(
      [0.75],
      seq_feature_dict[
          'predicted/region/bbox/xmax'].feature[0].float_list.value[:])
  self.assertAllEqual(
      [5],
      seq_feature_dict[
          'predicted/region/label/index'].feature[0].int64_list.value[:])
  self.assertAllClose(
      [0.9],
      seq_feature_dict[
          'predicted/region/label/confidence'].feature[0].float_list.value[:])

  # Frame 1.
  self.assertAllEqual(
      1,
      seq_feature_dict['region/is_annotated'].feature[1].int64_list.value[0])
  self.assertAllClose(
      [0.0],
      seq_feature_dict['region/bbox/ymin'].feature[1].float_list.value[:])
  self.assertAllClose(
      [0.25],
      seq_feature_dict['region/bbox/xmin'].feature[1].float_list.value[:])
  self.assertAllClose(
      [0.5],
      seq_feature_dict['region/bbox/ymax'].feature[1].float_list.value[:])
  self.assertAllClose(
      [0.75],
      seq_feature_dict['region/bbox/xmax'].feature[1].float_list.value[:])
  self.assertAllEqual(
      [b'cat'],
      seq_feature_dict['region/label/string'].feature[1].bytes_list.value[:])
  self.assertAllClose(
      [],
      seq_feature_dict[
          'predicted/region/bbox/ymin'].feature[1].float_list.value[:])
  self.assertAllClose(
      [],
      seq_feature_dict[
          'predicted/region/bbox/xmin'].feature[1].float_list.value[:])
  self.assertAllClose(
      [],
      seq_feature_dict[
          'predicted/region/bbox/ymax'].feature[1].float_list.value[:])
  self.assertAllClose(
      [],
      seq_feature_dict[
          'predicted/region/bbox/xmax'].feature[1].float_list.value[:])
  self.assertAllEqual(
      [],
      seq_feature_dict[
          'predicted/region/label/index'].feature[1].int64_list.value[:])
  self.assertAllClose(
      [],
      seq_feature_dict[
          'predicted/region/label/confidence'].feature[1].float_list.value[:])
def test_make_labeled_example_with_context_features(self):
  num_frames = 2
  image_height = 100
  image_width = 200
  dataset_name = b'unlabeled_dataset'
  video_id = b'video_000'
  labels = [b'dog', b'cat']
  images = tf.cast(tf.random.uniform(
      [num_frames, image_height, image_width, 3],
      maxval=256,
      dtype=tf.int32), dtype=tf.uint8)
  images_list = tf.unstack(images, axis=0)
  encoded_images_list = [tf.io.encode_jpeg(image) for image in images_list]
  encoded_images = self.materialize_tensors(encoded_images_list)
  timestamps = [100000, 110000]
  is_annotated = [1, 0]
  bboxes = [
      np.array([[0., 0., 0., 0.],
                [0., 0., 1., 1.]], dtype=np.float32),
      np.zeros([0, 4], dtype=np.float32)
  ]
  label_strings = [
      np.array(labels),
      np.array([])
  ]
  context_features = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5]
  context_feature_length = [3]
  context_features_image_id_list = [b'im_1', b'im_2']
  seq_example = seq_example_util.make_sequence_example(
      dataset_name=dataset_name,
      video_id=video_id,
      encoded_images=encoded_images,
      image_height=image_height,
      image_width=image_width,
      timestamps=timestamps,
      is_annotated=is_annotated,
      bboxes=bboxes,
      label_strings=label_strings,
      context_features=context_features,
      context_feature_length=context_feature_length,
      context_features_image_id_list=context_features_image_id_list)

  context_feature_dict = seq_example.context.feature
  self.assertEqual(
      dataset_name,
      context_feature_dict['example/dataset_name'].bytes_list.value[0])
  self.assertEqual(
      timestamps[0],
      context_feature_dict['clip/start/timestamp'].int64_list.value[0])
  self.assertEqual(
      timestamps[-1],
      context_feature_dict['clip/end/timestamp'].int64_list.value[0])
  self.assertEqual(
      num_frames,
      context_feature_dict['clip/frames'].int64_list.value[0])
  self.assertAllClose(
      context_features,
      context_feature_dict['image/context_features'].float_list.value[:])
  self.assertEqual(
      context_feature_length[0],
      context_feature_dict[
          'image/context_feature_length'].int64_list.value[0])
  self.assertEqual(
      context_features_image_id_list,
      context_feature_dict[
          'image/context_features_image_id_list'].bytes_list.value[:])

  seq_feature_dict = seq_example.feature_lists.feature_list
  self.assertLen(
      seq_feature_dict['image/encoded'].feature[:], num_frames)
  actual_timestamps = [
      feature.int64_list.value[0]
      for feature in seq_feature_dict['image/timestamp'].feature]
  self.assertAllEqual(timestamps, actual_timestamps)

  # Frame 0.
  self.assertAllEqual(
      is_annotated[0],
      seq_feature_dict['region/is_annotated'].feature[0].int64_list.value[0])
  self.assertAllClose(
      [0., 0.],
      seq_feature_dict['region/bbox/ymin'].feature[0].float_list.value[:])
  self.assertAllClose(
      [0., 0.],
      seq_feature_dict['region/bbox/xmin'].feature[0].float_list.value[:])
  self.assertAllClose(
      [0., 1.],
      seq_feature_dict['region/bbox/ymax'].feature[0].float_list.value[:])
  self.assertAllClose(
      [0., 1.],
      seq_feature_dict['region/bbox/xmax'].feature[0].float_list.value[:])
  self.assertAllEqual(
      labels,
      seq_feature_dict['region/label/string'].feature[0].bytes_list.value[:])

  # Frame 1.
  self.assertAllEqual(
      is_annotated[1],
      seq_feature_dict['region/is_annotated'].feature[1].int64_list.value[0])
  self.assertAllClose(
      [],
      seq_feature_dict['region/bbox/ymin'].feature[1].float_list.value[:])
  self.assertAllClose(
      [],
      seq_feature_dict['region/bbox/xmin'].feature[1].float_list.value[:])
  self.assertAllClose(
      [],
      seq_feature_dict['region/bbox/ymax'].feature[1].float_list.value[:])
  self.assertAllClose(
      [],
      seq_feature_dict['region/bbox/xmax'].feature[1].float_list.value[:])
  self.assertAllEqual(
      [],
      seq_feature_dict['region/label/string'].feature[1].bytes_list.value[:])
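
# A minimal sketch (not part of the original tests): the flat
# 'image/context_features' float list stores all context vectors
# concatenated, and 'image/context_feature_length' gives the per-vector
# length, so the 6 floats with length 3 above decode to two 3-d vectors.
def _reshape_context_features(context_features, context_feature_length):
  """Reshapes a flat context feature list into [num_vectors, length]."""
  return np.reshape(
      np.asarray(context_features, dtype=np.float32),
      [-1, context_feature_length])

# e.g. _reshape_context_features([0.0, 0.1, 0.2, 0.3, 0.4, 0.5], 3)
# -> [[0.0, 0.1, 0.2], [0.3, 0.4, 0.5]]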
def _generate_sequence_examples(self, annotation_file, excluded_file,
                                label_map, seconds_per_sequence,
                                hop_between_sequences,
                                video_path_format_string):
  """For each row in the annotation CSV, generates corresponding examples.

  When iterating through frames for a single sequence example, skips over
  excluded frames. When moving to the next sequence example, also skips over
  excluded frames as if they don't exist. Generates equal-length sequence
  examples, each with length seconds_per_sequence (1 fps) and gaps of
  hop_between_sequences frames (and seconds) between them, possibly greater
  due to excluded frames.

  Args:
    annotation_file: path to the file of AVA CSV annotations.
    excluded_file: path to a CSV file of excluded timestamps for each video.
    label_map: an {int: string} label map.
    seconds_per_sequence: The number of seconds per sequence example.
    hop_between_sequences: The hop between sequences. If less than
      seconds_per_sequence, will overlap.
    video_path_format_string: File path format to glob video files.

  Yields:
    Each prepared tf.SequenceExample of metadata, also containing video
    frames.
  """
  fieldnames = ['id', 'timestamp_seconds', 'xmin', 'ymin', 'xmax', 'ymax',
                'action_label']
  frame_excluded = {}
  # create a sparse, nested map of videos and frame indices.
  with open(excluded_file, 'r') as excluded:
    reader = csv.reader(excluded)
    for row in reader:
      frame_excluded[(row[0], int(float(row[1])))] = True
  with open(annotation_file, 'r') as annotations:
    reader = csv.DictReader(annotations, fieldnames)
    frame_annotations = collections.defaultdict(list)
    ids = set()
    # aggregate by video and timestamp:
    for row in reader:
      ids.add(row['id'])
      key = (row['id'], int(float(row['timestamp_seconds'])))
      frame_annotations[key].append(row)
    # for each video, find aggregates near each sampled frame:
    logging.info('Generating metadata...')
    media_num = 1
    for media_id in ids:
      logging.info('%d/%d, ignore warnings.\n', media_num, len(ids))
      media_num += 1

      filepath = glob.glob(
          video_path_format_string.format(media_id) + '*')[0]
      cur_vid = cv2.VideoCapture(filepath)
      width = cur_vid.get(cv2.CAP_PROP_FRAME_WIDTH)
      height = cur_vid.get(cv2.CAP_PROP_FRAME_HEIGHT)
      middle_frame_time = POSSIBLE_TIMESTAMPS[0]
      while middle_frame_time < POSSIBLE_TIMESTAMPS[-1]:
        start_time = middle_frame_time - seconds_per_sequence // 2 - (
            0 if seconds_per_sequence % 2 == 0 else 1)
        end_time = middle_frame_time + (seconds_per_sequence // 2)

        total_boxes = []
        total_labels = []
        total_label_strings = []
        total_images = []
        total_source_ids = []
        total_confidences = []
        total_is_annotated = []
        windowed_timestamp = start_time

        while windowed_timestamp < end_time:
          if (media_id, windowed_timestamp) in frame_excluded:
            end_time += 1
            windowed_timestamp += 1
            logging.info('Ignoring and skipping excluded frame.')
            continue

          cur_vid.set(cv2.CAP_PROP_POS_MSEC,
                      windowed_timestamp * SECONDS_TO_MILLI)
          _, image = cur_vid.read()
          _, buffer = cv2.imencode('.jpg', image)

          bufstring = buffer.tostring()
          total_images.append(bufstring)
          source_id = str(windowed_timestamp) + '_' + media_id
          total_source_ids.append(source_id)
          total_is_annotated.append(1)

          boxes = []
          labels = []
          label_strings = []
          confidences = []
          for row in frame_annotations[(media_id, windowed_timestamp)]:
            if len(row) > 2 and int(row['action_label']) in label_map:
              boxes.append([float(row['ymin']), float(row['xmin']),
                            float(row['ymax']), float(row['xmax'])])
              labels.append(int(row['action_label']))
              label_strings.append(label_map[int(row['action_label'])])
              confidences.append(1)
            else:
              logging.warning('Unknown label: %s', row['action_label'])

          total_boxes.append(boxes)
          total_labels.append(labels)
          total_label_strings.append(label_strings)
          total_confidences.append(confidences)
          windowed_timestamp += 1

        if total_boxes:
          yield seq_example_util.make_sequence_example(
              'AVA', media_id, total_images, int(height), int(width), 'jpeg',
              total_source_ids, None, total_is_annotated, total_boxes,
              total_label_strings, use_strs_for_source_id=True)

        # Move middle_frame_time forward, skipping excluded frames.
        frames_mv = 0
        frames_excluded_count = 0
        while (frames_mv < hop_between_sequences + frames_excluded_count
               and middle_frame_time + frames_mv < POSSIBLE_TIMESTAMPS[-1]):
          frames_mv += 1
          if (media_id, windowed_timestamp + frames_mv) in frame_excluded:
            frames_excluded_count += 1
        middle_frame_time += frames_mv
      cur_vid.release()
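
# A minimal usage sketch (not part of the original script): iterating the
# generator above and writing each yielded SequenceExample to a TFRecord.
# The CSV file names, label map contents, and video path pattern here are
# hypothetical placeholders.
def _write_ava_sequence_examples(self, output_path):
  label_map = {1: 'bend/bow (at the waist)'}  # hypothetical label subset.
  with tf.io.TFRecordWriter(output_path) as writer:
    for seq_example in self._generate_sequence_examples(
        annotation_file='ava_train_v2.2.csv',
        excluded_file='ava_train_excluded_timestamps_v2.2.csv',
        label_map=label_map,
        seconds_per_sequence=10,
        hop_between_sequences=10,
        video_path_format_string='videos/{}'):
      writer.write(seq_example.SerializeToString())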