Esempio n. 1
0
    def _generate_metadata(self, annotations_file, video_dir):
        """Builds one metadata tf.SequenceExample per row of the annotation CSV.

        Args:
          annotations_file: path to the file of Charades CSV annotations.
          video_dir: path to the directory of video files referenced by the
            annotations.

        Yields:
          Each tf.SequenceExample of metadata, ready to pass to MediaPipe.
        """
        with open(annotations_file, "r") as csv_file:
            for record in csv.DictReader(csv_file):
                example = tf.train.SequenceExample()
                # The video file is named after the row's "id" column.
                video_path = os.path.join(video_dir, "%s.mp4" % record["id"])

                # Four parallel lists, one entry per non-empty action.
                label_indices, label_strings = [], []
                segment_starts, segment_ends = [], []
                # The "actions" column holds ';'-separated entries, each made
                # of three space-separated fields: label, start and end.
                for entry in record["actions"].split(";"):
                    if entry:
                        label, begin, finish = entry.split(" ")
                        # The integer index is the label with its first
                        # character dropped.
                        label_indices.append(int(label[1:]))
                        label_strings.append(bytes23(label))
                        # Times are scaled by SECONDS_TO_MICROSECONDS and
                        # truncated to integers.
                        segment_starts.append(
                            int(float(begin) * SECONDS_TO_MICROSECONDS))
                        segment_ends.append(
                            int(float(finish) * SECONDS_TO_MICROSECONDS))

                # Clip-level context: the clip starts at 0 and ends at the
                # scaled value of the row's "length" column.
                ms.set_example_id(bytes23(record["id"]), example)
                ms.set_clip_data_path(bytes23(video_path), example)
                ms.set_clip_start_timestamp(0, example)
                ms.set_clip_end_timestamp(
                    int(float(record["length"]) * SECONDS_TO_MICROSECONDS),
                    example)

                # Segment-level context, taken from the parsed actions.
                ms.set_segment_start_timestamp(segment_starts, example)
                ms.set_segment_end_timestamp(segment_ends, example)
                ms.set_segment_label_string(label_strings, example)
                ms.set_segment_label_index(label_indices, example)
                yield example
 def test_expected_functions_are_defined(self):
     # Smoke test: media_sequence_util has its own tests, so the goal here is
     # only to confirm that the expected accessors were actually generated on
     # `ms`. One accessor is called per feature and the test passes as long as
     # nothing raises (e.g. on an undefined attribute). A real value is passed
     # to each call so that a wrongly typed accessor also crashes in the
     # underlying code.
     sequence = tf.train.SequenceExample()

     # --- context features ---
     ms.set_example_id(b"string", sequence)
     ms.set_example_dataset_name(b"string", sequence)
     ms.set_clip_media_id(b"string", sequence)
     ms.set_clip_alternative_media_id(b"string", sequence)
     ms.set_clip_encoded_media_bytes(b"string", sequence)
     ms.set_clip_encoded_media_start_timestamp(47, sequence)
     ms.set_clip_data_path(b"string", sequence)
     ms.set_clip_start_timestamp(47, sequence)
     ms.set_clip_end_timestamp(47, sequence)
     ms.set_clip_label_string((b"string", b"test"), sequence)
     ms.set_clip_label_index((47, 49), sequence)
     ms.set_clip_label_confidence((0.47, 0.49), sequence)
     ms.set_segment_start_timestamp((47, 49), sequence)
     ms.set_segment_start_index((47, 49), sequence)
     ms.set_segment_end_timestamp((47, 49), sequence)
     ms.set_segment_end_index((47, 49), sequence)
     ms.set_segment_label_index((47, 49), sequence)
     ms.set_segment_label_string((b"test", b"strings"), sequence)
     ms.set_segment_label_confidence((0.47, 0.49), sequence)
     ms.set_image_format(b"test", sequence)
     ms.set_image_channels(47, sequence)
     ms.set_image_colorspace(b"test", sequence)
     ms.set_image_height(47, sequence)
     ms.set_image_width(47, sequence)
     ms.set_image_frame_rate(0.47, sequence)
     ms.set_image_data_path(b"test", sequence)
     ms.set_forward_flow_format(b"test", sequence)
     ms.set_forward_flow_channels(47, sequence)
     ms.set_forward_flow_colorspace(b"test", sequence)
     ms.set_forward_flow_height(47, sequence)
     ms.set_forward_flow_width(47, sequence)
     ms.set_forward_flow_frame_rate(0.47, sequence)
     ms.set_class_segmentation_format(b"test", sequence)
     ms.set_class_segmentation_height(47, sequence)
     ms.set_class_segmentation_width(47, sequence)
     ms.set_class_segmentation_class_label_string(
         (b"test", b"strings"), sequence)
     ms.set_class_segmentation_class_label_index((47, 49), sequence)
     ms.set_instance_segmentation_format(b"test", sequence)
     ms.set_instance_segmentation_height(47, sequence)
     ms.set_instance_segmentation_width(47, sequence)
     ms.set_instance_segmentation_object_class_index((47, 49), sequence)
     ms.set_bbox_parts((b"HEAD", b"TOE"), sequence)

     # --- feature lists ---
     ms.add_image_encoded(b"test", sequence)
     ms.add_image_multi_encoded([b"test", b"test"], sequence)
     ms.add_image_timestamp(47, sequence)
     ms.add_forward_flow_encoded(b"test", sequence)
     ms.add_forward_flow_multi_encoded([b"test", b"test"], sequence)
     ms.add_forward_flow_timestamp(47, sequence)
     ms.add_bbox_ymin((0.47, 0.49), sequence)
     ms.add_bbox_xmin((0.47, 0.49), sequence)
     ms.add_bbox_ymax((0.47, 0.49), sequence)
     ms.add_bbox_xmax((0.47, 0.49), sequence)
     ms.add_bbox_point_x((0.47, 0.49), sequence)
     ms.add_bbox_point_y((0.47, 0.49), sequence)
     ms.add_predicted_bbox_ymin((0.47, 0.49), sequence)
     ms.add_predicted_bbox_xmin((0.47, 0.49), sequence)
     ms.add_predicted_bbox_ymax((0.47, 0.49), sequence)
     ms.add_predicted_bbox_xmax((0.47, 0.49), sequence)
     ms.add_bbox_num_regions(47, sequence)
     ms.add_bbox_is_annotated(47, sequence)
     ms.add_bbox_is_generated((47, 49), sequence)
     ms.add_bbox_is_occluded((47, 49), sequence)
     ms.add_bbox_label_index((47, 49), sequence)
     ms.add_bbox_label_string((b"test", b"strings"), sequence)
     ms.add_bbox_label_confidence((0.47, 0.49), sequence)
     ms.add_bbox_class_index((47, 49), sequence)
     ms.add_bbox_class_string((b"test", b"strings"), sequence)
     ms.add_bbox_class_confidence((0.47, 0.49), sequence)
     ms.add_bbox_track_index((47, 49), sequence)
     ms.add_bbox_track_string((b"test", b"strings"), sequence)
     ms.add_bbox_track_confidence((0.47, 0.49), sequence)
     ms.add_bbox_timestamp(47, sequence)
     ms.add_predicted_bbox_class_index((47, 49), sequence)
     ms.add_predicted_bbox_class_string((b"test", b"strings"), sequence)
     ms.add_predicted_bbox_timestamp(47, sequence)
     ms.add_class_segmentation_encoded(b"test", sequence)
     ms.add_class_segmentation_multi_encoded([b"test", b"test"], sequence)
     ms.add_instance_segmentation_encoded(b"test", sequence)
     ms.add_instance_segmentation_multi_encoded([b"test", b"test"], sequence)
     ms.add_class_segmentation_timestamp(47, sequence)

     # --- bbox embedding accessors (a mix of set_* and add_* calls) ---
     ms.set_bbox_embedding_dimensions_per_region((47, 49), sequence)
     ms.set_bbox_embedding_format(b"test", sequence)
     ms.add_bbox_embedding_floats((0.47, 0.49), sequence)
     ms.add_bbox_embedding_encoded((b"text", b"stings"), sequence)
     ms.add_bbox_embedding_confidence((0.47, 0.49), sequence)