Example No. 1
    def create_video_speech_operation_from_model(
            self, storage_uri, *args,
            **kwargs):  # i.e. copied and pasted from Google
        """Transcribe speech from a video stored on GCS."""
        from google.cloud import videointelligence
        video_client = videointelligence.VideoIntelligenceServiceClient()
        features = [videointelligence.Feature.SPEECH_TRANSCRIPTION]

        config = videointelligence.SpeechTranscriptionConfig(
            language_code="en-US",
            enable_automatic_punctuation=True,
            # enable_word_time_offsets=True,
            max_alternatives=2)
        video_context = videointelligence.VideoContext(
            speech_transcription_config=config)
        # Ask for confirmation before the API call is actually made.
        if self.require_api_confirmation:
            confirmation = input(
                f"Really recognize speech in {storage_uri}? (Y/n) ")
            if confirmation.lower() != "y":
                raise Exception("Did not agree to recognize speech")

        operation = video_client.annotate_video(
            request={
                "features": features,
                "input_uri": storage_uri,
                "video_context": video_context,
            })

        print("\nProcessing video for speech transcription.")
        return operation
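The method above only starts the long-running operation and returns it; the caller still has to wait on it and read the transcript. A minimal sketch of how the returned operation could be consumed, mirroring Example No. 4 below; `helper`, the bucket URI and the 600-second timeout are placeholders, not part of the original snippet:

# Hypothetical caller for the method above.
operation = helper.create_video_speech_operation_from_model(
    "gs://my-bucket/my-video.mp4")      # placeholder GCS URI
result = operation.result(timeout=600)  # blocks until transcription finishes

# Only one video was submitted, so there is a single annotation result.
for transcription in result.annotation_results[0].speech_transcriptions:
    for alternative in transcription.alternatives:  # at most max_alternatives=2
        print(alternative.transcript, alternative.confidence)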
Example No. 2
def detect_labels(video_client, file_handle, input_uri, l, t):
    """Detect labels for one 50-second slice of the video.

    `l` is the total video length in seconds and `t` is the index of the
    worker thread; each thread covers up to ten 5-second segments starting
    at offset 50 * t, stopping early once the end of the video is reached.
    """
    from google.cloud import videointelligence
    from google.protobuf.duration_pb2 import Duration  # assumed source of Duration

    EXCLUDE = ["nature", "aerial photography", "tree"]
    print("{} spawned".format(t))
    features = [videointelligence.Feature.LABEL_DETECTION]
    s = []
    for j in range(10):
        s.append(videointelligence.VideoSegment(
            start_time_offset=Duration(seconds=j * 5 + 50 * t),
            end_time_offset=Duration(seconds=(j + 1) * 5 + 50 * t)))
        if (j + 1) * 5 + 50 * t >= l:
            break

    print("{} {} segments: ".format(t, len(s)))

    operation = video_client.annotate_video(
        request={
            "features": features,
            "input_uri": input_uri,
            "video_context": videointelligence.VideoContext(segments=s)
        }
    )
    result = operation.result(timeout=120)

    print("\nFinished processing thread {}.".format(t))

    # segment_labels = result.annotation_results[0].segment_label_annotations

    for x in result.annotation_results:
        segment_labels = x.segment_label_annotations
        for i, segment_label in enumerate(segment_labels):
            if segment_label.entity.description in EXCLUDE:
                continue

            print("Video label description: {}".format(segment_label.entity.description))
            category_desc = ""
            for category_entity in segment_label.category_entities:
                print(
                    "\tLabel category description: {}".format(category_entity.description)
                )

            for i, segment in enumerate(segment_label.segments):
                start_time = (
                    segment.segment.start_time_offset.seconds
                    + segment.segment.start_time_offset.microseconds / 1e6
                )
                end_time = (
                    segment.segment.end_time_offset.seconds
                    + segment.segment.end_time_offset.microseconds / 1e6
                )
                # positions = "{}s to {}s".format(start_time, end_time)
                # confidence = segment.confidence
                # print("\tSegment {}: {}".format(i, positions))
                # print("\tConfidence: {}".format(confidence))

                file_handle.write("{},{},{},{}\n".format(
                    segment_label.entity.description,
                    start_time, end_time, segment.confidence))

    return None
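The function is clearly meant to be fanned out over worker threads, one per 50-second slice of the video, but the snippet does not show the driver. A minimal sketch of how it might be launched with the standard threading module; the output file name, bucket URI and 300-second video length are assumptions, not values from the original code:

import threading

from google.cloud import videointelligence

video_client = videointelligence.VideoIntelligenceServiceClient()
video_length = 300                       # assumed total length in seconds
input_uri = "gs://my-bucket/video.mp4"   # assumed GCS URI

with open("labels.csv", "w") as file_handle:
    threads = []
    for t in range((video_length + 49) // 50):  # one thread per 50-second slice
        thread = threading.Thread(
            target=detect_labels,
            args=(video_client, file_handle, input_uri, video_length, t))
        thread.start()
        threads.append(thread)
    for thread in threads:
        thread.join()  # writes to file_handle interleave; no ordering is guaranteed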
Example No. 3
    def analyze_video_with_intelligence_detailed(self, infile: str) -> dict:
        """Send infile to the Video Intelligence API, return dict with detected texts."""
        logging.debug("Enter Video Intelligence API")
        _tags = {}
        # Detect text in a local video.
        video_client = videointelligence.VideoIntelligenceServiceClient()
        features = [videointelligence.Feature.TEXT_DETECTION]
        video_context = videointelligence.VideoContext()

        with io.open(infile, "rb") as file:
            input_content = file.read()

        operation = video_client.annotate_video(
            request={
                "features": features,
                "input_content": input_content,
                "video_context": video_context,
            })

        logging.debug("\nProcessing video for text detection.")
        result = operation.result(timeout=300)

        # The first result is retrieved because a single video was processed.
        annotation_result = result.annotation_results[0]
        _texts = ""

        for text_annotation in annotation_result.text_annotations:
            logging.debug("\nText: {}".format(text_annotation.text))

            # Get the first text segment
            text_segment = text_annotation.segments[0]

            start_time = text_segment.segment.start_time_offset
            end_time = text_segment.segment.end_time_offset
            logging.debug("start_time: {}, end_time: {}".format(
                start_time.seconds + start_time.microseconds * 1e-6,
                end_time.seconds + end_time.microseconds * 1e-6,
            ))

            logging.debug("Confidence: {}".format(text_segment.confidence))

            # Show the result for the first frame in this segment.
            frame = text_segment.frames[0]
            time_offset = frame.time_offset
            logging.debug("Time offset for the first frame: {}".format(
                time_offset.seconds + time_offset.microseconds * 1e-6))
            _texts = _texts + text_annotation.text + ";"

        _tags["Texts"] = _texts
        return _tags
Example No. 4
def speech_transcription(path):
    # [START video_speech_transcription_gcs]
    """Transcribe speech from a video stored on GCS."""
    from google.cloud import videointelligence

    video_client = videointelligence.VideoIntelligenceServiceClient()
    features = [videointelligence.Feature.SPEECH_TRANSCRIPTION]

    config = videointelligence.SpeechTranscriptionConfig(
        language_code="en-US", enable_automatic_punctuation=True)
    video_context = videointelligence.VideoContext(
        speech_transcription_config=config)

    operation = video_client.annotate_video(request={
        "features": features,
        "input_uri": path,
        "video_context": video_context,
    })

    print("\nProcessing video for speech transcription.")

    result = operation.result(timeout=600)

    # There is only one annotation_result since only
    # one video is processed.
    annotation_results = result.annotation_results[0]
    for speech_transcription in annotation_results.speech_transcriptions:

        # The number of alternatives for each transcription is limited by
        # SpeechTranscriptionConfig.max_alternatives.
        # Each alternative is a different possible transcription
        # and has its own confidence score.
        for alternative in speech_transcription.alternatives:
            print("Alternative level information:")

            print("Transcript: {}".format(alternative.transcript))
            print("Confidence: {}\n".format(alternative.confidence))

            print("Word level information:")
            for word_info in alternative.words:
                word = word_info.word
                start_time = word_info.start_time
                end_time = word_info.end_time
                print("\t{}s - {}s: {}".format(
                    start_time.seconds + start_time.microseconds * 1e-6,
                    end_time.seconds + end_time.microseconds * 1e-6,
                    word,
                ))
Example No. 5
def video_detect_text(path):
    # [START video_detect_text]
    """Detect text in a local video."""
    import io
    from google.cloud import videointelligence

    video_client = videointelligence.VideoIntelligenceServiceClient()
    features = [videointelligence.Feature.TEXT_DETECTION]
    video_context = videointelligence.VideoContext()

    with io.open(path, "rb") as file:
        input_content = file.read()

    operation = video_client.annotate_video(
        request={
            "features": features,
            "input_content": input_content,
            "video_context": video_context,
        })

    print("\nProcessing video for text detection.")
    result = operation.result(timeout=300)

    # The first result is retrieved because a single video was processed.
    annotation_result = result.annotation_results[0]

    for text_annotation in annotation_result.text_annotations:
        print("\nText: {}".format(text_annotation.text))

        # Get the first text segment
        text_segment = text_annotation.segments[0]
        start_time = text_segment.segment.start_time_offset
        end_time = text_segment.segment.end_time_offset
        print("start_time: {}, end_time: {}".format(
            start_time.seconds + start_time.microseconds * 1e-6,
            end_time.seconds + end_time.microseconds * 1e-6,
        ))

        print("Confidence: {}".format(text_segment.confidence))

        # Show the result for the first frame in this segment.
        frame = text_segment.frames[0]
        time_offset = frame.time_offset
        print("Time offset for the first frame: {}".format(
            time_offset.seconds + time_offset.microseconds * 1e-6))
        print("Rotated Bounding Box Vertices:")
        for vertex in frame.rotated_bounding_box.vertices:
            print("\tVertex.x: {}, Vertex.y: {}".format(vertex.x, vertex.y))
Example No. 6
"""Detect text in a local video."""
import io

from google.cloud import videointelligence

video_client = videointelligence.VideoIntelligenceServiceClient()
features = [videointelligence.Feature.TEXT_DETECTION]
video_context = videointelligence.VideoContext()
path = '../TK/grzyby.avi'

with io.open(path, "rb") as file:
    input_content = file.read()

operation = video_client.annotate_video(
    request={
        "features": features,
        "input_content": input_content,
        "video_context": video_context,
    })

print("\nProcessing video for text detection.")
result = operation.result(timeout=300)

# The first result is retrieved because a single video was processed.
annotation_result = result.annotation_results[0]

for text_annotation in annotation_result.text_annotations:
    print("\nText: {}".format(text_annotation.text))

    # Get the first text segment
    text_segment = text_annotation.segments[0]
Example No. 7
    def annotate_video(
        self,
        video_file: str = None,
        video_uri: str = None,
        features: str = None,
        output_uri: str = None,
        json_file: str = None,
        timeout: int = 300,
    ) -> Dict:
        """Annotate video

        Possible values for features:

        - FEATURE_UNSPECIFIED, Unspecified.
        - LABEL_DETECTION, Label detection. Detect objects, such as dog or flower.
        - SHOT_CHANGE_DETECTION, Shot change detection.
        - EXPLICIT_CONTENT_DETECTION, Explicit content detection.
        - SPEECH_TRANSCRIPTION, Speech transcription.
        - TEXT_DETECTION, OCR text detection and tracking.
        - OBJECT_TRACKING, Object detection and tracking.
        - LOGO_RECOGNITION, Logo detection, tracking, and recognition.

        If `video_uri` is given then that is used even if `video_file` is given.

        :param video_file: local file path to input video
        :param video_uri: Google Cloud Storage URI to input video
        :param features: list of annotation features to detect,
            defaults to LABEL_DETECTION,SHOT_CHANGE_DETECTION
        :param output_uri: Google Cloud Storage URI to store response json
        :param json_file: json target to save result
        :param timeout: timeout for operation in seconds
        :return: annotate result

        **Examples**

        **Robot Framework**

        .. code-block:: robotframework

            ${result}=   Annotate Video   video_uri=gs://videointelligence/movie.mp4
            ...  features=TEXT_DETECTION,LABEL_DETECTION
            ...  output_uri=gs://videointelligence/movie_annotations.json
            ...  json_file=${CURDIR}${/}videoannotations.json
        """
        if features is None:
            features_in = [
                videointelligence.Feature.LABEL_DETECTION,
                videointelligence.Feature.SHOT_CHANGE_DETECTION,
            ]
        else:
            features_in = [to_feature(feature) for feature in features.split(",")]
        parameters = {"features": features_in}
        if video_uri:
            parameters["input_uri"] = video_uri
        elif video_file:
            video_context = videointelligence.VideoContext()
            with open(video_file, "rb") as file:
                input_content = file.read()
                parameters["input_content"] = input_content
                parameters["video_context"] = video_context
        if output_uri:
            parameters["output_uri"] = output_uri

        operation = self.service.annotate_video(request=parameters)
        result = operation.result(timeout=timeout)
        self.write_json(json_file, result)
        return result
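The snippet relies on a to_feature helper and a write_json method that are not shown. to_feature presumably maps a feature name such as "TEXT_DETECTION" to the corresponding videointelligence.Feature enum member; a minimal sketch of what it could look like (the name comes from the snippet, the implementation is an assumption):

from google.cloud import videointelligence

def to_feature(value: str) -> videointelligence.Feature:
    """Map a feature name like 'TEXT_DETECTION' to the Feature enum (sketch)."""
    try:
        return videointelligence.Feature[value.strip().upper()]
    except KeyError:
        raise ValueError("Unknown Video Intelligence feature: {}".format(value))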
Example No. 8
def analyze_labels(path):
    # [START video_analyze_labels_gcs]
    """ Detects labels given a GCS path. """
    video_client = videointelligence.VideoIntelligenceServiceClient()
    features = [videointelligence.Feature.LABEL_DETECTION]

    mode = videointelligence.LabelDetectionMode.SHOT_AND_FRAME_MODE
    config = videointelligence.LabelDetectionConfig(label_detection_mode=mode)
    context = videointelligence.VideoContext(label_detection_config=config)

    operation = video_client.annotate_video(request={
        "features": features,
        "input_uri": path,
        "video_context": context
    })
    print("\nProcessing video for label annotations:")

    result = operation.result(timeout=180)
    print("\nFinished processing.")

    # Process video/segment level label annotations
    segment_labels = result.annotation_results[0].segment_label_annotations
    for i, segment_label in enumerate(segment_labels):
        print("Video label description: {}".format(
            segment_label.entity.description))
        for category_entity in segment_label.category_entities:
            print("\tLabel category description: {}".format(
                category_entity.description))

        for i, segment in enumerate(segment_label.segments):
            start_time = (segment.segment.start_time_offset.seconds +
                          segment.segment.start_time_offset.microseconds / 1e6)
            end_time = (segment.segment.end_time_offset.seconds +
                        segment.segment.end_time_offset.microseconds / 1e6)
            positions = "{}s to {}s".format(start_time, end_time)
            confidence = segment.confidence
            print("\tSegment {}: {}".format(i, positions))
            print("\tConfidence: {}".format(confidence))
        print("\n")

    # Process shot level label annotations
    shot_labels = result.annotation_results[0].shot_label_annotations
    for i, shot_label in enumerate(shot_labels):
        print("Shot label description: {}".format(
            shot_label.entity.description))
        for category_entity in shot_label.category_entities:
            print("\tLabel category description: {}".format(
                category_entity.description))

        for i, shot in enumerate(shot_label.segments):
            start_time = (shot.segment.start_time_offset.seconds +
                          shot.segment.start_time_offset.microseconds / 1e6)
            end_time = (shot.segment.end_time_offset.seconds +
                        shot.segment.end_time_offset.microseconds / 1e6)
            positions = "{}s to {}s".format(start_time, end_time)
            confidence = shot.confidence
            print("\tSegment {}: {}".format(i, positions))
            print("\tConfidence: {}".format(confidence))
        print("\n")

    # Process frame level label annotations
    frame_labels = result.annotation_results[0].frame_label_annotations
    for i, frame_label in enumerate(frame_labels):
        print("Frame label description: {}".format(
            frame_label.entity.description))
        for category_entity in frame_label.category_entities:
            print("\tLabel category description: {}".format(
                category_entity.description))

        # Each frame_label_annotation has many frames,
        # here we print information only about the first frame.
        frame = frame_label.frames[0]
        time_offset = frame.time_offset.seconds + frame.time_offset.microseconds / 1e6
        print("\tFirst frame time offset: {}s".format(time_offset))
        print("\tFirst frame confidence: {}".format(frame.confidence))
        print("\n")