def create_video_speech_operation_from_model(self, storage_uri, *args, **kwargs):
    # i.e. copied and pasted from Google
    """Transcribe speech from a video stored on GCS."""
    from google.cloud import videointelligence

    video_client = videointelligence.VideoIntelligenceServiceClient()
    features = [videointelligence.Feature.SPEECH_TRANSCRIPTION]
    config = videointelligence.SpeechTranscriptionConfig(
        language_code="en-US",
        enable_automatic_punctuation=True,
        # enable_word_time_offsets=True,
        max_alternatives=2,
    )
    video_context = videointelligence.VideoContext(
        speech_transcription_config=config)

    # Ask for confirmation before the request is actually sent to the API.
    if self.require_api_confirmation:
        confirmation = input(
            f"Really recognize speech in {storage_uri}? (Y/n) ")
        if confirmation.lower() != "y":
            raise Exception("Did not agree to recognize speech")

    operation = video_client.annotate_video(
        request={
            "features": features,
            "input_uri": storage_uri,
            "video_context": video_context,
        })

    print("\nProcessing video for speech transcription.")
    return operation

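# Usage sketch (not from the original source): the method above only submits the
# request and returns the long-running operation, so the caller is expected to
# block on operation.result() itself. "recognizer" is a hypothetical instance of
# whatever class defines create_video_speech_operation_from_model.
def example_wait_for_transcript(recognizer, storage_uri):
    operation = recognizer.create_video_speech_operation_from_model(storage_uri)
    result = operation.result(timeout=600)
    # One annotation result per input video; each transcription carries up to
    # max_alternatives alternatives (2 in the config above).
    for transcription in result.annotation_results[0].speech_transcriptions:
        for alternative in transcription.alternatives:
            print(alternative.transcript, alternative.confidence)
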
def detect_labels(video_client, file_handle, input_uri, l, t):
    """Detect labels in 5-second segments of a GCS video and write them as CSV rows.

    `t` is the index of the calling thread (each thread covers a 50-second
    window of the video) and `l` is the video length in seconds.
    """
    # Assumed imports: videointelligence from google-cloud-videointelligence and
    # Duration from google.protobuf.duration_pb2.
    from google.cloud import videointelligence
    from google.protobuf.duration_pb2 import Duration

    EXCLUDE = ["nature", "aerial photography", "tree"]
    print("{} spawned".format(t))
    features = [videointelligence.Feature.LABEL_DETECTION]

    # Build up to ten 5-second segments covering this thread's 50-second window,
    # stopping early once the end of the video is reached.
    s = []
    for j in range(10):
        s.append(videointelligence.VideoSegment(
            start_time_offset=Duration(seconds=0 + j * 5 + 50 * t),
            end_time_offset=Duration(seconds=(j + 1) * 5 + 50 * t)))
        if (j + 1) * 5 + 50 * t >= l:
            break
    print("{}: {} segments".format(t, len(s)))

    operation = video_client.annotate_video(
        request={
            "features": features,
            "input_uri": input_uri,
            "video_context": videointelligence.VideoContext(segments=s),
        }
    )
    result = operation.result(timeout=120)
    print("\nFinished processing thread {}.".format(t))

    for annotation_result in result.annotation_results:
        for segment_label in annotation_result.segment_label_annotations:
            if segment_label.entity.description in EXCLUDE:
                continue
            print("Video label description: {}".format(
                segment_label.entity.description))
            for category_entity in segment_label.category_entities:
                print("\tLabel category description: {}".format(
                    category_entity.description))
            for segment in segment_label.segments:
                start_time = (
                    segment.segment.start_time_offset.seconds
                    + segment.segment.start_time_offset.microseconds / 1e6
                )
                end_time = (
                    segment.segment.end_time_offset.seconds
                    + segment.segment.end_time_offset.microseconds / 1e6
                )
                # One CSV row per labelled segment:
                # description,start,end,confidence
                file_handle.write("{},{},{},{}\n".format(
                    segment_label.entity.description,
                    start_time, end_time, segment.confidence))
    return None

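# Usage sketch (an assumption, not part of the original source): detect_labels
# appears to be written for worker threads, one per 50-second window of the
# video. The GCS URI, video length and output path below are placeholders; in
# practice the shared file handle may also need a lock around writes.
import threading

from google.cloud import videointelligence


def example_run_label_threads(input_uri="gs://my-bucket/movie.mp4",
                              video_length_seconds=150,
                              out_path="labels.csv"):
    video_client = videointelligence.VideoIntelligenceServiceClient()
    with open(out_path, "w") as file_handle:
        threads = [
            threading.Thread(
                target=detect_labels,
                args=(video_client, file_handle, input_uri,
                      video_length_seconds, t))
            for t in range((video_length_seconds + 49) // 50)
        ]
        for thread in threads:
            thread.start()
        for thread in threads:
            thread.join()
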
def analyze_video_with_intelligence_detailed(self, infile: str) -> dict:
    """Send infile to the Video Intelligence API, return dict with the detected texts."""
    logging.debug("Enter video intelligence API")
    _tags = {}

    # Detect text in a local video.
    video_client = videointelligence.VideoIntelligenceServiceClient()
    features = [videointelligence.Feature.TEXT_DETECTION]
    video_context = videointelligence.VideoContext()

    with io.open(infile, "rb") as file:
        input_content = file.read()

    operation = video_client.annotate_video(
        request={
            "features": features,
            "input_content": input_content,
            "video_context": video_context,
        })
    logging.debug("\nProcessing video for text detection.")
    result = operation.result(timeout=300)

    # The first result is retrieved because a single video was processed.
    annotation_result = result.annotation_results[0]
    _texts = ""
    for text_annotation in annotation_result.text_annotations:
        logging.debug("\nText: {}".format(text_annotation.text))

        # Get the first text segment.
        text_segment = text_annotation.segments[0]
        start_time = text_segment.segment.start_time_offset
        end_time = text_segment.segment.end_time_offset
        logging.debug("start_time: {}, end_time: {}".format(
            start_time.seconds + start_time.microseconds * 1e-6,
            end_time.seconds + end_time.microseconds * 1e-6,
        ))
        logging.debug("Confidence: {}".format(text_segment.confidence))

        # Show the result for the first frame in this segment.
        frame = text_segment.frames[0]
        time_offset = frame.time_offset
        logging.debug("Time offset for the first frame: {}".format(
            time_offset.seconds + time_offset.microseconds * 1e-6))

        _texts = _texts + text_annotation.text + ";"

    _tags["Texts"] = _texts
    return _tags

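# Usage sketch (an assumption, not from the original source): "analyzer" stands
# for a hypothetical instance of the class that defines
# analyze_video_with_intelligence_detailed, and the file name is a placeholder.
# The returned dict currently only carries the "Texts" key, a ";"-separated
# string of detected texts.
def example_print_detected_texts(analyzer, infile="clip.mp4"):
    tags = analyzer.analyze_video_with_intelligence_detailed(infile)
    for text in tags["Texts"].split(";"):
        if text:
            print(text)
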
def speech_transcription(path):
    # [START video_speech_transcription_gcs]
    """Transcribe speech from a video stored on GCS."""
    from google.cloud import videointelligence

    video_client = videointelligence.VideoIntelligenceServiceClient()
    features = [videointelligence.Feature.SPEECH_TRANSCRIPTION]

    config = videointelligence.SpeechTranscriptionConfig(
        language_code="en-US", enable_automatic_punctuation=True)
    video_context = videointelligence.VideoContext(
        speech_transcription_config=config)

    operation = video_client.annotate_video(request={
        "features": features,
        "input_uri": path,
        "video_context": video_context,
    })

    print("\nProcessing video for speech transcription.")
    result = operation.result(timeout=600)

    # There is only one annotation_result since only
    # one video is processed.
    annotation_results = result.annotation_results[0]
    for speech_transcription in annotation_results.speech_transcriptions:
        # The number of alternatives for each transcription is limited by
        # SpeechTranscriptionConfig.max_alternatives.
        # Each alternative is a different possible transcription
        # and has its own confidence score.
        for alternative in speech_transcription.alternatives:
            print("Alternative level information:")
            print("Transcript: {}".format(alternative.transcript))
            print("Confidence: {}\n".format(alternative.confidence))

            print("Word level information:")
            for word_info in alternative.words:
                word = word_info.word
                start_time = word_info.start_time
                end_time = word_info.end_time
                print("\t{}s - {}s: {}".format(
                    start_time.seconds + start_time.microseconds * 1e-6,
                    end_time.seconds + end_time.microseconds * 1e-6,
                    word,
                ))

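# Variation sketch (an assumption, not part of the original source): the same
# response fields used above can be collected into (start_seconds, end_seconds,
# transcript) tuples instead of printed, which is handier for e.g. building
# subtitles. The gs:// path is supplied by the caller.
from google.cloud import videointelligence


def collect_transcript_segments(path, timeout=600):
    video_client = videointelligence.VideoIntelligenceServiceClient()
    config = videointelligence.SpeechTranscriptionConfig(
        language_code="en-US", enable_automatic_punctuation=True)
    operation = video_client.annotate_video(request={
        "features": [videointelligence.Feature.SPEECH_TRANSCRIPTION],
        "input_uri": path,
        "video_context": videointelligence.VideoContext(
            speech_transcription_config=config),
    })
    segments = []
    result = operation.result(timeout=timeout)
    for transcription in result.annotation_results[0].speech_transcriptions:
        if not transcription.alternatives:
            continue
        best = transcription.alternatives[0]  # take the first alternative
        if not best.words:
            continue
        start = best.words[0].start_time
        end = best.words[-1].end_time
        segments.append((start.seconds + start.microseconds * 1e-6,
                         end.seconds + end.microseconds * 1e-6,
                         best.transcript))
    return segments
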
def video_detect_text(path):
    # [START video_detect_text]
    """Detect text in a local video."""
    import io
    from google.cloud import videointelligence

    video_client = videointelligence.VideoIntelligenceServiceClient()
    features = [videointelligence.Feature.TEXT_DETECTION]
    video_context = videointelligence.VideoContext()

    with io.open(path, "rb") as file:
        input_content = file.read()

    operation = video_client.annotate_video(
        request={
            "features": features,
            "input_content": input_content,
            "video_context": video_context,
        })

    print("\nProcessing video for text detection.")
    result = operation.result(timeout=300)

    # The first result is retrieved because a single video was processed.
    annotation_result = result.annotation_results[0]

    for text_annotation in annotation_result.text_annotations:
        print("\nText: {}".format(text_annotation.text))

        # Get the first text segment.
        text_segment = text_annotation.segments[0]
        start_time = text_segment.segment.start_time_offset
        end_time = text_segment.segment.end_time_offset
        print("start_time: {}, end_time: {}".format(
            start_time.seconds + start_time.microseconds * 1e-6,
            end_time.seconds + end_time.microseconds * 1e-6,
        ))
        print("Confidence: {}".format(text_segment.confidence))

        # Show the result for the first frame in this segment.
        frame = text_segment.frames[0]
        time_offset = frame.time_offset
        print("Time offset for the first frame: {}".format(
            time_offset.seconds + time_offset.microseconds * 1e-6))

        print("Rotated Bounding Box Vertices:")
        for vertex in frame.rotated_bounding_box.vertices:
            print("\tVertex.x: {}, Vertex.y: {}".format(vertex.x, vertex.y))

"""Detect text in a local video.""" import io from google.cloud import videointelligence video_client = videointelligence.VideoIntelligenceServiceClient() features = [videointelligence.Feature.TEXT_DETECTION] video_context = videointelligence.VideoContext() path = '../TK/grzyby.avi' with io.open(path, "rb") as file: input_content = file.read() operation = video_client.annotate_video( request={ "features": features, "input_content": input_content, "video_context": video_context, }) print("\nProcessing video for text detection.") result = operation.result(timeout=300) # The first result is retrieved because a single video was processed. annotation_result = result.annotation_results[0] for text_annotation in annotation_result.text_annotations: print("\nText: {}".format(text_annotation.text)) # Get the first text segment text_segment = text_annotation.segments[0]
def annotate_video(
    self,
    video_file: str = None,
    video_uri: str = None,
    features: str = None,
    output_uri: str = None,
    json_file: str = None,
    timeout: int = 300,
) -> Dict:
    """Annotate video

    Possible values for features:

    - FEATURE_UNSPECIFIED, Unspecified.
    - LABEL_DETECTION, Label detection. Detect objects, such as dog or flower.
    - SHOT_CHANGE_DETECTION, Shot change detection.
    - EXPLICIT_CONTENT_DETECTION, Explicit content detection.
    - SPEECH_TRANSCRIPTION, Speech transcription.
    - TEXT_DETECTION, OCR text detection and tracking.
    - OBJECT_TRACKING, Object detection and tracking.
    - LOGO_RECOGNITION, Logo detection, tracking, and recognition.

    If `video_uri` is given, it is used even if `video_file` is also given.

    :param video_file: local file path to input video
    :param video_uri: Google Cloud Storage URI to input video
    :param features: comma-separated string of annotation features to detect,
        defaults to LABEL_DETECTION,SHOT_CHANGE_DETECTION
    :param output_uri: Google Cloud Storage URI to store response json
    :param json_file: json target to save result
    :param timeout: timeout for operation in seconds
    :return: annotate result

    **Examples**

    **Robot Framework**

    .. code-block:: robotframework

        ${result}=  Annotate Video  video_uri=gs://videointelligence/movie.mp4
        ...  features=TEXT_DETECTION,LABEL_DETECTION
        ...  output_uri=gs://videointelligence/movie_annotations.json
        ...  json_file=${CURDIR}${/}videoannotations.json
    """
    if features is None:
        features_in = [
            videointelligence.Feature.LABEL_DETECTION,
            videointelligence.Feature.SHOT_CHANGE_DETECTION,
        ]
    else:
        features_in = [to_feature(feature) for feature in features.split(",")]
    parameters = {"features": features_in}

    if video_uri:
        parameters["input_uri"] = video_uri
    elif video_file:
        video_context = videointelligence.VideoContext()
        with open(video_file, "rb") as file:
            input_content = file.read()
        parameters["input_content"] = input_content
        parameters["video_context"] = video_context

    if output_uri:
        parameters["output_uri"] = output_uri

    operation = self.service.annotate_video(request=parameters)
    result = operation.result(timeout=timeout)
    self.write_json(json_file, result)
    return result

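# The method above relies on a to_feature() helper that is not shown here. A
# minimal sketch, assuming the helper simply maps a feature name string such as
# "TEXT_DETECTION" to the corresponding videointelligence.Feature enum member:
from google.cloud import videointelligence


def to_feature(feature_name: str) -> videointelligence.Feature:
    # Feature names listed in the docstring match the enum member names, so a
    # case-insensitive name lookup is enough.
    return videointelligence.Feature[feature_name.strip().upper()]
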
def analyze_labels(path):
    # [START video_analyze_labels_gcs]
    """Detects labels given a GCS path."""
    video_client = videointelligence.VideoIntelligenceServiceClient()
    features = [videointelligence.Feature.LABEL_DETECTION]

    mode = videointelligence.LabelDetectionMode.SHOT_AND_FRAME_MODE
    config = videointelligence.LabelDetectionConfig(label_detection_mode=mode)
    context = videointelligence.VideoContext(label_detection_config=config)

    operation = video_client.annotate_video(request={
        "features": features,
        "input_uri": path,
        "video_context": context,
    })
    print("\nProcessing video for label annotations:")

    result = operation.result(timeout=180)
    print("\nFinished processing.")

    # Process video/segment level label annotations
    segment_labels = result.annotation_results[0].segment_label_annotations
    for i, segment_label in enumerate(segment_labels):
        print("Video label description: {}".format(
            segment_label.entity.description))
        for category_entity in segment_label.category_entities:
            print("\tLabel category description: {}".format(
                category_entity.description))

        for i, segment in enumerate(segment_label.segments):
            start_time = (segment.segment.start_time_offset.seconds +
                          segment.segment.start_time_offset.microseconds / 1e6)
            end_time = (segment.segment.end_time_offset.seconds +
                        segment.segment.end_time_offset.microseconds / 1e6)
            positions = "{}s to {}s".format(start_time, end_time)
            confidence = segment.confidence
            print("\tSegment {}: {}".format(i, positions))
            print("\tConfidence: {}".format(confidence))
        print("\n")

    # Process shot level label annotations
    shot_labels = result.annotation_results[0].shot_label_annotations
    for i, shot_label in enumerate(shot_labels):
        print("Shot label description: {}".format(
            shot_label.entity.description))
        for category_entity in shot_label.category_entities:
            print("\tLabel category description: {}".format(
                category_entity.description))

        for i, shot in enumerate(shot_label.segments):
            start_time = (shot.segment.start_time_offset.seconds +
                          shot.segment.start_time_offset.microseconds / 1e6)
            end_time = (shot.segment.end_time_offset.seconds +
                        shot.segment.end_time_offset.microseconds / 1e6)
            positions = "{}s to {}s".format(start_time, end_time)
            confidence = shot.confidence
            print("\tSegment {}: {}".format(i, positions))
            print("\tConfidence: {}".format(confidence))
        print("\n")

    # Process frame level label annotations
    frame_labels = result.annotation_results[0].frame_label_annotations
    for i, frame_label in enumerate(frame_labels):
        print("Frame label description: {}".format(
            frame_label.entity.description))
        for category_entity in frame_label.category_entities:
            print("\tLabel category description: {}".format(
                category_entity.description))

        # Each frame_label_annotation has many frames,
        # here we print information only about the first frame.
        frame = frame_label.frames[0]
        time_offset = frame.time_offset.seconds + frame.time_offset.microseconds / 1e6
        print("\tFirst frame time offset: {}s".format(time_offset))
        print("\tFirst frame confidence: {}".format(frame.confidence))
        print("\n")

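# Usage sketch (an assumption, not part of the original source): the same
# segment-level annotations walked through above can also be reduced to a single
# summary value, e.g. the most confident video-level label. The gs:// URI is a
# placeholder.
from google.cloud import videointelligence


def example_top_segment_label(path="gs://my-bucket/movie.mp4"):
    video_client = videointelligence.VideoIntelligenceServiceClient()
    operation = video_client.annotate_video(request={
        "features": [videointelligence.Feature.LABEL_DETECTION],
        "input_uri": path,
    })
    result = operation.result(timeout=180)
    labels = result.annotation_results[0].segment_label_annotations
    best = max(
        (label for label in labels if label.segments),
        key=lambda label: max(s.confidence for s in label.segments),
        default=None,
    )
    return best.entity.description if best else None
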