def transcribe_file(speech_file, num_speakers):
    """Synchronously transcribe the given audio file with speaker diarization."""

    # Import the beta Google Cloud client library, which exposes
    # RecognitionMetadata and the diarization fields.
    from google.cloud import speech_v1p1beta1 as speech

    # Instantiate a client
    client = speech.SpeechClient()

    # Load the audio into memory
    with open(speech_file, "rb") as audio_file:
        content = audio_file.read()
    
    # Construct a recognition metadata object
    metadata = speech.RecognitionMetadata()
    metadata.interaction_type = speech.RecognitionMetadata.InteractionType.DISCUSSION
    metadata.recording_device_type = (
        speech.RecognitionMetadata.RecordingDeviceType.OTHER_INDOOR_DEVICE
    )
    metadata.audio_topic = "court trial hearing"
    metadata.original_mime_type = "audio/mp3"

    audio = speech.RecognitionAudio(content=content)

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=16000,
        language_code="en-US",
        enable_automatic_punctuation=True,
        enable_speaker_diarization=True,
        diarization_speaker_count=num_speakers,
        # Enhanced models cost more than standard models.
        use_enhanced=True,
        model="video",
        enable_word_time_offsets=True,
        # Attach the metadata constructed above; without this the
        # metadata object is built but never sent with the request.
        metadata=metadata,
    )

    # Detect speech in the audio file. The synchronous recognize call
    # is suitable only for short audio (up to about one minute).
    print("Waiting for operation to complete...")
    response = client.recognize(config=config, audio=audio)
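    # With diarization enabled, the words list in the last result contains
    # all words from the entire audio, each annotated with a speaker_tag,
    # so reading only response.results[-1] is sufficient.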
    result = response.results[-1]

    words_info = result.alternatives[0].words

    # Print each word with its speaker tag and time offsets
    for word_info in words_info:
        print(
            "word: '{}', speaker_tag: {}, start_time: {}, end_time: {}".format(
                word_info.word,
                word_info.speaker_tag,
                word_info.start_time.total_seconds(),
                word_info.end_time.total_seconds(),
            )
        )
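
A minimal usage sketch; the file path and speaker count below are assumptions for illustration:

    # Hypothetical invocation; point this at a real FLAC file and set the
    # expected number of speakers for diarization.
    transcribe_file("resources/audio.flac", num_speakers=2)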
Example #2
def transcribe_file_with_metadata():
    """Send a request that includes recognition metadata."""
    # [START speech_transcribe_recognition_metadata_beta]
    from google.cloud import speech_v1p1beta1 as speech

    client = speech.SpeechClient()

    speech_file = "resources/commercial_mono.wav"

    with open(speech_file, "rb") as audio_file:
        content = audio_file.read()

    # Here we construct a recognition metadata object.
    # Most metadata fields are specified as enums, which are nested
    # on the speech.RecognitionMetadata class.
    metadata = speech.RecognitionMetadata()
    metadata.interaction_type = speech.RecognitionMetadata.InteractionType.DISCUSSION
    metadata.microphone_distance = (
        speech.RecognitionMetadata.MicrophoneDistance.NEARFIELD
    )
    metadata.recording_device_type = (
        speech.RecognitionMetadata.RecordingDeviceType.SMARTPHONE
    )

    # Some metadata fields are free-form strings
    metadata.recording_device_name = "Pixel 2 XL"
    # And some are integers, for instance the six-digit NAICS code
    # https://www.naics.com/search/
    metadata.industry_naics_code_of_audio = 519190

    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=8000,
        language_code="en-US",
        # Add this in the request to send metadata.
        metadata=metadata,
    )

    response = client.recognize(config=config, audio=audio)

    for i, result in enumerate(response.results):
        alternative = result.alternatives[0]
        print("-" * 20)
        print(u"First alternative of result {}".format(i))
        print(u"Transcript: {}".format(alternative.transcript))
Example #3
def initialize_metadata():
    """Initialize recognition metadata for the caller to attach to its config."""
    from google.cloud import speech_v1p1beta1 as speech

    metadata = speech.RecognitionMetadata()
    metadata.interaction_type = speech.RecognitionMetadata.InteractionType.DISCUSSION
    metadata.microphone_distance = (
        speech.RecognitionMetadata.MicrophoneDistance.NEARFIELD
    )
    metadata.recording_device_type = (
        speech.RecognitionMetadata.RecordingDeviceType.SMARTPHONE
    )

    # Some metadata fields are free-form strings
    metadata.recording_device_name = "Pixel 2 XL"
    # The NAICS code identifies the type of industrial place where the
    # recording is going to happen. In our case, the code 452311
    # represents warehouse clubs and supercenters dealing in general
    # merchandise and food. All the codes can be found at:
    # https://www.naics.com/search/
    metadata.industry_naics_code_of_audio = 452311

    return metadata
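
A brief sketch of how the returned object might be attached to a request config; the encoding and sample rate here are illustrative assumptions:

    from google.cloud import speech_v1p1beta1 as speech

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,  # assumed format
        sample_rate_hertz=16000,  # assumed rate
        language_code="en-US",
        metadata=initialize_metadata(),  # attach the metadata built above
    )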
Example #4
    def transcribe(
        self,
        file_uri: Union[str, Path],
        phrases: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> transcript_model.Transcript:
        """
        Transcribe audio from GCS file and return a Transcript model.

        Parameters
        ----------
        file_uri: Union[str, Path]
            The GCS file uri to the audio file or caption file to transcribe.
            It should be in format 'gs://...'.
        phrases: Optional[List[str]] = None
            A list of strings to feed as targets to the model.

        Returns
        -------
        outputs: transcript_model.Transcript
            The transcript model for the supplied media file.
        """
        # Create client
        client = speech.SpeechClient.from_service_account_file(
            filename=str(self.credentials_file))

        # Create basic metadata
        metadata = speech.RecognitionMetadata()
        metadata.interaction_type = (
            speech.RecognitionMetadata.InteractionType.PHONE_CALL)
        metadata.original_media_type = (
            speech.RecognitionMetadata.OriginalMediaType.VIDEO)

        # Add phrases
        event_metadata_speech_context = speech.SpeechContext(
            phrases=self._clean_phrases(phrases))

        # Prepare for transcription
        config = speech.RecognitionConfig(
            encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=16000,
            language_code="en-US",
            enable_automatic_punctuation=True,
            enable_word_time_offsets=True,
            enable_spoken_punctuation=True,
            speech_contexts=[
                GOOGLE_SPEECH_ADAPTION_CLASSES,
                event_metadata_speech_context,
            ],
            metadata=metadata,
            model="video",
            use_enhanced=True,
        )
        # file_uri may be a Path, so coerce it to str for the proto field
        audio = speech.RecognitionAudio(uri=str(file_uri))

        # Begin transcription
        log.debug(f"Beginning transcription for: {file_uri}")
        operation = client.long_running_recognize(request={
            "config": config,
            "audio": audio
        })

        # Wait for complete
        response = operation.result(timeout=10800)

        # Track confidence across result segments to compute a mean later
        confidence_sum = 0
        segments = 0

        # Create timestamped sentences
        timestamped_sentences: List[transcript_model.Sentence] = []
        transcript_sentence_index = 0

        # Create sentence boundary pipeline
        nlp = English()
        nlp.add_pipe("sentencizer")

        for result in response.results:
            # Some portions of audio may not have text
            if len(result.alternatives) > 0:
                # Split transcript into sentences
                doc = nlp(result.alternatives[0].transcript)

                # Convert generator to list
                sentences = [str(sent) for sent in doc.sents]

                # Index holder for word results of response
                w_marker = 0
                for s_ind, _ in enumerate(sentences):
                    # Sentence text
                    s_text = sentences[s_ind]

                    num_words = len(s_text.split())

                    # Initialize sentence model
                    timestamped_sentence = transcript_model.Sentence(
                        index=transcript_sentence_index,
                        confidence=result.alternatives[0].confidence,
                        # Start and end time are placeholder values
                        start_time=0.0,
                        end_time=0.0,
                        words=[],
                        text=s_text,
                    )

                    for w_ind in range(w_marker, w_marker + num_words):
                        # Extract word from response
                        word = result.alternatives[0].words[w_ind]

                        # Nanos no longer supported, use microseconds instead
                        # https://github.com/googleapis/python-speech/issues/71
                        start_time = (word.start_time.seconds +
                                      word.start_time.microseconds * 1e-6)

                        end_time = (word.end_time.seconds +
                                    word.end_time.microseconds * 1e-6)

                        # Add start_time to Sentence if first word
                        if w_ind - w_marker == 0:
                            timestamped_sentence.start_time = start_time

                        # Add end_time to Sentence if last word
                        if (w_ind - w_marker) == (num_words - 1):
                            timestamped_sentence.end_time = end_time

                        # Create Word model
                        timestamped_word = transcript_model.Word(
                            index=w_ind - w_marker,
                            start_time=start_time,
                            end_time=end_time,
                            text=self._clean_word(word.word),
                        )

                        timestamped_sentence.words.append(timestamped_word)

                    # Increment word marker
                    w_marker += num_words

                    # Add Sentence to sentence list
                    timestamped_sentences.append(timestamped_sentence)

                    # Increment transcript sentence index
                    transcript_sentence_index += 1

                # Update confidence stats
                confidence_sum += result.alternatives[0].confidence
                segments += 1

        # Compute mean confidence
        if segments > 0:
            confidence = confidence_sum / segments
        else:
            confidence = 0.0
        log.info(
            f"Completed transcription for: {file_uri}. Confidence: {confidence}"
        )

        # Create transcript model
        transcript = transcript_model.Transcript(
            generator=f"Google Speech-to-Text -- CDP v{__version__}",
            confidence=confidence,
            session_datetime=None,
            created_datetime=datetime.utcnow().isoformat(),
            sentences=timestamped_sentences,
        )

        return transcript
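
Only the method body appears here, so the enclosing class is not shown; a hypothetical call, assuming a wrapper class named GoogleCloudSRModel with a credentials_file attribute, might look like:

    # Both the class name and constructor signature are assumptions.
    model = GoogleCloudSRModel(credentials_file="service_account.json")
    transcript = model.transcribe(
        file_uri="gs://my-bucket/session.wav",  # assumed GCS path
        phrases=["city council", "ordinance"],  # optional adaptation targets
    )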
def transcribe_gcs(gcs_uri, num_speakers):
    """Asynchronously transcribe the audio file specified by the gcs_uri."""

    # Standard-library imports needed by this sample
    import json
    import os

    # Import the beta Google Cloud client library, which exposes
    # RecognitionMetadata and the diarization fields.
    from google.cloud import speech_v1p1beta1 as speech

    # Instantiate a client
    client = speech.SpeechClient()
    
    # Construct a recognition metadata object
    metadata = speech.RecognitionMetadata()
    metadata.interaction_type = speech.RecognitionMetadata.InteractionType.DISCUSSION
    metadata.recording_device_type = (
        speech.RecognitionMetadata.RecordingDeviceType.OTHER_INDOOR_DEVICE
    )
    metadata.audio_topic = "court trial hearing"
    metadata.original_mime_type = "audio/mp3"

    audio = speech.RecognitionAudio(uri=gcs_uri)

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=16000,
        language_code="en-US",
        enable_automatic_punctuation=True,
        enable_speaker_diarization=True,
        diarization_speaker_count=num_speakers,
        # Enhanced models cost more than standard models.
        use_enhanced=True,
        model="video",
        enable_word_time_offsets=True,
        # Attach the metadata constructed above; without this the
        # metadata object is built but never sent with the request.
        metadata=metadata,
    )

    # Detects speech in the audio file -- long audio file
    operation = client.long_running_recognize(config=config, audio=audio)

    print("Waiting for operation to complete...")
    response = operation.result(timeout=300)

    # Write results to JSON
    result_counter = 0
    word_counter = 0
    output_json = {}

    for result in response.results:
        alternative = result.alternatives[0]
        output_json[f"{result_counter}_Transcript"] = alternative.transcript
        output_json[f"{result_counter}_Confidence"] = alternative.confidence
        result_counter += 1

        for word_info in alternative.words:
            word = word_info.word
            start_time = word_info.start_time
            end_time = word_info.end_time
            speaker_tag = word_info.speaker_tag

            output_json[f"{word_counter}_Word"] = word
            output_json[f"{word_counter}_start_time"] = start_time.total_seconds()
            output_json[f"{word_counter}_end_time"] = end_time.total_seconds()
            output_json[f"{word_counter}_speaker_tag"] = speaker_tag

            word_counter += 1

    with open("{}.json".format(gcs_uri.split('/')[-1][:-5]) , "w+") as file:
        json.dump(output_json, file)
    

    print("Dirized and transcribed {}".format(gcs_uri.split('/')[-1]))