Example #1
import logging

from google.cloud import speech_v1


def async_dictate(storage_uri, encoding, sample_rate_hertz, language_code):
    """
    Transcribe a long audio file from Cloud Storage using asynchronous speech
    recognition.

    Args:
      storage_uri URI for audio file in Cloud Storage, e.g. gs://[BUCKET]/[FILE]
      encoding Encoding of the audio, e.g. enums.RecognitionConfig.AudioEncoding.LINEAR16
      sample_rate_hertz Sample rate of the audio in Hertz, e.g. 16000
      language_code BCP-47 language tag of the audio, e.g. 'en-US'
    """

    client = speech_v1.SpeechClient()

    # storage_uri = 'gs://cloud-samples-data/speech/brooklyn_bridge.raw'

    config = {
        "sample_rate_hertz": sample_rate_hertz,
        "language_code": language_code,
        "encoding": encoding,
    }
    audio = {"uri": storage_uri}

    operation = client.long_running_recognize(config, audio)

    logging.info("Waiting for dictation results...")
    response = operation.result()

    text = []
    for result in response.results:
        # First alternative is the most probable result
        alternative = result.alternatives[0]
        text.append(alternative.transcript)

    logging.info("Dictation successful.")

    return text
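
A minimal usage sketch for the helper above; the bucket, encoding, rate, and language values are illustrative assumptions, not part of the original example:

from google.cloud.speech_v1 import enums

# Hypothetical invocation of async_dictate (all argument values are assumptions)
transcripts = async_dictate(
    storage_uri='gs://my-bucket/my-audio.raw',
    encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=16000,
    language_code='en-US',
)
print(' '.join(transcripts))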
Example #2
def main():
    # See http://g.co/cloud/speech/docs/languages
    # for a list of supported languages.
    # The BCP-47 language tag of the spoken audio.
    language_code = 'th-TH'

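    # RATE, CHUNK, MicrophoneStream, and listen_print_loop are assumed to be
    # defined elsewhere in this file, as in Google's streaming microphone
    # sample (transcribe_streaming_mic.py).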
    client = speech.SpeechClient()
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code=language_code)
    streaming_config = types.StreamingRecognitionConfig(
        config=config,
        interim_results=True)

    # Restart the stream each time it ends or errors (for example when the
    # streaming time limit is exceeded), counting the rounds.
    rounds = 1
    while True:
        try:
            print('streaming loop: ' + str(rounds))
            with MicrophoneStream(RATE, CHUNK) as stream:
                audio_generator = stream.generator()
                # Create request data
                requests = (types.StreamingRecognizeRequest(audio_content=content)
                            for content in audio_generator)
                # POST data to google cloud speech
                responses = client.streaming_recognize(streaming_config, requests)
                # Now, put the transcription responses to use.
                listen_print_loop(responses)
        except Exception as err:
            print(err)
            rounds += 1
Example #3
def transcribe_file(speech_file):
    """Transcribe the given audio file."""
    from google.cloud import speech_v1
    from google.cloud.speech_v1 import enums
    from google.cloud.speech_v1 import types
    import io
    client = speech_v1.SpeechClient()

    # [START speech_python_migration_sync_request]
    # [START speech_python_migration_config]
    with io.open(speech_file, 'rb') as audio_file:
        content = audio_file.read()

    audio = types.RecognitionAudio(content=content)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code='ja-JP')
    # [END speech_python_migration_config]

    # [START speech_python_migration_sync_response]
    response = client.recognize(config, audio)
    # [END speech_python_migration_sync_request]
    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
    for result in response.results:
        # The first alternative is the most likely one for this portion.
        print(u'Transcript: {}'.format(result.alternatives[0].transcript))
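
Note: these listings target the pre-2.0 google-cloud-speech surface, where the enums/types modules exist and recognize(config, audio) accepts positional arguments. Under the 2.x library, a sketch of the equivalent synchronous request would look roughly like this:

from google.cloud import speech

client = speech.SpeechClient()
config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=16000,
    language_code='ja-JP',
)
audio = speech.RecognitionAudio(content=content)
response = client.recognize(config=config, audio=audio)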
Example #4
def local_short_recognition(local_file_path):
    """Transcribe a local short audio file using synchronous speech
    recognition.

    Parameters
    ----------
    local_file_path : str
        Path to a local audio file, e.g. /path/audio.wav.

    Returns
    -------
    None
        The transcript of each result is printed to stdout.
    """
    set_path()
    client = speech_v1.SpeechClient()

    with io.open(local_file_path, "rb") as f:
        content = f.read()

    audio = {"content": content}

    response = client.recognize(
        {
            "language_code": "pt-BR",
            "sample_rate_hertz": 16000,
            "encoding": enums.RecognitionConfig.AudioEncoding.ENCODING_UNSPECIFIED,
        }, audio)

    for result in response.results:
        alternative = result.alternatives[0]
        print(u"{}".format(alternative.transcript))
Example #5
def transcribe_audio_to_text(local_file_path: str):
    """
    Transcribe a short audio file using Google synchronous speech recognition

    """
    client = speech_v1.SpeechClient()

    # The language of the supplied audio
    language_code = "en-US"

    encoding = enums.RecognitionConfig.AudioEncoding.FLAC
    config = {
        "language_code": language_code,
        "encoding": encoding,
    }

    with io.open(local_file_path, "rb") as f:
        content = f.read()
    audio = {"content": content}

    response = client.recognize(config, audio)

    # Append every transcription alternative to a text file whose name is
    # derived from the input path (assumes a six-character prefix like
    # 'flacs/' on local_file_path).
    output_path = 'flacs-transcribed/' + local_file_path[6:-4] + 'txt'
    with open(output_path, 'a') as out:
        for result in response.results:
            for alternative in result.alternatives:
                out.write("Transcript : " + alternative.transcript + " ")
Example #6
def sample_recognize(language_code, gcs_uri):
    """Transcribe audio file from Google Cloud Storage with word time offsets"""

    # [START speech_transcribe_async_word_time_offsets_gcs_core]

    client = speech_v1.SpeechClient()

    # language_code = 'en-US'
    # gcs_uri = 'gs://cloud-samples-data/speech/brooklyn_bridge.raw'

    if isinstance(language_code, six.binary_type):
        language_code = language_code.decode('utf-8')
    if isinstance(gcs_uri, six.binary_type):
        gcs_uri = gcs_uri.decode('utf-8')
    sample_rate_hertz = 16000
    encoding = enums.RecognitionConfig.AudioEncoding.LINEAR16
    enable_word_time_offsets = True
    config = {
        'sample_rate_hertz': sample_rate_hertz,
        'language_code': language_code,
        'encoding': encoding,
        'enable_word_time_offsets': enable_word_time_offsets
    }
    audio = {'uri': gcs_uri}

    response = client.recognize(config, audio)
    for result in response.results:
        alternative = result.alternatives[0]
        print('Transcript: {}'.format(alternative.transcript))
        for word_info in alternative.words:
            print('Word: {}'.format(word_info.word))
            print('Word start time: {} seconds, {} nanos'.format(
                word_info.start_time.seconds, word_info.start_time.nanos))
            print('Word end time: {} seconds, {} nanos'.format(
                word_info.end_time.seconds, word_info.end_time.nanos))
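
The start_time and end_time fields above are protobuf Durations split into whole seconds and nanos; when a single floating-point offset is handier they can be combined as in this small sketch (duration_to_seconds is a hypothetical helper, not part of the sample):

def duration_to_seconds(duration):
    # Combine a Duration's seconds and nanos into fractional seconds.
    return duration.seconds + duration.nanos / 1e9

# e.g. duration_to_seconds(word_info.start_time) -> 1.25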
Example #7
def long_running_recognize(args):
    """
    Transcribe long audio file from Cloud Storage using asynchronous speech
    recognition

    Args:
      args Parsed arguments providing storage_uri (gs://[BUCKET]/[FILE]),
        sample_rate_hertz, and language_code
    """

    print("Transcribing {} ...".format(args.storage_uri))
    client = speech_v1.SpeechClient()

    # Encoding of audio data sent.
    encoding = enums.RecognitionConfig.AudioEncoding.LINEAR16
    config = {
        "enable_word_time_offsets": True,
        "enable_automatic_punctuation": True,
        "sample_rate_hertz": args.sample_rate_hertz,
        "language_code": args.language_code,
        "encoding": encoding,
    }
    audio = {"uri": args.storage_uri}

    operation = client.long_running_recognize(config, audio)
    response = operation.result()

    subs = []

    for result in response.results:
        # First alternative is the most probable result
        subs = break_sentences(args, subs, result.alternatives[0])

    print("Transcribing finished")
    return subs
Example #8
def sample_long_running_recognize(language_code, local_file_path):
    """Transcribe local audio file asynchronously"""

    # [START speech_transcribe_async_core]

    client = speech_v1.SpeechClient()

    # language_code = 'en-US'
    # local_file_path = 'Path to local audio file, e.g. /path/audio.wav'

    if isinstance(language_code, six.binary_type):
        language_code = language_code.decode('utf-8')
    if isinstance(local_file_path, six.binary_type):
        local_file_path = local_file_path.decode('utf-8')
    sample_rate_hertz = 16000
    encoding = enums.RecognitionConfig.AudioEncoding.LINEAR16
    config = {
        'sample_rate_hertz': sample_rate_hertz,
        'language_code': language_code,
        'encoding': encoding
    }
    with io.open(local_file_path, 'rb') as f:
        content = f.read()
    audio = {'content': content}

    operation = client.long_running_recognize(config, audio)

    print('Waiting for operation to complete...')
    response = operation.result()

    for result in response.results:
        print('Transcript: {}'.format(result.alternatives[0].transcript))
Example #9
def recognize(local_file_path):

    client = speech_v1.SpeechClient()

    # The language of the supplied audio
    language_code = "ko-KR"

    # Sample rate in Hertz of the audio data sent
    sample_rate_hertz = 8000

    # Number of channels in the audio data sent
    audio_channel_count = 1

    # Encoding of audio data sent. This sample sets this explicitly.
    # This field is optional for FLAC and WAV audio formats.
    encoding = enums.RecognitionConfig.AudioEncoding.LINEAR16
    config = {
        "language_code": language_code,
        "sample_rate_hertz": sample_rate_hertz,
        "encoding": encoding,
        "audio_channel_count": audio_channel_count,
    }
    with io.open(local_file_path, "rb") as f:
        content = f.read()
    audio = {"content": content}

    operation = client.long_running_recognize(config, audio)

    print(u"Waiting for operation to complete...")
    response = operation.result()
    for result in response.results:
        alternatives = result.alternatives
        for alternative in alternatives:
            print(u"Transcript: {}".format(alternative.transcript))
            print(u"Confidence: {}".format(alternative.confidence))
Example #10
def sample_recognize(storage_uri, model):
    """
    Transcribe a short audio file from Cloud Storage using a specified
    transcription model

    Args:
      storage_uri URI for audio file in Cloud Storage, e.g. gs://[BUCKET]/[FILE]
      model The transcription model to use, e.g. video, phone_call, default
      For a list of available transcription models, see:
      https://cloud.google.com/speech-to-text/docs/transcription-model#transcription_models
    """

    client = speech_v1.SpeechClient()

    # storage_uri = 'gs://cloud-samples-data/speech/hello.wav'
    # model = 'phone_call'

    # The language of the supplied audio
    language_code = "en-US"
    config = {"model": model, "language_code": language_code}
    audio = {"uri": storage_uri}

    response = client.recognize(config, audio)
    for result in response.results:
        # First alternative is the most probable result
        alternative = result.alternatives[0]
        print(u"Transcript: {}".format(alternative.transcript))
Example #11
    def __init__(self, language, supportedLanguages):
        # self.language = best_match(language, supportedLanguages)[0]
        self.language = "de"
        logging.info("created speech input for language: " + self.language)
        if self.language is None or self.language == "und":
            raise ValueError("Language is not supported")
        self.client = speech_v1.SpeechClient()
Example #12
    def transcriber(self) -> str:
        """
        Transcribe the instance's audio file using asynchronous speech
        recognition and return the concatenated transcript.
        """
        client = speech_v1.SpeechClient()
        language_code = self.language_code

        # Sample rate in Hertz of the audio data sent
        sample_rate_hertz = self.sample_rate_herts

        # Encoding of audio data sent. This sample sets this explicitly.
        # This field is optional for FLAC and WAV audio formats.
        encoding = enums.RecognitionConfig.AudioEncoding.AMR
        config = {
            "language_code": language_code,
            "sample_rate_hertz": sample_rate_hertz,
            "encoding": encoding,
        }
        with io.open(self.audio_file.file_location.path, "rb") as f:
            content = f.read()
        audio = {"content": content}
        # client.recognize(config, audio) is the synchronous variant for
        # short files; long-running recognition handles longer audio.
        operation = client.long_running_recognize(config, audio)
        response = operation.result()
        content = [
            result.alternatives[0].transcript for result in response.results
        ]
        return "".join(content)
Example #13
def sample_recognize(storage_uri):
    """
    Transcribe short audio file from Cloud Storage using synchronous speech
    recognition
    Args:
      storage_uri URI for audio file in Cloud Storage, e.g. gs://[BUCKET]/[FILE]
    """

    client = speech_v1.SpeechClient()

    # storage_uri = 'gs://cloud-samples-data/speech/brooklyn_bridge.raw'

    # Sample rate in Hertz of the audio data sent
    sample_rate_hertz = 16000

    # The language of the supplied audio
    language_code = "en-US"

    # Encoding of audio data sent. This sample sets this explicitly.
    # This field is optional for FLAC and WAV audio formats.
    encoding = enums.RecognitionConfig.AudioEncoding.LINEAR16
    config = {
        "sample_rate_hertz": sample_rate_hertz,
        "language_code": language_code,
        "encoding": encoding,
    }
    audio = {"uri": storage_uri}

    response = client.recognize(config, audio)
    for result in response.results:
        # First alternative is the most probable result
        alternative = result.alternatives[0]
        print(u"Transcript: {}".format(alternative.transcript))
Example #14
def speech_to_text(local_file_path):
    client = speech_v1.SpeechClient()
    language_code = "en-US"
    sample_rate_hertz = 48000

    config = {
        "language_code": language_code,
        "sample_rate_hertz": sample_rate_hertz,
        # "encoding": encoding,
    }
    with io.open(local_file_path, "rb") as f:
        content = f.read()
    audio = {"content": content}
    sound = sf.SoundFile(local_file_path)
    aud_len = len(sound) / sound.samplerate
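    # The synchronous recognize API accepts only about one minute of audio,
    # so longer clips are uploaded to Cloud Storage and transcribed with the
    # asynchronous long-running call instead.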
    if aud_len < 60:
        response = client.recognize(config, audio)
    else:
        dest_name = str(uuid.uuid4()) + '.wav'
        upload_blob(bucket_name="patched_video_output",
                    source_file_name=local_file_path,
                    destination_blob_name=dest_name)
        cloud_uri = 'gs://' + 'patched_video_output/' + dest_name
        print(cloud_uri)
        audio = {"uri": cloud_uri}
        operation = client.long_running_recognize(config, audio)
        response = operation.result()
    transcribed_text = []
    for result in response.results:
        alternative = result.alternatives[0]
        transcribed_text.append(alternative.transcript)
    return transcribed_text
Example #15
def sample_long_running_recognize(storage_uri):
    """
    Transcribe long audio file from Cloud Storage using asynchronous speech
    recognition

    Args:
      storage_uri URI for audio file in Cloud Storage, e.g. gs://[BUCKET]/[FILE]
    """

    client = speech_v1.SpeechClient()
    enable_word_time_offsets = True

    # storage_uri = 'gs://cloud-samples-data/speech/brooklyn_bridge.raw'

    # Sample rate in Hertz of the audio data sent
    # sample_rate_hertz = 16000
    sample_rate_hertz = 48000
    # The language of the supplied audio
    language_code = "en-US"

    # Encoding of audio data sent. This sample sets this explicitly.
    # This field is optional for FLAC and WAV audio formats.
    # encoding = enums.RecognitionConfig.AudioEncoding.LINEAR16
    config = {
        "sample_rate_hertz": sample_rate_hertz,
        "language_code": language_code,
        "enable_word_time_offsets": enable_word_time_offsets
        # "encoding": encoding,
    }
    audio = {"uri": storage_uri}

    operation = client.long_running_recognize(config, audio)

    print(u"Waiting for operation to complete...")
    response = operation.result()

    response_dict = {
        "transcript": "",
        "word_timestamps": defaultdict(list),
        "video_url": "https://www.youtube.com/watch?v=PqMGmRhKsnM"
    }
    print("response", response.results)

    for result in response.results:
        alternative = result.alternatives[0]
        response_dict["transcript"] += alternative.transcript

        print(u"Transcript: {}".format(alternative.transcript))
        # Print the start and end time of each word
        for word in alternative.words:
            print(u"Word: {}".format(word.word))
            response_dict["word_timestamps"][word.word.lower()].append(
                word.start_time.seconds)

            print(u"Start time: {} seconds {} nanos".format(
                word.start_time.seconds, word.start_time.nanos))
            print(u"End time: {} seconds {} nanos".format(
                word.end_time.seconds, word.end_time.nanos))

    print(json.dumps(response_dict))
Example #16
def stt(file, channel, hertz, languageCode):
    print('exec stt: ', file)
    client = speech_v1.SpeechClient()
    config = {
        "language_code": languageCode,
        "sample_rate_hertz": hertz,
        "audio_channel_count": channel
    }
    targetDir, targetFile = os.path.split(file)
    gcsURL = f'gs://{bucketName}/{targetFile}'
    audio = {"uri": gcsURL}
    try:
        # Set up the output transcript file
        outputFile = os.path.join(targetDir,
                                  'STT_' + targetFile.replace('.wav', '.txt'))
        with open(outputFile, mode='w') as f:
            # Run speech-to-text
            operation = client.long_running_recognize(config, audio)
            response = operation.result()
            for result in response.results:
                # First alternative is the most probable result
                alternative = result.alternatives[0]
                print(u'Transcript: {}'.format(alternative.transcript))
                f.write('{}\n'.format(alternative.transcript))
            print('done stt: ', file)
            print('#######')
    except Exception as e:
        print('stt() is ERROR:')
        print(e)
        print('#######')
Example #17
def transcribe(local_file_path):
    """
    Transcribe a short audio file using synchronous speech recognition

    Args:
      local_file_path Path to local audio file, e.g. /path/audio.wav
    """

    client = speech_v1.SpeechClient()

    # local_file_path = 'resources/brooklyn_bridge.raw'

    # The language of the supplied audio
    language_code = "te-IN"

    # Sample rate in Hertz of the audio data sent
    sample_rate_hertz = 16000

    # Encoding of audio data sent. This sample sets this explicitly.
    # This field is optional for FLAC and WAV audio formats.
    encoding = enums.RecognitionConfig.AudioEncoding.LINEAR16
    config = {
        "language_code": language_code,
        "sample_rate_hertz": sample_rate_hertz,
        "encoding": encoding,
    }
    with io.open(local_file_path, "rb") as f:
        content = f.read()
    audio = {"content": content}

    response = client.recognize(config, audio)
    # Each result covers a consecutive portion of the audio; join the most
    # probable alternative of each portion into a single transcript.
    return u" ".join(
        result.alternatives[0].transcript for result in response.results)
Example #18
def sample_long_running_recognize(storage_uri):
    """Transcribe a long audio file from Cloud Storage using asynchronous
    speech recognition.

    Parameters
    ----------
    storage_uri : str
        URI for the audio file in Cloud Storage, e.g. gs://[BUCKET]/[FILE].

    Returns
    -------
    None
        The transcript of each result is printed to stdout.
    """
    set_path()
    client = speech_v1.SpeechClient()
    audio = {"uri": storage_uri}
    operation = client.long_running_recognize(
        {
            "sample_rate_hertz": 16000,
            "language_code": "pt-BR",
            "encoding": enums.RecognitionConfig.AudioEncoding.ENCODING_UNSPECIFIED,
        }, audio)

    for result in operation.result().results:
        alternative = result.alternatives[0]
        print(u"{}".format(alternative.transcript))
Example #19
def sample_long_running_recognize(storage_uri):
    """
    Transcribe long audio file from Cloud Storage using asynchronous speech
    recognition

    Args:
      storage_uri URI for audio file in Cloud Storage, e.g. gs://[BUCKET]/[FILE]
    """

    client = speech_v1.SpeechClient()

    # storage_uri = 'gs://cloud-samples-data/speech/brooklyn_bridge.raw'

    # Sample rate in Hertz of the audio data sent
    sample_rate_hertz = 44100

    # The language of the supplied audio
    language_code = "ko-KR"

    # Encoding of audio data sent. This sample sets this explicitly.
    # This field is optional for FLAC and WAV audio formats.
    # encoding = enums.RecognitionConfig.AudioEncoding.LINEAR16
    config = {
        "sample_rate_hertz": sample_rate_hertz,
        "language_code": language_code,
        "encoding": 'FLAC',
        "audio_channel_count": 2,
        "enable_word_time_offsets": True,
    }
    audio = {"uri": storage_uri}

    operation = client.long_running_recognize(config, audio)

    print(u"Waiting for operation to complete...")
    response = operation.result()

    f = open("./data_output.txt", mode='wt')
    cf = open("./data_output.csv", 'w')
    wr = csv.writer(cf)
    for result in response.results:
        # First alternative is the most probable result
        alternative = result.alternatives[0]
        print(u"Transcript: {}".format(alternative.transcript))
        for word in alternative.words:
            lst = []
            print(u"Word: {}".format(word.word))
            f.write(word.word)
            f.write(" ")
            lst.append(word.word)
            print(u"Start time: {} seconds {} nanos".format(
                word.start_time.seconds, word.start_time.nanos))
            f.write(str(word.start_time.seconds))
            lst.append(word.start_time.seconds)
            f.write('\n')
            print(u"End time: {} seconds {} nanos".format(
                word.end_time.seconds, word.end_time.nanos))
            wr.writerow(lst)
    f.close()
    cf.close()
Example #20
def speech2text(audio):
    # `config` and get_sentences() are assumed to be defined at module level.
    client = speech.SpeechClient()
    try:
        response = client.recognize(config=config, audio=audio)
        return get_sentences(response)
    except Exception:
        return 'Not recognized'
Example #21
def sample_recognize(file, config=None):

    if file is None:
        logger.error("please input target audio file on first argument.")
        return None
    elif not isinstance(file, (str, FileStorage)):
        logger.error(
            "please input target filename or FileStorage on first argument.")
        return None

    if config is None:
        logger.error("please set config argument")
        return None
    elif not isinstance(config, dict):
        logger.error("config argument needs to be a dict.")
        return None
    else:
        if config.get("language_code") is None:
            logger.error("please set config.language_code")
            return None
        if config.get("sample_rate_hertz") is None:
            logger.error("please set config.sample_rate_hertz")
            return None
        if config.get("max_alternatives") is None:
            logger.error("please set config.max_alternatives")
            return None

    client = speech_v1.SpeechClient()

    # Encoding of audio data sent. This sample sets this explicitly.
    # This field is optional for FLAC and WAV audio formats.
    encoding = enums.RecognitionConfig.AudioEncoding.LINEAR16
    cfg = {
        "language_code": config.get("language_code", None),
        "sample_rate_hertz": config.get("sample_rate_hertz", None),
        "encoding": encoding,
        "max_alternatives": config.get("max_alternatives", None),
    }

    audio = None
    if isinstance(file, str):
        with io.open(file, "rb") as f:
            content = f.read()
        audio = {"content": content}
    elif isinstance(file, FileStorage):
        content = file.stream.read()
        audio = {"content": content}
        file.stream.close()

    strs = []
    response = client.recognize(cfg, audio)
    for result in response.results:
        # First alternative is the most probable result
        alternative = result.alternatives[0]
        logger.info(u"Transcript: {}".format(alternative.transcript))
        strs.append(alternative.transcript)

    return strs
Example #22
def sample_recognize(filename):
    """
    Transcribe a short audio file using synchronous speech recognition.

    Args:
      filename Path to a local file; a .MOV video is first converted to a
        mono FLAC via ffmpeg before recognition.
    """

    if (filename.endswith(".MOV")):  #or .avi, .mpeg, whatever.
        mp3_filename = filename[:-4] + ".mp3"
        flac_filename = filename[:-4] + ".flac"
        monoFlac_filename = "mono" + flac_filename

        #subprocess.call(['ffmpeg', '-i', filename, mp3_filename])
        #subprocess.call(['ffmpeg', '-i', mp3_filename, '-f', 'flac', flac_filename])
        #subprocess.call(['ffmpeg', '-i', flac_filename, '-ac', '1', monoFlac_filename])

        os.system("ffmpeg -i " + filename + " " + mp3_filename +
                  " &> /dev/null")
        os.system("ffmpeg -i " + mp3_filename + " -f flac " + flac_filename +
                  " &> /dev/null")
        os.system("ffmpeg -i " + flac_filename + " -ac 1 " +
                  monoFlac_filename + " &> /dev/null")
    # Note: monoFlac_filename is only defined for .MOV inputs above, so this
    # sample effectively assumes a .MOV file.

    if filename.endswith(".flac") and filename.startswith("mono"):
        metadata = audio_metadata.load(filename)
        sample_frequency = metadata['streaminfo']['sample_rate']
    else:
        sample_frequency = 44100

    client = speech_v1.SpeechClient()

    # The language of the supplied audio
    language_code = "en-US"

    # Sample rate in Hertz of the audio data sent
    sample_rate_hertz = sample_frequency

    # Encoding of audio data sent. This sample sets this explicitly.
    # This field is optional for FLAC and WAV audio formats.
    #encoding = enums.RecognitionConfig.AudioEncoding.MP3
    config = {
        "language_code": language_code,
        "sample_rate_hertz": sample_rate_hertz,
        "encoding": "FLAC",
    }
    with io.open(monoFlac_filename, "rb") as f:
        content = f.read()
    audio = {"content": content}

    response = client.recognize(config, audio)
    for result in response.results:
        # First alternative is the most probable result
        alternative = result.alternatives[0]

        return "{}".format(alternative.transcript)
Example #23
def sample_recognize(local_file_path, output_dir, language):
    """
    Transcribe a short audio file using synchronous speech recognition and
    write the transcript to a .txt file in output_dir.

    Args:
      local_file_path Path to local audio file, e.g. /path/audio.wav
      output_dir Directory that receives the .txt transcript
      language BCP-47 language tag of the audio, e.g. 'en-US'
    """

    filename = local_file_path.split('/')[-1][:-3] + 'txt'
    filepath_text = os.path.join(output_dir, filename)

    # if continue_skip and os.path.exists(filepath_text):
    #     return

    client = speech_v1.SpeechClient()

    # local_file_path = 'resources/brooklyn_bridge.raw'

    # The language of the supplied audio
    language_code = language

    # Sample rate in Hertz of the audio data sent
    sample_rate_hertz = 16000

    # Encoding of audio data sent. This sample sets this explicitly.
    # This field is optional for FLAC and WAV audio formats.
    encoding = enums.RecognitionConfig.AudioEncoding.LINEAR16
    config = {
        "language_code": language_code,
        "sample_rate_hertz": sample_rate_hertz,
        "encoding": encoding,
        "audio_channel_count": 1,
        "enable_word_time_offsets": True,
        "enable_automatic_punctuation": False,
    }

    with io.open(local_file_path, "rb") as f:
        content = f.read()
    audio = {"content": content}

    #print("Starting Call")
    response = client.recognize(config, audio)


    #print("Call ended")
    transcript = ""
    if response:
        if response.results and response.results[0]:
            if response.results[0].alternatives and response.results[0].alternatives[0]:
                transcript = response.results[0].alternatives[0].transcript
   

    
    with open(filepath_text, 'w+', encoding='utf8') as file:
        file.write(transcript)
Example #24
def convert_mp3_to_speech():
    latest_audio_file = _get_latest_audio_file()
    print("latest_audio_file: ", latest_audio_file)
    storage_uri = "gs://us-west3-video-enhancer-bb0ff304-bucket/audio_files/" + latest_audio_file

    print("storage_uri: ", storage_uri)

    client = speech_v1.SpeechClient()
    enable_word_time_offsets = True

    sample_rate_hertz = 48000
    language_code = "en-US"

    # Encoding of audio data sent. This sample sets this explicitly.
    # This field is optional for FLAC and WAV audio formats.
    # encoding = enums.RecognitionConfig.AudioEncoding.LINEAR16
    config = {
        "sample_rate_hertz": sample_rate_hertz,
        "language_code": language_code,
        "enable_word_time_offsets": enable_word_time_offsets
    }
    audio = {"uri": storage_uri}

    operation = client.long_running_recognize(config, audio)

    print("Waiting for operation to complete...")
    response = operation.result()

    response_dict = {
        "transcript": "",
        "word_timestamps": defaultdict(list),
        "video_url": YOUTUBE_URL
    }
    print("response from SpeechToText", response.results)

    verbose = False

    for result in response.results:
        alternative = result.alternatives[0]
        response_dict["transcript"] += alternative.transcript

        for word in alternative.words:
            response_dict["word_timestamps"][word.word.lower()].append(
                word.start_time.seconds)

            if verbose:
                print(u"Word: {}".format(word.word))

                print(u"Start time: {} seconds {} nanos".format(
                    word.start_time.seconds, word.start_time.nanos))
                print(u"End time: {} seconds {} nanos".format(
                    word.end_time.seconds, word.end_time.nanos))

    json_data_for_search_indexing = json.dumps(response_dict)
    print("json_data_for_search_indexing: ", json_data_for_search_indexing)

    return json_data_for_search_indexing
Example #25
def sample_long_running_recognize(request=''):
    bucket_name = 'visumm-store'

    # S2T
    client = speech_v1.SpeechClient()

    # read input arguments
    if request.args and 'input_filename' in request.args:
        input_filename = request.args.get('input_filename')
        print('got input_filename: ', input_filename)
    else:
        print('ERROR: no input_filename was provided. exiting')
        return

    # input filepath as GCS uri
    storage_uri = 'gs://' + bucket_name + '/' + input_filename

    # S2T config
    sample_rate_hertz = 44100
    language_code = "en-US"

    # Encoding of audio data sent. This sample sets this explicitly.
    # This field is optional for FLAC and WAV audio formats.
    encoding = enums.RecognitionConfig.AudioEncoding.FLAC
    config = {
        "sample_rate_hertz": sample_rate_hertz,
        "language_code": language_code,
        "audio_channel_count": 2,
        "encoding": encoding,
    }
    audio = {"uri": storage_uri}

    operation = client.long_running_recognize(config, audio)

    print(u"Waiting for operation to complete...")
    response = operation.result()

    full_transcript = ''
    for result in response.results:
        # First alternative is the most probable result
        alternative = result.alternatives[0]
        full_transcript = full_transcript + alternative.transcript + '\n'
    print('full_transcript: \n', full_transcript)

    # write to file
    local_fpath = '/tmp/full_transcript.txt'
    with open(local_fpath, 'w') as f:
        f.write(full_transcript)

    # output filename to store
    output_filename = input_filename[0:-5]  # remove the .flac extension
    output_filename = output_filename + '.txt'  # add .txt extension
    upload_blob(local_fpath, output_filename)


Example #26
    def __init__(self):
        from google.cloud import speech_v1
        from google.cloud.speech_v1 import enums
        self.client = speech_v1.SpeechClient()
        self.config = {
            'model': 'video',
            'language_code': 'en-US',
            'sample_rate_hertz': 16000,
            'encoding': enums.RecognitionConfig.AudioEncoding.LINEAR16,
        }
Example #27
    def sample_recognize(self, local_file_path, filename, path):
        client = speech_v1.SpeechClient()
        language_code = "en-US"
        enable_word_time_offsets = True
        use_enhanced = True
        encoding = enums.RecognitionConfig.AudioEncoding.LINEAR16

        config = {
            "language_code": language_code,
            "encoding": encoding,
            "enable_word_time_offsets": enable_word_time_offsets,
            "use_enhanced": use_enhanced,
        }
        with io.open(local_file_path, "rb") as f:
            content = f.read()
        audio = {"content": content}

        # Words to mute. The original listing appears to have had some of
        # these censored with asterisks by the hosting site; they are
        # restored here so the substring match can work.
        swears = [
            'fuck', 'shit', 'ass', 'bitch', 'whore', 'dick', 'fuk', "bitches",
            "sexual", "anus", "asshole", 'fucking'
        ]

        response = client.recognize(config, audio)
        ts = []
        for result in response.results:
            alternative = result.alternatives[0]
            print(u"Transcript: {}".format(alternative.transcript))

            for j in alternative.words:
                for key in swears:
                    if key in j.word:
                        # Convert seconds + nanos to fractional seconds.
                        start = j.start_time.seconds + j.start_time.nanos / 1e9
                        end = j.end_time.seconds + j.end_time.nanos / 1e9
                        print(j.word, start, end)
                        ts.extend([start, end])
                        print(ts)

        # Build an ffmpeg volume-filter expression that mutes each interval.
        bl = ''
        for i in range(0, len(ts), 2):
            bl += r'between(t\,{0}\,{1})+'.format(ts[i], ts[i + 1])
        if len(bl) != 0:
            bl = bl[:-1]
            os.system(
                '''ffmpeg -i {0} -max_muxing_queue_size 1024 -c:v copy -af volume=0:enable='{1}' {2}'''
                .format(path + filename + ".mp4", bl,
                        path + filename + "filtered" + ".mp4"))
        else:
            print(path, filename)
            os.chdir("/Users/VAISHNAVI/Desktop/mini/uploadedfiles")
            command = "copy {0} {1}".format(filename + ".mp4",
                                            filename + "filtered" + ".mp4")
            subprocess.call(command, shell=True)
Example #28
def speechToText(speakerProfileId, storage_uri):
    print(speakerProfileId)
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "key.json"
    client = speech_v1.SpeechClient()
    sample_rate_hertz = 16000
    language_code = "en-US"
    config = {
        "sample_rate_hertz": sample_rate_hertz,
        "language_code": language_code,
    }

    with io.open('/tmp/' + storage_uri, "rb") as f:
        content = f.read()
    audio = {"content": content}

    operation = client.long_running_recognize(config, audio)
    response = operation.result()
    transcript = ""
    for result in response.results:
        # First alternative is the most probable result
        alternative = result.alternatives[0]
        transcript += alternative.transcript
    fireStoreClient = firestore.Client()
    docReference = fireStoreClient.collection("enrolledUsers").document(
        speakerProfileId).get().to_dict()
    userName = docReference["name"]
    docReferenceScore = fireStoreClient.collection(
        'performanceScore').document(str(userName))
    importantWords = [
        "welcome", "thank you", "sorry", "apologise", "apologize", "good day",
        "nice day", "good morning", "good evening", "good noon", "awesome",
        "sweet", "hope", "see you", "bye", "hello", "hi", "please", "sure",
        "sort", "sorted", "enjoy", "safe"
    ]
    wordsSpoken = dict()
    print(transcript)

    for word in importantWords:
        if word in transcript:
            wordsSpoken[word] = transcript.count(word)
    print(wordsSpoken)
    try:
        word_dct = docReferenceScore.get().to_dict()
        for word, freq in wordsSpoken.items():
            if word in word_dct.keys():
                new_freq = word_dct[word]["frequency"] + freq
                docReferenceScore.set({word: {"frequency": new_freq}},
                                      merge=True)
            else:
                docReferenceScore.set({word: {"frequency": freq}}, merge=True)
    except Exception:
        # No existing score document yet; write the counts from scratch.
        for word, freq in wordsSpoken.items():
            docReferenceScore.set({word: {"frequency": freq}}, merge=True)
Example #29
def sample_long_running_recognize(storage_path, save_path):
    Print start and end time of each word spoken in audio file from Cloud Storage

    Args:
      storage_path can be URI for audio file in Cloud Storage, e.g. gs://[BUCKET]/[FILE]
                        or local file path
    """

    client = speech_v1.SpeechClient()

    # storage_uri = 'gs://cloud-samples-data/speech/brooklyn_bridge.flac'

    # When enabled, the first result returned by the API will include a list
    # of words and the start and end time offsets (timestamps) for those words.
    enable_word_time_offsets = True

    # The language of the supplied audio
    language_code = "en-US"
    encoding = enums.RecognitionConfig.AudioEncoding.FLAC
    config = {
        "enable_word_time_offsets": enable_word_time_offsets,
        "language_code": language_code,
        "audio_channel_count": 2,
        # "sample_rate_hertz": 8000,
        "encoding": encoding,
    }
    if storage_path.startswith('gs://'):
        audio = {"uri": storage_path}
    else:
        with io.open(storage_path, "rb") as f:
            content = f.read()
        audio = {"content": content}

    operation = client.long_running_recognize(config, audio)

    print(u"Waiting for operation to complete...")
    response = operation.result()

    name = storage_path.split('/')[-1].split('.')[0]
    with open(save_path + 'google_result_' + name + '.txt', 'w') as f:
        for result in response.results:
            # First alternative is the most probable result
            alternative = result.alternatives[0]
            f.write(u"Transcript: {}".format(alternative.transcript) + '\n')
            # Write the start and end time of each word
            for word in alternative.words:
                f.write(u"Word: {}".format(word.word) + '\n')
                f.write(u"Start time: {} seconds {} nanos".format(
                    word.start_time.seconds, word.start_time.nanos) + '\n')
                f.write(u"End time: {} seconds {} nanos".format(
                    word.end_time.seconds, word.end_time.nanos) + '\n')
    print('analysis finished')
Example #30
    def __init__(self):
        self.client = speech_v1.SpeechClient()
        self.uploader = Uploader()
        self.base_config = {
            "encoding": speech_v1.enums.RecognitionConfig.AudioEncoding.LINEAR16,
            "enable_word_time_offsets": True,
            "profanity_filter": False,
            "model": "default",
            "enable_automatic_punctuation": True
        }
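
The rest of the class is not shown; a hedged sketch of how such a base_config might feed a request, assuming a hypothetical Transcriber class name and audio path:

import io

transcriber = Transcriber()  # hypothetical class name for the snippet above
config = dict(transcriber.base_config, language_code='en-US',
              sample_rate_hertz=16000)
with io.open('/path/audio.wav', 'rb') as f:
    audio = {'content': f.read()}
response = transcriber.client.recognize(config, audio)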