def transcribe_gcs(gcs_uri):
    """Asynchronously transcribe the audio at *gcs_uri* and save the text.

    The transcript is written to ``<object-name>.txt`` in the current
    directory, where ``<object-name>`` is the GCS object's base name without
    its extension.  Also prints the elapsed wall-clock time in seconds.

    Args:
        gcs_uri: a ``gs://bucket/object`` URI of the audio to transcribe.
    """
    from google.cloud import speech
    from google.cloud.speech import types
    import os
    import time

    start = time.time()
    client = speech.SpeechClient()

    audio = types.RecognitionAudio(uri=gcs_uri)
    # NOTE(review): no encoding/sample rate given -- relies on the API
    # inferring them from the file header.
    config = types.RecognitionConfig(
        language_code='en-US')

    operation = client.long_running_recognize(config, audio)

    print('Waiting for operation to complete...')
    response = operation.result(timeout=9000000)

    # Derive the output name from the last path component and strip its
    # extension.  The original hard-coded index 3 (flat bucket layout only)
    # and sliced off exactly five characters (".flac"-length extensions
    # only); this handles nested object paths and any extension length.
    filename = os.path.splitext(gcs_uri.split('/')[-1])[0]
    with open(filename + ".txt", "w") as gsp:
        for result in response.results:
            gsp.write(result.alternatives[0].transcript)

    end = time.time()
    print(end - start)
Example #2
0
def transcribe_gcs(gcs_uri, language_code):
    """Transcribe the audio at *gcs_uri* and print per-portion word counts."""
    client = speech.SpeechClient()

    recognition_audio = types.RecognitionAudio(uri=gcs_uri)
    recognition_config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=16000,
        language_code=language_code,
        enable_word_time_offsets=True)

    operation = client.long_running_recognize(recognition_config,
                                              recognition_audio)

    print('Waiting for operation to complete...')
    response = operation.result(timeout=120)

    # Results arrive as consecutive portions of the audio; tally the word
    # frequencies of each portion's most likely alternative.
    for portion in response.results:
        best = portion.alternatives[0]
        print(Counter(info.word for info in best.words))
def main():
    """Stream microphone audio to the Speech API and print transcripts."""
    # See http://g.co/cloud/speech/docs/languages
    # for a list of supported languages.
    language_code = 'en-US'  # a BCP-47 language tag

    client = speech.SpeechClient()
    recognition_config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code=language_code)
    streaming_config = types.StreamingRecognitionConfig(
        config=recognition_config,
        interim_results=True)

    with MicrophoneStream(RATE, CHUNK) as mic:
        requests = (types.StreamingRecognizeRequest(audio_content=chunk)
                    for chunk in mic.generator())

        responses = client.streaming_recognize(streaming_config, requests)

        # Now, put the transcription responses to use.
        listen_print_loop(responses)
Example #4
0
def transcribe_file(speech_file):
    """Transcribe the given audio file; return "Out" if nothing is heard."""
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types

    client = speech.SpeechClient()

    with open(speech_file, 'rb') as audio_file:
        audio_bytes = audio_file.read()

    audio = types.RecognitionAudio(content=audio_bytes)
    # Encoding is intentionally unspecified; the API infers it.
    config = types.RecognitionConfig(
        sample_rate_hertz=16000,
        language_code='ko-KR')

    response = client.recognize(config, audio)
    # Return the top transcript of the first portion.  "Out" signals that
    # the API recognized no speech at all.
    try:
        return response.results[0].alternatives[0].transcript
    except IndexError:
        return "Out"
Example #5
0
    def __init__(self, callback):
        """Set up a streaming-recognition thread and start listening.

        NOTE(review): this constructor never returns -- the ``while True``
        loop at the bottom begins calling self.listen() immediately, so the
        Thread ``start()``/``run()`` machinery is effectively bypassed.

        Args:
            callback: stored for later use; presumably invoked by
                self.listen() with recognition results -- listen() is
                defined elsewhere, verify against it.
        """

        Thread.__init__(self)
        self.callback = callback
        self.client = speech.SpeechClient()

        # See http://g.co/cloud/speech/docs/languages
        # for a list of supported languages.
        language_code = 'en-US'  # a BCP-47 language tag

        config = types.RecognitionConfig(
            encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=RATE,
            language_code=language_code)

        self.streaming_config = types.StreamingRecognitionConfig(
            config=config,
            interim_results=True)


        # Blocks forever: prompt, then run one listen session per iteration.
        while True:
            print("Listen again?")
            self.listen()
Example #6
0
def googleApiCall(path):
    """Transcribe the 48 kHz FLAC file at *path* with word time offsets.

    Returns the API response after post-processing by formatResponse().
    """
    # Instantiates a client
    client = speech.SpeechClient()

    # The name of the audio file to transcribe
    file_name = path  #'/home/sanghs3/Capstone/umm.flac'

    # Loads the audio into memory
    with io.open(file_name, 'rb') as audio_file:
        audio = types.RecognitionAudio(content=audio_file.read())

    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=48000,
        language_code='en-US',
        enable_word_time_offsets=True)

    # Detect speech, then normalize the payload for callers.
    raw_response = client.recognize(config, audio)
    return formatResponse(raw_response)
Example #7
0
def transcribe_gcs(gcs_uri):
    """Asynchronously transcribe the FLAC at *gcs_uri*; print the results."""
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types

    client = speech.SpeechClient()

    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=16000,
        language_code='en-US')
    audio = types.RecognitionAudio(uri=gcs_uri)

    operation = client.long_running_recognize(config, audio)

    print('Waiting for operation to complete...')
    response = operation.result(timeout=100)

    # Each result covers a consecutive portion of the audio; the first
    # alternative is the most likely transcription of that portion.
    for portion in response.results:
        best = portion.alternatives[0]
        print(u'Transcript: {}'.format(best.transcript))
        print('Confidence: {}'.format(best.confidence))
Example #8
0
def transcribe_file(speech_file):
    """Transcribe the given audio file and print each portion's transcript.

    Args:
        speech_file: path to a LINEAR16 (PCM) audio file sampled at 44.1 kHz.
    """
    # Cleanup: the unused local ``import json`` and the dead commented-out
    # json.dumps call have been removed.
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types
    import io

    client = speech.SpeechClient()

    with io.open(speech_file, 'rb') as audio_file:
        content = audio_file.read()

    audio = types.RecognitionAudio(content=content)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=44100,
        language_code='en-US')

    response = client.recognize(config, audio)
    # Print the first alternative of all the consecutive results.
    for result in response.results:
        print('Transcript: {}'.format(result.alternatives[0].transcript))
Example #9
0
def transcribe_gcs(gcs_uri):
    """Asynchronously transcribes the audio file specified by the gcs_uri."""
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types

    client = speech.SpeechClient()

    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
        language_code='pt-BR')
    audio = types.RecognitionAudio(uri=gcs_uri)

    operation = client.long_running_recognize(config, audio)

    print('Waiting for operation to complete...')
    response = operation.result(timeout=1800)
    print('Done with operation')

    # Print the first alternative of all the consecutive results.
    for portion in response.results:
        best = portion.alternatives[0]
        print('Transcript: {}'.format(best.transcript))
        print('Confidence: {}'.format(best.confidence))
Example #10
0
def audio_to_text():
    """Transcribe a fixed local WAV file and return its first transcript.

    Prints every recognized portion, then returns the top transcript of the
    first portion.  Robustness fix: the original indexed ``results[0]``
    unconditionally and raised IndexError when the API recognized no speech;
    an empty string is now returned in that case.
    """
    # Instantiates a client
    client = speech.SpeechClient()
    file_name = "/Users/amirulislam/Downloads/announce.wav"
    print("File is ", file_name)

    # Loads the audio into memory
    with io.open(file_name, 'rb') as audio_file:
        content = audio_file.read()
        audio = types.RecognitionAudio(content=content)

    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=48000,
        language_code='ar-QA',
        audio_channel_count=1)

    # Detects speech in the audio file
    response = client.recognize(config, audio)

    for result in response.results:
        print('Transcript: {}'.format(result.alternatives[0].transcript))
    if not response.results:
        return ''
    return response.results[0].alternatives[0].transcript
Example #11
0
    def __init__(self,
                 credentials=None,
                 RATE=16000,
                 CHUNK=1024,
                 language_code='en-US'):
        """Build a streaming Speech client tuned for short commands.

        Args:
            credentials: optional path to a service-account JSON key file;
                when None, application-default credentials are used.
            RATE: audio sample rate in Hz.
            CHUNK: audio buffer size in frames.
            language_code: BCP-47 language tag for recognition.
        """
        self.RATE = RATE
        self.CHUNK = CHUNK
        self.language_code = language_code
        # Idiom fix: identity comparison with None (was ``!= None``).
        if credentials is not None:
            credentials = service_account.Credentials.from_service_account_file(
                credentials)
        self.client = speech.SpeechClient(credentials=credentials)

        #TODO: speech_contexts -> https://cloud.google.com/speech-to-text/docs/basics#phrase-hints
        self.config = types.RecognitionConfig(
            encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=RATE,
            language_code=language_code,
            model='command_and_search',
            enable_automatic_punctuation=True)

        self.streaming_config = types.StreamingRecognitionConfig(
            config=self.config, interim_results=True, single_utterance=True)
Example #12
0
def transcribe_gcs(gcs_uri):
    """Asynchronously transcribes the audio file specified by the gcs_uri."""
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types

    client = speech.SpeechClient()

    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=16000,
        language_code='en-US',
        enable_word_time_offsets=True)
    audio = types.RecognitionAudio(uri=gcs_uri)

    operation = client.long_running_recognize(config, audio)

    print('Waiting for operation to complete...')
    result = operation.result(timeout=90)

    # Only the first portion's alternatives are reported here.
    for alternative in result.results[0].alternatives:
        print('Transcript: {}'.format(alternative.transcript))
        print('Confidence: {}'.format(alternative.confidence))
def get_text(file_name):
    """Return the concatenated transcript of the local audio file."""
    # Loads the audio into memory
    with io.open(file_name, 'rb') as audio_file:
        audio = types.RecognitionAudio(content=audio_file.read())

    # Sample rate is intentionally unspecified; the API infers it.
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        language_code='en-US')

    # Instantiate a client
    client = speech.SpeechClient()

    # Detects speech in the audio file; join every portion's top transcript.
    response = client.recognize(config, audio)
    return ''.join(portion.alternatives[0].transcript
                   for portion in response.results)
Example #14
0
def main():
    """Stream microphone audio (zh-TW) to the Speech API and print results."""
    language_code = 'zh-TW'
    client = speech.SpeechClient()
    recognition_config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code=language_code)
    # Own experiment: single_utterance=True would stop after one phrase.
    streaming_config = types.StreamingRecognitionConfig(
        config=recognition_config,
        interim_results=True)

    with MicrophoneStream(RATE, CHUNK) as mic:
        requests = (types.StreamingRecognizeRequest(audio_content=chunk)
                    for chunk in mic.generator())

        responses = client.streaming_recognize(streaming_config, requests)

        # Now, put the transcription responses to use.
        listen_print_loop(responses)
Example #15
0
    def google_stt_streaming(self, socket_action):
        """Stream audio chunks from *socket_action* through the Speech API.

        Returns the text produced by self.listen_print_loop().
        """
        # See http://g.co/cloud/speech/docs/languages
        # for a list of supported languages.
        language_code = 'ko-KR'  # a BCP-47 language tag

        client = speech.SpeechClient()
        recognition_config = types.RecognitionConfig(
            encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=RATE,
            language_code=language_code)
        streaming_config = types.StreamingRecognitionConfig(
            config=recognition_config, interim_results=True)

        requests = (types.StreamingRecognizeRequest(audio_content=chunk)
                    for chunk in socket_action.get_data())

        responses = client.streaming_recognize(streaming_config, requests)

        # Now, put the transcription responses to use.
        return self.listen_print_loop(responses)
Example #16
0
def recognize():
    """Transcribe 'output.wav'; return (transcript, confidence) as strings.

    Robustness fix: the module-level ``text``/``conf`` globals are reset to
    empty strings before the loop.  Previously, a response with no results
    either raised NameError (globals never assigned) or silently returned
    stale values from an earlier call.
    """
    global text
    global conf
    text, conf = '', ''
    # Instantiates a client
    client = speech.SpeechClient()
    # The name of the audio file to transcribe
    file_name = 'output.wav'
    # Loads the audio into memory
    with io.open(file_name, 'rb') as audio_file:
        content = audio_file.read()
        audio = types.RecognitionAudio(content=content)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code='en-US')
    # Detects speech in the audio file
    response = client.recognize(config, audio)

    # Keeps the LAST portion's top transcript/confidence (original behavior).
    for result in response.results:
        text = str(result.alternatives[0].transcript)
        conf = str(result.alternatives[0].confidence)

    return text, conf
Example #17
0
    def run(self): 
        """Worker loop: pull audio frames off ``stt_buffer``, transcribe them,
        and push recognized text to ``text_out``.

        Runs until the module-level ``done`` event is set.  Frames are taken
        from the module-level ``stt_buffer`` queue; each item exposes
        ``.shape`` and ``.tobytes()`` -- presumably a numpy int16 PCM array
        at RATE Hz, TODO confirm against the producer.
        """
        client = speech.SpeechClient()
        config = types.RecognitionConfig(
                    encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
                    sample_rate_hertz=RATE,
                    language_code='en-US')
        while not done.is_set():
            # Short timeout so the done flag is re-checked ~10x a second.
            try:
                data = stt_buffer.get(timeout=0.1)
            except queue.Empty:
                continue

            print(datetime.datetime.now(), "Requesting transcription of %d data frames" % data.shape[0])
            audio = types.RecognitionAudio(content=data.tobytes())
            response = client.recognize(config, audio)
            print(datetime.datetime.now(), "Transcription complete: %r" % response) 
            for result in response.results:
                text = result.alternatives[0].transcript
                print(datetime.datetime.now(), "Text: %r" % text) 
                if text: 
                    text_out.put(text)
                else:
                    print(datetime.datetime.now(), 'Decoding complete. No text returned')
def transcribe_large(full_audio_name, frame_rate, language_code):
    """Transcribe a long audio file by staging it in a GCS bucket.

    Uploads the file, runs long-running recognition against the gs:// URI,
    deletes the staged object, and returns the raw API response.
    """
    audio_file_name = ntpath.basename(full_audio_name)

    upload_to_bucket(const.TRANSCRIPTION_GOOGLE_BUCKET, full_audio_name,
                     audio_file_name)
    gcs_uri = 'gs://' + const.TRANSCRIPTION_GOOGLE_BUCKET + '/' + audio_file_name

    client = speech.SpeechClient()
    staged_audio = speech_types.RecognitionAudio(uri=gcs_uri)
    config = speech_types.RecognitionConfig(
        encoding=speech_enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=frame_rate,
        language_code=language_code,
    )

    # Detects speech in the audio file (blocks up to timeout seconds).
    operation = client.long_running_recognize(config, staged_audio)
    response = operation.result(timeout=10000)

    delete_from_bucket(const.TRANSCRIPTION_GOOGLE_BUCKET, audio_file_name)
    return response
Example #19
0
def main():
    """Stream microphone audio using explicit service-account credentials."""
    language_code = 'en-IN'  # a BCP-47 language tag (alternatives: te-IN, en-US)
    creds = service_account.Credentials.from_service_account_file(
        'googleKeys.json')
    client = speech.SpeechClient(credentials=creds)

    recognition_config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code=language_code)
    streaming_config = types.StreamingRecognitionConfig(
        config=recognition_config,
        interim_results=True)

    with MicrophoneStream(RATE, CHUNK) as mic:
        print("inside stream")
        requests = (types.StreamingRecognizeRequest(audio_content=chunk)
                    for chunk in mic.generator())

        responses = client.streaming_recognize(streaming_config, requests)

        # Now, put the transcription responses to use.
        listen_print_loop(responses)
Example #20
0
    def analyze_text(self):
        """Transcribe self.filename and return the full text as one string.

        Only valid after self.load(); the file must be a mono .wav.
        """
        from google.cloud import speech
        from google.cloud.speech import enums
        from google.cloud.speech import types

        client = speech.SpeechClient()

        with io.open(self.filename, 'rb') as audio_file:
            audio = types.RecognitionAudio(content=audio_file.read())

        # Sample rate left unspecified; the API infers it from the file.
        config = types.RecognitionConfig(
            encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
            language_code='en-US')

        response = client.recognize(config, audio)
        # Concatenate the top transcript of every consecutive portion.
        return ''.join(portion.alternatives[0].transcript
                       for portion in response.results)
Example #21
0
    def start(self, window):
        """Stream one microphone session and hand responses to the listener."""
        self.window = window
        language_code = 'ko-KR'  # a BCP-47 language tag

        client = speech.SpeechClient()
        recognition_config = types.RecognitionConfig(
            encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=RATE,
            language_code=language_code)
        streaming_config = types.StreamingRecognitionConfig(
            config=recognition_config,
            interim_results=True)

        # To run forever, wrap the block below in `while True:`.
        with MicrophoneStream(RATE, CHUNK) as mic:
            requests = (types.StreamingRecognizeRequest(audio_content=chunk)
                        for chunk in mic.generator())

            responses = client.streaming_recognize(streaming_config, requests)

            # Now, put the transcription responses to use.
            self.listen_print_loop(responses)
Example #22
0
    def _transcribe(self, uri, audio):
        """Run long-running recognition on *uri*; return the raw response.

        *audio* supplies the frame rate, self.lang the language; the
        blocking wait is timed and logged.
        """
        client = speech.SpeechClient()

        # NOTE -- according to https://cloud.google.com/speech/quotas  we are
        # limited to ~180 Minutes
        recognition_audio = types.RecognitionAudio(uri=uri)
        # https://cloud.google.com/speech/reference/rest/v1/RecognitionConfig
        config = types.RecognitionConfig(
            encoding=enums.RecognitionConfig.AudioEncoding.OGG_OPUS,
            sample_rate_hertz=audio.frame_rate,
            language_code=self.lang,
            enable_word_time_offsets=True)

        # https://cloud.google.com/speech/docs/async-recognize
        # https://googlecloudplatform.github.io/google-cloud-python/latest/speech/index.html#asynchronous-recognition
        operation = client.long_running_recognize(config, recognition_audio)

        started = time.time()
        logger.info('Beginning transcription')
        response = operation.result(timeout=self.TIMEOUT)
        elapsed = time.time() - started
        logger.info("Transcription finished in {}".format(Time(elapsed)))
        return response
Example #23
0
 def translate_with_timestamps(self, gs_uri):
     """Transcribe the FLAC file at *gs_uri* and return per-word timings.

     Returns:
         A list of ``[word, start_time, end_time]`` triples, with times in
         seconds (proto seconds + nanos) rounded to one decimal place.
     """
     audio = types.RecognitionAudio(uri=gs_uri, )
     config = types.RecognitionConfig(
         encoding='FLAC',
         language_code='en-US',
         # sample_rate_hertz=44100,
         enable_word_time_offsets=True)
     # Blocks until the long-running operation finishes (no timeout given).
     operation = self.client.long_running_recognize(config=config,
                                                    audio=audio)
     results = []
     for result in operation.result().results:
         alternatives = result.alternatives
         if len(alternatives) == 0:
             continue
         # Only the most likely alternative's word timings are collected.
         alternative = alternatives[0]
         for word_info in alternative.words:
             word = word_info.word
             # seconds + nanos -> float seconds, rounded to 0.1 s.
             start_time = word_info.start_time.seconds + round(
                 word_info.start_time.nanos * 1e-9, 1)
             end_time = word_info.end_time.seconds + round(
                 word_info.end_time.nanos * 1e-9, 1)
             results.append([word, start_time, end_time])
     return results
Example #24
0
def transcribe_gcs(uri_gs):
    """Asynchronously transcribes the audio file specified by the gcs_uri."""
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types

    client = speech.SpeechClient()

    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
        language_code='ja-JP')
    audio = types.RecognitionAudio(uri=uri_gs)

    operation = client.long_running_recognize(config, audio)

    print('Waiting for operation to complete...')
    response = operation.result(timeout=9000)

    # Write the top transcript of every consecutive portion to a text file
    # named after the GCS object.
    out_name = '{}.txt'.format(uri_gs.split('/')[-1])
    with open(out_name, 'w') as f:
        for portion in response.results:
            f.write(portion.alternatives[0].transcript)
Example #25
0
def audio_to_text(fname):
    """Convert the MP3 *fname* to FLAC and return its full transcript.

    Bug fix: the original loop rebound ``transcript`` on every result, so
    only the LAST consecutive portion of the audio survived; portions are
    now concatenated so the entire recording is returned.

    Args:
        fname: path to an MP3 file (converted via mp3_to_flac()).

    Returns:
        The concatenated transcript string ('' if nothing was recognized).
    """
    flacfile = mp3_to_flac(fname)

    client = speech.SpeechClient()

    transcript = ''
    with io.open(flacfile, 'rb') as audio_file:
        content = audio_file.read()
        audio = types.RecognitionAudio(content=content)
        config = types.RecognitionConfig(
            encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
            sample_rate_hertz=22050,
            language_code='en-US')

        # Detects speech in the audio file
        response = client.recognize(config, audio)
        for result in response.results:
            transcript += result.alternatives[0].transcript

    return transcript
Example #26
0
    def __init__(self, engine, args, loop):
        """Recognition thread: store collaborators and build streaming config.

        Args:
            engine: presumably the dialogue engine this thread reports to --
                confirm against the caller.
            args: parsed options; only stored here.
            loop: event loop, used elsewhere in the class.
        """

        Thread.__init__(self)
        self.client = speech.SpeechClient()
        self.args = args
        self.engine = engine
        self.stop_recognition = False
        self.loop = loop
        # NOTE(review): "UNKOWN" (sic) is a runtime value; other code may
        # compare against this exact spelling, so it is left as-is.
        self.role = "UNKOWN"
        self.crash = False

        # See http://g.co/cloud/speech/docs/languages
        # for a list of supported languages.
        language_code = 'en-US'  # a BCP-47 language tag

        config = types.RecognitionConfig(
            encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=RATE,
            language_code=language_code)

        self.streaming_config = types.StreamingRecognitionConfig(
            config=config,
            interim_results=True)
Example #27
0
def transcribe_file(speech_file):
    """Transcribe *speech_file*, printing each portion and saving the text.

    Bug fix: the output file was reopened with mode 'w' inside the loop, so
    every portion truncated the previous one and only the last transcript
    survived in ./test.tmp.  The handle is now opened once with a context
    manager (closed even on error) and every portion is written in order.
    """
    client = speech.SpeechClient()

    with io.open(speech_file, 'rb') as audio_file:
        content = audio_file.read()

    audio = types.RecognitionAudio(content=content)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=44100,
        language_code='en-US')

    operation = client.long_running_recognize(config, audio)

    print('Waiting for operation to complete...')
    response = operation.result(timeout=90)

    with open("./test.tmp", 'w') as f:
        for result in response.results:
            transcript = result.alternatives[0].transcript
            print(u'{}'.format(transcript))
            f.write(transcript)
Example #28
0
def transcribe_file(speech_file, textFile):
    """Transcribe the given audio file and write the text to *textFile*.

    Bug fix: each transcript was encoded to bytes and then passed through
    ``str()``, which writes Python's "b'...'" repr into the file instead of
    the text.  Transcripts are now written directly as UTF-8.  The output
    file is also managed with a context manager so it closes on error.
    """
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types
    import io

    client = speech.SpeechClient()

    with io.open(speech_file, 'rb') as audio_file:
        content = audio_file.read()

    audio = types.RecognitionAudio(content=content)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=44100,
        language_code='es-MX')

    response = client.recognize(config, audio)
    with io.open(textFile, 'w', encoding='utf-8') as out:
        for result in response.results:
            transcript = result.alternatives[0].transcript
            print(transcript)
            out.write(transcript)
Example #29
0
    def transcribe_file(self, file_name):
        """Transcribe a local 8 kHz LINEAR16 file.

        Returns:
            A list of the non-empty top transcripts, one per portion.
        """
        # Reuse the client built at construction time.
        client = self.client

        # Loads the audio into memory
        with io.open(file_name, 'rb') as audio_file:
            audio = types.RecognitionAudio(content=audio_file.read())

        config = types.RecognitionConfig(
            encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=8000,
            language_code='en-US')

        # Detects speech in the audio file
        response = client.recognize(config, audio)

        # Keep only portions whose top transcript is non-empty.
        return [portion.alternatives[0].transcript
                for portion in response.results
                if portion.alternatives[0].transcript]
Example #30
0
def transcribe_streaming(stream_file):
    """Streams transcription of the given audio file."""
    import io
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types

    client = speech.SpeechClient()

    with io.open(stream_file, 'rb') as audio_file:
        content = audio_file.read()

    # In practice, stream should be a generator yielding chunks of audio
    # data; here the whole file is sent as one chunk.
    chunks = [content]
    requests = (types.StreamingRecognizeRequest(audio_content=chunk)
                for chunk in chunks)

    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code='es-es')
    streaming_config = types.StreamingRecognitionConfig(config=config)

    # streaming_recognize returns a generator.
    responses = client.streaming_recognize(streaming_config, requests)

    for response in responses:
        # Once the transcription has settled, the first result carries
        # is_final; later results cover subsequent portions of the audio.
        for result in response.results:
            print('Finished: {}'.format(result.is_final))
            print('Stability: {}'.format(result.stability))
            # The alternatives are ordered from most likely to least.
            for alternative in result.alternatives:
                print('Confidence: {}'.format(alternative.confidence))
                print(u'Transcript: {}'.format(alternative.transcript))