Ejemplo n.º 1
0
def transcribe_file(speech_file):
    """Transcribe the given audio file and update module-level transcript state.

    Args:
        speech_file: Path to a local 2-channel LINEAR16 audio file.

    Returns:
        Tuple ``(FULL_TRANSCRIPT, TRANSCRIPT, TIMES)`` — module-level globals
        populated by ``print_sentences()``.
    """
    # NOTE: the docstring must be the first statement of the function body;
    # in the original it followed ``global`` and was therefore a no-op string
    # expression rather than a real docstring.
    global TIMES, sentence, TRANSCRIPT
    from google.cloud import speech
    import io

    client = speech.SpeechClient()

    with io.open(speech_file, "rb") as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        language_code="en-US",
        audio_channel_count=2,
        enable_automatic_punctuation=True,
        enable_word_time_offsets=True,
    )

    response = client.recognize(config=config, audio=audio)

    print_sentences(response)
    # Drop the trailing entry appended by print_sentences().
    # NOTE(review): assumes TIMES is non-empty here — confirm against
    # print_sentences(), which is defined elsewhere in the project.
    TIMES.pop()
    return FULL_TRANSCRIPT, TRANSCRIPT, TIMES
Ejemplo n.º 2
0
def transcribe_file(speech_file):
    """Transcribe the given audio file.

    Args:
        speech_file: Path to a local 2-channel, 48 kHz LINEAR16 audio file.

    Returns:
        The concatenated transcript of all recognized portions of the audio
        (empty string when nothing was recognized).
    """
    from google.cloud import speech
    import io

    client = speech.SpeechClient()

    with io.open(speech_file, "rb") as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=48000,
        audio_channel_count=2,
        enable_separate_recognition_per_channel=True,
        language_code="en-US",
    )

    response = client.recognize(config=config, audio=audio)

    # Each result is for a consecutive portion of the audio. The original
    # returned from inside the loop, silently discarding every portion after
    # the first; join the best alternative of every portion instead.
    return "".join(
        result.alternatives[0].transcript for result in response.results
    )
Ejemplo n.º 3
0
    def test_streaming_recognize(self):
        """Smoke-test streaming recognition against the sample FLAC file."""
        # Fall back to the public samples bucket when the env var is unset.
        bucket = os.environ.get(
            "GOOGLE_CLOUD_TESTS_SPEECH_BUCKET", "cloud-samples-tests"
        )

        recognition_config = speech_v1.RecognitionConfig(
            encoding=speech_v1.RecognitionConfig.AudioEncoding.FLAC,
            language_code="en-US",
            sample_rate_hertz=16000,
        )
        streaming_config = speech_v1.StreamingRecognitionConfig(
            config=recognition_config
        )

        audio_url = "https://storage.googleapis.com/{}/speech/brooklyn.flac".format(
            bucket
        )
        stream_requests = [
            speech_v1.StreamingRecognizeRequest(
                audio_content=requests.get(audio_url).content
            )
        ]

        client = speech_v1.SpeechClient()
        responses = client.streaming_recognize(
            config=streaming_config, requests=stream_requests
        )

        # Every streamed result should carry a non-null top transcript.
        for response in responses:
            for result in response.results:
                assert result.alternatives[0].transcript is not None
Ejemplo n.º 4
0
    def google_STT(self, audio):
        """Transcribe a local audio file with Google Cloud Speech-to-Text.

        Args:
            audio: Path to the audio file to transcribe.

        Returns:
            The concatenated transcript of all recognized segments (empty
            string when nothing was recognized).
        """
        client = speech_v1.SpeechClient.from_service_account_json(
            '/data/second-conquest-293723-05738e995f8f.json')

        # Load the audio into memory.
        with io.open(audio, "rb") as audio_file:
            content = audio_file.read()
            audio = speech_v1.RecognitionAudio(content=content)

        config = speech_v1.RecognitionConfig(
            encoding=speech_v1.RecognitionConfig.AudioEncoding.ENCODING_UNSPECIFIED,
            sample_rate_hertz=22050,
            language_code="en-US",
            enable_automatic_punctuation=True,
        )

        # Detect speech in the audio file. (Removed an unused timer variable
        # and dead commented-out code from the original.)
        response = client.recognize(request={"config": config, "audio": audio})
        # join() instead of += in a loop — avoids quadratic string building.
        return ''.join(
            result.alternatives[0].transcript for result in response.results
        )
Ejemplo n.º 5
0
def speech_to_text(local_speech_file):
    """Recognize speech in a local file and print the resulting sentences."""
    with io.open(local_speech_file, "rb") as audio_file:
        audio_bytes = audio_file.read()

    recognition_audio = speech.RecognitionAudio(content=audio_bytes)
    # Keyword arguments instead of a positional mapping — proto-plus
    # messages accept both forms identically.
    recognition_config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.ENCODING_UNSPECIFIED,
        sample_rate_hertz=16000,
        language_code="zh-TW",
    )

    client = speech.SpeechClient()
    response = client.recognize(config=recognition_config, audio=recognition_audio)
    print_sentences(response)
Ejemplo n.º 6
0
def main():
    """Stream microphone audio to Cloud Speech-to-Text and print transcripts."""
    # Pre-generate the canned gTTS responses as .mp3 files.
    canned_phrases = [
        ('You are the fairest of them all!', 'fairest.mp3', 'en'),
        ('You didnt tell me what to do with that.', 'idontknow.mp3', 'en'),
        ('Go bears!', 'gobears.mp3', 'en'),
        ('Boo oo oo oo oo!', 'boo.mp3', None),
    ]
    for phrase, mp3_name, lang in canned_phrases:
        if lang is None:
            gTTS(phrase).save(mp3_name)
        else:
            gTTS(phrase, lang=lang).save(mp3_name)

    # See http://g.co/cloud/speech/docs/languages for supported languages.
    # This code comes from Google Cloud's Speech-to-Text API samples.
    language_code = 'en-US'  # a BCP-47 language tag

    client = speech_v1.SpeechClient()
    # Tell GCP the encoding and sample rate of the microphone stream.
    recognition_config = speech_v1.RecognitionConfig(
        encoding=speech_v1.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code=language_code)
    # Streaming recognition with interim results; the asynchronous
    # (non-streaming) APIs are simpler if live feedback is not needed.
    streaming_config = speech_v1.StreamingRecognitionConfig(
        config=recognition_config, interim_results=True)

    # Open a microphone stream, chunk it into streaming requests, and send
    # them to GCP; `responses` yields the API's transcription results.
    with MicrophoneStream(RATE, CHUNK) as stream:
        chunk_iter = stream.generator()
        stream_requests = (
            speech_v1.StreamingRecognizeRequest(audio_content=chunk)
            for chunk in chunk_iter)

        responses = client.streaming_recognize(streaming_config, stream_requests)

        # Put the transcription responses to use.
        listen_print_loop(responses)
Ejemplo n.º 7
0
    def test_long_running_recognize(self):
        """Smoke-test long-running recognition on the sample FLAC file."""
        # Fall back to the public samples bucket when the env var is unset.
        bucket = os.environ.get(
            "GOOGLE_CLOUD_TESTS_SPEECH_BUCKET", "cloud-samples-tests"
        )

        recognition_config = speech_v1.RecognitionConfig(
            encoding=speech_v1.RecognitionConfig.AudioEncoding.FLAC,
            language_code="en-US",
            sample_rate_hertz=16000,
        )

        audio = {"uri": "gs://{}/speech/brooklyn.flac".format(bucket)}

        client = speech_v1.SpeechClient()
        operation = client.long_running_recognize(
            config=recognition_config, audio=audio
        )

        assert operation.result() is not None
Ejemplo n.º 8
0
def STT(audio_path, save_path=None):
    """Transcribe a local audio file and save the transcript as a .txt file.

    Args:
        audio_path: Path to the audio file (e.g. an .mp3) to transcribe.
        save_path: Optional directory for the transcript file; created if
            missing. Defaults to the current working directory.
    """
    client = speech_v1.SpeechClient.from_service_account_json(
        '/data/second-conquest-293723-05738e995f8f.json')

    # Load the audio into memory.
    with io.open(audio_path, "rb") as audio_file:
        content = audio_file.read()
        audio = speech_v1.RecognitionAudio(content=content)

    config = speech_v1.RecognitionConfig(
        encoding=speech_v1.RecognitionConfig.AudioEncoding.ENCODING_UNSPECIFIED,
        sample_rate_hertz=22050,
        language_code="en-US",
        enable_automatic_punctuation=True,
    )

    # Detect speech in the audio file.
    start = time.time()
    response = client.recognize(request={"config": config, "audio": audio})
    # join() instead of += in a loop — avoids quadratic string building.
    text = ''.join(
        result.alternatives[0].transcript for result in response.results)
    print(text)

    audio_name = audio_path.split('/')[-1].replace('.mp3', '')
    save_file_name = audio_name + GetCurrentDatetime() + '.txt'

    if save_path is not None:  # `is not None`, not `!= None`
        os.makedirs(save_path, exist_ok=True)
        # Write into save_path directly instead of os.chdir(), which would
        # mutate process-wide state for every caller.
        save_file_name = os.path.join(save_path, save_file_name)

    with open(save_file_name, 'w') as f:
        f.write(text)

    print('Inferred Audio File Name: ', audio_path)
    print('Transcribed Script File Saved: ', save_file_name)
    print('Processing Time: ', time.time() - start)
Ejemplo n.º 9
0
def main():
    """Stream microphone audio to Cloud Speech-to-Text (Japanese) and print results."""
    # See http://g.co/cloud/speech/docs/languages for supported languages.
    language_code = 'ja-JP'  # a BCP-47 language tag

    client = speech.SpeechClient()
    recognition_config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code=language_code)
    streaming_config = speech.StreamingRecognitionConfig(
        config=recognition_config, interim_results=True)

    with MicrophoneStream(RATE, CHUNK) as stream:
        chunk_iter = stream.generator()
        stream_requests = (
            speech.StreamingRecognizeRequest(audio_content=chunk)
            for chunk in chunk_iter)

        responses = client.streaming_recognize(streaming_config, stream_requests)

        # Now, put the transcription responses to use.
        listen_print_loop(responses)
Ejemplo n.º 10
0
def convert(file, folder, pack):
    """Transcribe 15-second audio chunks in a directory and write an .srt file.

    Args:
        file: Directory containing the pre-chunked audio files.
        folder: Output directory for the subtitle file.
        pack: Base name of the resulting ``<pack>.srt`` file.
    """
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "api-key.json"
    chunk_names = sorted(os.listdir(str(file) + '/'))
    all_text = []
    for chunk_name in chunk_names:
        name = str(file) + '/' + chunk_name
        print("Transcribing File- " + str(name))
        with open(name, "rb") as audio_file:
            content = audio_file.read()
        try:
            config = speech.RecognitionConfig(
                encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
                language_code="en-US")
            audio = speech.RecognitionAudio(content=content)
            # NOTE(review): assumes speech_to_text() returns the chunk's
            # transcript string — confirm against its definition.
            all_text.append(speech_to_text(config, audio))
        except Exception as e:
            # Best-effort: keep going and mark the chunk as silent/failed.
            print(e)
            all_text.append("No Audio")

    # Each chunk covers a fixed 15-second window; emit one SRT cue per chunk.
    # Accumulate cues in a list and join once — the original rebuilt (and
    # re-printed) the whole cumulative transcript every iteration (O(n^2)).
    entries = []
    for i, t in enumerate(all_text):
        start_seconds = i * 15
        m, s = divmod(start_seconds, 60)
        h, m = divmod(m, 60)

        end_seconds = start_seconds + 15
        m_n, s_n = divmod(end_seconds, 60)
        h_n, m_n = divmod(m_n, 60)

        entry = "{}\n{:0>2d}:{:0>2d}:{:0>2d},000 --> {:0>2d}:{:0>2d}:{:0>2d},000\n {}\n\n".format(
            i + 1, h, m, s, h_n, m_n, s_n, t)
        entries.append(entry)
        print("Transcript completed- " + entry)
    transcript = "".join(entries)

    transcript_file = str(folder) + "/" + str(pack) + ".srt"
    with open(transcript_file, "w") as f:
        f.write(transcript)
    def google_STT(self, path):
        """Transcribe an audio file and split it into sentences with timings.

        Args:
            path: Path to the audio file (.wav files are treated as 2-channel).

        Returns:
            Tuple ``(text, sent_start_time)``: ``text`` is the list of
            recognized sentences; ``sent_start_time`` starts at 0 and gains
            one end-of-sentence offset (seconds) per detected sentence, so it
            doubles as the start offsets of the following sentences.
        """
        client = speech_v1.SpeechClient.from_service_account_json(
            '/data/second-conquest-293723-05738e995f8f.json')

        # Load the audio into memory.
        with io.open(path, "rb") as audio_file:
            content = audio_file.read()
            audio = speech_v1.RecognitionAudio(content=content)

        config = speech_v1.RecognitionConfig(
            encoding=speech_v1.RecognitionConfig.AudioEncoding.ENCODING_UNSPECIFIED,
            sample_rate_hertz=22050,
            language_code="en-US",
            audio_channel_count=2 if path.endswith('.wav') else 1,
            enable_automatic_punctuation=True,
            enable_word_time_offsets=True,
        )

        # Detect speech in the audio file. (Removed an unused timer, the
        # unused sent_tokenize() result, and the dead `end_word` list from
        # the original.)
        response = client.recognize(request={"config": config, "audio": audio})

        sent = ''
        text = []
        sent_start_time = [0]

        for result in response.results:
            alternative = result.alternatives[0]

            for word_info in alternative.words:
                word = word_info.word
                end_time = word_info.end_time

                sent = sent + ' ' + word

                # A '.' or '?' inside the word marks a sentence boundary
                # (automatic punctuation is enabled above). `or` with
                # `is not None` replaces the original bitwise `|` on
                # `!= None` comparisons — same truth table, idiomatic and
                # short-circuiting.
                if (re.search(r'\.', word) is not None
                        or re.search(r'\?', word) is not None):
                    sent_start_time.append(end_time.total_seconds())
                    text.append(sent)
                    print(sent)
                    sent = ''
                    print(
                        f"Word: {word}, end_time: {end_time.total_seconds()}")

            print(sent_start_time)
            print(text)

        return text, sent_start_time
Ejemplo n.º 12
0
def speech_to_text(config, audio):
    """Run synchronous recognition and print the recognized sentences."""
    speech_client = speech.SpeechClient()
    recognition_result = speech_client.recognize(config=config, audio=audio)
    print_sentences(recognition_result)


def print_sentences(response):
    """Pretty-print each result's best transcript and its confidence."""
    for result in response.results:
        top = result.alternatives[0]
        print("-" * 60)
        print(f"Transcript: {top.transcript}")
        print(f"Confidence: {top.confidence:.0%}")


# Synchronous recognition config for Polish-language audio stored in GCS.
config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    language_code="pl-PL",
    # sample_rate_hertz=16000,
)

# audio_set = ['adres', 'pesel', 'nrrej']
# audio_set = ['adres']
audio_set = ['nrrej_desktop']
for wav in audio_set:
    audio = {"uri": f"gs://tts1_magro/{wav}"}
    speech_to_text(config, audio)
"""pyscript"""
# https://console.cloud.google.com/welcome?project=text-to-speech-349604