def async_dictate(storage_uri, encoding, sample_rate_hertz, language_code):
    """
    Transcribe long audio file from Cloud Storage using asynchronous
    speech recognition.

    Args:
      storage_uri URI for audio file in Cloud Storage, e.g. gs://[BUCKET]/[FILE]
    """
    client = speech_v1.SpeechClient()

    # storage_uri = 'gs://cloud-samples-data/speech/brooklyn_bridge.raw'
    config = {
        "sample_rate_hertz": sample_rate_hertz,
        "language_code": language_code,
        "encoding": encoding,
    }
    audio = {"uri": storage_uri}

    operation = client.long_running_recognize(config, audio)

    logging.info("Waiting for dictation results...")
    response = operation.result()

    text = []
    for result in response.results:
        # First alternative is the most probable result
        alternative = result.alternatives[0]
        text.append(alternative.transcript)
    logging.info("Dictation successful.")
    return text
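# A minimal usage sketch for async_dictate above. The bucket path, encoding,
# and sample rate are placeholders (assumptions, not taken from the sample);
# it also assumes google-cloud-speech < 2.0, where dict-style configs are
# accepted positionally.
import logging

from google.cloud import speech_v1
from google.cloud.speech_v1 import enums

logging.basicConfig(level=logging.INFO)

lines = async_dictate(
    storage_uri="gs://my-bucket/interview.flac",  # hypothetical object
    encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
    sample_rate_hertz=16000,
    language_code="en-US",
)
print("\n".join(lines))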
def main():
    # See http://g.co/cloud/speech/docs/languages
    # for a list of supported languages.
    # The language code you speak.
    language_code = 'th-TH'  # a BCP-47 language tag

    client = speech.SpeechClient()
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code=language_code)
    streaming_config = types.StreamingRecognitionConfig(
        config=config,
        interim_results=True)

    # Initial loop value
    rounds = 1
    while True:
        try:
            print('streaming loop: ' + str(rounds))
            with MicrophoneStream(RATE, CHUNK) as stream:
                audio_generator = stream.generator()
                # Create request data
                requests = (types.StreamingRecognizeRequest(audio_content=content)
                            for content in audio_generator)
                # POST data to Google Cloud Speech
                responses = client.streaming_recognize(streaming_config, requests)
                # Now, put the transcription responses to use.
                listen_print_loop(responses)
        except Exception as err:
            print(err)
        rounds += 1
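# main() above assumes a MicrophoneStream helper plus RATE and CHUNK constants
# that are not shown. Below is a minimal sketch in the spirit of Google's
# streaming sample, built on pyaudio; treat it as an illustration, not the
# original author's implementation.
import queue

import pyaudio

RATE = 16000
CHUNK = int(RATE / 10)  # 100 ms of audio per buffer


class MicrophoneStream:
    """Opens a recording stream as a generator yielding audio chunks."""

    def __init__(self, rate, chunk):
        self._rate = rate
        self._chunk = chunk
        self._buff = queue.Queue()
        self.closed = True

    def __enter__(self):
        self._audio_interface = pyaudio.PyAudio()
        self._audio_stream = self._audio_interface.open(
            format=pyaudio.paInt16,
            channels=1,
            rate=self._rate,
            input=True,
            frames_per_buffer=self._chunk,
            stream_callback=self._fill_buffer,
        )
        self.closed = False
        return self

    def __exit__(self, type, value, traceback):
        self._audio_stream.stop_stream()
        self._audio_stream.close()
        self.closed = True
        self._buff.put(None)  # unblock the generator
        self._audio_interface.terminate()

    def _fill_buffer(self, in_data, frame_count, time_info, status_flags):
        """Continuously collect data from the audio stream into the buffer."""
        self._buff.put(in_data)
        return None, pyaudio.paContinue

    def generator(self):
        while not self.closed:
            chunk = self._buff.get()
            if chunk is None:
                return
            yield chunk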
def transcribe_file(speech_file):
    """Transcribe the given audio file."""
    from google.cloud import speech_v1
    from google.cloud.speech_v1 import enums
    from google.cloud.speech_v1 import types
    import io

    client = speech_v1.SpeechClient()

    # [START speech_python_migration_sync_request]
    # [START speech_python_migration_config]
    with io.open(speech_file, 'rb') as audio_file:
        content = audio_file.read()

    audio = types.RecognitionAudio(content=content)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code='ja-JP')
    # [END speech_python_migration_config]

    # [START speech_python_migration_sync_response]
    response = client.recognize(config, audio)
    # [END speech_python_migration_sync_request]

    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
    for result in response.results:
        # The first alternative is the most likely one for this portion.
        print(u'Transcript: {}'.format(result.alternatives[0].transcript))
def local_short_recognition(local_file_path):
    """This function transcribes a local short audio file using speech
    recognition.

    Parameters
    ----------
    local_file_path :
        Path to local audio file, e.g. /path/audio.wav.

    Returns
    -------
    """
    set_path()
    client = speech_v1.SpeechClient()

    with io.open(local_file_path, "rb") as f:
        content = f.read()
    audio = {"content": content}

    response = client.recognize(
        {
            "language_code": "pt-BR",
            "sample_rate_hertz": 16000,
            "encoding": enums.RecognitionConfig.AudioEncoding.ENCODING_UNSPECIFIED,
        },
        audio)

    for result in response.results:
        alternative = result.alternatives[0]
        print(u"{}".format(alternative.transcript))
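# local_short_recognition() (and a sample_long_running_recognize further down)
# call a set_path() helper that is not shown; presumably it points
# GOOGLE_APPLICATION_CREDENTIALS at a service-account key. A minimal sketch,
# with the key location as a placeholder:
import os


def set_path():
    # Replace with the actual path to your service-account key file.
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/path/to/service-account.json"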
def transcribe_audio_to_text(local_file_path: str):
    """
    Transcribe a short audio file using Google synchronous speech recognition.
    """
    client = speech_v1.SpeechClient()

    # The language of the supplied audio
    language_code = "en-US"
    encoding = enums.RecognitionConfig.AudioEncoding.FLAC
    config = {
        "language_code": language_code,
        "encoding": encoding,
    }
    with io.open(local_file_path, "rb") as f:
        content = f.read()
    audio = {"content": content}

    response = client.recognize(config, audio)
    # local_file_path[6:-4] strips the 'flacs/' prefix and the 'flac'
    # extension, keeping the trailing dot.
    output_path = 'flacs-transcribed/' + local_file_path[6:-4] + 'txt'
    with open(output_path, 'a') as out:
        for result in response.results:
            # Capture all transcription alternatives that Google created
            for alternative in result.alternatives:
                out.write("Transcript : " + alternative.transcript + " ")
    return
def sample_recognize(language_code, gcs_uri):
    """Transcribe audio file from Google Cloud Storage with word time offsets"""
    # [START speech_transcribe_async_word_time_offsets_gcs_core]

    client = speech_v1.SpeechClient()

    # language_code = 'en-US'
    # gcs_uri = 'gs://cloud-samples-data/speech/brooklyn_bridge.raw'

    if isinstance(language_code, six.binary_type):
        language_code = language_code.decode('utf-8')
    if isinstance(gcs_uri, six.binary_type):
        gcs_uri = gcs_uri.decode('utf-8')
    sample_rate_hertz = 16000
    encoding = enums.RecognitionConfig.AudioEncoding.LINEAR16
    enable_word_time_offsets = True
    config = {
        'sample_rate_hertz': sample_rate_hertz,
        'language_code': language_code,
        'encoding': encoding,
        'enable_word_time_offsets': enable_word_time_offsets
    }
    audio = {'uri': gcs_uri}

    response = client.recognize(config, audio)
    for result in response.results:
        alternative = result.alternatives[0]
        print('Transcript: {}'.format(alternative.transcript))
        for word_info in alternative.words:
            print('Word: {}'.format(word_info.word))
            print('Word start time: {} seconds, {} nanos'.format(
                word_info.start_time.seconds, word_info.start_time.nanos))
            print('Word end time: {} seconds, {} nanos'.format(
                word_info.end_time.seconds, word_info.end_time.nanos))
def long_running_recognize(args):
    """
    Transcribe long audio file from Cloud Storage using asynchronous
    speech recognition.

    Args:
      storage_uri URI for audio file in GCS, e.g. gs://[BUCKET]/[FILE]
    """
    print("Transcribing {} ...".format(args.storage_uri))
    client = speech_v1.SpeechClient()

    # Encoding of audio data sent.
    encoding = enums.RecognitionConfig.AudioEncoding.LINEAR16
    config = {
        "enable_word_time_offsets": True,
        "enable_automatic_punctuation": True,
        "sample_rate_hertz": args.sample_rate_hertz,
        "language_code": args.language_code,
        "encoding": encoding,
    }
    audio = {"uri": args.storage_uri}

    operation = client.long_running_recognize(config, audio)
    response = operation.result()

    subs = []
    for result in response.results:
        # First alternative is the most probable result
        subs = break_sentences(args, subs, result.alternatives[0])

    print("Transcribing finished")
    return subs
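# long_running_recognize() above delegates to a break_sentences() helper that
# is not shown. A minimal sketch of what it might look like, assuming args
# carries a max_chars attribute and using the `srt` package to build subtitle
# entries; the sentence-breaking rules here are illustrative guesses.
import datetime

import srt


def break_sentences(args, subs, alternative):
    """Split one word-timed alternative into subtitle entries."""
    firstword = True
    charcount = 0
    idx = len(subs) + 1
    content = ""
    start = datetime.timedelta()

    for w in alternative.words:
        if firstword:
            # First word of a sentence: record its start time.
            start = datetime.timedelta(
                seconds=w.start_time.seconds,
                microseconds=w.start_time.nanos // 1000)
        charcount += len(w.word)
        content += " " + w.word.strip()
        # Close the sentence at terminal punctuation or when it grows too long.
        if w.word and (w.word[-1] in ".!?" or charcount > args.max_chars):
            end = datetime.timedelta(
                seconds=w.end_time.seconds,
                microseconds=w.end_time.nanos // 1000)
            subs.append(srt.Subtitle(
                index=idx, start=start, end=end,
                content=srt.make_legal_content(content.strip())))
            firstword = True
            idx += 1
            content = ""
            charcount = 0
        else:
            firstword = False
    return subs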
def sample_long_running_recognize(language_code, local_file_path):
    """Transcribe local audio file asynchronously"""
    # [START speech_transcribe_async_core]

    client = speech_v1.SpeechClient()

    # language_code = 'en-US'
    # local_file_path = 'Path to local audio file, e.g. /path/audio.wav'

    if isinstance(language_code, six.binary_type):
        language_code = language_code.decode('utf-8')
    if isinstance(local_file_path, six.binary_type):
        local_file_path = local_file_path.decode('utf-8')
    sample_rate_hertz = 16000
    encoding = enums.RecognitionConfig.AudioEncoding.LINEAR16
    config = {
        'sample_rate_hertz': sample_rate_hertz,
        'language_code': language_code,
        'encoding': encoding
    }
    with io.open(local_file_path, 'rb') as f:
        content = f.read()
    audio = {'content': content}

    operation = client.long_running_recognize(config, audio)
    print('Waiting for operation to complete...')
    response = operation.result()

    for result in response.results:
        print('Transcript: {}'.format(result.alternatives[0].transcript))
def recognize(local_file_path):
    client = speech_v1.SpeechClient()

    # The language of the supplied audio
    language_code = "ko-KR"

    # Sample rate in Hertz of the audio data sent
    sample_rate_hertz = 8000

    # Encoding of audio data sent. This sample sets this explicitly.
    # This field is optional for FLAC and WAV audio formats.
    audio_channel_count = 1
    encoding = enums.RecognitionConfig.AudioEncoding.LINEAR16
    config = {
        "language_code": language_code,
        "sample_rate_hertz": sample_rate_hertz,
        "encoding": encoding,
        "audio_channel_count": audio_channel_count,
    }
    with io.open(local_file_path, "rb") as f:
        content = f.read()
    audio = {"content": content}

    operation = client.long_running_recognize(config, audio)
    print(u"Waiting for operation to complete...")
    response = operation.result()

    for result in response.results:
        alternatives = result.alternatives
        for alternative in alternatives:
            # alternative = result.alternatives[0]
            print(u"Transcript: {}".format(alternative.transcript))
            print(u"Confidence: {}".format(alternative.confidence))
def sample_recognize(storage_uri, model):
    """
    Transcribe a short audio file from Cloud Storage using a specified
    transcription model.

    Args:
      storage_uri URI for audio file in Cloud Storage, e.g. gs://[BUCKET]/[FILE]
      model The transcription model to use, e.g. video, phone_call, default
            For a list of available transcription models, see:
            https://cloud.google.com/speech-to-text/docs/transcription-model#transcription_models
    """
    client = speech_v1.SpeechClient()

    # storage_uri = 'gs://cloud-samples-data/speech/hello.wav'
    # model = 'phone_call'

    # The language of the supplied audio
    language_code = "en-US"
    config = {"model": model, "language_code": language_code}
    audio = {"uri": storage_uri}

    response = client.recognize(config, audio)
    for result in response.results:
        # First alternative is the most probable result
        alternative = result.alternatives[0]
        print(u"Transcript: {}".format(alternative.transcript))
def __init__(self, language, supportedLanguages):
    # self.language = best_match(language, supportedLanguages)[0]
    self.language = "de"
    logging.info("created speech input for language: " + self.language)
    if self.language is None or self.language == "und":
        raise ValueError("Language is not supported")
    self.client = speech_v1.SpeechClient()
def transcriber(self) -> str:
    """
    Transcribe a short audio file using asynchronous speech recognition.

    Args:
      local_file_path Path to local audio file, e.g. /path/audio.wav
    """
    client = speech_v1.SpeechClient()

    language_code = self.language_code

    # Sample rate in Hertz of the audio data sent
    sample_rate_hertz = self.sample_rate_herts

    # Encoding of audio data sent. This sample sets this explicitly.
    # This field is optional for FLAC and WAV audio formats.
    encoding = enums.RecognitionConfig.AudioEncoding.AMR
    config = {
        "language_code": language_code,
        "sample_rate_hertz": sample_rate_hertz,
        "encoding": encoding,
    }
    with io.open(self.audio_file.file_location.path, "rb") as f:
        content = f.read()
    audio = {"content": content}

    # response = client.recognize(config, audio)  # this is for the short files
    operation = client.long_running_recognize(config, audio)
    response = operation.result()
    content = [
        result.alternatives[0].transcript for result in response.results
    ]
    return "".join(content)
def sample_recognize(storage_uri):
    """
    Transcribe short audio file from Cloud Storage using synchronous
    speech recognition.

    Args:
      storage_uri URI for audio file in Cloud Storage, e.g. gs://[BUCKET]/[FILE]
    """
    client = speech_v1.SpeechClient()

    # storage_uri = 'gs://cloud-samples-data/speech/brooklyn_bridge.raw'

    # Sample rate in Hertz of the audio data sent
    sample_rate_hertz = 16000

    # The language of the supplied audio
    language_code = "en-US"

    # Encoding of audio data sent. This sample sets this explicitly.
    # This field is optional for FLAC and WAV audio formats.
    encoding = enums.RecognitionConfig.AudioEncoding.LINEAR16
    config = {
        "sample_rate_hertz": sample_rate_hertz,
        "language_code": language_code,
        "encoding": encoding,
    }
    audio = {"uri": storage_uri}

    response = client.recognize(config, audio)
    for result in response.results:
        # First alternative is the most probable result
        alternative = result.alternatives[0]
        print(u"Transcript: {}".format(alternative.transcript))
def speech_to_text(local_file_path):
    client = speech_v1.SpeechClient()

    language_code = "en-US"
    sample_rate_hertz = 48000
    config = {
        "language_code": language_code,
        "sample_rate_hertz": sample_rate_hertz,
        # "encoding": encoding,
    }
    with io.open(local_file_path, "rb") as f:
        content = f.read()
    audio = {"content": content}

    f = sf.SoundFile(local_file_path)
    aud_len = len(f) / f.samplerate
    if aud_len < 60:
        # Short audio can be sent inline with the synchronous API.
        response = client.recognize(config, audio)
    else:
        # Longer audio must go through Cloud Storage and the async API.
        dest_name = str(uuid.uuid4()) + '.wav'
        upload_blob(bucket_name="patched_video_output",
                    source_file_name=local_file_path,
                    destination_blob_name=dest_name)
        cloud_uri = 'gs://' + 'patched_video_output/' + dest_name
        print(cloud_uri)
        audio = {"uri": cloud_uri}
        operation = client.long_running_recognize(config, audio)
        response = operation.result()

    transcripted_text = []
    for result in response.results:
        alternative = result.alternatives[0]
        transcripted_text.append(alternative.transcript)
        # print(u"Transcript: {}".format(alternative.transcript))
    return transcripted_text
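# speech_to_text() above relies on an upload_blob() helper that is not shown.
# A minimal sketch using the google-cloud-storage client, matching the keyword
# signature used in the call above:
from google.cloud import storage


def upload_blob(bucket_name, source_file_name, destination_blob_name):
    """Upload a local file to the given Cloud Storage bucket."""
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)
    blob.upload_from_filename(source_file_name)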
def sample_long_running_recognize(storage_uri):
    """
    Transcribe long audio file from Cloud Storage using asynchronous
    speech recognition.

    Args:
      storage_uri URI for audio file in Cloud Storage, e.g. gs://[BUCKET]/[FILE]
    """
    client = speech_v1.SpeechClient()

    enable_word_time_offsets = True

    # storage_uri = 'gs://cloud-samples-data/speech/brooklyn_bridge.raw'

    # Sample rate in Hertz of the audio data sent
    # sample_rate_hertz = 16000
    sample_rate_hertz = 48000

    # The language of the supplied audio
    language_code = "en-US"

    # Encoding of audio data sent. This sample sets this explicitly.
    # This field is optional for FLAC and WAV audio formats.
    # encoding = enums.RecognitionConfig.AudioEncoding.LINEAR16
    config = {
        "sample_rate_hertz": sample_rate_hertz,
        "language_code": language_code,
        "enable_word_time_offsets": enable_word_time_offsets,
        # "encoding": encoding,
    }
    audio = {"uri": storage_uri}

    operation = client.long_running_recognize(config, audio)
    print(u"Waiting for operation to complete...")
    response = operation.result()

    response_dict = {
        "transcript": "",
        "word_timestamps": defaultdict(list),
        "video_url": "https://www.youtube.com/watch?v=PqMGmRhKsnM"
    }
    print("response", response.results)
    for result in response.results:
        alternative = result.alternatives[0]
        response_dict["transcript"] += alternative.transcript
        print(u"Transcript: {}".format(alternative.transcript))
        # Print the start and end time of each word
        for word in alternative.words:
            print(u"Word: {}".format(word.word))
            response_dict["word_timestamps"][word.word.lower()].append(
                word.start_time.seconds)
            print(u"Start time: {} seconds {} nanos".format(
                word.start_time.seconds, word.start_time.nanos))
            print(u"End time: {} seconds {} nanos".format(
                word.end_time.seconds, word.end_time.nanos))
    print(json.dumps(response_dict))
def stt(file, channel, hertz, languageCode):
    print('exec stt: ', file)
    client = speech_v1.SpeechClient()
    config = {
        "language_code": languageCode,
        "sample_rate_hertz": hertz,
        "audio_channel_count": channel
    }
    targetDir, targetFile = os.path.split(file)
    gcsURL = f'gs://{bucketName}/{targetFile}'
    audio = {"uri": gcsURL}

    try:
        # Set up the output file
        outputFile = os.path.join(targetDir, 'STT_' + targetFile.replace('.wav', '.txt'))
        with open(outputFile, mode='w') as f:
            # Run speech-to-text
            operation = client.long_running_recognize(config, audio)
            response = operation.result()
            for result in response.results:
                # First alternative is the most probable result
                alternative = result.alternatives[0]
                print(u'Transcript: {}'.format(alternative.transcript))
                f.write('{}\n'.format(alternative.transcript))
        print('done stt: ', file)
        print('#######')
    except Exception as e:
        print('stt() is ERROR:')
        print(e)
        print('#######')
def transcribe(local_file_path):
    """
    Transcribe a short audio file using synchronous speech recognition.

    Args:
      local_file_path Path to local audio file, e.g. /path/audio.wav
    """
    client = speech_v1.SpeechClient()

    # local_file_path = 'resources/brooklyn_bridge.raw'

    # The language of the supplied audio
    language_code = "te-IN"

    # Sample rate in Hertz of the audio data sent
    sample_rate_hertz = 16000

    # Encoding of audio data sent. This sample sets this explicitly.
    # This field is optional for FLAC and WAV audio formats.
    encoding = enums.RecognitionConfig.AudioEncoding.LINEAR16
    config = {
        "language_code": language_code,
        "sample_rate_hertz": sample_rate_hertz,
        "encoding": encoding,
    }
    with io.open(local_file_path, "rb") as f:
        content = f.read()
    audio = {"content": content}

    response = client.recognize(config, audio)
    for result in response.results:
        # First alternative is the most probable result
        alternative = result.alternatives[0]
        return u"{}".format(alternative.transcript)
def sample_long_running_recognize(storage_uri):
    """This function transcribes a cloud long audio file using speech
    recognition.

    Parameters
    ----------
    storage_uri
        URI for audio file in Cloud Storage, e.g. gs://[BUCKET]/[FILE]

    Returns
    -------
    """
    set_path()
    client = speech_v1.SpeechClient()
    audio = {"uri": storage_uri}

    operation = client.long_running_recognize(
        {
            "sample_rate_hertz": 16000,
            "language_code": "pt-BR",
            "encoding": enums.RecognitionConfig.AudioEncoding.ENCODING_UNSPECIFIED,
        },
        audio)

    for result in operation.result().results:
        alternative = result.alternatives[0]
        print(u"{}".format(alternative.transcript))
def sample_long_running_recognize(storage_uri):
    """
    Transcribe long audio file from Cloud Storage using asynchronous
    speech recognition.

    Args:
      storage_uri URI for audio file in Cloud Storage, e.g. gs://[BUCKET]/[FILE]
    """
    client = speech_v1.SpeechClient()

    # storage_uri = 'gs://cloud-samples-data/speech/brooklyn_bridge.raw'

    # Sample rate in Hertz of the audio data sent
    sample_rate_hertz = 44100

    # The language of the supplied audio
    language_code = "ko-KR"

    # Encoding of audio data sent. This sample sets this explicitly.
    # This field is optional for FLAC and WAV audio formats.
    # encoding = enums.RecognitionConfig.AudioEncoding.LINEAR16
    config = {
        "sample_rate_hertz": sample_rate_hertz,
        "language_code": language_code,
        "encoding": 'FLAC',
        "audio_channel_count": 2,
        "enable_word_time_offsets": True,
    }
    audio = {"uri": storage_uri}

    operation = client.long_running_recognize(config, audio)
    print(u"Waiting for operation to complete...")
    response = operation.result()

    f = open("./data_output.txt", mode='wt')
    cf = open("./data_output.csv", 'w')
    wr = csv.writer(cf)
    for result in response.results:
        # First alternative is the most probable result
        alternative = result.alternatives[0]
        print(u"Transcript: {}".format(alternative.transcript))
        for word in alternative.words:
            lst = []
            print(u"Word: {}".format(word.word))
            f.write(word.word)
            f.write(" ")
            lst.append(word.word)
            print(u"Start time: {} seconds {} nanos".format(
                word.start_time.seconds, word.start_time.nanos))
            f.write(str(word.start_time.seconds))
            lst.append(word.start_time.seconds)
            f.write('\n')
            print(u"End time: {} seconds {} nanos".format(
                word.end_time.seconds, word.end_time.nanos))
            wr.writerow(lst)
    f.close()
    cf.close()
def speech2text(audio):
    client = speech.SpeechClient()
    try:
        response = client.recognize(config=config, audio=audio)
        sents = get_sentences(response)
        return sents
    except Exception:
        return 'Not recognized'
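# speech2text() above reads a module-level `config` and a get_sentences()
# helper that are not shown. A plausible sketch, assuming google-cloud-speech
# >= 2.0 (matching the keyword-style recognize call), LINEAR16 mono audio, and
# that get_sentences just collects the top transcript per result:
from google.cloud import speech

# Hypothetical module-level config; the real values depend on the audio source.
config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=16000,
    language_code="en-US",
)


def get_sentences(response):
    # One entry per consecutive portion of the audio.
    return [result.alternatives[0].transcript for result in response.results]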
def sample_recognize(file, config={}):
    if file is None:
        logger.error("please input target audio file on first argument.")
        return None
    elif type(file) not in (str, FileStorage):
        logger.error(
            "please input target filename or FileStorage on first argument.")
        return None

    if config is None:
        logger.error("please set config argument")
        return None
    elif type(config) is not dict:
        logger.error("argument config needs to be dict type.")
        return None
    else:
        if config.get("language_code", None) is None:
            logger.error("please set config.language_code")
            return None
        if config.get("sample_rate_hertz", None) is None:
            logger.error("please set config.sample_rate_hertz")
            return None
        if config.get("max_alternatives", None) is None:
            logger.error("please set config.max_alternatives")
            return None

    client = speech_v1.SpeechClient()

    # Encoding of audio data sent. This sample sets this explicitly.
    # This field is optional for FLAC and WAV audio formats.
    encoding = enums.RecognitionConfig.AudioEncoding.LINEAR16
    cfg = {
        "language_code": config.get("language_code", None),
        "sample_rate_hertz": config.get("sample_rate_hertz", None),
        "encoding": encoding,
        "max_alternatives": config.get("max_alternatives", None),
    }

    audio = None
    if type(file) is str:
        with io.open(file, "rb") as f:
            content = f.read()
        audio = {"content": content}
    elif type(file) is FileStorage:
        content = file.stream.read()
        audio = {"content": content}
        file.stream.close()

    strs = []
    response = client.recognize(cfg, audio)
    for result in response.results:
        # First alternative is the most probable result
        alternative = result.alternatives[0]
        logger.info(u"Transcript: {}".format(alternative.transcript))
        strs.append(alternative.transcript)
    return strs
def sample_recognize(filename):
    """
    Transcribe a short audio file using synchronous speech recognition.

    Args:
      local_file_path Path to local audio file, e.g. /path/audio.wav
    """
    if filename.endswith(".MOV"):  # or .avi, .mpeg, whatever.
        mp3_filename = filename[:-4] + ".mp3"
        flac_filename = filename[:-4] + ".flac"
        monoFlac_filename = "mono" + flac_filename
        # subprocess.call(['ffmpeg', '-i', filename, mp3_filename])
        # subprocess.call(['ffmpeg', '-i', mp3_filename, '-f', 'flac', flac_filename])
        # subprocess.call(['ffmpeg', '-i', flac_filename, '-ac', '1', monoFlac_filename])
        os.system("ffmpeg -i " + filename + " " + mp3_filename + " &> /dev/null")
        os.system("ffmpeg -i " + mp3_filename + " -f flac " + flac_filename + " &> /dev/null")
        os.system("ffmpeg -i " + flac_filename + " -ac 1 " + monoFlac_filename + " &> /dev/null")
    else:
        # Assume the input is already a mono FLAC file; without this the
        # io.open() below would hit an undefined monoFlac_filename.
        monoFlac_filename = filename

    if filename.endswith(".flac") and filename[0:4] == "mono":
        metadata = audio_metadata.load(filename)
        sample_frequency = metadata['streaminfo']['sample_rate']
    else:
        sample_frequency = 44100

    client = speech_v1.SpeechClient()

    # The language of the supplied audio
    language_code = "en-US"

    # Sample rate in Hertz of the audio data sent
    sample_rate_hertz = sample_frequency

    # Encoding of audio data sent. This sample sets this explicitly.
    # This field is optional for FLAC and WAV audio formats.
    # encoding = enums.RecognitionConfig.AudioEncoding.MP3
    config = {
        "language_code": language_code,
        "sample_rate_hertz": sample_rate_hertz,
        "encoding": "FLAC",
    }
    with io.open(monoFlac_filename, "rb") as f:
        content = f.read()
    audio = {"content": content}

    response = client.recognize(config, audio)
    for result in response.results:
        # First alternative is the most probable result
        alternative = result.alternatives[0]
        return "{}".format(alternative.transcript)
def sample_recognize(local_file_path, output_dir, language):
    """
    Transcribe a short audio file using synchronous speech recognition.

    Args:
      local_file_path Path to local audio file, e.g. /path/audio.wav
    """
    filename = local_file_path.split('/')[-1][:-3] + 'txt'
    filepath_text = os.path.join(output_dir, filename)
    # if continue_skip and os.path.exists(filepath_text):
    #     return

    client = speech_v1.SpeechClient()

    # local_file_path = 'resources/brooklyn_bridge.raw'

    # The language of the supplied audio
    language_code = language

    # Sample rate in Hertz of the audio data sent
    sample_rate_hertz = 16000

    # Encoding of audio data sent. This sample sets this explicitly.
    # This field is optional for FLAC and WAV audio formats.
    encoding = enums.RecognitionConfig.AudioEncoding.LINEAR16
    config = {
        "language_code": language_code,
        "sample_rate_hertz": sample_rate_hertz,
        "encoding": encoding,
        "audio_channel_count": 1,
        "enable_word_time_offsets": True,
        "enable_automatic_punctuation": False
    }
    with io.open(local_file_path, "rb") as f:
        content = f.read()
    audio = {"content": content}

    # print("Starting Call")
    response = client.recognize(config, audio)
    # print("Call ended")

    transcript = ""
    if response:
        if response.results and response.results[0]:
            if response.results[0].alternatives and response.results[0].alternatives[0]:
                transcript = response.results[0].alternatives[0].transcript

    with open(filepath_text, 'w+', encoding='utf8') as file:
        file.write(transcript)
def convert_mp3_to_speech():
    latest_audio_file = _get_latest_audio_file()
    print("latest_audio_file: ", latest_audio_file)
    storage_uri = ("gs://us-west3-video-enhancer-bb0ff304-bucket/audio_files/"
                   + latest_audio_file)
    print("storage_uri: ", storage_uri)

    client = speech_v1.SpeechClient()

    enable_word_time_offsets = True
    sample_rate_hertz = 48000
    language_code = "en-US"

    # Encoding of audio data sent. This sample sets this explicitly.
    # This field is optional for FLAC and WAV audio formats.
    # encoding = enums.RecognitionConfig.AudioEncoding.LINEAR16
    config = {
        "sample_rate_hertz": sample_rate_hertz,
        "language_code": language_code,
        "enable_word_time_offsets": enable_word_time_offsets
    }
    audio = {"uri": storage_uri}

    operation = client.long_running_recognize(config, audio)
    print("Waiting for operation to complete...")
    response = operation.result()

    response_dict = {
        "transcript": "",
        "word_timestamps": defaultdict(list),
        "video_url": YOUTUBE_URL
    }
    print("response from SpeechToText", response.results)
    verbose = False
    for result in response.results:
        alternative = result.alternatives[0]
        response_dict["transcript"] += alternative.transcript
        for word in alternative.words:
            response_dict["word_timestamps"][word.word.lower()].append(
                word.start_time.seconds)
            if verbose:
                print(u"Word: {}".format(word.word))
                print(u"Start time: {} seconds {} nanos".format(
                    word.start_time.seconds, word.start_time.nanos))
                print(u"End time: {} seconds {} nanos".format(
                    word.end_time.seconds, word.end_time.nanos))

    json_data_for_search_indexing = json.dumps(response_dict)
    print("json_data_for_search_indexing: ", json_data_for_search_indexing)
    return json_data_for_search_indexing
def sample_long_running_recognize(request=''):
    bucket_name = 'visumm-store'

    # S2T client
    client = speech_v1.SpeechClient()

    # read input arguments
    if request.args and 'input_filename' in request.args:
        input_filename = request.args.get('input_filename')
        print('got input_filename: ', input_filename)
    else:
        print('ERROR: no input_filename was provided. exiting')
        return

    # input filepath as GCS uri
    storage_uri = 'gs://' + bucket_name + '/' + input_filename

    # S2T config
    sample_rate_hertz = 44100
    language_code = "en-US"
    # Encoding of audio data sent. This sample sets this explicitly.
    # This field is optional for FLAC and WAV audio formats.
    encoding = enums.RecognitionConfig.AudioEncoding.FLAC
    config = {
        "sample_rate_hertz": sample_rate_hertz,
        "language_code": language_code,
        "audio_channel_count": 2,
        "encoding": encoding,
    }
    audio = {"uri": storage_uri}

    operation = client.long_running_recognize(config, audio)
    print(u"Waiting for operation to complete...")
    response = operation.result()

    full_transcript = ''
    for result in response.results:
        # First alternative is the most probable result
        alternative = result.alternatives[0]
        full_transcript = full_transcript + alternative.transcript + '\n'
    print('full_transcript: \n', full_transcript)

    # write to file
    local_fpath = '/tmp/full_transcript.txt'
    with open(local_fpath, 'w') as f:
        f.write(full_transcript)

    # output filename to store
    output_filename = input_filename[0:-5]  # remove the .flac extension
    output_filename = output_filename + '.txt'  # add .txt extension
    upload_blob(local_fpath, output_filename)


# sample_long_running_recognize()
def __init__(self):
    from google.cloud import speech_v1
    from google.cloud.speech_v1 import enums
    self.client = speech_v1.SpeechClient()
    self.config = {
        'model': 'video',
        'language_code': 'en-US',
        'sample_rate_hertz': 16000,
        'encoding': enums.RecognitionConfig.AudioEncoding.LINEAR16,
    }
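# A hypothetical companion method for the wrapper class above (not part of the
# original), showing how the stored client and config could drive a
# synchronous transcription call; it would sit on the same class as __init__.
def transcribe(self, local_file_path):
    import io
    with io.open(local_file_path, "rb") as f:
        audio = {"content": f.read()}
    response = self.client.recognize(self.config, audio)
    return " ".join(result.alternatives[0].transcript
                    for result in response.results)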
def sample_recognize(self, local_file_path, filename, path):
    client = speech_v1.SpeechClient()

    language_code = "en-US"
    enable_word_time_offsets = True
    use_enhanced = True
    encoding = enums.RecognitionConfig.AudioEncoding.LINEAR16
    config = {
        "language_code": language_code,
        "encoding": encoding,
        "enable_word_time_offsets": enable_word_time_offsets,
        "use_enhanced": use_enhanced,
    }
    with io.open(local_file_path, "rb") as f:
        content = f.read()
    audio = {"content": content}

    swears = [
        'f**k', 'shit', 'ass', 'bitch', 'w***e', 'dick', 'fuk', "bitches",
        "sexual", "anus", "asshole", 'f*****g'
    ]
    response = client.recognize(config, audio)
    ts = []
    for result in response.results:
        alternative = result.alternatives[0]
        print(u"Transcript: {}".format(alternative.transcript))
        for j in alternative.words:
            for key in swears:
                if key in j.word:
                    start = j.start_time.nanos / (10 ** 9) + j.start_time.seconds
                    end = j.end_time.nanos / (10 ** 9) + j.end_time.seconds
                    print(j.word, start, end)
                    se = [start, end]
                    ts.extend(se)
    print(ts)

    # Build an ffmpeg volume filter expression that mutes each flagged interval.
    bl = ''
    for i in range(0, len(ts), 2):
        bl += 'between(t\,{0}\,{1})+'.format(ts[i], ts[i + 1])
    if len(bl) != 0:
        bl = bl[:-1]
        os.system(
            '''ffmpeg -i {0} -max_muxing_queue_size 1024 -c:v copy -af volume=0:enable='{1}' {2}'''
            .format(path + filename + ".mp4", bl,
                    path + filename + "filtered" + ".mp4"))
    else:
        # Nothing to mute: just copy the file under the filtered name.
        print(path, filename)
        os.chdir("/Users/VAISHNAVI/Desktop/mini/uploadedfiles")
        command = "copy {0} {1}".format(filename + ".mp4",
                                        filename + "filtered" + ".mp4")
        subprocess.call(command, shell=True)
def speechToText(speakerProfileId, storage_uri):
    print(speakerProfileId)
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "key.json"
    client = speech_v1.SpeechClient()

    sample_rate_hertz = 16000
    language_code = "en-US"
    config = {
        "sample_rate_hertz": sample_rate_hertz,
        "language_code": language_code,
    }
    with io.open('/tmp/' + storage_uri, "rb") as f:
        content = f.read()
    audio = {"content": content}

    operation = client.long_running_recognize(config, audio)
    response = operation.result()

    transcript = ""
    for result in response.results:
        # First alternative is the most probable result
        alternative = result.alternatives[0]
        transcript += alternative.transcript

    fireStoreClient = firestore.Client()
    docReference = fireStoreClient.collection("enrolledUsers").document(
        speakerProfileId).get().to_dict()
    userName = docReference["name"]
    docReferenceScore = fireStoreClient.collection(
        'performanceScore').document(str(userName))

    importantWords = [
        "welcome", "thank you", "sorry", "apologise", "apologize", "good day",
        "nice day", "good morning", "good evening", "good noon", "awesome",
        "sweet", "hope", "see you", "bye", "hello", "hi", "please", "sure",
        "sort", "sorted", "enjoy", "safe"
    ]
    wordsSpoken = dict()
    print(transcript)
    for word in importantWords:
        if word in transcript:
            wordsSpoken[word] = transcript.count(word)
    print(wordsSpoken)

    try:
        word_dct = docReferenceScore.get().to_dict()
        for word, freq in wordsSpoken.items():
            if word in word_dct.keys():
                new_freq = word_dct[word]["frequency"] + freq
                docReferenceScore.set({word: {
                    "frequency": new_freq
                }}, merge=True)
            else:
                docReferenceScore.set({word: {"frequency": freq}}, merge=True)
    except Exception:
        # Score document does not exist yet; create it with the initial counts.
        for word, freq in wordsSpoken.items():
            docReferenceScore.set({word: {"frequency": freq}}, merge=True)
def sample_long_running_recognize(storage_path, save_path):
    """
    Print start and end time of each word spoken in audio file from
    Cloud Storage.

    Args:
      storage_path can be URI for audio file in Cloud Storage,
      e.g. gs://[BUCKET]/[FILE], or local file path
    """
    client = speech_v1.SpeechClient()

    # storage_uri = 'gs://cloud-samples-data/speech/brooklyn_bridge.flac'

    # When enabled, the first result returned by the API will include a list
    # of words and the start and end time offsets (timestamps) for those words.
    enable_word_time_offsets = True

    # The language of the supplied audio
    language_code = "en-US"
    encoding = enums.RecognitionConfig.AudioEncoding.FLAC
    config = {
        "enable_word_time_offsets": enable_word_time_offsets,
        "language_code": language_code,
        "audio_channel_count": 2,
        # "sample_rate_hertz": 8000,
        "encoding": encoding,
    }
    if 'gs://' == storage_path[:5]:
        audio = {"uri": storage_path}
    else:
        with io.open(storage_path, "rb") as f:
            content = f.read()
        audio = {"content": content}

    operation = client.long_running_recognize(config, audio)
    print(u"Waiting for operation to complete...")
    response = operation.result()

    name = storage_path.split('/')[-1].split('.')[0]
    with open(save_path + 'google_result_' + name + '.txt', 'w') as f:
        for result in response.results:
            # First alternative is the most probable result
            alternative = result.alternatives[0]
            f.write(u"Transcript: {}".format(alternative.transcript) + '\n')
            # Write the start and end time of each word
            for word in alternative.words:
                f.write(u"Word: {}".format(word.word) + '\n')
                f.write(u"Start time: {} seconds {} nanos".format(
                    word.start_time.seconds, word.start_time.nanos) + '\n')
                f.write(u"End time: {} seconds {} nanos".format(
                    word.end_time.seconds, word.end_time.nanos) + '\n')
    print('analysis finished')
def __init__(self):
    self.client = speech_v1.SpeechClient()
    self.uploader = Uploader()
    self.base_config = {
        "encoding": speech_v1.enums.RecognitionConfig.AudioEncoding.LINEAR16,
        "enable_word_time_offsets": True,
        "profanity_filter": False,
        "model": "default",
        "enable_automatic_punctuation": True
    }