class Translator:
    """Transcribes audio stored on Google Cloud Storage with the Speech API.

    NOTE(review): despite the name, this class performs speech-to-text
    transcription, not language translation.
    """

    def __init__(self):
        # Speech client uses application-default credentials.
        self.client = SpeechClient()
        storage_client = storage.Client()
        self.bucket_name = 'cross-culture-audios'
        self.bucket = storage_client.get_bucket(self.bucket_name)

    def translate_long(self, gs_uri):
        """Run a long-running recognition job and return the transcript.

        Args:
            gs_uri: 'gs://bucket/path' URI of a FLAC file at 44.1 kHz.

        Returns:
            str: all recognized segments, stripped and joined by newlines.
        """
        audio = types.RecognitionAudio(uri=gs_uri)
        config = types.RecognitionConfig(
            encoding='FLAC',
            language_code='en-US',
            sample_rate_hertz=44100,
        )
        operation = self.client.long_running_recognize(config=config, audio=audio)
        op_result = operation.result()
        # Keep only segments that produced at least one alternative.
        return '\n'.join(
            segment.alternatives[0].transcript.strip()
            for segment in op_result.results
            if segment.alternatives
        )

    def translate_with_timestamps(self, gs_uri):
        """Transcribe with per-word timestamps.

        Args:
            gs_uri: 'gs://bucket/path' URI of a FLAC file.

        Returns:
            list: [word, start_seconds, end_seconds] triples; times are
            rounded to one decimal place.
        """
        audio = types.RecognitionAudio(uri=gs_uri)
        config = types.RecognitionConfig(
            encoding='FLAC',
            language_code='en-US',
            # sample_rate_hertz=44100,
            enable_word_time_offsets=True)
        operation = self.client.long_running_recognize(config=config, audio=audio)

        results = []
        for result in operation.result().results:
            if not result.alternatives:
                continue
            alternative = result.alternatives[0]
            for word_info in alternative.words:
                # Combine whole seconds with nanos rounded to 0.1 s.
                start_time = word_info.start_time.seconds + round(
                    word_info.start_time.nanos * 1e-9, 1)
                end_time = word_info.end_time.seconds + round(
                    word_info.end_time.nanos * 1e-9, 1)
                results.append([word_info.word, start_time, end_time])
        return results

    def upload_to_gcs(self, filepath):
        """Upload a local file under 'audios/' in the bucket; return its gs:// URI."""
        filename = ntpath.basename(filepath)
        gs_filepath = 'audios/%s' % filename
        blob = self.bucket.blob(gs_filepath)
        blob.upload_from_filename(filepath)
        return self.generate_uri(gs_filepath)

    def delete_from_gcs(self, filename):
        """Delete 'audios/<filename>' from the bucket."""
        gs_filepath = 'audios/%s' % filename
        self.bucket.delete_blob(gs_filepath)

    def generate_uri(self, filepath):
        """Return the gs:// URI for a path inside this bucket."""
        return 'gs://%s/%s' % (self.bucket_name, filepath)
def recognize_audio_from_file(
        file: Union[str, os.PathLike],
        credential: Union[str, os.PathLike, None] = None,
        language_code: str = 'en-US',
        encoding: enums.RecognitionConfig.AudioEncoding = enums.RecognitionConfig.AudioEncoding.FLAC,
        sampling_rate_hertz: int = 44100,
) -> types.RecognizeResponse:
    """Synchronously recognize speech from a local audio file.

    Args:
        file (str, os.PathLike) : path of the audio file to read.
        credential (str) : optional path to a service-account JSON file;
            None uses application-default credentials.
        language_code (str) : BCP-47 language tag, e.g. 'en-US'.
        encoding (str) : audio encoding of the file.
        sampling_rate_hertz (int) : sample rate of the audio in hertz.

    Returns:
        types.RecognizeResponse
    """
    if credential is None:
        client = SpeechClient()
    else:
        credentials = Credentials.from_service_account_file(
            filename=credential)
        client = SpeechClient(credentials=credentials)
    # BUGFIX: the RecognitionConfig field is 'sample_rate_hertz', not
    # 'sampling_rate_hertz' — the old keyword was rejected by the proto
    # wrapper (compare recognize_audio_from_uri, which uses the correct name).
    config = types.RecognitionConfig(encoding=encoding,
                                     language_code=language_code,
                                     sample_rate_hertz=sampling_rate_hertz)
    with io.open(file, 'rb') as audio_file:
        content = audio_file.read()
    audio = types.RecognitionAudio(content=content)
    return client.recognize(config, audio)
def __init__(self, credential: Union[str, os.PathLike, None] = None):
    """Create the Speech client, optionally from a service-account file.

    Args:
        credential (str, os.PathLike, None) : path to a service-account
            JSON file; None selects application-default credentials.
    """
    if credential is not None:
        creds = Credentials.from_service_account_file(filename=credential)
        self.client = SpeechClient(credentials=creds)
    else:
        self.client = SpeechClient()
def onStart(self):
    """Build the Google streaming-recognition client and its config."""
    super().onStart()
    # The Google client reads credentials from this env var at construction.
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = str(self._credentialsFile)
    self._client = SpeechClient()
    # noinspection PyUnresolvedReferences
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=self.AudioServer.SAMPLERATE,
        language_code=self.LanguageManager.getLanguageAndCountryCode()
    )
    # interim_results=True streams partial transcripts while the user speaks.
    self._streamingConfig = types.StreamingRecognitionConfig(config=config, interim_results=True)
def transcribe_gcs(self, gcs_uri):
    """Asynchronously transcribes the audio file specified by the gcs_uri.

    args:
        gcs_uri - URI with format 'gs://<bucket>/<path_to_audio>'
    returns:
        trans - a list of transcribed sections; each is a dict with
        'text', 'confidence' and a 'words' list of per-word timestamps
    """
    printmsg.begin('Initiating Google Cloud Speech operation')
    client = SpeechClient()
    audio = types.RecognitionAudio(uri=gcs_uri)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=44100,
        language_code='en-GB',
        enable_word_time_offsets=True)
    operation = client.long_running_recognize(config, audio)
    printmsg.end()
    # BUGFIX: this string is not %-formatted, so '%%' printed literally;
    # a single '%' is intended (the loop message below IS %-formatted).
    printmsg.begin('Waiting for operation to complete [0%]')
    while not operation.done():
        time.sleep(1)
        printmsg.begin('Waiting for operation to complete [%s%%]'
                       % operation.metadata.progress_percent)
    response = operation.result(timeout=10)
    printmsg.end()

    def get_ts(duration):
        # Split a protobuf Duration into minute/second/millisecond parts.
        return dict(min=duration.seconds // 60,
                    sec=duration.seconds % 60,
                    msec=duration.nanos // (10 ** 6))

    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
    trans = []
    for result in response.results:
        # The first alternative is the most likely one for this portion.
        best = result.alternatives[0]
        seg = dict(text=best.transcript, confidence=best.confidence, words=[])
        # loop the words (word_info also carries end_time; only the start
        # timestamp is recorded here, as before)
        for word_info in best.words:
            word_obj = dict(word=word_info.word,
                            tstamp=get_ts(word_info.start_time))
            seg['words'].append(word_obj)
        trans.append(seg)
    return trans
def onStart(self):
    """Build the Google streaming-recognition client and its config."""
    super().onStart()
    # The Google client reads credentials from this env var at construction.
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = str(
        Path(self.Commons.rootDir(), 'credentials/googlecredentials.json'))
    self._client = SpeechClient()
    # noinspection PyUnresolvedReferences
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=self.ConfigManager.getAliceConfigByName(
            'micSampleRate'),
        language_code=self.LanguageManager.activeLanguageAndCountryCode)
    # interim_results=True streams partial transcripts while the user speaks.
    self._streamingConfig = types.StreamingRecognitionConfig(
        config=config, interim_results=True)
def VoiceRecognition(b_voice_data):
    """Recognize speech in a Telegram voice message (OGG_OPUS, 16 kHz).

    Args:
        b_voice_data: raw audio bytes of the voice message.

    Returns:
        The transcript string, 'NDVR' when the API returned no results,
        or False on any client/API error.
    """
    print("VR: initialized")
    try:
        client = SpeechClient()
        print("VR: preparing recognition request")
        audio = types.RecognitionAudio(content=b_voice_data)
        config = types.RecognitionConfig(
            # setup default Telegram format
            encoding=enums.RecognitionConfig.AudioEncoding.OGG_OPUS,
            sample_rate_hertz=16000,
            language_code='en-US',
            max_alternatives=0)
        # Recognize speech content
        print("VR: call for Google Speech API")
        try:
            response = client.recognize(config, audio)
            print("VR: GCS API call finished")
            print(response)
            if response.results:
                # BUGFIX: the old loop returned inside its first iteration and
                # silently dropped every later segment; join them all instead.
                return ' '.join(result.alternatives[0].transcript
                                for result in response.results)
            print("VR: GCS API returned NULL")
            return "NDVR"
        except Exception as apiClientExpt:
            print(
                "VR: FATAL ERROR: unhandled exception when calling recognize API"
            )
            print(apiClientExpt)
            return False
    except Exception as speechClientExpt:
        print(
            "VR: FATAL ERROR: unhandled exception when initializing SpeechClient"
        )
        print(speechClientExpt)
        return False
def __init__(self):
    """Configure a Google Cloud streaming STT backend.

    google-cloud-speech is imported lazily (and published via ``global``)
    so the dependency is only required when this backend is selected.
    """
    global SpeechClient, types, enums, Credentials
    from google.cloud.speech import SpeechClient, types, enums
    from google.oauth2.service_account import Credentials
    super(GoogleCloudStreamingSTT, self).__init__()
    # override language with module specific language selection
    self.language = self.config.get('lang') or self.lang
    # Credentials are supplied as an in-memory JSON dict in the config,
    # not as a file on disk.
    credentials = Credentials.from_service_account_info(
        self.credential.get('json'))
    self.client = SpeechClient(credentials=credentials)
    recognition_config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code=self.language,
        model='command_and_search',
        max_alternatives=1,
    )
    # single_utterance=True ends the stream after the first utterance;
    # interim_results=True yields partial transcripts along the way.
    self.streaming_config = types.StreamingRecognitionConfig(
        config=recognition_config,
        interim_results=True,
        single_utterance=True,
    )
def proof_of_concept():
    """Recognize a sample FLAC file from GCS and print the raw response."""
    # BUGFIX: 'en-UK' is not a valid BCP-47 language tag; the Speech API's
    # code for British English is 'en-GB'.
    config = RecognitionConfig(encoding=RecognitionConfig.AudioEncoding.FLAC,
                               language_code="en-GB",
                               audio_channel_count=2)
    audio = RecognitionAudio(uri='gs://general-rodderscode-co-uk/test.flac')
    response = SpeechClient().recognize(config=config, audio=audio)
    print(response)
def __transcribe_chunk(self, async_iter):
    """Send one audio chunk to Google Cloud Speech and return its response.

    ``async_iter`` is a (frame_rate, encoding, file_path) triple; the chunk
    actually sent is the '-accuracy' variant of ``file_path``.
    """
    frame_rate, encoding, file_path = async_iter
    chunk_path = append_before_ext(file_path, '-accuracy')
    with open(chunk_path, 'rb') as chunk_file:
        raw_bytes = chunk_file.read()
    recognition_audio = types.RecognitionAudio(content=raw_bytes)
    recognition_config = self.__get_config(encoding, frame_rate)
    return SpeechClient().recognize(recognition_config, recognition_audio)
def get_raw(file_name: str, client: speech.SpeechClient) -> str:
    """
    Get the raw Speech to text result from Google Cloud API
    :param file_name: File name + path
    :param client: Google Cloud API Speech client
    :return: str JSON encoded response
    """
    recognition_config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=44100,
        language_code="de-DE",
        enable_word_time_offsets=True)
    recognition_audio = types.RecognitionAudio(uri=file_name)
    # Long-running job; block up to 15 minutes for completion.
    op = client.long_running_recognize(recognition_config, recognition_audio)
    completed = op.result(timeout=900)
    return MessageToJson(completed)
def recognize_stream(bytestream: Generator[ByteString, None, None],
                     client: speech.SpeechClient,
                     recognition_config: types.RecognitionConfig,
                     q: Queue):
    """Streams transcription of the given audio file."""
    requests = (types.StreamingRecognizeRequest(audio_content=chunk)
                for chunk in bytestream)
    responses = client.streaming_recognize(
        get_streaming_recognition_config(recognition_config), requests)
    # Iterate the response stream directly; exhaustion replaces the old
    # explicit next()/StopIteration loop.
    for resp in responses:
        logger.debug("reading next response; resp.results is {}".format(
            resp.results))
        if not resp.results:
            continue
        final = [x for x in resp.results if x.is_final]
        if final:
            # Only the best alternative of the first final result is queued.
            q.put(final[0].alternatives[0])
    logger.info("no more responses!")
    logger.info("exit from recognize_stream!")
    return
def recognize_audio_from_uri(
        uri: str,
        credential: Union[str, os.PathLike, None] = None,
        language_code: str = 'en-US',
        encoding: enums.RecognitionConfig.AudioEncoding = enums.RecognitionConfig.AudioEncoding.FLAC,
        sampling_rate_hertz: int = 44100,
) -> types.RecognizeResponse:
    """Recognize speech from audio at a Cloud Storage URI.

    Falls back to a long-running (asynchronous) job when the audio is too
    long for the synchronous endpoint.

    Args:
        uri (str) : Cloud Storage 'gs://...' URI of the audio.
        credential (str, os.PathLike, None) : optional service-account file.
        language_code: BCP-47 language tag.
        encoding (enums.RecognitionConfig.AudioEncoding) : audio encoding.
        sampling_rate_hertz (int) : sample rate of the audio in hertz.

    Returns:
        types.RecognizeResponse
    """
    if credential is None:
        client = SpeechClient()
    else:
        credentials = Credentials.from_service_account_file(
            filename=credential)
        client = SpeechClient(credentials=credentials)
    config = types.RecognitionConfig(encoding=encoding,
                                     language_code=language_code,
                                     sample_rate_hertz=sampling_rate_hertz)
    audio = types.RecognitionAudio(uri=uri)
    try:
        result = client.recognize(config=config, audio=audio)
    except exceptions.InvalidArgument:
        # Audio too long/invalid for the synchronous endpoint.
        print('cannot run synchronous recognition; '
              'switching to asynchronous recognition')
        operation = client.long_running_recognize(config=config, audio=audio)
        result = operation.result()
    return result
from flask import Flask, request, render_template
from google.cloud.speech import enums, types, SpeechClient
import json
import os
import traceback
from parse_command import parse_command

# Flask app that accepts recorded audio and converts it into a voice command.
app = Flask(__name__)
# Fall back to a local 'key.json' when no credentials are already configured.
os.environ.setdefault('GOOGLE_APPLICATION_CREDENTIALS', 'key.json')
client = SpeechClient()


@app.route('/upload', methods=['POST'])
def upload():
    """Recognize the posted 'audio_data' file (LINEAR16, 2 channels, en-US)."""
    raw_audio = request.files['audio_data']
    user_agent = request.headers.get('User-Agent')
    content = raw_audio.read()
    audio = types.RecognitionAudio(content=content)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        audio_channel_count=2,
        language_code='en-US',
    )
    response = client.recognize(config, audio)
    print(response)
    # NOTE(review): the loop keeps only the last result's transcript, and no
    # HTTP response is returned here — the handler presumably continues past
    # this excerpt (e.g. calling parse_command); confirm against the full file.
    for result in response.results:
        voice_command = result.alternatives[0].transcript
from google.cloud import firestore
from google.cloud.vision import ImageAnnotatorClient
from google.cloud.speech import SpeechClient, RecognitionAudio, RecognitionConfig
from google.cloud import language
from google.cloud.language import enums
from google.cloud.language import types

# Every Google client below is built from the same service-account file.
service_account_name = './service_account.json'
db = firestore.Client.from_service_account_json(service_account_name)
vision_client = ImageAnnotatorClient.from_service_account_json(service_account_name)
speech_client = SpeechClient.from_service_account_json(service_account_name)
language_client = language.LanguageServiceClient.from_service_account_json(service_account_name)


def max_window():
    # Maximum window size — units (seconds? items?) are not evident here;
    # TODO confirm against callers.
    return 60


def database():
    """Return the shared Firestore client."""
    return db


def vision():
    """Return the shared Vision client."""
    return vision_client


def speech():
    """Return the Speech client plus the request types callers need."""
    return speech_client, RecognitionAudio, RecognitionConfig


def language(text):
    """Return the Language client and a PLAIN_TEXT document for ``text``."""
    # NOTE(review): this def shadows the imported 'language' module; it is
    # only safe because language_client was bound above, before the shadowing.
    document = types.Document(content = text, type = enums.Document.Type.PLAIN_TEXT)
    return language_client, document
class SpeechToText:
    """Thin wrapper around the Google Cloud Speech synchronous API."""

    def __init__(self, credential: Union[str, os.PathLike, None] = None):
        """
        Args:
            credential (str, os.PathLike, None) : path to a service-account
                JSON file; None uses application-default credentials.
        """
        if credential is None:
            self.client = SpeechClient()
        else:
            credentials = Credentials.from_service_account_file(
                filename=credential)
            self.client = SpeechClient(credentials=credentials)

    def recognize_from_uri(
            self,
            uri: str,
            encoding: enums.RecognitionConfig.AudioEncoding = enums.
            RecognitionConfig.AudioEncoding.FLAC,
            language_code: str = 'en-US',
            sampling_rate_hertz: int = 44100) -> types.RecognizeResponse:
        """Recognize speech from audio at a Cloud Storage URI.

        Args:
            uri (str) : 'gs://bucket/object' URI of the audio.
            encoding (enums.RecognitionConfig.AudioEncoding) : audio encoding.
            language_code (str) : BCP-47 language tag.
            sampling_rate_hertz (int) : sample rate of the audio in hertz.

        Returns:
            types.RecognizeResponse
        """
        # BUGFIX: the RecognitionConfig field is 'sample_rate_hertz';
        # the previous 'sampling_rate_hertz' keyword was invalid.
        config = types.RecognitionConfig(
            encoding=encoding,
            language_code=language_code,
            sample_rate_hertz=sampling_rate_hertz)
        audio = types.RecognitionAudio(uri=uri)
        return self.client.recognize(config, audio)

    def recognize_from_file(
            self,
            file: Union[str, os.PathLike],
            encoding: enums.RecognitionConfig.AudioEncoding = enums.
            RecognitionConfig.AudioEncoding.FLAC,
            language_code: str = 'en-US',
            sampling_rate_hertz: int = 44100) -> types.RecognizeResponse:
        """Recognize speech from a local audio file.

        Args:
            file (str, os.PathLike) : path of the audio file to read.
            encoding (enums.RecognitionConfig.AudioEncoding) : audio encoding.
            language_code (str) : BCP-47 language tag.
            sampling_rate_hertz (int) : sample rate of the audio in hertz.

        Returns:
            types.RecognizeResponse
        """
        # BUGFIX: same keyword fix as recognize_from_uri.
        config = types.RecognitionConfig(
            encoding=encoding,
            language_code=language_code,
            sample_rate_hertz=sampling_rate_hertz)
        with io.open(file, 'rb') as audio_file:
            content = audio_file.read()
        audio = types.RecognitionAudio(content=content)
        return self.client.recognize(config, audio)
def __init__(self):
    """Create the Speech client and open the shared audio bucket."""
    self.bucket_name = 'cross-culture-audios'
    self.client = SpeechClient()
    gcs = storage.Client()
    self.bucket = gcs.get_bucket(self.bucket_name)
class GoogleAsr(Asr):
    """Streaming speech recognition through the Google Cloud Speech API."""

    NAME = 'Google Asr'
    DEPENDENCIES = {
        'system': [],
        'pip': {
            'google-cloud-speech==1.3.1'
        }
    }

    def __init__(self):
        super().__init__()
        self._credentialsFile = Path(self.Commons.rootDir(), 'credentials/googlecredentials.json')
        self._capableOfArbitraryCapture = True
        self._isOnlineASR = True
        self._client: Optional[SpeechClient] = None
        self._streamingConfig: Optional[types.StreamingRecognitionConfig] = None

        # Mirror the on-disk credentials into Alice's config the first time.
        if self._credentialsFile.exists() and not self.ConfigManager.getAliceConfigByName('googleASRCredentials'):
            self.ConfigManager.updateAliceConfiguration(key='googleASRCredentials', value=self._credentialsFile.read_text(), doPreAndPostProcessing=False)

        self._internetLostFlag = Event()  # Set if internet goes down, cut the decoding
        self._lastResultCheck = 0  # The time the intermediate results were last checked. If actual time is greater than this value + 3, stop processing, internet issues
        self._previousCapture = ''  # The text that was last captured in the iteration

    def onStart(self):
        """Build the streaming client and recognition config."""
        super().onStart()
        os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = str(self._credentialsFile)
        self._client = SpeechClient()
        # noinspection PyUnresolvedReferences
        config = types.RecognitionConfig(
            encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=self.AudioServer.SAMPLERATE,
            language_code=self.LanguageManager.getLanguageAndCountryCode()
        )
        self._streamingConfig = types.StreamingRecognitionConfig(config=config, interim_results=True)

    def decodeStream(self, session: DialogSession) -> Optional[ASRResult]:
        """Record the session's audio stream and return the decoded text."""
        super().decodeStream(session)

        recorder = Recorder(self._timeout, session.user, session.deviceUid)
        self.ASRManager.addRecorder(session.deviceUid, recorder)
        self._recorder = recorder
        result = None
        with Stopwatch() as processingTime:
            with recorder as stream:
                audioStream = stream.audioStream()
                # noinspection PyUnresolvedReferences
                try:
                    requests = (types.StreamingRecognizeRequest(audio_content=content) for content in audioStream)
                    responses = self._client.streaming_recognize(self._streamingConfig, requests)
                    result = self._checkResponses(session, responses)
                except Exception as e:
                    self._internetLostFlag.clear()
                    self.logWarning(f'Failed ASR request: {e}')

        self.end()

        return ASRResult(
            text=result[0],
            session=session,
            likelihood=result[1],
            processingTime=processingTime.time
        ) if result else None

    def onInternetLost(self):
        self._internetLostFlag.set()

    def _checkResponses(self, session: DialogSession, responses: Generator) -> Optional[tuple]:
        """Consume streaming responses; return (transcript, confidence) or None."""
        if responses is None:
            return None

        for response in responses:
            if self._internetLostFlag.is_set():
                self.logDebug('Internet connectivity lost during ASR decoding')

                if not response.results:
                    raise Exception('Internet connectivity lost during decoding')

                result = response.results[0]
                return result.alternatives[0].transcript, result.alternatives[0].confidence

            if not response.results:
                continue

            result = response.results[0]
            if not result.alternatives:
                continue

            if result.is_final:
                return result.alternatives[0].transcript, result.alternatives[0].confidence
            elif result.alternatives[0].transcript != self._previousCapture:
                self.partialTextCaptured(session=session, text=result.alternatives[0].transcript, likelihood=result.alternatives[0].confidence, seconds=0)
                self._previousCapture = result.alternatives[0].transcript
            elif result.alternatives[0].transcript == self._previousCapture:
                now = int(time())
                if self._lastResultCheck == 0:
                    # BUGFIX: this used to re-assign 0, so the 3-second grace
                    # window never started and decoding aborted on the very
                    # next identical partial; record the current time instead.
                    self._lastResultCheck = now
                    continue

                if now > self._lastResultCheck + 3:
                    self.logDebug('Stopping process as there seems to be connectivity issues')
                    return result.alternatives[0].transcript, result.alternatives[0].confidence

                self._lastResultCheck = now

        return None
help="connect to unity", default=False) parser.add_argument("--lang_code", type=str, help="the language code of your language", default="zh-tw") args = parser.parse_args() if args.connect: address = ('127.0.0.1', 5067) sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) sock.connect(address) else: sock = None client = SpeechClient() config = types.RecognitionConfig( encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16, sample_rate_hertz=RATE, language_code=args.lang_code) streaming_config = types.StreamingRecognitionConfig(config=config, interim_results=True) print("%s recognition started!" % args.lang_code) while True: with MicrophoneStream(RATE, CHUNK) as stream: audio_generator = stream.generator() requests = (types.StreamingRecognizeRequest(audio_content=content) for content in audio_generator) try: responses = client.streaming_recognize(streaming_config,
class GoogleAsr(Asr):
    """Streaming speech recognition through the Google Cloud Speech API."""

    NAME = 'Google Asr'
    DEPENDENCIES = {'system': [], 'pip': {'google-cloud-speech==1.3.1'}}

    def __init__(self):
        super().__init__()
        self._capableOfArbitraryCapture = True
        self._isOnlineASR = True
        self._client: Optional[SpeechClient] = None
        self._streamingConfig: Optional[
            types.StreamingRecognitionConfig] = None
        self._previousCapture = ''  # last partial transcript already forwarded

    def onStart(self):
        """Build the streaming client and recognition config."""
        super().onStart()
        os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = str(
            Path(self.Commons.rootDir(),
                 'credentials/googlecredentials.json'))
        self._client = SpeechClient()
        # noinspection PyUnresolvedReferences
        config = types.RecognitionConfig(
            encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=self.AudioServer.SAMPLERATE,
            language_code=self.LanguageManager.getLanguageAndCountryCode())
        self._streamingConfig = types.StreamingRecognitionConfig(
            config=config, interim_results=True)

    def decodeStream(self, session: DialogSession) -> Optional[ASRResult]:
        """Record the session's audio stream and return the decoded text."""
        super().decodeStream(session)

        recorder = Recorder(self._timeout, session.user, session.siteId)
        self.ASRManager.addRecorder(session.siteId, recorder)
        self._recorder = recorder
        # BUGFIX: 'result' must exist even when the request fails, otherwise
        # the return expression below raised NameError after an ASR error.
        result = None
        with Stopwatch() as processingTime:
            with recorder as stream:
                audioStream = stream.audioStream()
                # noinspection PyUnresolvedReferences
                try:
                    requests = (types.StreamingRecognizeRequest(
                        audio_content=content) for content in audioStream)
                    responses = self._client.streaming_recognize(
                        self._streamingConfig, requests)
                    result = self._checkResponses(session, responses)
                except Exception:
                    # Narrowed from a bare 'except:' so KeyboardInterrupt and
                    # SystemExit are no longer swallowed.
                    self.logWarning('Failed ASR request')

        self.end()

        return ASRResult(
            text=result[0],
            session=session,
            likelihood=result[1],
            processingTime=processingTime.time) if result else None

    def _checkResponses(self, session: DialogSession,
                        responses: Generator) -> Optional[tuple]:
        """Consume streaming responses; return (transcript, confidence) or None."""
        if responses is None:
            return None

        for response in responses:
            if not response.results:
                continue

            result = response.results[0]
            if not result.alternatives:
                continue

            if result.is_final:
                return result.alternatives[0].transcript, result.alternatives[
                    0].confidence
            elif result.alternatives[0].transcript != self._previousCapture:
                self.partialTextCaptured(
                    session=session,
                    text=result.alternatives[0].transcript,
                    likelihood=result.alternatives[0].confidence,
                    seconds=0)
                self._previousCapture = result.alternatives[0].transcript

        return None
class GoogleAsr(Asr):
    """Streaming ASR backend using the Google Cloud Speech API, with a
    grace window for slow/unstable internet connections."""

    NAME = 'Google Asr'
    DEPENDENCIES = {'system': [], 'pip': {'google-cloud-speech==1.3.1'}}

    def __init__(self):
        super().__init__()
        self._capableOfArbitraryCapture = True
        self._isOnlineASR = True
        self._client: Optional[SpeechClient] = None
        self._streamingConfig: Optional[
            types.StreamingRecognitionConfig] = None
        self._internetLostFlag = Event(
        )  # Set if internet goes down, cut the decoding
        self._lastResultCheck = 0  # The time the intermediate results were last checked. If actual time is greater than this value + 3, stop processing, internet issues
        self._previousCapture = ''  # The text that was last captured in the iteration
        self._delayedGoogleConfirmation = False  # set whether slow internet is detected or not

    def onStart(self):
        """Build the streaming client and recognition config."""
        super().onStart()
        # The Google client reads credentials from this env var at construction.
        os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = str(
            Path(self.Commons.rootDir(),
                 'credentials/googlecredentials.json'))
        self._client = SpeechClient()
        # noinspection PyUnresolvedReferences
        config = types.RecognitionConfig(
            encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=self.AudioServer.SAMPLERATE,
            language_code=self.LanguageManager.getLanguageAndCountryCode())
        # interim_results=True streams partial transcripts while the user speaks.
        self._streamingConfig = types.StreamingRecognitionConfig(
            config=config, interim_results=True)

    def decodeStream(self, session: DialogSession) -> Optional[ASRResult]:
        """Record the session's audio stream and return the decoded text."""
        super().decodeStream(session)

        recorder = Recorder(self._timeout, session.user, session.siteId)
        self.ASRManager.addRecorder(session.siteId, recorder)
        self._recorder = recorder
        result = None
        with Stopwatch() as processingTime:
            with recorder as stream:
                audioStream = stream.audioStream()
                # noinspection PyUnresolvedReferences
                try:
                    requests = (types.StreamingRecognizeRequest(
                        audio_content=content) for content in audioStream)
                    responses = self._client.streaming_recognize(
                        self._streamingConfig, requests)
                    result = self._checkResponses(session, responses)
                except:
                    # NOTE(review): bare except also swallows KeyboardInterrupt;
                    # consider 'except Exception'.
                    self._internetLostFlag.clear()
                    self.logWarning('Failed ASR request')

        self.end()

        return ASRResult(
            text=result[0],
            session=session,
            likelihood=result[1],
            processingTime=processingTime.time) if result else None

    def onInternetLost(self):
        # Signals _checkResponses to abort the decoding loop.
        self._internetLostFlag.set()

    def _checkResponses(self, session: DialogSession,
                        responses: Generator) -> Optional[tuple]:
        """Consume streaming responses; return (transcript, confidence) or None."""
        if responses is None:
            return None

        for response in responses:
            if self._internetLostFlag.is_set():
                self.logDebug('Internet connectivity lost during ASR decoding')

                if not response.results:
                    raise Exception(
                        'Internet connectivity lost during decoding')

                # Best effort: return whatever was decoded before the loss.
                result = response.results[0]
                return result.alternatives[0].transcript, result.alternatives[
                    0].confidence

            if not response.results:
                continue

            result = response.results[0]
            if not result.alternatives:
                continue

            if result.is_final:
                # Confirmed transcript: reset the slow-internet bookkeeping.
                self._lastResultCheck = 0
                self._delayedGoogleConfirmation = False
                # print(f'Text confirmed by Google')
                return result.alternatives[0].transcript, result.alternatives[
                    0].confidence
            elif result.alternatives[0].transcript != self._previousCapture:
                self.partialTextCaptured(
                    session=session,
                    text=result.alternatives[0].transcript,
                    likelihood=result.alternatives[0].confidence,
                    seconds=0)
                # below function captures the "potential" full utterance not just one word from it
                if len(self._previousCapture) <= len(
                        result.alternatives[0].transcript):
                    self._previousCapture = result.alternatives[0].transcript
            elif result.alternatives[0].transcript == self._previousCapture:
                # If we are here it's cause google hasn't responded yet with confirmation on captured text
                # Store the time in seconds since epoch
                now = int(time())
                # Set a reference to nows time plus 3 seconds
                self._lastResultCheck = now + 3
                # wait 3 seconds and see if google responds
                if not self._delayedGoogleConfirmation:
                    # print(f'Text of "{self._previousCapture}" captured but not confirmed by GoogleASR yet')
                    # NOTE(review): this inner loop busy-waits (full CPU) for up
                    # to 3 s; a time.sleep would be gentler — confirm intent
                    # before changing.
                    while now <= self._lastResultCheck:
                        now = int(time())
                    self._delayedGoogleConfirmation = True
                    # Give google the option to still process the utterance
                    continue
                # During next iteration, If google hasn't responded in 3 seconds assume intent is correct
                if self._delayedGoogleConfirmation:
                    self.logDebug(
                        f'Stopping process as there seems to be connectivity issues'
                    )
                    self._lastResultCheck = 0
                    self._delayedGoogleConfirmation = False
                    return result.alternatives[
                        0].transcript, result.alternatives[0].confidence

        return None