def _create_audio_data(raw_data, source):
    """Wrap ``raw_data`` in an ``AudioData`` that mirrors ``source``'s audio parameters.

    The sample rate and sample width are copied from the source so the
    resulting ``AudioData`` is interchangeable with audio captured from it.
    """
    rate = source.SAMPLE_RATE
    width = source.SAMPLE_WIDTH
    return AudioData(raw_data, rate, width)
def get_audio_data_after(self):
    """Return an ``AudioData`` of the silence padding followed by the audio tail.

    The tail is ``self.audio.frame_data`` from ``self.end`` up to
    ``self.audio_size``, prefixed with ``self.silence_data``.
    """
    tail = self.audio.frame_data[self.end:self.audio_size]
    combined = self.silence_data + tail
    return AudioData(combined, self.audio.sample_rate, self.audio.sample_width)
def listen(self, source, timeout=None):
    """
    Records a single phrase from ``source`` (an ``AudioSource`` instance) into an ``AudioData`` instance, which it returns.

    This is done by waiting until the audio has an energy above ``recognizer_instance.energy_threshold`` (the user has started speaking), and then recording until it encounters ``recognizer_instance.pause_threshold`` seconds of non-speaking or there is no more audio input. The ending silence is not included.

    The ``timeout`` parameter is the maximum number of seconds that it will wait for a phrase to start before giving up and throwing an ``speech_recognition.WaitTimeoutError`` exception. If ``timeout`` is ``None``, it will wait indefinitely.
    """
    assert isinstance(source, AudioSource), "Source must be an audio source"
    # non_speaking_duration is the retained silence around the phrase; it can
    # never exceed the pause that ends the phrase, or the trim below would be wrong.
    assert self.pause_threshold >= self.non_speaking_duration >= 0

    # duration (in seconds) of audio represented by one CHUNK-sized buffer
    seconds_per_buffer = (source.CHUNK + 0.0) / source.SAMPLE_RATE
    pause_buffer_count = int(math.ceil(self.pause_threshold / seconds_per_buffer))  # number of buffers of non-speaking audio before the phrase is complete
    phrase_buffer_count = int(math.ceil(self.phrase_threshold / seconds_per_buffer))  # minimum number of buffers of speaking audio before we consider the speaking audio a phrase
    non_speaking_buffer_count = int(math.ceil(self.non_speaking_duration / seconds_per_buffer))  # maximum number of buffers of non-speaking audio to retain before and after

    # read audio input for phrases until there is a phrase that is long enough
    elapsed_time = 0  # number of seconds of audio read
    while True:
        frames = collections.deque()  # buffers collected for the current attempt

        # store audio input until the phrase starts
        while True:
            elapsed_time += seconds_per_buffer
            if timeout and elapsed_time > timeout:  # handle timeout if specified
                raise WaitTimeoutError("listening timed out")

            buffer = source.stream.read(source.CHUNK)
            if len(buffer) == 0:
                break  # reached end of the stream
            frames.append(buffer)
            # ensure we only keep the needed amount of non-speaking buffers
            # (a rolling window of leading silence to prepend to the phrase)
            if len(frames) > non_speaking_buffer_count:
                frames.popleft()

            # detect whether speaking has started on audio input
            energy = audioop.rms(buffer, source.SAMPLE_WIDTH)  # energy of the audio signal
            if energy > self.energy_threshold:
                break

            # dynamically adjust the energy threshold using asymmetric weighted average
            # do not adjust dynamic energy level for this sample if it is muted audio (energy == 0)
            self.adjust_energy_threshold(energy, seconds_per_buffer)

        # read audio input until the phrase ends
        pause_count, phrase_count = 0, 0  # consecutive silent buffers / total buffers in this phrase
        while True:
            elapsed_time += seconds_per_buffer

            buffer = source.stream.read(source.CHUNK)
            if len(buffer) == 0:
                break  # reached end of the stream
            frames.append(buffer)
            phrase_count += 1

            # check if speaking has stopped for longer than the pause threshold on the audio input
            energy = audioop.rms(buffer, source.SAMPLE_WIDTH)  # energy of the audio signal
            if energy > self.energy_threshold:
                pause_count = 0  # speech resumed; restart the silence counter
            else:
                pause_count += 1
            if pause_count > pause_buffer_count:  # end of the phrase
                break
            # if we hit the end of the audio length, readjust energy_threshold
            # using every buffer captured so far, then stop recording this phrase
            if len(frames) * seconds_per_buffer >= self.max_audio_length_sec:
                for frame in frames:
                    energy = audioop.rms(frame, source.SAMPLE_WIDTH)
                    self.adjust_energy_threshold(energy, seconds_per_buffer)
                break

        # check how long the detected phrase is, and retry listening if the phrase is too short
        phrase_count -= pause_count  # exclude the trailing silence from the phrase length
        if phrase_count >= phrase_buffer_count:
            break  # phrase is long enough, stop listening

    # obtain frame data
    # drop trailing silent buffers beyond the amount we want to retain
    for i in range(pause_count - non_speaking_buffer_count):
        frames.pop()  # remove extra non-speaking frames at the end
    frame_data = b"".join(list(frames))

    return AudioData(frame_data, source.SAMPLE_RATE, source.SAMPLE_WIDTH)
def get_audio_data_before(self):
    """Return an ``AudioData`` of the audio head followed by the silence padding.

    The head is ``self.audio.frame_data`` from the start up to ``self.begin``,
    suffixed with ``self.silence_data``.
    """
    head = self.audio.frame_data[0:self.begin]
    combined = head + self.silence_data
    return AudioData(combined, self.audio.sample_rate, self.audio.sample_width)