def _create_audio_data(raw_data, source):
    """
    Wrap ``raw_data`` in an ``AudioData`` object, copying the sample rate
    and sample width from ``source`` so the result matches the source's
    audio parameters.
    """
    rate, width = source.SAMPLE_RATE, source.SAMPLE_WIDTH
    return AudioData(raw_data, rate, width)
# Example #2
 def get_audio_data_after(self):
     """
     Return an ``AudioData`` made of the silence padding followed by the
     tail of the clip (frame bytes from ``self.end`` up to
     ``self.audio_size``), preserving the clip's sample rate and width.
     """
     tail = self.audio.frame_data[self.end:self.audio_size]
     combined = self.silence_data + tail
     return AudioData(combined,
                      self.audio.sample_rate,
                      self.audio.sample_width)
# Example #3
    def listen(self, source, timeout=None):
        """
        Records a single phrase from ``source`` (an ``AudioSource`` instance) into an ``AudioData`` instance, which it returns.

        This is done by waiting until the audio has an energy above ``recognizer_instance.energy_threshold`` (the user has started speaking), and then recording until it encounters ``recognizer_instance.pause_threshold`` seconds of non-speaking or there is no more audio input. The ending silence is not included.

        The ``timeout`` parameter is the maximum number of seconds that it will wait for a phrase to start before giving up and throwing an ``speech_recognition.WaitTimeoutError`` exception. If ``timeout`` is ``None``, it will wait indefinitely.
        """
        assert isinstance(source,
                          AudioSource), "Source must be an audio source"
        assert self.pause_threshold >= self.non_speaking_duration >= 0

        # duration, in seconds, of audio carried by one CHUNK-sized buffer
        # (the ``+ 0.0`` forces float division, Python-2 style)
        seconds_per_buffer = (source.CHUNK + 0.0) / source.SAMPLE_RATE
        pause_buffer_count = int(
            math.ceil(self.pause_threshold / seconds_per_buffer)
        )  # number of buffers of non-speaking audio before the phrase is complete
        phrase_buffer_count = int(
            math.ceil(self.phrase_threshold / seconds_per_buffer)
        )  # minimum number of buffers of speaking audio before we consider the speaking audio a phrase
        non_speaking_buffer_count = int(
            math.ceil(self.non_speaking_duration / seconds_per_buffer)
        )  # maximum number of buffers of non-speaking audio to retain before and after

        # read audio input for phrases until there is a phrase that is long enough
        elapsed_time = 0  # number of seconds of audio read so far
        while True:
            frames = collections.deque()  # deque so pre-phrase buffers can be dropped from the left in O(1)

            # phase 1: store audio input until the phrase starts
            while True:
                elapsed_time += seconds_per_buffer
                # NOTE(review): a falsy ``timeout`` of 0 is treated the same
                # as ``None`` (wait indefinitely) — confirm this is intended
                if timeout and elapsed_time > timeout:  # handle timeout if specified
                    raise WaitTimeoutError("listening timed out")

                buffer = source.stream.read(source.CHUNK)
                if len(buffer) == 0: break  # reached end of the stream
                frames.append(buffer)
                if len(
                        frames
                ) > non_speaking_buffer_count:  # ensure we only keep the needed amount of non-speaking buffers
                    frames.popleft()

                # detect whether speaking has started on audio input
                energy = audioop.rms(
                    buffer, source.SAMPLE_WIDTH)  # energy of the audio signal
                if energy > self.energy_threshold: break

                # dynamically adjust the energy threshold using asymmetric weighted average
                # (the helper is expected not to adjust for muted audio, i.e. energy == 0)
                self.adjust_energy_threshold(energy, seconds_per_buffer)
            # phase 2: read audio input until the phrase ends
            pause_count, phrase_count = 0, 0
            while True:
                elapsed_time += seconds_per_buffer

                buffer = source.stream.read(source.CHUNK)
                if len(buffer) == 0: break  # reached end of the stream
                frames.append(buffer)
                phrase_count += 1

                # check if speaking has stopped for longer than the pause threshold on the audio input
                energy = audioop.rms(
                    buffer, source.SAMPLE_WIDTH)  # energy of the audio signal
                if energy > self.energy_threshold:
                    pause_count = 0  # speech resumed, reset the silence run
                else:
                    pause_count += 1
                if pause_count > pause_buffer_count:  # end of the phrase
                    break

                # bail out if the recording has grown past the configured maximum length
                if len(frames
                       ) * seconds_per_buffer >= self.max_audio_length_sec:
                    # if we hit the end of the audio length, readjust energy_threshold
                    # by replaying every captured frame through the adapter
                    for frame in frames:
                        energy = audioop.rms(frame, source.SAMPLE_WIDTH)
                        self.adjust_energy_threshold(energy,
                                                     seconds_per_buffer)
                    break

            # check how long the detected phrase is, and retry listening if the phrase is too short
            phrase_count -= pause_count  # exclude the trailing silence from the phrase length
            if phrase_count >= phrase_buffer_count:
                break  # phrase is long enough, stop listening

        # obtain frame data: drop trailing silent buffers beyond the
        # ``non_speaking_buffer_count`` we want to retain after the phrase
        for i in range(pause_count - non_speaking_buffer_count):
            frames.pop()  # remove extra non-speaking frames at the end
        frame_data = b"".join(list(frames))

        return AudioData(frame_data, source.SAMPLE_RATE, source.SAMPLE_WIDTH)
# Example #4
 def get_audio_data_before(self):
     """
     Return an ``AudioData`` made of the head of the clip (frame bytes up
     to ``self.begin``) followed by the silence padding, preserving the
     clip's sample rate and width.
     """
     head = self.audio.frame_data[0:self.begin]
     combined = head + self.silence_data
     return AudioData(combined,
                      self.audio.sample_rate,
                      self.audio.sample_width)