Example #1
    def record(self, audio_name: str, max_time: int):
        input(f"press any key to begin to record {max_time} seconds voice >> ")

        # Open a 16-bit input stream from the default microphone
        stream = PyAudio().open(format=paInt16,
                                channels=self.CHANNEL_NUM,
                                rate=self.SAMPLE_RATE,
                                input=True,
                                frames_per_buffer=self.SAMPLE_NUM)
        my_buf = []
        time_start = time.time()
        last_second = 0
        print(f"time: {last_second} s")
        while True:
            duration = time.time() - time_start
            if duration >= max_time:
                break
            if int(duration) != last_second:
                last_second = int(duration)
                print(f"time: {last_second} s")

            string_audio_data = stream.read(self.SAMPLE_NUM)
            my_buf.append(string_audio_data)

        stream.stop_stream()
        stream.close()
        self._save_wave_file(audio_name, my_buf)
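The record method delegates to a _save_wave_file helper that is not shown in this snippet. A minimal sketch of what such a helper might look like, assuming the wave module is imported alongside PyAudio and that the same class attributes (CHANNEL_NUM, SAMPLE_RATE) apply; the body below is an assumption, not the original implementation:

    def _save_wave_file(self, audio_name: str, data: list):
        # Hypothetical helper (assumed, not from the original source):
        # persist the recorded chunks as a 16-bit WAV file.
        wave_file = wave.open(audio_name, 'wb')
        wave_file.setnchannels(self.CHANNEL_NUM)
        wave_file.setsampwidth(PyAudio().get_sample_size(paInt16))  # 2 bytes per sample
        wave_file.setframerate(self.SAMPLE_RATE)
        wave_file.writeframes(b''.join(data))  # chunks from stream.read() are bytes
        wave_file.close()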
Example #2
class Ava(AvaSkills):
    def __init__(self):
        super().__init__()
        self.interpreter = Interpreter.load(settings.RASA_MODEL_DIR)
        self.stream = PyAudio().open(format=paInt16,
                                     channels=1,
                                     rate=16000,
                                     input=True,
                                     frames_per_buffer=1024,
                                     output_device_index=0)
        self.config = pocketsphinx.Decoder.default_config()
        self.config.set_string(
            '-hmm', path.join(settings.SPHINX_MODEL_DIR, 'en-us/en-us'))
        self.config.set_string(
            '-dict',
            path.join(settings.SPHINX_MODEL_DIR, 'en-us/cmudict-en-us.dict'))
        self.config.set_string('-keyphrase', settings.WAKE_PHRASE)
        self.config.set_float('-kws_threshold', 1e+20)
        self.config.set_string('-logfn', 'text.log')
        self.decoder = pocketsphinx.Decoder(self.config)
        self.listen_for_wake()

    def listen_for_wake(self):
        self.stream.start_stream()
        self.decoder.start_utt()
        if (not self.play_mp3("startup_greeting.mp3")):
            self.get_tts(
                text=
                f"Hi, my name is Ava. If you need help, just say the wake command: {settings.WAKE_PHRASE}.",
                file_name="startup_greeting.mp3")
        print("Listening for wake word...")
        while True:
            buf = self.stream.read(1024)
            if buf:
                self.decoder.process_raw(buf, False, False)
            else:
                break
            if self.decoder.hyp() is not None:
                print(f"Key phrase '{settings.WAKE_PHRASE}' detected...")
                if not self.play_mp3("wake_chime.mp3"):
                    exit()

                if not self.play_mp3("wake_greeting.mp3"):
                    self.get_tts(text="How can I help?",
                                 file_name="wake_greeting.mp3")
                self.listen_for_input()
                self.decoder.end_utt()
                print("Waiting for wakeup ...")
                self.decoder.start_utt()

    def get_tts(self, text, file_name, save=True):
        print("Converting text to speech...")
        polly_client = boto3.Session(
            aws_access_key_id=keys.POLLY_ACCESS_KEY_ID,
            aws_secret_access_key=keys.POLLY_SECRET_ACCESS_KEY,
            region_name='us-west-2').client('polly')

        response = polly_client.synthesize_speech(VoiceId='Joanna',
                                                  OutputFormat='pcm',
                                                  SampleRate="16000",
                                                  Text=text)
        data = response['AudioStream'].read()
        self.play_byte(data)
        if save:
            print("Saving to mp3 file...")
            AudioSegment(data=data,
                         sample_width=2,
                         frame_rate=16000,
                         channels=1).export(out_f=settings.MEDIA_DIR + file_name,
                                            format="mp3")

    def play_byte(self, stream):
        try:
            print("Playing byte stream...")
            play(
                AudioSegment(data=stream,
                             sample_width=2,
                             frame_rate=16000,
                             channels=1))
        except Exception as e:
            print("Error playing file...", e)

    def play_mp3(self, audio_file, save=True):
        file_path = settings.MEDIA_DIR + audio_file
        is_file = path.isfile(file_path)
        if is_file:
            try:
                print("Playing audio...")
                play(AudioSegment.from_mp3(file_path))
            except Exception as e:
                print("Error playing file...", e)
                return False
        else:
            print("Audio file not found...")
            return False
        return True

    def listen_for_input(self):
        sr = speech_recognition.Recognizer()
        mic = speech_recognition.Microphone()
        hyp = None
        with mic as source:
            sr.adjust_for_ambient_noise(source)
            try:
                print("Listening...")
                audio = sr.listen(source, timeout=2)
                print("Decoding...")
                hyp = sr.recognize_google(audio)
            except speech_recognition.WaitTimeoutError:
                print("No input detected, timed out...")
            except speech_recognition.UnknownValueError:
                if not self.play_mp3("decode_error.mp3"):
                    self.get_tts(
                        text="I'm sorry, but I was not able to understand that command.",
                        file_name="decode_error.mp3")
            except speech_recognition.RequestError as e:
                print("Google request error: ", e)
                print("Running backup decoder Sphinx...")
                try:
                    hyp = sr.recognize_sphinx(audio)
                except speech_recognition.UnknownValueError as e:
                    print("Sphinx recognition error: ", e)
                    if not self.play_mp3("decode_error.mp3"):
                        self.get_tts(
                            text="I'm sorry, but I was not able to understand that command.",
                            file_name="decode_error.mp3")
            if hyp is not None:
                t0 = time()
                self.process_input_intent(hyp)
                print(f"Time to process command: {time() - t0}")

    def process_input_intent(self, hypothesis):
        print("Processing intent...")
        print(f"HYPOTHESIS:{hypothesis}")
        result = self.interpreter.parse(hypothesis)
        intent = result['intent']['name']
        confidence = result['intent']['confidence']
        print(f"INTENT: {intent}")
        print(f"CONFIDENCE_SCORE: {confidence}")
        try:
            print("Executing intent action...")
            response = getattr(self, intent)(result)
            print(f"RESULT: {response}")
            if response:
                self.respond_intent_result(response)
        except Exception as e:
            print(f"Failed intent action...\n\tERROR: {e} ")

    def respond_intent_result(self, result):
        print(f"RESPONSE: {result['tts']}")
        if not self.play_mp3(result['file']):
            self.get_tts(text=result['tts'],
                         file_name=result['file'],
                         save=result['save'])
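process_input_intent dispatches on the Rasa intent name via getattr, so each skill method on AvaSkills is expected to take the parse result and return a dict with 'tts', 'file', and 'save' keys for respond_intent_result. A hypothetical handler illustrating that contract; the intent name and strings below are assumptions, not part of the original code:

    def greet(self, result):
        # Hypothetical skill: invoked when the Rasa model classifies the
        # utterance as the 'greet' intent (assumed intent name).
        return {
            'tts': "Hello! Nice to meet you.",  # text synthesized if no cached mp3 exists
            'file': "greet_response.mp3",       # mp3 looked up under settings.MEDIA_DIR first
            'save': True,                       # cache the synthesized speech for reuse
        }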
Example #3
def audio_freq():

    # Boundary note values in my case (a six-string guitar in D tuning)

    note_min = 60           # Note C of the 4th octave (C4)
    note_max = 71           # Note B of the 4th octave (B4)

    sample_freq = 22050     # Sampling rate in hertz

    # These constants control how quickly the frequency reading updates.

    frame_size = 2048       # Number of samples per frame
    frames_per_fft = 16     # Number of frames averaged per FFT

    samples_per_fft = frame_size * frames_per_fft   # Number of samples per FFT
    freq_step = sample_freq / samples_per_fft       # Frequency step (FFT bin width)

    # Get the minimum and maximum FFT bin indices for our notes.

    def note_to_fftbin(n):
        return 440 * 2.0 ** ((n - 69) / 12.0) / freq_step

    imin = max(0, int(numpy.floor(note_to_fftbin(note_min - 1))))
    imax = min(samples_per_fft, int(numpy.ceil(note_to_fftbin(note_max + 1))))

    # Allocate space for the FFT.

    buf = numpy.zeros(samples_per_fft, dtype=numpy.float32)

    # Hann (Hanning) window function.

    window = 0.5 * (1 - numpy.cos(numpy.linspace(0, 2*numpy.pi, samples_per_fft, False)))

    # Open the audio stream.

    pa = PyAudio()
    stream = pa.open(format=paInt16,
                     channels=1,
                     rate=sample_freq,
                     input=True,
                     frames_per_buffer=frame_size)

    stream.start_stream()

    # Read data while the stream is active.

    while stream.is_active():

        # Shift the buffer and append the newly read samples.

        buf[:-frame_size] = buf[frame_size:]
        buf[-frame_size:] = numpy.frombuffer(stream.read(frame_size), numpy.int16)

        # Run the FFT on the windowed buffer.

        fft = numpy.fft.rfft(buf * window)

        # Find the frequency with the largest magnitude in the range.

        freq = (numpy.abs(fft[imin:imax]).argmax() + imin) * freq_step

        # Write the note mapping (get_note) to the freqs.txt file.

        freq_save(get_note(freq))

    # Shut down the audio stream cleanly.

    stream.stop_stream()
    stream.close()
    pa.terminate()
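The get_note and freq_save helpers called above are not shown. A plausible sketch, assuming standard A4 = 440 Hz equal temperament (the inverse of the note_to_fftbin formula above) and the freqs.txt file mentioned in the comments; both function bodies are assumptions, not the original implementations:

NOTE_NAMES = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']

def get_note(freq):
    # Hypothetical helper: map a frequency in Hz to the nearest note name.
    # Inverse of 440 * 2 ** ((n - 69) / 12), where n is the MIDI note number.
    n = int(round(69 + 12 * numpy.log2(freq / 440.0)))
    return NOTE_NAMES[n % 12] + str(n // 12 - 1)  # e.g. 440.0 -> 'A4'

def freq_save(note):
    # Hypothetical helper: append each detected note to freqs.txt.
    with open('freqs.txt', 'a') as f:
        f.write(note + '\n')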
Example #4
class SoundData:
    def __init__(self, chunk=1024, rate=44100):
        '''
        Initialize a SoundData object.

        Args:
            chunk (int) : number of samples grouped together
                          default: 1024
            rate  (int) : sampling frequency in Hz
                          default: 44100
        '''
        self.chunk = chunk
        self.rate = rate
        self.buffer = None
        # Create an input audio stream from the microphone using PyAudio
        self.audio_stream = PyAudio().open(format=paInt16,
                                           channels=1,
                                           rate=rate,
                                           input=True,
                                           frames_per_buffer=chunk)

    def _write_stream_to_file(self, filename, data):
        '''
        Write contents of data to a Wave file.

        Args:
            filename  (str) : name of Wave file to be written to
            data     (list) : mono audio signal
        '''
        # Open the Wave file in binary write mode
        wave_file = wave.open(f'./assets/{filename}.wav', 'wb')
        wave_file.setnchannels(1)  # Set details of the data being written
        wave_file.setsampwidth(PyAudio().get_sample_size(paInt16))
        wave_file.setframerate(self.rate)
        # Join the chunks into one byte string and (over)write the Wave file
        wave_file.writeframes(b''.join(data))
        wave_file.close()

    def _framing(self, data):
        '''
        Transform audio signal into a series of overlapping frames.
        A frame is a short window of consecutive samples; a sample is
        the amplitude at a point in time.

        Args:
            data         (list) : mono audio signal

        Returns:
            frames       (list) : all the frames
            frame_length  (int) : length of each frame
        '''
        frame_length = int(.025 * self.rate)  # (window length in s) * (rate); .025 s chosen arbitrarily
        frame_step = int(.01 * self.rate)  # Seconds to samples; .01 s between windows, chosen arbitrarily
        signal_length = len(data)
        number_of_frames = int(
            np.ceil(abs(signal_length - frame_length) /
                    frame_step))  # Round up so a partial frame still counts

        # Find indices
        # np.arange(start, stop, step) returns evenly spaced values in [start, stop);
        # np.tile(array, reps) repeats the given array along each axis
        index_a = np.tile(np.arange(0, frame_length), (number_of_frames, 1))
        index_b = np.tile(np.arange(0, number_of_frames * frame_step, frame_step),
                          (frame_length, 1))
        index_b = np.transpose(index_b)  # Swap rows and columns
        indices = index_a + index_b

        # Pad the signal so every frame index is in range
        padding_amount = number_of_frames * frame_step + frame_length
        padding = np.zeros(padding_amount - signal_length)  # Array of zeros
        padded_buffer = np.append(data, padding)  # Concatenate signal and padding

        # Cast the indices to int32 before using them for fancy indexing
        frames = padded_buffer[indices.astype(np.int32, copy=False)]
        return frames, frame_length

    def _get_dominant_frequency(self, frame):
        '''
        Find the dominant frequency of a single frame.

        Args:
            frame (numpy.ndarray) : amplitude information at a point in time
        
        Returns:
                          (float) : dominant frequency in Hz
        '''
        nfft = 2**14  # Number of fast Fourier transform points to calculate
        # Perform a fast Fourier transform on a real-valued input
        fourier_transform = np.fft.rfft(frame, nfft)

        magnitude_spectrum = (1 / nfft) * abs(fourier_transform)
        power_spectrum = (1 / nfft)**2 * magnitude_spectrum**2

        # np.fft.fftfreq(n, d) gives the frequencies associated with the FFT
        # coefficients, where d is the sample spacing (inverse of the rate)
        frequencies = np.fft.fftfreq(len(power_spectrum), 1 / self.rate)
        # Keep only non-negative frequencies, floor-divide each by 2, then add 1
        frequencies = (frequencies[np.where(frequencies >= 0)] // 2) + 1

        # Only the first half of the spectrum contains useful data
        power_spectrum = power_spectrum[:len(frequencies)]
        maximum_index = np.argmax(power_spectrum)  # Index of the largest power value

        return frequencies[maximum_index]  # The dominant frequency in Hz

    def stream(self, time=.1):
        '''
        Update audio stream buffer.
        
        Args:
            time (float) : length of audio stream buffer in seconds
                           default: 0.1
        '''
        # To record (time) seconds into the buffer, we must take (rate)*(time) samples.
        # In each iteration (chunk) samples are taken, so we must loop (rate)*(time)/(chunk) times.
        buffer_hex = [
            self.audio_stream.read(self.chunk)
            for i in range(int(self.rate / self.chunk * time))
        ]
        self._write_stream_to_file('buffer', buffer_hex)
        self.rate, self.buffer = wavfile.read('./assets/buffer.wav')

    def get_dominant_frequencies(self):
        '''
        Analyse the buffer data to find the dominant frequencies.

        Returns:
            dominant_frequencies (list) : list of the dominant frequencies identified
        '''
        # Perform framing on the signal
        frames, frame_length = self._framing(self.buffer)
        # Apply a Hamming window to each frame:
        # w(n) = .54 - .46*cos(2*pi*n/(M-1)), 0 <= n <= M-1, M = window length
        windows = frames * np.hamming(frame_length)

        # Find the dominant frequency for each frame
        dominant_frequencies = np.array(
            [self._get_dominant_frequency(window) for window in windows])
        dominant_frequencies = np.round(dominant_frequencies, 3)  # Round to 3 decimal places
        dominant_frequencies = np.unique(dominant_frequencies)  # Remove duplicate values

        return dominant_frequencies

    def get_note_from_frequency(self, notes_dict, frequencies):
        '''
        Convert a list of frequencies into their likeliest music note.
        
        Args:
            notes_dict  (dict) : dictionary of notes and their associated frequencies
            frequencies (list) : list of frequencies
            
        Returns:
            note         (str) : single note or None if no note identified   
        '''
        if 1.0 in frequencies.tolist():
            return 'rest'  # If 1.0 is a dominant frequency assume it is background noise
        closest_match = None  # [note, weight] of the best match so far
        for note in notes_dict.keys():
            target = notes_dict[note]
            weight = 0
            for freq in frequencies:
                # Log-periodic distance: zero when freq is an octave multiple of a target value
                min_distance_from_target = min([
                    abs(100 * round(
                        np.sin((np.pi / np.log(2)) * np.log(freq / value)), 4))
                    for value in target
                ])
                if not min_distance_from_target:
                    min_distance_from_target = -100  # Reward exact (octave) matches heavily
                weight += min_distance_from_target

            if closest_match is None or weight < closest_match[1]:
                closest_match = [note, weight]
        return closest_match[0]
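A short usage sketch tying the class together; the notes_dict contents and the main guard below are illustrative assumptions, not part of the original:

if __name__ == '__main__':
    notes_dict = {'E4': [329.63], 'A4': [440.0]}  # hypothetical reference frequencies
    sound = SoundData()
    while True:
        sound.stream(time=.1)                     # record 0.1 s into the buffer
        freqs = sound.get_dominant_frequencies()  # candidate frequencies in the buffer
        print(sound.get_note_from_frequency(notes_dict, freqs))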