Example 1
def classify_audio(audio_device_index, interpreter, labels_file,
                   commands_file=None,
                   result_callback=None, detection_callback=None,
                   sample_rate_hz=16000,
                   negative_threshold=0.6, num_frames_hop=33):
  """Acquire audio, preprocess, and classify."""
  # Initialize recorder. Most microphones support 48 kHz capture; because
  # the model expects 16 kHz audio, we downsample 3-fold in that case.
  AUDIO_SAMPLE_RATE_HZ = sample_rate_hz
  downsample_factor = 1
  if AUDIO_SAMPLE_RATE_HZ == 48000:
    downsample_factor = 3
  recorder = audio_recorder.AudioRecorder(
      AUDIO_SAMPLE_RATE_HZ,
      downsample_factor=downsample_factor,
      device_index=audio_device_index)
  feature_extractor = Uint8LogMelFeatureExtractor(num_frames_hop=num_frames_hop)
  labels = read_labels(labels_file)
  if commands_file:
    commands = read_commands(commands_file)
  else:
    commands = {}
  logger.info("Loaded commands: %s", str(commands))
  logger.info("Recording")
  timed_out = False
  with recorder:
    last_detection = -1
    while not timed_out:
      try:
        spectrogram = feature_extractor.get_next_spectrogram(recorder)
        set_input(interpreter, spectrogram.flatten())
        interpreter.invoke()
        result = get_output(interpreter)
        if result_callback:
          result_callback(result, commands, labels)
        if detection_callback:
          detection = -1
          if result[0] < negative_threshold:
            # Consider the three highest-scoring labels.
            top3 = np.argsort(-result)[:3]
            for p in range(3):
              label = labels[top3[p]]
              if label not in commands:
                continue
              # Index 0 is the negative (background) label; a truthy
              # top3[p] also skips it here.
              if top3[p] and result[top3[p]] > commands[label]['conf']:
                if detection < 0:
                  detection = top3[p]
          if detection < 0 and last_detection > 0:
            print("---------------")
            last_detection = 0
          # Guard against detection == -1, which would index the last label.
          if (detection >= 0 and labels[detection] in commands
              and detection != last_detection):
            print(labels[detection], commands[labels[detection]])
            detection_callback(commands[labels[detection]]['key'])
            last_detection = detection
        if spectrogram.mean() < 0.001:
          print("Warning: Input audio signal is nearly 0. Mic may be off ?")
      except Exception:
        print("Unexpected error while classifying audio; re-raising.")
        timed_out = True
        raise
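
A minimal usage sketch for the variant above. make_interpreter comes from
Example 2's module; the file names and the key handler are hypothetical:

def on_key(key):
    # Receives the 'key' string configured for the detected command.
    print('Command key:', key)

interpreter = make_interpreter('voice_commands.tflite')  # hypothetical model file
interpreter.allocate_tensors()
classify_audio(audio_device_index=0,
               interpreter=interpreter,
               labels_file='labels.txt',
               commands_file='commands.txt',
               detection_callback=on_key)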
Example 2
def classify_audio(model_file,
                   labels_file,
                   callback,
                   audio_device_index=0,
                   sample_rate_hz=16000,
                   negative_threshold=0.6,
                   num_frames_hop=33):
    """Acquire audio, preprocess, and classify."""
    # Most microphones support 48 kHz capture; because the model expects
    # 16 kHz audio, we downsample 3-fold in that case.
    downsample_factor = 1
    if sample_rate_hz == 48000:
        downsample_factor = 3
    recorder = audio_recorder.AudioRecorder(
        sample_rate_hz,
        downsample_factor=downsample_factor,
        device_index=audio_device_index)
    feature_extractor = Uint8LogMelFeatureExtractor(
        num_frames_hop=num_frames_hop)
    labels = read_labels(labels_file)

    interpreter = make_interpreter(model_file)
    interpreter.allocate_tensors()

    keep_listening = True
    prev_detection = -1
    with recorder:
        print("Ready for voice commands...")
        while keep_listening:
            spectrogram = feature_extractor.get_next_spectrogram(recorder)
            if spectrogram.mean() < 0.001:
                print("Warning: Input audio signal is nearly 0. Mic may be off?")

            set_input(interpreter, spectrogram.flatten())
            interpreter.invoke()
            result = get_output(interpreter)

            # result[0] is the negative (background) label; if it dominates,
            # no command was spoken in this window.
            if result[0] >= negative_threshold:
                prev_detection = -1
                continue

            detection = np.argmax(result)
            if detection == 0:
                prev_detection = -1
                continue

            if detection != prev_detection:
                # The callback returns False to stop listening.
                keep_listening = callback(labels[detection], result[detection])
                prev_detection = detection
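
In this variant the callback's return value controls the loop: returning
False stops listening. A small example (the 'stop' label and model file name
are assumptions about the model's label set, not from the snippet):

def handle_command(label, score):
    print('Heard %s (%.2f)' % (label, score))
    return label != 'stop'  # keep listening until a 'stop' command

classify_audio('model.tflite', 'labels.txt', handle_command)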
Example 3
def main():
    # INPUT_DEVICE and process_audio_forever are defined elsewhere in the
    # enclosing module.
    microphone = audio_recorder.AudioRecorder(INPUT_DEVICE)
    process_audio_forever(microphone)
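
process_audio_forever is not shown in this snippet; a minimal sketch of the
loop it presumably runs (the chunk size and handle_chunk are assumptions):

def process_audio_forever(recorder):
    with recorder:
        while True:
            # get_audio(n) returns a tuple whose first element is the
            # samples, as used in Example 5.
            samples = recorder.get_audio(16000)[0]
            handle_chunk(samples)  # hypothetical per-chunk handler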
Example 4
def classify_audio(audio_device_index, interpreter, labels_file,
                   commands_file=None,
                   result_callback=None, detection_callback=None,
                   sample_rate_hz=16000,
                   negative_threshold=0.6, num_frames_hop=33):
  """Acquire audio, preprocess, and classify."""
  # Initialize recorder. Most microphones support 48 kHz capture; because
  # the model expects 16 kHz audio, we downsample 3-fold in that case.
  AUDIO_SAMPLE_RATE_HZ = sample_rate_hz
  downsample_factor = 1
  if AUDIO_SAMPLE_RATE_HZ == 48000:
    downsample_factor = 3
  recorder = audio_recorder.AudioRecorder(
      AUDIO_SAMPLE_RATE_HZ,
      downsample_factor=downsample_factor,
      device_index=audio_device_index)
  feature_extractor = Uint8LogMelFeatureExtractor(num_frames_hop=num_frames_hop)
  labels = read_labels(labels_file)
  if commands_file:
    commands = read_commands(commands_file)
  else:
    commands = {}
  logger.info("Loaded commands: %s", str(commands))
  logger.info("Recording")
  timed_out = False

  # Testing (disabled): run the classifier on a known WAV sample instead of
  # live audio.
  if False:
    sample_data = 'data/mini_speech_commands/down/e71b4ce6_nohash_1.wav'

    import tensorflow as tf
    import os
    def decode_audio(audio_binary):
      audio, _ = tf.audio.decode_wav(audio_binary)
      return tf.squeeze(audio, axis=-1)

    def get_label(file_path):
      parts = tf.strings.split(file_path, os.path.sep)

      # Note: You'll use indexing here instead of tuple unpacking to enable this 
      # to work in a TensorFlow graph.
      return parts[-2]

    def get_waveform_and_label(file_path):
      label = get_label(file_path)
      audio_binary = tf.io.read_file(file_path)
      waveform = decode_audio(audio_binary)
      return waveform, label
    waveform, label = get_waveform_and_label(sample_data)
    print(waveform.shape)
  # End Testing

  with recorder:
    last_detection = -1
    while not timed_out:
      spectrogram = feature_extractor.get_next_spectrogram(recorder).astype('float32')
      # Add channel and batch dimensions to match the model's input shape.
      spectrogram = np.expand_dims(spectrogram, axis=-1)
      spectrogram = np.expand_dims(spectrogram, axis=0)
      input_details = interpreter.get_input_details()
      interpreter.set_tensor(input_details[0]['index'], spectrogram)
      interpreter.invoke()
      result = get_output(interpreter)
      # The raw outputs are logits; normalize them with softmax.
      result = softmax(result)
      if result_callback:
        result_callback(result, commands, labels)
      if detection_callback:
        detection = -1
        if result[0] < negative_threshold:
          # Consider the three highest-scoring labels.
          top3 = np.argsort(-result)[:3]
          for p in range(3):
            label = labels[top3[p]]
            if label not in commands:
              continue
            # Index 0 is the negative (background) label; a truthy
            # top3[p] also skips it here.
            if top3[p] and result[top3[p]] > commands[label]['conf']:
              if detection < 0:
                detection = top3[p]
        if detection < 0 and last_detection > 0:
          print("---------------")
          last_detection = 0
        # Guard against detection == -1, which would index the last label.
        if (detection >= 0 and labels[detection] in commands
            and detection != last_detection):
          print(labels[detection], commands[labels[detection]])
          detection_callback(commands[labels[detection]]['key'])
          last_detection = detection
      if spectrogram.mean() < 0.001:
        print("Warning: Input audio signal is nearly 0. Mic may be off ?")
Example 5
def classify_audio(audio_device_index,
                   interpreter,
                   labels_file,
                   commands_file=None,
                   result_callback=None,
                   detection_callback=None,
                   sample_rate_hz=16000,
                   negative_threshold=0.6,
                   num_frames_hop=33):
    """Acquire audio, preprocess, and classify."""
    # Initialize recorder. Most microphones support 48 kHz capture; because
    # the model expects 16 kHz audio, we downsample 3-fold in that case.
    AUDIO_SAMPLE_RATE_HZ = sample_rate_hz
    downsample_factor = 1
    if AUDIO_SAMPLE_RATE_HZ == 48000:
        downsample_factor = 3
    recorder = audio_recorder.AudioRecorder(
        AUDIO_SAMPLE_RATE_HZ,
        downsample_factor=downsample_factor,
        device_index=audio_device_index)
    feature_extractor = Uint8LogMelFeatureExtractor(
        num_frames_hop=num_frames_hop)
    labels = read_labels(labels_file)
    if commands_file:
        commands = read_commands(commands_file)
    else:
        commands = {}
    logger.info("Loaded commands: %s", str(commands))
    logger.info("Recording")
    timed_out = False

    # Testing (disabled): run the classifier on a known WAV sample instead of
    # live audio.
    if False:
        sample_data = 'data/mini_speech_commands/down/e71b4ce6_nohash_1.wav'

        import tensorflow as tf
        import os

        def decode_audio(audio_binary):
            audio, _ = tf.audio.decode_wav(audio_binary)
            return tf.squeeze(audio, axis=-1)

        def get_label(file_path):
            parts = tf.strings.split(file_path, os.path.sep)

            # Note: You'll use indexing here instead of tuple unpacking to enable this
            # to work in a TensorFlow graph.
            return parts[-2]

        def get_waveform_and_label(file_path):
            label = get_label(file_path)
            audio_binary = tf.io.read_file(file_path)
            waveform = decode_audio(audio_binary)
            return waveform, label

        waveform, label = get_waveform_and_label(sample_data)
        print(waveform.shape)
    # End Testing

    # yamnet start testing
    import os
    import resampy  # needed for the resampling below
    import soundfile as sf
    import params as yamnet_params
    import yamnet as yamnet_model
    from scipy.io import wavfile
    params = yamnet_params.Params()
    yamnet = yamnet_model.yamnet_frames_model(params)
    if not os.path.exists('yamnet.h5'):
        print('Error: yamnet.h5 not found. Download it with:\n'
              '  curl -O https://storage.googleapis.com/audioset/yamnet.h5')
        exit()
    yamnet.load_weights('yamnet.h5')
    yamnet_classes = yamnet_model.class_names('yamnet_class_map.csv')

    import pygame
    pygame.init()
    screen = pygame.display.set_mode((640, 480))
    font_header = pygame.font.Font(pygame.font.get_default_font(), 36)
    font = pygame.font.Font(pygame.font.get_default_font(), 36 * 2)

    text_surface = font.render('Hello world', True, (0, 0, 0))
    GRAY = (200, 200, 200)
    # yamnet end testing
    with recorder:
        last_detection = -1
        while not timed_out:
            # Grab a fixed-size chunk of samples (7921 is roughly 0.5 s
            # at 16 kHz).
            audio_sample = recorder.get_audio(7921)[0]
            # Disabled path: round-trip through a WAV file for debugging.
            if False:
                wavfile.write('test.wav', 16000, audio_sample)
                wav_data, sr = sf.read('test.wav', dtype=np.int16)
            else:
                wav_data = np.array(audio_sample, dtype=np.int16)
                sr = AUDIO_SAMPLE_RATE_HZ
            assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype
            waveform = wav_data / 32768.0  # Convert to [-1.0, +1.0]
            waveform = waveform.astype('float32')
            # Convert to mono and the sample rate expected by YAMNet.
            if len(waveform.shape) > 1:
                waveform = np.mean(waveform, axis=1)
            if sr != params.sample_rate:
                waveform = resampy.resample(waveform, sr, params.sample_rate)
            print('-------')
            # Predict YAMNet classes.
            scores, embeddings, spectrogram = yamnet(waveform)
            # Scores is a matrix of (time_frames, num_classes) classifier scores.
            # Average them along time to get an overall classifier output for the clip.
            prediction = np.mean(scores, axis=0)
            # Report the highest-scoring classes and their scores.
            top5_i = np.argsort(prediction)[::-1][:5]
            print(':\n' + '\n'.join(
                '  {:12s}: {:.3f}'.format(yamnet_classes[i], prediction[i])
                for i in top5_i))
            # Hard-coded YAMNet class indices of interest
            # (see yamnet_class_map.csv).
            print('{}:{:.3f}'.format(yamnet_classes[42], prediction[42]))
            print('{}:{:.3f}'.format(yamnet_classes[0], prediction[0]))
            print('{}:{:.3f}'.format(yamnet_classes[494], prediction[494]))

            target_predictions = prediction[42], prediction[0], prediction[494]
            target_classes = (yamnet_classes[42], yamnet_classes[0],
                              yamnet_classes[494])
            index = np.argmax(target_predictions)
            black = (0, 0, 0)
            green = (0, 255, 0)
            red = (255, 0, 0)
            if index == 0:
                color = red
            elif index == 1:
                color = green
            else:
                color = black
            text1 = font.render(target_classes[index], True, color)
            header1 = font_header.render('R-zero Device Listening for Audio',
                                         True, (0, 0, 0))
            screen.fill(GRAY)
            screen.blit(header1, dest=(20, 100))
            screen.blit(text1, dest=(200, 200))
            pygame.display.update()
            # Disabled Tkinter display alternative:
            '''
            line = '{}:{:.3f}'.format(yamnet_classes[42], prediction[42])
            label = Tk.Label(None, text=line, font=('Times', '18'), fg='blue')
            label.pack()
            label.mainloop()
            '''
            # End