def yamnet_grad_test():
    """Nudges a 440 Hz sine toward YAMNet class 0 ('Speech') via input gradients."""
    # Three seconds of a 440 Hz sine at 16 kHz, shaped [1, num_samples].
    waveform = np.reshape(
        np.sin(2 * np.pi * 440 * np.linspace(0, 3, num=int(3 * 16000))),
        [1, -1])

    print(waveform[0])
    wavfile.write('sine.wav', 16000, waveform[0])
    model = yamnet_frames_model(params)
    model.load_weights('yamnet.h5')
    classes = class_names('yamnet_class_map.csv')

    with tf.GradientTape() as grad_tape:
        audio_tensor = tf.convert_to_tensor(np.reshape(waveform, [1, -1]), dtype=tf.float32)  # YAMNet expects float32.
        print(f'Audio Tensor is: {type(audio_tensor)}')
        grad_tape.watch(audio_tensor)
        # scores, spectrograms = model.predict(audio_tensor, steps=1)
        scores, spectrograms = model(audio_tensor)
        print(f'Scores is: {type(scores)}')

        target_scores = scores.numpy()
        assert target_scores.shape == scores.shape
        target_scores[:, 0] = 1
        target_scores = tf.convert_to_tensor(target_scores)

        loss = tf.keras.losses.MSE(target_scores, scores)

    gradient_tensor = grad_tape.gradient(loss, audio_tensor)
    print(scores[0])
    print(classes[np.argsort(scores[0])[-3:]])
    print(gradient_tensor.shape)
    print(audio_tensor.shape)

    output_tensor = audio_tensor - 1000 * gradient_tensor  # Step against the loss gradient to push scores toward the target.
    wavfile.write('speechy.wav', 16000, output_tensor[0].numpy())
    wavfile.write('grad.wav', 16000, 1000 * gradient_tensor[0].numpy())
Example #2
def main(argv):
    assert argv, 'Usage: inference.py <wav file> <wav file> ...'

    params = yamnet_params.Params()
    yamnet = yamnet_model.yamnet_frames_model(params)
    yamnet.load_weights('yamnet.h5')
    yamnet_classes = yamnet_model.class_names('yamnet_class_map.csv')

    for file_name in argv:
        # Decode the WAV file.
        wav_data, sr = sf.read(file_name, dtype=np.int16)
        assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype
        waveform = wav_data / 32768.0  # Convert to [-1.0, +1.0]
        waveform = waveform.astype('float32')

        # Convert to mono and the sample rate expected by YAMNet.
        if len(waveform.shape) > 1:
            waveform = np.mean(waveform, axis=1)
        if sr != params.sample_rate:
            waveform = resampy.resample(waveform, sr, params.sample_rate)

        # Predict YAMNet classes.
        scores, embeddings, spectrogram = yamnet(waveform)
        # Scores is a matrix of (time_frames, num_classes) classifier scores.
        # Average them along time to get an overall classifier output for the clip.
        prediction = np.mean(scores, axis=0)
        # Report the highest-scoring classes and their scores.
        top5_i = np.argsort(prediction)[::-1][:5]
        print(
            file_name, ':\n' + '\n'.join(
                '  {:12s}: {:.3f}'.format(yamnet_classes[i], prediction[i])
                for i in top5_i))
def check_model(model_fn, class_map_path, params):
  """Applies yamnet_test's sanity checks to an instance of YAMNet."""
  yamnet_classes = yamnet.class_names(class_map_path)

  def clip_test(waveform, expected_class_name, top_n=10):
    predictions, embeddings, log_mel_spectrogram = model_fn(waveform)
    clip_predictions = np.mean(predictions, axis=0)
    top_n_indices = np.argsort(clip_predictions)[-top_n:]
    top_n_scores = clip_predictions[top_n_indices]
    top_n_class_names = yamnet_classes[top_n_indices]
    top_n_predictions = list(zip(top_n_class_names, top_n_scores))
    assert expected_class_name in top_n_class_names, (
        'Did not find expected class {} in top {} predictions: {}'.format(
            expected_class_name, top_n, top_n_predictions))

  clip_test(
      waveform=np.zeros((int(3 * params.sample_rate),), dtype=np.float32),
      expected_class_name='Silence')

  np.random.seed(51773)  # Ensure repeatability.
  clip_test(
      waveform=np.random.uniform(-1.0, +1.0,
                                 (int(3 * params.sample_rate),)).astype(np.float32),
      expected_class_name='White noise')

  clip_test(
      waveform=np.sin(2 * np.pi * 440 *
                      np.arange(0, 3, 1 / params.sample_rate), dtype=np.float32),
      expected_class_name='Sine wave')
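
For context, a hedged driver for the checks above, assuming the yamnet/params modules and weight files used throughout these examples (this harness is not part of the original snippet):

import params as yamnet_params
import yamnet

test_params = yamnet_params.Params()
model = yamnet.yamnet_frames_model(test_params)
model.load_weights('yamnet.h5')
# Run the silence, white-noise, and sine-wave sanity checks defined above.
check_model(model, 'yamnet_class_map.csv', test_params)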
Example #4
def main(argv):
  assert argv

  graph = tf.Graph()
  with graph.as_default():
    yamnet = yamnet_model.yamnet_frames_model(params)
    yamnet.load_weights('yamnet.h5')
  yamnet_classes = yamnet_model.class_names('yamnet_class_map.csv')

  for file_name in argv:
    # Decode the WAV file.
    wav_data, sr = sf.read(file_name, dtype=np.int16)
    assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype
    waveform = wav_data / 32768.0  # Convert to [-1.0, +1.0]

    # Convert to mono and the sample rate expected by YAMNet.
    if len(waveform.shape) > 1:
      waveform = np.mean(waveform, axis=1)
    if sr != params.SAMPLE_RATE:
      waveform = resampy.resample(waveform, sr, params.SAMPLE_RATE)

    # Predict YAMNet classes.
    # Second output is log-mel-spectrogram array (used for visualizations).
    # (steps=1 is a workaround for Keras batching limitations.)
    with graph.as_default():
      scores, _ = yamnet.predict(np.reshape(waveform, [1, -1]), steps=1)
    # Scores is a matrix of (time_frames, num_classes) classifier scores.
    # Average them along time to get an overall classifier output for the clip.
    prediction = np.mean(scores, axis=0)
    # Report the highest-scoring classes and their scores.
    top5_i = np.argsort(prediction)[::-1][:5]
    print(file_name, ':\n' + 
          '\n'.join('  {:12s}: {:.3f}'.format(yamnet_classes[i], prediction[i])
                    for i in top5_i))
    def embedding(self, input_paths, output_paths, embed_paths=""):
        """Extract YAMnet features with opensmile using a single process."""
        if embed_paths == "":
            embed_paths = [""] * len(input_paths)
            save_embedding = False
        else:
            save_embedding = True

        paths = list(zip(input_paths, embed_paths, output_paths))

        params = yamnet_params.Params(sample_rate=self.sample_rate,
                                      patch_hop_seconds=0.48)

        class_names = yamnet_model.class_names(self.class_names)
        yamnet = yamnet_model.yamnet_frames_model(params)
        yamnet.load_weights(self.model_checkpoint)

        func = partial(
            self._embed,
            yamnet=yamnet,
            params=params,
            class_names=class_names,
            save_embedding=save_embedding,
        )

        self.single_process(func, paths)
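
The per-file worker self._embed is bound above but not shown. Purely as a hypothetical illustration of the shape it would need (one (input, embed, output) path triple per call, plus the bound keyword arguments; resampling and output writing omitted):

def _embed(self, paths, yamnet, params, class_names, save_embedding):
    # Hypothetical sketch; the real implementation is not in this snippet.
    input_path, embed_path, output_path = paths
    waveform, sr = sf.read(input_path, dtype='float32')
    scores, embeddings, _ = yamnet(waveform)
    if save_embedding:
        np.save(embed_path, embeddings.numpy())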
Example #6
 def setUpClass(cls):
     super(YAMNetTest, cls).setUpClass()
     cls._yamnet_graph = tf.Graph()
     with cls._yamnet_graph.as_default():
         cls._yamnet = yamnet.yamnet_frames_model(params)
         cls._yamnet.load_weights('yamnet.h5')
         cls._yamnet_classes = yamnet.class_names('yamnet_class_map.csv')
Example #7
 def __init__(self):
     physical_devices = tf.config.experimental.list_physical_devices('GPU')
     tf.config.experimental.set_virtual_device_configuration(
         physical_devices[0], [
             tf.config.experimental.VirtualDeviceConfiguration(
                 memory_limit=4096)
         ])
     self.graph = tf.Graph()
     with self.graph.as_default():
         self.yamnet = yamnet_model.yamnet_frames_model(params)
         self.yamnet.load_weights('yamnet/yamnet.h5')
     self.yamnet_classes = yamnet_model.class_names(
         'yamnet/yamnet_class_map.csv')
def main(argv):
    assert argv

    model = tf.saved_model.load('model')

    metadata_fn = model.signatures["metadata"]
    metadata = metadata_fn()
    print('metadata', metadata)

    score_fn = model.signatures["score"]
    print(score_fn)

    yamnet_classes = yamnet_model.class_names('yamnet_class_map.csv')

    print(yamnet_classes)

    for file_name in argv:
        # Decode the WAV file.
        wav_data, sr = sf.read(file_name, dtype=np.int16)
        assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype
        waveform = wav_data / 32768.0  # Convert to [-1.0, +1.0]

        # Convert to mono and the sample rate expected by YAMNet.
        if len(waveform.shape) > 1:
            waveform = np.mean(waveform, axis=1)
        if sr != params.SAMPLE_RATE:
            waveform = resampy.resample(waveform, sr, params.SAMPLE_RATE)

        waveform = tf.expand_dims(
            tf.expand_dims(tf.constant(waveform, dtype=tf.float32), 0), 2)

        scores = next(
            iter(
                score_fn(
                    waveform=waveform,
                    context_step_samples=tf.constant(int(
                        params.PATCH_HOP_SECONDS * params.SAMPLE_RATE),
                                                     dtype=tf.int64),
                ).values())).numpy()

        print(scores)

        # Scores is a matrix of (time_frames, num_classes) classifier scores.
        # Average them along time to get an overall classifier output for the clip.
        prediction = np.mean(scores[0], axis=0)
        # Report the highest-scoring classes and their scores.
        top5_i = np.argsort(prediction)[::-1][:5]
        print(
            file_name, ':\n' + '\n'.join(
                '  {:12s}: {:.5f}'.format(yamnet_classes[i], prediction[i])
                for i in top5_i))
Example #9
    def __init__(self, config_path="./config.yaml"):
        """Init method for the Searcher."""
        super().__init__()
        # Load the configuration
        conf = OmegaConf.load(config_path)
        self.dataset_path = conf.dataset_path
        self.audio_path = os.path.join(conf.dataset_path, "podcasts-audio")

        self.es_url = conf.search_es_url  # URL of Elasticsearch to query
        self.es_num = conf.search_es_num  # Number of segments to request from Elasticsearch
        self.sample_rate = 44100  # Hardcoded sample rate of all podcast audio

        # Load the podcast metadata
        self.metadata = load_metadata(self.dataset_path)

        # Set up the reranking model
        self.rerank_tokenizer = AutoTokenizer.from_pretrained(
            conf.search_rerank_model,
            use_fast=True,
            cache_dir=conf.search_cache_dir)
        self.rerank_model = AutoModelForSequenceClassification.from_pretrained(
            conf.search_rerank_model, cache_dir=conf.search_cache_dir)
        self.rerank_model.to("cpu", non_blocking=True)
        self.rerank_max_seq_len = 512

        # Set up the openSMILE extractor
        self.smile = opensmile.Smile(
            feature_set=opensmile.FeatureSet.eGeMAPSv02,
            feature_level=opensmile.FeatureLevel.Functionals,
            options={
                "frameModeFunctionalsConf":
                os.path.join(
                    os.getenv("PODCAST_PATH"),
                    "data/custom_FrameModeFunctionals.conf.inc",
                )
            },
        )

        # Set up the YAMNet model
        params = yamnet_params.Params(sample_rate=self.sample_rate,
                                      patch_hop_seconds=0.48)
        self.yamnet_classes = yamnet_model.class_names(
            os.path.join(os.getenv("YAMNET_PATH"), "yamnet_class_map.csv"))
        self.yamnet_model = yamnet_model.yamnet_frames_model(params)
        self.yamnet_model.load_weights(
            os.path.join(os.getenv("PODCAST_PATH"), "data/yamnet.h5"))
def main(argv):
  assert argv, 'Usage: inference.py <wav file> <wav file> ...'

  model_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'yamnet.h5')
  classes_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'yamnet_class_map.csv')
  event_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'event.json')
  
  params = yamnet_params.Params()
  yamnet = yamnet_model.yamnet_frames_model(params)
  yamnet.load_weights(model_path)
  yamnet_classes = yamnet_model.class_names(classes_path)

  for file_name in argv:
    # Decode the WAV file.
    wav_data, sr = sf.read(file_name, dtype=np.int16)
    assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype
    waveform = wav_data / 32768.0  # Convert to [-1.0, +1.0]
    waveform = waveform.astype('float32')

    # Convert to mono and the sample rate expected by YAMNet.
    if len(waveform.shape) > 1:
      waveform = np.mean(waveform, axis=1)
    if sr != params.sample_rate:
      waveform = resampy.resample(waveform, sr, params.sample_rate)

    # Predict YAMNet classes.
    scores, embeddings, spectrogram = yamnet(waveform)
    # Scores is a matrix of (time_frames, num_classes) classifier scores.
    # Average them along time to get an overall classifier output for the clip.
    prediction = np.mean(scores, axis=0)
    # Report the highest-scoring classes and their scores.
    top5_i = np.argsort(prediction)[::-1][:5]
    print(file_name, ':\n' +
          '\n'.join('  {:12s}: {:.3f}'.format(yamnet_classes[i], prediction[i])
                    for i in top5_i))
    
    # Print all classes as a sorted list of {label, value} records.
    b = prediction.tolist()  # nested list with the same data and indices
    pred = []
    for i, cls in enumerate(yamnet_classes):
      item = {}
      item['label'] = cls
      item['value'] = round(b[i], 6)
      pred.append(item)
    pred = sorted(pred, key=lambda x: x['value'], reverse=True)
    # Save the sorted predictions as JSON.
    json.dump(pred, codecs.open(event_path, 'w', encoding='utf-8'),
              separators=(',', ':'), sort_keys=True, indent=4)
Example #11
def main(argv):
    global analysisdata, frame_counter
    log = open('/tmp/sound.log', 'w')
    # Set up yamnet
    params = yamnet_params.Params(sample_rate=ANALYSIS_SAMPLE_RATE,
                                  patch_hop_seconds=0.1)
    yamnet = yamnet_model.yamnet_frames_model(params)
    yamnet.load_weights('/home/pi/models/research/audioset/yamnet/yamnet.h5')
    yamnet_classes = yamnet_model.class_names(
        '/home/pi/models/research/audioset/yamnet/yamnet_class_map.csv')
    # Set up a live callback stream from the microphone
    stream = sd.InputStream(device=1,
                            channels=1,
                            samplerate=RECORD_SAMPLE_RATE,
                            callback=audio_callback,
                            blocksize=BUFFER_SIZE_F)
    with stream:
        while True:
            update_analysis_window()
            if (frame_counter >= int(
                    ANALYSIS_LENGTH_S * ANALYSIS_SAMPLE_RATE)):
                frame_counter = 0
                scores = yamnet.predict(analysisdata, steps=1)[0]
                if (len(scores)):
                    prediction = np.mean(scores, axis=0)
                    top1_i = np.argsort(prediction)[::-1][:1]
                    for x in top1_i:
                        if (prediction[x] > THRESHOLD):
                            top_class_str = yamnet_classes[x]
                            # Write any detected class (outside these noisy ones) to the log
                            if (not top_class_str in [
                                    "Fireworks", "Silence",
                                    "Inside, small room"
                            ]):
                                log.write("[%s] %s %0.4f\n" %
                                          (datetime.now().strftime(
                                              "%m/%d/%Y %H:%M:%S"),
                                           top_class_str, prediction[x]))
                                log.flush()
                                # And if it's one of the doorbell ones, ping the homebridge server
                                if (top_class_str in [
                                        "Beep, bleep", "Doorbell", "Glass",
                                        "Ding"
                                ]):
                                    trigger_homekit_motion()
def main():

    # Load yamnet
    yamnet = yamnet_model.yamnet_frames_model(params)
    yamnet.load_weights('yamnet.h5')

    # Clean up class names for use as signature keys.
    class_names = [
        re.sub(r'\ |\(|\)|,|-|\'', '', x.lower())
        for x in yamnet_model.class_names('yamnet_class_map.csv')
    ]

    frame = RfcxFrame(yamnet, params.SAMPLE_RATE, params.PATCH_WINDOW_SECONDS,
                      class_names, 'pcm_s16le')
    tf.saved_model.save(frame,
                        'model',
                        signatures={
                            "score": frame.score,
                            "metadata": frame.metadata
                        })
Example #13
    def load_model(self, layer=None):
        """
        Loads the YAMNet model and returns a 'dreamer' model that outputs
        the activations of the specified layer.

        Parameters
        ----------
        layer (string) : a specified layer

        If `layer` is not specified, the last layer is used instead.

        Returns
        -------
        (tf.keras.Model) : the dreamer model
        """

        # load its class names
        self.class_names = yamnet.class_names(self.class_file)
        self.class_names_tensor = tf.constant(self.class_names)
        # load model parameters and get model
        self.params = params.Params(sample_rate=self.sr,
                                    patch_hop_seconds=self.patch_hop)
        self.model = yamnet.yamnet_frames_model(self.params)
        # load model weights
        self.model.load_weights(self.weights_file)
        if layer is not None:
            self.layername = layer
        else:
            self.__print__("Using last layer.")
            self.layername = self.model.layers[-1].name
        self.__print__(f"Yamnet loaded, using layer:{self.layername}")
        # Get the specified layer
        self.layers = self.model.get_layer(self.layername).output
        # Finally, create the dreamer model
        self.dreamer = tf.keras.Model(inputs=self.model.input,
                                      outputs=self.layers)
        self.__print__("Dreamer started.")
        return self.dreamer
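
A hedged usage sketch for the method above; the instance name obj is illustrative, and the 1-D float32 waveform matches how the other new-style snippets on this page call yamnet_frames_model:

dreamer = obj.load_model()  # no layer given, so the last layer is used
waveform = np.zeros(int(3 * 16000), dtype=np.float32)  # 3 s of silence at 16 kHz
activations = dreamer(waveform)  # activations of the chosen layer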
def main(argv):
    assert argv

    # Load the TFLite model and allocate tensors.
    interpreter = tf.lite.Interpreter(model_path="yamnet.tflite")
    interpreter.allocate_tensors()
    inputs = interpreter.get_input_details()
    outputs = interpreter.get_output_details()

    yamnet_classes = yamnet_model.class_names('yamnet_class_map.csv')

    for file_name in argv:
        # Decode the WAV file.
        wav_data, sr = sf.read(file_name, dtype=np.int16)
        assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype
        waveform = wav_data / 32768.0  # Convert to [-1.0, +1.0]

        # Convert to mono and the sample rate expected by YAMNet.
        if len(waveform.shape) > 1:
            waveform = np.mean(waveform, axis=1)
        if sr != params.SAMPLE_RATE:
            waveform = resampy.resample(waveform, sr, params.SAMPLE_RATE)

        # Predict YAMNet classes.
        interpreter.set_tensor(
            inputs[0]['index'],
            np.expand_dims(np.array(waveform, dtype=np.float32), axis=0))
        interpreter.invoke()
        scores = interpreter.get_tensor(outputs[0]['index'])

        # Scores is a matrix of (time_frames, num_classes) classifier scores.
        # Average them along time to get an overall classifier output for the clip.
        prediction = np.mean(scores, axis=0)
        # Report the highest-scoring classes and their scores.
        top5_i = np.argsort(prediction)[::-1][:5]
        print(
            file_name, ':\n' + '\n'.join(
                '  {:12s}: {:.3f}'.format(yamnet_classes[i], prediction[i])
                for i in top5_i))
Example #15
    'Liquid',
    'Water',
]  # 'Water', 'Pour', 'Drip'
waterKeys = ['Water tap, faucet', 'Sink (filling or washing)']
signals = dict.fromkeys(keys, 0.0)
picked = dict.fromkeys(keys, 0.0)
detected = dict.fromkeys(keys, False)
detectThreshold = 0.65
checkThreshold = 0.25
resetThreshold = 0.05

# Set up the YAMNet model.
params.PATCH_HOP_SECONDS = 0.48  # ~2 Hz scores frame rate (0.1 would give 10 Hz).
yamnet = yamnet_model.yamnet_frames_model(params)
yamnet.load_weights('yamnet.h5')
class_names = yamnet_model.class_names('yamnet_class_map.csv')

CHUNKSIZE = 16000  # fixed chunk size
sr = 16000
seconds = 1
predictionPeriod = 2.0
predictionRate = 2.0
predChunkSize = int(sr * predictionPeriod)
readChunkSize = int(sr * predictionRate)

duration = 50

frames = []
last5secFrames = []
old5secFrames = []
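
The three thresholds above suggest a latch-style detector: a class fires above detectThreshold, stays plausible above checkThreshold, and clears below resetThreshold. The loop that applies them is not included in this snippet; a minimal hypothetical update step could look like:

def update_detection(key, score):
    # Hypothetical hysteresis; the real loop is not part of this snippet.
    signals[key] = score
    if score > detectThreshold:
        detected[key] = True
        picked[key] = max(picked[key], score)
    elif detected[key] and score < resetThreshold:
        detected[key] = False
    # Scores between checkThreshold and detectThreshold leave the state unchanged.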
Example #16
def main(argv):

    params = yamnet_params.Params()
    yamnet = yamnet_model.yamnet_frames_model(params)
    yamnet.load_weights('yamnet.h5')
    yamnet_classes = yamnet_model.class_names('yamnet_class_map.csv')

    for file_name in argv:
        # Decode the WAV file.
        wav_data, sr = sf.read(file_name, always_2d=False, dtype=np.int16)
        assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype

        print('waveform original data', wav_data.shape)

        waveform = wav_data / 32768.0  # Convert to [-1.0, +1.0]
        waveform = waveform.astype('float32')
        print('waveform normal data', waveform.shape)

        print('sampling rate', sr)
        print('sampling rate model params', params.sample_rate)

        # Convert to mono and the sample rate expected by YAMNet.
        if len(waveform.shape) > 1:
            print('entered')
            waveform = np.mean(waveform, axis=1)
        if sr != params.sample_rate:
            waveform = resampy.resample(waveform, sr, params.sample_rate)

        print(waveform.shape, min(waveform))
        # plt.figure(figsize=(20, 8))
        # plt.plot(waveform)
        # plt.xlabel('Samples')
        # plt.ylabel('Amplitude')
        # # plt.savefig('waveform.png')
        # plt.show()
        # plt.close()

        # fig, ax = plt.subplots(figsize=(20, 8))
        fig = plt.figure()
        ax = plt.axes(xlim=(0, len(waveform)), ylim=(-0.16, 0.17))

        line, = ax.plot([], [], lw=1)

        def init():
            line.set_data([], [])
            return line,

        def animate(i):
            # Reveal the waveform progressively, one chunk per frame.
            n = int(len(waveform) * (i + 1) / 200)
            line.set_data(np.arange(n), waveform[:n])
            return line,

        anim = FuncAnimation(fig,
                             animate,
                             init_func=init,
                             frames=200,
                             interval=20,
                             blit=True)

        plt.draw()
        plt.show()
Example #17
with open('credentials.txt', 'r') as file:
    credentials = json.loads(file.read())

Baby_Crying = False
## Account variables
account_sid = credentials['account_sid']
auth_token = credentials['auth_token']
messaging_service_sid = credentials['messaging_service_sid']


## Get TensorFlow model
graph = tf.Graph()
with graph.as_default():
    yamnet = yamnet_model.yamnet_frames_model(params)
    yamnet.load_weights('models/research/audioset/yamnet/yamnet.h5')
    yamnet_classes = yamnet_model.class_names('models/research/audioset/yamnet/yamnet_class_map.csv')

## Set parameters for PyAudio
RATE=44100
RECORD_SECONDS = 5
CHUNKSIZE = 4096
p = pyaudio.PyAudio()
stream = p.open(format=pyaudio.paInt16, channels=1, rate=RATE, input=True, frames_per_buffer=CHUNKSIZE)

def get_audio_stream():
    stream.start_stream()
    frames = [] # A python-list of chunks(numpy.ndarray)
    print('\n***************\n***************\n***recording***\n***************\n***************\n')
    for _ in range(0, int(RATE / CHUNKSIZE * RECORD_SECONDS)):
        data = stream.read(CHUNKSIZE)
        frames.append(numpy.frombuffer(data, dtype=numpy.int16))  # fromstring is deprecated for binary data
ip = input_pipeline.InputPipeline(batch_size=32, buffer_size=100)
ip.setup_paths(paths)
ip.setup_labels_cough(labels_cough)
ip.make_datasets()

import params
import yamnet as yamnet_model
import importlib
importlib.reload(yamnet_model)
import tflite_compat
importlib.reload(tflite_compat)

params.BATCH_SIZE = ip.batch_size
yamnet = yamnet_model.yamnet_frames_model(params)
yamnet.load_weights(paths['yamnet_weights'])
yamnet_classes = yamnet_model.class_names(paths['yamnet_classes'])

embeddings = {}
# This loops in a slightly roundabout way because, for tflite
# compatibility, YAMNet requires a fixed length and a fixed batch size.
# So it repeats until every id has survived the random drop used to make
# fixed batch sizes. Also, since the length has to match the 10 s of
# AudioSet, the FSD data is either cropped to 10 s or padded with zeros.
for ds in ['fsd']:
    embeddings[ds] = {}
    print(f'{ds}')
    for split in ['train', 'test', 'val']:
        print(f'\t{split}')
        embeddings[ds][split] = {}
        n_processed = 0
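
As a concrete illustration of the crop-or-pad step described in the comment above (the helper name and fixed length are assumptions, not code from this snippet):

def crop_or_pad(waveform, num_samples):
    # Crop to the fixed AudioSet-style clip length, or zero-pad short clips.
    if len(waveform) >= num_samples:
        return waveform[:num_samples]
    return np.pad(waveform, (0, num_samples - len(waveform)))

# e.g. 10 s at 16 kHz: crop_or_pad(waveform, 10 * 16000)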
Example #19
def classify_audio(audio_device_index,
                   interpreter,
                   labels_file,
                   commands_file=None,
                   result_callback=None,
                   detection_callback=None,
                   sample_rate_hz=16000,
                   negative_threshold=0.6,
                   num_frames_hop=33):
    """Acquire audio, preprocess, and classify."""
    # Initialize recorder.
    AUDIO_SAMPLE_RATE_HZ = sample_rate_hz
    downsample_factor = 1
    if AUDIO_SAMPLE_RATE_HZ == 48000:
        downsample_factor = 3
    # Most microphones support 48 kHz capture; because the model expects
    # 16 kHz audio, we downsample 3-fold in that case.
    recorder = audio_recorder.AudioRecorder(
        AUDIO_SAMPLE_RATE_HZ,
        downsample_factor=downsample_factor,
        device_index=audio_device_index)
    feature_extractor = Uint8LogMelFeatureExtractor(
        num_frames_hop=num_frames_hop)
    labels = read_labels(labels_file)
    if commands_file:
        commands = read_commands(commands_file)
    else:
        commands = {}
    logger.info("Loaded commands: %s", str(commands))
    logger.info("Recording")
    timed_out = False

    # Testing
    if False:
        sample_data = 'data/mini_speech_commands/down/e71b4ce6_nohash_1.wav'

        import tensorflow as tf
        import os

        def decode_audio(audio_binary):
            audio, _ = tf.audio.decode_wav(audio_binary)
            return tf.squeeze(audio, axis=-1)

        def get_label(file_path):
            parts = tf.strings.split(file_path, os.path.sep)

            # Note: You'll use indexing here instead of tuple unpacking to enable this
            # to work in a TensorFlow graph.
            return parts[-2]

        def get_waveform_and_label(file_path):
            label = get_label(file_path)
            audio_binary = tf.io.read_file(file_path)
            waveform = decode_audio(audio_binary)
            return waveform, label

        waveform, label = get_waveform_and_label(sample_data)
        print(waveform.shape)
    # End Testing

    # yamnet start testing
    import os
    import soundfile as sf
    import params as yamnet_params
    import yamnet as yamnet_model
    from scipy.io import wavfile
    params = yamnet_params.Params()
    yamnet = yamnet_model.yamnet_frames_model(params)
    if not os.path.exists('yamnet.h5'):
        print('Error: yamnet.h5 not found. Fetch it with: '
              'curl -O https://storage.googleapis.com/audioset/yamnet.h5')
        exit()
    yamnet.load_weights('yamnet.h5')
    yamnet_classes = yamnet_model.class_names('yamnet_class_map.csv')

    import pygame
    pygame.init()
    screen = pygame.display.set_mode((640, 480))
    font_header = pygame.font.Font(pygame.font.get_default_font(), 36)
    font = pygame.font.Font(pygame.font.get_default_font(), 36 * 2)

    text_surface = font.render('Hello world', True, (0, 0, 0))
    GRAY = (200, 200, 200)
    # yamnet end testing
    with recorder:
        last_detection = -1
        while not timed_out:
            audio_sample = recorder.get_audio(7921)[0]
            if False:
                wavfile.write('test.wav', 16000, audio_sample)
                wav_data, sr = sf.read('test.wav', dtype=np.int16)
            else:
                wav_data = np.array(audio_sample, dtype=np.int16)
                sr = AUDIO_SAMPLE_RATE_HZ
            assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype
            waveform = wav_data / 32768.0  # Convert to [-1.0, +1.0]
            waveform = waveform.astype('float32')
            # Convert to mono and the sample rate expected by YAMNet.
            if len(waveform.shape) > 1:
                waveform = np.mean(waveform, axis=1)
            if sr != params.sample_rate:
                waveform = resampy.resample(waveform, sr, params.sample_rate)
            print('-------')
            # Predict YAMNet classes.
            scores, embeddings, spectrogram = yamnet(waveform)
            # Scores is a matrix of (time_frames, num_classes) classifier scores.
            # Average them along time to get an overall classifier output for the clip.
            prediction = np.mean(scores, axis=0)
            # Report the highest-scoring classes and their scores.
            top5_i = np.argsort(prediction)[::-1][:5]
            print(':\n' + '\n'.join(
                '  {:12s}: {:.3f}'.format(yamnet_classes[i], prediction[i])
                for i in top5_i))
            print('{}:{:.3f}'.format(yamnet_classes[42], prediction[42]))
            print('{}:{:.3f}'.format(yamnet_classes[0], prediction[0]))
            print('{}:{:.3f}'.format(yamnet_classes[494], prediction[494]))

            target_predictions = prediction[42], prediction[0], prediction[494]
            target_classes = yamnet_classes[42], yamnet_classes[
                0], yamnet_classes[494]
            index = np.argsort(target_predictions)[::-1][0]
            black = (0, 0, 0)
            green = (0, 255, 0)
            red = (255, 0, 0)
            if index == 0:
                color = red
            elif index == 1:
                color = green
            else:
                color = black
            text1 = font.render(target_classes[index], True, color)
            header1 = font_header.render('R-zero Device Listening for Audio',
                                         True, (0, 0, 0))
            screen.fill(GRAY)
            screen.blit(header1, dest=(20, 100))
            screen.blit(text1, dest=(200, 200))
            pygame.display.update()
            '''
      line = '{}:{:.3f}'.format(yamnet_classes[42], prediction[42])
      label = Tk.Label(None, text=line, font=('Times', '18'), fg='blue')
      label.pack()
      label.mainloop()
      '''
            # End
            """
Example #20
def main(argv):
    assert argv, 'Usage: inference.py <wav file> <wav file> ...'

    params = yamnet_params.Params()
    yamnet = yamnet_model.yamnet_frames_model(params)
    yamnet.load_weights('yamnet.h5')
    yamnet_classes = yamnet_model.class_names('yamnet_class_map.csv')

    for file_name in argv:
        # Decode the WAV file.
        wav_data, sr = sf.read(file_name, dtype=np.int16)
        assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype

        print('waveform original data', wav_data.shape)

        waveform = wav_data / 32768.0  # Convert to [-1.0, +1.0]
        waveform = waveform.astype('float32')
        print('waveform normal data', waveform.shape)

        print('sampling rate', sr)
        print('sampling rate model params', params.sample_rate)

        # Convert to mono and the sample rate expected by YAMNet.
        if len(waveform.shape) > 1:
            print('entered')
            waveform = np.mean(waveform, axis=1)
        if sr != params.sample_rate:
            waveform = resampy.resample(waveform, sr, params.sample_rate)

        # plt.figure(figsize=(20, 8))
        # plt.plot(waveform)
        # plt.xlabel('Samples')
        # plt.ylabel('Amplitude')
        # # plt.savefig('waveform.png')
        # plt.show()
        # plt.close()

        print('waveform sample data', waveform.shape)
        # Predict YAMNet classes.
        scores, embeddings, spectrogram = yamnet(waveform)
        print('scores', scores)
        # Scores is a matrix of (time_frames, num_classes) classifier scores.
        # Average them along time to get an overall classifier output for the clip.
        prediction = np.mean(scores, axis=0)
        # Report the highest-scoring classes and their scores.
        top5_i = np.argsort(prediction)[::-1][:5]
        print(
            file_name, ':\n' + '\n'.join(
                '  {:12s}: {:.3f}'.format(yamnet_classes[i], prediction[i])
                for i in top5_i))

        truth_labels = [yamnet_classes[i] for i in top5_i]
        print('ground-truth labels', truth_labels)
        total_time = 0

        # plt.figure(figsize=(20, 8))
        # plt.plot(scores[:,282].numpy(),label='water')
        # plt.plot(scores[:,364].numpy(),label='faucet')
        # plt.plot(scores[:,365].numpy(),label='sink')
        # plt.legend()
        # plt.show()
        # plt.close()

        for i in range(len(scores)):
            pred = scores[i]

            water_prob = pred[282].numpy()
            print('water_prob', water_prob)
            top5_i = np.argsort(pred)[::-1][:5]
            print(
                file_name, ':\n' +
                '\n'.join('  {:12s}: {:.3f}'.format(yamnet_classes[i], pred[i])
                          for i in top5_i))

            pred_class = yamnet_classes[top5_i[0]]
            print(pred_class)
            if pred_class in truth_labels:
                total_time += 0.96

        print('total time', total_time / 2)
Example #21

import numpy as np
import resampy
import soundfile as sf
import tensorflow as tf

import params
import yamnet as yamnet_model

graph = tf.Graph()
with graph.as_default():
    yamnet = yamnet_model.yamnet_frames_model(params)
    yamnet.load_weights("./checkpoint")

yamnet_classes = yamnet_model.class_names("yamnet_class_map.csv")


def read_wav(w, max_audio_time=30):
    wav_data, sr = sf.read(w, dtype=np.int16)
    waveform = wav_data / 32768.0

    if len(waveform.shape) > 1:
        waveform = np.mean(waveform, axis=1)

    waveform = waveform[:max_audio_time * sr]  # Truncate to max_audio_time seconds at the native rate.

    if sr != params.SAMPLE_RATE:
        waveform = resampy.resample(waveform, sr, params.SAMPLE_RATE)

    return waveform
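
A hedged usage example, mirroring the graph-mode prediction pattern used elsewhere on this page (the file name is illustrative):

waveform = read_wav('some_clip.wav', max_audio_time=30)
with graph.as_default():
    scores, _ = yamnet.predict(np.reshape(waveform, [1, -1]), steps=1)
prediction = np.mean(scores, axis=0)
print(yamnet_classes[np.argmax(prediction)])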
Example #22
def load_model():
    params = yamnet_params.Params()
    yamnet = yamnet_model.yamnet_frames_model(params)
    yamnet.load_weights('yamnet.h5')
    yamnet_classes = yamnet_model.class_names('yamnet_class_map.csv')
    return yamnet, yamnet_classes
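
A quick hedged usage sketch for the helper above (three outputs, as in the other new-style snippets on this page):

model, classes = load_model()
scores, embeddings, spectrogram = model(np.zeros(16000, dtype=np.float32))  # 1 s of silence
print(classes[np.argmax(np.mean(scores, axis=0))])  # likely 'Silence'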
 def setUpClass(cls):
     super().setUpClass()
     cls._params = params.Params()
     cls._yamnet = yamnet.yamnet_frames_model(cls._params)
     cls._yamnet.load_weights('yamnet.h5')
     cls._yamnet_classes = yamnet.class_names('yamnet_class_map.csv')
Example #24
def main(argv):

    params = yamnet_params.Params()
    yamnet = yamnet_model.yamnet_frames_model(params)
    yamnet.load_weights('yamnet.h5')
    yamnet_classes = yamnet_model.class_names('yamnet_class_map.csv')

    for file_name in argv:
        # Decode the WAV file.
        wav_data, sr = sf.read(file_name, always_2d=False, dtype=np.int16)
        assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype

        print('waveform original data', wav_data.shape)

        waveform = wav_data / 32768.0  # Convert to [-1.0, +1.0]
        waveform = waveform.astype('float32')
        print('waveform normal data', waveform.shape)

        print('sampling rate', sr)
        print('sampling rate model params', params.sample_rate)

        # Convert to mono and the sample rate expected by YAMNet.
        if len(waveform.shape) > 1:
            print('entered')
            waveform = np.mean(waveform, axis=1)
        if sr != params.sample_rate:
            waveform = resampy.resample(waveform, sr, params.sample_rate)
        print('waveform normal data', waveform.shape)

        scale = 2.5

        # fig = plt.figure(figsize=(int(scale*4), int(scale*3)))
        # camera = Camera(fig)

        # for i in range(0,len(waveform),int(0.96*params.sample_rate/int(8))):
        # 	plt.plot(waveform[:i],color='b')
        # 	plt.xlabel('Samples')
        # 	plt.ylabel('Amplitude')
        # 	camera.snap()
        # animation = camera.animate()
        # animation.save(file_name+'_filename_'+str(scale)+'.mp4')
        # plt.close()

        # Predict YAMNet classes.
        scores, embeddings, spectrogram = yamnet(waveform)
        print('scores', scores)
        # Scores is a matrix of (time_frames, num_classes) classifier scores.
        # Average them along time to get an overall classifier output for the clip.
        prediction = np.mean(scores, axis=0)
        # Report the highest-scoring classes and their scores.
        top5_i = np.argsort(prediction)[::-1][:5]
        print(
            file_name, ':\n' + '\n'.join(
                '  {:12s}: {:.3f}'.format(yamnet_classes[i], prediction[i])
                for i in top5_i))

        # colors=['b','g','r']
        # fig=plt.figure()
        # camera = Camera(fig)
        # plt.xlabel('Time(0.5s)')
        # plt.ylabel('Probability')
        # for j in range(1,len(scores)):
        # 	k=0
        # 	for i in top5_i[1:-1]:

        # 		x=np.convolve(scores[:j,i].numpy(), np.ones((4,))/4, mode='valid')
        # 		# x=scores[:j,i].numpy()
        # 		plt.plot(x,color=colors[k])
        # 		k+=1
        # 	for i in range(1):

        # 		camera.snap()
        # plt.legend([yamnet_classes[i] for i in top5_i[1:-1]],loc='upper right')
        # animation = camera.animate(interval=int(1000))

        # # plt.show()
        # # plt.close()
        # animation.save(file_name+'_class_'+str(scale)+'.mp4')

        colors = ['b', 'g', 'r']
        fig = plt.figure()
        camera = Camera(fig)
        plt.xlabel('Time (0.5 s steps)')
        plt.ylabel('Volume')
        vol_store = []
        total_vol = 0
        for j in range(len(scores)):

            vol = []
            for i in top5_i[1:-1]:

                # x=np.convolve(scores[j,i].numpy(), np.ones((4,))/4, mode='valid')
                x = scores[j, i].numpy()
                if x > 0.1:
                    vol.append(float(1 / 24))
            # print(vol)
            if vol:
                total_vol += np.mean(vol)
            print(total_vol)
            vol_store.append(total_vol)
            # print(vol_store)
            plt.plot(vol_store, color='b')
            camera.snap()
        # plt.legend(,loc='upper right')
        animation = camera.animate(interval=int(1000))

        # plt.show()
        # plt.close()
        animation.save(file_name + '_volume_' + str(scale) + '.mp4')