Code example #1
File: inference.py Project: cookiekop/Project_Jarvis
def main(argv):
  assert argv, 'Usage: inference.py <wav file> <wav file> ...'

  graph = tf.Graph()
  with graph.as_default():
    yamnet = yamnet_model.yamnet_frames_model(params)
    yamnet.load_weights('yamnet.h5')
  yamnet_classes = yamnet_model.class_names('yamnet_class_map.csv')

  for file_name in argv:
    # Decode the WAV file.
    wav_data, sr = sf.read(file_name, dtype=np.int16)
    assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype
    waveform = wav_data / 32768.0  # Convert to [-1.0, +1.0]

    # Convert to mono and the sample rate expected by YAMNet.
    if len(waveform.shape) > 1:
      waveform = np.mean(waveform, axis=1)
    if sr != params.SAMPLE_RATE:
      waveform = resampy.resample(waveform, sr, params.SAMPLE_RATE)

    # Predict YAMNet classes.
    # Second output is log-mel-spectrogram array (used for visualizations).
    # (steps=1 is a work around for Keras batching limitations.)
    with graph.as_default():
      scores, _ = yamnet.predict(np.reshape(waveform, [1, -1]), steps=1)
    # Scores is a matrix of (time_frames, num_classes) classifier scores.
    # Average them along time to get an overall classifier output for the clip.
    prediction = np.mean(scores, axis=0)
    # Report the highest-scoring classes and their scores.
    top5_i = np.argsort(prediction)[::-1][:5]
    print(file_name, ':\n' + 
          '\n'.join('  {:12s}: {:.3f}'.format(yamnet_classes[i], prediction[i])
                    for i in top5_i))
Code example #2
 def setUpClass(cls):
     super(YAMNetTest, cls).setUpClass()
     cls._yamnet_graph = tf.Graph()
     with cls._yamnet_graph.as_default():
         cls._yamnet = yamnet.yamnet_frames_model(params)
         cls._yamnet.load_weights('yamnet.h5')
         cls._yamnet_classes = yamnet.class_names('yamnet_class_map.csv')
Code example #3
    def embedding(self, input_paths, output_paths, embed_paths=""):
        """Extract YAMnet features with opensmile using a single process."""
        if embed_paths == "":
            embed_paths = [""] * len(input_paths)
            save_embedding = False
        else:
            save_embedding = True

        paths = list(zip(input_paths, embed_paths, output_paths))

        params = yamnet_params.Params(sample_rate=self.sample_rate,
                                      patch_hop_seconds=0.48)

        class_names = yamnet_model.class_names(self.class_names)
        yamnet = yamnet_model.yamnet_frames_model(params)
        yamnet.load_weights(self.model_checkpoint)

        func = partial(
            self._embed,
            yamnet=yamnet,
            params=params,
            class_names=class_names,
            save_embedding=save_embedding,
        )

        self.single_process(func, paths)
Code example #4
def yamnet_grad_test():
    waveform = np.reshape(
        np.sin(2 * np.pi * 440 * np.linspace(0, 3, num=int(3 * 16000))),
        [1, -1])

    print(waveform[0])
    wavfile.write('sine.wav', 16000, waveform[0])
    model = yamnet_frames_model(params)
    model.load_weights('yamnet.h5')
    classes = class_names('yamnet_class_map.csv')

    with tf.GradientTape() as grad_tape:
        audio_tensor = tf.convert_to_tensor(np.reshape(waveform, [1, -1]))
        print(f'Audio Tensor is: {type(audio_tensor)}')
        grad_tape.watch(audio_tensor)
        # scores, spectrograms = model.predict(audio_tensor, steps=1)
        scores, spectrograms = model(audio_tensor)
        print(f'Scores is: {type(scores)}')

        target_scores = scores.numpy()
        assert target_scores.shape == scores.shape
        target_scores[:, 0] = 1
        target_scores = tf.convert_to_tensor(target_scores)

        loss = tf.keras.losses.MSE(target_scores, scores)

    gradient_tensor = grad_tape.gradient(loss, audio_tensor)
    print(scores[0])
    print(classes[np.argsort(scores[0])[-3:]])
    print(gradient_tensor.shape)
    print(audio_tensor.shape)

    output_tensor = audio_tensor + 1000 * gradient_tensor
    wavfile.write('speechy.wav', 16000, output_tensor[0].numpy())
    wavfile.write('grad.wav', 16000, 1000 * gradient_tensor[0].numpy())
Code example #5
def main(argv):
    assert argv, 'Usage: inference.py <wav file> <wav file> ...'

    params = yamnet_params.Params()
    yamnet = yamnet_model.yamnet_frames_model(params)
    yamnet.load_weights('yamnet.h5')
    yamnet_classes = yamnet_model.class_names('yamnet_class_map.csv')

    for file_name in argv:
        # Decode the WAV file.
        wav_data, sr = sf.read(file_name, dtype=np.int16)
        assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype
        waveform = wav_data / 32768.0  # Convert to [-1.0, +1.0]
        waveform = waveform.astype('float32')

        # Convert to mono and the sample rate expected by YAMNet.
        if len(waveform.shape) > 1:
            waveform = np.mean(waveform, axis=1)
        if sr != params.sample_rate:
            waveform = resampy.resample(waveform, sr, params.sample_rate)

        # Predict YAMNet classes.
        scores, embeddings, spectrogram = yamnet(waveform)
        # Scores is a matrix of (time_frames, num_classes) classifier scores.
        # Average them along time to get an overall classifier output for the clip.
        prediction = np.mean(scores, axis=0)
        # Report the highest-scoring classes and their scores.
        top5_i = np.argsort(prediction)[::-1][:5]
        print(
            file_name, ':\n' + '\n'.join(
                '  {:12s}: {:.3f}'.format(yamnet_classes[i], prediction[i])
                for i in top5_i))
Code example #6
def main():
    # Load the model and weights
    model = yamnet.yamnet_frames_model(params)
    model.load_weights('yamnet.h5')

    # Convert the model
    converter = tf.lite.TFLiteConverter.from_keras_model(model)
    tflite_model = converter.convert()
    open("yamnet.tflite", "wb").write(tflite_model)
Code example #7
def convert_general(general_model):
    #general = load_model(general_model)
    model_general = yamnet_frames_model(params)
    # NOTE: load_weights expects HDF5/TF-checkpoint weights; the .tflite path kept
    # here from the original snippet would not load and likely should point at the
    # trained .h5 weights instead.
    model_general.load_weights(
        '/home/pc/PycharmProjects/yamnet_medium/output/yamnet.tflite',
        by_name=True)
    print(model_general.summary())

    converter = tf.lite.TFLiteConverter.from_keras_model(model_general)
    tflite_model = converter.convert()
    open("general_model.tflite", "wb").write(tflite_model)
Code example #8
 def __init__(self):
     physical_devices = tf.config.experimental.list_physical_devices('GPU')
     tf.config.experimental.set_virtual_device_configuration(
         physical_devices[0], [
             tf.config.experimental.VirtualDeviceConfiguration(
                 memory_limit=4096)
         ])
     self.graph = tf.Graph()
     with self.graph.as_default():
         self.yamnet = yamnet_model.yamnet_frames_model(params)
         self.yamnet.load_weights('yamnet/yamnet.h5')
     self.yamnet_classes = yamnet_model.class_names(
         'yamnet/yamnet_class_map.csv')
Code example #9
def get_model():

    # Build network
    yamnet = yamnet_model.yamnet_frames_model(params)

    yamnet.load_weights('yamnet.h5', by_name=True)
    sgd = optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    yamnet.compile(optimizer=sgd,
                   loss='categorical_crossentropy',
                   metrics=[
                       'accuracy',
                       keras.metrics.Precision(),
                       keras.metrics.Recall()
                   ])

    return yamnet
Code example #10
    def __init__(self, config_path="./config.yaml"):
        """Init method for the Searcher."""
        super().__init__()
        # Load the configuration
        conf = OmegaConf.load(config_path)
        self.dataset_path = conf.dataset_path
        self.audio_path = os.path.join(conf.dataset_path, "podcasts-audio")

        self.es_url = conf.search_es_url  # URL of Elasticsearch to query
        # Number of segments to request from Elasticsearch
        self.es_num = conf.search_es_num
        self.sample_rate = 44100  # Hardcoded sample rate of all podcast audio

        # Load the podcast metadata
        self.metadata = load_metadata(self.dataset_path)

        # Set up the reranking model
        self.rerank_tokenizer = AutoTokenizer.from_pretrained(
            conf.search_rerank_model,
            use_fast=True,
            cache_dir=conf.search_cache_dir)
        self.rerank_model = AutoModelForSequenceClassification.from_pretrained(
            conf.search_rerank_model, cache_dir=conf.search_cache_dir)
        self.rerank_model.to("cpu", non_blocking=True)
        self.rerank_max_seq_len = 512

        # Set up the openSMILE extractor
        self.smile = opensmile.Smile(
            feature_set=opensmile.FeatureSet.eGeMAPSv02,
            feature_level=opensmile.FeatureLevel.Functionals,
            options={
                "frameModeFunctionalsConf":
                os.path.join(
                    os.getenv("PODCAST_PATH"),
                    "data/custom_FrameModeFunctionals.conf.inc",
                )
            },
        )

        # Set up the YAMNet model
        params = yamnet_params.Params(sample_rate=self.sample_rate,
                                      patch_hop_seconds=0.48)
        self.yamnet_classes = yamnet_model.class_names(
            os.path.join(os.getenv("YAMNET_PATH"), "yamnet_class_map.csv"))
        self.yamnet_model = yamnet_model.yamnet_frames_model(params)
        self.yamnet_model.load_weights(
            os.path.join(os.getenv("PODCAST_PATH"), "data/yamnet.h5"))
Code example #11
def main(argv):
  assert argv, 'Usage: inference.py <wav file> <wav file> ...'

  model_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'yamnet.h5')
  classes_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'yamnet_class_map.csv')
  event_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'event.json')
  
  params = yamnet_params.Params()
  yamnet = yamnet_model.yamnet_frames_model(params)
  yamnet.load_weights(model_path)
  yamnet_classes = yamnet_model.class_names(classes_path)

  for file_name in argv:
    # Decode the WAV file.
    wav_data, sr = sf.read(file_name, dtype=np.int16)
    assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype
    waveform = wav_data / 32768.0  # Convert to [-1.0, +1.0]
    waveform = waveform.astype('float32')

    # Convert to mono and the sample rate expected by YAMNet.
    if len(waveform.shape) > 1:
      waveform = np.mean(waveform, axis=1)
    if sr != params.sample_rate:
      waveform = resampy.resample(waveform, sr, params.sample_rate)

    # Predict YAMNet classes.
    scores, embeddings, spectrogram = yamnet(waveform)
    # Scores is a matrix of (time_frames, num_classes) classifier scores.
    # Average them along time to get an overall classifier output for the clip.
    prediction = np.mean(scores, axis=0)
    # Report the highest-scoring classes and their scores.
    top5_i = np.argsort(prediction)[::-1][:5]
    print(file_name, ':\n' +
          '\n'.join('  {:12s}: {:.3f}'.format(yamnet_classes[i], prediction[i])
                    for i in top5_i))
    
    # Report every class with its score, sorted descending, and save to event.json.
    scores_list = prediction.tolist()
    pred = []
    for i, cls in enumerate(yamnet_classes):
      pred.append({'label': cls, 'value': round(scores_list[i], 6)})
    pred = sorted(pred, key=lambda x: x['value'], reverse=True)
    with codecs.open(event_path, 'w', encoding='utf-8') as f:
      json.dump(pred, f, separators=(',', ':'), sort_keys=True, indent=4)
Code example #12
def create_dataset(path):
    samples, labels = [], []
    model = yamnet_frames_model(Params())
    model.load_weights(YAMNET_PATH)
    for cls in os.listdir(path):
        for sound in tqdm(os.listdir(os.path.join(path, cls))):
            wav = librosa.load(os.path.join(path, cls, sound),
                               sr=16000)[0].astype(np.float32)

            #Here you can add preprocessing, augmentations, silence removal, etc.

            for feature in model(wav)[1]:
                samples.append(feature)
                labels.append(cls)
    samples = np.asarray(samples)
    labels = np.asarray(labels)
    return samples, labels
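
A hedged follow-up sketch (not part of the original snippet) of how the (samples, labels) arrays returned by create_dataset could feed a small classifier head on top of the 1024-dimensional YAMNet embeddings; the dataset directory, layer sizes, and training settings are placeholders, and scikit-learn is assumed to be available for label encoding.

import tensorflow as tf
from sklearn.preprocessing import LabelEncoder

samples, labels = create_dataset('data/train')  # hypothetical dataset directory

encoder = LabelEncoder()
y = encoder.fit_transform(labels)

# Small dense head over the YAMNet embeddings (1024 features per patch).
head = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(1024,)),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(len(encoder.classes_), activation='softmax'),
])
head.compile(optimizer='adam',
             loss='sparse_categorical_crossentropy',
             metrics=['accuracy'])
head.fit(samples, y, epochs=10, batch_size=32, validation_split=0.1)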
Code example #13
def main(argv):
    global analysisdata, frame_counter
    log = open('/tmp/sound.log', 'w')
    # Set up yamnet
    params = yamnet_params.Params(sample_rate=ANALYSIS_SAMPLE_RATE,
                                  patch_hop_seconds=0.1)
    yamnet = yamnet_model.yamnet_frames_model(params)
    yamnet.load_weights('/home/pi/models/research/audioset/yamnet/yamnet.h5')
    yamnet_classes = yamnet_model.class_names(
        '/home/pi/models/research/audioset/yamnet/yamnet_class_map.csv')
    # Set up a live callback stream from the microphone
    stream = sd.InputStream(device=1,
                            channels=1,
                            samplerate=RECORD_SAMPLE_RATE,
                            callback=audio_callback,
                            blocksize=BUFFER_SIZE_F)
    with stream:
        while True:
            update_analysis_window()
            if (frame_counter >= int(
                    ANALYSIS_LENGTH_S * ANALYSIS_SAMPLE_RATE)):
                frame_counter = 0
                scores = yamnet.predict(analysisdata, steps=1)[0]
                if (len(scores)):
                    prediction = np.mean(scores, axis=0)
                    top5_i = np.argsort(prediction)[::-1][:1]
                    for x in top5_i:
                        if (prediction[x] > THRESHOLD):
                            top_class_str = yamnet_classes[x]
                            # Write any detected class (outside these noisy ones) to the log
                            if (not top_class_str in [
                                    "Fireworks", "Silence",
                                    "Inside, small room"
                            ]):
                                log.write("[%s] %s %0.4f\n" %
                                          (datetime.now().strftime(
                                              "%m/%d/%Y %H:%M:%S"),
                                           top_class_str, prediction[x]))
                                log.flush()
                                # And if it's one of the doorbell ones, ping the homebridge server
                                if (top_class_str in [
                                        "Beep, bleep", "Doorbell", "Glass",
                                        "Ding"
                                ]):
                                    trigger_homekit_motion()
Code example #14
def main():

    # Load yamnet
    yamnet = yamnet_model.yamnet_frames_model(params)
    yamnet.load_weights('yamnet.h5')

    # Convert the model
    class_names = [
        re.sub(r'\ |\(|\)|,|-|\'', '', x.lower())
        for x in yamnet_model.class_names('yamnet_class_map.csv')
    ]

    frame = RfcxFrame(yamnet, params.SAMPLE_RATE, params.PATCH_WINDOW_SECONDS,
                      class_names, 'pcm_s16le')
    tf.saved_model.save(frame,
                        'model',
                        signatures={
                            "score": frame.score,
                            "metadata": frame.metadata
                        })
Code example #15
File: dreamsound.py Project: fdch/dreamsound
    def load_model(self, layer=None):
        """
        This function loads the yamnet model with a specified layer and
        returns a 'dreamer' model that returns the activations of such layer
        
        Parameters
        -----------
        layer (string) : a specified layer

        If `layer` is not specified, the last layer is used instead.

        Returns
        ----------
        (tf.keras.Model) : the dreamer model
        
        """

        # load its class names
        self.class_names = yamnet.class_names(self.class_file)
        self.class_names_tensor = tf.constant(self.class_names)
        # load model parameters and get model
        self.params = params.Params(sample_rate=self.sr,
                                    patch_hop_seconds=self.patch_hop)
        self.model = yamnet.yamnet_frames_model(self.params)
        # load model weights
        self.model.load_weights(self.weights_file)
        if layer is not None:
            self.layername = layer
        else:
            self.__print__("Using last layer.")
            self.layername = self.model.layers[-1].name
        self.__print__(f"Yamnet loaded, using layer:{self.layername}")
        # Get the specified layer
        self.layers = self.model.get_layer(self.layername).output
        # Finally, create the dreamer model
        self.dreamer = tf.keras.Model(inputs=self.model.input,
                                      outputs=self.layers)
        self.__print__("Dreamer started.")
        return self.dreamer
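
A short, hypothetical usage sketch of the dreamer model returned above: one gradient-ascent step that nudges a waveform toward stronger activations in the chosen layer. The instance name ds, the layer name, the clip length, and the step size are placeholders; gradients reach the waveform because YAMNet's front end is built from differentiable TensorFlow ops.

import numpy as np
import tensorflow as tf

dreamer = ds.load_model(layer='layer14')  # 'ds' and 'layer14' are placeholders

# Start from 3 seconds of random audio at the model's sample rate.
waveform = tf.convert_to_tensor(
    np.random.uniform(-1.0, 1.0, 3 * 16000).astype(np.float32))

with tf.GradientTape() as tape:
    tape.watch(waveform)
    activations = dreamer(waveform)
    # "Dream" objective: maximize the mean activation of the selected layer.
    loss = tf.reduce_mean(activations)

gradients = tape.gradient(loss, waveform)
waveform = waveform + 0.01 * gradients / (tf.math.reduce_std(gradients) + 1e-8)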
Code example #16
def classification(argv):
    yamnet = yamnet_model.yamnet_frames_model(params)
    yamnet.load_weights('yamnet.h5')
    #yamnet_classes = yamnet_model.class_names('yamnet_class_map.csv')

    # NOTE: wav_data is assumed to be defined by the caller (e.g., a module-level
    # int16 audio buffer); it is not created inside this function.
    waveform = wav_data / 32768.0  # Convert to [-1.0, +1.0]
    sr = 44100

    # Convert to mono and the sample rate expected by YAMNet.
    if len(waveform.shape) > 1:
        waveform = np.mean(waveform, axis=1)
    if sr != params.SAMPLE_RATE:
        waveform = resampy.resample(waveform, sr, params.SAMPLE_RATE)

    # Predict YAMNet classes.
    # Second output is log-mel-spectrogram array (used for visualizations).
    # (steps=1 is a work around for Keras batching limitations.)
    scores, _ = yamnet.predict(np.reshape(waveform, [1, -1]), steps=1)
    # Scores is a matrix of (time_frames, num_classes) classifier scores.
    # Average them along time to get an overall classifier output for the clip.
    prediction = np.mean(scores, axis=0)
    # Report the highest-scoring classes and their scores.
    #sound_events = np.argsort(prediction)[::-1]
    return prediction
Code example #17
def main(argv):

    params = yamnet_params.Params()
    yamnet = yamnet_model.yamnet_frames_model(params)
    yamnet.load_weights('yamnet.h5')
    yamnet_classes = yamnet_model.class_names('yamnet_class_map.csv')

    for file_name in argv:
        # Decode the WAV file.
        wav_data, sr = sf.read(file_name, always_2d=False, dtype=np.int16)
        assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype

        print('waveform original data', wav_data.shape)

        waveform = wav_data / 32768.0  # Convert to [-1.0, +1.0]
        waveform = waveform.astype('float32')
        print('waveform normal data', waveform.shape)

        print('sampling rate', sr)
        print('sampling rate model params', params.sample_rate)

        # Convert to mono and the sample rate expected by YAMNet.
        if len(waveform.shape) > 1:
            print('entered')
            waveform = np.mean(waveform, axis=1)
        if sr != params.sample_rate:
            waveform = resampy.resample(waveform, sr, params.sample_rate)

        print(waveform.shape, min(waveform))
        # plt.figure(figsize=(20, 8))
        # plt.plot(waveform)
        # plt.xlabel('Samples')
        # plt.ylabel('Amplitude')
        # # plt.savefig('waveform.png')
        # plt.show()
        # plt.close()

        # fig, ax = plt.subplots(figsize=(20, 8))
        fig = plt.figure()
        ax = plt.axes(xlim=(0, len(waveform)), ylim=(-0.16, 0.17))

        line, = ax.plot([], [], lw=1)

        def init():
            line.set_data([], [])
            return line,

        def animate(i):
            # Reveal the waveform progressively; x and y must have equal length.
            n = max(1, int(len(waveform) * (i + 1) / 200))
            x = np.arange(n)
            y = waveform[:n]
            line.set_data(x, y)
            return line,

        anim = FuncAnimation(fig,
                             animate,
                             init_func=init,
                             frames=200,
                             interval=20,
                             blit=True)

        plt.draw()
        plt.show()
Code example #18
 def __init__(self, weights_path, params):
   super().__init__()
   self._yamnet = yamnet.yamnet_frames_model(params)
   self._yamnet.load_weights(weights_path)
   self._class_map_asset = tf.saved_model.Asset('yamnet_class_map.csv')
Code example #19
File: bak_bada_audio.py Project: manoj04418/BADA_G2
boilKeys = [
    'Boiling',
    'Liquid',
    'Water',
]  # 'Water', 'Pour', 'Drip'
waterKeys = ['Water tap, faucet', 'Sink (filling or washing)']
signals = dict.fromkeys(keys, 0.0)
picked = dict.fromkeys(keys, 0.0)
detected = dict.fromkeys(keys, False)
detectThreshold = 0.65
checkThreshold = 0.25
resetThreshold = 0.05

# Set up the YAMNet model.
params.PATCH_HOP_SECONDS = 0.48  # Scores frame hop in seconds (0.1 would give a 10 Hz scores frame rate).
yamnet = yamnet_model.yamnet_frames_model(params)
yamnet.load_weights('yamnet.h5')
class_names = yamnet_model.class_names('yamnet_class_map.csv')

CHUNKSIZE = 16000  # fixed chunk size
sr = 16000
seconds = 1
predictionPeriod = 2.0
predictionRate = 2.0
predChunkSize = int(sr * predictionPeriod)
readChunkSize = int(sr * predictionRate)

duration = 50

frames = []
last5secFrames = []
Code example #20
def classify_audio(audio_device_index,
                   interpreter,
                   labels_file,
                   commands_file=None,
                   result_callback=None,
                   dectection_callback=None,
                   sample_rate_hz=16000,
                   negative_threshold=0.6,
                   num_frames_hop=33):
    """Acquire audio, preprocess, and classify."""
    # Initialize recorder.
    AUDIO_SAMPLE_RATE_HZ = sample_rate_hz
    downsample_factor = 1
    if AUDIO_SAMPLE_RATE_HZ == 48000:
        downsample_factor = 3
    # Most microphones support this
    # Because the model expects 16KHz audio, we downsample 3 fold
    recorder = audio_recorder.AudioRecorder(
        AUDIO_SAMPLE_RATE_HZ,
        downsample_factor=downsample_factor,
        device_index=audio_device_index)
    feature_extractor = Uint8LogMelFeatureExtractor(
        num_frames_hop=num_frames_hop)
    labels = read_labels(labels_file)
    if commands_file:
        commands = read_commands(commands_file)
    else:
        commands = {}
    logger.info("Loaded commands: %s", str(commands))
    logger.info("Recording")
    timed_out = False

    # Testing
    if False:
        sample_data = 'data/mini_speech_commands/down/e71b4ce6_nohash_1.wav'

        import tensorflow as tf
        import os

        def decode_audio(audio_binary):
            audio, _ = tf.audio.decode_wav(audio_binary)
            return tf.squeeze(audio, axis=-1)

        def get_label(file_path):
            parts = tf.strings.split(file_path, os.path.sep)

            # Note: You'll use indexing here instead of tuple unpacking to enable this
            # to work in a TensorFlow graph.
            return parts[-2]

        def get_waveform_and_label(file_path):
            label = get_label(file_path)
            audio_binary = tf.io.read_file(file_path)
            waveform = decode_audio(audio_binary)
            return waveform, label

        waveform, label = get_waveform_and_label(sample_data)
        print(waveform.shape)
    # End Testing

    # yamnet start testing
    import os
    import soundfile as sf
    import params as yamnet_params
    import yamnet as yamnet_model
    from scipy.io import wavfile
    params = yamnet_params.Params()
    yamnet = yamnet_model.yamnet_frames_model(params)
    if not os.path.exists('yamnet.h5'):
        print(
            'Error: curl -O https://storage.googleapis.com/audioset/yamnet.h5')
        exit()
    yamnet.load_weights('yamnet.h5')
    yamnet_classes = yamnet_model.class_names('yamnet_class_map.csv')

    import pygame
    pygame.init()
    screen = pygame.display.set_mode((640, 480))
    font_header = pygame.font.Font(pygame.font.get_default_font(), 36)
    font = pygame.font.Font(pygame.font.get_default_font(), 36 * 2)

    text_surface = font.render('Hello world', True, (0, 0, 0))
    GRAY = (200, 200, 200)
    # yamnet end testing
    with recorder:
        last_detection = -1
        while not timed_out:
            audio_sample = recorder.get_audio(7921)[0]
            if False:
                wavfile.write('test.wav', 16000, audio_sample)
                wav_data, sr = sf.read('test.wav', dtype=np.int16)
            else:
                wav_data = np.array(audio_sample, dtype=np.int16)
                sr = AUDIO_SAMPLE_RATE_HZ
            assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype
            waveform = wav_data / 32768.0  # Convert to [-1.0, +1.0]
            waveform = waveform.astype('float32')
            # Convert to mono and the sample rate expected by YAMNet.
            if len(waveform.shape) > 1:
                waveform = np.mean(waveform, axis=1)
            if sr != params.sample_rate:
                waveform = resampy.resample(waveform, sr, params.sample_rate)
            print('-------')
            # Predict YAMNet classes.
            scores, embeddings, spectrogram = yamnet(waveform)
            # Scores is a matrix of (time_frames, num_classes) classifier scores.
            # Average them along time to get an overall classifier output for the clip.
            prediction = np.mean(scores, axis=0)
            # Report the highest-scoring classes and their scores.
            top5_i = np.argsort(prediction)[::-1][:5]
            print(':\n' + '\n'.join(
                '  {:12s}: {:.3f}'.format(yamnet_classes[i], prediction[i])
                for i in top5_i))
            print('{}:{:.3f}'.format(yamnet_classes[42], prediction[42]))
            print('{}:{:.3f}'.format(yamnet_classes[0], prediction[0]))
            print('{}:{:.3f}'.format(yamnet_classes[494], prediction[494]))

            target_predictions = prediction[42], prediction[0], prediction[494]
            target_classes = yamnet_classes[42], yamnet_classes[
                0], yamnet_classes[494]
            index = np.argsort(target_predictions)[::-1][0]
            black = (0, 0, 0)
            green = (0, 255, 0)
            red = (255, 0, 0)
            if index == 0:
                color = red
            elif index == 1:
                color = green
            else:
                color = black
            text1 = font.render(target_classes[index], True, color)
            header1 = font_header.render('R-zero Device Listening for Audio',
                                         True, (0, 0, 0))
            screen.fill(GRAY)
            screen.blit(header1, dest=(20, 100))
            screen.blit(text1, dest=(200, 200))
            pygame.display.update()
            '''
      line = '{}:{:.3f}'.format(yamnet_classes[42], prediction[42])
      label = Tk.Label(None, text=line, font=('Times', '18'), fg='blue')
      label.pack()
      label.mainloop()
      '''
            # End
            """
Code example #21
def main():

    EPOCHS = 1000

    f_X_train = 0
    f_y_train = 1
    f_X_val = 2
    f_y_val = 3

    # General log variables
    accuracy_train_scores, accuracy_validation_scores, accuracy_test_scores = [], [], []
    precision_train_scores, precision_validation_scores, precision_test_scores = [], [], []
    recall_train_scores, recall_validation_scores, recall_test_scores = [], [], []
    train_error, validation_error, test_error = [], [], []

    # Log variables for each class
    accuracy_train_per_class, accuracy_validation_per_class, accuracy_test_per_class = {}, {}, {}
    precision_train_per_class, precision_validation_per_class, precision_test_per_class = {}, {}, {}
    recall_train_per_class, recall_validation_per_class, recall_test_per_class = {}, {}, {}
    f1_score_train_per_class, f1_score_validation_per_class, f1_score_test_per_class = {}, {}, {}

    # Initialize dictionaries for each metric
    accuracy_train_per_class, accuracy_validation_per_class, accuracy_test_per_class = util.initialize_metrics_per_class(
        classes, accuracy_train_per_class, accuracy_validation_per_class,
        accuracy_test_per_class)
    precision_train_per_class, precision_validation_per_class, precision_test_per_class = util.initialize_metrics_per_class(
        classes, precision_train_per_class, precision_validation_per_class,
        precision_test_per_class)
    recall_train_per_class, recall_validation_per_class, recall_test_per_class = util.initialize_metrics_per_class(
        classes, recall_train_per_class, recall_validation_per_class,
        recall_test_per_class)
    f1_score_train_per_class, f1_score_validation_per_class, f1_score_test_per_class = util.initialize_metrics_per_class(
        classes, f1_score_train_per_class, f1_score_validation_per_class,
        f1_score_test_per_class)

    all_files = util.get_files_path()[4:]

    # Build network
    yamnet = yamnet_model.yamnet_frames_model(params, fine_tuning=False)
    yamnet.load_weights('yamnet.h5')

    get_feature_layer_output = K.function([yamnet.layers[0].input],
                                          [yamnet.layers[-3].output])

    waveforms = {}
    labels = []
    for file in all_files:
        # Decode the WAV file.
        wav_data, sr = sf.read(file, dtype=np.int16)
        assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype
        waveform = wav_data / 32768.0  # Convert to [-1.0, +1.0]
        label = file.split('\\')[-2][-1]
        label = labels_dict[label]

        # Convert to mono and the sample rate expected by YAMNet.
        if len(waveform.shape) > 1:
            waveform = np.mean(waveform, axis=1)
        if sr != params.SAMPLE_RATE:
            waveform = resampy.resample(waveform, sr, params.SAMPLE_RATE)

        avg = len(waveform) / float(5)
        last = 0.0
        waveforms[label] = []

        while last < len(waveform):
            waveforms[label].append(waveform[int(last):int(last + avg)])
            labels.append(label)
            last += avg

    folds, X_test, Y_test = util.build_folds_test(waveforms, labels, classes)

    X_T = []
    Y_T = []
    for x, y in zip(X_test, Y_test):
        a = get_feature_layer_output([np.reshape(x, [1, -1])])[0]
        for i in a:
            X_T.append(i)
            Y_T.append(y)

    X_T = np.array(X_T)
    Y_T = np.array(Y_T)

    Y_T = to_categorical(Y_T)

    count = 1
    for fold in folds:

        print("Fold %d:\n" % count)

        X = []
        X_V = []
        Y = []
        Y_V = []
        for x, y in zip(fold[f_X_train], fold[f_y_train]):
            a = get_feature_layer_output([np.reshape(x, [1, -1])])[0]
            for i in a:
                X.append(i)
                Y.append(y)

        for x, y in zip(fold[f_X_val], fold[f_y_val]):
            v = get_feature_layer_output([np.reshape(x, [1, -1])])[0]
            for i in v:
                X_V.append(i)
                Y_V.append(y)

        model = get_model()

        X = np.array(X)
        Y = np.array(Y)

        Y = to_categorical(Y)

        X_V = np.array(X_V)
        Y_V = np.array(Y_V)

        Y_V = to_categorical(Y_V)

        # Train and Validation
        #callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
        history = model.fit(X,
                            Y,
                            epochs=EPOCHS,
                            batch_size=32,
                            validation_data=(X_V, Y_V),
                            verbose=False)  #, callbacks=[callback])

        # Predict values
        y_pred = model.predict(X)

        # Get precision, recall and F1-score
        y_true = np.argmax(Y, axis=1)
        y_pred = np.argmax(y_pred, axis=1)
        report = classification_report(y_true, y_pred, output_dict=True)
        accuracy = util.per_class_accuracy(y_pred, y_true, classes)

        for c in classes:
            accuracy_train_per_class[c].append(accuracy[c])
            f1_score_train_per_class[c].append(report[str(c)]['f1-score'])
            precision_train_per_class[c].append(report[str(c)]['precision'])
            recall_train_per_class[c].append(report[str(c)]['recall'])

        # Save train and validation accuracy
        accuracy_train_scores.append(history.history['accuracy'])
        accuracy_validation_scores.append(history.history['val_accuracy'])

        # Save train and validation precision
        precision_train_scores.append(history.history['precision_' +
                                                      str(count)])
        precision_validation_scores.append(history.history['val_precision_' +
                                                           str(count)])

        # Save train and validation recall
        recall_train_scores.append(history.history['recall_' + str(count)])
        recall_validation_scores.append(history.history['val_recall_' +
                                                        str(count)])

        # Save train and validation error
        train_error.append(history.history['loss'])
        validation_error.append(history.history['val_loss'])

        y_pred = model.predict(X_V)
        y_true = np.argmax(Y_V, axis=1)
        y_pred = np.argmax(y_pred, axis=1)
        report = classification_report(y_true, y_pred, output_dict=True)
        accuracy = util.per_class_accuracy(y_pred, y_true, classes)

        for c in classes:
            accuracy_validation_per_class[c].append(accuracy[c])
            precision_validation_per_class[c].append(
                report[str(c)]['precision'])
            recall_validation_per_class[c].append(report[str(c)]['recall'])
            f1_score_validation_per_class[c].append(report[str(c)]['f1-score'])

        score = model.evaluate(X_T, Y_T)

        # Save error, accuracy and precision
        test_error.append(score[0])
        accuracy_test_scores.append(score[1])
        precision_test_scores.append(score[2])
        recall_test_scores.append(score[3])

        #print("Training accuracy: %.2f%%" % (history.history['accuracy'][-1]*100))
        #print("Testing accuracy: %.2f%%" % (history.history['val_accuracy'][-1]*100))
        count += 1

        y_pred = model.predict(X_T)
        y_true = np.argmax(Y_T, axis=1)
        y_pred = np.argmax(y_pred, axis=1)
        report = classification_report(y_true, y_pred, output_dict=True)
        accuracy = util.per_class_accuracy(y_pred, y_true, classes)

        for c in classes:
            accuracy_test_per_class[c].append(accuracy[c])
            f1_score_test_per_class[c].append(report[str(c)]['f1-score'])
            precision_test_per_class[c].append(report[str(c)]['precision'])
            recall_test_per_class[c].append(report[str(c)]['recall'])

    print("Training information")
    util.print_mean(classes, accuracy_train_per_class,
                    f1_score_train_per_class, precision_train_per_class,
                    recall_train_per_class)
    util.print_std(classes, accuracy_train_per_class, f1_score_train_per_class,
                   precision_train_per_class, recall_train_per_class)

    print("Validation information")
    util.print_mean(classes, accuracy_validation_per_class,
                    f1_score_validation_per_class,
                    precision_validation_per_class,
                    recall_validation_per_class)
    util.print_std(classes, accuracy_validation_per_class,
                   f1_score_validation_per_class,
                   precision_validation_per_class, recall_validation_per_class)

    print("Test information")
    util.print_mean(classes, accuracy_test_per_class, f1_score_test_per_class,
                    precision_test_per_class, recall_test_per_class)
    util.print_std(classes, accuracy_test_per_class, f1_score_test_per_class,
                   precision_test_per_class, recall_test_per_class)

    plt.plot(accuracy_train_scores, accuracy_validation_scores, EPOCHS,
             "Treinamento", "Validação", "Acurácia")
    plt.plot(precision_train_scores, precision_validation_scores, EPOCHS,
             "Treinamento", "Validação", "Precisão")
    plt.plot(recall_train_scores, recall_validation_scores, EPOCHS,
             "Treinamento", "Validação", "Recall")

    #TODO: fix loss plot
    #plt.plot_loss(losses, val_losses, epochs)

    for c in classes:
        util.save_to_file_per_class(
            accuracy_train_per_class[c], accuracy_validation_per_class[c],
            precision_train_per_class[c], precision_validation_per_class[c],
            recall_train_per_class[c], recall_validation_per_class[c],
            accuracy_test_per_class[c], precision_test_per_class[c],
            recall_test_per_class[c], "logs_per_class_" + str(c) + ".txt")

    util.save_to_file(accuracy_train_scores, accuracy_validation_scores,
                      precision_train_scores, precision_validation_scores,
                      recall_train_scores, recall_validation_scores,
                      accuracy_test_scores, precision_test_scores,
                      recall_test_scores, train_error, validation_error,
                      test_error, "logs.txt")
    return
Code example #22
 def setUpClass(cls):
     super().setUpClass()
     cls._params = params.Params()
     cls._yamnet = yamnet.yamnet_frames_model(cls._params)
     cls._yamnet.load_weights('yamnet.h5')
     cls._yamnet_classes = yamnet.class_names('yamnet_class_map.csv')
Code example #23
def load_model():
    params = yamnet_params.Params()
    yamnet = yamnet_model.yamnet_frames_model(params)
    yamnet.load_weights('yamnet.h5')
    yamnet_classes = yamnet_model.class_names('yamnet_class_map.csv')
    return yamnet, yamnet_classes
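
A hypothetical single-clip usage of the pair returned by load_model(); the WAV path is a placeholder and the clip is assumed to already be mono float32 at YAMNet's 16 kHz sample rate (otherwise convert and resample as in the other examples).

import numpy as np
import soundfile as sf

yamnet, yamnet_classes = load_model()

waveform, sr = sf.read('example.wav', dtype='float32')  # placeholder WAV file
scores, embeddings, spectrogram = yamnet(waveform)
prediction = np.mean(scores, axis=0)
print(yamnet_classes[np.argmax(prediction)], float(np.max(prediction)))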
Code example #24
    def __init__(self, *args, **kwargs):

        #Constructor, builds the tkinter app and used frames

        #Run the base class init
        tk.Tk.__init__(self, *args, **kwargs)
        """TODO Initialize Yamnet"""
        self.yamnet = yamnet_model.yamnet_frames_model(params)
        self.yamnet.load_weights('yamnet.h5')

        #Prepare the visualization graph. Tight layout for fitting better
        self.figure, self.axs = plt.subplots(10, figsize=(10, 10))
        plt.tight_layout()

        #Prepare colors. Colors are xkcd-colors in random order
        with open('colors.txt', 'r') as colorfile:
            self.colors = colorfile.readlines()

        #Strip newlines for more efficient use
        for i in range(len(self.colors)):
            self.colors[i] = self.colors[i][:-1]

        #Prepare Yamnet class names
        with open('classes.txt', 'r') as classesfile:
            self.classes = classesfile.readlines()

        #Strip newlines for more efficient use
        for i in range(len(self.classes)):
            self.classes[i] = self.classes[i][:-1]

        #Base frame to build the used frames from
        container = tk.Frame(self)
        container.pack(side="top", fill="both", expand=True)

        #Dict of used frames
        self.frames = {}

        #Build each used frame, initialize a grid for them
        for F in (GraphPage, ):

            frame = F(container, self)
            self.frames[F] = frame
            frame.grid(row=0, column=0, sticky="nsew")

        #Bring StartPage on top for user
        self.show_frame(GraphPage)

        #Declare class variables used in animation
        self.xList = np.linspace(-30, -1, 30)

        #Prepare the yamnet-format results, 521 classes for 30 seconds
        self.data = np.zeros((521, 30))
        #Prepare the weights used to rank classification results
        self.scores = np.zeros(521)

        #Start audio recording
        self.rec = Recorder(channels=1)
        self.recfile = self.rec.open('sample.wav', 'wb')
        self.recfile.start_recording()
        #After a second, start animating
        self.after(1000, self.animate)
Code example #25
yamnet_params = {
    k: params.__dict__[k]
    for k in params.__dict__ if k == k.upper()
}
for yamnet_param in yamnet_params:
    print(yamnet_param + " = " + str(yamnet_params[yamnet_param]))
print("")

# Load YAMNet.
# We turn the YAMNet model into a two-output model:
# 1. first output is the convnet embedding (task-agnostic)
# 2. second output is the audio event classification (task = AudioSet labels)
tf.get_logger().setLevel('ERROR')
graph = tf.Graph()
with graph.as_default():
    yamnet_model = yamnet.yamnet_frames_model(params)
    yamnet_model_path = os.path.join(yamnet_dir, "yamnet.h5")
    yamnet_model.load_weights(yamnet_model_path)
    yamnet_multi_model = tf.keras.Model(
        inputs=yamnet_model.inputs,
        outputs=[yamnet_model.layers[-4].output, yamnet_model.output])

# Initialize HDF5 folder for prediction
data_dir = os.path.split(sensor_dir)[0]
out_pred_dir = os.path.join(data_dir, "covid_yamnet-pred")
os.makedirs(out_pred_dir, exist_ok=True)
h5_path = os.path.join(out_pred_dir, sonycnode_str + "_yamnet-pred.h5")

# Initialize NPZ folder for features
out_features_dir = os.path.join(out_dir, "covid_yamnet-features")
os.makedirs(out_features_dir, exist_ok=True)
Code example #26
def main(argv):
    assert argv, 'Usage: inference.py <wav file> <wav file> ...'

    params = yamnet_params.Params()
    yamnet = yamnet_model.yamnet_frames_model(params)
    yamnet.load_weights('yamnet.h5')
    yamnet_classes = yamnet_model.class_names('yamnet_class_map.csv')

    for file_name in argv:
        # Decode the WAV file.
        wav_data, sr = sf.read(file_name, dtype=np.int16)
        assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype

        print('waveform original data', wav_data.shape)

        waveform = wav_data / 32768.0  # Convert to [-1.0, +1.0]
        waveform = waveform.astype('float32')
        print('waveform normal data', waveform.shape)

        print('sampling rate', sr)
        print('sampling rate model params', params.sample_rate)

        # Convert to mono and the sample rate expected by YAMNet.
        if len(waveform.shape) > 1:
            print('entered')
            waveform = np.mean(waveform, axis=1)
        if sr != params.sample_rate:
            waveform = resampy.resample(waveform, sr, params.sample_rate)

        # plt.figure(figsize=(20, 8))
        # plt.plot(waveform)
        # plt.xlabel('Samples')
        # plt.ylabel('Amplitude')
        # # plt.savefig('waveform.png')
        # plt.show()
        # plt.close()

        print('waveform sample data', waveform.shape)
        # Predict YAMNet classes.
        scores, embeddings, spectrogram = yamnet(waveform)
        print('scores', scores)
        # Scores is a matrix of (time_frames, num_classes) classifier scores.
        # Average them along time to get an overall classifier output for the clip.
        prediction = np.mean(scores, axis=0)
        # Report the highest-scoring classes and their scores.
        top5_i = np.argsort(prediction)[::-1][:5]
        print(
            file_name, ':\n' + '\n'.join(
                '  {:12s}: {:.3f}'.format(yamnet_classes[i], prediction[i])
                for i in top5_i))

        truth_labels = [yamnet_classes[i] for i in top5_i]
        print('ground labels', truth_labels)
        total_time = 0

        # plt.figure(figsize=(20, 8))
        # plt.plot(scores[:,282].numpy(),label='water')
        # plt.plot(scores[:,364].numpy(),label='faucet')
        # plt.plot(scores[:,365].numpy(),label='sink')
        # plt.legend()
        # plt.show()
        # plt.close()

        for i in range(len(scores)):
            pred = scores[i]

            water_prob = pred[282].numpy()
            print('water_prob', water_prob)
            top5_i = np.argsort(pred)[::-1][:5]
            print(
                file_name, ':\n' +
                '\n'.join('  {:12s}: {:.3f}'.format(yamnet_classes[i], pred[i])
                          for i in top5_i))

            pred_class = yamnet_classes[top5_i[0]]
            print(pred_class)
            if pred_class in truth_labels:
                total_time += 0.96

        print('total time', total_time / 2)
Code example #27
def main(argv):

    params = yamnet_params.Params()
    yamnet = yamnet_model.yamnet_frames_model(params)
    yamnet.load_weights('yamnet.h5')
    yamnet_classes = yamnet_model.class_names('yamnet_class_map.csv')

    for file_name in argv:
        # Decode the WAV file.
        wav_data, sr = sf.read(file_name, always_2d=False, dtype=np.int16)
        assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype

        print('waveform original data', wav_data.shape)

        waveform = wav_data / 32768.0  # Convert to [-1.0, +1.0]
        waveform = waveform.astype('float32')
        print('waveform normal data', waveform.shape)

        print('sampling rate', sr)
        print('sampling rate model params', params.sample_rate)

        # Convert to mono and the sample rate expected by YAMNet.
        if len(waveform.shape) > 1:
            print('entered')
            waveform = np.mean(waveform, axis=1)
        if sr != params.sample_rate:
            waveform = resampy.resample(waveform, sr, params.sample_rate)
        print('waveform normal data', waveform.shape)

        scale = 2.5

        # fig = plt.figure(figsize=(int(scale*4), int(scale*3)))
        # camera = Camera(fig)

        # for i in range(0,len(waveform),int(0.96*params.sample_rate/int(8))):
        # 	plt.plot(waveform[:i],color='b')
        # 	plt.xlabel('Samples')
        # 	plt.ylabel('Amplitude')
        # 	camera.snap()
        # animation = camera.animate()
        # animation.save(file_name+'_filename_'+str(scale)+'.mp4')
        # plt.close()

        # Predict YAMNet classes.
        scores, embeddings, spectrogram = yamnet(waveform)
        print('scores', scores)
        # Scores is a matrix of (time_frames, num_classes) classifier scores.
        # Average them along time to get an overall classifier output for the clip.
        prediction = np.mean(scores, axis=0)
        # Report the highest-scoring classes and their scores.
        top5_i = np.argsort(prediction)[::-1][:5]
        print(
            file_name, ':\n' + '\n'.join(
                '  {:12s}: {:.3f}'.format(yamnet_classes[i], prediction[i])
                for i in top5_i))

        # colors=['b','g','r']
        # fig=plt.figure()
        # camera = Camera(fig)
        # plt.xlabel('Time(0.5s)')
        # plt.ylabel('Probability')
        # for j in range(1,len(scores)):
        # 	k=0
        # 	for i in top5_i[1:-1]:

        # 		x=np.convolve(scores[:j,i].numpy(), np.ones((4,))/4, mode='valid')
        # 		# x=scores[:j,i].numpy()
        # 		plt.plot(x,color=colors[k])
        # 		k+=1
        # 	for i in range(1):

        # 		camera.snap()
        # plt.legend([yamnet_classes[i] for i in top5_i[1:-1]],loc='upper right')
        # animation = camera.animate(interval=int(1000))

        # # plt.show()
        # # plt.close()
        # animation.save(file_name+'_class_'+str(scale)+'.mp4')

        colors = ['b', 'g', 'r']
        fig = plt.figure()
        camera = Camera(fig)
        plt.xlabel('Time(0.5s)')
        plt.ylabel('volume')
        vol_store = []
        total_vol = 0
        for j in range(len(scores)):

            vol = []
            for i in top5_i[1:-1]:

                # x=np.convolve(scores[j,i].numpy(), np.ones((4,))/4, mode='valid')
                x = scores[j, i].numpy()
                if x > 0.1:
                    vol.append(float(1 / 24))
            # print(vol)
            if vol:
                total_vol += np.mean(vol)
            print(total_vol)
            vol_store.append(total_vol)
            # print(vol_store)
            plt.plot(vol_store, color='b')
            camera.snap()
        # plt.legend(,loc='upper right')
        animation = camera.animate(interval=int(1000))

        # plt.show()
        # plt.close()
        animation.save(file_name + '_volume_' + str(scale) + '.mp4')