def main(argv):
  assert argv

  graph = tf.Graph()
  with graph.as_default():
    yamnet = yamnet_model.yamnet_frames_model(params)
    yamnet.load_weights('yamnet.h5')
  yamnet_classes = yamnet_model.class_names('yamnet_class_map.csv')

  for file_name in argv:
    # Decode the WAV file.
    wav_data, sr = sf.read(file_name, dtype=np.int16)
    assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype
    waveform = wav_data / 32768.0  # Convert to [-1.0, +1.0]

    # Convert to mono and the sample rate expected by YAMNet.
    if len(waveform.shape) > 1:
      waveform = np.mean(waveform, axis=1)
    if sr != params.SAMPLE_RATE:
      waveform = resampy.resample(waveform, sr, params.SAMPLE_RATE)

    # Predict YAMNet classes.
    # Second output is log-mel-spectrogram array (used for visualizations).
    # (steps=1 is a work around for Keras batching limitations.)
    with graph.as_default():
      scores, _ = yamnet.predict(np.reshape(waveform, [1, -1]), steps=1)
    # Scores is a matrix of (time_frames, num_classes) classifier scores.
    # Average them along time to get an overall classifier output for the clip.
    prediction = np.mean(scores, axis=0)
    # Report the highest-scoring classes and their scores.
    top5_i = np.argsort(prediction)[::-1][:5]
    print(file_name, ':\n' +
          '\n'.join('  {:12s}: {:.3f}'.format(yamnet_classes[i], prediction[i])
                    for i in top5_i))
def setUpClass(cls):
  super(YAMNetTest, cls).setUpClass()
  cls._yamnet_graph = tf.Graph()
  with cls._yamnet_graph.as_default():
    cls._yamnet = yamnet.yamnet_frames_model(params)
    cls._yamnet.load_weights('yamnet.h5')
    cls._yamnet_classes = yamnet.class_names('yamnet_class_map.csv')
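# A minimal sketch of a test that could sit alongside the fixture above; the
# method name and the 16 kHz, top-10 tolerance are assumptions for
# illustration, not part of the original suite.
def test_silence_scores_high(self):
  with self._yamnet_graph.as_default():
    # Three seconds of silence at an assumed 16 kHz sample rate.
    silence = np.zeros((1, 3 * 16000), dtype=np.float32)
    scores, _ = self._yamnet.predict(silence, steps=1)
    prediction = np.mean(scores, axis=0)
    top10 = [self._yamnet_classes[i] for i in np.argsort(prediction)[::-1][:10]]
    self.assertIn('Silence', top10)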
def embedding(self, input_paths, output_paths, embed_paths=""):
  """Extract YAMNet features using a single process."""
  if embed_paths == "":
    embed_paths = [""] * len(input_paths)
    save_embedding = False
  else:
    save_embedding = True
  paths = list(zip(input_paths, embed_paths, output_paths))
  params = yamnet_params.Params(sample_rate=self.sample_rate,
                                patch_hop_seconds=0.48)
  class_names = yamnet_model.class_names(self.class_names)
  yamnet = yamnet_model.yamnet_frames_model(params)
  yamnet.load_weights(self.model_checkpoint)
  func = partial(
      self._embed,
      yamnet=yamnet,
      params=params,
      class_names=class_names,
      save_embedding=save_embedding,
  )
  self.single_process(func, paths)
def yamnet_grad_test():
  waveform = np.reshape(
      np.sin(2 * np.pi * 440 * np.linspace(0, 3, num=int(3 * 16000))),
      [1, -1])
  print(waveform[0])
  wavfile.write('sine.wav', 16000, waveform[0])

  model = yamnet_frames_model(params)
  model.load_weights('yamnet.h5')
  classes = class_names('yamnet_class_map.csv')

  with tf.GradientTape() as grad_tape:
    audio_tensor = tf.convert_to_tensor(np.reshape(waveform, [1, -1]))
    print(f'Audio Tensor is: {type(audio_tensor)}')
    grad_tape.watch(audio_tensor)
    # scores, spectrograms = model.predict(audio_tensor, steps=1)
    scores, spectrograms = model(audio_tensor)
    print(f'Scores is: {type(scores)}')
    target_scores = scores.numpy()
    assert target_scores.shape == scores.shape
    target_scores[:, 0] = 1
    target_scores = tf.convert_to_tensor(target_scores)
    loss = tf.keras.losses.MSE(target_scores, scores)

  gradient_tensor = grad_tape.gradient(loss, audio_tensor)
  print(scores[0])
  print(classes[np.argsort(scores[0])[-3:]])
  print(gradient_tensor.shape)
  print(audio_tensor.shape)
  output_tensor = audio_tensor + 1000 * gradient_tensor
  wavfile.write('speechy.wav', 16000, output_tensor[0].numpy())
  wavfile.write('grad.wav', 16000, 1000 * gradient_tensor[0].numpy())
def main(argv):
  assert argv, 'Usage: inference.py <wav file> <wav file> ...'

  params = yamnet_params.Params()
  yamnet = yamnet_model.yamnet_frames_model(params)
  yamnet.load_weights('yamnet.h5')
  yamnet_classes = yamnet_model.class_names('yamnet_class_map.csv')

  for file_name in argv:
    # Decode the WAV file.
    wav_data, sr = sf.read(file_name, dtype=np.int16)
    assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype
    waveform = wav_data / 32768.0  # Convert to [-1.0, +1.0]
    waveform = waveform.astype('float32')

    # Convert to mono and the sample rate expected by YAMNet.
    if len(waveform.shape) > 1:
      waveform = np.mean(waveform, axis=1)
    if sr != params.sample_rate:
      waveform = resampy.resample(waveform, sr, params.sample_rate)

    # Predict YAMNet classes.
    scores, embeddings, spectrogram = yamnet(waveform)
    # Scores is a matrix of (time_frames, num_classes) classifier scores.
    # Average them along time to get an overall classifier output for the clip.
    prediction = np.mean(scores, axis=0)
    # Report the highest-scoring classes and their scores.
    top5_i = np.argsort(prediction)[::-1][:5]
    print(file_name, ':\n' +
          '\n'.join('  {:12s}: {:.3f}'.format(yamnet_classes[i], prediction[i])
                    for i in top5_i))
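# A minimal sketch of the command-line entry point for main() above, assuming
# it lives in the same module with the usual imports available.
if __name__ == '__main__':
  import sys
  main(sys.argv[1:])  # Skip argv[0], the script name.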
def main():
  # Load the model and weights.
  model = yamnet.yamnet_frames_model(params)
  model.load_weights('yamnet.h5')

  # Convert the model.
  converter = tf.lite.TFLiteConverter.from_keras_model(model)
  tflite_model = converter.convert()
  open("yamnet.tflite", "wb").write(tflite_model)
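# A hedged sketch of exercising the converted model with the TFLite
# interpreter. Whether the exported graph keeps a dynamic waveform input, and
# which output index holds the scores, depends on the conversion above, so
# treat the shapes here as assumptions rather than the model's actual layout.
import numpy as np
import tensorflow as tf

interpreter = tf.lite.Interpreter(model_path='yamnet.tflite')
input_details = interpreter.get_input_details()
# If the input was exported with a dynamic length, pin it to 1 s at 16 kHz.
interpreter.resize_tensor_input(input_details[0]['index'], [16000])
interpreter.allocate_tensors()
interpreter.set_tensor(input_details[0]['index'],
                       np.zeros(16000, dtype=np.float32))
interpreter.invoke()
output_details = interpreter.get_output_details()
print([o['shape'] for o in output_details])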
def convert_general(general_model):
  # general = load_model(general_model)
  model_general = yamnet_frames_model(params)
  # Note: load_weights expects an HDF5/TF checkpoint; pointing it at a
  # .tflite file, as this path does, will fail at load time.
  model_general.load_weights(
      '/home/pc/PycharmProjects/yamnet_medium/output/yamnet.tflite',
      by_name=True)
  print(model_general.summary())
  converter = tf.lite.TFLiteConverter.from_keras_model(model_general)
  tflite_model = converter.convert()
  open("general_model.tflite", "wb").write(tflite_model)
def __init__(self):
  physical_devices = tf.config.experimental.list_physical_devices('GPU')
  tf.config.experimental.set_virtual_device_configuration(
      physical_devices[0],
      [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=4096)])
  self.graph = tf.Graph()
  with self.graph.as_default():
    self.yamnet = yamnet_model.yamnet_frames_model(params)
    self.yamnet.load_weights('yamnet/yamnet.h5')
    self.yamnet_classes = yamnet_model.class_names(
        'yamnet/yamnet_class_map.csv')
def get_model():
  # Build network.
  yamnet = yamnet_model.yamnet_frames_model(params)
  yamnet.load_weights('yamnet.h5', by_name=True)
  sgd = optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
  yamnet.compile(optimizer=sgd,
                 loss='categorical_crossentropy',
                 metrics=['accuracy',
                          keras.metrics.Precision(),
                          keras.metrics.Recall()])
  return yamnet
def __init__(self, config_path="./config.yaml"):
  """Init method for the Searcher."""
  super().__init__()

  # Load the configuration.
  conf = OmegaConf.load(config_path)
  self.dataset_path = conf.dataset_path
  self.audio_path = os.path.join(conf.dataset_path, "podcasts-audio")
  self.es_url = conf.search_es_url  # URL of Elasticsearch to query
  self.es_num = conf.search_es_num  # Number of segments to request from Elasticsearch
  self.sample_rate = 44100  # Hardcoded sample rate of all podcast audio

  # Load the podcast metadata.
  self.metadata = load_metadata(self.dataset_path)

  # Set up the reranking model.
  self.rerank_tokenizer = AutoTokenizer.from_pretrained(
      conf.search_rerank_model, use_fast=True,
      cache_dir=conf.search_cache_dir)
  self.rerank_model = AutoModelForSequenceClassification.from_pretrained(
      conf.search_rerank_model, cache_dir=conf.search_cache_dir)
  self.rerank_model.to("cpu", non_blocking=True)
  self.rerank_max_seq_len = 512

  # Set up the openSMILE extractor.
  self.smile = opensmile.Smile(
      feature_set=opensmile.FeatureSet.eGeMAPSv02,
      feature_level=opensmile.FeatureLevel.Functionals,
      options={
          "frameModeFunctionalsConf": os.path.join(
              os.getenv("PODCAST_PATH"),
              "data/custom_FrameModeFunctionals.conf.inc",
          )
      },
  )

  # Set up the YAMNet model.
  params = yamnet_params.Params(sample_rate=self.sample_rate,
                                patch_hop_seconds=0.48)
  self.yamnet_classes = yamnet_model.class_names(
      os.path.join(os.getenv("YAMNET_PATH"), "yamnet_class_map.csv"))
  self.yamnet_model = yamnet_model.yamnet_frames_model(params)
  self.yamnet_model.load_weights(
      os.path.join(os.getenv("PODCAST_PATH"), "data/yamnet.h5"))
def main(argv):
  assert argv, 'Usage: inference.py <wav file> <wav file> ...'

  model_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                            'yamnet.h5')
  classes_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                              'yamnet_class_map.csv')
  event_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                            'event.json')

  params = yamnet_params.Params()
  yamnet = yamnet_model.yamnet_frames_model(params)
  yamnet.load_weights(model_path)
  yamnet_classes = yamnet_model.class_names(classes_path)

  for file_name in argv:
    # Decode the WAV file.
    wav_data, sr = sf.read(file_name, dtype=np.int16)
    assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype
    waveform = wav_data / 32768.0  # Convert to [-1.0, +1.0]
    waveform = waveform.astype('float32')

    # Convert to mono and the sample rate expected by YAMNet.
    if len(waveform.shape) > 1:
      waveform = np.mean(waveform, axis=1)
    if sr != params.sample_rate:
      waveform = resampy.resample(waveform, sr, params.sample_rate)

    # Predict YAMNet classes.
    scores, embeddings, spectrogram = yamnet(waveform)
    # Scores is a matrix of (time_frames, num_classes) classifier scores.
    # Average them along time to get an overall classifier output for the clip.
    prediction = np.mean(scores, axis=0)
    # Report the highest-scoring classes and their scores.
    top5_i = np.argsort(prediction)[::-1][:5]
    print(file_name, ':\n' +
          '\n'.join('  {:12s}: {:.3f}'.format(yamnet_classes[i], prediction[i])
                    for i in top5_i))

    # Print all classes.
    b = prediction.tolist()  # nested lists with same data, indices
    pred = []
    for (i, cls) in enumerate(yamnet_classes):
      item = {}
      item['label'] = cls
      item['value'] = round(b[i], 6)
      pred.append(item)
    pred = sorted(pred, key=lambda x: x['value'], reverse=True)
    # This saves the array in .json format.
    json.dump(pred, codecs.open(event_path, 'w', encoding='utf-8'),
              separators=(',', ':'), sort_keys=True, indent=4)
def create_dataset(path):
  samples, labels = [], []
  model = yamnet_frames_model(Params())
  model.load_weights(YAMNET_PATH)
  for cls in os.listdir(path):
    for sound in tqdm(os.listdir(os.path.join(path, cls))):
      wav = librosa.load(os.path.join(path, cls, sound),
                         sr=16000)[0].astype(np.float32)
      # Here you can add preprocessing, augmentations, silence removal, etc.
      for feature in model(wav)[1]:
        samples.append(feature)
        labels.append(cls)
  samples = np.asarray(samples)
  labels = np.asarray(labels)
  return samples, labels
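# A hedged follow-up sketch: train a small dense head on the embeddings that
# create_dataset() returns. The 'data/train' path, layer sizes, and label
# encoding are assumptions for illustration, not part of the original pipeline.
import numpy as np
import tensorflow as tf

samples, labels = create_dataset('data/train')
class_list = sorted(set(labels.tolist()))
y = np.array([class_list.index(l) for l in labels])

head = tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation='relu',
                          input_shape=(samples.shape[1],)),
    tf.keras.layers.Dense(len(class_list), activation='softmax'),
])
head.compile(optimizer='adam',
             loss='sparse_categorical_crossentropy',
             metrics=['accuracy'])
head.fit(samples, y, epochs=10, batch_size=32)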
def main(argv):
  global analysisdata, frame_counter
  log = open('/tmp/sound.log', 'w')

  # Set up YAMNet.
  params = yamnet_params.Params(sample_rate=ANALYSIS_SAMPLE_RATE,
                                patch_hop_seconds=0.1)
  yamnet = yamnet_model.yamnet_frames_model(params)
  yamnet.load_weights('/home/pi/models/research/audioset/yamnet/yamnet.h5')
  yamnet_classes = yamnet_model.class_names(
      '/home/pi/models/research/audioset/yamnet/yamnet_class_map.csv')

  # Set up a live callback stream from the microphone.
  stream = sd.InputStream(device=1,
                          channels=1,
                          samplerate=RECORD_SAMPLE_RATE,
                          callback=audio_callback,
                          blocksize=BUFFER_SIZE_F)
  with stream:
    while True:
      update_analysis_window()
      if frame_counter >= int(ANALYSIS_LENGTH_S * ANALYSIS_SAMPLE_RATE):
        frame_counter = 0
        scores = yamnet.predict(analysisdata, steps=1)[0]
        if len(scores):
          prediction = np.mean(scores, axis=0)
          top5_i = np.argsort(prediction)[::-1][:1]
          for x in top5_i:
            if prediction[x] > THRESHOLD:
              top_class_str = yamnet_classes[x]
              # Write any detected class (outside these noisy ones) to the log.
              if top_class_str not in [
                  "Fireworks", "Silence", "Inside, small room"
              ]:
                log.write("[%s] %s %0.4f\n" %
                          (datetime.now().strftime("%m/%d/%Y %H:%M:%S"),
                           top_class_str, prediction[x]))
                log.flush()
              # And if it's one of the doorbell ones, ping the homebridge server.
              if top_class_str in [
                  "Beep, bleep", "Doorbell", "Glass", "Ding"
              ]:
                trigger_homekit_motion()
def main():
  # Load YAMNet.
  yamnet = yamnet_model.yamnet_frames_model(params)
  yamnet.load_weights('yamnet.h5')

  # Convert the model.
  class_names = [
      re.sub(r'\ |\(|\)|,|-|\'', '', x.lower())
      for x in yamnet_model.class_names('yamnet_class_map.csv')
  ]
  frame = RfcxFrame(yamnet, params.SAMPLE_RATE, params.PATCH_WINDOW_SECONDS,
                    class_names, 'pcm_s16le')
  tf.saved_model.save(frame, 'model',
                      signatures={
                          "score": frame.score,
                          "metadata": frame.metadata
                      })
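# A hedged sketch of reloading the SavedModel written above and inspecting its
# signatures; the exact tensors RfcxFrame.score expects are not shown in this
# file, so this only lists them rather than calling score with real audio.
import tensorflow as tf

reloaded = tf.saved_model.load('model')
print(list(reloaded.signatures.keys()))  # Expect ['score', 'metadata'].
score_fn = reloaded.signatures['score']
print(score_fn.structured_input_signature)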
def load_model(self, layer=None):
  """Load the YAMNet model and build a 'dreamer' model that returns the
  activations of the specified layer.

  Parameters
  ----------
  layer (string) : the name of the layer to expose. If `layer` is not
      specified, the last layer is used instead.

  Returns
  -------
  (tf.keras.Model) : the dreamer model
  """
  # Load the class names.
  self.class_names = yamnet.class_names(self.class_file)
  self.class_names_tensor = tf.constant(self.class_names)

  # Load the model parameters and build the model.
  self.params = params.Params(sample_rate=self.sr,
                              patch_hop_seconds=self.patch_hop)
  self.model = yamnet.yamnet_frames_model(self.params)

  # Load the model weights.
  self.model.load_weights(self.weights_file)

  if layer is not None:
    self.layername = layer
  else:
    self.__print__("Using last layer.")
    self.layername = self.model.layers[-1].name
  self.__print__(f"Yamnet loaded, using layer: {self.layername}")

  # Get the specified layer.
  self.layers = self.model.get_layer(self.layername).output

  # Finally, create the dreamer model.
  self.dreamer = tf.keras.Model(inputs=self.model.input, outputs=self.layers)
  self.__print__("Dreamer started.")
  return self.dreamer
def classification(argv):
  yamnet = yamnet_model.yamnet_frames_model(params)
  yamnet.load_weights('yamnet.h5')
  # yamnet_classes = yamnet_model.class_names('yamnet_class_map.csv')

  # Read the audio file passed in. (The original snippet used an undefined
  # `wav_data` and a hardcoded 44.1 kHz rate; reading via soundfile restores
  # a working input path.)
  wav_data, sr = sf.read(argv, dtype=np.int16)
  waveform = wav_data / 32768.0  # Convert to [-1.0, +1.0]

  # Convert to mono and the sample rate expected by YAMNet.
  if len(waveform.shape) > 1:
    waveform = np.mean(waveform, axis=1)
  if sr != params.SAMPLE_RATE:
    waveform = resampy.resample(waveform, sr, params.SAMPLE_RATE)

  # Predict YAMNet classes.
  # Second output is log-mel-spectrogram array (used for visualizations).
  # (steps=1 is a work around for Keras batching limitations.)
  scores, _ = yamnet.predict(np.reshape(waveform, [1, -1]), steps=1)
  # Scores is a matrix of (time_frames, num_classes) classifier scores.
  # Average them along time to get an overall classifier output for the clip.
  prediction = np.mean(scores, axis=0)

  # Report the highest-scoring classes and their scores.
  # sound_events = np.argsort(prediction)[::-1]
  return prediction
def main(argv):
  params = yamnet_params.Params()
  yamnet = yamnet_model.yamnet_frames_model(params)
  yamnet.load_weights('yamnet.h5')
  yamnet_classes = yamnet_model.class_names('yamnet_class_map.csv')

  for file_name in argv:
    # Decode the WAV file.
    wav_data, sr = sf.read(file_name, always_2d=False, dtype=np.int16)
    assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype
    print('waveform original data', wav_data.shape)
    waveform = wav_data / 32768.0  # Convert to [-1.0, +1.0]
    waveform = waveform.astype('float32')
    print('waveform normalized data', waveform.shape)
    print('sampling rate', sr)
    print('sampling rate model params', params.sample_rate)

    # Convert to mono and the sample rate expected by YAMNet.
    if len(waveform.shape) > 1:
      print('entered')
      waveform = np.mean(waveform, axis=1)
    if sr != params.sample_rate:
      waveform = resampy.resample(waveform, sr, params.sample_rate)
    print(waveform.shape, min(waveform))

    # plt.figure(figsize=(20, 8))
    # plt.plot(waveform)
    # plt.xlabel('Samples')
    # plt.ylabel('Amplitude')
    # # plt.savefig('waveform.png')
    # plt.show()
    # plt.close()

    # fig, ax = plt.subplots(figsize=(20, 8))
    fig = plt.figure()
    ax = plt.axes(xlim=(0, len(waveform)), ylim=(-0.16, 0.17))
    line, = ax.plot([], [], lw=1)

    def init():
      line.set_data([], [])
      return line,

    def animate(i):
      # Draw the first i samples. (The original passed the full x range with
      # a scalar y, which set_data rejects at draw time.)
      x = np.linspace(0, len(waveform), len(waveform))
      line.set_data(x[:i], waveform[:i])
      return line,

    anim = FuncAnimation(fig, animate, init_func=init,
                         frames=200, interval=20, blit=True)
    plt.draw()
    plt.show()
def __init__(self, weights_path, params):
  super().__init__()
  self._yamnet = yamnet.yamnet_frames_model(params)
  self._yamnet.load_weights(weights_path)
  self._class_map_asset = tf.saved_model.Asset('yamnet_class_map.csv')
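# A hedged export sketch for the wrapper class above (its name is not shown
# here, so `YamnetExporter` is hypothetical); bundling the class map as a
# tf.saved_model.Asset means the CSV travels inside the SavedModel directory.
exporter = YamnetExporter('yamnet.h5', yamnet_params.Params())  # hypothetical name
tf.saved_model.save(exporter, 'yamnet_export')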
boilKeys = [
    'Boiling',
    'Liquid',
    'Water',
]  # 'Water', 'Pour', 'Drip'
waterKeys = ['Water tap, faucet', 'Sink (filling or washing)']
# The original snippet referenced an undefined `keys`; combining the two
# lists above is the natural reading.
keys = boilKeys + waterKeys

signals = dict.fromkeys(keys, 0.0)
picked = dict.fromkeys(keys, 0.0)
detected = dict.fromkeys(keys, False)
detectThreshold = 0.65
checkThreshold = 0.25
resetThreshold = 0.05

# Set up the YAMNet model (see the detection-loop sketch after this block).
params.PATCH_HOP_SECONDS = 0.48  # use 0.1 for a 10 Hz scores frame rate
yamnet = yamnet_model.yamnet_frames_model(params)
yamnet.load_weights('yamnet.h5')
class_names = yamnet_model.class_names('yamnet_class_map.csv')

CHUNKSIZE = 16000  # fixed chunk size
sr = 16000
seconds = 1
predictionPeriod = 2.0
predictionRate = 2.0
predChunkSize = int(sr * predictionPeriod)
readChunkSize = int(sr * predictionRate)
duration = 50
frames = []
last5secFrames = []
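# A hedged sketch of how the thresholds above could drive detection with
# hysteresis: a class trips once its smoothed score exceeds detectThreshold
# and re-arms only after falling back under resetThreshold. `scores_stream`
# (an iterable of per-frame score vectors) and the 0.8/0.2 smoothing are
# assumptions; the original detection loop is not shown here.
key_indices = {k: list(class_names).index(k) for k in keys}
for frame_scores in scores_stream:
  for key in keys:
    signals[key] = 0.8 * signals[key] + 0.2 * frame_scores[key_indices[key]]
    if not detected[key] and signals[key] > detectThreshold:
      detected[key] = True
      print('detected:', key)
    elif detected[key] and signals[key] < resetThreshold:
      detected[key] = False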
def classify_audio(audio_device_index, interpreter, labels_file,
                   commands_file=None, result_callback=None,
                   dectection_callback=None, sample_rate_hz=16000,
                   negative_threshold=0.6, num_frames_hop=33):
  """Acquire audio, preprocess, and classify."""
  # Initialize recorder.
  AUDIO_SAMPLE_RATE_HZ = sample_rate_hz
  downsample_factor = 1
  if AUDIO_SAMPLE_RATE_HZ == 48000:
    # Most microphones support this. Because the model expects 16 kHz audio,
    # we downsample 3 fold.
    downsample_factor = 3
  recorder = audio_recorder.AudioRecorder(
      AUDIO_SAMPLE_RATE_HZ,
      downsample_factor=downsample_factor,
      device_index=audio_device_index)
  feature_extractor = Uint8LogMelFeatureExtractor(
      num_frames_hop=num_frames_hop)
  labels = read_labels(labels_file)
  if commands_file:
    commands = read_commands(commands_file)
  else:
    commands = {}
  logger.info("Loaded commands: %s", str(commands))
  logger.info("Recording")
  timed_out = False

  # Testing
  if False:
    sample_data = 'data/mini_speech_commands/down/e71b4ce6_nohash_1.wav'
    import tensorflow as tf
    import os

    def decode_audio(audio_binary):
      audio, _ = tf.audio.decode_wav(audio_binary)
      return tf.squeeze(audio, axis=-1)

    def get_label(file_path):
      parts = tf.strings.split(file_path, os.path.sep)
      # Note: You'll use indexing here instead of tuple unpacking to enable
      # this to work in a TensorFlow graph.
      return parts[-2]

    def get_waveform_and_label(file_path):
      label = get_label(file_path)
      audio_binary = tf.io.read_file(file_path)
      waveform = decode_audio(audio_binary)
      return waveform, label

    waveform, label = get_waveform_and_label(sample_data)
    print(waveform.shape)
  # End Testing

  # yamnet start testing
  import os
  import soundfile as sf
  import params as yamnet_params
  import yamnet as yamnet_model
  from scipy.io import wavfile

  params = yamnet_params.Params()
  yamnet = yamnet_model.yamnet_frames_model(params)
  if not os.path.exists('yamnet.h5'):
    print('Error: curl -O https://storage.googleapis.com/audioset/yamnet.h5')
    exit()
  yamnet.load_weights('yamnet.h5')
  yamnet_classes = yamnet_model.class_names('yamnet_class_map.csv')

  import pygame
  pygame.init()
  screen = pygame.display.set_mode((640, 480))
  font_header = pygame.font.Font(pygame.font.get_default_font(), 36)
  font = pygame.font.Font(pygame.font.get_default_font(), 36 * 2)
  text_surface = font.render('Hello world', True, (0, 0, 0))
  GRAY = (200, 200, 200)
  # yamnet end testing

  with recorder:
    last_detection = -1
    while not timed_out:
      audio_sample = recorder.get_audio(7921)[0]
      if False:
        wavfile.write('test.wav', 16000, audio_sample)
        wav_data, sr = sf.read('test.wav', dtype=np.int16)
      else:
        wav_data = np.array(audio_sample, dtype=np.int16)
        sr = AUDIO_SAMPLE_RATE_HZ
      assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype
      waveform = wav_data / 32768.0  # Convert to [-1.0, +1.0]
      waveform = waveform.astype('float32')

      # Convert to mono and the sample rate expected by YAMNet.
      if len(waveform.shape) > 1:
        waveform = np.mean(waveform, axis=1)
      if sr != params.sample_rate:
        waveform = resampy.resample(waveform, sr, params.sample_rate)
      print('-------')

      # Predict YAMNet classes.
      scores, embeddings, spectrogram = yamnet(waveform)
      # Scores is a matrix of (time_frames, num_classes) classifier scores.
      # Average them along time to get an overall classifier output for the clip.
      prediction = np.mean(scores, axis=0)

      # Report the highest-scoring classes and their scores.
      top5_i = np.argsort(prediction)[::-1][:5]
      print(':\n' +
            '\n'.join('  {:12s}: {:.3f}'.format(yamnet_classes[i], prediction[i])
                      for i in top5_i))
      print('{}:{:.3f}'.format(yamnet_classes[42], prediction[42]))
      print('{}:{:.3f}'.format(yamnet_classes[0], prediction[0]))
      print('{}:{:.3f}'.format(yamnet_classes[494], prediction[494]))

      target_predictions = prediction[42], prediction[0], prediction[494]
      target_classes = (yamnet_classes[42], yamnet_classes[0],
                        yamnet_classes[494])
      index = np.argsort(target_predictions)[::-1][0]

      black = (0, 0, 0)
      green = (0, 255, 0)
      red = (255, 0, 0)
      if index == 0:
        color = red
      elif index == 1:
        color = green
      else:
        color = black
      text1 = font.render(target_classes[index], True, color)
      header1 = font_header.render('R-zero Device Listening for Audio', True,
                                   (0, 0, 0))
      screen.fill(GRAY)
      screen.blit(header1, dest=(20, 100))
      screen.blit(text1, dest=(200, 200))
      pygame.display.update()
      # line = '{}:{:.3f}'.format(yamnet_classes[42], prediction[42])
      # label = Tk.Label(None, text=line, font=('Times', '18'), fg='blue')
      # label.pack()
      # label.mainloop()
      # End
def main():
  EPOCHS = 1000
  f_X_train = 0
  f_y_train = 1
  f_X_val = 2
  f_y_val = 3

  # General log variables.
  accuracy_train_scores, accuracy_validation_scores, accuracy_test_scores = [], [], []
  precision_train_scores, precision_validation_scores, precision_test_scores = [], [], []
  recall_train_scores, recall_validation_scores, recall_test_scores = [], [], []
  train_error, validation_error, test_error = [], [], []

  # Log variables for each class.
  accuracy_train_per_class, accuracy_validation_per_class, accuracy_test_per_class = {}, {}, {}
  precision_train_per_class, precision_validation_per_class, precision_test_per_class = {}, {}, {}
  recall_train_per_class, recall_validation_per_class, recall_test_per_class = {}, {}, {}
  f1_score_train_per_class, f1_score_validation_per_class, f1_score_test_per_class = {}, {}, {}

  # Initialize dictionaries for each metric.
  accuracy_train_per_class, accuracy_validation_per_class, accuracy_test_per_class = util.initialize_metrics_per_class(
      classes, accuracy_train_per_class, accuracy_validation_per_class,
      accuracy_test_per_class)
  precision_train_per_class, precision_validation_per_class, precision_test_per_class = util.initialize_metrics_per_class(
      classes, precision_train_per_class, precision_validation_per_class,
      precision_test_per_class)
  recall_train_per_class, recall_validation_per_class, recall_test_per_class = util.initialize_metrics_per_class(
      classes, recall_train_per_class, recall_validation_per_class,
      recall_test_per_class)
  f1_score_train_per_class, f1_score_validation_per_class, f1_score_test_per_class = util.initialize_metrics_per_class(
      classes, f1_score_train_per_class, f1_score_validation_per_class,
      f1_score_test_per_class)

  all_files = util.get_files_path()[4:]

  # Build network.
  yamnet = yamnet_model.yamnet_frames_model(params, fine_tuning=False)
  yamnet.load_weights('yamnet.h5')
  get_feature_layer_output = K.function([yamnet.layers[0].input],
                                        [yamnet.layers[-3].output])

  waveforms = {}
  labels = []
  for file in all_files:
    # Decode the WAV file.
    wav_data, sr = sf.read(file, dtype=np.int16)
    assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype
    waveform = wav_data / 32768.0  # Convert to [-1.0, +1.0]
    label = file.split('\\')[-2][-1]
    label = labels_dict[label]

    # Convert to mono and the sample rate expected by YAMNet.
    if len(waveform.shape) > 1:
      waveform = np.mean(waveform, axis=1)
    if sr != params.SAMPLE_RATE:
      waveform = resampy.resample(waveform, sr, params.SAMPLE_RATE)

    # Split each waveform into five equal chunks.
    avg = len(waveform) / float(5)
    last = 0.0
    waveforms[label] = []
    while last < len(waveform):
      waveforms[label].append(waveform[int(last):int(last + avg)])
      labels.append(label)
      last += avg

  folds, X_test, Y_test = util.build_folds_test(waveforms, labels, classes)

  X_T = []
  Y_T = []
  for x, y in zip(X_test, Y_test):
    a = get_feature_layer_output([np.reshape(x, [1, -1])])[0]
    for i in a:
      X_T.append(i)
      Y_T.append(y)
  X_T = np.array(X_T)
  Y_T = np.array(Y_T)
  Y_T = to_categorical(Y_T)

  count = 1
  for fold in folds:
    print("Fold %d:\n" % count)
    X = []
    X_V = []
    Y = []
    Y_V = []
    for x, y in zip(fold[f_X_train], fold[f_y_train]):
      a = get_feature_layer_output([np.reshape(x, [1, -1])])[0]
      for i in a:
        X.append(i)
        Y.append(y)
    for x, y in zip(fold[f_X_val], fold[f_y_val]):
      v = get_feature_layer_output([np.reshape(x, [1, -1])])[0]
      for i in v:
        X_V.append(i)
        Y_V.append(y)

    model = get_model()
    X = np.array(X)
    Y = np.array(Y)
    Y = to_categorical(Y)
    X_V = np.array(X_V)
    Y_V = np.array(Y_V)
    Y_V = to_categorical(Y_V)

    # Train and validate.
    # callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
    history = model.fit(X, Y,
                        epochs=EPOCHS,
                        batch_size=32,
                        validation_data=(X_V, Y_V),
                        verbose=False)  # , callbacks=[callback])

    # Predict values.
    y_pred = model.predict(X)
    # Get precision, recall and F1-score.
    y_true = np.argmax(Y, axis=1)
    y_pred = np.argmax(y_pred, axis=1)
    report = classification_report(y_true, y_pred, output_dict=True)
    accuracy = util.per_class_accuracy(y_pred, y_true, classes)
    for c in classes:
      accuracy_train_per_class[c].append(accuracy[c])
      f1_score_train_per_class[c].append(report[str(c)]['f1-score'])
      precision_train_per_class[c].append(report[str(c)]['precision'])
      recall_train_per_class[c].append(report[str(c)]['recall'])

    # Save train and validation accuracy.
    accuracy_train_scores.append(history.history['accuracy'])
    accuracy_validation_scores.append(history.history['val_accuracy'])
    # Save train and validation precision.
    precision_train_scores.append(history.history['precision_' + str(count)])
    precision_validation_scores.append(
        history.history['val_precision_' + str(count)])
    # Save train and validation recall.
    recall_train_scores.append(history.history['recall_' + str(count)])
    recall_validation_scores.append(
        history.history['val_recall_' + str(count)])
    # Save train and validation error.
    train_error.append(history.history['loss'])
    validation_error.append(history.history['val_loss'])

    y_pred = model.predict(X_V)
    y_true = np.argmax(Y_V, axis=1)
    y_pred = np.argmax(y_pred, axis=1)
    report = classification_report(y_true, y_pred, output_dict=True)
    accuracy = util.per_class_accuracy(y_pred, y_true, classes)
    for c in classes:
      accuracy_validation_per_class[c].append(accuracy[c])
      precision_validation_per_class[c].append(report[str(c)]['precision'])
      recall_validation_per_class[c].append(report[str(c)]['recall'])
      f1_score_validation_per_class[c].append(report[str(c)]['f1-score'])

    score = model.evaluate(X_T, Y_T)
    # Save error, accuracy and precision.
    test_error.append(score[0])
    accuracy_test_scores.append(score[1])
    precision_test_scores.append(score[2])
    recall_test_scores.append(score[3])
    # print("Training accuracy: %.2f%%" % (history.history['accuracy'][-1]*100))
    # print("Testing accuracy: %.2f%%" % (history.history['val_accuracy'][-1]*100))
    count += 1

  y_pred = model.predict(X_T)
  y_true = np.argmax(Y_T, axis=1)
  y_pred = np.argmax(y_pred, axis=1)
  report = classification_report(y_true, y_pred, output_dict=True)
  accuracy = util.per_class_accuracy(y_pred, y_true, classes)
  for c in classes:
    accuracy_test_per_class[c].append(accuracy[c])
    f1_score_test_per_class[c].append(report[str(c)]['f1-score'])
    precision_test_per_class[c].append(report[str(c)]['precision'])
    recall_test_per_class[c].append(report[str(c)]['recall'])

  print("Training information")
  util.print_mean(classes, accuracy_train_per_class, f1_score_train_per_class,
                  precision_train_per_class, recall_train_per_class)
  util.print_std(classes, accuracy_train_per_class, f1_score_train_per_class,
                 precision_train_per_class, recall_train_per_class)
  print("Validation information")
  util.print_mean(classes, accuracy_validation_per_class,
                  f1_score_validation_per_class,
                  precision_validation_per_class, recall_validation_per_class)
  util.print_std(classes, accuracy_validation_per_class,
                 f1_score_validation_per_class,
                 precision_validation_per_class, recall_validation_per_class)
  print("Test information")
  util.print_mean(classes, accuracy_test_per_class, f1_score_test_per_class,
                  precision_test_per_class, recall_test_per_class)
  util.print_std(classes, accuracy_test_per_class, f1_score_test_per_class,
                 precision_test_per_class, recall_test_per_class)

  plt.plot(accuracy_train_scores, accuracy_validation_scores, EPOCHS,
           "Training", "Validation", "Accuracy")
  plt.plot(precision_train_scores, precision_validation_scores, EPOCHS,
           "Training", "Validation", "Precision")
  plt.plot(recall_train_scores, recall_validation_scores, EPOCHS,
           "Training", "Validation", "Recall")
  # TODO: fix loss plot
  # plt.plot_loss(losses, val_losses, epochs)

  for c in classes:
    util.save_to_file_per_class(
        accuracy_train_per_class[c], accuracy_validation_per_class[c],
        precision_train_per_class[c], precision_validation_per_class[c],
        recall_train_per_class[c], recall_validation_per_class[c],
        accuracy_test_per_class[c], precision_test_per_class[c],
        recall_test_per_class[c], "logs_per_class_" + str(c) + ".txt")
  util.save_to_file(accuracy_train_scores, accuracy_validation_scores,
                    precision_train_scores, precision_validation_scores,
                    recall_train_scores, recall_validation_scores,
                    accuracy_test_scores, precision_test_scores,
                    recall_test_scores, train_error, validation_error,
                    test_error, "logs.txt")
  return
def setUpClass(cls):
  super().setUpClass()
  cls._params = params.Params()
  cls._yamnet = yamnet.yamnet_frames_model(cls._params)
  cls._yamnet.load_weights('yamnet.h5')
  cls._yamnet_classes = yamnet.class_names('yamnet_class_map.csv')
def load_model():
  params = yamnet_params.Params()
  yamnet = yamnet_model.yamnet_frames_model(params)
  yamnet.load_weights('yamnet.h5')
  yamnet_classes = yamnet_model.class_names('yamnet_class_map.csv')
  return yamnet, yamnet_classes
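# A minimal usage sketch for load_model() above; 'example.wav' is a
# placeholder path, and the clip is assumed to already be 16 kHz mono so the
# resampling step used elsewhere in this file can be skipped.
import numpy as np
import soundfile as sf

yamnet, yamnet_classes = load_model()
wav_data, _ = sf.read('example.wav', dtype=np.int16)
waveform = (wav_data / 32768.0).astype('float32')
scores, embeddings, spectrogram = yamnet(waveform)
prediction = np.mean(scores, axis=0)
print('top class:', yamnet_classes[np.argmax(prediction)])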
def __init__(self, *args, **kwargs):
  # Constructor, builds the tkinter app and the used frames.
  # Run the base class init.
  tk.Tk.__init__(self, *args, **kwargs)

  # TODO: Initialize Yamnet.
  self.yamnet = yamnet_model.yamnet_frames_model(params)
  self.yamnet.load_weights('yamnet.h5')

  # Prepare the visualization graph. Tight layout for fitting better.
  self.figure, self.axs = plt.subplots(10, figsize=(10, 10))
  plt.tight_layout()

  # Prepare colors. Colors are xkcd-colors in random order.
  with open('colors.txt', 'r') as colorfile:
    self.colors = colorfile.readlines()
  # Strip newlines for more efficient use.
  for i in range(len(self.colors)):
    self.colors[i] = self.colors[i][:-1]

  # Prepare Yamnet class names.
  with open('classes.txt', 'r') as classesfile:
    self.classes = classesfile.readlines()
  # Strip newlines for more efficient use.
  for i in range(len(self.classes)):
    self.classes[i] = self.classes[i][:-1]

  # Base frame to build the used frames from.
  container = tk.Frame(self)
  container.pack(side="top", fill="both", expand=True)

  # Dict of used frames.
  self.frames = {}
  # Build each used frame, initialize a grid for them.
  for F in (GraphPage,):
    frame = F(container, self)
    self.frames[F] = frame
    frame.grid(row=0, column=0, sticky="nsew")

  # Bring StartPage on top for the user.
  self.show_frame(GraphPage)

  # Declare class variables used in animation.
  self.xList = np.linspace(-30, -1, 30)
  # Prepare the yamnet-format results, 521 classes for 30 seconds.
  self.data = np.zeros((521, 30))
  # Prepare the weights used to rank classification results.
  self.scores = np.zeros(521)

  # Start audio recording.
  self.rec = Recorder(channels=1)
  self.recfile = self.rec.open('sample.wav', 'wb')
  self.recfile.start_recording()

  # After a second, start animating.
  self.after(1000, self.animate)
yamnet_params = {
    k: params.__dict__[k] for k in params.__dict__ if k == k.upper()
}
for yamnet_param in yamnet_params:
  print(yamnet_param + " = " + str(yamnet_params[yamnet_param]))
print("")

# Load YAMNet.
# We turn the YAMNet model into a two-output model:
#   1. first output is the convnet embedding (task-agnostic)
#   2. second output is the audio event classification (task = AudioSet labels)
tf.get_logger().setLevel('ERROR')
graph = tf.Graph()
with graph.as_default():
  yamnet_model = yamnet.yamnet_frames_model(params)
  yamnet_model_path = os.path.join(yamnet_dir, "yamnet.h5")
  yamnet_model.load_weights(yamnet_model_path)
  yamnet_multi_model = tf.keras.Model(
      inputs=yamnet_model.inputs,
      outputs=[yamnet_model.layers[-4].output, yamnet_model.output])

# Initialize HDF5 folder for prediction.
data_dir = os.path.split(sensor_dir)[0]
out_pred_dir = os.path.join(data_dir, "covid_yamnet-pred")
os.makedirs(out_pred_dir, exist_ok=True)
h5_path = os.path.join(out_pred_dir, sonycnode_str + "_yamnet-pred.h5")

# Initialize NPZ folder for features.
out_features_dir = os.path.join(out_dir, "covid_yamnet-features")
os.makedirs(out_features_dir, exist_ok=True)
def main(argv):
  assert argv, 'Usage: inference.py <wav file> <wav file> ...'

  params = yamnet_params.Params()
  yamnet = yamnet_model.yamnet_frames_model(params)
  yamnet.load_weights('yamnet.h5')
  yamnet_classes = yamnet_model.class_names('yamnet_class_map.csv')

  for file_name in argv:
    # Decode the WAV file.
    wav_data, sr = sf.read(file_name, dtype=np.int16)
    assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype
    print('waveform original data', wav_data.shape)
    waveform = wav_data / 32768.0  # Convert to [-1.0, +1.0]
    waveform = waveform.astype('float32')
    print('waveform normalized data', waveform.shape)
    print('sampling rate', sr)
    print('sampling rate model params', params.sample_rate)

    # Convert to mono and the sample rate expected by YAMNet.
    if len(waveform.shape) > 1:
      print('entered')
      waveform = np.mean(waveform, axis=1)
    if sr != params.sample_rate:
      waveform = resampy.resample(waveform, sr, params.sample_rate)

    # plt.figure(figsize=(20, 8))
    # plt.plot(waveform)
    # plt.xlabel('Samples')
    # plt.ylabel('Amplitude')
    # # plt.savefig('waveform.png')
    # plt.show()
    # plt.close()
    print('waveform resampled data', waveform.shape)

    # Predict YAMNet classes.
    scores, embeddings, spectrogram = yamnet(waveform)
    print('scores', scores)
    # Scores is a matrix of (time_frames, num_classes) classifier scores.
    # Average them along time to get an overall classifier output for the clip.
    prediction = np.mean(scores, axis=0)
    # Report the highest-scoring classes and their scores.
    top5_i = np.argsort(prediction)[::-1][:5]
    print(file_name, ':\n' +
          '\n'.join('  {:12s}: {:.3f}'.format(yamnet_classes[i], prediction[i])
                    for i in top5_i))
    truth_labels = [yamnet_classes[i] for i in top5_i]
    print('ground labels', truth_labels)

    total_time = 0
    # plt.figure(figsize=(20, 8))
    # plt.plot(scores[:, 282].numpy(), label='water')
    # plt.plot(scores[:, 364].numpy(), label='faucet')
    # plt.plot(scores[:, 365].numpy(), label='sink')
    # plt.legend()
    # plt.show()
    # plt.close()
    for i in range(len(scores)):
      pred = scores[i]
      water_prob = pred[282].numpy()
      print('water_prob', water_prob)
      top5_i = np.argsort(pred)[::-1][:5]
      print(file_name, ':\n' +
            '\n'.join('  {:12s}: {:.3f}'.format(yamnet_classes[i], pred[i])
                      for i in top5_i))
      pred_class = yamnet_classes[top5_i[0]]
      print(pred_class)
      if pred_class in truth_labels:
        total_time += 0.96
    print('total time', total_time / 2)
def main(argv):
  params = yamnet_params.Params()
  yamnet = yamnet_model.yamnet_frames_model(params)
  yamnet.load_weights('yamnet.h5')
  yamnet_classes = yamnet_model.class_names('yamnet_class_map.csv')

  for file_name in argv:
    # Decode the WAV file.
    wav_data, sr = sf.read(file_name, always_2d=False, dtype=np.int16)
    assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype
    print('waveform original data', wav_data.shape)
    waveform = wav_data / 32768.0  # Convert to [-1.0, +1.0]
    waveform = waveform.astype('float32')
    print('waveform normalized data', waveform.shape)
    print('sampling rate', sr)
    print('sampling rate model params', params.sample_rate)

    # Convert to mono and the sample rate expected by YAMNet.
    if len(waveform.shape) > 1:
      print('entered')
      waveform = np.mean(waveform, axis=1)
    if sr != params.sample_rate:
      waveform = resampy.resample(waveform, sr, params.sample_rate)
    print('waveform resampled data', waveform.shape)

    scale = 2.5
    # fig = plt.figure(figsize=(int(scale*4), int(scale*3)))
    # camera = Camera(fig)
    # for i in range(0, len(waveform), int(0.96*params.sample_rate/int(8))):
    #   plt.plot(waveform[:i], color='b')
    #   plt.xlabel('Samples')
    #   plt.ylabel('Amplitude')
    #   camera.snap()
    # animation = camera.animate()
    # animation.save(file_name+'_filename_'+str(scale)+'.mp4')
    # plt.close()

    # Predict YAMNet classes.
    scores, embeddings, spectrogram = yamnet(waveform)
    print('scores', scores)
    # Scores is a matrix of (time_frames, num_classes) classifier scores.
    # Average them along time to get an overall classifier output for the clip.
    prediction = np.mean(scores, axis=0)
    # Report the highest-scoring classes and their scores.
    top5_i = np.argsort(prediction)[::-1][:5]
    print(file_name, ':\n' +
          '\n'.join('  {:12s}: {:.3f}'.format(yamnet_classes[i], prediction[i])
                    for i in top5_i))

    # colors = ['b', 'g', 'r']
    # fig = plt.figure()
    # camera = Camera(fig)
    # plt.xlabel('Time(0.5s)')
    # plt.ylabel('Probability')
    # for j in range(1, len(scores)):
    #   k = 0
    #   for i in top5_i[1:-1]:
    #     x = np.convolve(scores[:j, i].numpy(), np.ones((4,))/4, mode='valid')
    #     # x = scores[:j, i].numpy()
    #     plt.plot(x, color=colors[k])
    #     k += 1
    #   for i in range(1):
    #     camera.snap()
    # plt.legend([yamnet_classes[i] for i in top5_i[1:-1]], loc='upper right')
    # animation = camera.animate(interval=int(1000))
    # # plt.show()
    # # plt.close()
    # animation.save(file_name+'_class_'+str(scale)+'.mp4')

    colors = ['b', 'g', 'r']
    fig = plt.figure()
    camera = Camera(fig)
    plt.xlabel('Time(0.5s)')
    plt.ylabel('volume')
    vol_store = []
    total_vol = 0
    for j in range(len(scores)):
      vol = []
      for i in top5_i[1:-1]:
        # x = np.convolve(scores[j, i].numpy(), np.ones((4,))/4, mode='valid')
        x = scores[j, i].numpy()
        if x > 0.1:
          vol.append(float(1 / 24))
      # print(vol)
      if vol:
        total_vol += np.mean(vol)
        print(total_vol)
      vol_store.append(total_vol)
      # print(vol_store)
      plt.plot(vol_store, color='b')
      camera.snap()
    # plt.legend(, loc='upper right')
    animation = camera.animate(interval=int(1000))
    # plt.show()
    # plt.close()
    animation.save(file_name + '_volume_' + str(scale) + '.mp4')