def start_playing(self, filename):
    """
    Begin playing the given wav file and classifying its audio.

    The wav file is consumed on a background thread while a UI animation
    callback redraws the sliding spectrogram image, so that UI updates do not
    interfere with the smoothness of the audio playback.
    """
    # create the speaker lazily, the first time something is played.
    if self.speaker is None:
        self.speaker = speaker.Speaker()

    self.stop()
    self.reading_input = False

    self.wav_file = wav_reader.WavReader(self.sample_rate, self.channels, self.auto_scale)
    self.wav_file.open(filename, self.featurizer.input_size, self.speaker)

    def refresh_ui(frame_index):
        # the frame index supplied by the animation is not needed here.
        return self.on_ui_update()

    # stop the timer of any animation that is still registered.
    if self.animation:
        self.animation.event_source.stop()

    self.reading_input = True

    # Start animation timer for updating the UI (e.g. spectrogram image)
    self.animation = self.spectrogram_widget.begin_animation(refresh_ui)

    # start background thread to read and classify the audio.
    self.featurizer.open(self.wav_file)
    worker = Thread(target=self.on_read_features, args=())
    worker.daemon = True
    self.read_input_thread = worker
    worker.start()
def start_playing(self, filename):
    """
    Begin playing the given wav file and classifying its audio.

    The wav file is consumed on a background thread while a matplotlib
    FuncAnimation callback redraws the sliding spectrogram image, so that UI
    updates do not interfere with the smoothness of the audio playback.
    """
    # create the speaker lazily, the first time something is played.
    if self.speaker is None:
        self.speaker = speaker.Speaker()

    self.stop()
    self.reading_input = False

    self.wav_file = wav_reader.WavReader(self.sample_rate, self.channels)
    self.wav_file.open(filename, self.featurizer.input_size, self.speaker)

    def redraw_frame(frame_index):
        # Called by the animation timer for every frame.
        self.process_output()
        if not self.reading_input:
            # input has ended, so schedule the stopped handler.
            self.after(1, self.on_stopped)
        self.set_spectrogram_image()
        # blit=True requires the list of artists that were redrawn.
        return (self.spectrogram_image,)

    # stop the timer of any animation that is still registered.
    if self.animation:
        self.animation.event_source.stop()

    self.reading_input = True

    # Start animation timer for updating the UI (e.g. spectrogram image) (30 fps is usually fine)
    self.animation = animation.FuncAnimation(self.features_figure, redraw_frame, interval=33, blit=True)

    # start background thread to read and classify the audio.
    self.featurizer.open(self.wav_file)
    worker = Thread(target=self.on_read_features, args=())
    worker.daemon = True
    self.read_input_thread = worker
    worker.start()
def get_wav_features(input_filename, transform, sample_rate, window_size, shift):
    """
    Transform the given .wav input file into a stream of feature rows.

    The file is opened through a mono WavReader at the requested sample_rate
    in chunks of transform.input_size, passed through the featurizing
    transform, and then grouped by a sliding window.  The window_size is the
    number of features we need to give the classifier and the shift is the
    amount by which that window slides as new transformed features are added.

    This is a generator: it yields each window flattened with np.ravel.
    """
    channels = 1  # we only do mono audio right now...
    source = wav_reader.WavReader(sample_rate, channels)
    source.open(input_filename, transform.input_size)

    # apply the featurizing transform
    transform.open(source)
    source = lazy_apply_transform(transform)

    # and apply the classifier window frame size
    source = sliding_window_frame(source, window_size, shift)

    for row in source:
        yield np.ravel(row)
def play_sound(wavfile):
    """
    Open the given wav file with a Speaker attached and read it to the end.

    Chunks of 512 are requested from the reader; the returned buffers are
    discarded, the loop only exists to pump the reader until it is exhausted.
    """
    import speaker
    import wav_reader

    source = wav_reader.WavReader()
    source.open(wavfile, 512, speaker.Speaker())

    # keep reading until the reader signals the end by returning None.
    while source.read() is not None:
        pass
def open_noise(self):
    """
    Turn mixing on and, if no noise reader is open yet, open the next noise file.

    Always sets self.mix to True and self.count to 1.  When self.noise_reader
    is None a new WavReader is opened on noise_files[noise_index] using the
    buffer size of the main wav reader, and noise_index advances, wrapping
    back to 0 after the last file.
    """
    self.mix = True
    self.count = 1

    # a noise reader is already open, nothing more to do.
    if self.noise_reader is not None:
        return

    chunk_size = self.wav_reader.buffer_size
    self.noise_reader = wav_reader.WavReader(self.requested_rate, self.requested_channels)
    self.noise_reader.open(self.noise_files[self.noise_index], chunk_size)

    # move on to the next noise file, cycling round at the end of the list.
    self.noise_index += 1
    if self.noise_index == len(self.noise_files):
        self.noise_index = 0
def test_keyword_spotter(featurizer_model, classifier_model, categories, wav_files, threshold, sample_rate,
                         output_speaker=False, auto_scale=False, reset=False):
    """
    Run the keyword spotter over a directory of .wav files, or over the
    microphone when wav_files is empty.

    Every file in the wav_files directory is visited in sorted order; files
    whose extension is not ".wav" are reported and skipped.  When
    output_speaker is set a Speaker is handed to each WavReader, and when
    reset is set the predictor is reset after each file.

    Returns the list of get_prediction results, one per processed input.
    Raises an Exception if the featurizer and classifier do not agree on
    using_map, or if wav_files is not a directory.
    """
    predictor = classifier.AudioClassifier(classifier_model, categories, threshold, SMOOTHING)
    transform = featurizer.AudioTransform(featurizer_model, predictor.input_size)

    if transform.using_map != predictor.using_map:
        raise Exception("cannot mix .ell and compiled models")

    the_speaker = speaker.Speaker() if output_speaker else None
    results = []

    if not wav_files:
        # no directory given, so take the audio from the microphone.
        reader = microphone.Microphone(True, True)
        reader.open(transform.input_size, sample_rate, CHANNELS)
        print("Please type 'x' and enter to terminate this app...")
        results.append(get_prediction(reader, transform, predictor, categories))
        return results

    if not os.path.isdir(wav_files):
        raise Exception("--wav_files {} dir not found".format(wav_files))

    for filename in sorted(os.listdir(wav_files)):
        if os.path.splitext(filename)[1] != ".wav":
            print("Skipping non-wav file: ", filename)
            continue

        reader = wav_reader.WavReader(sample_rate, CHANNELS, auto_scale)
        path = os.path.join(wav_files, filename)
        print("opening ", path)
        reader.open(path, transform.input_size, the_speaker)
        results.append(get_prediction(reader, transform, predictor, categories))
        if reset:
            predictor.reset()

    return results
def test_keyword_spotter(featurizer_model, classifier_model, categories, wav_file, threshold, sample_rate,
                         output_speaker=False):
    """
    Run the keyword spotter over one wav file, or over the microphone when
    wav_file is empty, printing every detection as it happens.

    Returns a tuple (prediction, probability, label, average_time) for the
    detection with the highest probability, where average_time is the sum of
    the predictor and transform average times.

    Raises an Exception if the featurizer and classifier do not agree on
    using_map, or if no prediction was produced at all.
    """
    predictor = classifier.AudioClassifier(classifier_model, categories, threshold, SMOOTHING)
    transform = featurizer.AudioTransform(featurizer_model, predictor.input_size)

    if transform.using_map != predictor.using_map:
        raise Exception("cannot mix .ell and compiled models")

    # set up inputs and outputs
    if wav_file:
        the_speaker = speaker.Speaker() if output_speaker else None
        reader = wav_reader.WavReader(sample_rate, CHANNELS)
        reader.open(wav_file, transform.input_size, the_speaker)
    else:
        reader = microphone.Microphone(True)
        reader.open(transform.input_size, sample_rate, CHANNELS)
        print("Please type 'x' and enter to terminate this app...")

    transform.open(reader)
    best = None
    try:
        while True:
            feature_data = transform.read()
            if feature_data is None:
                break

            prediction, probability, label = predictor.predict(feature_data)
            if probability is None:
                continue

            # remember the most confident detection seen so far.
            if not best or best[1] < probability:
                best = (prediction, probability, label)
            print("<<< DETECTED ({}) {}% '{}' >>>".format(prediction, int(100 * probability), label))
    except KeyboardInterrupt:
        # Ctrl+C just ends the run; the summary below is still produced.
        pass

    transform.close()

    average_time = predictor.avg_time() + transform.avg_time()
    print("Average processing time: {}".format(average_time))

    if best is None:
        raise Exception("test_keyword_spotter failed to find any predictions!")

    return best + (average_time,)
def get_wav_features(input_filename, transform, sample_rate, window_size, shift, auto_scale, mixer):
    """
    Transform the given .wav input file into a stream of feature rows.

    The file is opened through a mono WavReader at the requested sample_rate
    in chunks of transform.input_size.  When a mixer is supplied it is opened
    on top of the reader and becomes the audio source.  The audio is then
    passed through the featurizing transform and grouped by a sliding window.
    The window_size is the number of features we need to give the classifier
    and the shift is the amount by which that window slides as new
    transformed features are added.

    This is a generator: it yields each window flattened with np.ravel.
    Errors raised while windowing are reported with print and end the
    generator rather than propagating, and a message is also printed when no
    rows were produced for the file.
    """
    channels = 1  # we only do mono audio right now...
    source = wav_reader.WavReader(sample_rate, channels, auto_scale)
    source.open(input_filename, transform.input_size)
    if mixer:
        mixer.open(source)
        source = mixer

    # apply the featurizing transform
    transform.open(source)
    source = lazy_apply_transform(transform)

    # Bound before the try block so that the "no rows" check at the end is
    # always valid, even if sliding_window_frame itself raises.
    rows_generated = 0

    # and apply the classifier window frame size
    try:
        source = sliding_window_frame(transform, source, window_size, shift, mixer)
        for row in source:
            features = np.ravel(row)
            rows_generated += 1
            yield features
    except Exception as e:
        # deliberately best-effort: report the problem and stop this file.
        print("### error transforming input file {}: {}".format(
            input_filename, e))

    if rows_generated == 0:
        print(
            "### no rows generated for input file: {}".format(input_filename))
def RunTest(self, featurizer_model, classifier_model, list_file, dataset, categories, sample_rate,
            ignore_label):
    """
    Evaluate a featurizer/classifier pair against a test set and return the pass rate.

    The test set is either a list_file naming .wav files (one per line, e.g.
    "bed/28497c5b_nohash_0.wav", located under the "audio" folder next to the
    list file) or a dataset.  The dataset may be a file name loadable with
    np.load that provides 'features' and 'labels', or an object exposing
    `features` and `label_names`.  The expected label of a listed wav file is
    the part of its name before the first '/'.

    Each prediction is handed to self.process_prediction together with the
    expected label, a summary is printed, and self.rate is returned.

    Raises an Exception if the featurizer and classifier do not agree on
    using_map, or if neither list_file nor dataset is provided.
    """
    predictor = classifier.AudioClassifier(classifier_model, categories, [ignore_label], THRESHOLD, SMOOTHING)
    transform = featurizer.AudioTransform(featurizer_model, predictor.input_size)

    print("Evaluation with transform input size {}, output size {}".format(
        transform.input_size, transform.output_size))
    print(
        "Evaluation with classifier input size {}, output size {}".format(
            predictor.input_size, predictor.output_size))

    if transform.using_map != predictor.using_map:
        raise Exception("cannot mix .ell and compiled models")

    if list_file:
        with open(list_file, "r") as fp:
            testlist = [e.strip() for e in fp.readlines()]

        wav_dir = os.path.dirname(list_file)

        start = time.time()

        for name in testlist:
            # bed/28497c5b_nohash_0.wav
            expected = name.split('/')[0]
            wav_file = os.path.join(wav_dir, "audio", name)
            # open the wav file.
            reader = wav_reader.WavReader(sample_rate)
            reader.open(wav_file, transform.input_size, None)
            transform.open(reader)
            prediction = self.get_prediction(transform, predictor)
            self.process_prediction(prediction, expected)

    elif dataset:
        if isinstance(dataset, str):
            ds = np.load(dataset)
            features = ds['features']
            labels = ds['labels']
        else:
            features = dataset.features
            labels = dataset.label_names

        start = time.time()

        for index, f in enumerate(features):
            expected = labels[index]
            reader = FeatureReader(f, predictor.input_size)
            prediction = self.get_prediction(reader, predictor)
            self.process_prediction(prediction, expected)

    else:
        # Without this the timing code below would fail on the unbound
        # `start` variable instead of reporting the real problem.
        raise Exception("Missing list_file and dataset arguments")

    end = time.time()
    seconds = end - start

    print("Test completed in {:.2f} seconds".format(seconds))
    print("{} passed, {} failed, pass rate of {:.2f} %".format(
        self.passed, self.failed, self.rate * 100))
    return self.rate
def is_closed(self):
    """ Return True when wav_reader1 is None, False otherwise. """
    no_reader = self.wav_reader1 is None
    return no_reader


if __name__ == "__main__":
    # Command line driver that exercises the AudioNoiseMixer on one wav file.
    parser = argparse.ArgumentParser("Test the AudioNoiseMixer class")
    parser.add_argument("--wav_file", "-w", help=".wav file to process")
    parser.add_argument("--noise_dir", "-n", help="directory of .wav files containing noise")
    parser.add_argument("--mix_ratio", "-r", type=float, default=0.1, help="how much noise to add")
    args = parser.parse_args()

    # collect every .wav file found directly inside the noise directory.
    noise_dir = args.noise_dir
    noise_files = [
        os.path.join(noise_dir, entry)
        for entry in os.listdir(noise_dir)
        if os.path.splitext(entry)[1] == ".wav"
    ]

    # note: this rebinds the name `speaker` from the module to the instance.
    speaker = speaker.Speaker()
    mixer = AudioNoiseMixer(noise_files, mix_ratio=args.mix_ratio, mix_percent=1)

    reader = wav_reader.WavReader(16000, 1)
    reader.open(args.wav_file, 512)
    mixer.open(reader, speaker)

    # pump the mixer until it returns None.
    while mixer.read() is not None:
        pass

    print("finished")
help="Audio channels to use", default=1, type=int) arg_parser.add_argument("--buffer_size", help="Read buffer size", default=512, type=int) arg_parser.add_argument("--code", help="Output c-code for sample data", action="store_true") args = arg_parser.parse_args() # First tell the WavReader what sample rate and channels we want the audio converted to reader = wav_reader.WavReader(args.sample_rate, args.channels, auto_scale=False) # Create a speaker object which we will give to the WavReader. The WavReader will pass # the re-sampled audio to the Speaker so you can hear what it sounds like speaker = speaker.Speaker() # open the reader asking for size chunks of audio, converted to floating point between -1 and 1. reader.open(args.filename, args.buffer_size, speaker) code = args.code # pump the reader until it returns None. In a real app you would assign the results of read() to # a variable so you can process the audio chunks returned. while True: buffer = reader.read() if buffer is None:
parser.add_argument("--speaker", help="Output audio to the speaker.", action='store_true') args = parser.parse_args() predictor = classifier.AudioClassifier(args.classifier, args.categories, args.threshold, SMOOTHING) transform = featurizer.AudioTransform(args.featurizer, predictor.input_size) if transform.using_map != predictor.using_map: raise Exception("cannot mix .ell and compiled models") # set up inputs and outputs if args.wav_file: output_speaker = None if args.speaker: output_speaker = speaker.Speaker() reader = wav_reader.WavReader(args.sample_rate, CHANNELS) reader.open(args.wav_file, transform.input_size, output_speaker) else: reader = microphone.Microphone(True) reader.open(transform.input_size, args.sample_rate, CHANNELS) print("Please type 'x' and enter to terminate this app...") transform.open(reader) try: while True: feature_data = transform.read() if feature_data is None: break else: prediction, probability, label = predictor.predict(feature_data)
def run_test(self, featurizer_model, classifier_model, list_file, max_tests, dataset, categories, sample_rate,
             auto_scale, output_file, algorithm="max", window_size=0):
    """
    Evaluate the given featurizer and classifier models (which may or may not
    be compiled) against a test set and return (self.rate, self.best_time).

    The test set is defined by a list_file or a dataset.  The list file lists
    .wav files, relative to its own folder, which are featurized using the
    given featurizer; when max_tests is set that many entries are sampled at
    random without replacement.  The dataset contains pre-featurized data as
    created by make_dataset.py, given either as a file name for np.load or as
    an object exposing `features` and `label_names`.  The categories define
    the names of the keywords detected by the classifier and the sample_rate
    defines the audio sample rate in Hertz -- all input audio is resampled at
    this rate before featurization.  A window_size of 0 means use the
    classifier input size.

    The collected predictions are written to output_file as JSON.

    Raises an Exception if the featurizer and classifier do not agree on
    using_map, or if neither list_file nor dataset is provided.
    """
    predictor = classifier.AudioClassifier(classifier_model, categories, THRESHOLD, SMOOTHING)
    if window_size == 0:
        window_size = predictor.input_size
    transform = featurizer.AudioTransform(featurizer_model, window_size)

    if not self.silent:
        self.logger.info(
            "Evaluation with transform input size {}, output size {}".format(
                transform.input_size, transform.output_size))
        self.logger.info(
            "Evaluation with classifier input size {}, output size {}".format(
                predictor.input_size, predictor.output_size))

    if transform.using_map != predictor.using_map:
        raise Exception("cannot mix .ell and compiled models")

    results = []

    if list_file:
        with open(list_file, "r") as fp:
            testlist = [line.strip() for line in fp.readlines()]

        wav_dir = os.path.dirname(list_file)

        if max_tests:
            testlist = np.random.choice(testlist, max_tests, replace=False)

        start = time.time()

        for name in testlist:
            # e.g. bed/28497c5b_nohash_0.wav
            expected = name.split('/')[0]
            wav_file = os.path.join(wav_dir, name)

            # open the wav file.
            reader = wav_reader.WavReader(sample_rate, 1, auto_scale)
            reader.open(wav_file, transform.input_size, None)
            transform.open(reader)

            prediction, confidence, _, elapsed = self.get_prediction(
                name, transform, predictor, algorithm)
            self.process_prediction(name, prediction, expected, confidence)
            results.append(prediction)

            # keep track of the fastest prediction seen so far.
            if self.best_time is None or elapsed < self.best_time:
                self.best_time = elapsed

    elif dataset:
        if type(dataset) is str:
            ds = np.load(dataset)
            features = ds['features']
            labels = ds['labels']
        else:
            features = dataset.features
            labels = dataset.label_names

        start = time.time()

        # NOTE(review): unlike the list_file branch, this branch neither adds
        # its predictions to `results` nor forwards `algorithm` -- confirm
        # whether that is intended.
        for index, f in enumerate(features):
            expected = labels[index]
            reader = FeatureReader(f, predictor.input_size)
            name = "row " + str(index)

            prediction, confidence, _, elapsed = self.get_prediction(
                name, reader, predictor)
            self.process_prediction(name, prediction, expected, confidence)

            if self.best_time is None or elapsed < self.best_time:
                self.best_time = elapsed

    else:
        raise Exception("Missing list_file and dataset arguments")

    seconds = time.time() - start

    self.logger.info("Saving '{}'".format(output_file))
    with open(output_file, "w") as f:
        json.dump(results, f)

    self.logger.info("Test completed in {:.2f} seconds".format(seconds))
    self.logger.info("{} passed, {} failed, pass rate of {:.2f} %".format(
        self.passed, self.failed, self.rate * 100))
    self.logger.info("Best prediction time was {} seconds".format(
        self.best_time))
    return self.rate, self.best_time
# Command line options: the wav file to play plus the sample rate and channel
# count the audio should be converted to.
arg_parser.add_argument("filename", help="wav file to play ")
arg_parser.add_argument("--sample_rate", "-s", help="Audio sample rate to use", default=16000, type=int)
arg_parser.add_argument("--channels", "-c", help="Audio channels to use", default=1, type=int)
args = arg_parser.parse_args()

# First tell the WavReader what sample rate and channels we want the audio converted to
reader = wav_reader.WavReader(args.sample_rate, args.channels)

# Create a speaker object which we will give to the WavReader. The WavReader will pass
# the re-sampled audio to the Speaker so you can hear what it sounds like
# NOTE: this rebinds the name `speaker` from the module to the instance.
speaker = speaker.Speaker()

# open the reader asking for 256 size chunks of audio, converted to floating point between -1 and 1.
reader.open(args.filename, 256, speaker)
print("wav file contains sample rate {} and {} channels".format(
    reader.actual_rate, reader.actual_channels))

# pump the reader until it returns None. In a real app you would assign the results of read() to
# a variable so you can process the audio chunks returned.
while reader.read() is not None:
    pass