def split(filename='g1238-20181214-081712-1544750232.37681.wav'):
    """Split a WAV file into acoustic-activity tokens and write each to disk.

    The file is read once with ``wavfile.read`` to get the raw samples (which
    are also plotted) and once through auditok to detect active regions.
    Every detected region is written to ``token_<i>.wav`` in the current
    working directory, and the list of token durations is printed.

    Args:
        filename: Path of the WAV file to split.

    Returns:
        A list of ``[start_sample, end_sample]`` pairs, one per detected
        token, expressed as sample offsets into the input file.
    """
    sr, samples = wavfile.read(filename=filename, mmap=True)
    plt.plot(samples)

    asource = ADSFactory.ads(filename=filename, record=False)
    validator = AudioEnergyValidator(
        sample_width=asource.get_sample_width(), energy_threshold=50)
    # Lengths below are counted in analysis windows
    # (asource.get_block_size() samples each):
    #   min_length=100            shortest accepted activity
    #   max_length=500            longest accepted activity
    #   max_continuous_silence=50 longest silence tolerated inside an activity
    tokenizer = StreamTokenizer(validator=validator, min_length=100,
                                max_length=500, max_continuous_silence=50)

    # Take the window size and rate from the source itself instead of
    # assuming 80 samples per window at 8000 Hz.
    samples_per_window = asource.get_block_size()
    sampling_rate = asource.get_sampling_rate()

    asource.open()
    try:
        tokens = tokenizer.tokenize(asource)
    finally:
        # Release the file handle even if tokenizing fails.
        asource.close()

    stack = []
    durations = []  # seconds per token; avoids shadowing the builtin `sum`
    for i, t in enumerate(tokens):
        # NOTE(review): t[2] is used as an exclusive bound here; if auditok
        # reports the end window inclusively, the last window of each token
        # is dropped — confirm before changing.
        begin = t[1] * samples_per_window
        end = t[2] * samples_per_window
        stack.append([begin, end])
        durations.append((end - begin) / float(sampling_rate))
        wavfile.write('token_' + str(i) + '.wav', sr, samples[begin:end])
    print(durations)
    return stack
def transcribe_audio(self, stereo_path, channels_to_process):
    """Tokenize the requested channel(s) of a stereo file and batch them.

    Args:
        stereo_path: Path of the stereo audio file.
        channels_to_process: ``'A'``, ``'B'`` or ``'AB'``; any other value
            results in no channel being tokenized.

    Raises:
        Exception: If ``stereo_path`` is not an existing file.
    """
    if not os.path.isfile(stereo_path):
        raise Exception("Audio file does not exist.")

    data = self.split_to_mono(stereo_path)
    legs = {'A': data['a_leg'], 'B': data['b_leg']}
    # The metadata dict is kept, but it no longer references the audio legs.
    data['a_leg'] = None
    data['b_leg'] = None

    validator = AudioEnergyValidator(sample_width=data['frame_width'],
                                     energy_threshold=45)
    trimmer = StreamTokenizer(
        validator,
        min_length=self.min_segment_length,
        max_length=self.max_segment_length,
        max_continuous_silence=self.max_continuous_silence,
        mode=StreamTokenizer.DROP_TAILING_SILENCE)

    segments = []

    def _collector(channel):
        # Bind the channel label now so each callback tags its own segments.
        def _collect(frames, start, end):
            segments.append((channel, frames, start, end))
        return _collect

    for channel in ('A', 'B'):
        if channels_to_process not in (channel, 'AB'):
            continue
        source = ADSFactory.ads(
            audio_source=legs[channel],
            record=True,
            # Floor division keeps the block size an integer on Python 3.
            block_size=data['frame_rate'] // self.divisor)
        source.open()
        try:
            trimmer.tokenize(source, callback=_collector(channel))
        finally:
            source.close()

    # Each entry is (channel, frames, start, end); order by end index.
    segments = sorted(segments, key=lambda x: x[3])
    self.batch(segments, data['duration'], data['frame_rate'],
               data['frame_width'], data['nchannels'])
def make_auditok_detector(sample_rate=100):
    """Build a detector function backed by auditok's energy tokenizer.

    Args:
        sample_rate: Number of analysis windows per second.

    Returns:
        A function taking a raw audio byte buffer and returning a boolean
        array with one entry per analysis window, True where a detected
        token covers that window.
    """
    sample_width = 2
    window_size = FRAME_RATE // sample_rate
    energy_gate = AudioEnergyValidator(sample_width=sample_width,
                                       energy_threshold=50)
    tokenizer = StreamTokenizer(
        validator=energy_gate,
        min_length=0.2 * sample_rate,
        max_length=int(5 * sample_rate),
        max_continuous_silence=0.25 * sample_rate,
    )

    def _detect(asegment):
        buffer_source = BufferAudioSource(
            data_buffer=asegment,
            sampling_rate=FRAME_RATE,
            sample_width=sample_width,
            channels=1,
        )
        stream = ADSFactory.ads(audio_source=buffer_source,
                                block_dur=1. / sample_rate)
        stream.open()
        detected = tokenizer.tokenize(stream)

        # Number of windows needed to cover the buffer, rounded up.
        n_samples = len(asegment) // sample_width
        n_windows = (n_samples + window_size - 1) // window_size

        # Difference array: +1 where a token starts, -1 just past its end.
        # The running sum is then positive exactly inside tokens.
        deltas = np.zeros(n_windows + 1, dtype=int)
        for token in detected:
            first, last = token[1], token[2]
            deltas[first] += 1
            deltas[last + 1] -= 1
        coverage = np.cumsum(deltas)[:-1]
        return coverage > 0

    return _detect
def getSplitAudioDurationListBetweenSilence(fileName,eachAudioLen,silencePeriod,energyThreshold=55):
    """Tokenize an audio file into activity regions separated by silence.

    Args:
        fileName: Path of the audio file to analyse.
        eachAudioLen: Maximum token length; multiplied by 100 to get a
            window count (presumably seconds with 10 ms windows — confirm).
        silencePeriod: Maximum silence tolerated inside a token, scaled by
            100 in the same way.
        energyThreshold: Energy threshold passed to the validator.

    Returns:
        The tokens produced by the tokenizer, or an empty list when the file
        could not be processed (the error is written to stderr).
    """
    # Bound up front so the return statement and cleanup are always valid,
    # even when the source cannot be created.
    tokens = []
    asource = None
    try:
        # record=False: the source is read once and never rewound.
        asource = ADSFactory.ads(filename=fileName, record=False)
        validator = AudioEnergyValidator(
            sample_width=asource.get_sample_width(),
            energy_threshold=energyThreshold)
        # Lengths are counted in analysis windows:
        #   min_length=400                        shortest accepted activity
        #   max_length=eachAudioLen*100           longest accepted activity
        #   max_continuous_silence=silencePeriod*100
        tokenizer = StreamTokenizer(
            validator=validator,
            min_length=400,
            max_length=eachAudioLen*100,
            max_continuous_silence=silencePeriod*100)
        asource.open()
        tokens = tokenizer.tokenize(asource)
    except KeyboardInterrupt:
        # As before, an interrupt stops the analysis without propagating.
        pass
    except Exception as e:
        sys.stderr.write(str(e) + "\n")
    finally:
        if asource is not None:
            asource.close()
    return tokens
def _make_auditok_detector(
    sample_rate: int, frame_rate: int, non_speech_label: float
) -> Callable[[bytes], np.ndarray]:
    """Build a detector function backed by auditok's energy tokenizer.

    auditok is imported lazily; if it is missing, an explanatory message is
    logged and the ImportError is re-raised.

    Args:
        sample_rate: Number of analysis windows per second.
        frame_rate: Sampling rate of the audio buffers passed to the detector.
        non_speech_label: Value that windows after a token end should take
            (before clipping to [0, 1]).

    Returns:
        A function mapping a raw audio byte buffer to a float array with one
        entry per analysis window.
    """
    try:
        from auditok import (
            ADSFactory,
            AudioEnergyValidator,
            BufferAudioSource,
            StreamTokenizer,
        )
    except ImportError as e:
        logger.error(
            """Error: auditok not installed! Consider installing it with `pip install auditok`. Note that auditok is GPLv3 licensed, which means that successfully importing it at runtime creates a derivative work that is GPLv3 licensed. For personal use this is fine, but note that any commercial use that relies on auditok must be open source as per the GPLv3!* *Not legal advice. Consult with a lawyer. """
        )
        raise e

    sample_width = 2
    window_size = frame_rate // sample_rate
    energy_gate = AudioEnergyValidator(
        sample_width=sample_width, energy_threshold=50
    )
    tokenizer = StreamTokenizer(
        validator=energy_gate,
        min_length=0.2 * sample_rate,
        max_length=int(5 * sample_rate),
        max_continuous_silence=0.25 * sample_rate,
    )

    def _detect(asegment: bytes) -> np.ndarray:
        buffer_source = BufferAudioSource(
            data_buffer=asegment,
            sampling_rate=frame_rate,
            sample_width=sample_width,
            channels=1,
        )
        stream = ADSFactory.ads(
            audio_source=buffer_source, block_dur=1.0 / sample_rate
        )
        stream.open()
        detected = tokenizer.tokenize(stream)

        # Number of windows needed to cover the buffer, rounded up.
        n_samples = len(asegment) // sample_width
        n_windows = (n_samples + window_size - 1) // window_size

        # Step markers: 1.0 at a token start and (non_speech_label - 1.0)
        # just past its end; the clipped running sum is the per-window label.
        steps = np.zeros(n_windows + 1)
        falling_step = non_speech_label - 1.0
        for token in detected:
            steps[token[1]] = 1.0
            steps[token[2] + 1] = falling_step
        running = np.cumsum(steps)[:-1]
        return np.clip(running, 0.0, 1.0)

    return _detect
def read_split_dir(file):
    """Split a WAV file longer than 4 seconds into per-activity clips.

    Files of 4 seconds or less are left untouched. Otherwise a temporary mono
    copy is tokenized with auditok, each detected region is cut from the
    original file and exported as ``<name>_clip<index>.wav``, and finally the
    temporary copy and the original file are deleted.

    Args:
        file: Path of the WAV file to process.
    """
    # Close the SoundFile handle as soon as the duration is known.
    with sf.SoundFile(file) as f:
        duration = len(f) / f.samplerate  # seconds
    if duration <= 4:
        print(file, 'untouched')
        return

    name = os.path.splitext(file)[0]
    # Put the temporary copy next to the original so a path with a directory
    # component does not yield an invalid name such as '0wavtmp_dir/a.wav'.
    directory, basename = os.path.split(file)
    tmpfile = os.path.join(directory, '0wavtmp_' + basename)

    # Decode the original once; slicing returns new segments.
    original = AudioSegment.from_wav(file)
    original.set_channels(1).export(tmpfile, format="wav")

    try:
        asource = ADSFactory.ads(filename=tmpfile, record=True)
        validator = AudioEnergyValidator(
            sample_width=asource.get_sample_width(), energy_threshold=50)
        # Lengths are counted in analysis windows, not milliseconds.
        # NOTE(review): with a 10 ms window these values mean 5 s minimum,
        # 40 s maximum and 1 s of tolerated silence — confirm that matches
        # the intended 4 s limit.
        tokenizer = StreamTokenizer(validator=validator,
                                    min_length=500,
                                    max_length=4000,
                                    max_continuous_silence=100)
        # Duration of one analysis window in milliseconds.
        window_ms = (1000.0 * asource.get_block_size()
                     / asource.get_sampling_rate())
        asource.open()
        try:
            tokens = tokenizer.tokenize(asource)
        finally:
            # Release the temp file before it is removed below.
            asource.close()

        for index, t in enumerate(tokens):
            # Token bounds are window indices (end inclusive), whereas
            # AudioSegment slicing works in milliseconds.
            start_ms = int(t[1] * window_ms)
            end_ms = int((t[2] + 1) * window_ms)
            chunk_name = "{}_clip{}.wav".format(name, index)
            print("Generating", chunk_name)
            original[start_ms:end_ms].export(chunk_name, format="wav")
    finally:
        # Always remove the temporary mono copy.
        os.remove(tmpfile)
    # Remove the original only after every clip was exported.
    os.remove(file)
def __init__(self):
    """Set up the audio source, tokenizer, player and compiled CNN model."""
    # Recording audio source limited to 4 seconds.
    source = ADSFactory.ads(record=True, max_time=4)
    self.asource = source

    energy_gate = AudioEnergyValidator(sample_width=2, energy_threshold=50)
    self.validator = energy_gate
    self.tokenizer = StreamTokenizer(
        validator=energy_gate,
        min_length=20,
        max_length=1000,
        max_continuous_silence=30,
    )
    self.player = player_for(source)

    # Load the network architecture and weights, then compile it.
    architecture_path = '../model/final_cnn_model.json'
    weights_path = '../model/weights_final_cnn.h5'
    self.model = self.load_cnn(architecture_path, weights_path)
    self.model.compile(
        loss='categorical_crossentropy',
        optimizer='sgd',
        metrics=['accuracy'],
    )
def calibrate(self):
    '''
    Calibrate self.scaler by capturing up to 10 seconds of audio and fitting
    the scaler on each detected audio token, then store the resulting minimum
    and maximum in self.mini and self.maxi.

    Audio comes from self.audioPath, or from the default input device when
    self.audioPath is None.

    The original author notes that this step is redundant: scaling is not
    necessary.
    '''
    # The return value of the prompt is not needed; it only waits for Enter.
    raw_input(
        "Calibrate normalisation, press return then make noises from your mouth hole."
    )
    if self.audioPath is None:
        asource = ADSFactory.ads(sampling_rate=self.sr, max_time=10)
    else:
        asource = ADSFactory.ads(filename=self.audioPath,
                                 sampling_rate=self.sr,
                                 max_time=10)
    validator = AudioEnergyValidator(
        sample_width=asource.get_sample_width(),
        energy_threshold=self.energy)
    tokenizer = StreamTokenizer(validator=validator,
                                min_length=self.min_len,
                                max_length=self.max_len,
                                max_continuous_silence=self.max_con_si)

    def calib_callback(data, start, end):
        # Only the first frame of the token is used. frombuffer replaces the
        # deprecated fromstring and matches runAuditok.
        audio = np.frombuffer(data[0], dtype=np.int8)
        # Reshape to a single-feature column before fitting.
        # NOTE(review): fit_transform is called per token, so later tokens
        # presumably replace the earlier fit — confirm this is intended.
        self.scaler.fit_transform(np.swapaxes(np.asarray([audio]), 0, 1))
        print("Audio sample found {0}--{1}".format(start, end))

    asource.open()
    try:
        tokenizer.tokenize(asource, callback=calib_callback)
    finally:
        # Release the device or file even if a callback fails.
        asource.close()

    print("Scaler paramaters found: min: {0} max: {1}".format(
        self.scaler.data_min_, self.scaler.data_max_))
    print("calibration done")
    self.mini = self.scaler.data_min_
    self.maxi = self.scaler.data_max_
def __init__(self, _useGui):
    """Set up the audio source, tokenizer, recordings folder and optional GUI.

    Args:
        _useGui: When truthy, a Tk root and GUI display are created.

    Raises:
        OSError: If the recordings folder cannot be created for a reason
            other than it already existing.
    """
    # Audio parameters.
    max_length = 1000000
    max_continuous_silence = 500
    min_length = 150
    self.sample_rate = 48000
    self.asource = ADSFactory.ads(record=True,
                                  max_time=max_length,
                                  sampling_rate=self.sample_rate)
    self.sample_width = self.asource.get_sample_width()
    self.channels = self.asource.get_channels()

    # Validator and tokenizer. `energy_threshold` is not defined in this
    # method; it is read from an enclosing scope.
    self.validator = AudioEnergyValidator(
        sample_width=self.sample_width,
        energy_threshold=energy_threshold)
    self.tokenizer = StreamTokenizer(
        validator=self.validator,
        min_length=min_length,
        max_length=max_length,
        max_continuous_silence=max_continuous_silence)

    # One timestamped folder per session.
    self.audio_folder = 'recordings/' + '{:%Y-%m-%d_%H-%M-%S}'.format(
        datetime.datetime.now()) + '/'
    folder = os.path.dirname(self.audio_folder)
    if not os.path.exists(folder):
        try:
            os.makedirs(folder)
        except OSError as exc:
            # Another process may have created it in between; anything else
            # is a real failure and must propagate.
            if exc.errno != errno.EEXIST:
                raise
    os.chmod('recordings', 0o777)
    os.chmod(self.audio_folder, 0o777)

    self.MODE = 'ECHO'
    self.useGui = _useGui
    if self.useGui:
        root = Tk()
        self.display = GUI(root, True)
        self.display.display_image()
def configure(self, rf):
    """Open the YARP ports and set up hotword detection, tokenizer and ASR.

    Args:
        rf: Not used by this method.

    Returns:
        True once configuration has completed.
    """
    # Setting up rpc port
    self.portsList["rpc"] = yarp.Port()
    self.portsList["rpc"].open("/sentence_tokenizer/rpc:i")
    self.attach(self.portsList["rpc"])

    self.portsList["audio_out"] = yarp.BufferedPortBottle()
    self.portsList["audio_out"].open("/sentence_tokenizer/audio:o")

    # Setting up hotword detection
    self.hotword_detector = snowboydecoder.HotwordDetector(
        self.hotword_model, sensitivity=self.hotword_sensitivity)

    # Setting up audio tokenizer to split sentences
    self.audio_source = ADSFactory.ads(record=True,
                                       max_time=self.tok_record_duration,
                                       block_dur=self.tok_window)
    self.tok_validator = AudioEnergyValidator(
        sample_width=self.audio_source.get_sample_width(),
        energy_threshold=self.tok_energy_threshold)
    self.tokenizer_mode = StreamTokenizer.DROP_TRAILING_SILENCE
    self.tokenizer = StreamTokenizer(
        validator=self.tok_validator,
        min_length=self.tok_min_len,
        max_length=self.tok_max_len,
        max_continuous_silence=self.tok_max_silence_duration,
        mode=self.tokenizer_mode)

    if self.echo_enabled:
        self.echo_thread = threading.Thread(target=self.replayAudio)
        self.echo_thread.start()

    if self.hotword_enabled:
        print("Waiting for hotword to start interaction")
        # The blocking detector start is currently disabled, so the second
        # message is printed immediately:
        # self.hotword_detector.start(detected_callback=self.detected_callback,
        #                             interrupt_check=self.interrupt_callback,
        #                             sleep_time=self.hotword_loop_time)
        print("Hotword detected. Starting tokenizer thread")
    else:
        # Function-call form is valid on both Python 2 and 3 and matches the
        # other print calls in this method.
        print("Starting tokenizer thread")

    self.asr = sr.Recognizer()

    with open('google_credentials.json', 'r') as credentials:
        self.google_credentials = credentials.read()
    return True
# The string literal below is an earlier microphone set-up kept as disabled
# code; it has no effect at runtime.
''' # record = True so that we'll be able to rewind the source. # max_time = 10: read 10 seconds from the microphone asource = ADSFactory.ads(record=True) validator = AudioEnergyValidator(sample_width=asource.get_sample_width(), energy_threshold=50) tokenizer = StreamTokenizer(validator=validator, min_length=20, max_length=250, max_continuous_silence=30) player = player_for(asource) asource.open() tokenizer.tokenize(asource, callback=echo) '''
# Audio source: 16 kHz, 16-bit, mono, not recorded, with a 10 ms analysis
# window (block_dur=0.01).
asource = ADSFactory.ads(sampling_rate=16000, sample_width=2, channels=1,
                         frames_per_buffer=128, record=False, block_dur=0.01)
validator = AudioEnergyValidator(sample_width=asource.get_sample_width(),
                                 energy_threshold=50)
# min_length and max_continuous_silence are counted in analysis windows.
# NOTE(review): no max_length is passed here, whereas every other
# StreamTokenizer call in this file supplies one — confirm the installed
# auditok version accepts this call.
tokenizer = StreamTokenizer(validator=validator, min_length=100,
                            max_continuous_silence=500)
asource.open()
# `echo` is defined outside this span; it is called for every detected token.
tokenizer.tokenize(asource, callback=echo)
# set up audio source asource = ADSFactory.ads(record=True, max_time=min_length, sampling_rate=sample_rate) #check os system and set sample rate 48000 for Linux (Raspberry Pi) _os = platform.system() if (_os == 'Darwin') or (_os == 'Windows'): # macOs sample_rate = asource.get_sampling_rate() # get sample width and channels from ads factory sample_width = asource.get_sample_width() channels = asource.get_channels() # START VALIDATOR validator = AudioEnergyValidator(sample_width=sample_width, energy_threshold=energy_threshold) tokenizer = StreamTokenizer( validator=validator, min_length=min_length, max_length=max_length, max_continuous_silence=max_continuous_silence) # # LOAD PYAUDIO p = pyaudio.PyAudio() # start classe memoria _memoria = memoria.Memoria() # gui vars if GUI: root = Tk()
def find_voice_segments(audio_file, music_time_list):
    """Detect voice segments in an audio file, cutting out known music spans.

    The file is read twice. The first pass measures the mean-square energy of
    every analysis window and derives an energy threshold located 40% of the
    way from the mean to the maximum (both in dB). The second pass tokenizes
    the file with that threshold.

    Args:
        audio_file: Path of the audio file to analyse.
        music_time_list: List of ``[music_begin, music_end]`` entries. Entries
            that are consumed while splitting a segment are removed from this
            list in place, as before.

    Returns:
        A list of ``[begin, end]`` pairs (token window indices multiplied by
        10). An empty list is returned when the file contains no frames or
        only zero-energy frames.
    """
    formats = {1: numpy.int8, 2: numpy.int16, 4: numpy.int32}

    # Pass 1: collect the energy of every window. A plain list replaces the
    # previous numpy.empty([]) accumulator, which is a 0-d array holding one
    # uninitialised value that leaked into the mean.
    energies = []
    test_source = ADSFactory.ads(filename=audio_file, record=False)
    test_source.open()
    try:
        sample_dtype = formats[test_source.get_sample_width()]
        while True:
            frame = test_source.read()
            if frame is None:
                break
            signal = numpy.frombuffer(
                frame, dtype=sample_dtype).astype(numpy.float64)
            energies.append(float(numpy.dot(signal, signal)) / len(signal))
    finally:
        test_source.close()

    max_value = max(energies) if energies else 0.0
    if max_value <= 0.0:
        # Nothing to detect, and log10(0) would yield -inf/nan thresholds.
        return []

    log_max = 10. * numpy.log10(max_value)
    log_mean = 10. * numpy.log10(numpy.mean(energies))
    threshold = log_mean + 0.4 * (log_max - log_mean)

    # Pass 2: tokenize with the derived threshold.
    asource = ADSFactory.ads(filename=audio_file, record=False)
    validator = AudioEnergyValidator(
        sample_width=asource.get_sample_width(), energy_threshold=threshold)
    tokenizer = StreamTokenizer(validator=validator,
                                min_length=300,
                                max_length=99999999,
                                max_continuous_silence=300)
    asource.open()
    try:
        tokens = tokenizer.tokenize(asource)
    finally:
        asource.close()

    segments = []
    for t in tokens:
        segment_begin = t[1] * 10
        segment_end = t[2] * 10
        # Iterate over a snapshot: removing from the list being iterated
        # skipped the entry following each removal.
        for item in list(music_time_list):
            # The segment reaches past the start of this music span.
            if segment_end > item[0]:
                # Keep the part before the music...
                segments.append([segment_begin, item[0]])
                # ...and resume after the music ends.
                segment_begin = item[1]
                # Consume the span so later tokens do not reuse it.
                music_time_list.remove(item)
        segments.append([segment_begin, segment_end])
    return segments
def extractEvents(path, patientID):
    """Extract acoustic events from a WAV file into individual WAV files.

    The input is reduced when it has more than one channel, written to a
    fixed scratch file, tokenized with auditok, and every detected token is
    saved as ``<dest_path><name><index>.wav``.

    Args:
        path: Path of the input WAV file.
        patientID: Identifier used as a sub-folder of the recordings folder.

    Returns:
        The destination folder path (with trailing slash).
    """
    # File name without its extension.
    yname = os.path.splitext(os.path.basename(path))[0]
    dest_path = '/home/pi/recordings/' + patientID + '/' + yname + '/'
    if not os.path.exists(dest_path):
        os.makedirs(dest_path)

    fsoriginal, y = wavfile.read(path)  # read audio file
    # Multi-channel data is 2-D (samples, channels); drop the second channel.
    # Mono data is 1-D and is used as is.
    if np.ndim(y) == 2 and np.shape(y)[1] > 1:
        y = np.delete(y, 1, axis=1)

    sample_path = '/home/pi/coughanalysis_ann/sample.wav'
    # Write with the rate the file was read at, rather than assuming 44100 Hz.
    wavfile.write(sample_path, rate=fsoriginal, data=y)

    # The source is read once, so there is no need to record it for rewinding.
    asource = ADSFactory.ads(filename=sample_path, record=False)
    validator = AudioEnergyValidator(
        sample_width=asource.get_sample_width(), energy_threshold=65)
    # Lengths are counted in analysis windows:
    #   min_length=10             shortest accepted activity
    #   max_length=1000           longest accepted activity
    #   max_continuous_silence=40 longest silence tolerated inside an activity
    tokenizer = StreamTokenizer(validator=validator,
                                min_length=10,
                                max_length=1000,
                                max_continuous_silence=40)
    channels = asource.get_channels()
    sample_width = asource.get_sample_width()
    sampling_rate = asource.get_sampling_rate()

    asource.open()
    try:
        tokens = tokenizer.tokenize(asource)
    finally:
        asource.close()

    for count, t in enumerate(tokens):
        # Frames are byte strings; b''.join works on both Python 2 and 3.
        data = b''.join(t[0])
        fp = wave.open(dest_path + yname + str(count) + '.wav', "w")
        try:
            fp.setnchannels(channels)
            fp.setsampwidth(sample_width)
            fp.setframerate(sampling_rate)
            fp.writeframes(data)
        finally:
            fp.close()
    return dest_path
# Drop the second channel when `y` is 2-D with more than one column.
# NOTE(review): the bare except also hides unrelated errors; the unpacking
# fails for 1-D (mono) data, which is presumably the case it is meant to
# absorb — confirm.
try:
    r, c = numpy.shape(y)
    if c > 1:
        y = numpy.delete(y, 1, axis=1)
    # print("audio file shape: ", numpy.shape(y))
except:
    print(' ')

# NOTE(review): the file is written to a relative path but read back from an
# absolute one; these only match when the working directory is that folder —
# confirm.
wavfile.write('sample.wav', data=y, rate=44100)
asource = ADSFactory.ads(
    filename="/home/baswarajmamidgi/salcit/coughanalysis_ann/sample.wav",
    record=True)
validator = AudioEnergyValidator(sample_width=asource.get_sample_width(),
                                 energy_threshold=65)
# Lengths are counted in analysis windows
# (float(asource.get_block_size()) / asource.get_sampling_rate() seconds each):
#   min_length=10             shortest accepted activity
#   max_length=1000           longest accepted activity
#   max_continuous_silence=40 longest silence tolerated inside an activity
tokenizer = StreamTokenizer(validator=validator,
                            min_length=10,
                            max_length=1000,
                            max_continuous_silence=40)
asource.open()
tokens = tokenizer.tokenize(asource)
def runAuditok(self):
    '''
    Capture sound from the audio source specified in self.audioPath (the
    default input device when self.audioPath is None), tokenize it, and for
    every new token send a trigger and the reshaped audio. Audio is
    optionally saved and played back. The process exits with status 0 once
    tokenizing returns.
    '''
    if self.audioPath is None:
        self.asource = ADSFactory.ads(sampling_rate=self.sr)
    else:
        self.asource = ADSFactory.ads(filename=self.audioPath,
                                      sampling_rate=self.sr)
    self.validator = AudioEnergyValidator(
        sample_width=self.asource.get_sample_width(),
        energy_threshold=self.energy)
    self.tokenizer = StreamTokenizer(
        validator=self.validator,
        min_length=self.min_len,
        max_length=self.max_len,
        max_continuous_silence=self.max_con_si)
    self.player = player_for(self.asource)
    self.prev_data = np.zeros([1])

    def audio_callback(data, start, end):
        # Ignore a token identical to the one handled last.
        if not np.array_equal(data, self.prev_data):
            self.sendTrigger()  # send notice that audio has been detected
            print("Acoustic activity at: {0}--{1}".format(start, end))
            stamp = (start, end, self.chunk_count)
            if self.record:
                self.saveAudio(data)
            # One uint8 view per frame of the token.
            copied = [np.frombuffer(x, dtype=np.uint8) for x in data]
            data_rs = self.reshapeAudio(np.asarray(copied))
            self.sendAudio(data_rs, stamp)
            self.prev_data = data
            if self.PLAYBACK:
                # Function-call form is valid on both Python 2 and 3 and
                # matches the other print call in this callback.
                print("playing audio")
                self.playback(data_rs)
            self.chunk_count += 1

    self.asource.open()
    # send notice that the audio has started to be processed
    self.sendTrigger()
    self.tokenizer.tokenize(self.asource, callback=audio_callback)
    sys.exit(0)