from vad import VoiceActivityDetector
import argparse
import json


def save_to_file(data, filename):
    with open(filename, 'w') as fp:
        json.dump(data, fp)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='Analyze input wave-file and save detected speech intervals to a json file.')
    # parser.add_argument('inputfile', metavar='INPUTWAVE',
    #                     help='the full path to input wave file')
    parser.add_argument(
        'outputfile', metavar='OUTPUTFILE',
        help='the full path to output json file to save detected speech intervals')
    args = parser.parse_args()

    # Create a speech detection object and read in the audio data.
    # NOTE: the input path is hard-coded here ('FILE_NAME.wav' is a
    # placeholder); the commented-out 'inputfile' argument above would make
    # it configurable again.
    v = VoiceActivityDetector('FILE_NAME.wav')

    # Detect speech in the given data.
    raw_detection = v.detect_speech()

    # Convert the detected speech signals to a readable format.
    speech_labels = v.convert_windows_to_readible_labels(raw_detection)

    # Save the converted and detected speech to a file.
    save_to_file(speech_labels, args.outputfile)
import wave

import numpy as np
from pydub import AudioSegment  # x below is assumed to be an AudioSegment created earlier
from vad import VoiceActivityDetector

# Export the mono version and re-open it for analysis.
x.export(FILE_FOLDER + FILE_NAME + '-mono.wav', format="wav")
waveFile = wave.open(FILE_FOLDER + FILE_NAME + '-mono.wav', 'r')

# Getting some variables from the file.
signal = waveFile.readframes(-1)
signal = np.frombuffer(signal, dtype=np.int16)  # np.fromstring is deprecated
frameRate = waveFile.getframerate()
Time = np.linspace(0, len(signal) / frameRate, num=len(signal))
totalFrames = waveFile.getnframes()
wavFileLengthSec = totalFrames / frameRate

# VOICE ACTIVITY DETECTION (VAD)
vad = VoiceActivityDetector(fullFileName)
raw_detection = vad.detect_speech()
speech_labels, starters, enders = vad.convert_windows_to_readible_labels(
    raw_detection)

# List of beginnings of voice-detected segments (in frames).
starterFrames = [int(i) for i in starters]
# List of endings of voice-detected segments (in frames).
enderFrames = [int(i) for i in enders]

# If the last voice-detected segment isn't closed, use the last frame as
# the end of that segment.
if len(starterFrames) > len(enderFrames):
    enderFrames.append(totalFrames)

# List of beginnings of voice-detected segments (in milliseconds).
starterMs = [int((i / frameRate) * 1000) for i in starterFrames]
# List of endings of voice-detected segments (in milliseconds).
enderMs = [int((i / frameRate) * 1000) for i in enderFrames]
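
# A minimal sketch (not part of the original script) of one way to use the
# starterMs/enderMs pairs computed above: pydub AudioSegments slice by
# milliseconds, so each detected speech interval can be exported as its own
# clip. The '-speech-<n>.wav' naming scheme here is an assumption.
monoAudio = AudioSegment.from_wav(FILE_FOLDER + FILE_NAME + '-mono.wav')
for idx, (startMs, endMs) in enumerate(zip(starterMs, enderMs)):
    clip = monoAudio[startMs:endMs]  # pydub slicing is in milliseconds
    clip.export(FILE_FOLDER + FILE_NAME + '-speech-' + str(idx) + '.wav',
                format='wav')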
def run(self):
    """Consume recorded wav files, run VAD on each, and append the results
    to a per-day json file. Relies on module-level state set up by the
    recording side (q, qSoundJson, dir_path, sense, goingFlag, fileNum,
    lengthRecording, timeStart). Ported from Python 2 (print statements and
    backtick repr) to Python 3.
    """
    #global my_mutex
    global numRecordings
    global audioNum
    global processNum
    megaArray = []
    currDay = 1
    fileChar = "A"
    wavFile = ""
    #while processNum < numRecordings:
    while goingFlag or processNum < audioNum:
        # Busy-wait until a new recording is available.
        while processNum >= audioNum:
            y = 0
        #print(processNum)
        try:
            if processNum > 0:
                subprocess.check_output(['rm', wavFile])
            #jsonFile = "/home/pi/results" + str(processNum) + ".json"
            wavFile = dir_path + "/talking" + str(processNum) + ".wav"
            print("Processing " + wavFile)
            #foFile = "fo" + str(processNum) + ".fo"
            #pmFile = "pm" + str(processNum) + ".pm"
            timeEnd = q.get()
            tempDay = currDay
            currDay = qSoundJson.get()
            # Start a fresh results array at each day rollover.
            if tempDay != currDay:
                megaArray = []
            soundName = (dir_path + "/data/day" + str(currDay) + "/" +
                         fileChar + "/sound" + str(fileNum) + ".json")
            if fileChar == "A":
                fileChar = "B"
            else:
                fileChar = "A"
            print(timeEnd)
            #print(subprocess.check_output(['python', '/home/pi/detectVoiceInWav.py', wavFile, jsonFile, str(timeEnd)]))
            v = VoiceActivityDetector(wavFile)
            ampData = v._calculate_amplitude(v.data)
            if np.percentile(ampData, 75) < 200:
                # Quiet recording: skip the full detector and emit an empty
                # result with summary amplitude statistics only.
                print("AUTO")
                speech_labels = []
                speech_dict = {}
                speech_dict['percent_time_w_speech'] = 0.0
                speech_dict['time_started'] = round(timeEnd, 3)
                speech_dict['length_of_recording'] = round(lengthRecording, 3)
                speech_dict['mean_amp'] = round(
                    float(sum(ampData)) / len(ampData), 3)
                speech_dict['med_amp'] = round(np.median(ampData), 3)
                speech_dict['25_amp'] = round(np.percentile(ampData, 25), 3)
                speech_dict['75_amp'] = round(np.percentile(ampData, 75), 3)
                speech_dict['max_amp'] = round(np.amax(ampData), 3)
                speech_dict['processed'] = 0
                speech_labels.append(speech_dict)
                print("AUTO DONE")
            else:
                # Loud enough: run the full speech detector.
                print("MANUAL")
                raw_detection = v.detect_speech()
                #print(raw_detection)
                speech_labels = v.convert_windows_to_readible_labels(
                    raw_detection, str(timeEnd))
                print("MANUAL DONE")
            megaArray.append(speech_labels)
            save_to_file(megaArray, soundName)
            #print(subprocess.check_output(['./reaper/REAPER/build/reaper', '-i', wavFile, '-f', foFile, '-p', pmFile, '-a']))
            processNum = processNum + 1
        except subprocess.CalledProcessError as e:
            logging.exception("message")

    # Clean up the last wav, clear the Sense HAT display, and write stats.
    subprocess.check_output(['rm', wavFile])
    sense.clear()
    with open("processWav.txt", "w") as text_file:
        text_file.write("Total wavs processed: {}".format(str(processNum)))
    with open("createWav.txt", "w") as t2:
        t2.write("Total wavs supposed to be processed: {}".format(
            str(audioNum)))
    currTime = time.time()
    currTime = currTime - timeStart
    with open("timeTaken.txt", "w") as t3:
        t3.write("Time taken to process wavs: {}".format(str(currTime)))
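
# A minimal standalone sketch of the amplitude gate used in run() above: the
# "75th percentile below 200" threshold is taken from that code, but the
# helper name is an assumption, and _calculate_amplitude is a private method
# of the vad module rather than documented public API.
import numpy as np
from vad import VoiceActivityDetector


def is_probably_silent(wav_path, threshold=200):
    """Cheap pre-check: skip the full detector when a clip looks silent."""
    v = VoiceActivityDetector(wav_path)
    amplitude = v._calculate_amplitude(v.data)
    return np.percentile(amplitude, 75) < threshold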
import os

import cv2
import numpy as np
from scipy.io import wavfile
from vad import VoiceActivityDetector


def main():
    datadir = './dataset/train/audio/'
    labels = {
        'yes': 0, 'no': 1, 'up': 2, 'down': 3, 'left': 4,
        'right': 5, 'on': 6, 'off': 7, 'go': 8, 'stop': 9
    }
    labels_list = np.array([
        'yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'go', 'stop'
    ])
    train_in = []
    train_out = []
    test_in = []
    test_out = []
    removal_thr = 0
    vadcount = 0
    for word, label in labels.items():
        print(word)
        dirpath = datadir + word
        files = os.listdir(dirpath)
        np.random.shuffle(files)
        for i, fn in enumerate(files):
            filepath = os.path.join(dirpath, fn)
            sample_rate, samples = wavfile.read(filepath)
            #print(filepath)

            # Run the voice activity detector and keep only the detected
            # speech segments.
            v = VoiceActivityDetector(filepath)
            detected_windows = v.detect_speech()
            speechtime = v.convert_windows_to_readible_labels(detected_windows)
            silence_free = np.array([])
            if len(speechtime) != 0:
                for segment in speechtime:
                    n0 = int(segment['speech_begin'] * sample_rate)
                    n1 = int(segment['speech_end'] * sample_rate)
                    silence_free = np.append(silence_free, samples[n0:n1])
            #print(len(silence_free))

            # IMPORTANT: check the quality of the silence removal; clips
            # that come out too short will crash the spectrogram step, so
            # fall back to the raw samples in that case.
            if len(silence_free) >= 3200:
                vadcount += 1
                freqs, times, spectrogram = log_specgram(silence_free,
                                                         sample_rate,
                                                         window_size=10,
                                                         step_size=5)
            else:
                freqs, times, spectrogram = log_specgram(samples,
                                                         sample_rate,
                                                         window_size=10,
                                                         step_size=5)
            #plt.pcolormesh(freqs, times, spectrogram, cmap='coolwarm')
            #plt.show()

            # Average spectrogram shape is (97.3, 161); note that x and y
            # are reversed in cv2, so dsize is (width, height).
            spectrogram = cv2.resize(spectrogram, dsize=(161, 100),
                                     interpolation=cv2.INTER_CUBIC)
            spectrogram = spectrogram.reshape(100, 161, 1)

            # Hold out the first fifth of each (shuffled) class as test data.
            if i < len(files) / 5:
                test_in.append(spectrogram)
                test_out.append(one_hot_encode(label))
            else:
                train_in.append(spectrogram)
                train_out.append(one_hot_encode(label))

    train_in = np.array(train_in, dtype=np.float32)
    train_out = np.array(train_out, dtype=np.int32)
    test_in = np.array(test_in, dtype=np.float32)
    test_out = np.array(test_out, dtype=np.int32)
    print(train_in.shape)
    print(train_out.shape)
    print(test_in.shape)
    print(test_out.shape)
    print(vadcount, 'audios were processed with VAD.')
    save_filename = './mini_speech_data2.npz'
    np.savez(save_filename,
             train_in=train_in,
             train_out=train_out,
             test_in=test_in,
             test_out=test_out,
             labels=labels_list)
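
# main() above assumes two helpers that are not shown: log_specgram and
# one_hot_encode. Below are minimal sketches consistent with how they are
# called (window_size/step_size in milliseconds, a (time, freq) spectrogram,
# a 10-class one-hot vector); the exact originals may differ.
from scipy import signal


def log_specgram(audio, sample_rate, window_size=20, step_size=10, eps=1e-10):
    """Log-magnitude spectrogram with window/hop given in milliseconds."""
    nperseg = int(round(window_size * sample_rate / 1e3))
    step = int(round(step_size * sample_rate / 1e3))
    freqs, times, spec = signal.spectrogram(audio,
                                            fs=sample_rate,
                                            window='hann',
                                            nperseg=nperseg,
                                            noverlap=nperseg - step,
                                            detrend=False)
    # Transpose to (time, freq) and log-compress; eps avoids log(0)
    # in silent frames.
    return freqs, times, np.log(spec.T.astype(np.float32) + eps)


def one_hot_encode(label, num_classes=10):
    """Return a one-hot vector for an integer class label."""
    vec = np.zeros(num_classes, dtype=np.int32)
    vec[label] = 1
    return vec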
from vad import VoiceActivityDetector
import argparse
import json


def save_to_file(data, filename):
    with open(filename, 'w') as fp:
        json.dump(data, fp)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='Analyze input wave-file and save detected speech intervals to a json file.')
    parser.add_argument('inputfile', metavar='INPUTWAVE',
                        help='the full path to input wave file')
    parser.add_argument('outputfile', metavar='OUTPUTFILE',
                        help='the full path to output json file to save detected speech intervals')
    args = parser.parse_args()

    v = VoiceActivityDetector(args.inputfile)
    raw_detection = v.detect_speech()
    speech_labels = v.convert_windows_to_readible_labels(raw_detection)
    save_to_file(speech_labels, args.outputfile)
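
# Example invocation of the script above (the script filename here is
# illustrative, not taken from the source):
#
#   python detect_voice_in_wav.py input.wav output.json
#
# The resulting json is a list of interval dicts such as
#   [{"speech_begin": 0.5, "speech_end": 2.1}, ...]
# matching the 'speech_begin'/'speech_end' keys read by the dataset-prep
# code elsewhere in this collection.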