def test(file_name):
    """Predict the speaker of one test WAV file and print the result.

    Relies on module-level ``data``, ``model``, and ``speakers``.

    Args:
        file_name: WAV file name, looked up under ``data/test/``.
    """
    demo_file = file_name
    # Test files live under data/test/, not under data.path.
    demo = data.load_wav_file("data/test/" + demo_file)
    result = model.predict([demo])
    # Confidence = highest softmax probability, expressed as a percentage.
    conf = numpy.amax(result) * 100
    result = data.one_hot_to_item(result, speakers)
    print("predicted speaker for %s : result = %s confidence = %.2f"
          % (demo_file, result, conf))
def validateWav(demo_file):
    """Classify one WAV file as snore / not-snore and print both probabilities.

    Loads the file from ``speech_data.snore_train_path`` and runs the
    module-level ``model``.

    Args:
        demo_file: WAV file name relative to the snore training path.

    Returns:
        int: 1 if the snore probability exceeds 0.6, otherwise 0.
    """
    demoData = speech_data.load_wav_file(
        speech_data.snore_train_path + demo_file, 0)
    result = model.predict([demoData])
    # result[0] holds the two class scores — presumably
    # [P(not snore), P(snore)]; verify against the model's label order.
    prob_other = result[0][0]
    prob_snore = result[0][1]
    if prob_snore > 0.6:  # detection threshold
        print(demo_file + ":" + str(prob_other) + " / " + str(prob_snore)
              + ": ISSNORE")
        return 1
    print(demo_file + ":" + str(prob_other) + " / " + str(prob_snore))
    return 0
def handle_speaker_rec_test_intent(self, message):
    """Intent handler: identify the speaker of a hard-coded demo WAV.

    Rebuilds the small dense softmax network, loads pre-trained weights
    from ``<module dir>/model/model.tfl``, classifies one demo recording,
    and speaks the outcome via ``self.speak``.
    """
    speakers = data.get_speakers()
    number_classes = len(speakers)

    # The architecture must match exactly what the checkpoint was
    # trained with, or model.load() will fail.
    net = tflearn.input_data(shape=[None, 8192])  # two wave chunks
    net = tflearn.fully_connected(net, 64)
    net = tflearn.dropout(net, 0.5)
    net = tflearn.fully_connected(net, number_classes, activation='softmax')
    net = tflearn.regression(net, optimizer='adam',
                             loss='categorical_crossentropy')
    model = tflearn.DNN(net)

    # Resolve the checkpoint path relative to this source file.
    CWD_PATH = os.path.dirname(__file__)
    path_to_model = os.path.join(CWD_PATH, 'model', 'model.tfl')
    model.load(path_to_model)

    demo_file = "8_Vicki_260.wav"
    demo = data.load_wav_file(data.path + demo_file)
    result = model.predict([demo])
    result = data.one_hot_to_item(result, speakers)
    if result == "Vicki":
        self.speak("I am confident I'm speaking to %s" % (result))  # ~ 97% correct
    else:
        self.speak("I'm sorry I don't recognize your voice")
# Ten output classes: the spoken digits 0-9.
number_classes = 10

# Classification: 8192 raw samples -> 64-unit hidden layer -> softmax.
tflearn.init_graph(num_cores=8, gpu_memory_fraction=0.5)
net = tflearn.input_data(shape=[None, 8192])
net = tflearn.fully_connected(net, 64)
net = tflearn.dropout(net, 0.5)
net = tflearn.fully_connected(net, number_classes, activation='softmax')
net = tflearn.regression(net, optimizer='adam', loss='categorical_crossentropy')

model = tflearn.DNN(net)
# A few epochs suffice for this demo; overfitting is acceptable.
model.fit(X, Y, n_epoch=5, show_metric=True, snapshot_step=100)

print("================= TEST 결과 ================")
path = "data/number_test/"
files = os.listdir(path)
print("loaded Test batch of %d files" % len(files))

# Classify every WAV in the test directory and report the argmax digit.
for wav in files:
    demo_file = path + wav
    demo = speech_data.load_wav_file(demo_file)
    result = model.predict([demo])
    result = numpy.argmax(result)
    print("== predicted digit for %s :result = %d " % (demo_file, result))
# One output unit per known speaker.
number_classes = len(speakers)

WORD_WAVs = "spoken_words"
batch = data.wave_batch_generator(batch_size=1000, source=WORD_WAVs,
                                  target=data.Target.speaker)
X, Y = next(batch)

# Dense softmax speaker classifier over 8192-sample wave chunks.
tflearn.init_graph(num_cores=8, gpu_memory_fraction=0.5)
net = tflearn.input_data(shape=[None, 8192])  # two wave chunks
net = tflearn.fully_connected(net, 64)
net = tflearn.dropout(net, 0.5)
net = tflearn.fully_connected(net, number_classes, activation='softmax')
net = tflearn.regression(net, optimizer='adam', loss='categorical_crossentropy')
model = tflearn.DNN(net)

# Training is skipped here: restore previously saved weights instead.
model.load("tflearn.dnn.model")

demo_file = "8_Bruce_260.wav"
demo = data.load_wav_file(data.path + demo_file)
result = model.predict([demo])
result = data.one_hot_to_item(result, speakers)

# NOTE(review): saves under a different name than was loaded
# ("tflearn.lstm.model" vs "tflearn.dnn.model") — confirm this is intended.
model.save("tflearn.lstm.model")
print("predicted speaker for %s : result = %s " % (demo_file, result))
# Recorded stats from a previous run:
#   Training Step: 544 | total loss: 0.15866
#   | Adam | epoch: 034 | loss: 0.15866 - acc: 0.9818 -- iter: 0000/1000

# One large batch of digit waveforms for training.
batch = speech_data.wave_batch_generator(10000, target=speech_data.Target.digits)
X, Y = next(batch)

number_classes = 10  # digits 0-9

# Classification: 8192 raw samples -> 64 hidden units -> softmax.
tflearn.init_graph(num_cores=8, gpu_memory_fraction=0.5)
net = tflearn.input_data(shape=[None, 8192])
net = tflearn.fully_connected(net, 64)
net = tflearn.dropout(net, 0.5)
net = tflearn.fully_connected(net, number_classes, activation='softmax')
net = tflearn.regression(net, optimizer='adam', loss='categorical_crossentropy')
model = tflearn.DNN(net)
model.fit(X, Y, n_epoch=3, show_metric=True, snapshot_step=100)  # overfitting okay for now

# Sanity-check the fitted model on a single known recording.
demo_file = "5_Vicki_260.wav"
demo = speech_data.load_wav_file(speech_data.path + demo_file)
result = model.predict([demo])
result = numpy.argmax(result)
print("predicted digit for %s : result = %d " % (demo_file, result))
import numpy

# Simple spoken digit recognition demo, with 98% accuracy in under a minute.
# Sample run: Training Step: 544 | total loss: 0.15866
#   | Adam | epoch: 034 | loss: 0.15866 - acc: 0.9818 -- iter: 0000/1000

# One large batch of digit waveforms for training.
batch = speech_data.wave_batch_generator(10000, target=speech_data.Target.digits)
X, Y = next(batch)

number_classes = 10  # digits 0-9

# Classification: 8192 raw samples -> 64 hidden units -> softmax.
tflearn.init_graph(num_cores=8, gpu_memory_fraction=0.5)
net = tflearn.input_data(shape=[None, 8192])
net = tflearn.fully_connected(net, 64)
net = tflearn.dropout(net, 0.5)
net = tflearn.fully_connected(net, number_classes, activation='softmax')
net = tflearn.regression(net, optimizer='adam', loss='categorical_crossentropy')
model = tflearn.DNN(net)
# A few epochs are enough for this demo; overfitting is acceptable.
model.fit(X, Y, n_epoch=3, show_metric=True, snapshot_step=100)

# Smoke-test the model on a single known recording.
demo_file = "5_Vicki_260.wav"
demo = speech_data.load_wav_file(speech_data.path + demo_file)
result = model.predict([demo])
result = numpy.argmax(result)
print("predicted digit for %s : result = %d " % (demo_file, result))
# Sample run:
#   | Adam | epoch: 030 | loss: 0.05330 - acc: 0.9966 -- iter: 0000/1000
#   'predicted speaker for 9_Vicki_260 : result = ', 'Vicki'

speakers = data.get_speakers()
number_classes = len(speakers)
print("speakers", speakers)

WORD_WAVs = "spoken_words"
batch = data.wave_batch_generator(batch_size=1000, source=WORD_WAVs,
                                  target=data.Target.speaker)
X, Y = next(batch)

# Classification network: 8192-sample chunk -> 64 hidden units -> softmax
# over the known speakers.
tflearn.init_graph(num_cores=8, gpu_memory_fraction=0.5)
net = tflearn.input_data(shape=[None, 8192])  # two wave chunks
net = tflearn.fully_connected(net, 64)
net = tflearn.dropout(net, 0.5)
net = tflearn.fully_connected(net, number_classes, activation='softmax')
net = tflearn.regression(net, optimizer='adam', loss='categorical_crossentropy')
model = tflearn.DNN(net)
model.fit(X, Y, n_epoch=100, show_metric=True, snapshot_step=100)

# Predict the speaker of one held-out recording.
demo_file = "8_Bruce_260.wav"
demo = data.load_wav_file(data.path + demo_file)
result = model.predict([demo])
result = data.one_hot_to_item(result, speakers)
print("predicted speaker for %s : result = %s " % (demo_file, result))
# NOTE(review): original newlines/indentation were lost in this chunk. It is
# also cut mid-expression at the start (the leading "optimizer=..., loss=...)"
# arguments belong to a call that begins outside this view) and cut mid-loop
# at the end ("for it in out:" has no visible body), so the text is kept
# verbatim below rather than reconstructed.
# It appears to: finish building and train a DNN for 200 epochs, save it as
# model.tfl, reuse layer `fc2` as a feature extractor (m2) to dump one vector
# per WAV into data/baza_glosow/vect/, then cluster those vectors per file for
# verification — TODO confirm once the full file is visible.
optimizer='adam', loss='categorical_crossentropy') model = tflearn.DNN(regression) model.fit(X, Y, n_epoch=200, show_metric=True, snapshot_step=100) model.save("model.tfl") # model.load("model.tfl") m2 = tflearn.DNN(fc2, session=model.session) baza_path = "data/baza_glosow/" baza_path_vect = "data/baza_glosow/vect/" # Feature extraction phase for filename in os.listdir(baza_path): if filename.endswith(".wav"): demo = data.load_wav_file(baza_path + filename) vect = m2.predict([demo]) sq_vect = np.squeeze(vect) np.savetxt(baza_path_vect + filename + ".npz", sq_vect, delimiter=',') # Verification phase cnt = 0 sum = 0 for filename in os.listdir(baza_path): if filename.endswith(".wav"): out = cluster_vector.cluster(baza_path_vect, os.listdir(baza_path_vect), filename[:-4]) print(filename) print(out) correct_cnt = 0 for it in out:
# Sample run:
#   Training Step: 544 | total loss: 0.15866
#   | Adam | epoch: 034 | loss: 0.15866 - acc: 0.9818 -- iter: 0000/1000

if __name__ == '__main__':
    # Grab one large batch of digit waveforms for training.
    batch = wave_batch_generator(10000, target=Target.digits)
    X, Y = next(batch)
    number_classes = 10  # digits 0-9

    # Classification: raw 8192-sample chunks -> 64 hidden units -> softmax.
    tflearn.init_graph(num_cores=8, gpu_memory_fraction=0.5)
    net = tflearn.input_data(shape=[None, 8192])
    net = tflearn.fully_connected(net, 64)
    net = tflearn.dropout(net, 0.5)
    net = tflearn.fully_connected(net, number_classes, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', loss='categorical_crossentropy')
    model = tflearn.DNN(net)
    model.fit(X, Y, n_epoch=3, show_metric=True, snapshot_step=100)  # overfitting okay for now

    # Single-file smoke test of the trained model.
    demo_file = "5_Vicki_260.wav"
    demo = load_wav_file(path + demo_file)
    result = model.predict([demo])
    result = numpy.argmax(result)
    print("predicted digit for %s : result = %d " % (demo_file, result))
# NOTE(review): original newlines/indentation were lost in this chunk, and the
# exact nesting of the tail (where "out = ...", "sum += ...", and "cnt +=1"
# attach relative to the if/for blocks) cannot be recovered unambiguously, so
# the text is kept verbatim below rather than reconstructed.
# It appears to: build and train a softmax classifier for 10 epochs, save it
# as model.tfm, predict a label for every WAV under
# data/spoken_numbers_pcm/tt/, then score a clustering of precomputed vectors
# against speaker names derived from the file names. It also reads names
# (X, Y, number_classes, baza_path_vect, cluster_vector) defined outside this
# view, and `sum` shadows the builtin — TODO confirm against the full file.
tflearn.init_graph(num_cores=8, gpu_memory_fraction=0.5) net = tflearn.input_data(shape=[None, 8192]) net = tflearn.fully_connected(net, 64) net = tflearn.dropout(net, 0.5) net = tflearn.fully_connected(net, number_classes, activation='softmax') net = tflearn.regression(net, optimizer='adam', loss='categorical_crossentropy') model = tflearn.DNN(net) model.fit(X, Y,n_epoch=10,show_metric=True,snapshot_step=100) model.save("model.tfm") # model.load("model.tfm") baza_path = "data/spoken_numbers_pcm/tt/" cnt = 0 sum = 0 for filename in os.listdir(baza_path): if filename.endswith(".wav"): demo=speech_data.load_wav_file(baza_path + filename) result=model.predict([demo]) result=numpy.argmax(result) print("predicted speaker for %s : result = %d "%(filename, result)) out = cluster_vector.cluster(baza_path_vect, os.listdir(baza_path_vect), filename[:-4]) correct_cnt = 0 for it in out: if it==''.join(filter(str.islower, filename[:-4])): correct_cnt += 1 sum += (correct_cnt-1)/2 cnt +=1 print(sum)