import tflearn
import speech_data as data


def make_model(number_classes):
    batch = data.wave_batch_generator(batch_size=1000, target=data.Target.speaker)
    X, Y = next(batch)  # draw one batch (unused here; train() fetches its own data)
    # Classification
    tflearn.init_graph(num_cores=8, gpu_memory_fraction=0.5)
    net = tflearn.input_data(shape=[None, 3848])  # two wave chunks
    net = tflearn.fully_connected(net, 128)
    net = tflearn.dropout(net, 0.5)
    net = tflearn.fully_connected(net, 16)
    net = tflearn.dropout(net, 0.8)
    net = tflearn.fully_connected(net, 128)
    net = tflearn.dropout(net, 0.5)
    net = tflearn.fully_connected(net, number_classes, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', loss='categorical_crossentropy')
    model = tflearn.DNN(net)
    return model
import ml  # the module defining make_model() above
import speech_data as data


def train(number_classes):
    model = ml.make_model(number_classes)
    batch = data.wave_batch_generator(batch_size=1000, target=data.Target.speaker)
    X, Y = next(batch)
    model.fit(X, Y, n_epoch=100, show_metric=True, snapshot_step=100)
    model.save('classifier')
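# A minimal inference sketch for the checkpoint saved by train() above: it
# rebuilds the same graph via make_model() and reloads the weights. The
# predict() helper itself is hypothetical; data.get_speakers() is assumed to
# behave as in the other scripts in this collection.
import numpy


def predict(number_classes):
    model = ml.make_model(number_classes)
    model.load('classifier')  # weights written by train()
    batch = data.wave_batch_generator(batch_size=1, target=data.Target.speaker)
    X, Y = next(batch)
    result = model.predict(X)
    speakers = data.get_speakers()  # assumed helper, as used elsewhere in this repo
    print("predicted speaker:", speakers[int(numpy.argmax(result))])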
import time

import numpy
import tflearn
import speech_data


def main():
    batch = speech_data.wave_batch_generator(10000, target=speech_data.Target.digits)
    X, Y = next(batch)
    # Pad the 10-way digit labels with six zeros to match the 16-class model below.
    Y = [numpy.hstack([y, numpy.array([0, 0, 0, 0, 0, 0])]) for y in Y]
    print(type(Y))
    number_classes = 16  # 10 digits, padded to 16 classes

    # Classification
    tflearn.init_graph(num_cores=8, gpu_memory_fraction=0.5)
    net = tflearn.input_data(shape=[None, 8192])
    net = tflearn.fully_connected(net, 64, name='f1')
    net = tflearn.dropout(net, 0.5, name='dp')
    net = tflearn.fully_connected(net, number_classes, activation='softmax', name='f2')
    net = tflearn.regression(net, optimizer='sgd', loss='categorical_crossentropy')
    model = tflearn.DNN(net)
    model.load('pre-trained/model.tflearn.sgd_trained')  # Overfitting okay for now

    # Benchmark prediction accuracy and latency over repeated runs.
    totalTime = 0
    totalAcc = 0
    numTimes = 100
    for i in range(numTimes):
        t = time.time()
        result = model.predict(X)
        print("-------------")
        result = numpy.array([numpy.argmax(r) for r in result])
        answers = numpy.array([numpy.argmax(answer) for answer in Y])
        print(i, ">>>", (result == answers).sum() / float(len(answers)), "time: ", time.time() - t)
        totalAcc = totalAcc + (result == answers).sum() / float(len(answers))
        totalTime = totalTime + time.time() - t
    print("Avg. Acc. = ", totalAcc / numTimes)
    print("Avg. time = ", totalTime / numTimes)
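# Hypothetical entry point, assuming this benchmark script is run directly.
if __name__ == '__main__':
    main()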
# | Adam | epoch: 030 | loss: 0.05330 - acc: 0.9966 -- iter: 0000/1000
# 'predicted speaker for 9_Vicki_260 : result = ', 'Vicki'
import os
from distutils.version import LooseVersion

import tflearn
import tensorflow as tf
import speech_data as data

print("You are using tensorflow version " + tf.__version__)  # + " tflearn version " + tflearn.version
# Compare versions numerically; a plain string comparison misorders e.g. '0.9' and '0.12'.
if LooseVersion(tf.__version__) >= LooseVersion('0.12') and os.name == 'nt':
    print("sorry, tflearn is not ported to tensorflow 0.12 on windows yet!(?)")
    quit()  # why? works on Mac?

speakers = data.get_speakers()
number_classes = len(speakers)
print("speakers", speakers)

batch = data.wave_batch_generator(batch_size=1000, source=data.Source.DIGIT_WAVES, target=data.Target.speaker)
X, Y = next(batch)

# Classification
tflearn.init_graph(num_cores=8, gpu_memory_fraction=0.5)

net = tflearn.input_data(shape=[None, 8192])  # two wave chunks
net = tflearn.fully_connected(net, 64)
net = tflearn.dropout(net, 0.5)
net = tflearn.fully_connected(net, number_classes, activation='softmax')
net = tflearn.regression(net, optimizer='adam', loss='categorical_crossentropy')
model = tflearn.DNN(net)
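# The script above wires up the graph but stops before training. A one-line
# sketch of the training call, using the epoch count from the logged run in
# the header comments.
model.fit(X, Y, n_epoch=30, show_metric=True, snapshot_step=100)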
#!/usr/local/bin/python
import os

import numpy
import tensorflow as tf
import tflearn
import speech_data as data

speakers = data.get_speakers()
number_classes = len(speakers)
print("speakers", speakers)

batch = data.wave_batch_generator(batch_size=1000, target=data.Target.speaker)
X, Y = next(batch)

# Classification
tflearn.init_graph(num_cores=8, gpu_memory_fraction=0.5)

net = tflearn.input_data(shape=[None, 3848])  # two wave chunks
net = tflearn.fully_connected(net, 128)
net = tflearn.dropout(net, 0.5)
net = tflearn.fully_connected(net, 16)
net = tflearn.dropout(net, 0.8)
net = tflearn.fully_connected(net, 128)
net = tflearn.dropout(net, 0.5)
net = tflearn.fully_connected(net, number_classes, activation='softmax')
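# The file breaks off after the softmax layer. A sketch of the likely
# remainder, mirroring make_model()/train() earlier in this collection (the
# same 3848-wide architecture); the epoch count and checkpoint name are
# assumptions carried over from those scripts.
net = tflearn.regression(net, optimizer='adam', loss='categorical_crossentropy')
model = tflearn.DNN(net)
model.fit(X, Y, n_epoch=100, show_metric=True, snapshot_step=100)
model.save('classifier')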
#!/usr/bin/env python
import sys

import numpy
import tflearn
import speech_data
# import pyaudio

load = speech_data.wave_batch_generator(2404)
X, Y = next(load)
number_classes = 10  # digits

tflearn.init_graph(num_cores=8, gpu_memory_fraction=0.5)  # number of cores and fraction of GPU memory to allocate

net = tflearn.input_data(shape=[None, 8192])
net = tflearn.fully_connected(net, 64)
net = tflearn.dropout(net, 0.5)
net = tflearn.fully_connected(net, number_classes, activation='softmax')
net = tflearn.regression(net, optimizer='adam', learning_rate=0.0001, loss='categorical_crossentropy')
model = tflearn.DNN(net)
model.fit(X, Y, validation_set=0.2, n_epoch=200, show_metric=True,
          snapshot_step=100)  # snapshot_step assumed; the call is truncated in the source and this matches the other demos
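# Persisting the trained digit model so later runs can skip training; the
# checkpoint name is an arbitrary choice, not taken from the original script.
model.save('digit_classifier.tflearn')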
#!/usr/bin/env PYTHONIOENCODING="utf-8" python
import tflearn
import pyaudio
import speech_data
import numpy

# Simple spoken digit recognition demo, with 98% accuracy in under a minute
# Training Step: 544  | total loss: 0.15866
# | Adam | epoch: 034 | loss: 0.15866 - acc: 0.9818 -- iter: 0000/1000

batch = speech_data.wave_batch_generator(10000, target=speech_data.Target.digits)
X, Y = next(batch)
number_classes = 10  # digits

# Classification
tflearn.init_graph(num_cores=8, gpu_memory_fraction=0.5)

net = tflearn.input_data(shape=[None, 8192])
net = tflearn.fully_connected(net, 64)
net = tflearn.dropout(net, 0.5)
net = tflearn.fully_connected(net, number_classes, activation='softmax')
net = tflearn.regression(net, optimizer='adam', loss='categorical_crossentropy')
model = tflearn.DNN(net)
model.fit(X, Y, n_epoch=3, show_metric=True, snapshot_step=100)  # Overfitting okay for now

demo_file = "5_Vicki_260.wav"
demo = speech_data.load_wav_file(speech_data.path + demo_file)
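# The demo stops after loading the sample. A sketch of the prediction step
# that presumably follows: run the wav through the trained net and report the
# argmax class as the spoken digit.
result = model.predict([demo])
result = numpy.argmax(result)
print("predicted digit for %s : result = %d" % (demo_file, result))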
#!/usr/bin/env PYTHONIOENCODING="utf-8" python
import tflearn
import pyaudio
import speech_data as data

# Simple speaker recognition demo, with 99% accuracy in under a minute (on the digits sample)
# | Adam | epoch: 030 | loss: 0.05330 - acc: 0.9966 -- iter: 0000/1000
# 'predicted speaker for 9_Vicki_260 : result = ', 'Vicki'

speakers = data.get_speakers()
number_classes = len(speakers)
print("speakers", speakers)

WORD_WAVs = "spoken_words"
batch = data.wave_batch_generator(batch_size=1000, source=WORD_WAVs, target=data.Target.speaker)
X, Y = next(batch)

# Classification
tflearn.init_graph(num_cores=8, gpu_memory_fraction=0.5)

net = tflearn.input_data(shape=[None, 8192])  # two wave chunks
net = tflearn.fully_connected(net, 64)
net = tflearn.dropout(net, 0.5)
net = tflearn.fully_connected(net, number_classes, activation='softmax')
net = tflearn.regression(net, optimizer='adam', loss='categorical_crossentropy')
model = tflearn.DNN(net)
model.fit(X, Y, n_epoch=100, show_metric=True, snapshot_step=100)
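# A sketch of querying the trained speaker model, in the spirit of the
# '9_Vicki_260' line in the header comments; numpy and the load_wav_file/path
# helpers are assumptions borrowed from the digit demo in this collection.
import numpy

demo_file = "9_Vicki_260.wav"  # assumed sample name, per the header comment
demo = data.load_wav_file(data.path + demo_file)
result = model.predict([demo])
print("predicted speaker for %s : result = %s" % (demo_file, speakers[int(numpy.argmax(result))]))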
import tflearn
from speech_data import wave_batch_generator, Target, load_wav_file, path
import numpy

# Simple spoken digit recognition demo, with 98% accuracy in under a minute
# Training Step: 544  | total loss: 0.15866
# | Adam | epoch: 034 | loss: 0.15866 - acc: 0.9818 -- iter: 0000/1000

if __name__ == '__main__':
    batch = wave_batch_generator(10000, target=Target.digits)
    X, Y = next(batch)
    number_classes = 10  # digits

    # Classification
    tflearn.init_graph(num_cores=8, gpu_memory_fraction=0.5)

    net = tflearn.input_data(shape=[None, 8192])
    net = tflearn.fully_connected(net, 64)
    net = tflearn.dropout(net, 0.5)
    net = tflearn.fully_connected(net, number_classes, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', loss='categorical_crossentropy')
    model = tflearn.DNN(net)
    model.fit(X, Y, n_epoch=3, show_metric=True, snapshot_step=100)  # Overfitting okay for now

    demo_file = "5_Vicki_260.wav"
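    # load_wav_file and path are imported above but the excerpt breaks off
    # here; a sketch of the demo prediction they were presumably imported for.
    demo = load_wav_file(path + demo_file)
    result = numpy.argmax(model.predict([demo]))
    print("predicted digit for %s : result = %d" % (demo_file, result))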
#!/usr/bin/env PYTHONIOENCODING="utf-8" python
import tflearn
import pyaudio
import speech_data as data

# Simple speaker recognition demo, with 99% accuracy in under a minute (on the digits sample)
# | Adam | epoch: 030 | loss: 0.05330 - acc: 0.9966 -- iter: 0000/1000
# 'predicted speaker for 9_Vicki_260 : result = ', 'Vicki'

speakers = data.get_speakers()
number_classes = len(speakers)
print("speakers", speakers)

target = data.Target.speaker
batch = data.wave_batch_generator(batch_size=1000, target=target)
X, Y = next(batch)

# Classification
tflearn.init_graph(num_cores=8, gpu_memory_fraction=0.5)

net = tflearn.input_data(shape=[None, 8192])
net = tflearn.fully_connected(net, 64)
net = tflearn.dropout(net, 0.5)
net = tflearn.fully_connected(net, number_classes, activation='softmax')
net = tflearn.regression(net, optimizer='adam', loss='categorical_crossentropy')
model = tflearn.DNN(net)
model.fit(X, Y, n_epoch=100, show_metric=True, snapshot_step=100)
import speech_data
import numpy as np
import tflearn
from constants import POSITIVE_PATH, NEGATIVE_PATH

learning_rate = 0.03
training_iters = 10  # steps
batch_size = 2

width = 1
height = 8192  # (max) length of utterance
classes = 2    # positive / negative

positive_batch = speech_data.wave_batch_generator(batch_size, POSITIVE_PATH, [1.0, 0.0])
negative_batch = speech_data.wave_batch_generator(batch_size, NEGATIVE_PATH, [0.0, 1.0])
# positive_batch = np.reshape(positive_batch, (1, 10, 8192))
# negative_batch = np.reshape(negative_batch, (1, 10, 8192))

# Network building
'''net = tflearn.input_data(shape=[None, 8192])
net = tflearn.fully_connected(net, 64)
net = tflearn.dropout(net, 0.5)
net = tflearn.fully_connected(net, classes, activation='softmax')
net = tflearn.regression(net, optimizer='adam', loss='categorical_crossentropy')
'''
net = tflearn.input_data([None, width, height])
net = tflearn.lstm(net, 128 * 4, dropout=0.2)
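# The excerpt ends mid-network. A sketch of how the LSTM branch could be
# finished and trained, following the dense head in the commented-out block
# above; stacking the positive and negative batches into one labelled set is
# an assumption about the intended training loop.
net = tflearn.fully_connected(net, classes, activation='softmax')
net = tflearn.regression(net, optimizer='adam', learning_rate=learning_rate,
                         loss='categorical_crossentropy')
model = tflearn.DNN(net)

pos_X, pos_Y = next(positive_batch)
neg_X, neg_Y = next(negative_batch)
X = np.reshape(np.concatenate([pos_X, neg_X]), (-1, width, height))
Y = np.concatenate([pos_Y, neg_Y])
model.fit(X, Y, n_epoch=training_iters, show_metric=True)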
# Training Step: 544  | total loss: 0.15866
# | Adam | epoch: 034 | loss: 0.15866 - acc: 0.9818 -- iter: 0000/1000
# 98% accuracy on the training set in just a minute
import speech_data

# audio = pyaudio.PyAudio()
# # format = pyaudio.paFloat32
# format = pyaudio.paInt8
# # format = audio.get_format_from_width(f.getsampwidth())
# # out_stream = audio.open(format=format, channels=f.getnchannels(), rate=f.getframerate(), output=True)
# out_stream = audio.open(format=format, channels=1, rate=48000, output=True)
# out_stream.start_stream()
# def play_pcm(data):
#     out_stream.write(data)

batch = speech_data.wave_batch_generator(1000)
X, Y = next(batch)

# Classification
# x = tflearn.input_data(shape=[None, 8192])
# net = tflearn.fully_connected(x, 64)
# net = tflearn.dropout(net, 0.5)
# net = tflearn.fully_connected(net, 10, activation='softmax')
# net = tflearn.regression(net, optimizer='adam', loss='categorical_crossentropy')
# y = net.placeholder
# classifier = tflearn.DNN(net)


def model(net):
    # type: (layer.net) -> None
    # net.input_data(shape=[None, 10])
    net.fully_connected(64)
#!/usr/bin/env PYTHONIOENCODING="utf-8" python
import tflearn
import os
import speech_data as data

# training and testing data sets
train_data = '/home/cc/Data/small-clean-train/'
test_data = '/home/cc/Data/small-clean-test/'

# grab the speakers from the training directory
speakers = data.get_speakers(train_data)
number_classes = len(speakers)

# create the MFCC arrays from the data for training
# (source=train_data is assumed here; the original referenced an undefined WORD_WAVs)
batch = data.wave_batch_generator(batch_size=1000, source=train_data, target=data.Target.speaker, speakers=speakers)
X, Y = next(batch)

# Classification
tflearn.init_graph(num_cores=8, gpu_memory_fraction=0.5)

net = tflearn.input_data(shape=[None, 8192])  # two wave chunks
net = tflearn.fully_connected(net, 64)
# seems like a higher dropout rate works better -- why is this??
net = tflearn.dropout(net, 0.5)
net = tflearn.fully_connected(net, number_classes, activation='softmax')
net = tflearn.regression(net, optimizer='adam', loss='categorical_crossentropy')
model = tflearn.DNN(net)
model.fit(X, Y, n_epoch=100, show_metric=True, snapshot_step=100)
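# test_data is defined above but never used. A sketch of a held-out
# evaluation pass over that directory, assuming wave_batch_generator and the
# speaker labelling work the same way as for the training set.
import numpy

test_batch = data.wave_batch_generator(batch_size=1000, source=test_data, target=data.Target.speaker, speakers=speakers)
test_X, test_Y = next(test_batch)
predictions = numpy.argmax(numpy.array(model.predict(test_X)), axis=1)
answers = numpy.argmax(numpy.array(test_Y), axis=1)
print("held-out accuracy:", (predictions == answers).mean())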