import os
import speech_data
from pydub import AudioSegment as audio


def segment(data, seg_location, length):
    """Cut every wav file in `data` into `length`-second clips in `seg_location`."""
    os.chdir(data)
    files = os.listdir(data)
    speakers = speech_data.get_speakers(data)
    # per-speaker counter used to number the output clips
    num = {}
    for s in speakers:
        num[s] = 0
    # grab all wave files in a list
    waves = []
    for f in files:
        waves.append(audio.from_wav(f))
    os.chdir(seg_location)
    # segment the data into `length`-second intervals
    for f, w in zip(files, waves):
        begin = 0
        end = 1
        while (end * length) < int(w.duration_seconds):
            # pydub slices by milliseconds
            clip = w[begin * 1000 * length:end * 1000 * length]
            clip.export(
                speech_data.speaker(f) + '_' +
                str(num[speech_data.speaker(f)]) + '.wav', 'wav')
            # advance one window at a time (the slice already scales by `length`;
            # the original stepped by `length` here, which skips audio for length != 1)
            begin = begin + 1
            end = end + 1
            num[speech_data.speaker(f)] = num[speech_data.speaker(f)] + 1
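# A minimal usage sketch for segment(), assuming speech_data.speaker() parses
# the speaker name out of a filename. SRC is a hypothetical path; DST matches
# the segmented-data directory used by the training scripts below:
SRC = '/home/cc/working/data/devclean_2/'
DST = '/home/cc/working/data/devclean_2_seg/'
os.makedirs(DST, exist_ok=True)
segment(SRC, DST, 1)  # cut every wav in SRC into one-second clips in DST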
def main():
    speakers = data.get_speakers()
    number_classes = len(speakers)
    print("speakers", speakers)
    model = make_model(number_classes)
    model.load('classifier')
    stream = audio.Stream()
    while True:
        raw_input('press enter to record!!!')
        buff = stream.record(1.5)
        sample = audio.stream_to_ints(buff)
        test(model, speakers, sample)
def main():
    speakers = data.get_speakers()
    number_classes = len(speakers)
    print("speakers", speakers)
    # train(number_classes)
    # return
    model = ml.make_model(number_classes)
    model.load('classifier')
    stream = audio.Stream()
    while True:
        input('press enter to record!!!')
        buff = stream.record(1.5)
        sample = audio.stream_to_ints(buff)
        label, conf = ml.predict(model, speakers, sample)
        print("predicted : result = %s confidence = %.2f" % (label, conf))
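# ml.predict above is expected to return the top label and its softmax
# confidence. A minimal sketch of such a helper, assuming model.predict
# returns one softmax row per input sample -- the body is an assumption,
# not the actual ml module:
import numpy as np

def predict(model, speakers, sample):
    # run the model on one sample and return (label, confidence)
    probs = np.asarray(model.predict([sample]))[0]
    i = int(np.argmax(probs))
    return speakers[i], float(probs[i])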
def handle_speaker_rec_test_intent(self, message):
    speakers = data.get_speakers()
    number_classes = len(speakers)
    # print("speakers", speakers)
    # batch = data.wave_batch_generator(batch_size=1000, source=data.Source.DIGIT_WAVES, target=data.Target.speaker)
    # X, Y = next(batch)

    # Classification
    # tflearn.init_graph(num_cores=8, gpu_memory_fraction=0.5)
    net = tflearn.input_data(shape=[None, 8192])  # two wave chunks
    net = tflearn.fully_connected(net, 64)
    net = tflearn.dropout(net, 0.5)
    net = tflearn.fully_connected(net, number_classes, activation='softmax')
    net = tflearn.regression(net, optimizer='adam',
                             loss='categorical_crossentropy')
    model = tflearn.DNN(net)
    # model.fit(X, Y, n_epoch=100, show_metric=True, snapshot_step=100)

    CWD_PATH = os.path.dirname(__file__)
    path_to_model = os.path.join(CWD_PATH, 'model', 'model.tfl')
    model.load(path_to_model)

    demo_file = "8_Vicki_260.wav"
    # demo_file = "8_Bruce_260.wav"
    demo = data.load_wav_file(data.path + demo_file)
    result = model.predict([demo])
    result = data.one_hot_to_item(result, speakers)
    if result == "Vicki":
        self.speak("I am confident I'm speaking to %s" % (result))  # ~97% correct
    else:
        self.speak("I'm sorry I don't recognize your voice")
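# one_hot_from_item / one_hot_to_item from speech_data are used throughout
# these scripts. A minimal sketch of what such helpers typically do -- this is
# an assumption for illustration, not the module's actual source:
import numpy as np

def one_hot_from_item(item, items):
    # encode `item` as a one-hot vector over the ordered list `items`
    vec = np.zeros(len(items))
    vec[items.index(item)] = 1.0
    return vec

def one_hot_to_item(hot, items):
    # decode a one-hot (or softmax) vector back to the most likely item
    return items[int(np.argmax(hot))]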
#!/usr/bin/env PYTHONIOENCODING="utf-8" python
import tflearn
import pyaudio
import speech_data as data

# Simple speaker recognition demo, with 99% accuracy in under a minute (on the digits sample)
# | Adam | epoch: 030 | loss: 0.05330 - acc: 0.9966 -- iter: 0000/1000
# 'predicted speaker for 9_Vicki_260 : result = ', 'Vicki'

speakers = data.get_speakers()
number_classes = len(speakers)
# print("speakers", speakers)

WORD_WAVs = "spoken_words"
batch = data.wave_batch_generator(batch_size=1000, source=WORD_WAVs,
                                  target=data.Target.speaker)
X, Y = next(batch)

# Classification
tflearn.init_graph(num_cores=8, gpu_memory_fraction=0.5)
net = tflearn.input_data(shape=[None, 8192])  # two wave chunks
net = tflearn.fully_connected(net, 64)
net = tflearn.dropout(net, 0.5)
net = tflearn.fully_connected(net, number_classes, activation='softmax')
net = tflearn.regression(net, optimizer='adam', loss='categorical_crossentropy')
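# NOTE: the flat [None, 8192] input is raw waveform data, not spectral
# features -- per the "two wave chunks" comment, each training vector appears
# to be two 4096-sample chunks flattened by wave_batch_generator. This reading
# is an inference from the comment, not confirmed by the generator's source.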
import os
import sys
import librosa
import tflearn
import wave
import pickle
import tensorflow as tf
import librosa.display
import IPython.display
import numpy as np
import speech_data
from pydub import AudioSegment as audio

# now put all of the MFCCs into an array
data = '/home/cc/working/data/devclean_2_seg/'
speakers = speech_data.get_speakers(data)
audio_files = os.listdir(data)
mfccs = []
Y = []
for f in audio_files:
    Y.append(speech_data.one_hot_from_item(speech_data.speaker(f), speakers))
    y, sr = librosa.load(data + f)
    mfccs.append(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13))

net = tflearn.input_data(shape=[None, 13, 44])
net = tflearn.fully_connected(net, 64)
net = tflearn.dropout(net, 0.5)
net = tflearn.fully_connected(net, 32)
net = tflearn.fully_connected(net, len(speakers), activation='softmax')
net = tflearn.regression(net, optimizer='adam', loss='categorical_crossentropy')
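# NOTE: the [None, 13, 44] input shape assumes each one-second clip yields a
# 13 x 44 MFCC matrix at librosa's defaults (sr=22050, hop_length=512, so
# 1 + 22050 // 512 = 44 frames). A quick sanity check on a synthetic signal:
import numpy as np
import librosa

_y = np.random.randn(22050).astype(np.float32)  # one second of noise
assert librosa.feature.mfcc(y=_y, sr=22050, n_mfcc=13).shape == (13, 44)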
import os
import librosa
import tflearn
import speech_data
from pydub import AudioSegment as audio

speakers = speech_data.get_speakers('/home/cc/working/data/devclean_seg/')

net = tflearn.input_data(shape=[None, 13, 44])
net = tflearn.fully_connected(net, 64)
net = tflearn.dropout(net, 0.5)
net = tflearn.fully_connected(net, len(speakers), activation='softmax')
net = tflearn.regression(net, optimizer='adam', loss='categorical_crossentropy')

model = tflearn.DNN(net, tensorboard_verbose=3)
model.load('/home/cc/working/models/devclean/devclean_train.tflearn')

os.chdir('/home/cc/working/data/devclean_test/')
test = []
for f1 in os.listdir(os.getcwd()):
    y, sr = librosa.load(f1)
    test.append(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13))

result = model.predict(test)

# count a prediction as correct when the predicted speaker's
# name appears in the test file's name
c = 0
for f, r in zip(os.listdir(os.getcwd()), result):
    res = speech_data.one_hot_to_item(r, speakers)
    if res in f:
        c = c + 1
print('correct: %s ; total: %s' % (str(c), str(len(test))))
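# NOTE: model.predict above assumes every test clip produced exactly 44 MFCC
# frames; clips that are not exactly one second long will not. A hedged helper
# (not part of the original script) to pad or truncate along the time axis:
import numpy as np

def fit_frames(mfcc, n_frames=44):
    # zero-pad or truncate an (n_mfcc, t) matrix to (n_mfcc, n_frames)
    if mfcc.shape[1] < n_frames:
        return np.pad(mfcc, ((0, 0), (0, n_frames - mfcc.shape[1])),
                      mode='constant')
    return mfcc[:, :n_frames]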
import os
import sys
import librosa
import tflearn
import wave
import tensorflow as tf
import librosa.display
import IPython.display
import numpy as np
import speech_data
from pydub import AudioSegment as audio

# now put all of the MFCCs into an array
data = '/home/cc/working/data/devclean_seg/'
os.chdir(data)
speakers = speech_data.get_speakers(os.getcwd())
audio_files = os.listdir(os.getcwd())
mfccs = []
Y = []
for f in audio_files:
    Y.append(speech_data.one_hot_from_item(speech_data.speaker(f), speakers))
    y, sr = librosa.load(f)
    mfccs.append(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13))

net = tflearn.input_data(shape=[None, 13, 44])
net = tflearn.fully_connected(net, 64)
net = tflearn.dropout(net, 0.6)
net = tflearn.fully_connected(net, len(speakers), activation='softmax')
net = tflearn.regression(net, optimizer='adam', loss='categorical_crossentropy')
model = tflearn.DNN(net)
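# The script stops after building the model; a training call consistent with
# the other snippets would plausibly follow. The epoch count is an assumption;
# the save path matches the model.load() in the testing script above.
model.fit(mfccs, Y, n_epoch=100, show_metric=True)
model.save('/home/cc/working/models/devclean/devclean_train.tflearn')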
import os
import sys
import pickle
import tflearn
import librosa
import speech_data
import numpy as np

# load constants - training directory, testing directory
training_seg = '/home/cc/Data/Dev-Clean-Train-Two/'
testing = '/home/cc/Data/Dev-Clean-Test-Two'

# size of fully connected layers
n = sys.argv[1]
# l = sys.argv[2]
m = 18
d = 0.8

# calculate the mfcc matrices for training from the segmented data
# X = []
# Y = []
speakers = speech_data.get_speakers(training_seg)
# for f in os.listdir(training_seg):
#     Y.append(speech_data.one_hot_from_item(speech_data.speaker(f), speakers))
#     y, sr = librosa.load(training_seg + f)
#     X.append(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=int(m)))
# pickle.dump(X, open('/home/cc/Data/pickle_files/mfcc_len/train' + str(m) + '_X.p', 'wb'))
# pickle.dump(Y, open('/home/cc/Data/pickle_files/mfcc_len/train' + str(m) + '_Y.p', 'wb'))
X = pickle.load(
    open('/home/cc/Data/pickle_files/mfcc_len/train' + str(m) + '_X.p', 'rb'))
Y = pickle.load(
    open('/home/cc/Data/pickle_files/mfcc_len/train' + str(m) + '_Y.p', 'rb'))

# define the network and the model for training
tflearn.init_graph(num_cores=8, gpu_memory_fraction=0.5)
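# The snippet ends right after init_graph; a network definition consistent
# with the hyperparameters above (n hidden units, m MFCC coefficients, dropout
# keep-probability d) would plausibly continue as below. The 44-frame width is
# carried over from the one-second scripts and is an assumption here --
# two-second segments would give 87 frames instead.
net = tflearn.input_data(shape=[None, int(m), 44])
net = tflearn.fully_connected(net, int(n))
net = tflearn.dropout(net, d)
net = tflearn.fully_connected(net, len(speakers), activation='softmax')
net = tflearn.regression(net, optimizer='adam', loss='categorical_crossentropy')
model = tflearn.DNN(net)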
import os
import sys
import wave
import pickle
import librosa
import librosa.display
import tflearn
import speech_data
import segment_data
import tensorflow as tf
import numpy as np

# load constants - training directory, testing directory
training = '/home/cc/Data/train/'
testing = '/home/cc/Data/test/'

# calculate the mfcc matrices for training from the segmented data
X = []
Y = []
speakers = speech_data.get_speakers(training)
for f in os.listdir(training):
    Y.append(speech_data.one_hot_from_item(speech_data.speaker(f), speakers))
    y, sr = librosa.load(training + f)
    mfcc = np.asarray(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20))
    X.append(mfcc)

# input size for fully connected layers
layer_size = int(sys.argv[1])
dropout = float(sys.argv[2])

# define the network and the model for training
tflearn.init_graph(num_cores=8, gpu_memory_fraction=0.5)

# for just mfcc
net = tflearn.input_data(shape=[None, 20, 87])
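# NOTE: the 87-frame width follows from two-second segments at librosa's
# defaults, analogous to the 44-frame one-second case above:
assert 1 + (2 * 22050) // 512 == 87  # frames per 2 s clip at hop_length=512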
#!/usr/bin/env PYTHONIOENCODING="utf-8" python
import random
random.seed(5)
import tflearn
import os
import sys
import librosa
import speech_data as data
import numpy as np
import pickle

test_data = '/home/edresson/Pti-embbending/Encoder-MFCC/Automatizado/Bases/Segments-5s/Validacao/Base1/X/'
train_data = '/home/edresson/Pti-embbending/Encoder-MFCC/Automatizado/Bases/Segments-5s/Treino/Base1/X-2/'
working = ''

# grab the speakers from the training directory
speakers = data.get_speakers(train_data)
number_classes = len(speakers)
# print(number_classes, speakers)

# create the MFCC arrays from the data for training
audio_files = os.listdir(working + train_data)
X = []
Y = []
try:
    with open('rna-treino_X-5s.txt', 'rb') as f:
        X = pickle.load(f)
    with open('rna-treino_Y-5s.txt', 'rb') as f:
        Y = pickle.load(f)
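# NOTE: the original file is truncated here, leaving the try block open; this
# fallback is an assumption that mirrors the feature-extraction loops in the
# other scripts: if the cached pickles are missing, rebuild features and
# labels, then cache them. The n_mfcc value is also assumed.
except (IOError, OSError, pickle.UnpicklingError):
    for f in audio_files:
        Y.append(data.one_hot_from_item(data.speaker(f), speakers))
        y, sr = librosa.load(working + train_data + f)
        X.append(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13))
    with open('rna-treino_X-5s.txt', 'wb') as f:
        pickle.dump(X, f)
    with open('rna-treino_Y-5s.txt', 'wb') as f:
        pickle.dump(Y, f)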
#!/usr/bin/env PYTHONIOENCODING="utf-8" python
import tflearn
import pyaudio
import speech_data as data

# Simple speaker recognition demo, with 99% accuracy in under a minute (on the digits sample)
# | Adam | epoch: 030 | loss: 0.05330 - acc: 0.9966 -- iter: 0000/1000
# 'predicted speaker for 9_Vicki_260 : result = ', 'Vicki'

speakers = data.get_speakers()
number_classes = len(speakers)
print("speakers", speakers)

WORD_WAVs = "spoken_words"
batch = data.wave_batch_generator(batch_size=1000, source=WORD_WAVs,
                                  target=data.Target.speaker)
X, Y = next(batch)

# Classification
tflearn.init_graph(num_cores=8, gpu_memory_fraction=0.5)
net = tflearn.input_data(shape=[None, 8192])  # two wave chunks
net = tflearn.fully_connected(net, 64)
net = tflearn.dropout(net, 0.5)
net = tflearn.fully_connected(net, number_classes, activation='softmax')
net = tflearn.regression(net, optimizer='adam', loss='categorical_crossentropy')
model = tflearn.DNN(net)
model.fit(X, Y, n_epoch=100, show_metric=True, snapshot_step=100)
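# After training, the model would be saved so the intent handler above can
# reload it; a plausible continuation -- the filename matches the handler's
# model.load path, and the demo check mirrors the header comment, but both
# lines are assumptions, not part of the original script:
model.save('model.tfl')
demo = data.load_wav_file(data.path + "9_Vicki_260.wav")
print("predicted speaker for 9_Vicki_260 : result =",
      data.one_hot_to_item(model.predict([demo]), speakers))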
#!/usr/bin/env PYTHONIOENCODING="utf-8" python
import tflearn
import os
import speech_data as data

# training and testing data sets
train_data = '/home/cc/Data/small-clean-train/'
test_data = '/home/cc/Data/small-clean-test/'

# grab the speakers from the training directory
speakers = data.get_speakers(train_data)
number_classes = len(speakers)

# create the MFCC arrays from the data for training
# (source was originally the undefined WORD_WAVs; train_data is the
# natural referent here)
batch = data.wave_batch_generator(batch_size=1000, source=train_data,
                                  target=data.Target.speaker,
                                  speakers=speakers)
X, Y = next(batch)

# Classification
tflearn.init_graph(num_cores=8, gpu_memory_fraction=0.5)
net = tflearn.input_data(shape=[None, 8192])  # two wave chunks
net = tflearn.fully_connected(net, 64)
# seems like a higher dropout rate works better -- why is this??
net = tflearn.dropout(net, 0.5)
net = tflearn.fully_connected(net, number_classes, activation='softmax')
net = tflearn.regression(net, optimizer='adam', loss='categorical_crossentropy')
model = tflearn.DNN(net)
model.fit(X, Y, n_epoch=100, show_metric=True, snapshot_step=100)
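# test_data is defined above but never used; a hedged evaluation pass, reusing
# the training call's generator arguments -- an assumption, since the
# generator's exact behavior on a test directory is not shown in this repo:
test_batch = data.wave_batch_generator(batch_size=1000, source=test_data,
                                       target=data.Target.speaker,
                                       speakers=speakers)
X_test, Y_test = next(test_batch)
print("test accuracy:", model.evaluate(X_test, Y_test))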