def start_training(working_dir, pre_training_phase=True):
    ensures_dir(CHECKPOINTS_SOFTMAX_DIR)
    ensures_dir(CHECKPOINTS_TRIPLET_DIR)
    batch_input_shape = [None, NUM_FRAMES, NUM_FBANKS, 1]
    if pre_training_phase:
        logger.info('Softmax pre-training.')
        kc = KerasFormatConverter(working_dir)
        num_speakers_softmax = len(kc.categorical_speakers.speaker_ids)
        dsm = DeepSpeakerModel(batch_input_shape, include_softmax=True,
                               num_speakers_softmax=num_speakers_softmax)
        dsm.m.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
        pre_training_checkpoint = load_best_checkpoint(CHECKPOINTS_SOFTMAX_DIR)
        if pre_training_checkpoint is not None:
            initial_epoch = int(pre_training_checkpoint.split('/')[-1].split('.')[0].split('_')[-1])
            logger.info(f'Initial epoch is {initial_epoch}.')
            logger.info(f'Loading softmax checkpoint: {pre_training_checkpoint}.')
            dsm.m.load_weights(pre_training_checkpoint)  # latest one.
        else:
            initial_epoch = 0
        fit_model_softmax(dsm, kc.kx_train, kc.ky_train, kc.kx_test, kc.ky_test, initial_epoch=initial_epoch)
    else:
        logger.info('Training with the triplet loss.')
        dsm = DeepSpeakerModel(batch_input_shape, include_softmax=False)
        triplet_checkpoint = load_best_checkpoint(CHECKPOINTS_TRIPLET_DIR)
        pre_training_checkpoint = load_best_checkpoint(CHECKPOINTS_SOFTMAX_DIR)
        if triplet_checkpoint is not None:
            logger.info(f'Loading triplet checkpoint: {triplet_checkpoint}.')
            dsm.m.load_weights(triplet_checkpoint)
        elif pre_training_checkpoint is not None:
            logger.info(f'Loading pre-training checkpoint: {pre_training_checkpoint}.')
            # If `by_name` is True, weights are loaded into layers only if they
            # share the same name. This is useful for fine-tuning or
            # transfer-learning models where some of the layers have changed.
            dsm.m.load_weights(pre_training_checkpoint, by_name=True)
        dsm.m.compile(optimizer=SGD(), loss=deep_speaker_loss)
        fit_model(dsm, working_dir, NUM_FRAMES)
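# A minimal usage sketch (added for illustration; the helper and the path are
# placeholders, not part of the original script): run the softmax warm-up
# first, then the triplet phase against the same working directory, so the
# checkpoint logic above can transfer the pre-trained weights by layer name.
def run_both_phases(working_dir='/path/to/working_dir'):
    start_training(working_dir, pre_training_phase=True)   # softmax warm-up
    start_training(working_dir, pre_training_phase=False)  # triplet fine-tuning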
def start_training(working_dir):
    ensures_dir(CHECKPOINTS_MTL_DIR)
    batch_input_shape = [None, NUM_FRAMES, NUM_FBANKS, 1]
    logger.info('Started training.')
    kc = KerasFormatConverter(working_dir)
    num_speakers_softmax = len(kc.categorical_speakers.speaker_ids)
    logger.info(f'categorical_speakers: {kc.categorical_speakers.speaker_ids}')
    dsm = DeepSpeakerModel(batch_input_shape, include_softmax=False,
                           num_speakers_softmax=num_speakers_softmax)
    # Multi-task heads: a shared dense layer feeds a speaker-classification
    # branch and a binary gender branch.
    base_model = dsm.m
    x = base_model.output
    x = Dense(1024, name='shared')(x)
    y = Dense(1024, name='speaker_task')(x)
    speaker_out = Dense(num_speakers_softmax, activation='softmax', name='speaker_pred')(y)
    gender_out = Dense(1, activation='sigmoid', name='gender_pred')(x)
    model = Model(inputs=base_model.input, outputs=[speaker_out, gender_out])
    model.compile(optimizer='adam',
                  loss=['sparse_categorical_crossentropy', 'binary_crossentropy'],
                  metrics={'speaker_pred': 'accuracy', 'gender_pred': 'binary_accuracy'})
    training_checkpoint = load_best_checkpoint(CHECKPOINTS_MTL_DIR)
    if training_checkpoint is not None:
        initial_epoch = int(training_checkpoint.split('/')[-1].split('.')[0].split('_')[-1])
        logger.info(f'Initial epoch is {initial_epoch}.')
        logger.info(f'Loading MTL checkpoint: {training_checkpoint}.')
        model.load_weights(training_checkpoint)  # latest one.
    else:
        initial_epoch = 0
    fit_model_mtl(model, kc.kx_train, kc.ky_train, kc.kg_train,
                  kc.kx_test, kc.ky_test, kc.kg_test, initial_epoch=initial_epoch)
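# For reference, a hedged sketch of what fit_model_mtl presumably does with a
# two-output Keras model like the one above. The real implementation lives
# elsewhere in this repo; the epoch count, batch size, and the absence of
# callbacks here are illustrative assumptions, not the actual training loop.
def fit_model_mtl_sketch(model, kx_train, ky_train, kg_train,
                         kx_test, ky_test, kg_test, initial_epoch=0):
    # Targets are keyed by output-layer name, matching 'speaker_pred' and
    # 'gender_pred' defined in the model above.
    model.fit(kx_train,
              {'speaker_pred': ky_train, 'gender_pred': kg_train},
              validation_data=(kx_test, {'speaker_pred': ky_test, 'gender_pred': kg_test}),
              initial_epoch=initial_epoch,
              epochs=100,      # illustrative
              batch_size=32)   # illustrative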
def test(working_dir, checkpoint_file=None):
    batch_input_shape = [None, NUM_FRAMES, NUM_FBANKS, 1]
    dsm = DeepSpeakerModel(batch_input_shape)
    if checkpoint_file is None:
        checkpoint_file = load_best_checkpoint(CHECKPOINTS_TRIPLET_DIR)
    if checkpoint_file is not None:
        logger.info(f'Found checkpoint [{checkpoint_file}]. Loading weights...')
        dsm.m.load_weights(checkpoint_file, by_name=True)
    else:
        # `checkpoint_file` is None here, so report the directory we searched.
        logger.info(f'Could not find any checkpoint in {CHECKPOINTS_TRIPLET_DIR}.')
        exit(1)
    fm, tpr, acc, eer = eval_model(working_dir, model=dsm)
    logger.info(f'f-measure = {fm:.3f}, true positive rate = {tpr:.3f}, '
                f'accuracy = {acc:.3f}, equal error rate = {eer:.3f}')
def main():
    # Any extra CLI argument enables hard-negative batch selection.
    select = len(sys.argv) > 1
    print('select', select)
    working_dir = '/media/philippe/8TB/deep-speaker'
    # By construction, these losses should be much higher than the normal ones.
    # This is how we select the batches.
    batch_input_shape = [None, NUM_FRAMES, NUM_FBANKS, 1]
    print('Testing with the triplet losses.')
    dsm = DeepSpeakerModel(batch_input_shape, include_softmax=False)
    triplet_checkpoint = load_best_checkpoint(CHECKPOINTS_TRIPLET_DIR)
    pre_training_checkpoint = load_best_checkpoint(CHECKPOINTS_SOFTMAX_DIR)
    if triplet_checkpoint is not None:
        print(f'Loading triplet checkpoint: {triplet_checkpoint}.')
        dsm.m.load_weights(triplet_checkpoint)
    elif pre_training_checkpoint is not None:
        print(f'Loading pre-training checkpoint: {pre_training_checkpoint}.')
        # If `by_name` is True, weights are loaded into layers only if they
        # share the same name. This is useful for fine-tuning or
        # transfer-learning models where some of the layers have changed.
        dsm.m.load_weights(pre_training_checkpoint, by_name=True)
    dsm.m.compile(optimizer='adam', loss=deep_speaker_loss)
    kc = KerasFormatConverter(working_dir)
    if select:
        print('TripletBatcherSelectHardNegatives()')
        batcher = TripletBatcherSelectHardNegatives(kc.kx_train, kc.ky_train, kc.kx_test, kc.ky_test, dsm)
    else:
        print('TripletBatcher()')
        batcher = TripletBatcher(kc.kx_train, kc.ky_train, kc.kx_test, kc.ky_test)
    batch_size = BATCH_SIZE
    losses = []
    while True:
        _bx, _by = batcher.get_batch(batch_size, is_test=False)
        losses.append(dsm.m.evaluate(_bx, _by, verbose=0, batch_size=BATCH_SIZE))
        print(np.mean(losses))
def main():
    model = DeepSpeakerModel()
    model.m.load_weights('/home/nguyendat/Documents/projects/PetProject/VoiceVerification/deep-speaker/checkpoints-triplets/ResCNN_triplet_training_checkpoint_265.h5', by_name=True)
    # mfcc_001 = sample_from_mfcc(read_mfcc('/home/nguyendat/Documents/projects/PetProject/VoiceVerification/deep-speaker/samples/train/5-F-27/5.wav', SAMPLE_RATE), NUM_FRAMES)
    # mfcc_002 = sample_from_mfcc(read_mfcc('/home/nguyendat/Documents/projects/PetProject/VoiceVerification/deep-speaker/samples/train/5-F-27/5-2.wav', SAMPLE_RATE), NUM_FRAMES)
    # predict_001 = model.m.predict(np.expand_dims(mfcc_001, axis=0))
    # predict_002 = model.m.predict(np.expand_dims(mfcc_002, axis=0))
    # mfcc_003 = sample_from_mfcc(read_mfcc('/home/nguyendat/Documents/projects/PetProject/VoiceVerification/deep-speaker/samples/train/6-M-45/6.wav', SAMPLE_RATE), NUM_FRAMES)
    # predict_003 = model.m.predict(np.expand_dims(mfcc_003, axis=0))
    # print('SAME SPEAKER', batch_cosine_similarity(predict_001, predict_002))
    # print('DIFF SPEAKER', batch_cosine_similarity(predict_001, predict_003))
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.svm import SVC

    # Load two random files plus a same/different label, embed them, and feed
    # the cosine similarity into an SVM; this relies on the triplet model.
    features = []
    labels = []
    for _ in range(10):
        mfcc1, mfcc2, label = load_data()
        feature1 = model.m.predict(np.expand_dims(mfcc1, axis=0))
        feature2 = model.m.predict(np.expand_dims(mfcc2, axis=0))
        cost = batch_cosine_similarity(feature1, feature2)
        features.append(cost[0])
        labels.append(label)
    # Reshape to 2-D: scikit-learn estimators expect (n_samples, n_features).
    features = np.array(features).reshape(-1, 1)
    labels = np.array(labels)
    clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
    clf.fit(features, labels)
    with open('svm.pkl', 'wb') as svm_pickle:
        pickle.dump(clf, svm_pickle)
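# Companion sketch (not in the original file): reload the pickled SVM and
# score a new pair of recordings. `wav_a` and `wav_b` are placeholder paths;
# the feature must stay a single cosine-similarity value reshaped to 2-D,
# matching what the classifier was fit on above.
def verify_pair(model, wav_a, wav_b, svm_path='svm.pkl'):
    with open(svm_path, 'rb') as f:
        clf = pickle.load(f)
    mfcc_a = sample_from_mfcc(read_mfcc(wav_a, SAMPLE_RATE), NUM_FRAMES)
    mfcc_b = sample_from_mfcc(read_mfcc(wav_b, SAMPLE_RATE), NUM_FRAMES)
    emb_a = model.m.predict(np.expand_dims(mfcc_a, axis=0))
    emb_b = model.m.predict(np.expand_dims(mfcc_b, axis=0))
    score = batch_cosine_similarity(emb_a, emb_b)  # shape (1,)
    # Predict the same/different-speaker label from the single similarity feature.
    return clf.predict(np.array(score).reshape(1, -1))[0]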
def main2():
    batch_input_shape = [None, NUM_FRAMES, NUM_FBANKS, 1]
    dsm = DeepSpeakerModel(batch_input_shape, include_softmax=False)
    dsm.m.compile(optimizer='adam', loss=deep_speaker_loss)
    dsm.m.load_weights('/Users/premy/deep-speaker/ResCNN_checkpoint_102.h5', by_name=True)
    dsm.m.summary()
    batcher = LazyTripletBatcher(working_dir='/Users/premy/deep-speaker', max_length=NUM_FRAMES, model=dsm)
    bs = 18
    # Mean triplet loss over 100 batches, per batch source.
    print(np.mean([dsm.m.evaluate(*batcher.get_batch_train(batch_size=bs), batch_size=bs, verbose=0)
                   for _ in range(100)]))
    print(np.mean([dsm.m.evaluate(*batcher.get_batch_test(batch_size=bs), batch_size=bs, verbose=0)
                   for _ in range(100)]))
    print(np.mean([dsm.m.evaluate(*batcher.get_random_batch(batch_size=bs, is_test=False), batch_size=bs, verbose=0)
                   for _ in range(100)]))
    print(np.mean([dsm.m.evaluate(*batcher.get_random_batch(batch_size=bs, is_test=True), batch_size=bs, verbose=0)
                   for _ in range(100)]))
def play():
    text = None
    out_file = r"D:/Projects/Internship/samtest/file_out.wav"
    rootdir = os.path.join(os.getcwd(), 'samples')
    attendance_file_path = os.path.join(os.getcwd(), 'Attendance_data', 'out.csv')

    def print_data(info):
        # Open the attendance CSV in text mode; csv.reader cannot parse a
        # binary handle.
        with open(attendance_file_path, 'r', newline='') as handle:
            unserialized_data = csv.reader(handle)
            print(info, list(unserialized_data))

    # Create the attendance file if it doesn't exist yet.
    if not os.path.exists(attendance_file_path) and not os.path.isfile(attendance_file_path):
        if not os.path.exists('Attendance_data'):
            os.makedirs('Attendance_data')
        d = {'Date': [], 'EmpName': [], 'EmpID': [], 'In': [], 'Out': [],
             'Duration': [], 'Attendance': []}
        df = pd.DataFrame(data=d)
        print('\nCreating New Attendance DataFrame : ')
        print(df)
        df.to_csv(attendance_file_path, index=False)
        # print_data('Data is created : \n')

    names = []
    for subdir, dirs, files in os.walk(rootdir):
        for dir_name in dirs:
            names.append(dir_name)

    class bcolors:
        HEADER = '\033[95m'
        OKBLUE = '\033[94m'
        OKCYAN = '\033[96m'
        OKGREEN = '\033[92m'
        WARNING = '\033[93m'
        FAIL = '\033[91m'
        ENDC = '\033[0m'
        BOLD = '\033[1m'
        UNDERLINE = '\033[4m'

    def speak(text):
        # Renamed from `pyttsx3` so the helper no longer shadows the pyttsx3
        # module; `engine` is the module-level pyttsx3 engine initialised
        # elsewhere in the original source.
        voices = engine.getProperty('voices')
        # Voice id 1 is female, 0 is male.
        engine.setProperty('voice', voices[1].id)
        engine.say(text)
        engine.runAndWait()

    print(bcolors.OKGREEN + "\n\nWelcome to Attendance System based on Speaker Recognition."
          "\n\nRules are simple, say your name and roll num and the attendance will be updated.\n")
    speak("Welcome to Attendance System based on Speaker Recognition. Rules are simple, "
          "say your name and roll num and the attendance will be updated. "
          "Warning: Don't try to give proxy")
    print(bcolors.WARNING + "Warning: Don't try to give proxy" + bcolors.ENDC + "\n")

    audio = pyaudio.PyAudio()
    FORMAT = pyaudio.paInt16
    CHANNELS = 2
    RATE = 44100
    CHUNK = 1024
    RECORD_SECONDS = 12
    # Start recording.
    stream = audio.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True,
                        frames_per_buffer=CHUNK)
    r = sr.Recognizer()
    print("Speak something...\n")
    speak("The recording has started, please say Hello ewarn, along with your name and "
          "employee ID and if you are signing in or out")
    frames = []
    for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
        data = stream.read(CHUNK)
        frames.append(data)
    speak("The recording has completed, and now your information will be updated, please be "
          "patient and if you feel there is an error kindly contact the administrator")
    print("Recording saved\n")
    # Stop recording.
    stream.stop_stream()
    stream.close()
    audio.terminate()

    waveFile = wave.open(out_file, 'wb')
    waveFile.setnchannels(CHANNELS)
    waveFile.setsampwidth(audio.get_sample_size(FORMAT))
    waveFile.setframerate(RATE)
    waveFile.writeframes(b''.join(frames))
    waveFile.close()

    with sr.AudioFile(out_file) as source:
        audio = r.record(source)  # read the entire audio file
    try:
        # For testing purposes, we're just using the default API key. To use
        # another API key, call
        # `r.recognize_google(audio, key="GOOGLE_SPEECH_RECOGNITION_API_KEY")`
        # instead of `r.recognize_google(audio)`.
        text = r.recognize_google(audio)
    except sr.UnknownValueError:
        # Keep `text` a string so the trigger-word check below cannot crash on None.
        text = ''
        print("eWarn could not understand audio")
    if "hello" not in text:
        print("Trigger word missing, Please try again")
        speak("Trigger word missing, Please try again")
        exit(0)

    # Reproducible results.
    np.random.seed(123)
    random.seed(123)
    # Define the model here.
    model = DeepSpeakerModel()
    # Load the checkpoint.
    model.m.load_weights('Model.h5', by_name=True)
    mfcc_005 = sample_from_mfcc(read_mfcc(out_file, SAMPLE_RATE), NUM_FRAMES)
    # Call the model to get the embedding of shape (1, 512) for the recording.
    predict_005 = model.m.predict(np.expand_dims(mfcc_005, axis=0))

    # Average the cosine similarity against every enrolled sample, per speaker.
    select = dict()
    from statistics import mean
    for subdir, dirs, files in os.walk(rootdir):
        for dir_name in dirs:
            select_list = list()
            for file_name in os.listdir(os.path.join(rootdir, dir_name)):
                mfcc_001 = sample_from_mfcc(
                    read_mfcc(os.path.join(rootdir, dir_name, file_name), SAMPLE_RATE), NUM_FRAMES)
                predict_001 = model.m.predict(np.expand_dims(mfcc_001, axis=0))
                select_list.append(batch_cosine_similarity(predict_005, predict_001)[0])
            select[dir_name] = mean(select_list)

    print('\nPredictions :', select)
    Keymax = max(select, key=select.get)
    if select[Keymax] >= 0.5:
        # Directory names encode 'EmpName+EmpID'.
        emp_name, emp_id = Keymax.split('+')[0], Keymax.split('+')[1]
        print('The Speaker is: ', emp_name)
        speak('The Speaker is ' + str(emp_name))
        today = pd.to_datetime(datetime.datetime.date(datetime.datetime.now()))
        if text.lower().split().count('in') == 1:
            time_in = datetime.datetime.now()
            print("Current time for in:-", time_in)
            df_in = pd.read_csv(attendance_file_path, parse_dates=['Date'])
            temp_in = {'Date': datetime.datetime.date(time_in), 'EmpName': emp_name,
                       'EmpID': emp_id, 'In': time_in, 'Out': 'zero',
                       'Duration': 'zero', 'Attendance': 'zero'}
            temp_df = pd.DataFrame(temp_in, index=[0])
            if not df_in.empty:
                print('DataFrame is not empty!')
                print('\n\nIN Before Update\n', df_in)
                df3 = pd.concat([df_in, temp_df], ignore_index=True)
                df3.to_csv(attendance_file_path, index=False)
                print('\n\ndf3\n', df3.tail(5))
            else:
                print('DataFrame is empty!')
                temp_df.to_csv(attendance_file_path, index=False)
                print('After IN Update', temp_df)
            exit(0)
        if text.lower().split().count('out') == 1:
            df_out = pd.read_csv(attendance_file_path, parse_dates=['Date'])
            time_out = datetime.datetime.now()
            print("Current time for out:-", time_out)
            # A single boolean mask replaces the repeated chained-indexing
            # expressions; writing via df.loc also avoids pandas'
            # SettingWithCopyWarning.
            mask = ((df_out['Date'] == today) &
                    (df_out['EmpName'] == emp_name) &
                    (df_out['EmpID'] == int(emp_id)))
            in1 = df_out.loc[mask, 'In']
            df_out.loc[mask, 'Out'] = time_out
            out1 = df_out.loc[mask, 'Out']
            delta = pd.to_datetime(out1) - pd.to_datetime(in1)
            df_out.loc[mask, 'Duration'] = delta
            yesterday = pd.to_datetime(
                datetime.datetime.date(datetime.datetime.now() - datetime.timedelta(days=1)))
            day1 = df_out.loc[(df_out['Date'] == yesterday) &
                              (df_out['EmpName'] == emp_name) &
                              (df_out['EmpID'] == int(emp_id)), 'Attendance']
            if day1.empty:
                df_out.loc[mask, 'Attendance'] = 1
            else:
                df_out.loc[mask, 'Attendance'] = int(day1.iloc[0]) + 1
            df_out.to_csv(attendance_file_path, index=False)
            print(df_out.tail(5))
            exit(0)
    else:
        print("Don't try to give proxy")
        speak("Don't try to give proxy")
        exit(0)
import random

import numpy as np

from audio import read_mfcc
from batcher import sample_from_mfcc
from constants import SAMPLE_RATE, NUM_FRAMES
from conv_models import DeepSpeakerModel
from test import batch_cosine_similarity

# Reproducible results.
np.random.seed(123)
random.seed(123)

model = DeepSpeakerModel()
model.m.load_weights('/Users/premy/deep-speaker/checkpoints/ResCNN_triplet_training_checkpoint_175.h5',
                     by_name=True)

mfcc_001 = sample_from_mfcc(read_mfcc('samples/PhilippeRemy/PhilippeRemy_001.wav', SAMPLE_RATE), NUM_FRAMES)
mfcc_002 = sample_from_mfcc(read_mfcc('samples/PhilippeRemy/PhilippeRemy_002.wav', SAMPLE_RATE), NUM_FRAMES)
predict_001 = model.m.predict(np.expand_dims(mfcc_001, axis=0))
predict_002 = model.m.predict(np.expand_dims(mfcc_002, axis=0))

mfcc_003 = sample_from_mfcc(read_mfcc('samples/1255-90413-0001.flac', SAMPLE_RATE), NUM_FRAMES)
predict_003 = model.m.predict(np.expand_dims(mfcc_003, axis=0))

print('SAME SPEAKER', batch_cosine_similarity(predict_001, predict_002))
print('DIFF SPEAKER', batch_cosine_similarity(predict_001, predict_003))
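# A simple accept/reject rule on top of the raw similarity scores. The 0.5
# threshold is an assumption (it mirrors the attendance demo elsewhere in this
# repo), not a calibrated operating point; tune it on held-out pairs in practice.
def same_speaker(emb_a, emb_b, threshold=0.5):
    return batch_cosine_similarity(emb_a, emb_b)[0] >= threshold

print('SAME SPEAKER?', same_speaker(predict_001, predict_002))
print('DIFF SPEAKER?', same_speaker(predict_001, predict_003))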
import numpy as np

from audio import read_mfcc
from batcher import KerasFormatConverter, sample_from_mfcc
from constants import SAMPLE_RATE, NUM_FRAMES
from conv_models import DeepSpeakerModel

kc = KerasFormatConverter('./')

# Define the model here.
model = DeepSpeakerModel(include_softmax=False, include_classifier=True,
                         num_speakers_softmax=len(kc.categorical_speakers.speaker_ids))
# Load the checkpoint.
model.m.load_weights('checkpoints-classify/ResCNN_checkpoint_1.h5')

mfcc_001 = sample_from_mfcc(read_mfcc('samples/train/0/0/0-0-Recording (12).m4a', SAMPLE_RATE), NUM_FRAMES)
predict_001 = model.m.predict(np.expand_dims(mfcc_001, axis=0))
print(np.argmax(predict_001[0]))
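# To map the argmax index back to a human-readable label, index into the
# converter's speaker id list. This assumes the classifier's output order
# follows kc.categorical_speakers.speaker_ids, which is how
# KerasFormatConverter builds its targets in this repo.
predicted_speaker = kc.categorical_speakers.speaker_ids[int(np.argmax(predict_001[0]))]
print('Predicted speaker:', predicted_speaker)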
import random

import numpy as np
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Dense, Lambda
from tensorflow.keras.models import Model

from audio import read_mfcc
from batcher import sample_from_mfcc
from constants import SAMPLE_RATE, NUM_FRAMES
from conv_models import DeepSpeakerModel
from test import batch_cosine_similarity
from utils import ensures_dir, load_pickle, load_npy, train_test_sp_to_utt

# Reproducible results.
np.random.seed(123)
random.seed(123)

# Define the model here: the embedding backbone plus a shared dense layer and
# an L2-normalisation head.
dsm = DeepSpeakerModel(include_softmax=False)
base_model = dsm.m
x = base_model.output
x = Dense(1024, name='shared')(x)
x = Lambda(lambda y: K.l2_normalize(y, axis=1), name='ln1')(x)
model = Model(base_model.input, x)
# Load the checkpoint. CHECKPOINT_PATH, speaker_u, f and speakers are defined
# elsewhere in the original source.
model.load_weights(CHECKPOINT_PATH, by_name=True)

# Group the entries of f by speaker.
for i in speaker_u:
    temp_speaker = []
    for j in range(len(f)):
        if speakers[j] == i:
            temp_speaker.append(f[j])
    for k in range(len(temp_speaker)):
        if k == 0: