Example #1
def start_training(working_dir, pre_training_phase=True):
    ensures_dir(CHECKPOINTS_SOFTMAX_DIR)
    ensures_dir(CHECKPOINTS_TRIPLET_DIR)
    batch_input_shape = [None, NUM_FRAMES, NUM_FBANKS, 1]
    if pre_training_phase:
        logger.info('Softmax pre-training.')
        kc = KerasFormatConverter(working_dir)
        num_speakers_softmax = len(kc.categorical_speakers.speaker_ids)
        dsm = DeepSpeakerModel(batch_input_shape, include_softmax=True, num_speakers_softmax=num_speakers_softmax)
        dsm.m.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
        pre_training_checkpoint = load_best_checkpoint(CHECKPOINTS_SOFTMAX_DIR)
        if pre_training_checkpoint is not None:
            initial_epoch = int(pre_training_checkpoint.split('/')[-1].split('.')[0].split('_')[-1])
            logger.info(f'Initial epoch is {initial_epoch}.')
            logger.info(f'Loading softmax checkpoint: {pre_training_checkpoint}.')
            dsm.m.load_weights(pre_training_checkpoint)  # latest one.
        else:
            initial_epoch = 0
        fit_model_softmax(dsm, kc.kx_train, kc.ky_train, kc.kx_test, kc.ky_test, initial_epoch=initial_epoch)
    else:
        logger.info('Training with the triplet loss.')
        dsm = DeepSpeakerModel(batch_input_shape, include_softmax=False)
        triplet_checkpoint = load_best_checkpoint(CHECKPOINTS_TRIPLET_DIR)
        pre_training_checkpoint = load_best_checkpoint(CHECKPOINTS_SOFTMAX_DIR)
        if triplet_checkpoint is not None:
            logger.info(f'Loading triplet checkpoint: {triplet_checkpoint}.')
            dsm.m.load_weights(triplet_checkpoint)
        elif pre_training_checkpoint is not None:
            logger.info(f'Loading pre-training checkpoint: {pre_training_checkpoint}.')
            # If `by_name` is True, weights are loaded into layers only if they share the
            # same name. This is useful for fine-tuning or transfer-learning models where
            # some of the layers have changed.
            dsm.m.load_weights(pre_training_checkpoint, by_name=True)
        dsm.m.compile(optimizer=SGD(), loss=deep_speaker_loss)
        fit_model(dsm, working_dir, NUM_FRAMES)
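
The epoch bookkeeping above assumes checkpoint basenames end in `_<epoch>` (e.g. `ResCNN_softmax_checkpoint_12.h5`). A minimal sketch of what a `load_best_checkpoint`-style helper could look like under that naming scheme (illustrative only, not the repo's exact implementation):

import os
from glob import glob

def load_best_checkpoint_sketch(checkpoint_dir):
    # Pick the .h5 checkpoint with the highest epoch suffix, or None if empty.
    checkpoints = glob(os.path.join(checkpoint_dir, '*.h5'))
    if not checkpoints:
        return None
    def epoch_of(path):
        return int(os.path.basename(path).split('.')[0].split('_')[-1])
    return max(checkpoints, key=epoch_of)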
Example #2
def start_training(working_dir):
    ensures_dir(CHECKPOINTS_MTL_DIR)
    batch_input_shape = [None, NUM_FRAMES, NUM_FBANKS, 1]
    logger.info('Started training.')
    kc = KerasFormatConverter(working_dir)
 
    num_speakers_softmax = len(kc.categorical_speakers.speaker_ids)
    logger.info(f'categorical_speakers: {kc.categorical_speakers.speaker_ids}')
    dsm = DeepSpeakerModel(batch_input_shape, include_softmax=False, num_speakers_softmax=num_speakers_softmax)
    base_model = dsm.m
    x = base_model.output
    x = Dense(1024, name='shared')(x)
    y = Dense(1024, name='speaker_task')(x)
    speaker_out = Dense(num_speakers_softmax, activation='softmax', name='speaker_pred')(y)
    gender_out = Dense(1, activation='sigmoid', name='gender_pred')(x)
    model = Model(inputs=base_model.input, outputs=[speaker_out, gender_out])

    model.compile(optimizer='adam',
                  loss=['sparse_categorical_crossentropy', 'binary_crossentropy'],
                  metrics={'speaker_pred': 'accuracy', 'gender_pred': 'binary_accuracy'})
    training_checkpoint = load_best_checkpoint(CHECKPOINTS_MTL_DIR)
    if training_checkpoint is not None:
        initial_epoch = int(training_checkpoint.split('/')[-1].split('.')[0].split('_')[-1])
        logger.info(f'Initial epoch is {initial_epoch}.')
        logger.info(f'Loading MTL checkpoint: {training_checkpoint}.')
        model.load_weights(training_checkpoint)  # latest one.
    else:
        initial_epoch = 0
    fit_model_mtl(model, kc.kx_train, kc.ky_train, kc.kg_train,
                  kc.kx_test, kc.ky_test, kc.kg_test, initial_epoch=initial_epoch)
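
If one task dominates training, Keras can balance the two heads with `loss_weights`. A hedged variant of the compile call above (the 0.8/0.2 split is an arbitrary assumption, not a tuned value from the source):

model.compile(optimizer='adam',
              loss={'speaker_pred': 'sparse_categorical_crossentropy',
                    'gender_pred': 'binary_crossentropy'},
              # Down-weight the binary gender task relative to speaker ID.
              loss_weights={'speaker_pred': 0.8, 'gender_pred': 0.2},
              metrics={'speaker_pred': 'accuracy', 'gender_pred': 'binary_accuracy'})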
Example #3
def test(working_dir, checkpoint_file=None):
    batch_input_shape = [None, NUM_FRAMES, NUM_FBANKS, 1]
    dsm = DeepSpeakerModel(batch_input_shape)
    if checkpoint_file is None:
        checkpoint_file = load_best_checkpoint(CHECKPOINTS_TRIPLET_DIR)
    if checkpoint_file is not None:
        logger.info(
            f'Found checkpoint [{checkpoint_file}]. Loading weights...')
        dsm.m.load_weights(checkpoint_file, by_name=True)
    else:
        logger.info(f'Could not find any checkpoint in {CHECKPOINTS_TRIPLET_DIR}.')
        exit(1)

    fm, tpr, acc, eer = eval_model(working_dir, model=dsm)
    logger.info(f'f-measure = {fm:.3f}, true positive rate = {tpr:.3f}, '
                f'accuracy = {acc:.3f}, equal error rate = {eer:.3f}')
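
For reference, an equal error rate like the one reported by `eval_model` can be derived from similarity scores and binary same/different labels as the point where the false positive and false negative rates cross. A minimal sketch under those assumed inputs (the repo's internals may differ):

import numpy as np
from sklearn.metrics import roc_curve

def compute_eer(labels, scores):
    # labels: 1 for same-speaker pairs, 0 otherwise; scores: similarities.
    fpr, tpr, _ = roc_curve(labels, scores)
    fnr = 1 - tpr
    idx = np.nanargmin(np.abs(fnr - fpr))
    return (fpr[idx] + fnr[idx]) / 2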
Example #4
def main():
    # Use hard-negative selection when any command-line argument is given.
    select = len(sys.argv) > 1
    print('select', select)

    working_dir = '/media/philippe/8TB/deep-speaker'
    # By construction, these losses should be much higher than the normal
    # losses; that is how we select the batches.
    batch_input_shape = [None, NUM_FRAMES, NUM_FBANKS, 1]
    print('Testing with the triplet losses.')
    dsm = DeepSpeakerModel(batch_input_shape, include_softmax=False)
    triplet_checkpoint = load_best_checkpoint(CHECKPOINTS_TRIPLET_DIR)
    pre_training_checkpoint = load_best_checkpoint(CHECKPOINTS_SOFTMAX_DIR)
    if triplet_checkpoint is not None:
        print(f'Loading triplet checkpoint: {triplet_checkpoint}.')
        dsm.m.load_weights(triplet_checkpoint)
    elif pre_training_checkpoint is not None:
        print(f'Loading pre-training checkpoint: {pre_training_checkpoint}.')
        # If `by_name` is True, weights are loaded into layers only if they share the
        # same name. This is useful for fine-tuning or transfer-learning models where
        # some of the layers have changed.
        dsm.m.load_weights(pre_training_checkpoint, by_name=True)
    dsm.m.compile(optimizer='adam', loss=deep_speaker_loss)
    kc = KerasFormatConverter(working_dir)
    if select:
        print('TripletBatcherSelectHardNegatives()')
        batcher = TripletBatcherSelectHardNegatives(kc.kx_train, kc.ky_train,
                                                    kc.kx_test, kc.ky_test,
                                                    dsm)
    else:
        print('TripletBatcher()')
        batcher = TripletBatcher(kc.kx_train, kc.ky_train, kc.kx_test,
                                 kc.ky_test)
    batch_size = BATCH_SIZE
    losses = []
    # Monitoring loop: prints the running mean triplet loss batch after
    # batch, indefinitely (stop with Ctrl+C).
    while True:
        _bx, _by = batcher.get_batch(batch_size, is_test=False)
        losses.append(
            dsm.m.evaluate(_bx, _by, verbose=0, batch_size=BATCH_SIZE))
        print(np.mean(losses))
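
A bounded variant of the monitoring loop, if a fixed number of batches is preferred over running forever (n_batches is an arbitrary choice, not from the source):

n_batches = 100
losses = [dsm.m.evaluate(*batcher.get_batch(BATCH_SIZE, is_test=False),
                         verbose=0, batch_size=BATCH_SIZE)
          for _ in range(n_batches)]
print('mean triplet loss over', n_batches, 'batches:', np.mean(losses))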
Example #5
def main():

    model = DeepSpeakerModel()
    model.m.load_weights(
        '/home/nguyendat/Documents/projects/PetProject/VoiceVerification/deep-speaker/checkpoints-triplets/ResCNN_triplet_training_checkpoint_265.h5',
        by_name=True)

    # mfcc_001 = sample_from_mfcc(read_mfcc('/home/nguyendat/Documents/projects/PetProject/VoiceVerification/deep-speaker/samples/train/5-F-27/5.wav', SAMPLE_RATE), NUM_FRAMES)
    # mfcc_002 = sample_from_mfcc(read_mfcc('/home/nguyendat/Documents/projects/PetProject/VoiceVerification/deep-speaker/samples/train/5-F-27/5-2.wav', SAMPLE_RATE), NUM_FRAMES)

    # predict_001 = model.m.predict(np.expand_dims(mfcc_001, axis=0))
    # predict_002 = model.m.predict(np.expand_dims(mfcc_002, axis=0))

    # mfcc_003 = sample_from_mfcc(read_mfcc('/home/nguyendat/Documents/projects/PetProject/VoiceVerification/deep-speaker/samples/train/6-M-45/6.wav', SAMPLE_RATE), NUM_FRAMES)
    # predict_003 = model.m.predict(np.expand_dims(mfcc_003, axis=0))

    # print('SAME SPEAKER', batch_cosine_similarity(predict_001, predict_002))
    # print('DIFF SPEAKER', batch_cosine_similarity(predict_001, predict_003))
    features = []
    labels = []
    for _ in range(10):
        mfcc1, mfcc2, label = load_data()
        feature1 = model.m.predict(np.expand_dims(mfcc1, axis=0))
        feature2 = model.m.predict(np.expand_dims(mfcc2, axis=0))
        cost = batch_cosine_similarity(feature1, feature2)
        # print(cost)
        features.append(cost[0])
        labels.append(label)
    # print(cost.shape)
    # Load two random files + label, run the triplet model to get embeddings,
    # then feed the cosine-similarity scores into an SVM.
    # features = feature1 + feature2
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.svm import SVC
    features = np.array(features).reshape(-1, 1)  # (n_pairs, 1) for the SVM.
    labels = np.array(labels)
    clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
    clf.fit(features, labels)
    with open('svm.pkl', 'wb') as svm_pickle:
        pickle.dump(clf, svm_pickle)
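
A minimal sketch of using the pickled SVM at inference time; it assumes `model` and `load_data` are available as in `main` above, and that scores keep the (n, 1) shape used during fitting:

with open('svm.pkl', 'rb') as f_in:
    clf_loaded = pickle.load(f_in)
mfcc_a, mfcc_b, _ = load_data()
emb_a = model.m.predict(np.expand_dims(mfcc_a, axis=0))
emb_b = model.m.predict(np.expand_dims(mfcc_b, axis=0))
score = batch_cosine_similarity(emb_a, emb_b)  # shape (1,)
print('same speaker?', clf_loaded.predict(np.asarray(score).reshape(1, -1)))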
Example #6
def main2():
    batch_input_shape = [None, NUM_FRAMES, NUM_FBANKS, 1]
    dsm = DeepSpeakerModel(batch_input_shape, include_softmax=False)
    dsm.m.compile(optimizer='adam', loss=deep_speaker_loss)
    dsm.m.load_weights('/Users/premy/deep-speaker/ResCNN_checkpoint_102.h5',
                       by_name=True)
    dsm.m.summary()
    batcher = LazyTripletBatcher(working_dir='/Users/premy/deep-speaker',
                                 max_length=NUM_FRAMES,
                                 model=dsm)
    bs = 18

    # Mean triplet loss over 100 training batches from the batcher.
    print(
        np.mean([
            dsm.m.evaluate(*batcher.get_batch_train(batch_size=bs),
                           batch_size=bs,
                           verbose=0) for _ in range(100)
        ]))
    # Mean triplet loss over 100 test batches.
    print(
        np.mean([
            dsm.m.evaluate(*batcher.get_batch_test(batch_size=bs),
                           batch_size=bs,
                           verbose=0) for _ in range(100)
        ]))
    # Mean triplet loss over 100 random (non-curated) training batches.
    print(
        np.mean([
            dsm.m.evaluate(*batcher.get_random_batch(batch_size=bs,
                                                     is_test=False),
                           batch_size=bs,
                           verbose=0) for _ in range(100)
        ]))
    # Mean triplet loss over 100 random (non-curated) test batches.
    print(
        np.mean([
            dsm.m.evaluate(*batcher.get_random_batch(batch_size=bs,
                                                     is_test=True),
                           batch_size=bs,
                           verbose=0) for _ in range(100)
        ]))
Example #7
def play():

    text = None
    out_file = r"D:/Projects/Internship/samtest/file_out.wav"
    rootdir = os.path.join(os.getcwd(), 'samples')
    attendance_file_path = os.path.join(os.getcwd(), 'Attendance_data', 'out.csv')

    def print_data(info):
        with open(attendance_file_path, 'r', newline='') as handle:
            print(info, list(csv.reader(handle)))

    # Create the attendance file if it doesn't exist yet.
    if not os.path.isfile(attendance_file_path):
        os.makedirs('Attendance_data', exist_ok=True)
        d = {
            'Date': [],
            'EmpName': [],
            'EmpID': [],
            'In': [],
            'Out': [],
            'Duration': [],
            'Attendance': []
        }
        df = pd.DataFrame(data=d)
        print('\nCreating New Attendance DataFrame : ')
        print(df)
        df.to_csv(attendance_file_path, index=False)
        #print_data('Data is created : \n')

    # compression_opts = dict(method='zip',
    #                         archive_name='out.csv')
    # df.to_csv('out.zip', index=False,
    #           compression=compression_opts)

    names = []

    for subdir, dirs, files in os.walk(rootdir):
        for dir_name in dirs:
            names.append(dir_name)

    class bcolors:
        HEADER = '\033[95m'
        OKBLUE = '\033[94m'
        OKCYAN = '\033[96m'
        OKGREEN = '\033[92m'
        WARNING = '\033[93m'
        FAIL = '\033[91m'
        ENDC = '\033[0m'
        BOLD = '\033[1m'
        UNDERLINE = '\033[4m'

    def pyttsx3(text):
        # `engine` is assumed to be a module-level pyttsx3.init() instance
        # (the import section of this script is not shown).
        # Obtain the voice property; voice id 1 is female, 0 is male.
        voices = engine.getProperty('voices')
        engine.setProperty('voice', voices[1].id)
        # Convert the text to audio and play it.
        engine.say(text)
        engine.runAndWait()

    print(
        bcolors.OKGREEN +
        "\n\nWelcome to Attendance System based on Speaker Recognition.\n\nRules are simple, say your name and roll num and the attendance will be updated.\n"
    )
    pyttsx3(
        "Welcome to Attendance System based on Speaker Recognition. Rules are simple, say your name and roll num and the attendance will be updated. Warning: Don't try to give proxy"
    )
    print(bcolors.WARNING + "Warning: Don't try to give proxy" + bcolors.ENDC +
          "\n")
    audio = pyaudio.PyAudio()

    FORMAT = pyaudio.paInt16
    CHANNELS = 2
    RATE = 44100
    CHUNK = 1024
    RECORD_SECONDS = 12
    # start Recording
    stream = audio.open(format=FORMAT,
                        channels=CHANNELS,
                        rate=RATE,
                        input=True,
                        frames_per_buffer=CHUNK)
    r = sr.Recognizer()
    print("Speak something...\n")
    pyttsx3(
        "The recording has started, please say Hello ewarn,along with your name and employee ID and if you are signing in or out"
    )

    frames = []
    for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
        data = stream.read(CHUNK)
        frames.append(data)
    pyttsx3(
        "The recording has completed, and now your information will be updated, please be patient and if you feel there is an error kindly contact the administrator"
    )
    print("Recording saved\n")
    # stop Recording
    stream.stop_stream()
    stream.close()
    audio.terminate()

    waveFile = wave.open(out_file, 'wb')
    waveFile.setnchannels(CHANNELS)
    waveFile.setsampwidth(audio.get_sample_size(FORMAT))
    waveFile.setframerate(RATE)
    waveFile.writeframes(b''.join(frames))
    waveFile.close()

    with sr.AudioFile(out_file) as source:
        #print("Say something!")
        audio = r.record(source)  # read the entire audio file
    try:
        # For testing purposes, we're just using the default API key;
        # to use another API key, use
        # `r.recognize_google(audio, key="GOOGLE_SPEECH_RECOGNITION_API_KEY")`.
        # print("Did you say? " + r.recognize_google(audio))
        text = r.recognize_google(audio)
    except sr.UnknownValueError:
        print("eWarn could not understand audio")
        exit(0)

    if "hello" not in text.lower():
        print("Trigger word missing, Please try again")
        pyttsx3("Trigger word missing, Please try again")
        exit(0)

    # Reproducible results.
    np.random.seed(123)
    random.seed(123)

    # Define the model here.
    model = DeepSpeakerModel()

    # Load the checkpoint.
    model.m.load_weights('Model.h5', by_name=True)

    mfcc_005 = sample_from_mfcc(read_mfcc(out_file, SAMPLE_RATE), NUM_FRAMES)

    # Call the model to get the embeddings of shape (1, 512) for each file.
    predict_005 = model.m.predict(np.expand_dims(mfcc_005, axis=0))

    #names = []
    select = dict()

    from statistics import mean

    for subdir, dirs, files in os.walk(rootdir):
        for dir_name in dirs:
            #names.append(dir_name)
            #print('person dir : ', dir_name)
            #print('person dir files : \n', os.listdir(os.path.join(rootdir, dir_name)))
            select_list = list()
            for file_name in os.listdir(os.path.join(rootdir, dir_name)):
                #print(file_name)
                #print('person dir files seperate : \n', os.path.join(rootdir, dir_name, file_name))
                mfcc_001 = sample_from_mfcc(
                    read_mfcc(os.path.join(rootdir, dir_name, file_name),
                              SAMPLE_RATE), NUM_FRAMES)
                predict_001 = model.m.predict(np.expand_dims(mfcc_001, axis=0))

                select_list.append(
                    batch_cosine_similarity(predict_005, predict_001)[0])

            #print(select_list)
            select[dir_name] = mean(select_list)
            select_list.clear()

    #print('Names : ', names)
    print('\nPredictions :', select)
    Keymax = max(select, key=select.get)

    if (select[Keymax]) >= 0.5:
        print('The Speaker is: ', Keymax.split('+')[0])
        pyttsx3('The Speaker is ' + str(Keymax.split('+')[0]))
        time_in = None
        time_out = None

        #'EmpName': [], 'EmpID':[], 'In':[], 'Out':[], 'Duration':[], 'Attendance':[]}
        if text.lower().split().count('in') == 1:
            #print('text has in', text)
            time_in = datetime.datetime.now()
            print("Current time for in:-", time_in)

            df_in = pd.read_csv(attendance_file_path, parse_dates=['Date'])
            temp_in = {'Date': datetime.datetime.date(time_in),
                       'EmpName': Keymax.split('+')[0],
                       'EmpID': Keymax.split('+')[1],
                       'In': time_in, 'Out': 'zero',
                       'Duration': 'zero', 'Attendance': 'zero'}
            temp_df = pd.DataFrame(temp_in, index=[0])
            #print("temp_in", temp_in)
            #print("temp_df", temp_df)
            if not df_in.empty:
                print('DataFrame is not empty!')
                #df_in.append(temp_df, ignore_index = True)
                print('\n\nIN Before Update\n', df_in)
                df3 = pd.concat([df_in, temp_df], ignore_index=True)
                df3.reset_index()
                df3.to_csv(attendance_file_path, index=False)
                print('\n\ndf3\n', df3.tail(5))
            if df_in.empty:
                print('DataFrame is empty!')
                #df_new = pd.DataFrame(temp_in)
                temp_df.to_csv(attendance_file_path, index=False)
                print('After IN Update', temp_df)
                exit(0)

        if text.lower().split().count('out') == 1:
            #print('Text has out')
            df_out = pd.read_csv(attendance_file_path, parse_dates=['Date'])
            #print(df_out)
            time_out = datetime.datetime.now()
            print("Current time for out:-", time_out)
            today = pd.to_datetime(datetime.datetime.date(datetime.datetime.now()))
            emp_name = Keymax.split('+')[0]
            emp_id = int(Keymax.split('+')[1])
            # Select today's row for this employee once, then write through
            # df.loc[mask, col] (chained .loc assignment may silently fail).
            mask = ((df_out['Date'] == today)
                    & (df_out['EmpName'] == emp_name)
                    & (df_out['EmpID'] == emp_id))
            in1 = df_out.loc[mask, 'In']
            df_out.loc[mask, 'Out'] = time_out
            out1 = df_out.loc[mask, 'Out']
            delta = pd.to_datetime(out1) - pd.to_datetime(in1)
            df_out.loc[mask, 'Duration'] = delta
            # Attendance: yesterday's count + 1, or 1 if no row for yesterday.
            yesterday = pd.to_datetime(
                datetime.datetime.date(datetime.datetime.now() -
                                       datetime.timedelta(days=1)))
            day1 = df_out.loc[(df_out['Date'] == yesterday)
                              & (df_out['EmpName'] == emp_name)
                              & (df_out['EmpID'] == emp_id), 'Attendance']
            if day1.empty:
                df_out.loc[mask, 'Attendance'] = 1
            else:
                df_out.loc[mask, 'Attendance'] = int(day1.iloc[0]) + 1

            df_out.to_csv(attendance_file_path, index=False)
            print(df_out.tail(5))
            exit(0)
    else:
        print("Don't try to give proxy")
        pyttsx3("Don't try to give proxy")
        exit(0)
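
The identification loop above recomputes an embedding for every enrollment file on each run. A hedged sketch of caching per-speaker mean embeddings instead (the helper name and cache path are hypothetical, not part of the source):

import pickle

def build_enrollment_cache(rootdir, model, cache_path='enroll.pkl'):
    # Average one embedding per enrollment file for each speaker directory.
    cache = {}
    for dir_name in os.listdir(rootdir):
        speaker_dir = os.path.join(rootdir, dir_name)
        if not os.path.isdir(speaker_dir):
            continue
        embs = []
        for file_name in os.listdir(speaker_dir):
            mfcc = sample_from_mfcc(
                read_mfcc(os.path.join(speaker_dir, file_name), SAMPLE_RATE),
                NUM_FRAMES)
            embs.append(model.m.predict(np.expand_dims(mfcc, axis=0))[0])
        cache[dir_name] = np.mean(embs, axis=0)
    with open(cache_path, 'wb') as handle:
        pickle.dump(cache, handle)
    return cache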
Example #8
import numpy as np
import random
from audio import read_mfcc
from batcher import sample_from_mfcc
from constants import SAMPLE_RATE, NUM_FRAMES
from conv_models import DeepSpeakerModel
from test import batch_cosine_similarity

np.random.seed(123)
random.seed(123)

model = DeepSpeakerModel()
model.m.load_weights(
    '/Users/premy/deep-speaker/checkpoints/ResCNN_triplet_training_checkpoint_175.h5',
    by_name=True)

mfcc_001 = sample_from_mfcc(
    read_mfcc('samples/PhilippeRemy/PhilippeRemy_001.wav', SAMPLE_RATE),
    NUM_FRAMES)
mfcc_002 = sample_from_mfcc(
    read_mfcc('samples/PhilippeRemy/PhilippeRemy_002.wav', SAMPLE_RATE),
    NUM_FRAMES)

predict_001 = model.m.predict(np.expand_dims(mfcc_001, axis=0))
predict_002 = model.m.predict(np.expand_dims(mfcc_002, axis=0))

mfcc_003 = sample_from_mfcc(
    read_mfcc('samples/1255-90413-0001.flac', SAMPLE_RATE), NUM_FRAMES)
predict_003 = model.m.predict(np.expand_dims(mfcc_003, axis=0))

print('SAME SPEAKER', batch_cosine_similarity(predict_001, predict_002))
print('DIFF SPEAKER', batch_cosine_similarity(predict_001, predict_003))
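
Because the embeddings are L2-normalized by the model, `batch_cosine_similarity` reduces to a row-wise dot product. A minimal sketch of that computation, assuming unit-norm inputs of shape (batch, 512):

def cosine_similarity_sketch(x1, x2):
    # Row-wise dot product == cosine similarity for unit-norm vectors.
    return np.sum(x1 * x2, axis=1)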
Example #9
import random

import numpy as np

from audio import read_mfcc
from batcher import sample_from_mfcc
from constants import SAMPLE_RATE, NUM_FRAMES
from conv_models import DeepSpeakerModel
from test import batch_cosine_similarity
from batcher import KerasFormatConverter

kc = KerasFormatConverter('./')
# Define the model here.
model = DeepSpeakerModel(include_softmax=False,
                         include_classifier=True,
                         num_speakers_softmax=len(
                             kc.categorical_speakers.speaker_ids))

# Load the checkpoint.
model.m.load_weights('checkpoints-classify/ResCNN_checkpoint_1.h5')

mfcc_001 = sample_from_mfcc(
    read_mfcc('samples/train/0/0/0-0-Recording (12).m4a', SAMPLE_RATE),
    NUM_FRAMES)
predict_001 = model.m.predict(np.expand_dims(mfcc_001, axis=0))
print(np.argmax(predict_001[0]))
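
A hypothetical follow-up to the argmax above: report the top-3 most likely speakers. This assumes the softmax output ordering matches `kc.categorical_speakers.speaker_ids`, which the snippet does not guarantee:

probs = predict_001[0]
for idx in np.argsort(probs)[::-1][:3]:
    print(kc.categorical_speakers.speaker_ids[idx], probs[idx])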
Example #10
import random

import numpy as np

from tensorflow.keras import backend as K
from tensorflow.keras.layers import Dense, Lambda
from tensorflow.keras.models import Model

from audio import read_mfcc
from batcher import sample_from_mfcc
from constants import SAMPLE_RATE, NUM_FRAMES
from conv_models import DeepSpeakerModel
from test import batch_cosine_similarity
from utils import ensures_dir, load_pickle, load_npy, train_test_sp_to_utt

# Reproducible results.
np.random.seed(123)
random.seed(123)

# Define the model here.
dsm = DeepSpeakerModel(include_softmax=False)
base_model = dsm.m
x = base_model.output
x = Dense(1024, name='shared')(x)
x = Lambda(lambda y: K.l2_normalize(y, axis=1), name='ln1')(x)
model = Model(base_model.input, x)

# Load the checkpoint (CHECKPOINT_PATH is assumed to be defined elsewhere).
model.load_weights(CHECKPOINT_PATH, by_name=True)

# speaker_u, speakers and f are assumed to be defined earlier
# (e.g. precomputed embeddings f with their speaker labels).
for i in speaker_u:
    temp_speaker = []
    for j in range(len(f)):
        if speakers[j] == i:
            temp_speaker.append(f[j])
    for k in range(len(temp_speaker)):
        if k == 0: