Code example #1
def process_batch(path, csv_path, checkpoint_tag='last', output_tag=None):
    # Process batch
    batch_output_root = os.path.join(RECONSTRUCTION_ROOT, path)

    reconstructed_paths = glob.glob(
        os.path.join(batch_output_root, '**', 'checkpoint-last.pkl'))
    batch_ids_list = [p.split('/')[-2] for p in reconstructed_paths]
    batch_ids_list.sort()
    for batch_id in tqdm(batch_ids_list, desc="Eval batch"):
        fn = 'speaker_id_all' if args.all_speakers else 'speaker_id_100'
        if output_tag is not None:
            fn += '_%s' % output_tag
        if checkpoint_tag == 'last':
            npy_path = os.path.join(batch_output_root, batch_id, '%s.npy' % fn)
        else:
            npy_path = os.path.join(batch_output_root, batch_id,
                                    '%s-%s.npy' % (fn, checkpoint_tag))
        print("Outputting to %s" % npy_path)
        if os.path.exists(npy_path) and not args.recompute:
            continue

        input_path = os.path.join(csv_path, batch_id + '.csv')
        utts = open(input_path).read().strip().split('\n')[1:]
        utt_ids = [os.path.basename(u.split(',')[0])[:-4] for u in utts]
        speaker_ids = [u.split('-')[0] for u in utt_ids]
        utt_ids = [u.split('-') for u in utt_ids]
        utt_ids = ["%s_%s-%s" % tuple(u) for u in utt_ids]

        y_pred = np.zeros(shape=(len(speaker_ids), num_samples))
        org_y_pred = np.zeros(shape=(len(speaker_ids), num_samples))

        for i in tqdm(range(len(speaker_ids))):
            fn = os.path.join(batch_output_root, batch_id,
                              'checkpoint-last.pkl')
            # original_utt = read_mfcc_from_pkl(os.path.join(batch_output_root, batch_id, '%s_samples.pkl' % batch_id), i, idx=1)
            original_utt = read_mfcc_from_pkl(os.path.join(
                batch_output_root, batch_id, 'samples.pkl'),
                                              i,
                                              idx=1)
        input_data = get_all_speakers_batch(
            speaker_ids[i], utt_ids[i], read_mfcc_from_pkl(fn, i),
            original_utt) if args.all_speakers else get_batch(
                speaker_ids[i], fn)  # the original passed ids[0], which is undefined in this function
            predictions = model.m.predict(input_data, batch_size=100)
            reconstructed_embedding = predictions[0]
            anchor_embedding = predictions[1]
            for j, other_than_anchor_embedding in enumerate(
                    predictions[2:]):  # positive + negatives
                y_pred[i][j] = batch_cosine_similarity(
                    [reconstructed_embedding],
                    [other_than_anchor_embedding])[0]
                org_y_pred[i][j] = batch_cosine_similarity(
                    [anchor_embedding], [other_than_anchor_embedding])[0]
            tqdm.write(
                str(np.argsort(y_pred[i])[-5:]) + "\t" +
                str(np.argsort(org_y_pred[i])[-5:]))
        np.save(npy_path, [y_pred, org_y_pred])
Code example #2
    def get_batch(self, batch_size, is_test=False, predict=None):
        if predict is None:
            predict = self.model.m.predict
        from test import batch_cosine_similarity

        num_triplets = batch_size // 3
        inputs = []
        k = 2  # do not change this.
        for speaker in self.speakers_list:
            inputs.append(
                self.select_speaker_data(speaker, n=k, is_test=is_test))
        inputs = np.array(
            inputs)  # num_speakers * [k, num_frames, num_fbanks, 1].
        embeddings = predict(np.vstack(inputs))
        assert embeddings.shape[-1] == 512
        # (speaker, utterance, 512)
        embeddings = np.reshape(embeddings, (len(self.speakers_list), k, 512))
        cs = batch_cosine_similarity(embeddings[:, 0], embeddings[:, 1])
        arg_sort = np.argsort(cs)
        assert len(arg_sort) > num_triplets
        anchor_speakers = arg_sort[0:num_triplets]

        anchor_embeddings = embeddings[anchor_speakers, 0]
        negative_speakers = sorted(
            set(self.speakers_list) - set(anchor_speakers))
        negative_embeddings = embeddings[negative_speakers, 0]

        selected_negative_speakers = []
        for anchor_embedding in anchor_embeddings:
            cs_negative = [
                batch_cosine_similarity([anchor_embedding], neg)
                for neg in negative_embeddings
            ]
            selected_negative_speakers.append(negative_speakers[int(
                np.argmax(cs_negative))])

        # anchor with frame 0.
        # positive with frame 1.
        # negative with frame 0.
        assert len(
            set(selected_negative_speakers).intersection(anchor_speakers)) == 0
        negative = inputs[selected_negative_speakers, 0]
        positive = inputs[anchor_speakers, 1]
        anchor = inputs[anchor_speakers, 0]
        batch_x = np.vstack([anchor, positive, negative])
        batch_y = np.zeros(shape=(len(batch_x), len(self.speakers_list)))
        return batch_x, batch_y
Code example #3
def main():

    model = DeepSpeakerModel()
    model.m.load_weights(
        '/home/nguyendat/Documents/projects/PetProject/VoiceVerification/deep-speaker/checkpoints-triplets/ResCNN_triplet_training_checkpoint_265.h5',
        by_name=True)

    # mfcc_001 = sample_from_mfcc(read_mfcc('/home/nguyendat/Documents/projects/PetProject/VoiceVerification/deep-speaker/samples/train/5-F-27/5.wav', SAMPLE_RATE), NUM_FRAMES)
    # mfcc_002 = sample_from_mfcc(read_mfcc('/home/nguyendat/Documents/projects/PetProject/VoiceVerification/deep-speaker/samples/train/5-F-27/5-2.wav', SAMPLE_RATE), NUM_FRAMES)

    # predict_001 = model.m.predict(np.expand_dims(mfcc_001, axis=0))
    # predict_002 = model.m.predict(np.expand_dims(mfcc_002, axis=0))

    # mfcc_003 = sample_from_mfcc(read_mfcc('/home/nguyendat/Documents/projects/PetProject/VoiceVerification/deep-speaker/samples/train/6-M-45/6.wav', SAMPLE_RATE), NUM_FRAMES)
    # predict_003 = model.m.predict(np.expand_dims(mfcc_003, axis=0))

    # print('SAME SPEAKER', batch_cosine_similarity(predict_001, predict_002))
    # print('DIFF SPEAKER', batch_cosine_similarity(predict_001, predict_003))
    features = []
    labels = []
    for x in range(10):
        mfcc1, mfcc2, label = load_data()
        feature1 = model.m.predict(np.expand_dims(mfcc1, axis=0))
        feature2 = model.m.predict(np.expand_dims(mfcc2, axis=0))
        cost = batch_cosine_similarity(feature1, feature2)
        # print(cost)
        features.append(cost[0])
        labels.append(label)
    # print(cost.shape)
    # Load 2 random files plus a label, run the predictions, then feed the score into an SVM;
    # this relies on the triplet-trained model.
    # features = feature1 + feature2
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.svm import SVC
    # SVC expects a 2-D feature matrix; each sample here is a single cosine-similarity score.
    features = np.array(features).reshape(-1, 1)
    labels = np.array(labels)
    clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
    clf.fit(features, labels)
    svm_pickle = open('svm.pkl', 'wb')
    pickle.dump(clf, svm_pickle)
    svm_pickle.close()
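The block above saves the fitted SVM to svm.pkl. A minimal sketch of how it might be reused at verification time follows; the helper name verify_pair is an assumption for illustration, and it keeps the same single-feature layout (one cosine score per pair) used during training.

# Illustrative inference-time use of the SVM trained above (assumed to live in svm.pkl).
def verify_pair(model, clf, mfcc_a, mfcc_b):
    emb_a = model.m.predict(np.expand_dims(mfcc_a, axis=0))
    emb_b = model.m.predict(np.expand_dims(mfcc_b, axis=0))
    score = batch_cosine_similarity(emb_a, emb_b)[0]
    return clf.predict([[score]])[0]  # 2-D input: one sample, one feature

with open('svm.pkl', 'rb') as f:
    loaded_clf = pickle.load(f)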
Code example #4
File: check.py Project: Shennor/vrec_pp
def find_statistics(pred_tensor, basepath):
    samples = find_files(basepath, 'npy')
    base = dict()
    #{speaker: [cosine sum, number of utterances, max cosine]}
    for sample in samples:
        ensure_dir_for_filename(sample)
        sp = sample.split('/')[-2]
        #print(sp)
        if sp not in base:
            base[sp] = [0, 0, -1]
        samp_tensor = load_npy(sample)
        cos = batch_cosine_similarity(pred_tensor, samp_tensor)
        base[sp][0] += cos
        base[sp][1] += 1
        if cos > base[sp][2]:
            base[sp][2] = cos
    res = dict()
    #{speaker: [average cosine, max cosine]}
    for key in base:
        average = base[key][0]/base[key][1]
        res[key] = [average, base[key][2]]
    return res
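One possible way to consume the statistics returned above is to rank speakers by their average cosine similarity to the query embedding. A minimal sketch follows; the 0.5 acceptance threshold is an illustrative assumption, not part of the original project.

# Hypothetical caller of find_statistics(): pick the speaker with the best average cosine.
stats = find_statistics(pred_tensor, basepath)  # {speaker: [average cosine, max cosine]}
best_speaker = max(stats, key=lambda s: float(stats[s][0]))
avg_cos, max_cos = stats[best_speaker]
if avg_cos >= 0.5:  # threshold chosen for illustration only
    print('Best match:', best_speaker, 'avg:', avg_cos, 'max:', max_cos)
else:
    print('No enrolled speaker is close enough')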
Code example #5
def play():

    text = None
    out_file = r"D:/Projects/Internship/samtest/file_out.wav"
    rootdir = os.path.join(os.getcwd(), 'samples')
    attendance_file_path = os.path.join(os.getcwd(), 'Attendance_data\out.csv')

    def print_data(info):
        # Read the attendance CSV back in text mode for display.
        with open(attendance_file_path, 'r', newline='') as handle:
            unserialized_data = list(csv.reader(handle))
            print(info, unserialized_data)

    # If the attendance file doesn't exist yet, create its directory and an empty CSV.
    if not os.path.isfile(attendance_file_path):
        if not os.path.exists('Attendance_data'):
            os.makedirs('Attendance_data')
        d = {
            'Date': [],
            'EmpName': [],
            'EmpID': [],
            'In': [],
            'Out': [],
            'Duration': [],
            'Attendance': []
        }
        df = pd.DataFrame(data=d)
        print('\nCreating New Attendance DataFrame : ')
        print(df)
        df.to_csv(attendance_file_path, index=False)
        #print_data('Data is created : \n')

    # compression_opts = dict(method='zip',
    #                         archive_name='out.csv')
    # df.to_csv('out.zip', index=False,
    #           compression=compression_opts)

    names = []

    for subdir, dirs, files in os.walk(rootdir):
        for dir_name in dirs:
            names.append(dir_name)

    class bcolors:
        HEADER = '\033[95m'
        OKBLUE = '\033[94m'
        OKCYAN = '\033[96m'
        OKGREEN = '\033[92m'
        WARNING = '\033[93m'
        FAIL = '\033[91m'
        ENDC = '\033[0m'
        BOLD = '\033[1m'
        UNDERLINE = '\033[4m'

    def pyttsx3(text):
        # obtain voice property
        voices = engine.getProperty('voices')
        # voice id 1 is for female and 0 for male
        engine.setProperty('voice', voices[1].id)
        # convert to audio and play
        engine.say(text)
        engine.runAndWait()

    print(
        bcolors.OKGREEN +
        "\n\nWelcome to Attendance System based on Speaker Recognition.\n\nRules are simple, say your name and roll num and the attendance will be updated.\n"
    )
    pyttsx3(
        "Welcome to Attendance System based on Speaker Recognition. Rules are simple, say your name and roll num and the attendance will be updated. Warning: Don't try to give proxy"
    )
    print(bcolors.WARNING + "Warning: Don't try to give proxy" + bcolors.ENDC +
          "\n")
    audio = pyaudio.PyAudio()

    FORMAT = pyaudio.paInt16
    CHANNELS = 2
    RATE = 44100
    CHUNK = 1024
    RECORD_SECONDS = 12
    # start Recording
    stream = audio.open(format=FORMAT,
                        channels=CHANNELS,
                        rate=RATE,
                        input=True,
                        frames_per_buffer=CHUNK)
    r = sr.Recognizer()
    print("Speak something...\n")
    pyttsx3(
        "The recording has started, please say Hello ewarn,along with your name and employee ID and if you are signing in or out"
    )

    frames = []
    for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
        data = stream.read(CHUNK)
        frames.append(data)
    pyttsx3(
        "The recording has completed, and now your information will be updated, please be patient and if you feel there is an error kindly contact the administrator"
    )
    print("Recording saved\n")
    # stop Recording
    stream.stop_stream()
    stream.close()
    audio.terminate()

    waveFile = wave.open(out_file, 'wb')
    waveFile.setnchannels(CHANNELS)
    waveFile.setsampwidth(audio.get_sample_size(FORMAT))
    waveFile.setframerate(RATE)
    waveFile.writeframes(b''.join(frames))
    waveFile.close()

    with sr.AudioFile(out_file) as source:
        #print("Say something!")
        audio = r.record(source)  # read the entire audio file
    try:
        # for testing purposes, we're just using the default API key
        # to use another API key, use `r.recognize_google(audio, key="GOOGLE_SPEECH_RECOGNITION_API_KEY")`
        # instead of `r.recognize_google(audio)`
        #print("Did you say? " + r.recognize_google(audio))
        text = r.recognize_google(audio)
    except sr.UnknownValueError:
        print("eWarn could not understand audio")
        text = ""  # keep text a string so the trigger-word check below cannot raise

    if "hello" not in text.lower():
        print("Trigger word missing, Please try again")
        pyttsx3("Trigger word missing, Please try again")
        exit(0)

    # Reproducible results.
    np.random.seed(123)
    random.seed(123)

    # Define the model here.
    model = DeepSpeakerModel()

    # Load the checkpoint.
    model.m.load_weights('Model.h5', by_name=True)

    mfcc_005 = sample_from_mfcc(read_mfcc(out_file, SAMPLE_RATE), NUM_FRAMES)

    # Call the model to get the embeddings of shape (1, 512) for each file.
    predict_005 = model.m.predict(np.expand_dims(mfcc_005, axis=0))

    #names = []
    select = dict()

    from statistics import mean

    for subdir, dirs, files in os.walk(rootdir):
        for dir_name in dirs:
            #names.append(dir_name)
            #print('person dir : ', dir_name)
            #print('person dir files : \n', os.listdir(os.path.join(rootdir, dir_name)))
            select_list = list()
            for file_name in os.listdir(os.path.join(rootdir, dir_name)):
                #print(file_name)
                #print('person dir files seperate : \n', os.path.join(rootdir, dir_name, file_name))
                mfcc_001 = sample_from_mfcc(
                    read_mfcc(os.path.join(rootdir, dir_name, file_name),
                              SAMPLE_RATE), NUM_FRAMES)
                predict_001 = model.m.predict(np.expand_dims(mfcc_001, axis=0))

                select_list.append(
                    batch_cosine_similarity(predict_005, predict_001)[0])

            #print(select_list)
            select[dir_name] = mean(select_list)
            select_list.clear()

    #print('Names : ', names)
    print('\nPredictions :', select)
    Keymax = max(select, key=select.get)

    if (select[Keymax]) >= 0.5:
        print('The Speaker is: ', Keymax.split('+')[0])
        pyttsx3('The Speaker is ' + str(Keymax.split('+')[0]))
        time_in = None
        time_out = None

        #'EmpName': [], 'EmpID':[], 'In':[], 'Out':[], 'Duration':[], 'Attendance':[]}
        if text.lower().split().count('in') == 1:
            #print('text has in', text)
            time_in = datetime.datetime.now()
            print("Current time for in:-", time_in)

            df_in = pd.read_csv(attendance_file_path, parse_dates=['Date'])
            temp_in = {'Date': datetime.datetime.date(time_in), 'EmpName': Keymax.split('+')[0], 'EmpID': Keymax.split('+')[1], \
          'In': time_in, 'Out': 'zero', 'Duration': 'zero', 'Attendance': 'zero'}
            temp_df = pd.DataFrame(temp_in, index=[0])
            #print("temp_in", temp_in)
            #print("temp_df", temp_df)
            if not df_in.empty:
                print('DataFrame is not empty!')
                #df_in.append(temp_df, ignore_index = True)
                print('\n\nIN Before Update\n', df_in)
                df3 = pd.concat([df_in, temp_df], ignore_index=True)
                df3.to_csv(r'Attendance_data\out.csv', index=False)
                print('\n\ndf3\n', df3.tail(5))
            if df_in.empty:
                print('DataFrame is empty!')
                #df_new = pd.DataFrame(temp_in)
                temp_df.to_csv(r'Attendance_data\out.csv', index=False)
                print('After IN Update', temp_df)
                exit(0)

        if text.lower().split().count('out') == 1:
            #print('Text has out')
            df_out = pd.read_csv(attendance_file_path, parse_dates=['Date'])
            #print(df_out)
            time_out = datetime.datetime.now()
            print("Current time for out:-", time_out)
            # Build one boolean mask for today's row of this employee and use
            # df.loc[mask, col] assignments to avoid chained-indexing issues.
            today = pd.to_datetime(datetime.datetime.date(datetime.datetime.now()))
            row_mask = ((df_out['Date'] == today)
                        & (df_out['EmpName'] == Keymax.split('+')[0])
                        & (df_out['EmpID'] == int(Keymax.split('+')[1])))
            in1 = df_out.loc[row_mask, 'In']
            df_out.loc[row_mask, 'Out'] = time_out
            out1 = df_out.loc[row_mask, 'Out']
            delta = pd.to_datetime(out1) - pd.to_datetime(in1)
            df_out.loc[row_mask, 'Duration'] = delta
            yesterday = pd.to_datetime(
                datetime.datetime.date(datetime.datetime.now() -
                                       datetime.timedelta(days=1)))
            day1 = df_out.loc[(df_out['Date'] == yesterday)
                              & (df_out['EmpName'] == Keymax.split('+')[0])
                              & (df_out['EmpID'] == int(Keymax.split('+')[1])),
                              'Attendance']
            if day1.empty:
                df_out.loc[row_mask, 'Attendance'] = 1
            else:
                df_out.loc[row_mask, 'Attendance'] = int(day1.iloc[0]) + 1

            df_out.to_csv(r'Attendance_data\out.csv', index=False)
            print(df_out.tail(5))
            exit(0)
    else:
        print("Don't try to give proxy")
        pyttsx3("Don't try to give proxy")
        exit(0)
Code example #6
import random

import numpy as np

from audio import read_mfcc
from batcher import sample_from_mfcc
from constants import SAMPLE_RATE, NUM_FRAMES
from conv_models import DeepSpeakerModel
from test import batch_cosine_similarity

np.random.seed(123)
random.seed(123)

model = DeepSpeakerModel()
model.m.load_weights(
    '/Users/premy/deep-speaker/checkpoints/ResCNN_triplet_training_checkpoint_175.h5',
    by_name=True)

mfcc_001 = sample_from_mfcc(
    read_mfcc('samples/PhilippeRemy/PhilippeRemy_001.wav', SAMPLE_RATE),
    NUM_FRAMES)
mfcc_002 = sample_from_mfcc(
    read_mfcc('samples/PhilippeRemy/PhilippeRemy_002.wav', SAMPLE_RATE),
    NUM_FRAMES)

predict_001 = model.m.predict(np.expand_dims(mfcc_001, axis=0))
predict_002 = model.m.predict(np.expand_dims(mfcc_002, axis=0))

mfcc_003 = sample_from_mfcc(
    read_mfcc('samples/1255-90413-0001.flac', SAMPLE_RATE), NUM_FRAMES)
predict_003 = model.m.predict(np.expand_dims(mfcc_003, axis=0))

print('SAME SPEAKER', batch_cosine_similarity(predict_001, predict_002))
print('DIFF SPEAKER', batch_cosine_similarity(predict_001, predict_003))
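If a hard same/different decision is needed instead of the raw scores printed above, one option is to threshold the cosine similarity. The 0.5 cutoff below is an illustrative assumption, not a value prescribed by deep-speaker.

def is_same_speaker(embedding_a, embedding_b, threshold=0.5):
    # Accept the pair when the row-wise cosine similarity exceeds the threshold.
    return batch_cosine_similarity(embedding_a, embedding_b)[0] > threshold

print('Same speaker?', is_same_speaker(predict_001, predict_002))
print('Same speaker?', is_same_speaker(predict_001, predict_003))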
Code example #7
def distinguish(file_path, voice_root_path):
    try:
        np.random.seed(123)
        random.seed(123)

        interpreter = tf.lite.Interpreter(
            model_path=join(dirname(__file__), 'model.tflite'))
        interpreter.allocate_tensors()

        input_details = interpreter.get_input_details()
        output_details = interpreter.get_output_details()

        if os.path.exists(file_path):
            test_audio = sample_from_mfcc(read_mfcc(file_path, SAMPLE_RATE),
                                          NUM_FRAMES)
            test_audio = test_audio.astype(np.float32)
            interpreter.set_tensor(input_details[0]['index'],
                                   np.expand_dims(test_audio, axis=0))
            interpreter.invoke()
            test_predict = interpreter.get_tensor(output_details[0]['index'])

            all_audio = []
            for root, dirs, files in os.walk(
                    join(voice_root_path, 'wave_numpy')):
                root = root.replace('\\', '/')
                for file in files:
                    if file.endswith('npy'):
                        all_audio.append(
                            (root + '/' + file, np.load(root + '/' + file)))

            if len(all_audio) > 0:
                print('use exist numpy')
                result = []
                for audio in all_audio:
                    result.append(
                        (audio[0],
                         batch_cosine_similarity(test_predict, audio[1])))

            else:
                print('use original corpus')
                all_addr = []
                for root, dirs, files in os.walk(
                        join(voice_root_path, 'wave_original')):
                    root = root.replace('\\', '/')
                    for file in files:
                        if file.endswith('flac') or file.endswith('wav'):
                            all_addr.append(root + '/' + file)

                audio_all = []
                for addr in all_addr:
                    audio = sample_from_mfcc(read_mfcc(addr, SAMPLE_RATE),
                                             NUM_FRAMES)
                    audio = audio.astype(np.float32)
                    interpreter.set_tensor(input_details[0]['index'],
                                           np.expand_dims(audio, axis=0))
                    interpreter.invoke()
                    predict_one = interpreter.get_tensor(
                        output_details[0]['index'])
                    audio_all.append((addr, predict_one))

                result = []
                for audio in audio_all:
                    result.append(
                        (audio[0],
                         batch_cosine_similarity(test_predict, audio[1])))

            # Keep the (similarity, path) pair with the highest cosine score.
            cos_max = (result[0][1], result[0][0])
            for print_out in result:
                if print_out[1] > cos_max[0]:
                    cos_max = (print_out[1], print_out[0])

            if cos_max[0] > 0.60:
                return out_format(cos_max[1], cos_max[0].item())
            else:
                return 'dont exist', 0
        else:
            return 'no wave input', 0
    except Exception:
        return 'error', 0
Code example #8
                     by_name=True)

# Sample some inputs for WAV/FLAC files for the same speaker.
# To have reproducible results every time you call this function, set the seed every time before calling it.
# np.random.seed(123)
# random.seed(123)
s1 = 'samples/LibriSpeechSamples/27/124992/27-124992-0022.wav'
s2 = 'samples/LibriSpeechSamples/27/124992/27-124992-0000.wav'
s3 = 'samples/LibriSpeechSamples/26/496/26-496-0026.wav'

mfcc_001 = sample_from_mfcc(read_mfcc(s1, SAMPLE_RATE), NUM_FRAMES)
mfcc_002 = sample_from_mfcc(read_mfcc(s2, SAMPLE_RATE), NUM_FRAMES)

# Call the model to get the embeddings of shape (1, 512) for each file.
predict_001 = model.m.predict(np.expand_dims(mfcc_001, axis=0))
predict_002 = model.m.predict(np.expand_dims(mfcc_002, axis=0))

# Do it again with a different speaker.
mfcc_003 = sample_from_mfcc(read_mfcc(s3, SAMPLE_RATE), NUM_FRAMES)
predict_003 = model.m.predict(np.expand_dims(mfcc_003, axis=0))

# Compute the cosine similarity and check that it is higher for the same speaker.
print(s1, ' & ', s2)
print('SAME SPEAKER',
      batch_cosine_similarity(predict_001,
                              predict_002))  # SAME SPEAKER [0.81564593]
print(s1, ' & ', s3)
print('DIFF SPEAKER',
      batch_cosine_similarity(predict_001,
                              predict_003))  # DIFF SPEAKER [0.1419204]
Code example #9
File: example.py Project: wy192/deep-speaker
# Load the checkpoint.
model.m.load_weights('ResCNN_triplet_training_checkpoint_265.h5', by_name=True)

# Sample some inputs for WAV/FLAC files for the same speaker.
# To have reproducible results every time you call this function, set the seed every time before calling it.
# np.random.seed(123)
# random.seed(123)
mfcc_001 = sample_from_mfcc(
    read_mfcc('samples/PhilippeRemy/PhilippeRemy_001.wav', SAMPLE_RATE),
    NUM_FRAMES)
mfcc_002 = sample_from_mfcc(
    read_mfcc('samples/PhilippeRemy/PhilippeRemy_002.wav', SAMPLE_RATE),
    NUM_FRAMES)

# Call the model to get the embeddings of shape (1, 512) for each file.
predict_001 = model.m.predict(np.expand_dims(mfcc_001, axis=0))
predict_002 = model.m.predict(np.expand_dims(mfcc_002, axis=0))

# Do it again with a different speaker.
mfcc_003 = sample_from_mfcc(
    read_mfcc('samples/1255-90413-0001.flac', SAMPLE_RATE), NUM_FRAMES)
predict_003 = model.m.predict(np.expand_dims(mfcc_003, axis=0))

# Compute the cosine similarity and check that it is higher for the same speaker.
same_speaker_similarity = batch_cosine_similarity(predict_001, predict_002)
diff_speaker_similarity = batch_cosine_similarity(predict_001, predict_003)
print('SAME SPEAKER', same_speaker_similarity)  # SAME SPEAKER [0.81564593]
print('DIFF SPEAKER', diff_speaker_similarity)  # DIFF SPEAKER [0.1419204]

assert same_speaker_similarity > diff_speaker_similarity
Code example #10
            temp_speaker.append(f[j])
    for k in range(len(temp_speaker)):
        if k == 0:
            enroll_speaker.append(temp_speaker[k])
            speakerID_enroll.append(i)
        else:
            test_speaker.append(temp_speaker[k])
            speakerID_test.append(i)

count = 0
for i in range(len(test_speaker)):
    mfcc_test = sample_from_mfcc(read_mfcc(test_speaker[i], SAMPLE_RATE), NUM_FRAMES)
    predict_002 = model.predict(np.expand_dims(mfcc_test, axis=0))
    print(predict_002.shape)
    max_score = -10**8
    pred_speaker = None
    true_speaker = speakerID_test[i]
    for j in range(len(enroll_speaker)):
        mfcc_enroll = sample_from_mfcc(read_mfcc(enroll_speaker[j], SAMPLE_RATE), NUM_FRAMES)
        predict_001 = model.predict(np.expand_dims(mfcc_enroll, axis=0))
        score = batch_cosine_similarity(predict_001, predict_002)
        if score > max_score:
            max_score = score
            pred_speaker = speakerID_enroll[j]
    print("True speaker : %s\nPredicted speaker : %s\nResult : %s\n" % (true_speaker, pred_speaker, true_speaker == pred_speaker))
    if pred_speaker == true_speaker:
        count += 1
print('accuracy: ', count / len(test_speaker))
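Note that the loop above recomputes every enrollment embedding for each test utterance. A possible optimization, sketched under the same assumptions (the model, enroll_speaker, and speakerID_enroll defined earlier), caches the enrollment embeddings once and reuses them:

# Sketch: compute enrollment embeddings once, then score each test utterance against the cache.
enroll_embeddings = []
for enroll_path in enroll_speaker:
    mfcc_enroll = sample_from_mfcc(read_mfcc(enroll_path, SAMPLE_RATE), NUM_FRAMES)
    enroll_embeddings.append(model.predict(np.expand_dims(mfcc_enroll, axis=0)))

def identify(test_path):
    mfcc_test = sample_from_mfcc(read_mfcc(test_path, SAMPLE_RATE), NUM_FRAMES)
    test_embedding = model.predict(np.expand_dims(mfcc_test, axis=0))
    scores = [batch_cosine_similarity(e, test_embedding)[0] for e in enroll_embeddings]
    return speakerID_enroll[int(np.argmax(scores))]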


Code example #11
File: batcher.py Project: zhangfaquan/deep-speaker
    def get_batch_train(self, batch_size):
        from test import batch_cosine_similarity
        s1 = time()
        self.batch_count += 1
        if self.batch_count % self.history_every == 0:
            self.update_triplets_history()

        all_indexes = range(len(self.history_embeddings_train))
        anchor_indexes = np.random.choice(a=all_indexes, size=batch_size // 3, replace=False)

        s2 = time()
        similar_negative_indexes = []
        dissimilar_positive_indexes = []
        # could be made parallel.
        for anchor_index in anchor_indexes:
            s21 = time()
            anchor_embedding = self.history_embeddings[anchor_index]
            anchor_speaker = extract_speaker(self.history_utterances[anchor_index])

            # Why self.nb_speakers // 2? An arbitrary choice: it keeps this step fast without scanning every speaker.
            negative_indexes = [j for (j, a) in enumerate(self.history_utterances)
                                if extract_speaker(a) != anchor_speaker]
            negative_indexes = np.random.choice(negative_indexes, size=self.nb_speakers // 2)

            s22 = time()

            anchor_embedding_tile = [anchor_embedding] * len(negative_indexes)
            anchor_cos = batch_cosine_similarity(anchor_embedding_tile, self.history_embeddings[negative_indexes])

            s23 = time()
            similar_negative_index = negative_indexes[np.argsort(anchor_cos)[-1]]  # [-1:]
            similar_negative_indexes.append(similar_negative_index)

            s24 = time()
            positive_indexes = [j for (j, a) in enumerate(self.history_utterances) if
                                extract_speaker(a) == anchor_speaker and j != anchor_index]
            s25 = time()
            anchor_embedding_tile = [anchor_embedding] * len(positive_indexes)
            s26 = time()
            anchor_cos = batch_cosine_similarity(anchor_embedding_tile, self.history_embeddings[positive_indexes])
            dissimilar_positive_index = positive_indexes[np.argsort(anchor_cos)[0]]  # [:1]
            dissimilar_positive_indexes.append(dissimilar_positive_index)
            s27 = time()

        s3 = time()
        batch_x = np.vstack([
            self.history_model_inputs[anchor_indexes],
            self.history_model_inputs[dissimilar_positive_indexes],
            self.history_model_inputs[similar_negative_indexes]
        ])

        s4 = time()

        # for anchor, positive, negative in zip(history_utterances[anchor_indexes],
        #                                       history_utterances[dissimilar_positive_indexes],
        #                                       history_utterances[similar_negative_indexes]):
        # print('anchor', os.path.basename(anchor),
        #       'positive', os.path.basename(positive),
        #       'negative', os.path.basename(negative))
        # print('_' * 80)

        # Also check at the utterance level that positive != anchor.
        anchor_speakers = [extract_speaker(a) for a in self.history_utterances[anchor_indexes]]
        positive_speakers = [extract_speaker(a) for a in self.history_utterances[dissimilar_positive_indexes]]
        negative_speakers = [extract_speaker(a) for a in self.history_utterances[similar_negative_indexes]]

        assert len(anchor_indexes) == len(dissimilar_positive_indexes)
        assert len(similar_negative_indexes) == len(dissimilar_positive_indexes)
        assert list(self.history_utterances[dissimilar_positive_indexes]) != list(
            self.history_utterances[anchor_indexes])
        assert anchor_speakers == positive_speakers
        assert negative_speakers != anchor_speakers

        batch_y = np.zeros(shape=(len(batch_x), 1))  # dummy. sparse softmax needs something.

        for a in anchor_speakers:
            self.metadata_train_speakers[a] += 1
        for a in positive_speakers:
            self.metadata_train_speakers[a] += 1
        for a in negative_speakers:
            self.metadata_train_speakers[a] += 1

        s5 = time()
        # print('1-2', s2 - s1)
        # print('2-3', s3 - s2)
        # print('3-4', s4 - s3)
        # print('4-5', s5 - s4)
        # print('21-22', (s22 - s21) * (batch_size // 3))
        # print('22-23', (s23 - s22) * (batch_size // 3))
        # print('23-24', (s24 - s23) * (batch_size // 3))
        # print('24-25', (s25 - s24) * (batch_size // 3))
        # print('25-26', (s26 - s25) * (batch_size // 3))
        # print('26-27', (s27 - s26) * (batch_size // 3))

        return batch_x, batch_y
Code example #12
import numpy as np
import pandas as pd

from test import batch_cosine_similarity  # deep-speaker helper, as in the other examples

# Define the model here.
#model = DeepSpeakerModel()

# Load the checkpoint.
#model.m.load_weights('ResCNN_triplet_training_checkpoint_265.h5', by_name=True)

df = pd.read_csv('../dataset_3_consolidado/todos.csv')

resultados = []
for i, linha in df.iterrows():
    for j in range(i, df.last_valid_index()):
        linha2 = df.loc[j]
        embeddings_1 = np.load('../'+linha['path'].replace('.wav', '.npy'))
        embeddings_2 = np.load('../'+linha2['path'].replace('.wav', '.npy'))
        score = batch_cosine_similarity(embeddings_1, embeddings_2)
        resultados.append([linha['voz'], linha['classe'], linha2['voz'], linha2['classe'], score])

df_resultado = pd.DataFrame(resultados, columns=['voz-1', 'classe-1', 'voz-2', 'classe-2','score'])
df_resultado.to_csv('resultado.csv', index=False)





# mfcc_001 = sample_from_mfcc(read_mfcc('../data/NORMAL/GM_NORMAL/GM6_NORMAL.wav', SAMPLE_RATE), NUM_FRAMES)
# mfcc_002 = sample_from_mfcc(read_mfcc('../data/DISFARCE/GM_DISFARCE/GM6_DISFARCE.wav', SAMPLE_RATE), NUM_FRAMES)

# # Call the model to get the embeddings of shape (1, 512) for each file.
# predict_001 = model.m.predict(np.expand_dims(mfcc_001, axis=0))
# predict_002 = model.m.predict(np.expand_dims(mfcc_002, axis=0))
Code example #13
File: backend.py Project: kareemamrr/deep-speaker
    def get_score(embeds):
        return batch_cosine_similarity(embeds[0], embeds[1])
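For context, every example on this page feeds batch_cosine_similarity two batches of 512-dimensional embeddings and gets one score per row. A minimal sketch of an equivalent computation (an assumption based on how the function is used here, not the project's exact source) is:

import numpy as np

def cosine_similarity_rows(x1, x2):
    # Row-wise cosine similarity between two (batch, dim) arrays.
    x1 = np.asarray(x1, dtype=np.float64)
    x2 = np.asarray(x2, dtype=np.float64)
    dots = np.sum(x1 * x2, axis=1)
    norms = np.linalg.norm(x1, axis=1) * np.linalg.norm(x2, axis=1)
    return dots / np.maximum(norms, 1e-12)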
Code example #14
def process_single_utts(path, checkpoint_tag='last', output_tag=None):
    # Process single utts
    root = os.path.join(RECONSTRUCTION_ROOT,
                        path) if path else RECONSTRUCTION_ROOT
    reconstructed_paths = glob.glob(
        os.path.join(root, '**', 'checkpoint-%s.pkl' % checkpoint_tag))
    ids_list = [p.split('/')[-2] for p in reconstructed_paths]
    ids_list.sort()
    for i, utt_id in tqdm(list(enumerate(ids_list)), desc='Eval'):
        fn = 'speaker_id_all' if args.all_speakers else 'speaker_id_100'
        if output_tag is not None:
            fn += '_%s' % output_tag
        if checkpoint_tag == 'last':
            npy_path = os.path.join(root, utt_id, '%s.npy' % fn)
        else:
            npy_path = os.path.join(root, utt_id,
                                    '%s-%s.npy' % (fn, checkpoint_tag))
        print("Outputting to %s" % npy_path)

        if os.path.exists(npy_path) and not args.recompute:
            continue

        ids = re.split('-|_', utt_id)

        y_pred = np.zeros(shape=num_samples)
        org_y_pred = np.zeros(shape=num_samples)

        # fn = sorted(list(glob.glob('/home/trungvd/repos/speech-reconstruction/outputs/librispeech/' + '-'.join(ids) + '/checkpoint-*.pkl')))[-1]
        fn = os.path.join(root, utt_id, 'checkpoint-%s.pkl' % checkpoint_tag)
        # Note: the original fallback branch rebuilt exactly the same path, so it was a
        # no-op; read_mfcc_from_pkl below will fail if the checkpoint file is missing.

        reconstructed_utt = read_mfcc_from_pkl(fn)
        original_utt = read_mfcc_from_pkl(os.path.join(
            RECONSTRUCTION_ROOT, "outputs", "librispeech", utt_id,
            '%s_samples.pkl' % utt_id),
                                          0,
                                          idx=1)
        # original_utt = read_mfcc_from_pkl(os.path.join(RECONSTRUCTION_ROOT, "outputs", "librispeech", utt_id, 'samples.pkl'), 0, idx=1)
        input_data = get_all_speakers_batch(
            ids[0], utt_id, reconstructed_utt,
            original_utt) if args.all_speakers else get_batch(ids[0], fn)
        predictions = model.m.predict(input_data, batch_size=100)
        reconstructed_embeddings = predictions[:1]
        anchor_embedding = predictions[1]
        for j, other_than_anchor_embedding in enumerate(
                predictions[2:]):  # positive + negatives
            y_pred[j] = np.max([
                batch_cosine_similarity([reconstructed_embedding],
                                        [other_than_anchor_embedding])[0]
                for reconstructed_embedding in reconstructed_embeddings
            ], 0)
            org_y_pred[j] = batch_cosine_similarity(
                [anchor_embedding], [other_than_anchor_embedding])[0]

        normalize = lambda x: (x - np.mean(x)) / np.var(x)
        tqdm.write('\t'.join([
            utt_id,
            "pred: " + str(np.argsort(y_pred)[-5:]),
            str(y_pred[np.argsort(y_pred)[-5:]]),
            "org: " + str(np.argsort(org_y_pred)[-5:]),
            str(org_y_pred[np.argsort(org_y_pred)[-5:]]),
            # "mae: " + str(np.average(np.abs(normalize(reconstructed_utt) - normalize(original_utt))))
        ]))
        np.save(npy_path, [y_pred, org_y_pred])