def get_voice_equality(email, threshold=0.93,
                       voices_root="/home/mohammadali/PycharmProjects/AI_Auth/media/voices"):
    """Check whether the voice recordings stored for *email* match one speaker.

    The user's ``.ogg`` recordings are grouped by their parent directory,
    each group is split in half, each half is embedded with resemblyzer's
    ``VoiceEncoder.embed_speaker``, and the inner product of the two
    embeddings is compared against *threshold* (resemblyzer embeddings are
    L2-normed, so the inner product is the cosine similarity).

    Args:
        email: Directory name under *voices_root* holding the user's recordings.
        threshold: Minimum similarity accepted as a match. Defaults to 0.93,
            the value previously hard-coded in the body.
        voices_root: Base directory of per-user voice folders. Defaults to the
            previously hard-coded path.

    Returns:
        True if the similarity between the two halves exceeds *threshold*,
        False otherwise (including when no recordings exist for the user).
    """
    encoder = VoiceEncoder()
    wav_fpaths = list(Path(voices_root, email).glob("**/*.ogg"))
    if not wav_fpaths:
        # No recordings for this user: fail the check explicitly instead of
        # crashing inside the encoder on an empty input.
        return False

    def _speaker_key(fpath):
        # Group recordings by the name of their containing directory.
        return fpath.parent.stem

    # groupby() only merges *consecutive* items with equal keys, so the paths
    # must be sorted by the grouping key first; otherwise one speaker could be
    # split into several groups and the dict comprehension below would
    # silently keep only the last group, dropping recordings.
    sorted_fpaths = sorted(wav_fpaths, key=_speaker_key)
    speaker_wavs = {
        speaker: [preprocess_wav(fpath) for fpath in fpaths]
        for speaker, fpaths in groupby(
            tqdm(sorted_fpaths, "Preprocessing wavs", len(sorted_fpaths),
                 unit="wavs"),
            _speaker_key)
    }

    # Embed the first and the second half of each speaker's recordings as two
    # independent speaker embeddings.
    spk_embeds_a = np.array([
        encoder.embed_speaker(wavs[:len(wavs) // 2])
        for wavs in speaker_wavs.values()
    ])
    spk_embeds_b = np.array([
        encoder.embed_speaker(wavs[len(wavs) // 2:])
        for wavs in speaker_wavs.values()
    ])

    # Inner product of L2-normed embeddings == cosine similarity.
    spk_sim_matrix = np.inner(spk_embeds_a, spk_embeds_b)
    print(spk_sim_matrix[0][0])
    return spk_sim_matrix[0][0] > threshold
# NOTE(review): the span below is a collapsed and *truncated* fragment of the
# resemblyzer cross-similarity demo. It begins mid-comment (`embeds_a` and
# `embeds_b` are computed earlier, outside this view) and the final
# plot_histograms(...) call is cut off mid-argument-list, so the code is left
# byte-identical here rather than reformatted.
# NOTE(review): `np.eye(..., dtype=np.bool)` uses the `np.bool` alias, which
# was deprecated in NumPy 1.20 and removed in 1.24 — on modern NumPy this
# should be plain `bool` (or `np.bool_`); confirm the pinned NumPy version.
# already L2-normed. # Short version: utt_sim_matrix = np.inner(embeds_a, embeds_b) # Long, detailed version: utt_sim_matrix2 = np.zeros((len(embeds_a), len(embeds_b))) for i in range(len(embeds_a)): for j in range(len(embeds_b)): # The @ notation is exactly equivalent to np.dot(embeds_a[i], embeds_b[i]) utt_sim_matrix2[i, j] = embeds_a[i] @ embeds_b[j] assert np.allclose(utt_sim_matrix, utt_sim_matrix2) ## Similarity between two speaker embeddings # Divide the utterances of each speaker in groups of identical size and embed each group as a # speaker embedding spk_embeds_a = np.array([encoder.embed_speaker(wavs[:len(wavs) // 2]) \ for wavs in speaker_wavs.values()]) spk_embeds_b = np.array([encoder.embed_speaker(wavs[len(wavs) // 2:]) \ for wavs in speaker_wavs.values()]) spk_sim_matrix = np.inner(spk_embeds_a, spk_embeds_b) ## Draw the plots fix, axs = plt.subplots(2, 2, figsize=(8, 10)) labels_a = ["%s-A" % i for i in speaker_wavs.keys()] labels_b = ["%s-B" % i for i in speaker_wavs.keys()] mask = np.eye(len(utt_sim_matrix), dtype=np.bool) plot_similarity_matrix(utt_sim_matrix, labels_a, labels_b, axs[0, 0], "Cross-similarity between utterances\n(speaker_id-utterance_group)") plot_histograms((utt_sim_matrix[mask], utt_sim_matrix[np.logical_not(mask)]), axs[0, 1], ["Same speaker", "Different speakers"],
# Evaluate the speaker similarity of two voice-conversion systems (NCE vs.
# CycleGAN) against ground-truth target recordings.
# NOTE(review): `encoder`, `read_wav`, `sampling_rate` and `target_dir` are
# defined earlier in the file (not visible in this fragment). `NCE_list` and
# `cyclegan_list` appear only in the commented-out loop below, so they must
# also be built elsewhere — otherwise the active code raises NameError;
# confirm against the rest of the file.

# Load every ground-truth target waveform.
target_list = []
for target_path in os.listdir(target_dir):
    target_path = os.path.join(target_dir, target_path)
    target_wav, _ = read_wav(target_path, sr=sampling_rate)
    target_list.append(target_wav)

# NCE_list, cyclegan_list, target_list = [], [] ,[]
# for NCE_path, cyclegan_path, target_path in zip(os.listdir(NCE_dir), os.listdir(cyclegan_dir), os.listdir(target_dir)):
# NCE_path = os.path.join(NCE_dir, NCE_path)
# cyclegan_path = os.path.join(cyclegan_dir, cyclegan_path)
# target_path = os.path.join(target_dir, target_path)
# NCE_wav, _ = read_wav(NCE_path, sr=sampling_rate)
# cyclegan_wav, _ = read_wav(cyclegan_path, sr=sampling_rate)
# target_wav, _ = read_wav(target_path, sr=sampling_rate)
# NCE_list.append(NCE_wav)
# cyclegan_list.append(cyclegan_wav)
# target_list.append(target_wav)
# length = min(len(NCE_list), len(cyclegan_list), len(target_list))

# Embed each utterance set as one speaker embedding, wrapped in a leading
# 1-element batch axis so np.inner below yields a 1x1 similarity matrix.
spk_embeds_NCE = np.array([encoder.embed_speaker(NCE_list)])
spk_embeds_cyclegan = np.array([encoder.embed_speaker(cyclegan_list)])
spk_embeds_target = np.array([encoder.embed_speaker(target_list)])

# Inner product of the (L2-normed) embeddings == cosine similarity of each
# system's converted speech to the target speaker; higher is better.
spk_sim_NCE = np.inner(spk_embeds_NCE, spk_embeds_target)
spk_sim_cyclegan = np.inner(spk_embeds_cyclegan, spk_embeds_target)
print('NCE:{} CycleGAN:{}'.format(spk_sim_NCE, spk_sim_cyclegan))
# End-to-end TTS pipeline: synthesize a test sentence in speaker "SAM"'s voice
# from a Tacotron synthesizer conditioned on a resemblyzer speaker embedding,
# then render the spectrogram to audio with the vocoder.
# NOTE(review): `Synthesizer`, `synth_path`, `vocoder`, `vocoder_path`,
# `VoiceEncoder`, `preprocess_wav` and `tqdm` come from earlier in the file;
# the actual wav save using `fpath` continues past this fragment.
synthesizer = Synthesizer(synth_path.joinpath("taco_pretrained"), low_mem=False)
vocoder.load_model(vocoder_path)
print("Loading encoder from resemblyzer")
encoder = VoiceEncoder()
# Get the reference audio repo path
speaker = 'SAM'
repo_fpath = Path('../SOURCE_AUDIO',speaker)
wav_fpaths = list(repo_fpath.glob(speaker+"*"))
# Debug output of the resolved reference-audio paths.
print('PAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATHS')
print(repo_fpath)
print(wav_fpaths)
# Preprocess every reference wav, then embed them jointly as one speaker
# embedding for the target voice.
wavs = np.array(list(map(preprocess_wav, tqdm(wav_fpaths, "Preprocessing wavs", len(wav_fpaths)))))
speaker_embedding = encoder.embed_speaker(wavs)
# Read the test sentence; presumably '&' is used as the delimiter so the
# whole line is loaded as a single field rather than split on whitespace —
# verify against the format of test_sentence.txt.
text = str(np.loadtxt('../test_sentence.txt', dtype='str', delimiter = '&'))
texts = [text]
embeds = [speaker_embedding]
# Synthesize a mel spectrogram conditioned on the speaker embedding.
specs = synthesizer.synthesize_spectrograms(texts, embeds)
spec = specs[0]
print("Created the mel spectrogram")
# Render the spectrogram to a waveform with the neural vocoder.
generated_wav = vocoder.infer_waveform(spec)
# Save it on the disk
fpath = "outputs/test_output_"+ speaker + ".wav"