Example #1
from itertools import groupby
from pathlib import Path

import numpy as np
from resemblyzer import VoiceEncoder, preprocess_wav
from tqdm import tqdm


def get_voice_equality(email):
    encoder = VoiceEncoder()

    # Collect every enrolled .ogg recording for this user
    wav_fpaths = list(
        Path(f"/home/mohammadali/PycharmProjects/AI_Auth/media/voices/{email}"
             ).glob("**/*.ogg"))

    # Group the recordings by their parent directory (one group per speaker)
    speaker_wavs = {
        speaker: list(map(preprocess_wav, wav_fpaths))
        for speaker, wav_fpaths in groupby(
            tqdm(
                wav_fpaths, "Preprocessing wavs", len(wav_fpaths),
                unit="wavs"), lambda wav_fpath: wav_fpath.parent.stem)
    }

    # Embed the first and second half of each speaker's utterances as two
    # separate speaker-level embeddings
    spk_embeds_a = np.array([
        encoder.embed_speaker(wavs[:len(wavs) // 2])
        for wavs in speaker_wavs.values()
    ])
    spk_embeds_b = np.array([
        encoder.embed_speaker(wavs[len(wavs) // 2:])
        for wavs in speaker_wavs.values()
    ])
    # Embeddings are L2-normed, so the inner product is the cosine similarity
    spk_sim_matrix = np.inner(spk_embeds_a, spk_embeds_b)

    print(spk_sim_matrix[0][0])
    return spk_sim_matrix[0][0] > 0.93
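A minimal usage sketch (the email address is a placeholder; it assumes clips were previously saved under media/voices/<email>):

# Hypothetical check: True when the two halves of the user's recordings
# embed to speaker vectors with cosine similarity above the 0.93 threshold.
if get_voice_equality("user@example.com"):
    print("Same voice")
else:
    print("Different voice")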
Example #2

## Similarity between two utterance embeddings
# The embeddings are already L2-normed, so their dot product equals the cosine similarity.
# Short version:
utt_sim_matrix = np.inner(embeds_a, embeds_b)
# Long, detailed version:
utt_sim_matrix2 = np.zeros((len(embeds_a), len(embeds_b)))
for i in range(len(embeds_a)):
    for j in range(len(embeds_b)):
        # The @ notation is exactly equivalent to np.dot(embeds_a[i], embeds_b[j])
        utt_sim_matrix2[i, j] = embeds_a[i] @ embeds_b[j]
assert np.allclose(utt_sim_matrix, utt_sim_matrix2)
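np.inner only equals cosine similarity because the embeddings are unit-length; a quick sanity check (a sketch, assuming embeds_a and embeds_b are the (N, 256) arrays produced by VoiceEncoder):

# Every resemblyzer embedding should already have unit L2 norm;
# if these assertions fail, normalize before taking inner products.
assert np.allclose(np.linalg.norm(embeds_a, axis=1), 1.0, atol=1e-5)
assert np.allclose(np.linalg.norm(embeds_b, axis=1), 1.0, atol=1e-5)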


## Similarity between two speaker embeddings
# Divide the utterances of each speaker in groups of identical size and embed each group as a
# speaker embedding
spk_embeds_a = np.array([encoder.embed_speaker(wavs[:len(wavs) // 2])
                         for wavs in speaker_wavs.values()])
spk_embeds_b = np.array([encoder.embed_speaker(wavs[len(wavs) // 2:])
                         for wavs in speaker_wavs.values()])
spk_sim_matrix = np.inner(spk_embeds_a, spk_embeds_b)


## Draw the plots
fig, axs = plt.subplots(2, 2, figsize=(8, 10))
labels_a = ["%s-A" % i for i in speaker_wavs.keys()]
labels_b = ["%s-B" % i for i in speaker_wavs.keys()]
mask = np.eye(len(utt_sim_matrix), dtype=bool)
plot_similarity_matrix(utt_sim_matrix, labels_a, labels_b, axs[0, 0],
                       "Cross-similarity between utterances\n(speaker_id-utterance_group)")
plot_histograms((utt_sim_matrix[mask], utt_sim_matrix[np.logical_not(mask)]), axs[0, 1],
                ["Same speaker", "Different speakers"])
Example #3

# Load the NCE- and CycleGAN-converted waveforms alongside the ground-truth
# targets. Listings are sorted so the three directories stay aligned in zip().
NCE_list, cyclegan_list, target_list = [], [], []
for NCE_path, cyclegan_path, target_path in zip(sorted(os.listdir(NCE_dir)),
                                                sorted(os.listdir(cyclegan_dir)),
                                                sorted(os.listdir(target_dir))):
    NCE_path = os.path.join(NCE_dir, NCE_path)
    cyclegan_path = os.path.join(cyclegan_dir, cyclegan_path)
    target_path = os.path.join(target_dir, target_path)

    NCE_wav, _ = read_wav(NCE_path, sr=sampling_rate)
    cyclegan_wav, _ = read_wav(cyclegan_path, sr=sampling_rate)
    target_wav, _ = read_wav(target_path, sr=sampling_rate)

    NCE_list.append(NCE_wav)
    cyclegan_list.append(cyclegan_wav)
    target_list.append(target_wav)
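read_wav is not defined in this snippet; a plausible stand-in built on librosa (an assumption, since the original helper is not shown):

import librosa

def read_wav(path, sr=16000):
    # Load the file as a mono float waveform resampled to `sr`
    wav, sr = librosa.load(path, sr=sr)
    return wav, sr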

# One speaker-level embedding per condition, each of shape (1, 256)
spk_embeds_NCE = np.array([encoder.embed_speaker(NCE_list)])
spk_embeds_cyclegan = np.array([encoder.embed_speaker(cyclegan_list)])
spk_embeds_target = np.array([encoder.embed_speaker(target_list)])

spk_sim_NCE = np.inner(spk_embeds_NCE, spk_embeds_target)
spk_sim_cyclegan = np.inner(spk_embeds_cyclegan, spk_embeds_target)

print('NCE:{} CycleGAN:{}'.format(spk_sim_NCE, spk_sim_cyclegan))
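Because each embedding array has shape (1, 256), np.inner returns 1x1 matrices; .item() extracts plain floats if scalar output is preferred:

print('NCE: %.3f CycleGAN: %.3f' % (spk_sim_NCE.item(), spk_sim_cyclegan.item()))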
Example #4
synthesizer = Synthesizer(synth_path.joinpath("taco_pretrained"), low_mem=False)
vocoder.load_model(vocoder_path)

print("Loading encoder from resemblyzer")
encoder = VoiceEncoder()

# Get the reference audio repo path
speaker = 'SAM'
repo_fpath = Path('../SOURCE_AUDIO', speaker)
wav_fpaths = list(repo_fpath.glob(speaker + "*"))
print("Reference paths:")
print(repo_fpath)
print(wav_fpaths)

# Preprocess the reference wavs and derive a single speaker embedding
wavs = np.array(list(map(preprocess_wav, tqdm(wav_fpaths, "Preprocessing wavs", len(wav_fpaths)))))
speaker_embedding = encoder.embed_speaker(wavs)

text = str(np.loadtxt('../test_sentence.txt', dtype='str', delimiter='&'))

# Synthesize a mel spectrogram conditioned on the speaker embedding
texts = [text]
embeds = [speaker_embedding]
specs = synthesizer.synthesize_spectrograms(texts, embeds)
spec = specs[0]
print("Created the mel spectrogram")

# Generate the waveform with the vocoder
generated_wav = vocoder.infer_waveform(spec)

# Save it on the disk
fpath = "outputs/test_output_" + speaker + ".wav"