def convert_two(model, uttr_org, uttr_trg):
    spect_vc = []
    # spect_vc.append(("uttr_org", uttr_org))
    # spect_vc.append(("uttr_trg", uttr_trg))
    uttr_trg, _ = pad_seq(uttr_trg, 32)
    uttr_org, _ = pad_seq(uttr_org, 32)
    # Speaker embeddings are computed from two hard-coded demo utterances
    trg_enc = preprocess_wav("./demo/data/22618_01.wav")
    trg_enc = encoder.embed_utterance(trg_enc)
    org_enc = preprocess_wav("./demo/data/22617_01.wav")
    org_enc = encoder.embed_utterance(org_enc)
    uttr_trg = torch.FloatTensor(uttr_trg).to(device).double().unsqueeze(0)
    uttr_org = torch.FloatTensor(uttr_org).to(device).double().unsqueeze(0)
    org_enc = torch.FloatTensor(org_enc).to(device).double().unsqueeze(0)
    trg_enc = torch.FloatTensor(trg_enc).to(device).double().unsqueeze(0)
    with torch.no_grad():
        _, x_identic_psnt, _ = model(uttr_org, org_enc, trg_enc)
    res = x_identic_psnt[0, 0, :, :].cpu().numpy()
    spect_vc.append(('fin_conversion', res))
    return spect_vc
def vectorExtract(audio1_path, audio2_path):
    wav1 = preprocess_wav(Path(audio1_path))
    wav2 = preprocess_wav(Path(audio2_path))
    encoder = VoiceEncoder()
    embed1 = encoder.embed_utterance(wav1)
    embed2 = encoder.embed_utterance(wav2)
    return numpy.concatenate([embed1, embed2])
def simVoice(audio1_path, audio2_path):
    wav1 = preprocess_wav(Path(audio1_path))
    wav2 = preprocess_wav(Path(audio2_path))
    encoder = VoiceEncoder()
    embed1 = encoder.embed_utterance(wav1)
    embed2 = encoder.embed_utterance(wav2)
    return dot(embed1, embed2) / (norm(embed1) * norm(embed2))  # np.inner(embed1, embed2)
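# A minimal usage sketch for simVoice above (the file names are
# hypothetical, not from the original source). Scores close to 1.0
# suggest the same speaker; clearly different speakers typically fall
# well below ~0.75 with Resemblyzer embeddings.
def demo_simVoice():
    score = simVoice("speaker_a.wav", "speaker_b.wav")
    print(f"cosine similarity: {score:.3f}")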
def get_metrics(user1, user2, inferencer):
    def read_metadata(user):
        # Map each transcript to its wav file path for one user
        phrases = {}
        with open(f'{ROOT_PATH}/{user}/metadata.csv', 'r') as ttf:
            for line in ttf:
                cols = line.split('|')
                wav_file = os.path.join(ROOT_PATH, user, cols[0])
                if not wav_file.endswith('.wav'):
                    wav_file += '.wav'
                text = cols[1].strip().strip(',')
                phrases[text] = wav_file
        return phrases

    phrases1 = read_metadata(user1)
    phrases2 = read_metadata(user2)
    # Sample 10 phrases that both users have recorded
    inter = list(set(phrases1.keys()).intersection(phrases2.keys()))
    inter = np.random.choice(inter, 10)
    wavs_1, wavs_2 = [], []
    generated_1, generated_2 = [], []
    for phr in inter:
        wavs_1.append(res.preprocess_wav(phrases1[phr]))
        wavs_2.append(res.preprocess_wav(phrases2[phr]))
        generated_1.append(inferencer.get_json_output(user1, phr, OUT_PATH, save_wavs=False))
        generated_2.append(inferencer.get_json_output(user2, phr, OUT_PATH, save_wavs=False))
    generated_1 = list(map(res.preprocess_wav, generated_1))
    generated_2 = list(map(res.preprocess_wav, generated_2))
    o_embed_1 = encoder.embed_speaker(wavs_1)
    o_embed_2 = encoder.embed_speaker(wavs_2)
    g_embed_1 = encoder.embed_speaker(generated_1)
    g_embed_2 = encoder.embed_speaker(generated_2)
    # Similarity and distance between the two real speakers, then between
    # each speaker's real and generated embeddings
    return (np.inner(o_embed_1, o_embed_2), np.linalg.norm(o_embed_1 - o_embed_2),
            np.inner(g_embed_1, o_embed_1), np.linalg.norm(o_embed_1 - g_embed_1),
            np.inner(g_embed_2, o_embed_2), np.linalg.norm(o_embed_2 - g_embed_2))
def load_data(from_path=None, ckpt_path=None, data_path=None, save_path=None):
    if from_path is None:
        if ckpt_path is None:
            raise Exception('No checkpoint path provided')
        from resemblyzer import preprocess_wav, VoiceEncoder
        from tqdm import tqdm

        device = torch.device('cuda')
        encoder = VoiceEncoder(device=device, loss_device=device)
        encoder.load_ckpt(ckpt_path, device=device)
        encoder.eval()
        wav_fpaths = list(Path(data_path).glob("**/*.flac"))
        # Preprocess and save encoded utterance and label to list
        X = []
        y = []
        for wav_fpath in tqdm(wav_fpaths):
            wav = preprocess_wav(wav_fpath)
            X.append(encoder.embed_utterance(wav).cpu().numpy())
            y.append(wav_fpath.parent.parent.stem)
        # Save for testing
        if save_path is not None:
            np.save(Path(save_path, 'embeds.npy'), X)
            np.save(Path(save_path, 'labels.npy'), y)
        else:
            raise Exception('No save_path provided')
    else:
        X = np.load(Path(from_path, 'embeds.npy'), allow_pickle=True)
        y = np.load(Path(from_path, 'labels.npy'), allow_pickle=True)
    return X, y
def make_one_dataset(filename, parameters, total, display=False):
    global finish
    sub_filename = filename.strip().split('/')[-1]
    # format: p{speaker}_{sid}.wav
    speaker_id, utt_id = re.match(r'p(\d+)_(\d+)\.wav', sub_filename).groups()
    # mel_spec, lin_spec = get_spectrograms(filename)
    mel_spec, lin_spec, mfcc, f0, audio = wav2spectrogram(filename, parameters, display=display)
    wav = preprocess_wav(Path(filename))
    d_mel = d_wav2spec(wav)
    print('[Processor] - processing {}/{} s{}-{} | d_mel: {} | mel:{} | lin:{} | audio:{}'
          .format(finish * WORKERS, total, speaker_id, utt_id,
                  d_mel.shape, mel_spec.shape, lin_spec.shape, audio.shape),
          end='\r')
    result = {
        'speaker_id': speaker_id,
        'utt_id': utt_id,
        'd_mel_spec': d_mel,
        'mel_spec': mel_spec,
        'lin_spec': lin_spec,
        'mfcc': mfcc,
        'f0': f0,
        'audio': audio,
    }
    finish += 1
    return result
def voice_com(self, v1):
    '''
    v1: wav file path
    Returns True if the speaker is in the database.
    '''
    if len(self.database.values()) == 0:
        print("Your data not in our database.")
        return False
    wav = preprocess_wav(Path(v1))
    # ## method 1
    # embed1 = self.encoder.embed_speaker(wav1)
    # embed2 = self.encoder.embed_speaker(wav2)
    # sims1 = np.inner(embed1, embed2)  # bigger 0.85
    ## method 2
    embed = self.encoder.embed_utterance(wav)
    for dk in self.database.keys():
        # Embeddings are unit-norm, so the dot product is the cosine similarity
        sims = embed @ self.database[dk]  # bigger 0.75
        if sims > 0.75:
            print("welcome {}!".format(dk))
            return True
    print("Your data not in our database.")
    return False
def get_speaker_similarity_dict_and_wav_splits(file_name):
    print('Processing voices for file:', file_name)
    fpath = os.fspath(file_name)
    wav = preprocess_wav(fpath_or_wav=fpath)
    speaker_names = ['Phreak', 'Other']
    segments = [[0, 25], [75, 90]]
    encoder = VoiceEncoder('cpu')
    speaker_wavs = [wav[int(s[0] * sampling_rate):int(s[1] * sampling_rate)]
                    for s in segments]
    print("Running the continuous embedding on cpu, this might take a while...")
    _, cont_embeds, wav_splits = encoder.embed_utterance(wav, return_partials=True, rate=16)
    speaker_embeds = [encoder.embed_utterance(speaker_wav) for speaker_wav in speaker_wavs]
    similarity_dict = {name: cont_embeds @ speaker_embed
                       for name, speaker_embed in zip(speaker_names, speaker_embeds)}
    return similarity_dict, wav_splits
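# A possible follow-up step (not part of the original source): collapse
# the per-speaker similarity curves returned above into one label per
# partial embedding. The 0.75 threshold is an assumption; tune it per
# dataset.
def label_segments(similarity_dict, wav_splits, threshold=0.75):
    labels = []
    for i in range(len(wav_splits)):
        # Pick the speaker with the highest similarity at this split
        name, sim = max(((n, sims[i]) for n, sims in similarity_dict.items()),
                        key=lambda pair: pair[1])
        labels.append(name if sim > threshold else "unknown")
    return labels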
def load_speaker_embeds(args):
    encoder = VoiceEncoder()
    speakers_dir = '{0}/{1}/{2}/'.format(args.media, args.name, args.speakers)
    speaker_embeds_list = []
    if os.path.exists(speakers_dir):
        speakers_dir_subfolders = [f.path for f in os.scandir(speakers_dir) if f.is_dir()]
        for speakers_dir_subfolder in speakers_dir_subfolders:
            speaker_embeds = []
            for wav_file in glob.glob("{}/*.wav".format(speakers_dir_subfolder)):
                wav = AudioSegment.from_wav(wav_file)
                librosa_npy = audiosegment_to_librosawav(wav)
                librosa_wav = preprocess_wav(librosa_npy)
                current_embed = encoder.embed_utterance(librosa_wav)
                speaker_embeds.append(current_embed)
            if len(speaker_embeds) > 0:
                dirname = os.path.basename(speakers_dir_subfolder)
                speaker_embeds_list.append((dirname, speaker_embeds))
    return speaker_embeds_list
def speaker_diarization(**kwargs):
    if 'wav' in kwargs:
        wav = kwargs['wav']
    elif 'filepath' in kwargs:
        wav = preprocess_wav(kwargs['filepath'])
    else:
        raise ValueError("Expected either a 'wav' or a 'filepath' keyword argument")
    avg_embed, cont_embeds, wav_splits = VoiceEncoder().embed_utterance(
        wav, return_partials=True, rate=16)
    fig, ax = plt.subplots(figsize=(6, 6))
    wav_seconds = len(wav) / hparams.sampling_rate
    timesteps = np.arange(0, wav_seconds, wav_seconds / len(cont_embeds))
    if 'speaker_embed' in kwargs:
        # compare utterance embeddings with speaker embeddings
        similarity = cont_embeds @ kwargs['speaker_embed']
        dummy_similarity = cont_embeds @ create_dummy_speaker()
        ax.plot(timesteps, similarity, 'g')
        ax.plot(timesteps, dummy_similarity, 'k--')
    else:
        # cluster utterance embeddings using Spectral Clustering
        spectral = SpectralClustering(n_clusters=2).fit_predict(cont_embeds)
        ax.plot(timesteps, spectral)
    plt.show()
def several_speakers_identification(self, path, min_duration=3, return_splits=False,
                                    export=False, recognition=False, language='en-En'):
    self.min_duration = min_duration
    self.path = path
    wav = preprocess_wav(path)
    sf.write(self.wav, wav, 16000, subtype='PCM_24')
    encoder = VoiceEncoder()
    _, embed, slices = encoder.embed_utterance(wav, return_partials=True, rate=1)
    np.set_printoptions(suppress=True)
    for i in range(len(embed)):
        self.add_speaker(embed[i])
    # for i in range(len(self.timing)):
    #     print(i, self.timing[i])
    self.clear()
    print('Found %d speakers' % self.speakers_number)
    for i in range(self.speakers_number):
        print('Speaker ' + str(i) + ': ' + str(len(self.speakers[i])) + 's')
    self.splits = self.get_splits()
    if recognition or export:
        paths = ExportAudio.export(self.splits, self.wav)
        if recognition:
            self.recognize_audio(language, paths, export)
    if return_splits:
        return self.speakers_number, self.splits
    return self.speakers_number
def fingerprint_from_file(filepath, segment=None, sampling_rate=16000):
    fpath = Path(filepath)
    wav = preprocess_wav(fpath)
    if segment:
        # segment is a (start, end) pair in seconds
        wav = wav[int(segment[0] * sampling_rate):int(segment[1] * sampling_rate)]
    return VoiceEncoder().embed_utterance(wav)
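# Usage sketch for fingerprint_from_file (file names are hypothetical).
# Resemblyzer embeddings are L2-normalized, so a plain dot product is
# already the cosine similarity.
def demo_fingerprint():
    ref = fingerprint_from_file("reference.wav")
    probe = fingerprint_from_file("probe.wav", segment=(10, 20))
    print("similarity:", ref @ probe)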
def isolate_voice(audio_file_path: Path, embed_path: Path, params_path: Path, output_path: Path):
    """
    Load speaker embeds from pickle and take a voice out only if its
    similarity is above the cutoff threshold; when both speakers clear
    the threshold, the greater one wins.

    Args:
        audio_file_path: input complete wav file path from which Rick's voice will be taken out
        embed_path: pickle file containing the speaker embeddings
        params_path: parameter file with `cutoff_threshold` and `wav_bitrate`
        output_path: directory for the isolated rick.wav / morty.wav files
    """
    params = load_params(params_path)
    cutoff_threshold = params["cutoff_threshold"]
    sampling_rate = params["wav_bitrate"]
    print("preprocessing")
    file_wav = preprocess_wav(audio_file_path)
    print("input file shape ", file_wav.shape, "\n", file_wav[:10])
    print("file preprocessed")
    encoder = VoiceEncoder("cpu")
    print("model loaded")
    speaker_names = ["Rick", "Morty"]
    # rate=1 yields one partial embedding per second of audio
    _, file_embeds, wav_splits = encoder.embed_utterance(file_wav, return_partials=True, rate=1)
    print("file encoded")
    speaker_embeds = pickle.load(open(embed_path, "rb"))
    similarity_dict = {name: file_embeds @ speaker_embed
                       for name, speaker_embed in zip(speaker_names, speaker_embeds)}
    print("similarity dict is\n", similarity_dict)
    pickle.dump(similarity_dict, open("./similarity.pkl", "wb"))
    # For each second, append it to whichever speaker scores higher,
    # provided that score clears the cutoff threshold
    current_second = 0
    rick_wav, rick_seconds = [], []
    morty_wav, morty_seconds = [], []
    for rick_value, morty_value in zip(similarity_dict["Rick"], similarity_dict["Morty"]):
        print(current_second, rick_value, morty_value)
        if rick_value > morty_value and rick_value > cutoff_threshold:
            rick_wav.append(file_wav[current_second * sampling_rate:(current_second + 1) * sampling_rate])
            rick_seconds.append(current_second)
            print("append rick")
        elif morty_value > rick_value and morty_value > cutoff_threshold:
            morty_wav.append(file_wav[current_second * sampling_rate:(current_second + 1) * sampling_rate])
            morty_seconds.append(current_second)
            print("append morty")
        else:
            print("skipping")
        current_second += 1
    # Flatten the per-second chunks into single waveforms
    rick_wav = [item for sublist in rick_wav for item in sublist]
    morty_wav = [item for sublist in morty_wav for item in sublist]
    save_wav(np.array(rick_wav), output_path.joinpath("rick.wav"), sampling_rate)
    save_wav(np.array(morty_wav), output_path.joinpath("morty.wav"), sampling_rate)
    return rick_seconds, morty_seconds
def preprocess(self, f):
    """
    Applies preprocessing operations to a waveform, either on disk or in
    memory; the waveform is resampled to match the data hyperparameters.

    :param f: either a filepath to an audio file or the waveform as a numpy array of floats.
    """
    return preprocess_wav(f)
def process(wav_fpath):
    wav = preprocess_wav(wav_fpath)
    encoder = VoiceEncoder("cpu")
    _, cont_embeds, wav_splits = encoder.embed_utterance(wav, return_partials=True, rate=16)
    # Output denoised wave after removing the pauses
    write('DenoisedInputFiles/DenoisedSignal.wav', 16000, wav)
    return cont_embeds, wav_splits
def oneDictorIdentification(self, cSample, mainFile):
    print('[-i] --> Identify Dictor')

    def mean_nonzero_component(path):
        # Average of the non-zero embedding components for one file
        wav = preprocess_wav(Path(path))
        encoder = VoiceEncoder()
        embed = encoder.embed_utterance(wav)
        np.set_printoptions(precision=3, suppress=True)
        nonzero = [i for i in embed if i != 0.0]
        return sum(nonzero) / len(nonzero)

    avg1 = mean_nonzero_component(cSample)
    avg2 = mean_nonzero_component(mainFile)
    self.result = abs(avg2 - avg1)
    print(self.result)
    if self.result < 0.002:
        print("Match!")
        return 1
    else:
        print("These are different voices")
        return 0
def process(wav_fpath):
    wav = preprocess_wav(wav_fpath)
    encoder = VoiceEncoder()
    _, cont_embeds, wav_splits = encoder.embed_utterance(wav, return_partials=True, rate=16)
    # Output denoised wave after removing the pauses
    write('Denoise/Denoise_commercial_mono.wav', 16000, wav)
    return cont_embeds, wav_splits
def calculate_score(model, data_dir, output_dir, target_dir, threshold_path, **kwargs):
    """Calculate the speaker verification accept rate for converted utterances."""
    data_dir = Path(data_dir)
    target_dir = Path(target_dir)
    if output_dir is None:
        output_dir = data_dir
    else:
        output_dir = Path(output_dir)
        output_dir.parent.mkdir(parents=True, exist_ok=True)
    output_path = Path(output_dir) / "evaluation_score.txt"
    metadata_path = data_dir / "metadata.json"
    metadata = json.load(metadata_path.open())
    thresholds = yaml.load(Path(threshold_path).open(), Loader=yaml.SafeLoader)
    threshold = thresholds[metadata["target_corpus"]]
    n_accept = 0
    for pair in tqdm(metadata["pairs"]):
        wav = preprocess_wav(data_dir / pair["converted"])
        source_emb = model.embed_utterance(wav)
        targets = [target_dir / tgt_utt for tgt_utt in pair["tgt_utts"]]
        target_emb = model.embed_speaker([preprocess_wav(target) for target in targets])
        cosine_similarity = (np.inner(source_emb, target_emb)
                             / np.linalg.norm(source_emb)
                             / np.linalg.norm(target_emb))
        if cosine_similarity > threshold:
            n_accept += 1
    svar = n_accept / len(metadata["pairs"])
    print(f"[INFO]: Speaker verification accept rate: {svar}")
    print(f"Speaker verification accept rate: {svar}", file=output_path.open("a"))
def encoder(file_paths, vocoder):
    print('Number of files in batch: {}'.format(len(file_paths)))
    # processed_wavs = Parallel(n_jobs=-1)(delayed(preprocess_wav)(i) for i in tqdm(file_paths))
    processed_wavs = [preprocess_wav(i) for i in tqdm(file_paths)]
    # encodings = Parallel(n_jobs=-1)(delayed(vocoder.embed_utterance)(i) for i in tqdm(processed_wavs))
    encodings = [vocoder.embed_utterance(i) for i in tqdm(processed_wavs)]
    print('Creating embeddings')
    return np.array(encodings)
def compute_embed(files, encoder):
    # Average the utterance embeddings of up to 20 randomly sampled files
    emb = []
    files = random.sample(files, min(len(files), 20))
    for f in files:
        wav = preprocess_wav(f)
        e = encoder.embed_utterance(wav)
        emb.append(e)
    return np.array(emb).mean(axis=0)
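# Hypothetical enrollment loop built on compute_embed above: one averaged
# embedding per speaker, assuming a root directory containing one
# subfolder of wav files per speaker (the layout is an assumption, not
# taken from the source).
def enroll_speakers(root_dir, encoder):
    profiles = {}
    for speaker_dir in Path(root_dir).iterdir():
        if speaker_dir.is_dir():
            files = list(speaker_dir.glob("*.wav"))
            if files:
                profiles[speaker_dir.name] = compute_embed(files, encoder)
    return profiles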
def generate_voice_profile(self, data_path):
    # Average the embeddings of every file in data_path into one centroid
    embeds = []
    original_dir = os.getcwd()
    os.chdir(data_path)
    for file in os.listdir('.'):
        fpath = Path(os.getcwd()) / file
        wav = preprocess_wav(fpath)
        embed = self.encoder.embed_utterance(wav)
        embeds.append(embed)
    centroid = np.array(embeds).mean(axis=0)
    os.chdir(original_dir)  # restore the previous working directory
    return centroid
def embed_speaker_librispeech(speaker_path: Path, hp: Map):
    """
    Create an embedding of a speaker directory using `resemblyzer`.

    :param speaker_path: path to speaker directory
    :param hp: hyperparameters object
    :return: speaker embedding
    """
    flacs = list(speaker_path.rglob("*.flac"))
    flacs = random.sample(flacs, hp.n_samples)
    flacs = [resemblyzer.preprocess_wav(flac) for flac in flacs]
    return voice_encoder.embed_speaker(flacs)
def add_data(self, v, name):
    '''
    v: wav file path
    name: str
    '''
    if name in self.database:
        print("person exist")
        return False
    wav = preprocess_wav(Path(v))
    self.database[name] = self.encoder.embed_utterance(wav)
    return True
def main():
    file_var1 = request.files["audio1"]
    file_var2 = request.files["audio2"]
    print(file_var1)
    print(file_var2)
    file_extension1 = file_var1.filename.split(".")[-1]
    file_extension2 = file_var2.filename.split(".")[-1]
    # Reject the request if either upload is not mp3/wav
    if file_extension1 not in ["mp3", "wav"] or file_extension2 not in ["mp3", "wav"]:
        return {"Error": "Formato inválido. Use mp3 o wav."}  # "Invalid format. Use mp3 or wav."
    file_name1 = "voice1.{}".format(file_extension1)
    file_name2 = "voice2.{}".format(file_extension2)
    wav_fpath1 = os.path.join(app.config['UPLOAD_FOLDER'], file_name1)
    wav_fpath2 = os.path.join(app.config['UPLOAD_FOLDER'], file_name2)
    file_var1.save(wav_fpath1)
    file_var2.save(wav_fpath2)
    del file_var1
    del file_var2
    wav1 = preprocess_wav(wav_fpath1)
    wav2 = preprocess_wav(wav_fpath2)
    os.remove(wav_fpath1)
    os.remove(wav_fpath2)
    speaker_wavs = [wav1, wav2]
    speaker_embeds = [encoder.embed_utterance(speaker_wav) for speaker_wav in speaker_wavs]
    result = speaker_embeds[0] @ speaker_embeds[1]
    return {"Exito": np.float64(result)}  # "Exito" = "Success"
def augment_data():
    noise_names = []
    for noise_path in os.listdir('./noise'):
        noise_name = noise_path.split('.')[0]
        noise = f'./noise/{noise_path}'
        for username in os.listdir('./data'):
            data_encode = []
            for user_file in os.listdir(f'./data/{username}'):
                y, sr = librosa.load(f'./data/{username}/{user_file}', sr=16000)
                encoded_data = resemblyzer.preprocess_wav(f'./data/{username}/{user_file}')
                data_encode.append(encoded_data)
                # With range(1) only choice == 0 ever runs, i.e. plain
                # background mixing; raise the bound to enable the pitch
                # and speed variants
                for i in range(1):
                    choice = i
                    print(choice)
                    if choice == 1:
                        aug = pitch(mix_bg(y, noise), sr, 0.2)
                    elif choice == 2:
                        aug = speed(mix_bg(y, noise), 1.2)
                    else:
                        aug = mix_bg(y, noise)
                    if not os.path.exists(f'./augmented_data/{username}'):
                        os.mkdir(f'./augmented_data/{username}')
                    librosa.output.write_wav(
                        f'./augmented_data/{username}/{noise_name}_{i}.wav', aug, sr)
                    encoded_data = resemblyzer.preprocess_wav(
                        f'./augmented_data/{username}/{noise_name}_{i}.wav')
                    data_encode.append(encoded_data)
            with open(f'data/{username}_encoded_wav.pickle', 'wb') as handle:
                pickle.dump(data_encode, handle, protocol=pickle.HIGHEST_PROTOCOL)
def extract(data_dirs, output_dir):
    """Extract embedding by resemblyzer."""
    encoder = VoiceEncoder()
    data = {}
    for data_dir in tqdm(data_dirs, position=0):
        file_list = librosa.util.find_files(data_dir)
        for file_path in tqdm(file_list, position=1, leave=False):
            wav = preprocess_wav(file_path)
            embedding = encoder.embed_utterance(wav)
            wav_name = splitext(basename(file_path))[0]
            data[wav_name] = embedding
    joblib.dump(data, f"{output_dir}.pkl")
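# Companion sketch for extract above (not from the original source):
# reload the dumped embeddings and compare two utterances by name. Keys
# follow the wav_name convention used in extract.
def load_and_compare(pkl_path, name_a, name_b):
    data = joblib.load(pkl_path)
    return float(data[name_a] @ data[name_b])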
def voiceRun(frames):
    p = pyaudio.PyAudio()
    wf = wave.open('check.wav', 'wb')
    wf.setnchannels(1)
    wf.setsampwidth(p.get_sample_size(pyaudio.paInt16))
    wf.setframerate(16000)
    wf.writeframes(b''.join(frames))
    wf.close()
    wav = preprocess_wav('check.wav')
    embed = encoder.embed_utterance(wav)
    embed = np.array(embed).reshape(-1, 1, 256)
    res1 = voiceModel.predict(embed)
    res1 = res1.flatten()
    return res1.tolist()
def get_name_id(args, encoder, speaker_embeds_list, audio_segment):
    segment_npy = audiosegment_to_librosawav(audio_segment)
    segment_wav = preprocess_wav(segment_npy)
    current_embed = encoder.embed_utterance(segment_wav)
    # min_similarity starts at the acceptance floor and then tracks the
    # best match seen so far
    min_similarity = args.min_similarity
    name_id = ''
    for speaker_id, speaker_embeds in speaker_embeds_list:
        for speaker_embed in speaker_embeds:
            similarity = current_embed @ speaker_embed
            if similarity > min_similarity:
                min_similarity = similarity
                name_id = speaker_id
    return name_id
def get_spk_emb(audio_file_dir, segment_len=960000):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    resemblyzer_encoder = VoiceEncoder(device=device)
    wav = preprocess_wav(audio_file_dir)
    l = len(wav) // segment_len  # segment_len = 16000 * 60
    l = np.max([1, l])
    all_embeds = []
    for i in range(l):
        mean_embeds, cont_embeds, wav_splits = resemblyzer_encoder.embed_utterance(
            wav[segment_len * i:segment_len * (i + 1)], return_partials=True, rate=2)
        all_embeds.append(mean_embeds)
    all_embeds = np.array(all_embeds)
    mean_embed = np.mean(all_embeds, axis=0)
    return mean_embed, all_embeds
def get_d_vector(self, record, sample_rate):
    """Get d-vector feature from an audio record.

    :param record: Record object to get the feature from.
    :type record: object
    :param sample_rate: Sample rate of the audio record.
    :type sample_rate: int
    :return: D-vector feature vector
    """
    wav = preprocess_wav(record)
    embed = self.encoder.embed_utterance(wav)
    np.set_printoptions(precision=3, suppress=True)
    return embed