def __getitem__(self, idx):
    """Return a tensor of mel spectrograms for the speaker at *idx*.

    Randomly samples up to ``self.utterance_number`` of the speaker's
    ``.WAV`` files and stacks their mel-dB features.
    # NOTE(review): assumes every sampled utterance yields the same
    # (frames, n_mels) shape so torch.Tensor can stack them — confirm.
    """
    speaker_dir = self.speakers[idx]
    candidates = glob.glob(speaker_dir + '/*.WAV')
    # Random sample without replacement: shuffle, then take a prefix.
    shuffle(candidates)
    chosen = candidates[:self.utterance_number]
    # mfccs_and_spec returns a 3-tuple; the mel-dB features are element 1.
    mel_spectrograms = [
        mfccs_and_spec(path, wav_process=True)[1] for path in chosen
    ]
    return torch.Tensor(mel_spectrograms)
# Enrollment: compute one 256-dim embedding per enrollment utterance for each
# speaker and save a {speaker_id: (num_wavs, 256) ndarray} dict to disk.
embedder_net = SpeechEmbedder()
embedder_net.load_state_dict(torch.load(model_path))
# Fix: run in eval mode so dropout/batch-norm behave deterministically
# during embedding extraction.
embedder_net.eval()

# Pinyin-romanized speaker IDs; original names:
# 付立, 刘子涵, 操镭, 李思琪, 李潇潇, 王润宇, 罗明双, 资礼波, 赵晴
speakers = ['fuli', 'lzh', 'cl', 'lsq', 'lxx', 'wry', 'lms', 'zlb', 'zq']
enroll_wav_path = 'data_eng'
dict_spkid_embeddings = {}
enroll_nums = 5  # NOTE(review): unused below — presumably meant to cap wavs per speaker; confirm

for speaker in speakers:
    speaker_wavs = glob.glob(os.path.join(enroll_wav_path, speaker, '*.wav'))
    length = len(speaker_wavs)
    print(speaker, speaker_wavs)
    # One embedding row per enrollment utterance.
    speaker_embeddings = np.zeros((length, 256), dtype=float)
    for i in range(length):
        _, mel_db, _ = mfccs_and_spec(speaker_wavs[i], wav_process=True)
        # Add a leading batch dimension: (1, frames, n_mels).
        # (The original identity transpose with axes=(0, 1, 2) was a no-op
        # and has been removed.)
        inputs = torch.FloatTensor(np.asarray(mel_db)).unsqueeze(0)
        # Fix: no autograd graph is needed for enrollment inference.
        with torch.no_grad():
            embedding = embedder_net(inputs)
        speaker_embeddings[i] = embedding.cpu().numpy()
    print(speaker_embeddings.shape)
    dict_spkid_embeddings[speaker] = speaker_embeddings

np.save('enroll_9_.npy', dict_spkid_embeddings)
"""Extract a speaker embedding for every .wav under ``data_change`` and save
it next to the wav as a .npy file with the same basename."""
import glob
import os  # fix: os.path.join was used below but os was never imported (NameError)
import time

import numpy as np
import torch
from torch.autograd import Variable
from sklearn.metrics.pairwise import cosine_similarity

from model import SpeechEmbedder
from utils import mfccs_and_spec

model_path = 'weights/epoch_3450_iteration_1487381_EER_0.05986112356185913.pth'
#data = './test_tisv/speaker2.npy'
data = 'data_change'

if __name__ == '__main__':
    print('Load model successfully.')
    embedder_net = SpeechEmbedder()
    embedder_net.load_state_dict(torch.load(model_path))
    # Fix: eval mode for deterministic inference (dropout/batch-norm).
    embedder_net.eval()

    # Expect layout: data_change/<speaker>/<utterance>.wav
    inputs_wavs = glob.glob(os.path.join(data, '*', '*.wav'))
    for wav in inputs_wavs:
        _, mel_db, _ = mfccs_and_spec(wav, wav_process=True)
        inputs = torch.Tensor(mel_db)
        print(inputs.shape)
        inputs = inputs.unsqueeze(0)  # add batch dim: (1, frames, n_mels)
        # Fix: no autograd needed; the original identity transpose with
        # axes=(0, 1, 2) was a no-op and has been removed.
        with torch.no_grad():
            outputs = embedder_net(inputs)
        embedding = outputs.detach().numpy()
        # Save the embedding alongside the wav, swapping '.wav' for '.npy'.
        output_path = wav[:-4] + '.npy'
        np.save(output_path, embedding)
total = 0 score = -10 name = 'Who?' #speakers = ['付立', '刘子涵', '操镭', '李思琪', '李潇潇', '王润宇', '罗明双', '资礼波', '赵晴'] speakers = ['fuli', 'lzh', 'cl', 'lsq', 'lxx', 'wry', 'lms', 'zlb', 'zq'] data = 'media' for speaker in speakers: test_wavs = glob.glob(os.path.join(data, '{}_*.wav'.format(speaker))) #print(test_wavs) x = 0 y = 0 for test_wav in test_wavs: scores = {} test_mel = [] _, mel_db_test, _ = mfccs_and_spec(test_wav, wav_process=True) test_mel.append(mel_db_test) test_inputs = torch.Tensor(test_mel) # print(test_inputs.size()) test_inputs = torch.FloatTensor( np.transpose(test_inputs, axes=(0, 1, 2))) #print(test_inputs.size()) test_output = embedder_net(test_inputs).cpu() for speaker_name in enroll_centroid_embeddings.keys(): #print(enroll_centroid_embeddings[speaker_name].shape) speaker_centroid_enroll = get_centroid( enroll_centroid_embeddings[speaker_name], len(enroll_centroid_embeddings[speaker_name])) #print(speaker_centroid_enroll.shape) score_speaker = cosine_similarity( speaker_centroid_enroll.reshape(1, -1),