コード例 #1
0
    def __getitem__(self, idx):
        """Return a tensor of mel spectrograms for the speaker at *idx*.

        Randomly samples up to ``self.utterance_number`` .WAV files from
        the speaker's directory and stacks one mel spectrogram per file.
        """
        speaker_dir = self.speakers[idx]
        candidates = glob.glob(speaker_dir + '/*.WAV')
        shuffle(candidates)
        selected = candidates[:self.utterance_number]

        # mfccs_and_spec returns (mfcc, mel_db, spec); keep only mel_db.
        mels = [mfccs_and_spec(path, wav_process=True)[1] for path in selected]
        return torch.Tensor(mels)
コード例 #2
0
# Load the trained speaker-embedding network and switch it to inference
# mode (disables dropout / batch-norm running-stat updates).
embedder_net = SpeechEmbedder()
embedder_net.load_state_dict(torch.load(model_path))
embedder_net.eval()

# Romanized speaker IDs; each has a subdirectory of enrollment WAVs.
speakers = ['fuli', 'lzh', 'cl', 'lsq', 'lxx', 'wry', 'lms', 'zlb', 'zq']
enroll_wav_path = 'data_eng'
dict_spkid_embeddings = {}

enroll_nums = 5  # NOTE(review): assigned but never used below — confirm intent
for speaker in speakers:
    speaker_wavs = glob.glob(os.path.join(enroll_wav_path, speaker, '*.wav'))
    print(speaker, speaker_wavs)
    # One 256-dim embedding (d-vector) per enrollment utterance.
    speaker_embeddings = np.zeros((len(speaker_wavs), 256), dtype=float)
    for i, wav in enumerate(speaker_wavs):
        _, mel_db, _ = mfccs_and_spec(wav, wav_process=True)
        # (frames, n_mels) -> (1, frames, n_mels): add the batch dimension.
        # (The original np.transpose with axes=(0,1,2) was a no-op and is dropped.)
        inputs = torch.FloatTensor(mel_db).unsqueeze(0)
        with torch.no_grad():  # inference only — no autograd bookkeeping
            embedding = embedder_net(inputs)
        speaker_embeddings[i] = embedding.cpu().numpy()
    print(speaker_embeddings.shape)
    dict_spkid_embeddings[speaker] = speaker_embeddings

# A dict of arrays is saved via pickling (np.save's allow_pickle default).
np.save('enroll_9_.npy', dict_spkid_embeddings)
        
コード例 #3
0
# Standard library
import glob
import os  # fix: os.path.join is used below but os was never imported
import time

# Third-party
import numpy as np
import torch
from sklearn.metrics.pairwise import cosine_similarity
from torch.autograd import Variable

# Project-local
from model import SpeechEmbedder
from utils import mfccs_and_spec

# Trained checkpoint (the EER at training time is recorded in the filename)
# and the directory of WAVs to embed.
model_path = 'weights/epoch_3450_iteration_1487381_EER_0.05986112356185913.pth'
data = 'data_change'

if __name__ == '__main__':
    # Restore the trained embedder and put it in inference mode.
    embedder_net = SpeechEmbedder()
    embedder_net.load_state_dict(torch.load(model_path))
    embedder_net.eval()
    # Fix: report success only after the load actually succeeded
    # (the original printed this before loading anything).
    print('Load model successfully.')

    inputs_wavs = glob.glob(os.path.join(data, '*', '*.wav'))

    for wav in inputs_wavs:
        _, mel_db, _ = mfccs_and_spec(wav, wav_process=True)
        inputs = torch.Tensor(mel_db)
        print(inputs.shape)
        # (frames, n_mels) -> (1, frames, n_mels): add the batch dimension.
        # (The original np.transpose with axes=(0,1,2) was a no-op and is dropped.)
        inputs = inputs.unsqueeze(0)
        with torch.no_grad():  # inference only — no autograd bookkeeping
            outputs = embedder_net(inputs)
        embedding = outputs.numpy()
        # Save the embedding next to the source wav, swapping the extension.
        output_path = wav[:-4] + '.npy'
        np.save(output_path, embedding)
コード例 #4
0
# State for the speaker-identification evaluation loop that follows.
total = 0          # presumably a running count of test utterances — confirm in loop below
score = -10        # sentinel below the cosine-similarity range [-1, 1]
name = 'Who?'      # placeholder identity until a best match is scored

# Romanized IDs corresponding to the original Chinese speaker names
# in the commented-out line (same order).
#speakers = ['付立', '刘子涵', '操镭', '李思琪', '李潇潇', '王润宇', '罗明双', '资礼波', '赵晴']
speakers = ['fuli', 'lzh', 'cl', 'lsq', 'lxx', 'wry', 'lms', 'zlb', 'zq']
data = 'media'     # directory holding test WAVs named '<speaker>_*.wav'
for speaker in speakers:
    test_wavs = glob.glob(os.path.join(data, '{}_*.wav'.format(speaker)))
    #print(test_wavs)
    x = 0
    y = 0
    for test_wav in test_wavs:
        scores = {}
        test_mel = []
        _, mel_db_test, _ = mfccs_and_spec(test_wav, wav_process=True)
        test_mel.append(mel_db_test)
        test_inputs = torch.Tensor(test_mel)
        #  print(test_inputs.size())
        test_inputs = torch.FloatTensor(
            np.transpose(test_inputs, axes=(0, 1, 2)))
        #print(test_inputs.size())
        test_output = embedder_net(test_inputs).cpu()
        for speaker_name in enroll_centroid_embeddings.keys():
            #print(enroll_centroid_embeddings[speaker_name].shape)
            speaker_centroid_enroll = get_centroid(
                enroll_centroid_embeddings[speaker_name],
                len(enroll_centroid_embeddings[speaker_name]))
            #print(speaker_centroid_enroll.shape)
            score_speaker = cosine_similarity(
                speaker_centroid_enroll.reshape(1, -1),