def main():
    """Assemble the d-vector training dataset and wrap it in a DataLoader.

    Reads the training DB structure from the configured data root, builds a
    speaker-id -> integer-index mapping, and constructs the Dvector dataset
    with an MFB-truncation + tensor-conversion transform pipeline.
    """
    db = read_DB_structure(c.TRAIN_DATAROOT_DIR)

    # Feature pipeline: truncate the MFB input, then convert to DNN tensor input.
    feature_transform = transforms.Compose(
        [truncatedinputfromMFB(), totensor_DNN_input()]
    )

    # Map each unique speaker id to a dense integer class index (sorted for
    # a deterministic ordering across runs).
    speakers = sorted(set(db['speaker_id']))
    spk_to_idx = {speaker: idx for idx, speaker in enumerate(speakers)}

    dataset = Dvector_Dataset(
        DB=db,
        loader=read_MFB,
        transform=feature_transform,
        spk_to_idx=spk_to_idx,
    )

    # NOTE(review): shuffle=False on a *training* loader looks suspicious —
    # confirm whether shuffling was deliberately disabled before changing it.
    loader = torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=128,
        shuffle=False,
    )
from python_speech_features import fbank
import configure as c
from DB_wav_reader import read_DB_structure
import pickle
# Used below but not imported in this chunk; re-importing is harmless if an
# earlier part of the file already brought them in.
import librosa
import numpy as np


def normalize_frames(m, Scale=True):
    """Normalize filter-bank frames per coefficient over the time axis.

    Args:
        m: 2-D array, assumed shape (frames, coefficients) — TODO confirm.
        Scale: if True, also divide by the per-coefficient std deviation.

    Returns:
        Mean-subtracted (and optionally variance-normalized) array.
    """
    if Scale:
        # Epsilon guards against division by zero for constant coefficients.
        return (m - np.mean(m, axis=0)) / (np.std(m, axis=0) + 2e-12)
    else:
        return (m - np.mean(m, axis=0))


# HACK: hard-coded personal path overrides the configured directory — restore
# c.TRAIN_WAV_DIR (or make this a CLI argument) before sharing this script.
# audiopath = c.TRAIN_WAV_DIR
audiopath = '/home/tuan/Documents/Train-Test-Data/public-test'
db = read_DB_structure(audiopath)
feat_and_label = {}
print(db["filename"])
for filename, label in zip(db["filename"], db["speaker_id"]):
    print(filename)
    print(label)
    audio, sr = librosa.load(filename, sr=c.SAMPLE_RATE, mono=True)
    filter_banks, energies = fbank(audio, samplerate=c.SAMPLE_RATE,
                                   nfilt=40, winlen=0.025)
    # Log-compress the filter-bank energies; clamp at 1e-5 to avoid log(0).
    filter_banks = 20 * np.log10(np.maximum(filter_banks, 1e-5))
    feature = normalize_frames(filter_banks, Scale=False)
    # BUG FIX: the original did `feat_and_label['feat'] = feature`, which
    # overwrote the single 'feat' key on every iteration — only the last
    # file's features survived the loop and `label` was never stored.
    # Keep one {feat, label} entry per file instead.
    feat_and_label[filename] = {'feat': feature, 'label': label}