import torch
from torchvision import transforms  # provides Compose

import configure as c
from DB_wav_reader import read_DB_structure
# truncatedinputfromMFB, totensor_DNN_input, read_MFB, and Dvector_Dataset are
# project-local; their defining module is not shown in this snippet.


def main():
    # Scan the training data root into a table of feature paths and speaker IDs.
    train_DB = read_DB_structure(c.TRAIN_DATAROOT_DIR)
    # Truncate each MFB feature to a fixed length, then shape it as a DNN input tensor.
    transform = transforms.Compose([
        truncatedinputfromMFB(),
        totensor_DNN_input()
    ])
    file_loader = read_MFB
    # Map each speaker ID to a contiguous integer class index.
    speaker_list = sorted(set(train_DB['speaker_id']))
    spk_to_idx = {spk: i for i, spk in enumerate(speaker_list)}
    batch_size = 128
    Dvector_train_dataset = Dvector_Dataset(DB=train_DB, loader=file_loader,
                                            transform=transform, spk_to_idx=spk_to_idx)
    # Note: shuffle=False keeps DB order; training usually wants shuffle=True.
    Dvector_train_loader = torch.utils.data.DataLoader(dataset=Dvector_train_dataset,
                                                       batch_size=batch_size,
                                                       shuffle=False)
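
    # Illustrative consumption sketch, not part of the original script; it
    # assumes the dataset yields (feature, label) pairs.
    for inputs, targets in Dvector_train_loader:
        print(inputs.size(), targets.size())  # batch shapes, e.g. [128, ...]
        break  # one batch is enough for a shape check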
Example #2
import numpy as np
import librosa
from python_speech_features import fbank
import configure as c
from DB_wav_reader import read_DB_structure
import pickle


def normalize_frames(m, Scale=True):
    """Zero-mean each feature dimension; if Scale, also divide by its std (CMVN-style)."""
    if Scale:
        return (m - np.mean(m, axis=0)) / (np.std(m, axis=0) + 2e-12)
    else:
        return m - np.mean(m, axis=0)
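
# Minimal sanity check (illustrative, hypothetical values): each column of the
# normalized output should have (near-)zero mean.
_demo = np.random.rand(100, 40)  # 100 frames x 40 filter-bank channels
assert np.allclose(np.mean(normalize_frames(_demo), axis=0), 0.0, atol=1e-6)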


# audiopath = c.TRAIN_WAV_DIR
audiopath = '/home/tuan/Documents/Train-Test-Data/public-test'
db = read_DB_structure(audiopath)

feat_and_label = {}

# Iterate every file together with its speaker label.
for filename, label in zip(db["filename"], db["speaker_id"]):
    print(filename, label)  # progress log: current file and its label
    # Load audio at the configured sample rate, compute 40 mel filter banks,
    # and convert to a floored log (dB) scale.
    audio, sr = librosa.load(filename, sr=c.SAMPLE_RATE, mono=True)
    filter_banks, energies = fbank(audio,
                                   samplerate=c.SAMPLE_RATE,
                                   nfilt=40,
                                   winlen=0.025)
    filter_banks = 20 * np.log10(np.maximum(filter_banks, 1e-5))
    feature = normalize_frames(filter_banks, Scale=False)
    # Key by filename: the original assigned feat_and_label['feat'] = feature,
    # which overwrote the single entry on every iteration.
    feat_and_label[filename] = {'feat': feature, 'label': label}
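
# The otherwise-unused pickle import suggests the features are serialized
# afterwards; a minimal sketch (the output path is an assumption, not taken
# from the original script):
with open('feat_and_label.pkl', 'wb') as f:
    pickle.dump(feat_and_label, f)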