Example no. 1
0
    def postprocess(self, feats, curr_sample_rate):
        """Resample, downmix, and optionally layer-normalize a waveform.

        Args:
            feats: waveform tensor, 1-D ``(samples,)`` or 2-D
                (multi-channel; assumed ``(samples, channels)`` — TODO confirm
                against the caller's loading code).
            curr_sample_rate: sample rate of ``feats``.

        Returns:
            A 1-D float tensor at ``self.sample_rate``; layer-normalized over
            its full length when ``self.normalize`` is set.
        """
        if self.sample_rate != curr_sample_rate:
            # BUG FIX: the original did `feats = wav_tensor.numpy()` here,
            # turning feats into a NumPy array — the later `feats.dim()` and
            # `F.layer_norm(feats, ...)` calls then crash (ndarrays have
            # `.ndim`, not `.dim()`). Keep the result as a torch tensor.
            wav_tensor = feats.clone().detach()
            feats = Resample(curr_sample_rate, self.sample_rate)(wav_tensor)

        if feats.dim() == 2:
            # Downmix multi-channel audio to mono by averaging channels.
            feats = feats.mean(-1)

        assert feats.dim() == 1, feats.dim()

        if self.normalize:
            # Normalization is not part of training; no grad needed.
            with torch.no_grad():
                feats = F.layer_norm(feats, feats.shape)
        return feats
Example no. 2
0
def process_utterance(in_dir, out_dir, spker, basename):
    """Preprocess one utterance: align, trim, and extract acoustic features.

    Reads the wav and its MFA TextGrid, builds the phone text string,
    resamples/trims the waveform to the aligned span, then computes and
    saves duration, F0, energy, and mel spectrogram as ``.npy`` files.

    Args:
        in_dir: corpus root containing ``wav48/<spker>/<basename>.wav``.
        out_dir: output root with the ``TextGrid`` tree and the
            ``alignment``/``f0``/``energy``/``mel`` target folders.
        spker: speaker id (sub-folder name).
        basename: utterance id without file extension.

    Returns:
        ``('<basename>|<text>', max_f0, min_nonzero_f0, max_energy,
        min_energy, n_mel_frames)`` or ``None`` when the utterance is
        skipped (no alignment, empty span, too long, inconsistent feature
        lengths, or fully unvoiced F0).
    """
    wav_path = os.path.join(in_dir, 'wav48', spker, '{}.wav'.format(basename))
    tg_path = os.path.join(out_dir, 'TextGrid', spker,
                           '{}.TextGrid'.format(basename))

    # No forced alignment available for this utterance: skip it.
    if not os.path.exists(tg_path):
        return None

    # Get alignments
    textgrid = tgt.io.read_textgrid(tg_path)
    phone, duration, start, end = get_alignment(
        textgrid.get_tier_by_name('phones'))
    text = '{' + '}{'.join(
        phone) + '}'  # '{A}{B}{$}{C}', $ represents silent phones
    text = text.replace('{$}', ' ')  # '{A}{B} {C}'
    text = text.replace('}{', ' ')  # '{A B} {C}'

    # Degenerate alignment span: skip.
    if start >= end:
        return None

    # Read, resample if needed, and trim the waveform to the aligned span.
    sr, wav = read(wav_path)
    wav = torch.tensor(wav.astype(np.float32))
    if sr != hp.sampling_rate:
        wav = Resample(orig_freq=sr, new_freq=hp.sampling_rate)(wav)
    wav = wav[int(hp.sampling_rate * start):int(hp.sampling_rate * end)]

    # Compute fundamental frequency (one value per hop), truncated to the
    # total phone duration in frames.
    f0, _ = pw.dio(wav.numpy().astype(np.float64),
                   hp.sampling_rate,
                   frame_period=hp.hop_length / hp.sampling_rate * 1000)
    f0 = f0[:sum(duration)]

    # Compute mel-scale spectrogram and energy, truncated likewise.
    mel_spectrogram, energy = Audio.tools.get_mel_from_wav(wav)
    mel_spectrogram = mel_spectrogram.cpu().numpy().astype(
        np.float32)[:, :sum(duration)]
    energy = energy.numpy().astype(np.float32)[:sum(duration)]
    if mel_spectrogram.shape[1] >= hp.max_seq_len:
        return None

    # Explicit length-consistency check: an `assert` would be stripped under
    # `python -O`. A mismatch usually points at a get_alignment problem.
    if not (f0.shape[0] == energy.shape[0] == mel_spectrogram.shape[1]):
        print("duration problem: {}".format(wav_path))
        return None

    # Save alignment
    ali_filename = '{}-ali-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'alignment', ali_filename),
            duration,
            allow_pickle=False)

    # Save fundamental frequency
    f0_filename = '{}-f0-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'f0', f0_filename), f0, allow_pickle=False)

    # Save energy
    energy_filename = '{}-energy-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'energy', energy_filename),
            energy,
            allow_pickle=False)

    # Save spectrogram (transposed to frames-first layout on disk)
    mel_filename = '{}-mel-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'mel', mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)
    try:
        # min() raises ValueError when f0 is entirely zero (fully unvoiced);
        # the previous bare `except:` also swallowed KeyboardInterrupt etc.
        return '|'.join([basename, text]), max(f0), min(
            f for f in f0 if f != 0
        ), max(energy), min(energy), mel_spectrogram.shape[1]
    except ValueError:
        print(basename)
        return None