Example #1
def test_labels_number_of_frames():
    # https://github.com/r9y9/nnmnkwii/issues/85
    binary_dict, continuous_dict = hts.load_question_set(
        join(DATA_DIR, "jp.hed"))
    labels = hts.load(join(DATA_DIR, "BASIC5000_0619.lab"))
    linguistic_features = fe.linguistic_features(
        labels, binary_dict, continuous_dict, add_frame_features=True)
    assert labels.num_frames() == linguistic_features.shape[0]
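Example #1 above, and most of the snippets that follow, share the same basic pattern: load an HTS-style question set, load alignment labels, then call fe.linguistic_features either per phone (add_frame_features=False) or per frame (add_frame_features=True). A minimal sketch of that pattern, assuming the usual nnmnkwii imports and placeholder file paths:

from nnmnkwii.frontend import merlin as fe
from nnmnkwii.io import hts

# Placeholder paths; substitute your own question set and label file
binary_dict, continuous_dict = hts.load_question_set("questions.hed")
labels = hts.load("utterance.lab")

# Phone-level features (one row per phone), e.g. for duration/time-lag models
phone_feats = fe.linguistic_features(
    labels, binary_dict, continuous_dict,
    add_frame_features=False, subphone_features=None)

# Frame-level features (one row per frame), e.g. for acoustic models
frame_feats = fe.linguistic_features(
    labels, binary_dict, continuous_dict,
    add_frame_features=True, subphone_features="coarse_coding")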
Example #2
def gen_waveform(labels, acoustic_features, acoustic_out_scaler,
        binary_dict, continuous_dict, stream_sizes, has_dynamic_features,
        subphone_features="coarse_coding", log_f0_conditioning=True, pitch_idx=None,
        num_windows=3, post_filter=True, sample_rate=48000, frame_period=5,
        relative_f0=True):

    windows = get_windows(num_windows)

    # Apply MLPG if necessary
    if np.any(has_dynamic_features):
        acoustic_features = multi_stream_mlpg(
            acoustic_features, acoustic_out_scaler.var_, windows, stream_sizes,
            has_dynamic_features)
        static_stream_sizes = get_static_stream_sizes(
            stream_sizes, has_dynamic_features, len(windows))
    else:
        static_stream_sizes = stream_sizes

    # Split multi-stream features
    mgc, target_f0, vuv, bap = split_streams(acoustic_features, static_stream_sizes)

    # Generate waveform with the WORLD vocoder
    fftlen = pyworld.get_cheaptrick_fft_size(sample_rate)
    alpha = pysptk.util.mcepalpha(sample_rate)

    if post_filter:
        mgc = merlin_post_filter(mgc, alpha)

    spectrogram = pysptk.mc2sp(mgc, fftlen=fftlen, alpha=alpha)
    aperiodicity = pyworld.decode_aperiodicity(bap.astype(np.float64), sample_rate, fftlen)


    ### F0 ###
    if relative_f0:
        diff_lf0 = target_f0
        # need to extract pitch sequence from the musical score
        linguistic_features = fe.linguistic_features(labels,
                                                    binary_dict, continuous_dict,
                                                    add_frame_features=True,
                                                    subphone_features=subphone_features)
        f0_score = _midi_to_hz(linguistic_features, pitch_idx, False)[:, None]
        lf0_score = f0_score.copy()
        nonzero_indices = np.nonzero(lf0_score)
        lf0_score[nonzero_indices] = np.log(f0_score[nonzero_indices])
        lf0_score = interp1d(lf0_score, kind="slinear")

        f0 = diff_lf0 + lf0_score
        f0[vuv < 0.5] = 0
        f0[np.nonzero(f0)] = np.exp(f0[np.nonzero(f0)])
    else:
        f0 = target_f0

    generated_waveform = pyworld.synthesize(f0.flatten().astype(np.float64),
                                            spectrogram.astype(np.float64),
                                            aperiodicity.astype(np.float64),
                                            sample_rate, frame_period)

    return generated_waveform
Example #3
 def collect_features(self, path):
     labels = hts.load(path)
     features = fe.linguistic_features(
         labels, self.binary_dict, self.continuous_dict,
         add_frame_features=self.add_frame_features,
         subphone_features=self.subphone_features)
     if self.log_f0_conditioning:
         for idx in self.pitch_idx:
             features[:, idx] = interp1d(_midi_to_hz(features, idx, True), kind="slinear")
     return features.astype(np.float32)
Example #4
def test_correct_vuv_by_phone():
    wav_path = Path(__file__).parent / "data" / "nitech_jp_song070_f001_004.wav"
    lab_path = Path(__file__).parent / "data" / "nitech_jp_song070_f001_004.lab"

    binary_dict, numeric_dict = hts.load_question_set(
        Path(__file__).parent / "data" / "jp_test.hed"
    )

    labels = hts.load(lab_path)
    sr, wav = wavfile.read(wav_path)
    wav = wav.astype(np.float64)
    assert sr == 48000

    out_feats, stream_sizes = _extract_static_feats(wav, sr)
    has_dynamic_features = [False] * len(stream_sizes)
    pitch_idx = len(binary_dict) + 1

    linguistic_features = fe.linguistic_features(
        labels,
        binary_dict,
        numeric_dict,
        add_frame_features=True,
        subphone_features="coarse_coding",
    )

    params = {
        "labels": labels,
        "acoustic_features": out_feats,
        "binary_dict": binary_dict,
        "numeric_dict": numeric_dict,
        "stream_sizes": stream_sizes,
        "has_dynamic_features": has_dynamic_features,
        "pitch_idx": pitch_idx,
        "relative_f0": False,
        "frame_period": 5,
    }

    out_vuv_idx = 61
    vuv = out_feats[:, out_vuv_idx : out_vuv_idx + 1]

    vuv_corrected = correct_vuv_by_phone(vuv, binary_dict, linguistic_features)
    # correcting V/UV should make a difference
    _, _, vuv_fixed, _ = gen_spsvs_static_features(**{**params, "force_fix_vuv": True})
    assert np.any(vuv_corrected != vuv)

    # 0: Rest 1: Voiced 2: Unvoiced
    rest_idx = 0
    voiced_idx = 1
    unvoiced_idx = 2
    assert np.all(vuv_corrected[linguistic_features[:, rest_idx] > 0] < 0.5)
    assert np.all(vuv_corrected[linguistic_features[:, voiced_idx] > 0] > 0.5)
    assert np.all(vuv_corrected[linguistic_features[:, unvoiced_idx] > 0] < 0.5)
Example #5
def predict_timelag(device,
                    labels,
                    timelag_model,
                    timelag_in_scaler,
                    timelag_out_scaler,
                    binary_dict,
                    continuous_dict,
                    pitch_indices=None,
                    log_f0_conditioning=True,
                    allowed_range=[-30, 30]):
    # round start/end times just in case.
    labels.round_()

    # Extract note-level labels
    note_indices = get_note_indices(labels)
    note_labels = labels[note_indices]

    # Extract musical/linguistic context
    timelag_linguistic_features = fe.linguistic_features(
        note_labels,
        binary_dict,
        continuous_dict,
        add_frame_features=False,
        subphone_features=None).astype(np.float32)

    # Adjust input features if we use log-f0 conditioning
    if log_f0_conditioning:
        if pitch_indices is None:
            raise ValueError("Pitch feature indices must be specified!")
        for idx in pitch_indices:
            timelag_linguistic_features[:, idx] = interp1d(_midi_to_hz(
                timelag_linguistic_features, idx, log_f0_conditioning),
                                                           kind="slinear")

    # Normalization
    timelag_linguistic_features = timelag_in_scaler.transform(
        timelag_linguistic_features)

    # Run model
    x = torch.from_numpy(timelag_linguistic_features).unsqueeze(0).to(device)
    y = timelag_model(x, [x.shape[1]]).squeeze(0).cpu()

    # De-normalization and rounding
    lag = np.round(timelag_out_scaler.inverse_transform(y.data.numpy()))

    # Clip to the allowed range
    lag = np.clip(lag, allowed_range[0], allowed_range[1])

    # frames -> 100 ns
    lag *= 50000

    return lag
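The final scaling in predict_timelag converts the predicted lag from frames to HTS label time units: label start/end times are counted in 100 ns steps, so with the 5 ms frame period used throughout these examples one frame equals 50,000 units. A small check of that arithmetic (the 5 ms frame period is an assumption carried over from the other examples):

frame_period_ms = 5          # frame shift assumed elsewhere in these examples
hts_unit_ns = 100            # HTS label times are counted in 100 ns units
frames_to_hts_units = int(frame_period_ms * 1_000_000 / hts_unit_ns)
assert frames_to_hts_units == 50000   # matches the lag *= 50000 above

lag_in_frames = 3
lag_in_100ns = lag_in_frames * frames_to_hts_units   # 150,000 units = 15 ms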
Example #6
    def collect_features(self, path):
        labels = hts.load(path)
        features = fe.linguistic_features(
            labels, self.binary_dict, self.continuous_dict,
            add_frame_features=self.add_frame_features,
            subphone_features=self.subphone_features)
        if self.add_frame_features:
            indices = labels.silence_frame_indices().astype(int)  # np.int is removed in recent NumPy
        else:
            indices = labels.silence_phone_indices()
        features = np.delete(features, indices, axis=0)

        return features.astype(np.float32)
Example #7
def tts_from_label(models,
                   label_path,
                   X_min,
                   X_max,
                   Y_mean,
                   Y_std,
                   post_filter=False,
                   apply_duration_model=True,
                   fs=16000):
    duration_model, acoustic_model = models["duration"], models["acoustic"]

    # Predict durations
    if apply_duration_model:
        duration_modified_hts_labels = gen_duration(label_path, duration_model,
                                                    X_min, X_max, Y_mean,
                                                    Y_std)
    else:
        duration_modified_hts_labels = hts.load(label_path)

    # Linguistic features
    linguistic_features = fe.linguistic_features(
        duration_modified_hts_labels,
        binary_dict,
        continuous_dict,
        add_frame_features=hp_acoustic.add_frame_features,
        subphone_features=hp_acoustic.subphone_features)
    # Trim silences
    indices = duration_modified_hts_labels.silence_frame_indices()
    linguistic_features = np.delete(linguistic_features, indices, axis=0)

    # Apply normalization
    ty = "acoustic"
    linguistic_features = P.minmax_scale(linguistic_features,
                                         X_min[ty],
                                         X_max[ty],
                                         feature_range=(0.01, 0.99))

    # Predict acoustic features
    acoustic_model = acoustic_model.cpu()
    acoustic_model.eval()
    x = Variable(torch.from_numpy(linguistic_features)).float()
    try:
        acoustic_predicted = acoustic_model(x).data.numpy()
    except Exception:  # fall back: the model may expect batched (B, T, D) inputs with lengths
        xl = len(x)
        x = x.view(1, -1, x.size(-1))
        acoustic_predicted = acoustic_model(x, [xl]).data.numpy()
        acoustic_predicted = acoustic_predicted.reshape(
            -1, acoustic_predicted.shape[-1])

    return gen_waveform(acoustic_predicted, Y_mean, Y_std, post_filter, fs=fs)
Example #8
def lab2wav(args,
            device,
            label_path,
            binary_dict,
            continuous_dict,
            X_min,
            X_max,
            Y_mean,
            Y_var,
            Y_scale,
            duration_model,
            acoustic_model,
            post_filter=False):
    # Predict durations
    duration_modified_hts_labels = gen_duration(device, label_path,
                                                binary_dict, continuous_dict,
                                                X_min, X_max, Y_mean, Y_scale,
                                                duration_model)

    # Linguistic features
    linguistic_features = fe.linguistic_features(
        duration_modified_hts_labels,
        binary_dict,
        continuous_dict,
        add_frame_features=True,
        subphone_features="full"
        if args.label == 'state_align' else "coarse_coding")

    # Trim silences
    indices = duration_modified_hts_labels.silence_frame_indices()
    linguistic_features = np.delete(linguistic_features, indices, axis=0)

    # Apply normalization
    ty = "acoustic"
    linguistic_features = minmax_scale(linguistic_features,
                                       X_min[ty],
                                       X_max[ty],
                                       feature_range=(0.01, 0.99))

    # Predict acoustic features
    # acoustic_model = acoustic_model.cpu()
    acoustic_model.eval()
    x = torch.FloatTensor(linguistic_features)
    acoustic_predicted = acoustic_model(x.unsqueeze(0)).data.numpy()
    print("acoustic_predicted shape: {}".format(acoustic_predicted.shape))

    # Apply denormalization
    acoustic_predicted = acoustic_predicted * Y_scale[ty] + Y_mean[ty]

    return gen_waveform(acoustic_predicted.squeeze(0), Y_var, post_filter)
Example #9
def test_phone_alignment_label():
    qs_file_name = join(DATA_DIR, "questions-radio_dnn_416.hed")
    binary_dict, continuous_dict = hts.load_question_set(qs_file_name)

    input_state_label = join(DATA_DIR, "label_phone_align", "arctic_a0001.lab")
    labels = hts.load(input_state_label)
    x = fe.linguistic_features(labels,
                               binary_dict,
                               continuous_dict,
                               add_frame_features=False,
                               subphone_features=None)
    assert not labels.is_state_alignment_label()
    assert np.all(np.isfinite(x))

    for subphone_features in ["coarse_coding", "minimal_phoneme"]:
        x = fe.linguistic_features(labels,
                                   binary_dict,
                                   continuous_dict,
                                   add_frame_features=True,
                                   subphone_features=subphone_features)
        assert np.all(np.isfinite(x))

    x = fe.duration_features(labels)
    assert np.all(np.isfinite(x))
Example #10
def predict_duration(device,
                     labels,
                     duration_model,
                     duration_in_scaler,
                     duration_out_scaler,
                     lag,
                     binary_dict,
                     continuous_dict,
                     pitch_indices=None,
                     log_f0_conditioning=True):

    # Get note indices
    note_indices = get_note_indices(labels)
    # append the end of note
    note_indices.append(len(labels))

    # Extract musical/linguistic features
    duration_linguistic_features = fe.linguistic_features(
        labels,
        binary_dict,
        continuous_dict,
        add_frame_features=False,
        subphone_features=None).astype(np.float32)

    if log_f0_conditioning:
        for idx in pitch_indices:
            duration_linguistic_features[:, idx] = interp1d(_midi_to_hz(
                duration_linguistic_features, idx, log_f0_conditioning),
                                                            kind="slinear")

    # Apply normalization
    duration_linguistic_features = duration_in_scaler.transform(
        duration_linguistic_features)

    # Apply model
    x = torch.from_numpy(duration_linguistic_features).float().to(device)
    x = x.view(1, -1, x.size(-1))
    pred_durations = duration_model(
        x, [x.shape[1]]).squeeze(0).cpu().data.numpy()

    # Apply denormalization
    pred_durations = duration_out_scaler.inverse_transform(pred_durations)
    pred_durations[pred_durations <= 0] = 1
    pred_durations = np.round(pred_durations)

    return pred_durations
Example #11
def test_linguistic_features_for_acoustic_model():
    qs_file_name = join(DATA_DIR, "questions-radio_dnn_416.hed")
    binary_dict, continuous_dict = hts.load_question_set(qs_file_name)

    # Linguistic features
    # To train acoustic model paired with linguistic features,
    # we need frame-level linguistic feature representation.
    input_state_label = join(DATA_DIR, "label_state_align", "arctic_a0001.lab")
    labels = hts.load(input_state_label)
    assert labels.is_state_alignment_label()
    x = fe.linguistic_features(labels,
                               binary_dict,
                               continuous_dict,
                               add_frame_features=True,
                               subphone_features="full")
    y = np.fromfile(join(DATA_DIR, "binary_label_425", "arctic_a0001.lab"),
                    dtype=np.float32).reshape(-1, x.shape[-1])
    assert np.allclose(x, y)
Example #12
def tts_from_label(models, label_path, X_min, X_max, Y_mean, Y_std,
                   post_filter=False,
                   apply_duration_model=True, coef=1.4, fs=16000,
                   mge_training=True):
    duration_model, acoustic_model = models["duration"], models["acoustic"]

    if use_cuda:
        duration_model = duration_model.cuda()
        acoustic_model = acoustic_model.cuda()

    # Predict durations
    if apply_duration_model:
        duration_modified_hts_labels = gen_duration(
            label_path, duration_model, X_min, X_max, Y_mean, Y_std)
    else:
        duration_modified_hts_labels = hts.load(label_path)

    # Linguistic features
    linguistic_features = fe.linguistic_features(
        duration_modified_hts_labels,
        binary_dict, continuous_dict,
        add_frame_features=hp_acoustic.add_frame_features,
        subphone_features=hp_acoustic.subphone_features)
    # Trim silences
    indices = duration_modified_hts_labels.silence_frame_indices()
    linguistic_features = np.delete(linguistic_features, indices, axis=0)

    # Apply normalization
    ty = "acoustic"
    linguistic_features = P.minmax_scale(
        linguistic_features, X_min[ty], X_max[ty], feature_range=(0.01, 0.99))

    # Predict acoustic features
    acoustic_model.eval()
    x = Variable(torch.from_numpy(linguistic_features)).float()
    xl = len(x)
    x = x.view(1, -1, x.size(-1))
    x = _generator_input(hp_duration, x)
    x = x.cuda() if use_cuda else x
    acoustic_predicted = acoustic_model(x, [xl]).data.cpu().numpy()
    acoustic_predicted = acoustic_predicted.reshape(-1, acoustic_predicted.shape[-1])

    return gen_waveform(acoustic_predicted, Y_mean, Y_std, post_filter,
                        coef=coef, fs=fs, mge_training=mge_training)
Example #13
def predict_acoustic(device,
                     labels,
                     acoustic_model,
                     acoustic_in_scaler,
                     acoustic_out_scaler,
                     binary_dict,
                     continuous_dict,
                     subphone_features="coarse_coding",
                     pitch_indices=None,
                     log_f0_conditioning=True):

    # Musical/linguistic features
    linguistic_features = fe.linguistic_features(
        labels,
        binary_dict,
        continuous_dict,
        add_frame_features=True,
        subphone_features=subphone_features)

    if log_f0_conditioning:
        for idx in pitch_indices:
            linguistic_features[:, idx] = interp1d(_midi_to_hz(
                linguistic_features, idx, log_f0_conditioning),
                                                   kind="slinear")

    # Apply normalization
    linguistic_features = acoustic_in_scaler.transform(linguistic_features)
    if isinstance(acoustic_in_scaler, MinMaxScaler):
        # clip to feature range
        linguistic_features = np.clip(linguistic_features,
                                      acoustic_in_scaler.feature_range[0],
                                      acoustic_in_scaler.feature_range[1])

    # Predict acoustic features
    x = torch.from_numpy(linguistic_features).float().to(device)
    x = x.view(1, -1, x.size(-1))
    pred_acoustic = acoustic_model(x,
                                   [x.shape[1]]).squeeze(0).cpu().data.numpy()

    # Apply denormalization
    pred_acoustic = acoustic_out_scaler.inverse_transform(pred_acoustic)

    return pred_acoustic
Example #14
    def get_acoustic_parameter(self, label):
        self.acoustic_model.eval()
        self.acoustic_model.to(self.device)
        sil_index = label.silence_frame_indices()
        subphone_feat = self.config.subphone_feature

        input_ = linguistic_features(label,
                                     self.bin_dict,
                                     self.con_dict,
                                     add_frame_features=True,
                                     subphone_features=subphone_feat)
        input_ = np.delete(input_, sil_index, axis=0)
        input_ = self._get_x_scaled(self.acoustic_dataset, input_)

        predicted = self.get_predicted(self.acoustic_model, input_)
        predicted = self._get_t_scaled(self.acoustic_dataset, predicted)
        predicted = predicted.reshape(-1, predicted.shape[-1])

        return predicted
Example #15
def gen_duration(label_path, duration_model, X_min, X_max, Y_mean, Y_std):
    # Linguistic features for duration
    hts_labels = hts.load(label_path)
    duration_linguistic_features = fe.linguistic_features(
        hts_labels,
        binary_dict,
        continuous_dict,
        add_frame_features=hp_duration.add_frame_features,
        subphone_features=hp_duration.subphone_features).astype(np.float32)

    # Apply normalization
    ty = "duration"
    duration_linguistic_features = P.minmax_scale(duration_linguistic_features,
                                                  X_min[ty],
                                                  X_max[ty],
                                                  feature_range=(0.01, 0.99))

    # Apply models
    duration_model = duration_model.cpu()
    duration_model.eval()

    #  Apply model
    x = Variable(torch.from_numpy(duration_linguistic_features)).float()
    try:
        duration_predicted = duration_model(x).data.numpy()
    except Exception:  # fall back: the model may expect batched (B, T, D) inputs with lengths
        xl = len(x)
        x = x.view(1, -1, x.size(-1))
        duration_predicted = duration_model(x, [xl]).data.numpy()
        duration_predicted = duration_predicted.reshape(
            -1, duration_predicted.shape[-1])

    # Apply denormalization
    duration_predicted = duration_predicted * Y_std[ty] + Y_mean[ty]
    duration_predicted = np.round(duration_predicted)

    # Set minimum state duration to 1
    #  print(duration_predicted)
    duration_predicted[duration_predicted <= 0] = 1
    hts_labels.set_durations(duration_predicted)

    return hts_labels
Example #16
    def gen_parameters(self, utt_id, labels):
        feature = fe.linguistic_features(
            labels, self.binary_dict, self.continuous_dict,
            add_frame_features=True,
            subphone_features='coarse_coding').astype(np.float32)

        # normalize
        feature = scaler['X']['acoustic'].transform(feature)

        # add speaker information
        feature = self.add_speaker_code(utt_id, feature)

        # predict acoustic features
        feature = torch.from_numpy(feature).to(device)
        pred = self.acoustic_model.predict(feature)
        pred_mean = pred['mean'].data.cpu().numpy()
        pred_var = pred['var'].data.cpu().numpy()

        # denormalize
        scale = self.scaler['Y']['acoustic'].scale_
        pred_mean = self.scaler['Y']['acoustic'].inverse_transform(pred_mean)
        pred_var *= scale ** 2

        # split acoustic features
        mgc = pred_mean[:, :self.lf0_start_idx]
        lf0 = pred_mean[:, self.lf0_start_idx:self.vuv_start_idx]
        vuv = pred_mean[:, self.vuv_start_idx]
        bap = pred_mean[:, self.bap_start_idx:]

        # make variances for Maximum Likelihood Parameter Generation (MLPG)
        mgc_variances = pred_var[:, :self.lf0_start_idx]
        lf0_variances = pred_var[:, self.lf0_start_idx:self.vuv_start_idx]
        bap_variances = pred_var[:, self.bap_start_idx:]

        # perform MLPG to calculate static features
        mgc = mlpg(mgc, mgc_variances, self.windows)
        lf0 = mlpg(lf0, lf0_variances, self.windows)
        bap = mlpg(bap, bap_variances, self.windows)

        feature = np.hstack([mgc, lf0, vuv.reshape(-1, 1), bap])

        return feature
Example #17
def test_singing_voice_question():
    # Test SVS case
    """
QS "L-Phone_Yuusei_Boin"           {*^a-*,*^i-*,*^u-*,*^e-*,*^o-*}
CQS "e1" {/E:(\\NOTE)]}
    """
    binary_dict, continuous_dict = hts.load_question_set(
        join(DATA_DIR, "test_jp_svs.hed"), append_hat_for_LL=False)
    input_phone_label = join(DATA_DIR, "song070_f00001_063.lab")
    labels = hts.load(input_phone_label)
    feats = fe.linguistic_features(labels, binary_dict, continuous_dict)
    assert feats.shape == (74, 2)

    # CQS e1: get the current MIDI number
    C_e1 = continuous_dict[0]
    for idx, lab in enumerate(labels):
        context = lab[-1]
        if C_e1.search(context) is not None:
            from nnmnkwii.frontend import NOTE_MAPPING
            assert NOTE_MAPPING[C_e1.findall(context)[0]] == feats[idx, 1]
Example #18
    def get_duration_label(self, path):
        label = hts.load(path)
        self.duration_model.eval()

        feat = linguistic_features(label,
                                   self.bin_dict,
                                   self.con_dict,
                                   add_frame_features=False,
                                   subphone_features=None)
        feat = feat.astype(np.float32)
        feat = self._get_x_scaled(self.duration_dataset, feat)
        self.duration_model.to(self.device)

        predicted = self.get_predicted(self.duration_model, feat)
        predicted = self._get_t_scaled(self.duration_dataset, predicted)
        predicted = np.round(predicted)
        predicted[predicted <= 0] = 1
        label.set_durations(predicted)

        return label
Example #19
    def collect_features(self, wav_path, label_path):
        d, fs = librosa.load(wav_path, sr=sample_rate)
        # audio, _ = librosa.effects.trim(
        #     audio, top_db=config["trim_threshold_in_db"],
        #     frame_length=config["trim_frame_size"], hop_length=config["trim_hop_size"])
        D = librosa.stft(d, n_fft=fft_len, hop_length=hop_size, win_length=None,
                         window=window, pad_mode="reflect")
        S, _ = librosa.magphase(D)
        mel_basis = librosa.filters.mel(sr=fs, n_fft=fft_len, n_mels=mel_dim,
                                        fmin=fmin, fmax=fmax)
        # mel_basis = librosa.effects.feature.melspectrogram(
        #     d, sr=fs, n_fft=fft_len, hop_length=hop_size, n_mels=mel_dim, fmin=0, htk=True)
        mel = np.log10(np.maximum(np.dot(mel_basis, S), 1e-10)).T
        # features = features[None, :, :]

        _f0, t = pyworld.dio(d.astype(np.double), fs=sample_rate, f0_ceil=fmax,
                             frame_period=frame_period)
        f0 = pyworld.stonemask(d.astype(np.double), _f0, t, sample_rate)

        # extract energy
        labels = _hts.load(label_path)
        features = fe.linguistic_features(
            labels, self.binary_dict, self.continuous_dict,
            add_frame_features=True, subphone_features='coarse_coding',
            frame_shift_in_micro_sec=frame_shift_in_micro_sec)
        num_frames = labels.num_frames(frame_shift_in_micro_sec=frame_shift_in_micro_sec)
        indices = labels.silence_frame_indices(frame_shift_in_micro_sec=frame_shift_in_micro_sec)
        # print(fs, wav_path, mel.shape[0], labels.num_frames())
        mel = mel[:num_frames]
        if len(f0) >= len(mel):
            f0 = f0[: len(mel)]
        else:
            f0 = np.pad(f0, (0, len(mel) - len(f0)))
        energy = np.sqrt(np.sum(S ** 2, axis=0))
        energy = energy[: len(mel)]

        assert len(mel) == len(f0) == len(energy), \
            "error:%s,%s,%s,%s" % (wav_path, len(mel), len(f0), len(energy))

        f0 = remove_outlier(f0)
        energy = remove_outlier(energy)

        if len(indices) > 0:
            features = np.delete(features, indices, axis=0)
            mel = np.delete(mel, indices, axis=0)
            f0 = np.delete(f0, indices, axis=0)
            energy = np.delete(energy, indices, axis=0)
        # print(features.shape)
        print(wav_path, mel.shape[0], f0.shape[0], energy.shape[0],
              features.shape[0], num_frames, len(indices))
        return mel, f0, energy, features
Example #20
def test_silence_frame_removal_given_hts_labels():
    qs_file_name = join(DATA_DIR, "questions-radio_dnn_416.hed")
    binary_dict, continuous_dict = hts.load_question_set(qs_file_name)

    input_state_label = join(DATA_DIR, "label_state_align", "arctic_a0001.lab")
    labels = hts.load(input_state_label)
    features = fe.linguistic_features(labels,
                                      binary_dict,
                                      continuous_dict,
                                      add_frame_features=True,
                                      subphone_features="full")

    # Remove silence frames
    indices = labels.silence_frame_indices()
    features = np.delete(features, indices, axis=0)

    y = np.fromfile(join(DATA_DIR, "nn_no_silence_lab_425",
                         "arctic_a0001.lab"),
                    dtype=np.float32).reshape(-1, features.shape[-1])
    assert features.shape == y.shape
    assert np.allclose(features, y)
Example #21
def gen_duration(label_path, duration_model, X_min, X_max, Y_mean, Y_std):
    # Linguistic features for duration
    hts_labels = hts.load(label_path)
    duration_linguistic_features = fe.linguistic_features(
        hts_labels,
        binary_dict, continuous_dict,
        add_frame_features=hp_duration.add_frame_features,
        subphone_features=hp_duration.subphone_features).astype(np.float32)

    # Apply normalization
    ty = "duration"
    duration_linguistic_features = P.minmax_scale(
        duration_linguistic_features,
        X_min[ty], X_max[ty], feature_range=(0.01, 0.99))

    # Apply models
    duration_model.eval()

    #  Apply model
    x = Variable(torch.from_numpy(duration_linguistic_features)).float()
    xl = len(x)
    x = x.view(1, -1, x.size(-1))
    x = _generator_input(hp_duration, x)
    x = x.cuda() if use_cuda else x
    duration_predicted = duration_model(x, [xl]).data.cpu().numpy()
    duration_predicted = duration_predicted.reshape(-1, duration_predicted.shape[-1])

    # Apply denormalization
    duration_predicted = P.inv_scale(duration_predicted, Y_mean[ty], Y_std[ty])
    duration_predicted = np.round(duration_predicted)

    # Set minimum state duration to 1
    #  print(duration_predicted)
    duration_predicted[duration_predicted <= 0] = 1
    hts_labels.set_durations(duration_predicted)

    return hts_labels
Example #22
def predict_timelag(device,
                    labels,
                    timelag_model,
                    timelag_config,
                    timelag_in_scaler,
                    timelag_out_scaler,
                    binary_dict,
                    continuous_dict,
                    pitch_indices=None,
                    log_f0_conditioning=True,
                    allowed_range=[-20, 20],
                    allowed_range_rest=[-40, 40]):
    # round start/end times just in case.
    labels.round_()

    # Extract note-level labels
    note_indices = get_note_indices(labels)
    note_labels = labels[note_indices]

    # Extract musical/linguistic context
    timelag_linguistic_features = fe.linguistic_features(
        note_labels,
        binary_dict,
        continuous_dict,
        add_frame_features=False,
        subphone_features=None).astype(np.float32)

    # Adjust input features if we use log-f0 conditioning
    if log_f0_conditioning:
        if pitch_indices is None:
            raise ValueError("Pitch feature indices must be specified!")
        for idx in pitch_indices:
            timelag_linguistic_features[:, idx] = interp1d(_midi_to_hz(
                timelag_linguistic_features, idx, log_f0_conditioning),
                                                           kind="slinear")

    # Normalization
    timelag_linguistic_features = timelag_in_scaler.transform(
        timelag_linguistic_features)
    if isinstance(timelag_in_scaler, MinMaxScaler):
        # clip to feature range
        timelag_linguistic_features = np.clip(
            timelag_linguistic_features, timelag_in_scaler.feature_range[0],
            timelag_in_scaler.feature_range[1])

    # Run model
    x = torch.from_numpy(timelag_linguistic_features).unsqueeze(0).to(device)

    # Run model
    if timelag_model.prediction_type() == PredictionType.PROBABILISTIC:
        # (B, T, D_out)
        log_pi, log_sigma, mu = timelag_model.inference(x, [x.shape[1]])
        if np.any(timelag_config.has_dynamic_features):
            max_sigma, max_mu = mdn_get_most_probable_sigma_and_mu(
                log_pi, log_sigma, mu)
            # Apply denormalization
            # (B, T, D_out) -> (T, D_out)
            max_sigma_sq = max_sigma.squeeze(
                0).cpu().data.numpy()**2 * timelag_out_scaler.var_
            max_mu = timelag_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy())
            # (T, D_out) -> (T, static_dim)
            pred_timelag = multi_stream_mlpg(
                max_mu, max_sigma_sq, get_windows(timelag_config.num_windows),
                timelag_config.stream_sizes,
                timelag_config.has_dynamic_features)
        else:
            _, max_mu = mdn_get_most_probable_sigma_and_mu(
                log_pi, log_sigma, mu)
            # Apply denormalization
            pred_timelag = timelag_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy())
    else:
        # (T, D_out)
        pred_timelag = timelag_model.inference(
            x, [x.shape[1]]).squeeze(0).cpu().data.numpy()
        # Apply denormalization
        pred_timelag = timelag_out_scaler.inverse_transform(pred_timelag)
        if np.any(timelag_config.has_dynamic_features):
            # (T, D_out) -> (T, static_dim)
            pred_timelag = multi_stream_mlpg(
                pred_timelag, timelag_out_scaler.var_,
                get_windows(timelag_config.num_windows),
                timelag_config.stream_sizes,
                timelag_config.has_dynamic_features)

    # Rounding
    pred_timelag = np.round(pred_timelag)

    # Clip to the allowed range
    for idx in range(len(pred_timelag)):
        if _is_silence(note_labels.contexts[idx]):
            pred_timelag[idx] = np.clip(pred_timelag[idx],
                                        allowed_range_rest[0],
                                        allowed_range_rest[1])
        else:
            pred_timelag[idx] = np.clip(pred_timelag[idx], allowed_range[0],
                                        allowed_range[1])

    # frames -> 100 ns
    pred_timelag *= 50000

    return pred_timelag
Example #23
def gen_waveform(labels,
                 acoustic_features,
                 binary_dict,
                 continuous_dict,
                 stream_sizes,
                 has_dynamic_features,
                 subphone_features="coarse_coding",
                 log_f0_conditioning=True,
                 pitch_idx=None,
                 num_windows=3,
                 post_filter=True,
                 sample_rate=48000,
                 frame_period=5,
                 relative_f0=True):
    windows = get_windows(num_windows)

    # Apply MLPG if necessary
    if np.any(has_dynamic_features):
        static_stream_sizes = get_static_stream_sizes(stream_sizes,
                                                      has_dynamic_features,
                                                      len(windows))
    else:
        static_stream_sizes = stream_sizes

    # Split multi-stream features
    mgc, target_f0, vuv, bap = split_streams(acoustic_features,
                                             static_stream_sizes)

    # Generate waveform with the WORLD vocoder
    fftlen = pyworld.get_cheaptrick_fft_size(sample_rate)
    alpha = pysptk.util.mcepalpha(sample_rate)

    if post_filter:
        mgc = merlin_post_filter(mgc, alpha)

    spectrogram = pysptk.mc2sp(mgc, fftlen=fftlen, alpha=alpha)
    aperiodicity = pyworld.decode_aperiodicity(bap.astype(np.float64),
                                               sample_rate, fftlen)

    # fill aperiodicity with ones for unvoiced regions
    aperiodicity[vuv.reshape(-1) < 0.5, :] = 1.0
    # WORLD fails catastrophically for out of range aperiodicity
    aperiodicity = np.clip(aperiodicity, 0.0, 1.0)

    ### F0 ###
    if relative_f0:
        diff_lf0 = target_f0
        # need to extract pitch sequence from the musical score
        linguistic_features = fe.linguistic_features(
            labels,
            binary_dict,
            continuous_dict,
            add_frame_features=True,
            subphone_features=subphone_features)
        f0_score = _midi_to_hz(linguistic_features, pitch_idx, False)[:, None]
        lf0_score = f0_score.copy()
        nonzero_indices = np.nonzero(lf0_score)
        lf0_score[nonzero_indices] = np.log(f0_score[nonzero_indices])
        lf0_score = interp1d(lf0_score, kind="slinear")

        f0 = diff_lf0 + lf0_score
        f0[vuv < 0.5] = 0
        f0[np.nonzero(f0)] = np.exp(f0[np.nonzero(f0)])
    else:
        f0 = target_f0
        f0[vuv < 0.5] = 0
        f0[np.nonzero(f0)] = np.exp(f0[np.nonzero(f0)])

    generated_waveform = pyworld.synthesize(f0.flatten().astype(np.float64),
                                            spectrogram.astype(np.float64),
                                            aperiodicity.astype(np.float64),
                                            sample_rate, frame_period)

    # Reduce the volume to prevent clipping
    # TODO: pick a better scaling constant here
    spectrogram *= 0.000000001
    sp = pyworld.code_spectral_envelope(spectrogram, sample_rate, 60)

    return f0, sp, bap, generated_waveform
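For reference, the WORLD synthesis call used above can be exercised on its own. Below is a self-contained sketch with toy, hand-made parameters instead of decoded model outputs (a flat 220 Hz pitch, a quiet flat spectral envelope, and fully periodic excitation):

import numpy as np
import pyworld

fs = 48000
frame_period = 5.0
fft_size = pyworld.get_cheaptrick_fft_size(fs)
n_frames = 200  # 200 frames * 5 ms = 1 second

f0 = np.full(n_frames, 220.0)                               # constant 220 Hz pitch
spectrogram = np.full((n_frames, fft_size // 2 + 1), 1e-6)  # flat, quiet spectrum
aperiodicity = np.zeros((n_frames, fft_size // 2 + 1))      # fully periodic
wav = pyworld.synthesize(f0, spectrogram, aperiodicity, fs, frame_period)
# In the examples above, f0/spectrogram/aperiodicity come from the model instead.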
Example #24
def gen_spsvs_static_features(
    labels,
    acoustic_features,
    binary_dict,
    numeric_dict,
    stream_sizes,
    has_dynamic_features,
    subphone_features="coarse_coding",
    pitch_idx=None,
    num_windows=3,
    frame_period=5,
    relative_f0=True,
    vibrato_scale=1.0,
    vuv_threshold=0.3,
    force_fix_vuv=True,
):
    """Generate static features from predicted acoustic features

    Args:
        labels (HTSLabelFile): HTS labels
        acoustic_features (ndarray): predicted acoustic features
        binary_dict (dict): binary feature dictionary
        numeric_dict (dict): numeric feature dictionary
        stream_sizes (list): stream sizes
        has_dynamic_features (list): whether each stream has dynamic features
        subphone_features (str): subphone feature type
        pitch_idx (int): index of pitch features
        num_windows (int): number of windows
        frame_period (float): frame period
        relative_f0 (bool): whether to use relative f0
        vibrato_scale (float): vibrato scale
        vuv_threshold (float): vuv threshold
        force_fix_vuv (bool): whether to use post-processing to fix VUV.

    Returns:
        tuple: tuple of mgc, lf0, vuv and bap.
    """
    if np.any(has_dynamic_features):
        static_stream_sizes = get_static_stream_sizes(
            stream_sizes, has_dynamic_features, num_windows
        )
    else:
        static_stream_sizes = stream_sizes

    # Copy here to avoid inplace operations on input acoustic features
    acoustic_features = acoustic_features.copy()

    # Split multi-stream features
    streams = split_streams(acoustic_features, static_stream_sizes)

    if len(streams) == 4:
        mgc, target_f0, vuv, bap = streams
        vib, vib_flags = None, None
    elif len(streams) == 5:
        # Assuming diff-based vibrato parameters
        mgc, target_f0, vuv, bap, vib = streams
        vib_flags = None
    elif len(streams) == 6:
        # Assuming sine-based vibrato parameters
        mgc, target_f0, vuv, bap, vib, vib_flags = streams
    else:
        raise RuntimeError("Not supported streams")

    linguistic_features = fe.linguistic_features(
        labels,
        binary_dict,
        numeric_dict,
        add_frame_features=True,
        subphone_features=subphone_features,
    )

    # Correct V/UV based on special phone flags
    if force_fix_vuv:
        vuv = correct_vuv_by_phone(vuv, binary_dict, linguistic_features)

    # F0
    if relative_f0:
        diff_lf0 = target_f0
        f0_score = _midi_to_hz(linguistic_features, pitch_idx, False)[:, None]
        lf0_score = f0_score.copy()
        nonzero_indices = np.nonzero(lf0_score)
        lf0_score[nonzero_indices] = np.log(f0_score[nonzero_indices])
        lf0_score = interp1d(lf0_score, kind="slinear")

        f0 = diff_lf0 + lf0_score
        f0[vuv < vuv_threshold] = 0
        f0[np.nonzero(f0)] = np.exp(f0[np.nonzero(f0)])
    else:
        f0 = target_f0
        f0[vuv < vuv_threshold] = 0
        f0[np.nonzero(f0)] = np.exp(f0[np.nonzero(f0)])

    if vib is not None:
        if vib_flags is not None:
            # Generate sine-based vibrato
            vib_flags = vib_flags.flatten()
            m_a, m_f = vib[:, 0], vib[:, 1]

            # Fill zeros for non-vibrato frames
            m_a[vib_flags < 0.5] = 0
            m_f[vib_flags < 0.5] = 0

            # Gen vibrato
            sr_f0 = int(1 / (frame_period * 0.001))
            f0 = gen_sine_vibrato(f0.flatten(), sr_f0, m_a, m_f, vibrato_scale)
        else:
            # Generate diff-based vibrato
            f0 = f0.flatten() + vibrato_scale * vib.flatten()

    # NOTE: Back to log-domain for convenience
    lf0 = f0.copy()
    lf0[np.nonzero(lf0)] = np.log(f0[np.nonzero(lf0)])
    # NOTE: interpolation is necessary
    lf0 = interp1d(lf0, kind="slinear")

    lf0 = lf0[:, None] if len(lf0.shape) == 1 else lf0
    vuv = vuv[:, None] if len(vuv.shape) == 1 else vuv

    return mgc, lf0, vuv, bap
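In the relative-F0 branch above, absolute F0 is reconstructed by adding the predicted log-F0 difference to the (interpolated) log-F0 derived from the musical score and exponentiating the voiced frames only. A small numpy-only sketch of that reconstruction with toy values (diff_lf0, f0_score and vuv below are made up, not real model outputs):

import numpy as np

diff_lf0 = np.array([0.02, -0.01, 0.0, 0.03])      # predicted log-F0 difference
f0_score = np.array([440.0, 440.0, 440.0, 440.0])  # score-derived F0 in Hz
vuv = np.array([0.9, 0.8, 0.1, 0.7])               # voiced/unvoiced predictions
vuv_threshold = 0.3

lf0_score = np.log(f0_score)                     # score F0 in log domain
f0 = diff_lf0 + lf0_score                        # predicted log-F0
f0[vuv < vuv_threshold] = 0                      # zero out unvoiced frames
f0[np.nonzero(f0)] = np.exp(f0[np.nonzero(f0)])  # back to Hz on voiced frames
# f0 is now approximately [448.9, 435.6, 0.0, 453.4]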
Example #25
    def collect_features(self, wav_path, label_path):
        labels = hts.load(label_path)
        l_features = fe.linguistic_features(
            labels,
            self.binary_dict,
            self.continuous_dict,
            add_frame_features=True,
            subphone_features="coarse_coding",
        )

        f0_score = _midi_to_hz(l_features, self.pitch_idx, False)
        notes = l_features[:, self.pitch_idx]
        notes = notes[notes > 0]

        # allow 200 cent upper/lower to properly handle F0 estimation of
        # preparation, vibrato and overshoot.
        # NOTE: set the minimum f0 to 63.5 Hz (125 - 3*20.5)
        # https://acoustics.jp/qanda/answer/50.html
        # NOTE: sinsy allows 30-150 cent frequency range for vibrato (as of 2010)
        # https://staff.aist.go.jp/m.goto/PAPER/SIGMUS201007oura.pdf
        min_f0 = max(63.5, librosa.midi_to_hz(min(notes) - 2))
        max_f0 = librosa.midi_to_hz(max(notes) + 2)
        assert max_f0 > min_f0

        # Workaround segfault issues of WORLD's CheapTrick
        min_f0 = min(min_f0, 500)

        fs, x = wavfile.read(wav_path)
        x = x.astype(np.float64)
        if fs != self.sample_rate:
            raise RuntimeError(
                "Sample rate mismatch! {} != {}".format(fs, self.sample_rate)
            )

        if self.use_harvest:
            f0, timeaxis = pyworld.harvest(
                x, fs, frame_period=self.frame_period, f0_floor=min_f0, f0_ceil=max_f0
            )
        else:
            f0, timeaxis = pyworld.dio(
                x, fs, frame_period=self.frame_period, f0_floor=min_f0, f0_ceil=max_f0
            )
            f0 = pyworld.stonemask(x, f0, timeaxis, fs)

        # Workaround for https://github.com/r9y9/nnsvs/issues/7
        f0 = np.maximum(f0, 0)

        # Correct V/UV (and F0) based on the musical score information
        # treat frames where musical notes are not assigned as unvoiced
        if self.correct_vuv:
            # Use smoothed mask so that we don't mask out overshoot or something
            # that could happen at the start/end of notes
            # 0.5 sec. window (could be tuned for better results)
            win_length = int(0.5 / (self.frame_period * 0.001))
            mask = np.convolve(f0_score, np.ones(win_length) / win_length, "same")
            if len(f0) > len(mask):
                mask = np.pad(mask, (0, len(f0) - len(mask)), "constant")
            elif len(f0) < len(mask):
                mask = mask[: len(f0)]
            f0 = f0 * np.sign(mask)

        spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs, f0_floor=min_f0)
        aperiodicity = pyworld.d4c(x, f0, timeaxis, fs, threshold=self.d4c_threshold)

        f0 = f0[:, None]
        lf0 = f0.copy()
        nonzero_indices = np.nonzero(f0)
        lf0[nonzero_indices] = np.log(f0[nonzero_indices])
        if self.use_harvest:
            # https://github.com/mmorise/World/issues/35#issuecomment-306521887
            vuv = (aperiodicity[:, 0] < 0.5).astype(np.float32)[:, None]
        else:
            vuv = (lf0 != 0).astype(np.float32)

        # F0 -> continuous F0
        lf0 = interp1d(lf0, kind="slinear")

        # Vibrato parameter extraction
        sr_f0 = int(1 / (self.frame_period * 0.001))
        if self.vibrato_mode == "sine":
            win_length = 64
            n_fft = 256
            threshold = 0.12

            if self.use_harvest:
                # NOTE: harvest is not supported here since the currently implemented algorithm
                # relies on v/uv flags to find vibrato sections.
                # We use DIO since it provides more accurate v/uv detection in my experience.
                _f0, _timeaxis = pyworld.dio(
                    x,
                    fs,
                    frame_period=self.frame_period,
                    f0_floor=min_f0,
                    f0_ceil=max_f0,
                )
                _f0 = pyworld.stonemask(x, _f0, _timeaxis, fs)
                f0_smooth = extract_smoothed_f0(_f0, sr_f0, cutoff=8)
            else:
                f0_smooth = extract_smoothed_f0(f0, sr_f0, cutoff=8)

            f0_smooth_cent = hz_to_cent_based_c4(f0_smooth)
            vibrato_likelihood = extract_vibrato_likelihood(
                f0_smooth_cent, sr_f0, win_length=win_length, n_fft=n_fft
            )
            vib_flags, m_a, m_f = extract_vibrato_parameters(
                f0_smooth_cent, vibrato_likelihood, sr_f0, threshold=threshold
            )
            m_a = interp1d(m_a, kind="linear")
            m_f = interp1d(m_f, kind="linear")
            vib = np.stack([m_a, m_f], axis=1)
            vib_flags = vib_flags[:, np.newaxis]
        elif self.vibrato_mode == "diff":
            # NOTE: vibrato is known to have 3 ~ 8 Hz range (in general)
            # remove higher frequency than 3 to separate vibrato from the original F0
            f0_smooth = extract_smoothed_f0(f0, sr_f0, cutoff=3)
            vib = (f0 - f0_smooth)[:, np.newaxis]
            vib_flags = None
        elif self.vibrato_mode == "none":
            vib, vib_flags = None, None
        else:
            raise RuntimeError("Unknown vibrato mode: {}".format(self.vibrato_mode))

        mgc = pysptk.sp2mc(
            spectrogram, order=self.mgc_order, alpha=pysptk.util.mcepalpha(fs)
        )

        # Post-processing for aperiodicity
        # ref: https://github.com/MTG/WGANSing/blob/mtg/vocoder.py
        if self.interp_unvoiced_aperiodicity:
            is_voiced = (vuv > 0).reshape(-1)
            if not np.any(is_voiced):
                pass  # all unvoiced, do nothing
            else:
                for k in range(aperiodicity.shape[1]):
                    aperiodicity[~is_voiced, k] = np.interp(
                        np.where(~is_voiced)[0],
                        np.where(is_voiced)[0],
                        aperiodicity[is_voiced, k],
                    )
        bap = pyworld.code_aperiodicity(aperiodicity, fs)

        # Parameter trajectory smoothing
        if self.trajectory_smoothing:
            modfs = int(1 / 0.005)
            for d in range(mgc.shape[1]):
                mgc[:, d] = lowpass_filter(
                    mgc[:, d], modfs, cutoff=self.trajectory_smoothing_cutoff
                )
            for d in range(bap.shape[1]):
                bap[:, d] = lowpass_filter(
                    bap[:, d], modfs, cutoff=self.trajectory_smoothing_cutoff
                )

        # Adjust lengths
        mgc = mgc[: labels.num_frames()]
        lf0 = lf0[: labels.num_frames()]
        vuv = vuv[: labels.num_frames()]
        bap = bap[: labels.num_frames()]
        vib = vib[: labels.num_frames()] if vib is not None else None
        vib_flags = vib_flags[: labels.num_frames()] if vib_flags is not None else None

        if self.relative_f0:
            # # F0 derived from the musical score
            f0_score = f0_score[:, None]
            if len(f0_score) > len(f0):
                print(
                    "Warning! likely to have mistakes in alignment in {}".format(
                        label_path
                    )
                )
                print(f0_score.shape, f0.shape)
                f0_score = f0_score[: len(f0)]

            lf0_score = f0_score.copy()
            nonzero_indices = np.nonzero(f0_score)
            lf0_score[nonzero_indices] = np.log(f0_score[nonzero_indices])
            lf0_score = interp1d(lf0_score, kind="slinear")
            # relative f0
            diff_lf0 = lf0 - lf0_score
            diff_lf0 = np.clip(diff_lf0, np.log(0.5), np.log(2.0))

            f0_target = diff_lf0
        else:
            f0_target = lf0

        mgc = apply_delta_windows(mgc, self.windows)
        f0_target = apply_delta_windows(f0_target, self.windows)
        bap = apply_delta_windows(bap, self.windows)
        vib = apply_delta_windows(vib, self.windows) if vib is not None else None

        if vib is None and vib_flags is None:
            features = np.hstack((mgc, f0_target, vuv, bap)).astype(np.float32)
        elif vib is not None and vib_flags is None:
            features = np.hstack((mgc, f0_target, vuv, bap, vib)).astype(np.float32)
        elif vib is not None and vib_flags is not None:
            features = np.hstack((mgc, f0_target, vuv, bap, vib, vib_flags)).astype(
                np.float32
            )
        else:
            raise RuntimeError("Unknown combination of features")

        # Align waveform and features
        wave = x.astype(np.float32) / 2 ** 15
        T = int(features.shape[0] * (fs * self.frame_period / 1000))
        if len(wave) < T:
            if T - len(wave) > int(fs * 0.005):
                print("Warn!!", T, len(wave), T - len(wave))
                print("you have unepxcted input. Please debug though ipdb")
                import ipdb

                ipdb.set_trace()
            else:
                pass
            wave = np.pad(wave, (0, T - len(wave)))
        assert wave.shape[0] >= T
        wave = wave[:T]

        return features, wave
Example #26
def predict_acoustic(
    device,
    labels,
    acoustic_model,
    acoustic_config,
    acoustic_in_scaler,
    acoustic_out_scaler,
    binary_dict,
    numeric_dict,
    subphone_features="coarse_coding",
    pitch_indices=None,
    log_f0_conditioning=True,
    force_clip_input_features=False,
):
    """Predict acoustic features from HTS labels

    MLPG is applied to the predicted features if the output features have
    dynamic features.

    Args:
        device (torch.device): device to use
        labels (HTSLabelFile): HTS labels
        acoustic_model (nn.Module): acoustic model
        acoustic_config (AcousticConfig): acoustic configuration
        acoustic_in_scaler (sklearn.preprocessing.StandardScaler): input scaler
        acoustic_out_scaler (sklearn.preprocessing.StandardScaler): output scaler
        binary_dict (dict): binary feature dictionary
        numeric_dict (dict): numeric feature dictionary
        subphone_features (str): subphone feature type
        pitch_indices (list): indices of pitch features
        log_f0_conditioning (bool): whether to use log f0 conditioning
        force_clip_input_features (bool): whether to force clip input features

    Returns:
        ndarray: predicted acoustic features
    """
    # Musical/linguistic features
    linguistic_features = fe.linguistic_features(
        labels,
        binary_dict,
        numeric_dict,
        add_frame_features=True,
        subphone_features=subphone_features,
    )

    if log_f0_conditioning:
        for idx in pitch_indices:
            linguistic_features[:, idx] = interp1d(
                _midi_to_hz(linguistic_features, idx, log_f0_conditioning),
                kind="slinear",
            )

    # Apply normalization
    linguistic_features = acoustic_in_scaler.transform(linguistic_features)
    if force_clip_input_features and isinstance(acoustic_in_scaler, MinMaxScaler):
        # clip to feature range (except for pitch-related features)
        non_pitch_indices = [
            idx
            for idx in range(linguistic_features.shape[1])
            if idx not in pitch_indices
        ]
        linguistic_features[:, non_pitch_indices] = np.clip(
            linguistic_features[:, non_pitch_indices],
            acoustic_in_scaler.feature_range[0],
            acoustic_in_scaler.feature_range[1],
        )

    # Predict acoustic features
    x = torch.from_numpy(linguistic_features).float().to(device)
    x = x.view(1, -1, x.size(-1))

    if acoustic_model.prediction_type() == PredictionType.PROBABILISTIC:
        # (B, T, D_out)
        max_mu, max_sigma = acoustic_model.inference(x, [x.shape[1]])
        if np.any(acoustic_config.has_dynamic_features):
            # Apply denormalization
            # (B, T, D_out) -> (T, D_out)
            max_sigma_sq = (
                max_sigma.squeeze(0).cpu().data.numpy() ** 2 * acoustic_out_scaler.var_
            )
            max_sigma_sq = np.maximum(max_sigma_sq, 1e-14)
            max_mu = acoustic_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy()
            )

            # (T, D_out) -> (T, static_dim)
            pred_acoustic = multi_stream_mlpg(
                max_mu,
                max_sigma_sq,
                get_windows(acoustic_config.num_windows),
                acoustic_config.stream_sizes,
                acoustic_config.has_dynamic_features,
            )
        else:
            # Apply denormalization
            pred_acoustic = acoustic_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy()
            )
    else:
        # (T, D_out)
        pred_acoustic = (
            acoustic_model.inference(x, [x.shape[1]]).squeeze(0).cpu().data.numpy()
        )
        # Apply denormalization
        pred_acoustic = acoustic_out_scaler.inverse_transform(pred_acoustic)
        if np.any(acoustic_config.has_dynamic_features):
            # (T, D_out) -> (T, static_dim)
            pred_acoustic = multi_stream_mlpg(
                pred_acoustic,
                acoustic_out_scaler.var_,
                get_windows(acoustic_config.num_windows),
                acoustic_config.stream_sizes,
                acoustic_config.has_dynamic_features,
            )

    return pred_acoustic
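The clipping step in predict_acoustic above intentionally skips the pitch-related columns so that interpolated log-F0 inputs may fall outside the scaler's training range. A self-contained numpy sketch of that column-wise clipping with toy values (the matrix, indices and feature range below are hypothetical):

import numpy as np

feats = np.array([[1.3, -0.2, 0.5],
                  [0.1,  1.8, 0.9]])
pitch_indices = [2]                 # columns left untouched
feature_range = (0.01, 0.99)        # hypothetical MinMaxScaler feature range

non_pitch = [i for i in range(feats.shape[1]) if i not in pitch_indices]
feats[:, non_pitch] = np.clip(feats[:, non_pitch], *feature_range)
# feats is now [[0.99, 0.01, 0.5],
#               [0.10, 0.99, 0.9]]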
Example #27
def predict_timelag(
    device,
    labels,
    timelag_model,
    timelag_config,
    timelag_in_scaler,
    timelag_out_scaler,
    binary_dict,
    numeric_dict,
    pitch_indices=None,
    log_f0_conditioning=True,
    allowed_range=None,
    allowed_range_rest=None,
    force_clip_input_features=False,
):
    """Predict time-lag from HTS labels

    Args:
        device (torch.device): device
        labels (nnmnkwii.io.hts.HTSLabelFile): HTS-style labels
        timelag_model (nn.Module): time-lag model
        timelag_config (dict): time-lag model config
        timelag_in_scaler (sklearn.preprocessing.MinMaxScaler): input scaler
        timelag_out_scaler (sklearn.preprocessing.MinMaxScaler): output scaler
        binary_dict (dict): binary feature dict
        numeric_dict (dict): numeric feature dict
        pitch_indices (list): indices of pitch features
        log_f0_conditioning (bool): whether to condition on log f0
        allowed_range (list): allowed range of time-lag
        allowed_range_rest (list): allowed range of time-lag for rest
        force_clip_input_features (bool): whether to clip input features

    Returns:
        ndarray: time-lag predictions
    """
    if allowed_range is None:
        allowed_range = [-20, 20]
    if allowed_range_rest is None:
        allowed_range_rest = [-40, 40]
    # round start/end times just in case.
    labels.round_()

    # Extract note-level labels
    note_indices = get_note_indices(labels)
    note_labels = labels[note_indices]

    # Extract musical/linguistic context
    timelag_linguistic_features = fe.linguistic_features(
        note_labels,
        binary_dict,
        numeric_dict,
        add_frame_features=False,
        subphone_features=None,
    ).astype(np.float32)

    # Adjust input features if we use log-f0 conditioning
    if log_f0_conditioning:
        if pitch_indices is None:
            raise ValueError("Pitch feature indices must be specified!")
        for idx in pitch_indices:
            timelag_linguistic_features[:, idx] = interp1d(
                _midi_to_hz(timelag_linguistic_features, idx, log_f0_conditioning),
                kind="slinear",
            )

    # Normalization
    timelag_linguistic_features = timelag_in_scaler.transform(
        timelag_linguistic_features
    )
    if force_clip_input_features and isinstance(timelag_in_scaler, MinMaxScaler):
        # clip to feature range (except for pitch-related features)
        non_pitch_indices = [
            idx
            for idx in range(timelag_linguistic_features.shape[1])
            if idx not in pitch_indices
        ]
        timelag_linguistic_features[:, non_pitch_indices] = np.clip(
            timelag_linguistic_features[:, non_pitch_indices],
            timelag_in_scaler.feature_range[0],
            timelag_in_scaler.feature_range[1],
        )

    # Run model
    x = torch.from_numpy(timelag_linguistic_features).unsqueeze(0).to(device)

    # Run model
    if timelag_model.prediction_type() == PredictionType.PROBABILISTIC:
        # (B, T, D_out)
        max_mu, max_sigma = timelag_model.inference(x, [x.shape[1]])
        if np.any(timelag_config.has_dynamic_features):
            # Apply denormalization
            # (B, T, D_out) -> (T, D_out)
            max_sigma_sq = (
                max_sigma.squeeze(0).cpu().data.numpy() ** 2 * timelag_out_scaler.var_
            )
            max_sigma_sq = np.maximum(max_sigma_sq, 1e-14)
            max_mu = timelag_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy()
            )
            # (T, D_out) -> (T, static_dim)
            pred_timelag = multi_stream_mlpg(
                max_mu,
                max_sigma_sq,
                get_windows(timelag_config.num_windows),
                timelag_config.stream_sizes,
                timelag_config.has_dynamic_features,
            )
        else:
            # Apply denormalization
            pred_timelag = timelag_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy()
            )
    else:
        # (T, D_out)
        pred_timelag = (
            timelag_model.inference(x, [x.shape[1]]).squeeze(0).cpu().data.numpy()
        )
        # Apply denormalization
        pred_timelag = timelag_out_scaler.inverse_transform(pred_timelag)
        if np.any(timelag_config.has_dynamic_features):
            # (T, D_out) -> (T, static_dim)
            pred_timelag = multi_stream_mlpg(
                pred_timelag,
                timelag_out_scaler.var_,
                get_windows(timelag_config.num_windows),
                timelag_config.stream_sizes,
                timelag_config.has_dynamic_features,
            )

    # Rounding
    pred_timelag = np.round(pred_timelag)

    # Clip to the allowed range
    for idx in range(len(pred_timelag)):
        if _is_silence(note_labels.contexts[idx]):
            pred_timelag[idx] = np.clip(
                pred_timelag[idx], allowed_range_rest[0], allowed_range_rest[1]
            )
        else:
            pred_timelag[idx] = np.clip(
                pred_timelag[idx], allowed_range[0], allowed_range[1]
            )

    # frames -> 100-ns units (one 5-ms frame corresponds to 50000 * 100 ns)
    pred_timelag *= 50000

    return pred_timelag
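Below is a small, self-contained numpy sketch (not part of the example above) of the post-processing that predict_timelag ends with: predictions are rounded to whole frames, clipped per note with a wider range for rests, and converted to HTS 100-ns units. The predictions and the is_rest mask are synthetic, and a 5 ms frame shift is assumed, which is what the 50000 multiplier encodes.

import numpy as np

# Synthetic per-note time-lag predictions, in frames.
pred_timelag = np.array([[3.2], [-55.7], [18.9], [41.0]])
# Hypothetical mask marking rest (silence) notes.
is_rest = np.array([False, True, False, True])

allowed_range = [-20, 20]        # frames, for sounding notes
allowed_range_rest = [-40, 40]   # frames, for rests

pred_timelag = np.round(pred_timelag)
for idx in range(len(pred_timelag)):
    lo, hi = allowed_range_rest if is_rest[idx] else allowed_range
    pred_timelag[idx] = np.clip(pred_timelag[idx], lo, hi)

# frames -> 100-ns units: one 5 ms frame is 50000 * 100 ns.
pred_timelag *= 50000
print(pred_timelag.ravel())  # 150000, -2000000, 950000, 2000000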
Exemple #28
0
def predict_duration(device,
                     labels,
                     duration_model,
                     duration_config,
                     duration_in_scaler,
                     duration_out_scaler,
                     lag,
                     binary_dict,
                     continuous_dict,
                     pitch_indices=None,
                     log_f0_conditioning=True):
    # Extract musical/linguistic features
    duration_linguistic_features = fe.linguistic_features(
        labels,
        binary_dict,
        continuous_dict,
        add_frame_features=False,
        subphone_features=None).astype(np.float32)

    if log_f0_conditioning:
        for idx in pitch_indices:
            duration_linguistic_features[:, idx] = interp1d(_midi_to_hz(
                duration_linguistic_features, idx, log_f0_conditioning),
                                                            kind="slinear")

    # Apply normalization
    duration_linguistic_features = duration_in_scaler.transform(
        duration_linguistic_features)
    if isinstance(duration_in_scaler, MinMaxScaler):
        # clip to feature range
        duration_linguistic_features = np.clip(
            duration_linguistic_features, duration_in_scaler.feature_range[0],
            duration_in_scaler.feature_range[1])

    # Apply model
    x = torch.from_numpy(duration_linguistic_features).float().to(device)
    x = x.view(1, -1, x.size(-1))

    if duration_model.prediction_type() == PredictionType.PROBABILISTIC:
        # (B, T, D_out)
        log_pi, log_sigma, mu = duration_model.inference(x, [x.shape[1]])
        if np.any(duration_config.has_dynamic_features):
            max_sigma, max_mu = mdn_get_most_probable_sigma_and_mu(
                log_pi, log_sigma, mu)
            # Apply denormalization
            # (B, T, D_out) -> (T, D_out)
            max_sigma_sq = max_sigma.squeeze(
                0).cpu().data.numpy()**2 * duration_out_scaler.var_
            max_mu = duration_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy())

            # (T, D_out) -> (T, static_dim)
            pred_durations = multi_stream_mlpg(
                max_mu, max_sigma_sq, get_windows(duration_config.num_windows),
                duration_config.stream_sizes,
                duration_config.has_dynamic_features)
        else:
            _, max_mu = mdn_get_most_probable_sigma_and_mu(
                log_pi, log_sigma, mu)
            # Apply denormalization
            pred_durations = duration_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy())
    else:
        # (T, D_out)
        pred_durations = duration_model.inference(
            x, [x.shape[1]]).squeeze(0).cpu().data.numpy()
        # Apply denormalization
        pred_durations = duration_out_scaler.inverse_transform(pred_durations)
        if np.any(duration_config.has_dynamic_features):
            # (T, D_out) -> (T, static_dim)
            pred_durations = multi_stream_mlpg(
                pred_durations, duration_out_scaler.var_,
                get_windows(duration_config.num_windows),
                duration_config.stream_sizes,
                duration_config.has_dynamic_features)

    pred_durations[pred_durations <= 0] = 1
    pred_durations = np.round(pred_durations)

    return pred_durations
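As a side note, the final denormalize / floor / round steps above can be illustrated in isolation; the sketch below uses a scikit-learn StandardScaler fitted on synthetic durations and hypothetical normalized network outputs, and is not taken from the example.

import numpy as np
from sklearn.preprocessing import StandardScaler

# Fit a scaler on synthetic phoneme durations (in frames).
train_durations = np.array([[5.0], [12.0], [30.0], [8.0], [20.0]])
duration_out_scaler = StandardScaler().fit(train_durations)

# Pretend these are normalized model outputs of shape (T, 1).
normalized_pred = np.array([[-1.8], [0.3], [2.1]])

pred_durations = duration_out_scaler.inverse_transform(normalized_pred)
pred_durations[pred_durations <= 0] = 1   # every phoneme lasts at least one frame
pred_durations = np.round(pred_durations)
print(pred_durations.ravel())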
Exemple #29
0
    def collect_features(self, wav_path, label_path):
        labels = hts.load(label_path)
        l_features = fe.linguistic_features(
            labels, self.binary_dict, self.continuous_dict,
            add_frame_features=True,
            subphone_features="coarse_coding")

        f0_score = _midi_to_hz(l_features, self.pitch_idx, False)
        notes = l_features[:, self.pitch_idx]
        notes = notes[notes > 0]
        # allow 1-tone upper/lower
        min_f0 = librosa.midi_to_hz(min(notes) - 2)
        max_f0 = librosa.midi_to_hz(max(notes) + 2)
        assert max_f0 > min_f0

        fs, x = wavfile.read(wav_path)
        x = x.astype(np.float64)

        if self.use_harvest:
            f0, timeaxis = pyworld.harvest(x, fs, frame_period=self.frame_period,
                                           f0_floor=min_f0, f0_ceil=max_f0)
        else:
            f0, timeaxis = pyworld.dio(x, fs, frame_period=self.frame_period,
                                       f0_floor=min_f0, f0_ceil=max_f0)
            f0 = pyworld.stonemask(x, f0, timeaxis, fs)
        spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs, f0_floor=self.f0_floor)
        aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

        bap = pyworld.code_aperiodicity(aperiodicity, fs)
        mgc = pysptk.sp2mc(spectrogram, order=self.mgc_order,
                           alpha=pysptk.util.mcepalpha(fs))
        # F0 of speech
        f0 = f0[:, None]
        lf0 = f0.copy()
        nonzero_indices = np.nonzero(f0)
        lf0[nonzero_indices] = np.log(f0[nonzero_indices])
        if self.use_harvest:
            # https://github.com/mmorise/World/issues/35#issuecomment-306521887
            vuv = (aperiodicity[:, 0] < 0.5).astype(np.float32)[:, None]
        else:
            vuv = (lf0 != 0).astype(np.float32)
        lf0 = interp1d(lf0, kind="slinear")

        # Adjust lengths
        mgc = mgc[:labels.num_frames()]
        lf0 = lf0[:labels.num_frames()]
        vuv = vuv[:labels.num_frames()]
        bap = bap[:labels.num_frames()]

        if self.relative_f0:
            # F0 derived from the musical score
            f0_score = f0_score[:, None]
            lf0_score = f0_score.copy()
            nonzero_indices = np.nonzero(f0_score)
            lf0_score[nonzero_indices] = np.log(f0_score[nonzero_indices])
            lf0_score = interp1d(lf0_score, kind="slinear")
            # relative f0
            diff_lf0 = lf0 - lf0_score
            diff_lf0 = np.clip(diff_lf0, np.log(0.5), np.log(2.0))

            f0_target = diff_lf0
        else:
            f0_target = lf0

        mgc = apply_delta_windows(mgc, self.windows)
        f0_target = apply_delta_windows(f0_target, self.windows)
        bap = apply_delta_windows(bap, self.windows)

        features = np.hstack((mgc, f0_target, vuv, bap)).astype(np.float32)

        # Align waveform and features
        wave = x.astype(np.float32) / 2**15
        T = int(features.shape[0] * (fs * self.frame_period / 1000))
        if len(wave) < T:
            if T - len(wave) > 100:
                print("Warn!!", T, len(wave), T - len(wave))
                print("You have unexpected input. Please debug through ipdb.")
                import ipdb; ipdb.set_trace()
            wave = np.pad(wave, (0, T - len(wave)))
        assert wave.shape[0] >= T
        wave = wave[:T]

        return features, wave
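The relative-F0 branch above can be reduced to a few lines; the numpy-only sketch below (not from the example) expresses a synthetic sung log-F0 relative to a score-derived log-F0 and clips it to one octave in either direction, omitting the V/UV handling and interpolation of the original code.

import numpy as np

# Synthetic F0 contours in Hz: sung pitch vs. the pitch written in the score.
f0_sung = np.array([220.0, 230.0, 600.0, 215.0])[:, None]
f0_score = np.array([220.0, 220.0, 220.0, 220.0])[:, None]

lf0 = np.log(f0_sung)
lf0_score = np.log(f0_score)

# Relative log-F0, limited to +/- one octave (a factor of 2 in Hz).
diff_lf0 = np.clip(lf0 - lf0_score, np.log(0.5), np.log(2.0))
print(np.exp(diff_lf0).ravel())  # frequency ratios: 1.0, ~1.045, 2.0 (clipped), ~0.977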
Exemple #30
0
def predict_acoustic(device,
                     labels,
                     acoustic_model,
                     acoustic_config,
                     acoustic_in_scaler,
                     acoustic_out_scaler,
                     binary_dict,
                     continuous_dict,
                     subphone_features="coarse_coding",
                     pitch_indices=None,
                     log_f0_conditioning=True):

    # Musical/linguistic features
    linguistic_features = fe.linguistic_features(
        labels,
        binary_dict,
        continuous_dict,
        add_frame_features=True,
        subphone_features=subphone_features)

    if log_f0_conditioning:
        for idx in pitch_indices:
            linguistic_features[:, idx] = interp1d(_midi_to_hz(
                linguistic_features, idx, log_f0_conditioning),
                                                   kind="slinear")

    # Apply normalization
    linguistic_features = acoustic_in_scaler.transform(linguistic_features)
    if isinstance(acoustic_in_scaler, MinMaxScaler):
        # clip to feature range
        linguistic_features = np.clip(linguistic_features,
                                      acoustic_in_scaler.feature_range[0],
                                      acoustic_in_scaler.feature_range[1])

    # Predict acoustic features
    x = torch.from_numpy(linguistic_features).float().to(device)
    x = x.view(1, -1, x.size(-1))

    if acoustic_model.prediction_type() == PredictionType.PROBABILISTIC:
        log_pi, log_sigma, mu = acoustic_model.inference(x, [x.shape[1]])
        if np.any(acoustic_config.has_dynamic_features):
            # (B, T, D_out)
            max_sigma, max_mu = mdn_get_most_probable_sigma_and_mu(
                log_pi, log_sigma, mu)

            # Apply denormalization
            # (B, T, D_out) -> (T, D_out)
            max_sigma_sq = max_sigma.squeeze(
                0).cpu().data.numpy()**2 * acoustic_out_scaler.var_
            max_mu = acoustic_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy())

            # (T, D_out) -> (T, static_dim)
            pred_acoustic = multi_stream_mlpg(
                max_mu, max_sigma_sq, get_windows(acoustic_config.num_windows),
                acoustic_config.stream_sizes,
                acoustic_config.has_dynamic_features)
        else:
            _, max_mu = mdn_get_most_probable_sigma_and_mu(
                log_pi, log_sigma, mu)
            # Apply denormalization
            pred_acoustic = acoustic_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy())
    else:
        # (T, D_out)
        pred_acoustic = acoustic_model.inference(
            x, [x.shape[1]]).squeeze(0).cpu().data.numpy()
        # Apply denormalization
        pred_acoustic = acoustic_out_scaler.inverse_transform(pred_acoustic)
        if np.any(acoustic_config.has_dynamic_features):
            # (T, D_out) -> (T, static_dim)
            pred_acoustic = multi_stream_mlpg(
                pred_acoustic, acoustic_out_scaler.var_,
                get_windows(acoustic_config.num_windows),
                acoustic_config.stream_sizes,
                acoustic_config.has_dynamic_features)

    return pred_acoustic
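The clipping applied after normalization above only matters for MinMaxScaler inputs; the short scikit-learn sketch below (synthetic data, not from the example) shows how features from unseen contexts can leave the scaler's feature range and are pushed back into it.

import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Fit the input scaler on synthetic training features of shape (T, D).
train_feats = np.array([[0.0, 10.0], [1.0, 20.0], [2.0, 30.0]])
acoustic_in_scaler = MinMaxScaler(feature_range=(0.01, 0.99)).fit(train_feats)

# A test utterance containing values outside the training range.
test_feats = np.array([[3.0, 5.0]])
scaled = acoustic_in_scaler.transform(test_feats)   # falls outside (0.01, 0.99)
scaled = np.clip(scaled,
                 acoustic_in_scaler.feature_range[0],
                 acoustic_in_scaler.feature_range[1])
print(scaled)  # both values clipped back into [0.01, 0.99]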
Exemple #31
0
    def collect_features(self, wav_path, label_path):
        labels = hts.load(label_path)
        l_features = fe.linguistic_features(labels,
                                            self.binary_dict,
                                            self.continuous_dict,
                                            add_frame_features=True,
                                            subphone_features="coarse_coding")

        f0_score = midi_to_hz(l_features, self.pitch_idx, False)
        # TODO: the F0 search margins below should be set more carefully
        max_f0 = int(max(f0_score)) + 100
        min_f0 = int(max(self.f0_floor, min(f0_score[f0_score > 0]) - 20))
        assert max_f0 > min_f0

        fs, x = wavfile.read(wav_path)
        x = x.astype(np.float64)

        if self.use_harvest:
            f0, timeaxis = pyworld.harvest(x,
                                           fs,
                                           frame_period=self.frame_period,
                                           f0_floor=min_f0,
                                           f0_ceil=max_f0)
        else:
            f0, timeaxis = pyworld.dio(x,
                                       fs,
                                       frame_period=self.frame_period,
                                       f0_floor=min_f0,
                                       f0_ceil=max_f0)
            f0 = pyworld.stonemask(x, f0, timeaxis, fs)
        spectrogram = pyworld.cheaptrick(x,
                                         f0,
                                         timeaxis,
                                         fs,
                                         f0_floor=self.f0_floor)
        aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

        bap = pyworld.code_aperiodicity(aperiodicity, fs)
        mgc = pysptk.sp2mc(spectrogram,
                           order=self.mgc_order,
                           alpha=pysptk.util.mcepalpha(fs))
        # F0 of speech
        f0 = f0[:, None]
        lf0 = f0.copy()
        nonzero_indices = np.nonzero(f0)
        lf0[nonzero_indices] = np.log(f0[nonzero_indices])
        if self.use_harvest:
            # https://github.com/mmorise/World/issues/35#issuecomment-306521887
            vuv = (aperiodicity[:, 0] < 0.5).astype(np.float32)[:, None]
        else:
            vuv = (lf0 != 0).astype(np.float32)
        lf0 = interp1d(lf0, kind="slinear")

        # F0 derived from the musical score
        f0_score = f0_score[:, None]
        lf0_score = f0_score.copy()
        nonzero_indices = np.nonzero(f0_score)
        lf0_score[nonzero_indices] = np.log(f0_score[nonzero_indices])
        lf0_score = interp1d(lf0_score, kind="slinear")

        # Adjust lengths
        mgc = mgc[:labels.num_frames()]
        lf0 = lf0[:labels.num_frames()]
        vuv = vuv[:labels.num_frames()]
        bap = bap[:labels.num_frames()]

        diff_lf0 = lf0 - lf0_score
        diff_lf0 = np.clip(diff_lf0, np.log(0.5), np.log(2.0))

        mgc = apply_delta_windows(mgc, self.windows)
        diff_lf0 = apply_delta_windows(diff_lf0, self.windows)
        bap = apply_delta_windows(bap, self.windows)

        features = np.hstack((mgc, diff_lf0, vuv, bap))

        return features.astype(np.float32)
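For intuition only, the numpy sketch below approximates what the delta windows appended by apply_delta_windows add to a single static stream: a central first difference and a second difference stacked next to the static values. The trajectory is synthetic and the edge handling is simplified, so it is not a drop-in replacement for the library call.

import numpy as np

# A short synthetic static feature trajectory (T,).
static = np.array([1.0, 2.0, 4.0, 7.0, 11.0])

# Central first difference: 0.5 * (x[t+1] - x[t-1]) on interior frames.
delta = np.convolve(static, [0.5, 0.0, -0.5], mode="same")
# Second difference: x[t-1] - 2 * x[t] + x[t+1] on interior frames.
delta2 = np.convolve(static, [1.0, -2.0, 1.0], mode="same")

# Stack to (T, 3): static, delta, delta-delta, as later consumed by MLPG.
features = np.stack([static, delta, delta2], axis=1)
print(features)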
Exemple #32
0
def predict_duration(
    device,
    labels,
    duration_model,
    duration_config,
    duration_in_scaler,
    duration_out_scaler,
    binary_dict,
    numeric_dict,
    pitch_indices=None,
    log_f0_conditioning=True,
    force_clip_input_features=False,
):
    """Predict phoneme durations from HTS labels

    Args:
        device (torch.device): device to run the model on
        labels (nnmnkwii.io.hts.HTSLabelFile): labels
        duration_model (nn.Module): duration model
        duration_config (dict): duration config
        duration_in_scaler (sklearn.preprocessing.MinMaxScaler): duration input scaler
        duration_out_scaler (sklearn.preprocessing.MinMaxScaler): duration output scaler
        binary_dict (dict): binary feature dictionary
        numeric_dict (dict): numeric feature dictionary
        pitch_indices (list): indices of pitch features
        log_f0_conditioning (bool): whether to use log-f0 conditioning
        force_clip_input_features (bool): whether to clip input features

    Returns:
        np.ndarray: predicted durations. Note that for probabilistic duration
        models, a tuple of denormalized (max_mu, max_sigma_sq) is returned instead.
    """
    # Extract musical/linguistic features
    duration_linguistic_features = fe.linguistic_features(
        labels,
        binary_dict,
        numeric_dict,
        add_frame_features=False,
        subphone_features=None,
    ).astype(np.float32)

    if log_f0_conditioning:
        for idx in pitch_indices:
            duration_linguistic_features[:, idx] = interp1d(
                _midi_to_hz(duration_linguistic_features, idx, log_f0_conditioning),
                kind="slinear",
            )

    # Apply normalization
    duration_linguistic_features = duration_in_scaler.transform(
        duration_linguistic_features
    )
    if force_clip_input_features and isinstance(duration_in_scaler, MinMaxScaler):
        # clip to feature range (except for pitch-related features)
        non_pitch_indices = [
            idx
            for idx in range(duration_linguistic_features.shape[1])
            if idx not in pitch_indices
        ]
        duration_linguistic_features[:, non_pitch_indices] = np.clip(
            duration_linguistic_features[:, non_pitch_indices],
            duration_in_scaler.feature_range[0],
            duration_in_scaler.feature_range[1],
        )

    # Apply model
    x = torch.from_numpy(duration_linguistic_features).float().to(device)
    x = x.view(1, -1, x.size(-1))

    if duration_model.prediction_type() == PredictionType.PROBABILISTIC:
        # (B, T, D_out)
        max_mu, max_sigma = duration_model.inference(x, [x.shape[1]])
        if np.any(duration_config.has_dynamic_features):
            raise RuntimeError(
                "Dynamic features are not supported for duration modeling"
            )
        # Apply denormalization
        max_sigma_sq = (
            max_sigma.squeeze(0).cpu().data.numpy() ** 2 * duration_out_scaler.var_
        )
        max_sigma_sq = np.maximum(max_sigma_sq, 1e-14)
        max_mu = duration_out_scaler.inverse_transform(
            max_mu.squeeze(0).cpu().data.numpy()
        )

        return max_mu, max_sigma_sq
    else:
        # (T, D_out)
        pred_durations = (
            duration_model.inference(x, [x.shape[1]]).squeeze(0).cpu().data.numpy()
        )
        # Apply denormalization
        pred_durations = duration_out_scaler.inverse_transform(pred_durations)
        if np.any(duration_config.has_dynamic_features):
            # (T, D_out) -> (T, static_dim)
            pred_durations = multi_stream_mlpg(
                pred_durations,
                duration_out_scaler.var_,
                get_windows(duration_config.num_windows),
                duration_config.stream_sizes,
                duration_config.has_dynamic_features,
            )

    pred_durations[pred_durations <= 0] = 1
    pred_durations = np.round(pred_durations)

    return pred_durations
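Finally, the variance handling in the probabilistic branch relies on a simple identity: for a StandardScaler, a standard deviation predicted in normalized space maps back to the original scale as sigma**2 * scaler.var_. The sketch below (synthetic scaler fit and hypothetical MDN outputs, not from the example) spells that out.

import numpy as np
from sklearn.preprocessing import StandardScaler

# Fit an output scaler on synthetic durations so that var_ is known.
train = np.array([[5.0], [12.0], [30.0], [8.0], [20.0]])
duration_out_scaler = StandardScaler().fit(train)

# Hypothetical MDN outputs in normalized space: mean and std per phoneme.
norm_mu = np.array([[0.2], [-0.7]])
norm_sigma = np.array([[0.5], [0.8]])

# Back to the original duration scale.
max_mu = duration_out_scaler.inverse_transform(norm_mu)
max_sigma_sq = np.maximum(norm_sigma ** 2 * duration_out_scaler.var_, 1e-14)
print(max_mu.ravel(), max_sigma_sq.ravel())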