Example #1
 def __init__(self,
              add_frame_features=False,
              subphone_features=None,
              use_phone_alignment=False,
              question_path=None):
     self.add_frame_features = add_frame_features
     self.subphone_features = subphone_features
     self.test_paths = None
     self.use_phone_alignment = use_phone_alignment
     if question_path is None:
         self.binary_dict, self.continuous_dict = hts.load_question_set(
             join(DATA_ROOT, "questions-radio_dnn_416.hed"))
     else:
         self.binary_dict, self.continuous_dict = hts.load_question_set(
             question_path)
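All the snippets on this page share the same basic pattern: load a question set with hts.load_question_set, load an HTS-style label file, and turn the pair into numeric features. A minimal sketch of that pattern, with hypothetical file names:

from os.path import join

from nnmnkwii.io import hts
from nnmnkwii.frontend import merlin as fe

DATA_ROOT = "./data"  # hypothetical data directory

binary_dict, continuous_dict = hts.load_question_set(
    join(DATA_ROOT, "questions-radio_dnn_416.hed"))
labels = hts.load(join(DATA_ROOT, "arctic_a0001.lab"))
# phone-level features by default (add_frame_features=False)
features = fe.linguistic_features(labels, binary_dict, continuous_dict)
print(features.shape)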
Example #2
    def __init__(self,
                 wav_root,
                 label_root,
                 question_path,
                 use_harvest=True,
                 f0_floor=150,
                 f0_ceil=700,
                 frame_period=5,
                 mgc_order=59):
        self.wav_root = wav_root
        self.label_root = label_root
        self.binary_dict, self.continuous_dict = hts.load_question_set(
            question_path, append_hat_for_LL=False)
        self.pitch_idx = len(self.binary_dict) + 1
        self.use_harvest = use_harvest
        self.f0_floor = f0_floor
        self.f0_ceil = f0_ceil
        self.frame_period = frame_period
        self.mgc_order = mgc_order

        self.windows = [
            (0, 0, np.array([1.0])),             # static
            (1, 1, np.array([-0.5, 0.0, 0.5])),  # delta
            (1, 1, np.array([1.0, -2.0, 1.0])),  # delta-delta
        ]
Example #3
def _process_feature(out_dir,
                     index,
                     label_path,
                     add_frame_features=False,
                     subphone_features=None,
                     question_path=None):

    labels = hts.load(label_path)
    binary_dict, continuous_dict = hts.load_question_set(question_path)
    features = fe.linguistic_features(labels,
                                      binary_dict,
                                      continuous_dict,
                                      add_frame_features=add_frame_features,
                                      subphone_features=subphone_features)
    n_frames = len(features)
    if add_frame_features:
        indices = labels.silence_frame_indices().astype(int)
    else:
        indices = labels.silence_phone_indices()
    features = np.delete(features, indices, axis=0)
    voiced_frames = features.shape[0]

    # Write the linguistic features to disk:
    linguistic_filename = 'arctic_%05d.npy' % index
    np.save(os.path.join(out_dir, linguistic_filename),
            features.astype(np.float32),
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (linguistic_filename, n_frames, voiced_frames)
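A possible driver for _process_feature, assuming the imports used above (os, numpy, hts, fe) are in scope, the output directory exists, and the paths are placeholders:

# hypothetical paths; writes out/arctic_00001.npy
filename, n_frames, voiced_frames = _process_feature(
    "out", 1, "arctic_a0001.lab", question_path="questions.hed")
print(filename, n_frames, voiced_frames)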
Example #4
def get_linguistic_feature(lab_path, question_path, level='phone'):
    if level == 'phone':
        add_frame_features = False
        subphone_features = None
    elif level == 'frame':
        add_frame_features = True
        subphone_features = 'coarse_coding'
    else:
        raise ValueError(
            f'phone and frame are supported, but level={level} is given.')

    binary_dict, continuous_dict = hts.load_question_set(question_path)
    labels = hts.load(lab_path)
    feature = fe.linguistic_features(labels,
                                     binary_dict,
                                     continuous_dict,
                                     add_frame_features=add_frame_features,
                                     subphone_features=subphone_features)

    if add_frame_features:
        indices = labels.silence_frame_indices().astype(int)
    else:
        indices = labels.silence_phone_indices()
    feature = np.delete(feature, indices, axis=0)

    return feature.astype(np.float32)
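A hedged usage sketch for the helper above; the label and question file paths are placeholders:

# one row per phone
phone_feats = get_linguistic_feature("arctic_a0001.lab", "questions.hed", level="phone")
# one row per frame, with coarse-coding subphone features
frame_feats = get_linguistic_feature("arctic_a0001.lab", "questions.hed", level="frame")
print(phone_feats.shape, frame_feats.shape)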
Example #5
def test_linguistic_and_duration_features_for_duration_model():
    qs_file_name = join(DATA_DIR, "questions-radio_dnn_416.hed")
    binary_dict, continuous_dict = hts.load_question_set(qs_file_name)

    # Phone-level linguistic features
    input_state_label = join(DATA_DIR, "label_state_align", "arctic_a0001.lab")
    labels = hts.load(input_state_label)
    assert labels.is_state_alignment_label()
    x = fe.linguistic_features(labels,
                               binary_dict,
                               continuous_dict,
                               add_frame_features=False,
                               subphone_features=None)
    y = np.fromfile(join(DATA_DIR, "binary_label_416", "arctic_a0001.lab"),
                    dtype=np.float32).reshape(-1, x.shape[-1])
    assert np.allclose(x, y)

    # Duration features
    labels = hts.load(input_state_label)
    x = fe.duration_features(labels,
                             feature_type="numerical",
                             unit_size="state",
                             feature_size="phoneme")
    y = np.fromfile(join(DATA_DIR, "duration_untrimmed", "arctic_a0001.dur"),
                    dtype=np.float32).reshape(-1, x.shape[-1])

    assert np.allclose(x, y)
Example #6
def test_singing_voice_question():
    # Test SVS case
    """
QS "L-Phone_Yuusei_Boin"           {*^a-*,*^i-*,*^u-*,*^e-*,*^o-*}
CQS "e1" {/E:(\\NOTE)]}
    """
    binary_dict, continuous_dict = hts.load_question_set(
        join(DATA_DIR, "test_jp_svs.hed"),
        append_hat_for_LL=False,
        convert_svs_pattern=True)
    input_phone_label = join(DATA_DIR, "song070_f00001_063.lab")
    labels = hts.load(input_phone_label)
    feats = fe.linguistic_features(labels, binary_dict, continuous_dict)
    assert feats.shape == (74, 3)

    # CQS e1: get the current MIDI number
    C_e1 = continuous_dict[0]
    for idx, lab in enumerate(labels):
        context = lab[-1]
        if C_e1.search(context) is not None:
            from nnmnkwii.frontend import NOTE_MAPPING
            assert NOTE_MAPPING[C_e1.findall(context)[0]] == feats[idx, 1]

    # CQS e57: get pitch diff
    # In contrast to other continuous features, the pitch diff has a prefix "m" or "p"
    # to indicate the sign of the number.
    C_e57 = continuous_dict[1]
    for idx, lab in enumerate(labels):
        context = lab[-1]
        if "~p2+" in context:
            assert C_e57.search(context).group(1) == "p2"
            assert feats[idx, 2] == 2
        if "~m2+" in context:
            assert C_e57.search(context).group(1) == "m2"
            assert feats[idx, 2] == -2
Example #7
 def __init__(self,
              utt_list,
              wav_root,
              label_root,
              question_path,
              use_harvest=True,
              f0_floor=150,
              f0_ceil=700,
              frame_period=5,
              mgc_order=59,
              num_windows=3,
              relative_f0=True,
              interp_unvoiced_aperiodicity=True):
     self.utt_list = utt_list
     self.wav_root = wav_root
     self.label_root = label_root
     self.binary_dict, self.continuous_dict = hts.load_question_set(
         question_path, append_hat_for_LL=False)
     self.pitch_idx = len(self.binary_dict) + 1
     self.use_harvest = use_harvest
     self.f0_floor = f0_floor
     self.f0_ceil = f0_ceil
     self.frame_period = frame_period
     self.mgc_order = mgc_order
     self.relative_f0 = relative_f0
     self.interp_unvoiced_aperiodicity = interp_unvoiced_aperiodicity
     self.windows = get_windows(num_windows)
Example #8
 def __init__(self, data_root, max_files=None, add_frame_features=False,
              subphone_features=None):
     self.data_root = data_root
     self.max_files = max_files
     self.add_frame_features = add_frame_features
     self.subphone_features = subphone_features
     self.binary_dict, self.continuous_dict = hts.load_question_set(
         hp_acoustic.question_path)
Example #9
def test_labels_number_of_frames():
    # https://github.com/r9y9/nnmnkwii/issues/85
    binary_dict, continuous_dict = hts.load_question_set(
        join(DATA_DIR, "jp.hed"))
    labels = hts.load(join(DATA_DIR, "BASIC5000_0619.lab"))
    linguistic_features = fe.linguistic_features(
        labels, binary_dict, continuous_dict, add_frame_features=True)
    assert labels.num_frames() == linguistic_features.shape[0]
Example #10
def load_qst(question_path, append_hat_for_LL=False) -> tuple:
    """
    Read a question.hed file and return
    binary_dict, continuous_dict, pitch_indices, and pitch_idx.
    """
    binary_dict, continuous_dict = hts.load_question_set(
        question_path, append_hat_for_LL=append_hat_for_LL)
    pitch_indices = np.arange(len(binary_dict), len(binary_dict) + 3)
    pitch_idx = len(binary_dict) + 1
    return (binary_dict, continuous_dict, pitch_indices, pitch_idx)
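Calling load_qst might look like this (the hed file name is a placeholder):

binary_dict, continuous_dict, pitch_indices, pitch_idx = load_qst("jp_svs.hed")
print(len(binary_dict), len(continuous_dict), pitch_indices, pitch_idx)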
Example #11
 def __init__(self, utt_list, data_root, question_path, add_frame_features=False,
             subphone_features=None, log_f0_conditioning=True):
     self.data_root = data_root
     self.utt_list = utt_list
     self.add_frame_features = add_frame_features
     self.subphone_features = subphone_features
     self.binary_dict, self.continuous_dict = hts.load_question_set(
         question_path, append_hat_for_LL=False)
     self.log_f0_conditioning = log_f0_conditioning
     self.pitch_idx = np.arange(len(self.binary_dict), len(self.binary_dict)+3)
Example #12
 def __init__(self, data_root=None, question_path=None, max_num_files=-1):
     # Build the *.lab file list
     self.input_lab_files = sorted(
         glob.glob(data_root + "/label_phone_align/*.lab"))  # all *.lab files
     if max_num_files != -1:
         self.input_lab_files = self.input_lab_files[:max_num_files]

     # Build the dictionaries from the *.hed question file
     self.binary_dict, self.continuous_dict = hts.load_question_set(question_path)
Example #13
def synthesis(config, device, label_path, question_path, timelag_model,
              timelag_config, timelag_in_scaler, timelag_out_scaler,
              duration_model, duration_config, duration_in_scaler,
              duration_out_scaler, acoustic_model, acoustic_config,
              acoustic_in_scaler, acoustic_out_scaler):
    # load labels and question
    labels = hts.load(label_path).round_()
    binary_dict, continuous_dict = hts.load_question_set(
        question_path, append_hat_for_LL=False)

    # pitch indices in the input features
    # TODO: make this configurable
    pitch_idx = len(binary_dict) + 1
    pitch_indices = np.arange(len(binary_dict), len(binary_dict) + 3)
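    # The binary (QS) features occupy the first len(binary_dict) columns of the
    # input vector and the numeric (CQS) features follow; the first three CQS
    # entries are assumed here to be the prev/current/next note pitch contexts.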

    log_f0_conditioning = config.log_f0_conditioning

    if config.ground_truth_duration:
        # Use provided alignment
        duration_modified_labels = labels
    else:
        # Time-lag
        lag = predict_timelag(device, labels, timelag_model, timelag_config,
                              timelag_in_scaler, timelag_out_scaler,
                              binary_dict, continuous_dict, pitch_indices,
                              log_f0_conditioning,
                              config.timelag.allowed_range)

        # Duration prediction
        durations = predict_duration(device, labels, duration_model,
                                     duration_config, duration_in_scaler,
                                     duration_out_scaler, lag, binary_dict,
                                     continuous_dict, pitch_indices,
                                     log_f0_conditioning)

        # Normalize phoneme durations
        duration_modified_labels = postprocess_duration(labels, durations, lag)

    # Predict acoustic features
    acoustic_features = predict_acoustic(
        device, duration_modified_labels, acoustic_model, acoustic_config,
        acoustic_in_scaler, acoustic_out_scaler, binary_dict, continuous_dict,
        config.acoustic.subphone_features, pitch_indices, log_f0_conditioning)

    # Waveform generation
    generated_waveform = gen_waveform(
        duration_modified_labels, acoustic_features, binary_dict,
        continuous_dict, acoustic_config.stream_sizes,
        acoustic_config.has_dynamic_features,
        config.acoustic.subphone_features, log_f0_conditioning, pitch_idx,
        acoustic_config.num_windows, config.acoustic.post_filter,
        config.sample_rate, config.frame_period, config.acoustic.relative_f0)

    return generated_waveform
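The function above chains the full inference pipeline: time-lag prediction, duration prediction, duration post-processing, acoustic feature prediction, and waveform generation.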
Example #14
def test_htk_style_question_basics():
    binary_dict, continuous_dict = hts.load_question_set(
        join(DATA_DIR, "test_question.hed"))
    # sil k o n i ch i w a sil
    input_phone_label = join(DATA_DIR, "hts-nit-atr503", "phrase01.lab")
    labels = hts.load(input_phone_label)

    # Test if we can handle wildcards correctly
    # and also basic phone contexts (LL, L, C, R, RR)
    """
QS "LL-Phone_Muon1"  {sil^,pau^}    # without wildcards (*)
QS "LL-Phone_Muon2"  {sil^*,pau^*}  # with *, should be equivalent with above
QS "L-Phone_Muon1"   {*^sil-*,*^pau-*}
QS "C-Phone_sil"     {*-sil+*}
QS "R-Phone_o"       {*+o=*}
QS "RR-Phone_o"      {*=o/A:*}
    """
    LL_muon1 = binary_dict[0][0]
    LL_muon2 = binary_dict[1][0]
    L_muon1 = binary_dict[2][0]
    C_sil = binary_dict[3][0]
    R_phone_o = binary_dict[4][0]
    RR_phone_o = binary_dict[5][0]

    # xx^xx-sil+k=o
    label = labels[0][-1]
    assert LL_muon1.search(label) is None
    assert LL_muon2.search(label) is None
    assert L_muon1.search(label) is None
    assert C_sil.search(label)
    assert R_phone_o.search(label) is None
    assert RR_phone_o.search(label)

    # xx^sil-k+o=N
    label = labels[1][-1]
    assert LL_muon1.search(label) is None
    assert LL_muon2.search(label) is None
    assert L_muon1.search(label)
    assert C_sil.search(label) is None
    assert R_phone_o.search(label)
    assert RR_phone_o.search(label) is None

    # sil^k-o+N=n
    label = labels[2][-1]
    assert LL_muon1.search(label)
    assert LL_muon2.search(label)
    assert L_muon1.search(label) is None
    assert C_sil.search(label) is None
    assert R_phone_o.search(label) is None
    assert RR_phone_o.search(label) is None

    # Slice/list indexing
    assert str(labels[:2]) == str(labels[[0, 1]])
Example #15
def test_correct_vuv_by_phone():
    wav_path = Path(__file__).parent / "data" / "nitech_jp_song070_f001_004.wav"
    lab_path = Path(__file__).parent / "data" / "nitech_jp_song070_f001_004.lab"

    binary_dict, numeric_dict = hts.load_question_set(
        Path(__file__).parent / "data" / "jp_test.hed"
    )

    labels = hts.load(lab_path)
    sr, wav = wavfile.read(wav_path)
    wav = wav.astype(np.float64)
    assert sr == 48000

    out_feats, stream_sizes = _extract_static_feats(wav, sr)
    has_dynamic_features = [False] * len(stream_sizes)
    pitch_idx = len(binary_dict) + 1

    linguistic_features = fe.linguistic_features(
        labels,
        binary_dict,
        numeric_dict,
        add_frame_features=True,
        subphone_features="coarse_coding",
    )

    params = {
        "labels": labels,
        "acoustic_features": out_feats,
        "binary_dict": binary_dict,
        "numeric_dict": numeric_dict,
        "stream_sizes": stream_sizes,
        "has_dynamic_features": has_dynamic_features,
        "pitch_idx": pitch_idx,
        "relative_f0": False,
        "frame_period": 5,
    }

    out_vuv_idx = 61
    vuv = out_feats[:, out_vuv_idx : out_vuv_idx + 1]

    vuv_corrected = correct_vuv_by_phone(vuv, binary_dict, linguistic_features)
    # correcting V/UV should make a difference
    _, _, vuv_fixed, _ = gen_spsvs_static_features(**{**params, "force_fix_vuv": True})
    assert np.any(vuv_corrected != vuv)

    # 0: Rest 1: Voiced 2: Unvoiced
    rest_idx = 0
    voiced_idx = 1
    unvoiced_idx = 2
    assert np.all(vuv_corrected[linguistic_features[:, rest_idx] > 0] < 0.5)
    assert np.all(vuv_corrected[linguistic_features[:, voiced_idx] > 0] > 0.5)
    assert np.all(vuv_corrected[linguistic_features[:, unvoiced_idx] > 0] < 0.5)
Example #16
def test_invalid_linguistic_features():
    binary_dict, continuous_dict = hts.load_question_set(
        example_question_file())
    phone_labels = hts.load(example_label_file(phone_level=True))
    state_labels = hts.load(example_label_file(phone_level=False))

    @raises(ValueError)
    def __test(labels, subphone_features, add_frame_features):
        fe.linguistic_features(labels,
                               binary_dict,
                               continuous_dict,
                               subphone_features=subphone_features,
                               add_frame_features=add_frame_features)

    yield __test, phone_labels, "full", True
    yield __test, phone_labels, "full", False
    yield __test, state_labels, "full", False
Example #17
    def __init__(self,
                 duration_model,
                 acoustic_model,
                 question_file,
                 config,
                 duration_dataset=None,
                 acoustic_dataset=None,
                 device='cpu'):
        self.duration_model = duration_model
        self.acoustic_model = acoustic_model

        self.duration_dataset = duration_dataset
        self.acoustic_dataset = acoustic_dataset

        self.bin_dict, self.con_dict = hts.load_question_set(question_file)
        self.config = config
        self.device = device
Example #18
def test_linguistic_features_for_acoustic_model():
    qs_file_name = join(DATA_DIR, "questions-radio_dnn_416.hed")
    binary_dict, continuous_dict = hts.load_question_set(qs_file_name)

    # Linguistic features
    # To train acoustic model paired with linguistic features,
    # we need frame-level linguistic feature representation.
    input_state_label = join(DATA_DIR, "label_state_align", "arctic_a0001.lab")
    labels = hts.load(input_state_label)
    assert labels.is_state_alignment_label()
    x = fe.linguistic_features(labels,
                               binary_dict,
                               continuous_dict,
                               add_frame_features=True,
                               subphone_features="full")
    y = np.fromfile(join(DATA_DIR, "binary_label_425", "arctic_a0001.lab"),
                    dtype=np.float32).reshape(-1, x.shape[-1])
    assert np.allclose(x, y)
Example #19
 def __init__(
     self,
     utt_list,
     wav_root,
     label_root,
     question_path,
     use_harvest=True,
     f0_floor=150,
     f0_ceil=700,
     frame_period=5,
     mgc_order=59,
     num_windows=3,
     relative_f0=True,
     interp_unvoiced_aperiodicity=True,
     vibrato_mode="none",  # one of "none", "diff", "sine"
     sample_rate=48000,
     d4c_threshold=0.85,
     trajectory_smoothing=False,
     trajectory_smoothing_cutoff=50,
     correct_vuv=False,
 ):
     self.utt_list = utt_list
     self.wav_root = wav_root
     self.label_root = label_root
     self.binary_dict, self.continuous_dict = hts.load_question_set(
         question_path, append_hat_for_LL=False
     )
     self.pitch_idx = len(self.binary_dict) + 1
     self.use_harvest = use_harvest
     self.f0_floor = f0_floor
     self.f0_ceil = f0_ceil
     self.frame_period = frame_period
     self.mgc_order = mgc_order
     self.relative_f0 = relative_f0
     self.interp_unvoiced_aperiodicity = interp_unvoiced_aperiodicity
     self.vibrato_mode = vibrato_mode
     self.windows = get_windows(num_windows)
     self.sample_rate = sample_rate
     self.d4c_threshold = d4c_threshold
     self.trajectory_smoothing = trajectory_smoothing
     self.trajectory_smoothing_cutoff = trajectory_smoothing_cutoff
     self.correct_vuv = correct_vuv
Example #20
def test_singing_voice_question():
    # Test SVS case
    """
QS "L-Phone_Yuusei_Boin"           {*^a-*,*^i-*,*^u-*,*^e-*,*^o-*}
CQS "e1" {/E:(\\NOTE)]}
    """
    binary_dict, continuous_dict = hts.load_question_set(
        join(DATA_DIR, "test_jp_svs.hed"), append_hat_for_LL=False)
    input_phone_label = join(DATA_DIR, "song070_f00001_063.lab")
    labels = hts.load(input_phone_label)
    feats = fe.linguistic_features(labels, binary_dict, continuous_dict)
    assert feats.shape == (74, 2)

    # CQS e1: get the current MIDI number
    C_e1 = continuous_dict[0]
    for idx, lab in enumerate(labels):
        context = lab[-1]
        if C_e1.search(context) is not None:
            from nnmnkwii.frontend import NOTE_MAPPING
            assert NOTE_MAPPING[C_e1.findall(context)[0]] == feats[idx, 1]
Example #21
def test_silence_frame_removal_given_hts_labels():
    qs_file_name = join(DATA_DIR, "questions-radio_dnn_416.hed")
    binary_dict, continuous_dict = hts.load_question_set(qs_file_name)

    input_state_label = join(DATA_DIR, "label_state_align", "arctic_a0001.lab")
    labels = hts.load(input_state_label)
    features = fe.linguistic_features(labels,
                                      binary_dict,
                                      continuous_dict,
                                      add_frame_features=True,
                                      subphone_features="full")

    # Remove silence frames
    indices = labels.silence_frame_indices()
    features = np.delete(features, indices, axis=0)

    y = np.fromfile(join(DATA_DIR, "nn_no_silence_lab_425",
                         "arctic_a0001.lab"),
                    dtype=np.float32).reshape(-1, features.shape[-1])
    assert features.shape == y.shape
    assert np.allclose(features, y)
Example #22
def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x):
    executor = ProcessPoolExecutor(max_workers=num_workers)
    futures = []

    speakers = available_speakers

    wd = WavFileDataSource(in_dir, speakers=speakers)
    td = TranscriptionFileDataSource(in_dir, speakers=speakers)
    wav_paths = wd.collect_files()
    lab_paths = td.collect_files()
    speaker_ids = wd.labels
    binary_dict, continuous_dict = hts.load_question_set(
        join(in_dir, "questions", hparams.question_fn))

    result = []
    for index, (speaker_id, wav_path,
                lab_path) in enumerate(zip(speaker_ids, wav_paths, lab_paths)):
        result.append(
            _process_utterance(out_dir, index + 1, speaker_id, wav_path,
                               lab_path, binary_dict, continuous_dict, "N/A"))
    return result
Example #23
def test_gen_spsvs_static_features():
    wav_path = Path(__file__).parent / "data" / "nitech_jp_song070_f001_004.wav"
    lab_path = Path(__file__).parent / "data" / "nitech_jp_song070_f001_004.lab"

    binary_dict, numeric_dict = hts.load_question_set(
        Path(__file__).parent / "data" / "jp_test.hed"
    )

    labels = hts.load(lab_path)
    sr, wav = wavfile.read(wav_path)
    wav = wav.astype(np.float64)
    assert sr == 48000

    out_feats, stream_sizes = _extract_static_feats(wav, sr)
    has_dynamic_features = [False] * len(stream_sizes)
    pitch_idx = len(binary_dict) + 1

    params = {
        "labels": labels,
        "acoustic_features": out_feats,
        "binary_dict": binary_dict,
        "numeric_dict": numeric_dict,
        "stream_sizes": stream_sizes,
        "has_dynamic_features": has_dynamic_features,
        "pitch_idx": pitch_idx,
        "relative_f0": False,
        "frame_period": 5,
        "force_fix_vuv": False,
    }

    mgc, lf0, vuv, bap = gen_spsvs_static_features(**params)
    assert mgc.shape[1] == 60
    assert lf0.shape[1] == 1
    assert vuv.shape[1] == 1
    assert bap.shape[1] == 5

    # w/o V/UV correction, vuv shouldn't change
    out_vuv_idx = 61
    N = np.abs(vuv - out_feats[:, out_vuv_idx : out_vuv_idx + 1]).sum()
    assert int(N) == 0
Example #24
    def __init__(
        self,
        duration_model,
        acoustic_model,
        scaler,
        config,
        speakers,
        question_path,
        device,
    ):
        self.duration_model = duration_model
        self.acoustic_model = acoustic_model
        self.scaler = scaler
        self.binary_dict, self.continuous_dict = hts.load_question_set(question_path)
        self.device = device
        self.config = config

        # prepare window
        if config['n_delta'] in (0, 1, 2):
            windows = [
                (0, 0, np.array([1.0])),             # static
                (1, 1, np.array([-0.5, 0.0, 0.5])),  # delta
                (1, 1, np.array([1.0, -2.0, 1.0])),  # delta-delta
            ]
            self.windows = windows[:config['n_delta'] + 1]
        else:
            raise ValueError('only n_delta = 0, 1, 2 is supported')

        self.lf0_start_idx = (config['mcep_order'] + 1) * (config['n_delta'] + 1)
        self.vuv_start_idx = self.lf0_start_idx + (config['n_delta'] + 1)
        self.bap_start_idx = self.vuv_start_idx + 1
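        # e.g. with mcep_order=59 and n_delta=2, the stream layout is:
        #   lf0_start_idx = 60 * 3 = 180, vuv_start_idx = 183, bap_start_idx = 184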

        # prepare speaker codes dictionary
        with open(speakers, 'r') as f:
            speakers = [item.strip() for item in f]
        speaker_codes = np.eye(len(speakers), dtype=np.float32)
        self.speaker_codes = {speakers[i]: speaker_codes[i] for i in range(len(speakers))}
Example #25
def test_phone_alignment_label():
    qs_file_name = join(DATA_DIR, "questions-radio_dnn_416.hed")
    binary_dict, continuous_dict = hts.load_question_set(qs_file_name)

    input_state_label = join(DATA_DIR, "label_phone_align", "arctic_a0001.lab")
    labels = hts.load(input_state_label)
    x = fe.linguistic_features(labels,
                               binary_dict,
                               continuous_dict,
                               add_frame_features=False,
                               subphone_features=None)
    assert not labels.is_state_alignment_label()
    assert np.all(np.isfinite(x))

    for subphone_features in ["coarse_coding", "minimal_phoneme"]:
        x = fe.linguistic_features(labels,
                                   binary_dict,
                                   continuous_dict,
                                   add_frame_features=True,
                                   subphone_features=subphone_features)
        assert np.all(np.isfinite(x))

    x = fe.duration_features(labels)
    assert np.all(np.isfinite(x))
Example #26
from nnmnkwii.frontend import merlin as fe
from nnmnkwii.postfilters import merlin_post_filter

import gantts
from gantts.multistream import multi_stream_mlpg, get_static_features
from gantts.multistream import get_static_stream_sizes, select_streams
from gantts.seqloss import MaskedMSELoss, sequence_mask

from hparams import tts_acoustic as hp_acoustic
from hparams import tts_duration as hp_duration

from train import NPYDataSource

use_cuda = torch.cuda.is_available()

binary_dict, continuous_dict = hts.load_question_set(hp_acoustic.question_path)


def gen_parameters(y_predicted, Y_mean, Y_std, mge_training=True):
    mgc_dim, lf0_dim, vuv_dim, bap_dim = hp_acoustic.stream_sizes

    mgc_start_idx = 0
    lf0_start_idx = mgc_dim
    vuv_start_idx = lf0_start_idx + lf0_dim
    bap_start_idx = vuv_start_idx + vuv_dim

    windows = hp_acoustic.windows

    ty = "acoustic"

    # MGE training
Example #28
def acoustic2world(config: DictConfig, path_timing, path_acoustic, path_f0,
                   path_spectrogram, path_aperiodicity):
    """
    Read the acoustic feature matrix from CSV and write the WORLD
    parameters (f0, spectrogram, aperiodicity) as CSV files.
    """
    # logger setup
    global logger  # pylint: disable=global-statement
    logger = getLogger(config.verbose)
    logger.info(OmegaConf.to_yaml(config))

    # load labels and question
    duration_modified_labels = hts.load(path_timing).round_()

    # whether CUDA is available
    # device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # load the model configuration
    typ = 'acoustic'
    model_config = OmegaConf.load(to_absolute_path(config[typ].model_yaml))

    # resolve the question (hed) file path
    question_path = to_absolute_path(config.question_path)
    # In hts2wav.py this is done as below, which would allow a separate
    # hed file per model:
    # if config[typ].question_path is None:
    #     config[typ].question_path = config.question_path
    # --------------------------------------

    # parse the hed file into question dictionaries
    binary_dict, continuous_dict = hts.load_question_set(
        question_path, append_hat_for_LL=False)

    # pitch indices in the input features
    pitch_idx = len(binary_dict) + 1
    # pitch_indices = np.arange(len(binary_dict), len(binary_dict)+3)

    # pylint: disable=no-member
    # load the acoustic feature matrix from CSV
    acoustic_features = np.loadtxt(path_acoustic,
                                   delimiter=',',
                                   dtype=np.float64)

    # extract the WORLD parameters from the acoustic features
    f0, spectrogram, aperiodicity = gen_world_params(
        duration_modified_labels,
        acoustic_features,
        binary_dict,
        continuous_dict,
        model_config.stream_sizes,
        model_config.has_dynamic_features,
        subphone_features=config.acoustic.subphone_features,
        pitch_idx=pitch_idx,
        num_windows=model_config.num_windows,
        post_filter=config.acoustic.post_filter,
        sample_rate=config.sample_rate,
        frame_period=config.frame_period,
        relative_f0=config.acoustic.relative_f0,
        vibrato_scale=1.0,
        vuv_threshold=0.3)

    # write the f0, spectrogram, and aperiodicity matrices as CSV files
    for path, array in ((path_f0, f0), (path_spectrogram, spectrogram),
                        (path_aperiodicity, aperiodicity)):
        np.savetxt(path, array, fmt='%.16f', delimiter=',')
Example #29
def test_load_question_set():
    binary_dict, continuous_dict = hts.load_question_set(
        example_question_file())
    assert len(binary_dict) + len(continuous_dict) == 416
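Following the test above, counting the questions defined by a hed file is a one-liner (the file name is a placeholder):

binary_dict, continuous_dict = hts.load_question_set("questions-radio_dnn_416.hed")
print(len(binary_dict) + len(continuous_dict))  # 416 for this question set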
Example #30
def _score2duration(config: DictConfig, labels):
    """
    Generate duration labels from the full_score and timelag labels.
    """
    # -----------------------------------------------------
    # begin: contents of nnsvs.bin.synthesis.my_app() --------
    # -----------------------------------------------------
    # logger setup
    global logger  # pylint: disable=global-statement
    logger = getLogger(config.verbose)
    logger.info(OmegaConf.to_yaml(config))

    typ = 'duration'
    # whether CUDA is available
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # in place of maybe_set_checkpoints_(config)
    set_checkpoint(config, typ)
    # in place of maybe_set_normalization_stats_(config)
    set_normalization_stat(config, typ)

    # load the model configuration, checkpoint, and scalers
    model_config = OmegaConf.load(to_absolute_path(config[typ].model_yaml))
    model = hydra.utils.instantiate(model_config.netG).to(device)
    checkpoint = torch.load(config[typ].checkpoint,
                            map_location=lambda storage, loc: storage)
    model.load_state_dict(checkpoint['state_dict'])
    in_scaler = joblib.load(config[typ].in_scaler_path)
    out_scaler = joblib.load(config[typ].out_scaler_path)
    model.eval()
    # -----------------------------------------------------
    # end: contents of nnsvs.bin.synthesis.my_app() --------
    # -----------------------------------------------------

    # -----------------------------------------------------
    # begin: contents of nnsvs.bin.synthesis.synthesis() -----
    # -----------------------------------------------------
    # the full_score lab is read by the caller:
    # labels = hts.load(score_path).round_()
    # not used by the current duration model:
    # timelag = hts.load(timelag_path).round_()

    # resolve the question (hed) file path
    question_path = to_absolute_path(config.question_path)
    # In hts2wav.py this is done as below, which would allow a separate
    # hed file per model:
    # if config[typ].question_path is None:
    #     config[typ].question_path = config.question_path
    # --------------------------------------
    # parse the hed file into question dictionaries
    binary_dict, continuous_dict = \
        hts.load_question_set(question_path, append_hat_for_LL=False)
    # pitch indices in the input features
    # pitch_idx = len(binary_dict) + 1
    pitch_indices = np.arange(len(binary_dict), len(binary_dict) + 3)

    # read the f0 conditioning setting
    log_f0_conditioning = config.log_f0_conditioning

    # apply the duration model
    duration = predict_duration(device,
                                labels,
                                model,
                                model_config,
                                in_scaler,
                                out_scaler,
                                binary_dict,
                                continuous_dict,
                                pitch_indices,
                                log_f0_conditioning,
                                force_clip_input_features=False)
    # return the duration as a tuple or ndarray
    return duration
Example #31
def timing2acoustic(config: DictConfig, timing_path, acoustic_path):
    """
    Read the full-context labels and write the acoustic features to a file.
    """
    # -----------------------------------------------------
    # begin: contents of nnsvs.bin.synthesis.my_app() --------
    # -----------------------------------------------------
    # logger setup
    global logger  # pylint: disable=global-statement
    logger = getLogger(config.verbose)
    logger.info(OmegaConf.to_yaml(config))

    typ = 'acoustic'
    # whether CUDA is available
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # in place of maybe_set_checkpoints_(config)
    set_checkpoint(config, typ)
    # in place of maybe_set_normalization_stats_(config)
    set_normalization_stat(config, typ)

    # load the model configuration, checkpoint, and scalers
    model_config = OmegaConf.load(to_absolute_path(config[typ].model_yaml))
    model = hydra.utils.instantiate(model_config.netG).to(device)
    checkpoint = torch.load(config[typ].checkpoint,
                            map_location=lambda storage, loc: storage)

    model.load_state_dict(checkpoint['state_dict'])
    in_scaler = joblib.load(config[typ].in_scaler_path)
    out_scaler = joblib.load(config[typ].out_scaler_path)
    model.eval()
    # -----------------------------------------------------
    # end: contents of nnsvs.bin.synthesis.my_app() --------
    # -----------------------------------------------------

    # -----------------------------------------------------
    # begin: contents of nnsvs.bin.synthesis.synthesis() -----
    # -----------------------------------------------------
    # read the timing (full_score) labels
    duration_modified_labels = hts.load(timing_path).round_()

    # resolve the question (hed) file path
    question_path = to_absolute_path(config.question_path)
    # In hts2wav.py this is done as below, which would allow a separate
    # hed file per model:
    # if config[typ].question_path is None:
    #     config[typ].question_path = config.question_path
    # --------------------------------------
    # parse the hed file into question dictionaries
    binary_dict, continuous_dict = hts.load_question_set(
        question_path, append_hat_for_LL=False)
    # pitch indices in the input features
    # pitch_idx = len(binary_dict) + 1
    pitch_indices = np.arange(len(binary_dict), len(binary_dict) + 3)

    # read the f0 conditioning setting
    log_f0_conditioning = config.log_f0_conditioning
    acoustic_features = predict_acoustic(device, duration_modified_labels,
                                         model, model_config, in_scaler,
                                         out_scaler, binary_dict,
                                         continuous_dict,
                                         config.acoustic.subphone_features,
                                         pitch_indices, log_f0_conditioning)

    # write the acoustic feature matrix as a CSV file
    np.savetxt(acoustic_path, acoustic_features, delimiter=',')