Example #1
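    # Dataset item: a sine wave at a frequency drawn uniformly from
    # frequency_range with a random phase, a log-frequency local feature at
    # 1/local_scale resolution, and an all-False silence mask.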
    def __getitem__(self, i: int):
        sampling_rate = self.sampling_rate
        length = self.sampling_length
        frequency = numpy.random.uniform(self.frequency_range[0],
                                         self.frequency_range[1])
        rand = numpy.random.rand()

        wave = numpy.sin(
            (2 * numpy.pi) * (numpy.arange(length, dtype=numpy.float32) *
                              frequency / sampling_rate + rand))

        local = numpy.log(
            numpy.ones(shape=(length // self.local_scale, 1),
                       dtype=numpy.float32) * frequency)

        silence = numpy.zeros(shape=(length, ), dtype=bool)

        return default_convert(
            self.make_input(
                wave_data=Wave(wave=wave, sampling_rate=sampling_rate),
                silence_data=SamplingData(array=silence, rate=sampling_rate),
                local_data=SamplingData(array=local,
                                        rate=sampling_rate //
                                        self.local_scale),
            ))
Example #2
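# Tests that extract_input() returns a spectrogram of exactly
# sampling_length + 2 * padding_length frames; numpy.nan is passed as the
# padding_value so padded frames can be counted via numpy.isnan().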
def test_extract_input(sampling_length: int, data_length: int,
                       padding_length: int):
    silence_data = SamplingData(array=numpy.zeros(data_length, dtype=bool),
                                rate=1)
    spectrogram_data = SamplingData(
        array=numpy.linspace(start=1, stop=2, num=data_length)[:,
                                                               numpy.newaxis],
        rate=1,
    )
    for _ in range(100):
        spectrogram = extract_input(
            sampling_length=sampling_length,
            spectrogram_data=spectrogram_data,
            silence_data=silence_data,
            min_not_silence_length=min(sampling_length, data_length),
            padding_length=padding_length,
            padding_value=numpy.nan,
        )["spectrogram"]

        assert len(spectrogram) == sampling_length + padding_length * 2

        if sampling_length <= data_length:
            assert numpy.isnan(spectrogram).sum() <= padding_length * 2
        else:
            assert (numpy.isnan(spectrogram).sum() == sampling_length -
                    data_length + padding_length * 2)

        if padding_length == 0:
            data = spectrogram
        else:
            data = spectrogram[padding_length:-padding_length]
        assert (~numpy.isnan(data)).sum() >= min(sampling_length, data_length)
Example #3
    def test_convert_to_dict(self):
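        # Builds matching wave/silence/local data, extracts a window, and
        # checks the length convert_input() returns for each dict entry.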
        sampling_rate = 800
        local_sampling_rate = 200
        scale = sampling_rate // local_sampling_rate
        time_length = 10
        sampling_length = 16

        wave_data = Wave(
            wave=numpy.linspace(
                0,
                sampling_rate * time_length,
                sampling_rate * time_length,
                endpoint=False,
            ),
            sampling_rate=sampling_rate,
        )
        silence_data = SamplingData(
            array=numpy.zeros((sampling_rate * time_length, ), dtype=bool),
            rate=sampling_rate,
        )
        local_data = SamplingData(
            array=numpy.linspace(
                0,
                sampling_rate * time_length,
                local_sampling_rate * time_length,
                endpoint=False,
            ),
            rate=local_sampling_rate,
        )

        wave, silence, local = BaseWaveDataset.extract_input(
            sampling_length,
            wave_data=wave_data,
            silence_data=silence_data,
            local_data=local_data,
            local_sampling_rate=local_sampling_rate,
            local_padding_size=0,
            local_mask_max_second=0,
            local_mask_num=0,
        )

        dataset = BaseWaveDataset(
            sampling_rate=sampling_rate,
            sampling_length=sampling_length,
            bit=10,
            mulaw=False,
            wave_random_max_second=0,
            wave_random_num=0,
            local_sampling_rate=local_sampling_rate,
            local_padding_size=0,
            local_mask_max_second=0,
            local_mask_num=0,
        )
        d = dataset.convert_input(wave, silence, local)
        self.assertEqual(len(d["coarse"]), sampling_length)
        self.assertEqual(len(d["encoded_coarse"]), sampling_length)
        self.assertEqual(len(d["silence"]), sampling_length - 1)
        self.assertEqual(len(d["local"]), sampling_length // scale)
Example #4
    def test_extract_input(self):
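        # For several rate combinations, checks extract_input() window sizes
        # and that the min-pooled wave equals the local stream.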
        for sampling_rate, local_sampling_rate, sampling_length, time_length in [
            [800, 200, 16, 10],
            [24000, 24000 / 256, 1024, 100],
        ]:
            with self.subTest(
                    sampling_rate=sampling_rate,
                    local_sampling_rate=local_sampling_rate,
                    sampling_length=sampling_length,
                    time_length=time_length,
            ):
                scale = sampling_rate // local_sampling_rate

                wave_data = Wave(
                    wave=numpy.linspace(
                        0,
                        int(sampling_rate * time_length),
                        int(sampling_rate * time_length),
                        endpoint=False,
                    ),
                    sampling_rate=sampling_rate,
                )
                silence_data = SamplingData(
                    array=numpy.zeros((sampling_rate * time_length, ),
                                      dtype=bool),
                    rate=sampling_rate,
                )
                local_data = SamplingData(
                    array=numpy.linspace(
                        0,
                        int(sampling_rate * time_length),
                        int(local_sampling_rate * time_length),
                        endpoint=False,
                    ),
                    rate=local_sampling_rate,
                )

                for _ in range(10):
                    wave, silence, local = BaseWaveDataset.extract_input(
                        sampling_length,
                        wave_data=wave_data,
                        silence_data=silence_data,
                        local_data=local_data,
                        local_sampling_rate=local_sampling_rate,
                        local_padding_size=0,
                        local_mask_max_second=0,
                        local_mask_num=0,
                    )

                    self.assertEqual(len(wave), sampling_length)
                    self.assertEqual(len(silence), sampling_length)
                    self.assertEqual(len(local), sampling_length // scale)

                    wave_as_local = wave.reshape(int(sampling_length // scale),
                                                 -1).min(axis=1)
                    self.assertTrue(numpy.all(wave_as_local == local))
Example #5
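    # Two constant half-length waves (-1 and +1), each wrapped as an Input
    # with empty local features and an all-False silence mask.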
    def setUp(self):
        waves = [
            np.ones(self.num // 2) * -1,
            np.ones(self.num // 2),
        ]
        self.inputs = [
            Input(
                wave=Wave(wave=w, sampling_rate=self.sampling_rate),
                local=SamplingData(array=np.empty((len(w), 0)),
                                   rate=self.sampling_rate),
                silence=SamplingData(array=np.zeros((len(w), ), dtype=bool),
                                     rate=self.sampling_rate),
            ) for w in waves
        ]
Example #6
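# Writes a synthetic dataset: per item, a random phoneme list as a .lab file,
# an f0 track derived from the phoneme ids, a one-hot phoneme array, and
# finally a speaker_dict.json mapping speaker ids to data ids.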
def generate_dataset(
    dataset_directory: Path,
    data_num: int,
    f0_rate: int,
    phoneme_rate: int,
    phoneme_size: int,
    speaker_size: int,
):
    if dataset_directory.exists():
        for p in dataset_directory.rglob("*"):
            if not p.is_dir():
                p.unlink()
    else:
        dataset_directory.mkdir()

    f0_dir = dataset_directory.joinpath("f0")
    phoneme_dir = dataset_directory.joinpath("phoneme")
    phoneme_list_dir = dataset_directory.joinpath("phoneme_list")

    f0_dir.mkdir(exist_ok=True)
    phoneme_dir.mkdir(exist_ok=True)
    phoneme_list_dir.mkdir(exist_ok=True)

    speaker_dict = defaultdict(list)
    for i_data in range(data_num):
        speaker_num = i_data % speaker_size
        speaker_dict[str(speaker_num)].append(str(i_data))

        source_length = int(numpy.random.randint(low=10, high=20))
        phoneme_list = numpy.random.randint(low=0,
                                            high=phoneme_size,
                                            size=source_length,
                                            dtype=numpy.int32)
        phoneme_list_dir.joinpath(f"{i_data}.lab").write_text("\n".join(
            [f"0 0 {JvsPhoneme.phoneme_list[p]}" for p in phoneme_list]))

        f0 = phoneme_list.astype(numpy.float32) / 10 + 0.2 + speaker_num / 100
        f0 = numpy.repeat(f0, (phoneme_list + 1) * (f0_rate // phoneme_rate))
        f0[::5] = 0
        SamplingData(array=f0,
                     rate=f0_rate).save(f0_dir.joinpath(f"{i_data}.npy"))

        phoneme = numpy.repeat(phoneme_list, phoneme_list + 1)
        phoneme = numpy.identity(phoneme_size, dtype=numpy.int32)[phoneme]
        SamplingData(array=phoneme, rate=phoneme_rate).save(
            phoneme_dir.joinpath(f"{i_data}.npy"))

    json.dump(speaker_dict,
              dataset_directory.joinpath("speaker_dict.json").open("w"))
Example #7
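# Creates one (feature, silence) pair with generate_data() and saves both as
# SamplingData .npy files named after the generation parameters.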
def generate_and_save_data(
    feature_dir: Path,
    silence_dir: Path,
    wavelength: float,
    exponent: float,
    amplitude: float,
    length=300,
):
    feature, silence = generate_data(
        wavelength=wavelength,
        exponent=exponent,
        amplitude=amplitude,
        length=length,
    )

    filename = f"{wavelength}_{exponent}_{amplitude}.npy"
    SamplingData(array=feature, rate=100).save(feature_dir / filename)
    SamplingData(array=silence, rate=100).save(silence_dir / filename)
Example #8
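# Smoke test: builds f0, phoneme, spectrogram, and silence streams at their
# respective rates and checks that FeatureDataset.extract_input() runs on them.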
def test_extract_input():
    sampling_length = 10
    wave_length = 256 * sampling_length
    wave_rate = 24000
    second = wave_length / wave_rate

    f0_rate = 200
    phoneme_rate = 100
    spec_rate = wave_rate / 256
    silence_rate = 24000

    f0 = numpy.arange(int(second * f0_rate)).reshape(-1,
                                                     1).astype(numpy.float32)
    f0_data = SamplingData(array=f0, rate=f0_rate)

    phoneme = (numpy.arange(int(second * phoneme_rate)).reshape(-1, 1).astype(
        numpy.float32))
    phoneme_data = SamplingData(array=phoneme, rate=phoneme_rate)

    spec = numpy.arange(int(second * spec_rate)).reshape(-1, 1).astype(
        numpy.float32)
    spec_data = SamplingData(array=spec, rate=spec_rate)

    silence = numpy.zeros(int(second * silence_rate)).astype(bool)
    silence_data = SamplingData(array=silence, rate=silence_rate)

    phoneme_list_data = None
    volume_data = None
    f0_process_mode = F0ProcessMode.normal
    time_mask_max_second = 0
    time_mask_num = 0

    FeatureDataset.extract_input(
        sampling_length=sampling_length,
        f0_data=f0_data,
        phoneme_data=phoneme_data,
        spec_data=spec_data,
        silence_data=silence_data,
        phoneme_list_data=phoneme_list_data,
        volume_data=volume_data,
        f0_process_mode=f0_process_mode,
        time_mask_max_second=time_mask_max_second,
        time_mask_num=time_mask_num,
    )
Example #9
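    # Loads the wave and its local features; if loading the local data fails,
    # a log mel spectrogram is computed instead and cached to a temporary
    # .npy file.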
    def generate(self):
        wave = Wave.load(self.path_wave)

        try:
            local = SamplingData.load(self.path_local)
        except Exception:
            local_rate = 80
            local_array = to_log_melspectrogram(wave=wave, rate=local_rate)
            local = SamplingData(array=local_array, rate=local_rate)

            with NamedTemporaryFile(suffix=".npy", delete=False) as f:
                self.path_local = Path(f.name)
                local.save(self.path_local)

        return Input(
            wave=wave,
            silence=SamplingData.load(self.path_silence),
            local=local,
        )
Example #10
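    # Verifies local padding: padded local frames are NaN (the padding_value),
    # never more than local_padding_size of them, and the unpadded middle
    # matches the wave min-pooled by `scale`.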
    def test_extract_input_with_local_padding(self):
        for sampling_rate, local_sampling_rate, sampling_length, time_length, local_padding_size in [
            [800, 200, 16, 1, 100],
            [24000, 24000 / 256, 1024, 4, 1024],
        ]:
            with self.subTest(
                    sampling_rate=sampling_rate,
                    local_sampling_rate=local_sampling_rate,
                    sampling_length=sampling_length,
                    time_length=time_length,
                    local_padding_size=local_padding_size,
            ):
                scale = sampling_rate // local_sampling_rate

                wave_data = Wave(
                    wave=np.linspace(
                        0,
                        int(sampling_rate * time_length),
                        int(sampling_rate * time_length),
                        endpoint=False,
                    ),
                    sampling_rate=sampling_rate,
                )
                silence_data = SamplingData(
                    array=np.zeros((sampling_rate * time_length, ),
                                   dtype=bool),
                    rate=sampling_rate,
                )
                local_data = SamplingData(
                    array=np.linspace(
                        0,
                        int(sampling_rate * time_length),
                        int(local_sampling_rate * time_length),
                        endpoint=False,
                    ),
                    rate=local_sampling_rate,
                )
                for _ in range(10000):
                    wave, silence, local = BaseWaveDataset.extract_input(
                        sampling_length,
                        wave_data=wave_data,
                        silence_data=silence_data,
                        local_data=local_data,
                        local_padding_size=local_padding_size,
                        padding_value=np.nan,
                    )

                    self.assertEqual(len(wave), sampling_length)
                    self.assertEqual(len(silence), sampling_length)
                    self.assertEqual(
                        len(local),
                        (sampling_length + local_padding_size * 2) // scale)

                    num_pad = np.isnan(local).sum()
                    self.assertLessEqual(num_pad, local_padding_size)

                    self.assertTrue(not np.isnan(local[0])
                                    or not np.isnan(local[-1]))

                    wave_as_local = wave.reshape(int(sampling_length // scale),
                                                 -1).min(axis=1)
                    pad = int(local_padding_size // scale)
                    local_wo_pad = local[pad:-pad]
                    self.assertTrue(np.all(wave_as_local == local_wo_pad))
Example #11
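    # Checks convert_to_dict() output lengths: with to_double=True both coarse
    # and fine signals are returned; with to_double=False the fine entries
    # are None.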
    def test_convert_to_dict(self):
        sampling_rate = 800
        local_sampling_rate = 200
        scale = sampling_rate // local_sampling_rate
        time_length = 10
        sampling_length = 16

        wave_data = Wave(
            wave=np.linspace(0,
                             sampling_rate * time_length,
                             sampling_rate * time_length,
                             endpoint=False),
            sampling_rate=sampling_rate,
        )
        silence_data = SamplingData(
            array=np.zeros((sampling_rate * time_length, ), dtype=bool),
            rate=sampling_rate,
        )
        local_data = SamplingData(
            array=np.linspace(0,
                              sampling_rate * time_length,
                              local_sampling_rate * time_length,
                              endpoint=False),
            rate=local_sampling_rate,
        )

        wave, silence, local = BaseWaveDataset.extract_input(
            sampling_length,
            wave_data=wave_data,
            silence_data=silence_data,
            local_data=local_data,
            local_padding_size=0,
        )

        dataset = BaseWaveDataset(
            sampling_length=sampling_length,
            to_double=True,
            bit=16,
            mulaw=False,
            local_padding_size=0,
        )
        d = dataset.convert_to_dict(wave, silence, local)
        self.assertEqual(len(d['coarse']), sampling_length)
        self.assertEqual(len(d['fine']), sampling_length - 1)
        self.assertEqual(len(d['encoded_coarse']), sampling_length)
        self.assertEqual(len(d['encoded_fine']), sampling_length)
        self.assertEqual(len(d['silence']), sampling_length - 1)
        self.assertEqual(len(d['local']), sampling_length // scale)

        dataset = BaseWaveDataset(
            sampling_length=sampling_length,
            to_double=False,
            bit=10,
            mulaw=False,
            local_padding_size=0,
        )
        d = dataset.convert_to_dict(wave, silence, local)
        self.assertEqual(len(d['coarse']), sampling_length)
        self.assertIsNone(d['fine'])
        self.assertEqual(len(d['encoded_coarse']), sampling_length)
        self.assertIsNone(d['encoded_fine'])
        self.assertEqual(len(d['silence']), sampling_length - 1)
        self.assertEqual(len(d['local']), sampling_length // scale)
Example #12
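# Batch feature generation: waves are zero-padded to a common multiple of
# `scale`, batched through the Generator, and each output is trimmed back to
# its own length and saved as SamplingData at sampling_rate // scale.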
def generate(
    model_dir: Path,
    model_iteration: Optional[int],
    model_config: Optional[Path],
    output_dir: Path,
    to_voiced_scaler: bool,
    to_f0_scaler: bool,
    to_phoneme_onehot: bool,
    batch_size: Optional[int],
    num_test: int,
    target_glob: Optional[str],
    use_gpu: bool,
):
    if model_config is None:
        model_config = model_dir / "config.yaml"

    output_dir.mkdir(exist_ok=True)
    save_arguments(output_dir / "arguments.yaml", generate, locals())

    config = Config.from_dict(yaml.safe_load(model_config.open()))

    generator = Generator(
        config=config,
        predictor=_get_model_path(
            model_dir=model_dir,
            iteration=model_iteration,
            prefix="predictor_",
        ),
        voiced_network=(
            None
            if not to_voiced_scaler
            else _get_model_path(
                model_dir=model_dir,
                iteration=model_iteration,
                prefix="voiced_network_",
            )
        ),
        f0_network=(
            None
            if not to_f0_scaler
            else _get_model_path(
                model_dir=model_dir,
                iteration=model_iteration,
                prefix="f0_network_",
            )
        ),
        phoneme_network=(
            None
            if not to_phoneme_onehot
            else _get_model_path(
                model_dir=model_dir,
                iteration=model_iteration,
                prefix="phoneme_network_",
            )
        ),
        use_gpu=use_gpu,
    )

    dataset = create_dataset(config.dataset)["test"]
    scale = numpy.prod(config.network.scale_list)

    if batch_size is None:
        batch_size = config.train.batch_size

    if isinstance(dataset, SpeakerWavesDataset):
        wave_paths = [data.path_wave for data in dataset.wave_dataset.inputs[:num_test]]
    elif isinstance(dataset, WavesDataset):
        wave_paths = [data.path_wave for data in dataset.inputs[:num_test]]
    else:
        raise Exception()

    if target_glob is not None:
        wave_paths += list(map(Path, glob(target_glob)))

    for wps in tqdm(chunked(wave_paths, batch_size), desc="generate"):
        waves = [Wave.load(p) for p in wps]
        arrays = [w.wave for w in waves]

        pad_lengths = [int(numpy.ceil(len(w) / scale) * scale) for w in arrays]
        arrays = [numpy.r_[w, numpy.zeros(max(pad_lengths) - len(w))] for w in arrays]

        tensors = [torch.from_numpy(array.astype(numpy.float32)) for array in arrays]
        output = generator.generate(
            wave=concat_examples(tensors),
            to_voiced_scaler=to_voiced_scaler,
            to_f0_scaler=to_f0_scaler,
            to_phoneme_onehot=to_phoneme_onehot,
        )

        for feature, p, w, l in zip(output, wps, waves, pad_lengths):
            feature = feature.T[: l // scale]
            data = SamplingData(array=feature, rate=w.sampling_rate // scale)
            data.save(output_dir / (p.stem + ".npy"))
Example #13
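# FastAPI endpoint: gets phonemes for the text with OpenJTalk, aligns them to
# the uploaded wave with Julius, extracts WORLD f0 shifted toward a target
# mean, and streams the collected f0 + phoneme features back as raw float32
# bytes.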
async def to_feature(text: str = Form(...), wave: UploadFile = File(...)):
    with TemporaryDirectory() as d:
        tmp_dir = Path(d)
        input_audio_path = tmp_dir.joinpath("input.wav")
        input_audio_path.write_bytes(await wave.read())

        # openjtalk
        phonemes = [
            p.label
            for p in openjtalk_label_getter(
                text,
                openjtalk_command="open_jtalk",
                dict_path=Path("/var/lib/mecab/dic/open-jtalk/naist-jdic"),
                htsvoice_path=Path(
                    "/usr/share/hts-voice/nitech-jp-atr503-m001/nitech_jp_atr503_m001.htsvoice"
                ),
                output_wave_path=tmp_dir.joinpath("wave.wav"),
                output_log_path=tmp_dir.joinpath("log.txt"),
                output_type=OutputType.phoneme,
                without_span=False,
            )
        ]

        # julius
        julius_audio_path = tmp_dir.joinpath("julius.wav")
        subprocess.check_call(
            f"sox {input_audio_path} -r 16000 -b 16 {julius_audio_path}".split()
        )

        julius_phonemes = [
            p if p not in _jvs_to_julius else _jvs_to_julius[p]
            for p in phonemes
            if p != "sil"
        ]

        julius_dict_path = tmp_dir.joinpath("2nd.dict")
        julius_dict = sp_inserter.gen_julius_dict_2nd(
            " ".join(julius_phonemes), model_type=sp_inserter.ModelType.gmm
        )
        julius_dict_path.write_text(julius_dict)

        julius_dfa_path = tmp_dir.joinpath("2nd.dfa")
        julius_dfa = sp_inserter.gen_julius_aliment_dfa(julius_dict.count("\n"))
        julius_dfa_path.write_text(julius_dfa)

        julius_output = sp_inserter.julius_phone_alignment(
            str(julius_audio_path),
            str(tmp_dir.joinpath("2nd")),
            _hmm_model,
            model_type=sp_inserter.ModelType.gmm,
            options=None,
        )

        time_alignment_list = sp_inserter.frame_to_second(
            sp_inserter.get_time_alimented_list(julius_output)
        )

        i_phoneme = 0
        new_phonemes = []
        for p in phonemes:
            if p == "pau" and time_alignment_list[i_phoneme][2] != "sp":
                continue
            i_phoneme += 1
            new_phonemes.append(p)

        aligned = JvsPhoneme.convert(
            [
                JvsPhoneme(start=float(o[0]), end=float(o[1]), phoneme=p)
                for p, o in zip(new_phonemes, time_alignment_list)
            ]
        )
        for p in aligned:
            p.verify()

        # world
        f0 = F0.from_wave(
            Wave.load(input_audio_path, sampling_rate=24000, dtype=numpy.float64),
            frame_period=5.0,
            f0_floor=71.0,
            f0_ceil=800,
            with_vuv=False,
            f0_type=F0Type.world,
        )
        converted_f0 = f0.convert(
            input_mean=f0.valid_f0_log.mean(),
            input_var=f0.valid_f0_log.var(),
            target_mean=_voiro_mean,
            target_var=f0.valid_f0_log.var(),
        )
        converted_f0.array = converted_f0.array.astype(numpy.float32).reshape(-1, 1)

        # feature
        phoneme_array = LinguisticFeature(
            phonemes=aligned,
            phoneme_class=JvsPhoneme,
            rate=_feature_rate,
            feature_types=[LinguisticFeature.FeatureType.PHONEME],
        ).make_array()

        phoneme = SamplingData(array=phoneme_array, rate=_feature_rate)

        feature = SamplingData.collect(
            [converted_f0, phoneme],
            rate=_feature_rate,
            mode="min",
            error_time_length=0.015,
        )

    return StreamingResponse(BytesIO(feature.astype(numpy.float32).tobytes()))
Example #14
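# End-to-end TTS demo: OpenJTalk text processing, yukarin_s phoneme durations,
# yukarin_sa accent-conditioned f0, yukarin_soso spectrogram generation, and
# HiFi-GAN vocoding; intermediate arrays are saved as .npy for inspection.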
def run(text: str, speaker_id: int):
    rate = 200

    # phoneme
    utterance = extract_full_context_label(text)

    # utterance.breath_groups[0].accent_phrases[2].accent = 2
    # utterance.breath_groups[1].accent_phrases[1].accent = 6
    # utterance.breath_groups[1].accent_phrases[3].accent = 5

    x, sr = pyopenjtalk.synthesize(utterance.labels, speed=1, half_tone=0)
    x /= 2**16
    soundfile.write("hiho_openjtalk_wave.wav", x, sr)

    label_data_list = utterance.phonemes

    json.dump([p.label for p in label_data_list],
              open("hiho_label_list.json", mode="w"))

    is_type1 = False
    phoneme_str_list = []
    start_accent_list = numpy.ones(len(label_data_list),
                                   dtype=numpy.int64) * numpy.nan
    end_accent_list = numpy.ones(len(label_data_list),
                                 dtype=numpy.int64) * numpy.nan
    start_accent_phrase_list = (
        numpy.ones(len(label_data_list), dtype=numpy.int64) * numpy.nan)
    end_accent_phrase_list = (
        numpy.ones(len(label_data_list), dtype=numpy.int64) * numpy.nan)
    for i, label in enumerate(label_data_list):
        is_end_accent = label.contexts["a1"] == "0"

        if label.contexts["a2"] == "1":
            is_type1 = is_end_accent

        if label.contexts["a2"] == "1" and is_type1:
            is_start_accent = True
        elif label.contexts["a2"] == "2" and not is_type1:
            is_start_accent = True
        else:
            is_start_accent = False

        phoneme_str_list.append(label.phoneme)
        start_accent_list[i] = is_start_accent
        end_accent_list[i] = is_end_accent
        start_accent_phrase_list[i] = label.contexts["a2"] == "1"
        end_accent_phrase_list[i] = label.contexts["a3"] == "1"

    start_accent_list = numpy.array(start_accent_list, dtype=numpy.int64)
    end_accent_list = numpy.array(end_accent_list, dtype=numpy.int64)
    start_accent_phrase_list = numpy.array(start_accent_phrase_list,
                                           dtype=numpy.int64)
    end_accent_phrase_list = numpy.array(end_accent_phrase_list,
                                         dtype=numpy.int64)

    json.dump(phoneme_str_list, open("hiho_phoneme_list.json", mode="w"))

    # yukarin_s
    with open("data/yukarin_s/check-bs128-hs32/config.yaml") as f:
        d = yaml.safe_load(f)

    generator_s = GeneratorS(
        config=ConfigS.from_dict(d),
        predictor=Path("data/yukarin_s/check-bs128-hs32/predictor_50000.pth"),
        use_gpu=False,
    )

    phoneme_data_list = [
        JvsPhoneme(phoneme=p, start=i, end=i + 1)
        for i, p in enumerate(phoneme_str_list)
    ]
    phoneme_data_list = JvsPhoneme.convert(phoneme_data_list)
    phoneme_list_s = numpy.array([p.phoneme_id for p in phoneme_data_list])

    phoneme_length = generator_s.generate(
        phoneme_list=phoneme_list_s,
        speaker_id=speaker_id,
    )
    phoneme_length[0] = phoneme_length[-1] = 0.1
    phoneme_length = numpy.round(phoneme_length * rate) / rate
    numpy.save("hiho_phoneme_length.npy", phoneme_length)

    # yukarin_sa
    model_dir = Path(
        "data/yukarin_sa/withjsss-lr1.0e-03-ehs32-aehs32-pl2-pn8-fl2-fn2-try1")
    with (model_dir / "config.yaml").open() as f:
        d = yaml.safe_load(f)

    generator_sa = GeneratorSa(
        config=ConfigSa.from_dict(d),
        predictor=_get_predictor_model_path(model_dir),
        use_gpu=False,
    )

    assert generator_sa.config.dataset.f0_process_mode == "voiced_mora"
    (
        consonant_phoneme_data_list,
        vowel_phoneme_data_list,
        vowel_indexes_data,
    ) = split_mora(phoneme_data_list)

    vowel_indexes = numpy.array(vowel_indexes_data)

    vowel_phoneme_list = numpy.array(
        [p.phoneme_id for p in vowel_phoneme_data_list])
    consonant_phoneme_list = numpy.array([
        p.phoneme_id if p is not None else -1
        for p in consonant_phoneme_data_list
    ])
    phoneme_length_sa = numpy.array(
        [a.sum() for a in numpy.split(phoneme_length, vowel_indexes[:-1] + 1)])

    f0_list = generator_sa.generate(
        vowel_phoneme_list=vowel_phoneme_list[numpy.newaxis],
        consonant_phoneme_list=consonant_phoneme_list[numpy.newaxis],
        start_accent_list=start_accent_list[vowel_indexes][numpy.newaxis],
        end_accent_list=end_accent_list[vowel_indexes][numpy.newaxis],
        start_accent_phrase_list=start_accent_phrase_list[vowel_indexes][
            numpy.newaxis],
        end_accent_phrase_list=end_accent_phrase_list[vowel_indexes][
            numpy.newaxis],
        speaker_id=speaker_id,
    )[0]

    for i, p in enumerate(vowel_phoneme_data_list):
        if p.phoneme in unvoiced_mora_phoneme_list:
            f0_list[i] = 0

    numpy.save("hiho_f0_list.npy", f0_list)

    phoneme = numpy.repeat(
        phoneme_list_s,
        numpy.round(phoneme_length * rate).astype(numpy.int32))
    f0 = numpy.repeat(
        f0_list,
        numpy.round(phoneme_length_sa * rate).astype(numpy.int32))

    numpy.save("hiho_f0.npy", f0)

    # yukarin_soso
    with open(
            "data/yukarin_soso/f0mean-wei_voicedmora-sl1280-bs128-lr1.0e-03-mt0.2-mn32-try1/config.yaml"
    ) as f:
        d = yaml.safe_load(f)

    generator_soso = GeneratorSoso(
        config=ConfigSoso.from_dict(d),
        predictor=Path(
            "data/yukarin_soso/f0mean-wei_voicedmora-sl1280-bs128-lr1.0e-03-mt0.2-mn32-try1/predictor_220000.pth"
        ),
        use_gpu=False,
    )
    assert generator_soso.config.dataset.f0_process_mode == "voiced_mora_mean"

    array = numpy.zeros((len(phoneme), JvsPhoneme.num_phoneme),
                        dtype=numpy.float32)
    array[numpy.arange(len(phoneme)), phoneme] = 1
    phoneme = array

    f0 = SamplingData(array=f0, rate=rate).resample(24000 / 256)
    phoneme = SamplingData(array=phoneme, rate=rate).resample(24000 / 256)

    spec = generator_soso.generate(
        f0=f0[numpy.newaxis, :, numpy.newaxis],
        phoneme=phoneme[numpy.newaxis],
        speaker_id=numpy.array(speaker_id).reshape(-1),
    )[0]
    numpy.save("hiho_spec.npy", spec)

    # hifi-gan
    wave = inference_hifigan(
        x=spec.T,
        checkpoint_file="data/hifigan/g_03080000",
        config_file="data/hifigan/config.json",
    )

    # save
    soundfile.write("hiho_output.wav", data=wave, samplerate=24000)
    soundfile.write(f"{text}-{speaker_id}.wav", data=wave, samplerate=24000)
Example #15
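# Writes a synthetic dataset: per item, a random f0 track and one-hot phoneme
# array at local_rate, a sine wave whose per-frame frequency is exp(f0), an
# all-False silence mask, and a speaker_dict.json mapping speakers to items.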
def generate_dataset(
    dataset_directory: Path,
    data_num: int,
    sampling_rate: int,
    local_rate: int,
    phoneme_size: int,
    speaker_size: int,
):
    if dataset_directory.exists():
        for p in dataset_directory.rglob("*"):
            if not p.is_dir():
                p.unlink()
    else:
        dataset_directory.mkdir()

    f0_dir = dataset_directory.joinpath("f0")
    phoneme_dir = dataset_directory.joinpath("phoneme")
    wave_dir = dataset_directory.joinpath("wave")
    silence_dir = dataset_directory.joinpath("silence")

    f0_dir.mkdir(exist_ok=True)
    phoneme_dir.mkdir(exist_ok=True)
    wave_dir.mkdir(exist_ok=True)
    silence_dir.mkdir(exist_ok=True)

    for i_data in range(data_num):
        local_length = int(numpy.random.randint(low=100, high=200))
        sampling_length = int(local_length / local_rate * sampling_rate)

        f0 = numpy.random.rand(local_length, 1).astype(numpy.float32)
        f0[f0 < 0.2] = 0
        f0 *= 7
        SamplingData(array=f0, rate=local_rate).save(f0_dir.joinpath(f"{i_data}.npy"))

        phoneme = numpy.random.randint(0, phoneme_size, size=local_length).astype(
            numpy.int32
        )
        phoneme = numpy.identity(phoneme_size)[phoneme].astype(numpy.int32)
        SamplingData(array=phoneme, rate=local_rate).save(
            phoneme_dir.joinpath(f"{i_data}.npy")
        )

        rand = numpy.random.rand()
        wave = numpy.concatenate(
            [
                numpy.sin(
                    (2 * numpy.pi)
                    * (
                        numpy.arange(sampling_length // len(f0), dtype=numpy.float32)
                        * numpy.exp(one_f0)
                        / sampling_rate
                        + rand
                    )
                )
                for one_f0 in f0.tolist()
            ]
        )
        Wave(wave=wave, sampling_rate=sampling_rate).save(
            wave_dir.joinpath(f"{i_data}.wav")
        )

        silence = numpy.zeros_like(wave).astype(bool)
        SamplingData(array=silence, rate=sampling_rate).save(
            silence_dir.joinpath(f"{i_data}.npy")
        )

    speaker_dict = defaultdict(list)
    for i_data in range(data_num):
        speaker_dict[str(i_data % speaker_size)].append(str(i_data))
    json.dump(speaker_dict, dataset_directory.joinpath("speaker_dict.json").open("w"))