Example #1
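
Crops an aligned training window from a waveform and its lower-rate F0/phoneme features: random offsets are retried until the window contains enough non-silent samples, any shortfall against the requested length is padded at random on both sides, and microphone augmentation and time masking can be applied. The excerpt keeps its original method indentation (it takes no self, so it is presumably a @staticmethod) and relies on numpy plus Wave, SamplingData, and a mic_augment helper defined elsewhere.
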
    def extract_input(
        sampling_length: int,
        wave_data: Wave,
        silence_data: SamplingData,
        f0_data: SamplingData,
        phoneme_data: SamplingData,
        min_not_silence_length: int,
        with_mic_augment: bool,
        time_mask_max_second: float,
        time_mask_num: int,
    ):
        rate = wave_data.sampling_rate
        sl = sampling_length

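        # f0 and phoneme are sampled at a lower "local" rate; the waveform
        # rate must be an integer multiple of it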
        l_rate = max(f0_data.rate, phoneme_data.rate)

        assert rate % l_rate == 0
        l_scale = int(rate // l_rate)

        assert sl % l_scale == 0

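        # stack f0 and phoneme into one time-aligned local array
        # (column 0 is f0, the remaining columns are phoneme features)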
        local = SamplingData.collect([f0_data, phoneme_data],
                                     rate=l_rate,
                                     mode="min",
                                     error_time_length=0.015)
        f0_array = local[:, 0]
        phoneme_array = local[:, 1:]

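        # sanity check: the local features and the waveform should cover
        # (almost) the same duration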
        assert numpy.abs(len(local) * l_scale -
                         len(wave_data.wave)) < l_scale * 4

        length = min(
            len(local) * l_scale,
            len(wave_data.wave) // l_scale * l_scale)

        if sl > length:
            pad = sl - length
            sl = length
        else:
            pad = 0

        l_length = length // l_scale
        l_sl = sl // l_scale
        l_pad = pad // l_scale

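        # retry random offsets until the cropped window contains enough
        # non-silent samples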
        for _ in range(10000):
            if l_length > l_sl:
                l_offset = numpy.random.randint(l_length - l_sl)
            else:
                l_offset = 0
            offset = l_offset * l_scale

            silence = numpy.squeeze(
                silence_data.resample(rate, index=offset, length=sl))
            if (~silence).sum() >= min_not_silence_length:
                break
        else:
            raise Exception("cannot pick not silence data")

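        # slice the chosen window out of the waveform and the local features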
        wave = wave_data.wave[offset:offset + sl]
        f0 = numpy.squeeze(f0_array[l_offset:l_offset + l_sl])
        phoneme = numpy.argmax(phoneme_array[l_offset:l_offset + l_sl], axis=1)
        padded = numpy.zeros_like(f0, dtype=bool)

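        # split any required padding randomly between the front and the back,
        # and mark the padded frames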
        if l_pad > 0:
            l_pre = numpy.random.randint(l_pad + 1)
            l_post = l_pad - l_pre
            f0 = numpy.pad(f0, [l_pre, l_post])
            phoneme = numpy.pad(phoneme, [l_pre, l_post])
            padded = numpy.pad(padded, [l_pre, l_post], constant_values=True)

            pre, post = int(l_pre * l_scale), int(l_post * l_scale)
            wave = numpy.pad(wave, [pre, post])

        if with_mic_augment:
            wave = mic_augment(wave, sampling_rate=rate)

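        # time masking: zero out time_mask_num random spans of the waveform
        # (assumes each mask is shorter than the wave itself)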
        if time_mask_max_second > 0 and time_mask_num > 0:
            for _ in range(time_mask_num):
                mask_length = numpy.random.randint(
                    int(wave_data.sampling_rate * time_mask_max_second))
                mask_offset = numpy.random.randint(len(wave) - mask_length + 1)
                wave[mask_offset:mask_offset + mask_length] = 0

        return dict(
            wave=wave,
            f0=f0,
            phoneme=phoneme,
            padded=padded,
        )
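
A minimal calling sketch with dummy data. It assumes extract_input is reachable as a plain function, that Wave(wave=..., sampling_rate=...) mirrors the attributes read above, and uses the SamplingData(array=..., rate=...) constructor seen in Example #2; these constructors and values are assumptions about code outside the excerpt.

import numpy

rate = 24000      # waveform sampling rate
l_rate = 200      # local feature rate; rate must be divisible by it
seconds = 2
n = l_rate * seconds

# hypothetical constructors, inferred from the attribute accesses above
wave_data = Wave(
    wave=numpy.random.randn(rate * seconds).astype(numpy.float32),
    sampling_rate=rate,
)
f0_data = SamplingData(
    array=numpy.random.rand(n, 1).astype(numpy.float32), rate=l_rate)
phoneme_data = SamplingData(
    array=numpy.eye(40, dtype=numpy.float32)[numpy.random.randint(40, size=n)],
    rate=l_rate,
)
# silence is stored at the waveform rate here so resample() is a near no-op
silence_data = SamplingData(
    array=numpy.zeros(rate * seconds, dtype=bool), rate=rate)

data = extract_input(
    sampling_length=rate // 2,   # must be a multiple of rate // l_rate
    wave_data=wave_data,
    silence_data=silence_data,
    f0_data=f0_data,
    phoneme_data=phoneme_data,
    min_not_silence_length=1,
    with_mic_augment=False,      # mic_augment is not part of this excerpt
    time_mask_max_second=0.0,    # disable masking for the sketch
    time_mask_num=0,
)
print({key: value.shape for key, value in data.items()})
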
Example #2
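
The body of a FastAPI endpoint (its imports and route decorator are not shown) that converts an uploaded wave file and its transcript into an acoustic feature array: OpenJTalk produces phoneme labels, Julius force-aligns them against a 16 kHz copy of the audio, WORLD extracts F0 that is shifted toward a target voice, and the converted F0 and phoneme features are collected into one array streamed back as float32 bytes. It depends on project helpers (openjtalk_label_getter, sp_inserter, JvsPhoneme, F0, LinguisticFeature) and module globals (_jvs_to_julius, _hmm_model, _voiro_mean, _feature_rate) defined elsewhere.
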
async def to_feature(text: str = Form(...), wave: UploadFile = File(...)):
    with TemporaryDirectory() as d:
        tmp_dir = Path(d)
        input_audio_path = tmp_dir.joinpath("input.wav")
        input_audio_path.write_bytes(await wave.read())

        # openjtalk: generate phoneme labels (including silences) for the input text
        phonemes = [
            p.label
            for p in openjtalk_label_getter(
                text,
                openjtalk_command="open_jtalk",
                dict_path=Path("/var/lib/mecab/dic/open-jtalk/naist-jdic"),
                htsvoice_path=Path(
                    "/usr/share/hts-voice/nitech-jp-atr503-m001/nitech_jp_atr503_m001.htsvoice"
                ),
                output_wave_path=tmp_dir.joinpath("wave.wav"),
                output_log_path=tmp_dir.joinpath("log.txt"),
                output_type=OutputType.phoneme,
                without_span=False,
            )
        ]

        # julius: force-align the phonemes against a 16 kHz, 16-bit copy of the audio
        julius_audio_path = tmp_dir.joinpath("julius.wav")
        subprocess.check_call(
            f"sox {input_audio_path} -r 16000 -b 16 {julius_audio_path}".split()
        )

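        # map JVS phoneme labels onto Julius's inventory and drop "sil" labels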
        julius_phonemes = [
            p if p not in _jvs_to_julius else _jvs_to_julius[p]
            for p in phonemes
            if p != "sil"
        ]

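        # build the dictionary and DFA that constrain Julius to forced alignment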
        julius_dict_path = tmp_dir.joinpath("2nd.dict")
        julius_dict = sp_inserter.gen_julius_dict_2nd(
            " ".join(julius_phonemes), model_type=sp_inserter.ModelType.gmm
        )
        julius_dict_path.write_text(julius_dict)

        julius_dfa_path = tmp_dir.joinpath("2nd.dfa")
        julius_dfa = sp_inserter.gen_julius_aliment_dfa(julius_dict.count("\n"))
        julius_dfa_path.write_text(julius_dfa)

        julius_output = sp_inserter.julius_phone_alignment(
            str(julius_audio_path),
            str(tmp_dir.joinpath("2nd")),
            _hmm_model,
            model_type=sp_inserter.ModelType.gmm,
            options=None,
        )

        time_alignment_list = sp_inserter.frame_to_second(
            sp_inserter.get_time_alimented_list(julius_output)
        )

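        # drop "pau" labels that Julius did not align as a short pause ("sp")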
        i_phoneme = 0
        new_phonemes = []
        for p in phonemes:
            if p == "pau" and time_alignment_list[i_phoneme][2] != "sp":
                continue
            i_phoneme += 1
            new_phonemes.append(p)

        aligned = JvsPhoneme.convert(
            [
                JvsPhoneme(start=float(o[0]), end=float(o[1]), phoneme=p)
                for p, o in zip(new_phonemes, time_alignment_list)
            ]
        )
        for p in aligned:
            p.verify()

        # world: extract F0 with WORLD
        f0 = F0.from_wave(
            Wave.load(input_audio_path, sampling_rate=24000, dtype=numpy.float64),
            frame_period=5.0,
            f0_floor=71.0,
            f0_ceil=800,
            with_vuv=False,
            f0_type=F0Type.world,
        )
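        # shift the utterance's log-F0 mean toward the target voice while
        # keeping its variance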
        converted_f0 = f0.convert(
            input_mean=f0.valid_f0_log.mean(),
            input_var=f0.valid_f0_log.var(),
            target_mean=_voiro_mean,
            target_var=f0.valid_f0_log.var(),
        )
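        # store as a float32 column vector so it can be collected with the
        # phoneme features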
        converted_f0.array = converted_f0.array.astype(numpy.float32).reshape(-1, 1)

        # feature: phoneme features at the shared feature rate, collected with the converted F0
        phoneme_array = LinguisticFeature(
            phonemes=aligned,
            phoneme_class=JvsPhoneme,
            rate=_feature_rate,
            feature_types=[LinguisticFeature.FeatureType.PHONEME],
        ).make_array()

        phoneme = SamplingData(array=phoneme_array, rate=_feature_rate)

        feature = SamplingData.collect(
            [converted_f0, phoneme],
            rate=_feature_rate,
            mode="min",
            error_time_length=0.015,
        )

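    # stream the collected feature array back as raw float32 bytes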
    return StreamingResponse(BytesIO(feature.astype(numpy.float32).tobytes()))
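
A wiring sketch for the excerpt, which omits its imports and route decorator. The app object and the route path below are hypothetical; the response body is the raw float32 bytes produced above.

from fastapi import FastAPI, File, Form, UploadFile

app = FastAPI()
# attach the handler manually; the original decorator is not shown in the excerpt
app.post("/to_feature")(to_feature)

# Client side (e.g. with requests), decoding the float32 byte stream:
#   import numpy, requests
#   with open("input.wav", "rb") as f:
#       res = requests.post("http://localhost:8000/to_feature",
#                           data={"text": "こんにちは"}, files={"wave": f})
#   flat = numpy.frombuffer(res.content, dtype=numpy.float32)
#   # reshaping to (time, dims) requires knowing the feature width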