Example #1
0
def convert_worker(
    config: Config,
    wrapper: VoiceChangerStreamWrapper,
    acoustic_converter: AcousticConverter,
    super_resolution: SuperResolution,
    audio_config: AudioConfig,
    queue_input: Queue,
    queue_output: Queue,
):
    """Worker loop: pull acoustic features from `queue_input`, feed them
    through the wrapped voice-changer stream, and push each converted
    chunk to `queue_output`. Runs forever; intended for a dedicated
    worker process/thread.
    """
    # Install the conversion pipeline on the stream before entering the loop.
    wrapper.voice_changer_stream.voice_changer = VoiceChanger(
        super_resolution=super_resolution,
        acoustic_converter=acoustic_converter,
    )

    # Seconds of audio covered by one conversion chunk.
    chunk_seconds = audio_config.convert_chunk / audio_config.rate
    elapsed = 0
    while True:
        feature_in: AcousticFeatureWrapper = queue_input.get()
        wrapper.voice_changer_stream.add_in_feature(
            start_time=elapsed,
            feature_wrapper=feature_in,
            frame_period=audio_config.frame_period,
        )
        elapsed += chunk_seconds

        queue_output.put(wrapper.convert_next(time_length=chunk_seconds))
Example #2
0
 def voice_changer(self):
     """Lazily build and cache the VoiceChanger for this object.

     On first access, unpacks (acoustic_converter, super_resolution) from
     ``self.models`` and constructs the changer with
     ``self.out_sampling_rate``; later calls return the cached instance.
     """
     if self._voice_changer is not None:
         return self._voice_changer
     converter, resolution = self.models
     self._voice_changer = VoiceChanger(
         acoustic_converter=converter,
         super_resolution=resolution,
         output_sampling_rate=self.out_sampling_rate,
     )
     return self._voice_changer
Example #3
0
def convert_worker(
    acoustic_converter: AcousticConverter,
    super_resolution: SuperResolution,
    time_length: float,
    extra_time: float,
    input_silent_threshold: float,
    queue_input: Queue,
    queue_output: Queue,
    acquired_lock: Lock,
):
    """Worker loop: convert queued acoustic features and forward them.

    Pulls `Item`s carrying an AcousticFeatureWrapper from `queue_input`,
    runs them through the ConvertStream, and puts the same `Item` (with
    its payload replaced by the converted feature) on `queue_output`.
    Releases `acquired_lock` once initialization is done, then loops
    forever. Per-item wall-clock time is logged at DEBUG level.
    """
    logger = logging.getLogger('convert')
    init_logger(logger)
    # FIX: the original called logging.info(...) here, which logs via the
    # root logger and bypasses the 'convert' logger configured just above.
    logger.info('convert worker')

    # Inference only: disable autograd bookkeeping and train-mode behavior.
    chainer.global_config.enable_backprop = False
    chainer.global_config.train = False

    stream = ConvertStream(voice_changer=VoiceChanger(
        super_resolution=super_resolution,
        acoustic_converter=acoustic_converter,
        threshold=input_silent_threshold,
    ))
    stream_wrapper = StreamWrapper(stream=stream, extra_time=extra_time)

    # Signal the parent that this worker has finished initializing.
    acquired_lock.release()
    start_time = extra_time
    while True:
        item: Item = queue_input.get()
        start = time.time()
        in_feature: AcousticFeatureWrapper = item.item
        stream.add(
            start_time=start_time,
            data=in_feature,
        )
        start_time += time_length

        out_feature = stream_wrapper.process_next(time_length=time_length)
        item.item = out_feature
        queue_output.put(item)

        logger.debug(f'{item.index}: {time.time() - start}')
Example #4
0
def check(
    input_path: Path,
    input_time_length: int,
    output_path: Path,
    input_statistics_path: Path,
    target_statistics_path: Path,
    stage1_model_path: Path,
    stage1_config_path: Path,
    stage2_model_path: Path,
    stage2_config_path: Path,
):
    """End-to-end sanity check of the encode -> convert -> decode pipeline.

    Loads the input wave, splits it into 1-second chunks, pushes the
    chunks through the three streams (encode, convert, decode), and
    writes the concatenated converted audio to `output_path`.

    FIX: removed the no-op self-assignments of the original
    (`ac_config = ac_config`, `sr_config = sr_config`,
    `encode_stream = encode_stream`, ...); behavior is unchanged.
    """
    ac_config = create_config(stage1_config_path)
    sr_config = create_sr_config(stage2_config_path)
    input_rate = ac_config.dataset.acoustic_param.sampling_rate
    output_rate = sr_config.dataset.param.voice_param.sample_rate

    realtime_vocoder = RealtimeVocoder(
        acoustic_param=ac_config.dataset.acoustic_param,
        out_sampling_rate=output_rate,
        extract_f0_mode=VocodeMode.WORLD,
    )
    realtime_vocoder.create_synthesizer(
        buffer_size=1024,
        number_of_pointers=16,
    )

    f0_converter = F0Converter(
        input_statistics=input_statistics_path,
        target_statistics=target_statistics_path,
    )

    acoustic_converter = AcousticConverter(
        ac_config,
        stage1_model_path,
        f0_converter=f0_converter,
        out_sampling_rate=output_rate,
    )
    super_resolution = SuperResolution(
        sr_config,
        stage2_model_path,
    )

    voice_changer = VoiceChanger(
        acoustic_converter=acoustic_converter,
        super_resolution=super_resolution,
        output_sampling_rate=output_rate,
    )

    encode_stream = EncodeStream(vocoder=realtime_vocoder)
    convert_stream = ConvertStream(voice_changer=voice_changer)
    decode_stream = DecodeStream(vocoder=realtime_vocoder)

    # One chunk per second of input; num_data chunks total.
    num_data = input_time_length
    time_length = 1

    def _load_wave_and_split(time_length: float = 1):
        # Split the wave into consecutive fixed-length chunks; any
        # trailing partial chunk is dropped.
        length = round(time_length * input_rate)
        wave, _ = librosa.load(str(input_path), sr=input_rate)
        return [
            wave[i * length:(i + 1) * length]
            for i in range(len(wave) // length)
        ]

    def _add(_stream: BaseStream, _datas):
        # Feed chunks into the stream at their start offsets.
        for i, data in zip(range(num_data), _datas):
            _stream.add(start_time=i * time_length, data=data)

    def _split_process(_stream: BaseStream, _extra_time: float):
        # Process chunk-by-chunk (streaming-style).
        return [
            _stream.process(start_time=i * time_length,
                            time_length=time_length,
                            extra_time=_extra_time) for i in range(num_data)
        ]

    def _join_process(_stream: BaseStream, _extra_time: float):
        # Process all buffered data in a single call.
        return _stream.process(start_time=0,
                               time_length=time_length * num_data,
                               extra_time=_extra_time)

    def _process_all_stream(
        _streams: Tuple[BaseStream, BaseStream, BaseStream],
        _datas,
        _split_flags: Tuple[bool, bool, bool],
        _extra_times: Tuple[float, float, float],
    ):
        # Chain the streams: each stage consumes the previous stage's output.
        for stream, split_flag, extra_time in zip(_streams, _split_flags,
                                                  _extra_times):
            _add(stream, _datas)
            if split_flag:
                _datas = _split_process(stream, _extra_time=extra_time)
            else:
                _datas = [_join_process(stream, _extra_time=extra_time)]
        return _datas

    def _concat_and_save(_waves, _path: Path):
        wave = numpy.concatenate(_waves).astype(numpy.float32)
        librosa.output.write_wav(str(_path), wave, output_rate)

    def _remove(_streams: Tuple[BaseStream, BaseStream, BaseStream]):
        # Drop buffered data up to the end of the processed range.
        # NOTE(review): end_time=num_data assumes time_length == 1 (true here).
        for stream in _streams:
            stream.remove(end_time=num_data)

    waves = _load_wave_and_split(time_length=time_length)[:num_data]

    streams = (encode_stream, convert_stream, decode_stream)

    datas = _process_all_stream(streams,
                                waves,
                                _split_flags=(True, True, True),
                                _extra_times=(0, 1, 0))
    _concat_and_save(datas, output_path)
    _remove(streams)
Example #5
0
    rate=config.dataset.acoustic_param.sampling_rate,
    chunk=config.dataset.acoustic_param.sampling_rate,
    vocoder_buffer_size=config.dataset.acoustic_param.sampling_rate // 16,
    out_norm=4.5,
)
frame_period = config.dataset.acoustic_param.frame_period

vocoder = RealtimeVocoder(
    acoustic_param=config.dataset.acoustic_param,
    out_sampling_rate=audio_config.rate,
    buffer_size=audio_config.vocoder_buffer_size,
    number_of_pointers=16,
)

voice_changer = VoiceChanger(
    super_resolution=super_resolution,
    acoustic_converter=acoustic_converter,
)

voice_changer_stream = VoiceChangerStream(
    sampling_rate=audio_config.rate,
    frame_period=acoustic_converter._param.frame_period,
    order=acoustic_converter._param.order,
    in_dtype=numpy.float32,
)

voice_changer_stream.voice_changer = voice_changer
voice_changer_stream.vocoder = vocoder

wrapper = VoiceChangerStreamWrapper(
    voice_changer_stream=voice_changer_stream,
    extra_time_pre=0.2,