def convert_worker(
        config: Config,
        wrapper: VoiceChangerStreamWrapper,
        acoustic_converter: AcousticConverter,
        super_resolution: SuperResolution,
        audio_config: AudioConfig,
        queue_input: Queue,
        queue_output: Queue,
):
    wrapper.voice_changer_stream.voice_changer = VoiceChanger(
        super_resolution=super_resolution,
        acoustic_converter=acoustic_converter,
    )

    start_time = 0
    time_length = audio_config.convert_chunk / audio_config.rate  # seconds per chunk
    while True:
        # Block until the next acoustic feature chunk arrives on the input queue.
        in_feature: AcousticFeatureWrapper = queue_input.get()
        wrapper.voice_changer_stream.add_in_feature(
            start_time=start_time,
            feature_wrapper=in_feature,
            frame_period=audio_config.frame_period,
        )
        start_time += time_length

        # Convert the buffered features and hand the result downstream via the output queue.
        out_feature = wrapper.convert_next(time_length=time_length)
        queue_output.put(out_feature)
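# Hypothetical launch sketch (not part of the original file): convert_worker blocks on
# queue_input.get(), so it is meant to run alongside the audio callback. This assumes the
# surrounding script has already built config, wrapper, acoustic_converter,
# super_resolution and audio_config, and that plain threads with queue.Queue are enough;
# a separate process with multiprocessing.Queue would follow the same shape.
from queue import Queue
from threading import Thread

queue_input_feature: Queue = Queue()
queue_output_feature: Queue = Queue()

conversion_thread = Thread(
    target=convert_worker,
    kwargs=dict(
        config=config,
        wrapper=wrapper,
        acoustic_converter=acoustic_converter,
        super_resolution=super_resolution,
        audio_config=audio_config,
        queue_input=queue_input_feature,
        queue_output=queue_output_feature,
    ),
    daemon=True,  # the worker loops forever; let it end with the main process
)
conversion_thread.start()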
def voice_changer(self):
    # Build the VoiceChanger lazily on first access and cache it for subsequent calls.
    if self._voice_changer is None:
        acoustic_converter, super_resolution = self.models
        self._voice_changer = VoiceChanger(
            acoustic_converter=acoustic_converter,
            super_resolution=super_resolution,
            output_sampling_rate=self.out_sampling_rate,
        )
    return self._voice_changer
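# Minimal, self-contained sketch of the same lazy-initialization pattern: the heavy object
# is constructed on first access and cached on the instance. In the real class the cached
# object is a VoiceChanger built from self.models and self.out_sampling_rate, which are
# assumed to be set elsewhere; the stand-in below only illustrates the shape.
class LazyHolderExample:
    def __init__(self):
        self._heavy = None

    @property
    def heavy(self):
        if self._heavy is None:
            self._heavy = object()  # stands in for VoiceChanger(...)
        return self._heavy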
def convert_worker(
        acoustic_converter: AcousticConverter,
        super_resolution: SuperResolution,
        time_length: float,
        extra_time: float,
        input_silent_threshold: float,
        queue_input: Queue,
        queue_output: Queue,
        acquired_lock: Lock,
):
    logger = logging.getLogger('convert')
    init_logger(logger)
    logger.info('convert worker')

    # Inference only: disable autograd and switch chainer to test mode.
    chainer.global_config.enable_backprop = False
    chainer.global_config.train = False

    stream = ConvertStream(voice_changer=VoiceChanger(
        super_resolution=super_resolution,
        acoustic_converter=acoustic_converter,
        threshold=input_silent_threshold,
    ))
    stream_wrapper = StreamWrapper(stream=stream, extra_time=extra_time)

    # Release the lock the parent acquired, signalling that setup is finished.
    acquired_lock.release()

    start_time = extra_time
    while True:
        item: Item = queue_input.get()
        start = time.time()
        in_feature: AcousticFeatureWrapper = item.item
        stream.add(
            start_time=start_time,
            data=in_feature,
        )
        start_time += time_length

        out_feature = stream_wrapper.process_next(time_length=time_length)
        item.item = out_feature
        queue_output.put(item)
        logger.debug(f'{item.index}: {time.time() - start}')
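# Hypothetical startup sketch for the worker above. The pattern implied by
# acquired_lock.release() is: the parent acquires the lock, starts the worker process, and
# then blocks on the lock until the worker has finished building its streams. The converter
# objects are assumed to come from the surrounding project, and the numeric arguments are
# example values only.
from multiprocessing import Lock, Process, Queue

queue_in: Queue = Queue()
queue_out: Queue = Queue()
lock = Lock()
lock.acquire()  # released by convert_worker once it is ready

process = Process(
    target=convert_worker,
    kwargs=dict(
        acoustic_converter=acoustic_converter,
        super_resolution=super_resolution,
        time_length=1.0,
        extra_time=0.1,
        input_silent_threshold=-80.0,
        queue_input=queue_in,
        queue_output=queue_out,
        acquired_lock=lock,
    ),
    daemon=True,
)
process.start()
lock.acquire()  # wait here until the worker signals readiness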
def check(
        input_path: Path,
        input_time_length: int,
        output_path: Path,
        input_statistics_path: Path,
        target_statistics_path: Path,
        stage1_model_path: Path,
        stage1_config_path: Path,
        stage2_model_path: Path,
        stage2_config_path: Path,
):
    ac_config = create_config(stage1_config_path)
    sr_config = create_sr_config(stage2_config_path)

    input_rate = ac_config.dataset.acoustic_param.sampling_rate
    output_rate = sr_config.dataset.param.voice_param.sample_rate

    realtime_vocoder = RealtimeVocoder(
        acoustic_param=ac_config.dataset.acoustic_param,
        out_sampling_rate=output_rate,
        extract_f0_mode=VocodeMode.WORLD,
    )
    realtime_vocoder.create_synthesizer(
        buffer_size=1024,
        number_of_pointers=16,
    )

    f0_converter = F0Converter(
        input_statistics=input_statistics_path,
        target_statistics=target_statistics_path,
    )

    acoustic_converter = AcousticConverter(
        ac_config,
        stage1_model_path,
        f0_converter=f0_converter,
        out_sampling_rate=output_rate,
    )
    super_resolution = SuperResolution(
        sr_config,
        stage2_model_path,
    )
    voice_changer = VoiceChanger(
        acoustic_converter=acoustic_converter,
        super_resolution=super_resolution,
        output_sampling_rate=output_rate,
    )

    # Encode (wave -> feature), convert (feature -> feature) and decode (feature -> wave) stages.
    encode_stream = EncodeStream(vocoder=realtime_vocoder)
    convert_stream = ConvertStream(voice_changer=voice_changer)
    decode_stream = DecodeStream(vocoder=realtime_vocoder)

    num_data = input_time_length
    time_length = 1

    def _load_wave_and_split(time_length: float = 1):
        length = round(time_length * input_rate)
        wave, _ = librosa.load(str(input_path), sr=input_rate)
        return [wave[i * length:(i + 1) * length] for i in range(len(wave) // length)]

    def _add(_stream: BaseStream, _datas):
        for i, data in zip(range(num_data), _datas):
            _stream.add(start_time=i * time_length, data=data)

    def _split_process(_stream: BaseStream, _extra_time: float):
        return [
            _stream.process(start_time=i * time_length, time_length=time_length, extra_time=_extra_time)
            for i in range(num_data)
        ]

    def _join_process(_stream: BaseStream, _extra_time: float):
        return _stream.process(start_time=0, time_length=time_length * num_data, extra_time=_extra_time)

    def _process_all_stream(
            _streams: Tuple[BaseStream, BaseStream, BaseStream],
            _datas,
            _split_flags: Tuple[bool, bool, bool],
            _extra_times: Tuple[float, float, float],
    ):
        # Push the data through each stage, either chunk by chunk or as one joined block.
        for stream, split_flag, extra_time in zip(_streams, _split_flags, _extra_times):
            _add(stream, _datas)
            if split_flag:
                _datas = _split_process(stream, _extra_time=extra_time)
            else:
                _datas = [_join_process(stream, _extra_time=extra_time)]
        return _datas

    def _concat_and_save(_waves, _path: Path):
        wave = numpy.concatenate(_waves).astype(numpy.float32)
        librosa.output.write_wav(str(_path), wave, output_rate)

    def _remove(_streams: Tuple[BaseStream, BaseStream, BaseStream]):
        for stream in _streams:
            stream.remove(end_time=num_data)

    waves = _load_wave_and_split(time_length=time_length)[:num_data]

    streams = (encode_stream, convert_stream, decode_stream)
    datas = _process_all_stream(streams, waves, _split_flags=(True, True, True), _extra_times=(0, 1, 0))
    _concat_and_save(datas, output_path)
    _remove(streams)
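# Example invocation of check(), with placeholder paths; the actual statistics, model and
# config files depend on how the two training stages were run.
from pathlib import Path

if __name__ == '__main__':
    check(
        input_path=Path('input.wav'),
        input_time_length=10,
        output_path=Path('output.wav'),
        input_statistics_path=Path('input_statistics.npy'),
        target_statistics_path=Path('target_statistics.npy'),
        stage1_model_path=Path('stage1/predictor.npz'),
        stage1_config_path=Path('stage1/config.json'),
        stage2_model_path=Path('stage2/predictor.npz'),
        stage2_config_path=Path('stage2/config.json'),
    )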
    rate=config.dataset.acoustic_param.sampling_rate,
    chunk=config.dataset.acoustic_param.sampling_rate,
    vocoder_buffer_size=config.dataset.acoustic_param.sampling_rate // 16,
    out_norm=4.5,
)
frame_period = config.dataset.acoustic_param.frame_period

vocoder = RealtimeVocoder(
    acoustic_param=config.dataset.acoustic_param,
    out_sampling_rate=audio_config.rate,
    buffer_size=audio_config.vocoder_buffer_size,
    number_of_pointers=16,
)
voice_changer = VoiceChanger(
    super_resolution=super_resolution,
    acoustic_converter=acoustic_converter,
)
voice_changer_stream = VoiceChangerStream(
    sampling_rate=audio_config.rate,
    frame_period=acoustic_converter._param.frame_period,
    order=acoustic_converter._param.order,
    in_dtype=numpy.float32,
)
voice_changer_stream.voice_changer = voice_changer
voice_changer_stream.vocoder = vocoder

wrapper = VoiceChangerStreamWrapper(
    voice_changer_stream=voice_changer_stream,
    extra_time_pre=0.2,