def encode_worker(
        config: Config,
        wrapper: VoiceChangerStreamWrapper,
        audio_config: AudioConfig,
        queue_input: Queue,
        queue_output: Queue,
):
    """
    Endless worker loop: register incoming audio chunks and emit their features.

    A new Vocoder (built from ``config.dataset.acoustic_param`` and
    ``audio_config.out_rate``) is attached to the wrapped stream, one chunk of
    silence is registered at time 0, and then every item taken from
    ``queue_input`` is registered, pre-converted and forwarded.

    :param config: supplies ``dataset.acoustic_param`` for the vocoder
    :param wrapper: stream wrapper that receives waves and yields features
    :param audio_config: supplies ``in_rate``, ``out_rate`` and ``in_audio_chunk``
    :param queue_input: source of items whose ``item`` attribute is a raw wave
    :param queue_output: sink of the same items, ``item`` replaced by the feature
    :return: never returns
    """
    wrapper.voice_changer_stream.vocoder = Vocoder(
        acoustic_param=config.dataset.acoustic_param,
        out_sampling_rate=audio_config.out_rate,
    )

    # duration of one input chunk in seconds
    time_length = audio_config.in_audio_chunk / audio_config.in_rate

    def register(samples, at_time):
        # hand one chunk of samples to the stream at the given start time
        chunk = Wave(wave=samples, sampling_rate=audio_config.in_rate)
        wrapper.voice_changer_stream.add_wave(start_time=at_time, wave=chunk)

    # leading padding: one chunk worth of silence
    start_time = 0
    prev_original = numpy.zeros(round(time_length * audio_config.in_rate), dtype=numpy.float32)
    register(prev_original, start_time)
    start_time += time_length

    while True:
        packet = queue_input.get()

        # each outgoing item carries the `original` of the previous one
        # (the first carries the silent padding)
        incoming_original = packet.original
        packet.original = prev_original
        prev_original = incoming_original

        register(packet.item, start_time)
        start_time += time_length

        packet.item = wrapper.pre_convert_next(time_length=time_length)
        queue_output.put(packet)
def separate_effective(self, wave: Wave, feature: AcousticFeature, threshold=None):
    """
    Keep only the frames of ``feature`` that are flagged as effective.

    :param wave: waveform that ``feature`` belongs to
    :param feature: acoustic feature to filter; ``feature.f0`` defines the frame count
    :param threshold: when given, frames whose power in dB exceeds ``-threshold``
        are kept; when None, ``self._param.threshold_db`` is used through
        ``wave.get_effective_frame``, and if that is None as well every frame
        is kept and ``feature`` is returned untouched
    :return: (effective feature, effective flags)
    """
    hop, length = wave.get_hop_and_length(frame_period=self._param.frame_period)

    if threshold is None:
        if self._param.threshold_db is None:
            return feature, numpy.ones(length, dtype=bool)

        effective = wave.get_effective_frame(
            threshold_db=self._param.threshold_db,
            fft_length=self._param.fft_length,
            frame_period=self._param.frame_period,
        )
        return feature.indexing(effective), effective

    # NOTE(review): newer librosa releases expose this as `librosa.feature.rms`;
    # confirm the pinned librosa version before renaming.
    mse = librosa.feature.rmse(
        y=wave.wave,
        frame_length=self._param.fft_length,
        hop_length=hop,
    ) ** 2
    effective = librosa.core.power_to_db(mse.squeeze()) > -threshold

    # The mask and the feature can disagree on the frame count.
    # Align the mask to the feature before indexing: pad with False or cut the tail.
    # (Previously the "too long" case was a no-op assignment and left the mask
    # longer than the feature.)
    n_frames = len(feature.f0)
    n_mask = len(effective)
    if n_mask < n_frames:
        effective = numpy.r_[effective, numpy.zeros(n_frames - n_mask, dtype=bool)]
    elif n_mask > n_frames:
        effective = effective[:n_frames]

    return feature.indexing(effective), effective
def encode_worker(
        config: Config,
        wrapper: VoiceChangerStreamWrapper,
        audio_config: AudioConfig,
        queue_input: Queue,
        queue_output: Queue,
):
    """
    Endless worker loop: register raw waves from a queue and emit their features.

    A new Vocoder (built from ``config.dataset.acoustic_param`` and
    ``audio_config.rate``) is attached to the wrapped stream, one chunk of
    silence is registered at time 0, and then every wave taken from
    ``queue_input`` is registered and pre-converted.

    :param config: supplies ``dataset.acoustic_param`` for the vocoder
    :param wrapper: stream wrapper that receives waves and yields features
    :param audio_config: supplies ``rate`` and ``convert_chunk``
    :param queue_input: source of raw wave arrays
    :param queue_output: sink of the features returned by ``pre_convert_next``
    :return: never returns
    """
    wrapper.voice_changer_stream.vocoder = Vocoder(
        acoustic_param=config.dataset.acoustic_param,
        out_sampling_rate=audio_config.rate,
    )

    # duration of one chunk in seconds
    time_length = audio_config.convert_chunk / audio_config.rate

    def register(samples, at_time):
        # hand one chunk of samples to the stream at the given start time
        chunk = Wave(wave=samples, sampling_rate=audio_config.rate)
        wrapper.voice_changer_stream.add_wave(start_time=at_time, wave=chunk)

    # leading padding: one chunk worth of silence
    start_time = 0
    silence = numpy.zeros(round(time_length * audio_config.rate), dtype=numpy.float32)
    register(silence, start_time)
    start_time += time_length

    while True:
        register(queue_input.get(), start_time)
        start_time += time_length

        queue_output.put(wrapper.pre_convert_next(time_length=time_length))
def generate_feature(path: Path):
    """
    Extract the acoustic feature of one audio file and save it as ``.npy``.

    All settings are read from the module-level ``arguments``. Nothing is done
    when the output file already exists and overwriting is not enabled.

    :param path: audio file to process; its stem names the output file
    """
    out = Path(arguments.output, path.stem + '.npy')
    already_done = out.exists() and not arguments.enable_overwrite
    if already_done:
        return

    def load_padded(rate):
        # load at the given rate and pad both ends by `pad_second`
        loaded = Wave.load(path=path, sampling_rate=rate)
        return loaded.pad(pre_second=arguments.pad_second, post_second=arguments.pad_second)

    wave = load_padded(arguments.sampling_rate)

    feature = AcousticFeature.extract(
        wave=wave,
        frame_period=arguments.frame_period,
        f0_floor=arguments.f0_floor,
        f0_ceil=arguments.f0_ceil,
        fft_length=arguments.fft_length,
        order=arguments.order,
        alpha=arguments.alpha,
        dtype=arguments.dtype,
    )

    if arguments.threshold_db is not None:
        # the thresholding may run on the same audio loaded at another rate
        ref_rate = arguments.sampling_rate_for_thresholding
        wave_ref = wave if ref_rate is None else load_padded(ref_rate)

        effective = wave_ref.get_effective_frame(
            threshold_db=arguments.threshold_db,
            fft_length=arguments.fft_length,
            frame_period=arguments.frame_period,
        )

        # the two waves may disagree on the frame count by one, see
        # https://github.com/mmorise/World/blob/c41e580c24c8d360f322ba6e2092ad4785d2d5b9/src/harvest.cpp#L1220
        _, n_frames = wave.get_hop_and_length(arguments.frame_period)
        _, n_frames_ref = wave_ref.get_hop_and_length(arguments.frame_period)
        if n_frames_ref - n_frames == 1:
            effective = effective[:-1]

        feature = feature.indexing(effective)

    feature.save(path=out, ignores=arguments.ignore_feature)
def generate_feature(path: Path):
    """
    Extract the acoustic feature of one audio file and save it as ``.npy``.

    All settings are read from the module-level ``arguments``. Nothing is done
    when the output file already exists and overwriting is not enabled.

    :param path: audio file to process; its stem names the output file
    """
    out = Path(arguments.output, path.stem + '.npy')
    already_done = out.exists() and not arguments.enable_overwrite
    if already_done:
        return

    # load, then pad both ends by `pad_second`
    padded = Wave.load(path=path, sampling_rate=arguments.sampling_rate).pad(
        pre_second=arguments.pad_second,
        post_second=arguments.pad_second,
    )

    feature = AcousticFeature.extract(
        wave=padded,
        frame_period=arguments.frame_period,
        f0_floor=arguments.f0_floor,
        f0_ceil=arguments.f0_ceil,
        fft_length=arguments.fft_length,
        order=arguments.order,
        alpha=arguments.alpha,
        dtype=arguments.dtype,
    )

    # optional filtering down to the effective frames
    if arguments.threshold_db is not None:
        effective = padded.get_effective_frame(
            threshold_db=arguments.threshold_db,
            fft_length=arguments.fft_length,
            frame_period=arguments.frame_period,
        )
        feature = feature.indexing(effective)

    feature.save(path=out, validate=True, ignores=arguments.ignore_feature)
def concatenate_wrapper(fs: List['AcousticFeatureWrapper'], keys: Iterable[str]):
    """
    Join several feature wrappers into one.

    :param fs: wrappers to join, in order; the sampling rate of the first is used
    :param keys: feature keys forwarded to ``AcousticFeatureWrapper.concatenate``
    :return: a new AcousticFeatureWrapper holding the joined wave and features
    """
    joined_samples = numpy.concatenate([wrapper.wave.wave for wrapper in fs])
    joined_wave = Wave(wave=joined_samples, sampling_rate=fs[0].wave.sampling_rate)
    joined_feature = AcousticFeatureWrapper.concatenate(fs, keys=keys)
    return AcousticFeatureWrapper(wave=joined_wave, **joined_feature.__dict__)
def decode_acoustic_feature(self, feature: AcousticFeature):
    """
    Synthesize a waveform from an acoustic feature with ``pyworld.synthesize``.

    :param feature: provides ``f0`` (flattened before use), ``sp`` and ``ap``
    :return: Wave at ``self.out_sampling_rate``
    """
    synthesis_args = dict(
        f0=feature.f0.ravel(),
        spectrogram=feature.sp,
        aperiodicity=feature.ap,
        fs=self.out_sampling_rate,
        frame_period=self._param.frame_period,
    )
    samples = pyworld.synthesize(**synthesis_args)
    return Wave(samples, sampling_rate=self.out_sampling_rate)
def pick_wrapper(self, first: int, last: int, keys: Iterable[str], frame_period: float):
    """
    Cut the frame range ``[first, last)`` out of both the features and the wave.

    :param first: first frame index
    :param last: end frame index
    :param keys: feature keys forwarded to ``self.pick``
    :param frame_period: frame period in milliseconds, used to map frames to samples
    :return: a new AcousticFeatureWrapper for the selected range
    """
    sampling_rate = self.wave.sampling_rate

    def frame_to_sample(frame):
        # frame index -> sample index (frame_period is in milliseconds)
        return round(frame * frame_period / 1000 * sampling_rate)

    begin = frame_to_sample(first)
    end = frame_to_sample(last)
    picked_wave = Wave(wave=self.wave.wave[begin:end], sampling_rate=self.wave.sampling_rate)
    picked_feature = self.pick(first, last, keys=keys)
    return AcousticFeatureWrapper(wave=picked_wave, **picked_feature.__dict__)
def decode(
        self,
        acoustic_feature: AcousticFeature,
):
    """
    Feed an acoustic feature to the synthesizer and collect the produced audio.

    :param acoustic_feature: provides ``f0``, ``sp`` and ``ap``
    :return: Wave at ``self.out_sampling_rate``; empty when nothing was synthesized
    """
    assert self._synthesizer is not None

    n_frames = len(acoustic_feature.f0)
    f0_buffer = utils.cast_1d_list_to_1d_pointer(acoustic_feature.f0.flatten().tolist())
    sp_buffer = utils.cast_2d_list_to_2d_pointer(acoustic_feature.sp.tolist())
    ap_buffer = utils.cast_2d_list_to_2d_pointer(acoustic_feature.ap.tolist())
    apidefinitions._AddParameters(f0_buffer, n_frames, sp_buffer, ap_buffer, self._synthesizer)

    # drain the synthesizer: copy its output buffer after every successful step
    chunks = []
    while apidefinitions._Synthesis2(self._synthesizer) != 0:
        chunks.append(numpy.array([
            self._synthesizer.buffer[index]
            for index in range(self._synthesizer.buffer_size)
        ]))

    samples = numpy.concatenate(chunks) if chunks else numpy.empty(0)
    out_wave = Wave(
        wave=samples,
        sampling_rate=self.out_sampling_rate,
    )

    # for holding memory: keep references to the most recent 16 buffer sets
    # (presumably so the synthesizer's pointers stay valid; confirm)
    self._before_buffer.append((f0_buffer, sp_buffer, ap_buffer))
    if len(self._before_buffer) > 16:
        self._before_buffer.pop(0)

    return out_wave
def decode(
        self,
        acoustic_feature: AcousticFeature,
):
    """
    Synthesize a waveform from an acoustic feature with ``pyworld.synthesize``.

    The float members of the feature are cast to float64 first.

    :param acoustic_feature: provides ``f0``, ``spectrogram`` and ``aperiodicity``
    :return: Wave at ``self.out_sampling_rate``
    """
    feature64 = acoustic_feature.astype_only_float(numpy.float64)
    synthesis_args = dict(
        f0=feature64.f0.ravel(),
        spectrogram=feature64.spectrogram,
        aperiodicity=feature64.aperiodicity,
        fs=self.out_sampling_rate,
        frame_period=self.acoustic_param.frame_period,
    )
    samples = pyworld.synthesize(**synthesis_args)
    return Wave(samples, sampling_rate=self.out_sampling_rate)
def process(self, start_time: float, time_length: float, extra_time: float) -> AcousticFeatureWrapper:
    """
    Fetch a stretch of input audio, encode it and drop the extra margins.

    :param start_time: start of the requested span in seconds
    :param time_length: length of the requested span in seconds
    :param extra_time: margin in seconds fetched on both sides and removed
        again from the encoded result
    :return: encoded feature wrapper for the requested span
    """
    samples = self.fetch(
        start_time=start_time,
        time_length=time_length,
        extra_time=extra_time,
    )
    encoded = self.vocoder.encode(
        Wave(wave=samples, sampling_rate=self.in_segment_method.sampling_rate)
    )

    margin = round(extra_time * self.out_segment_method.sampling_rate)
    if margin <= 0:
        # nothing to trim
        return encoded
    return self.out_segment_method.pick(encoded, margin, -margin)
def pre_convert(self, start_time: float, time_length: float, extra_time: float):
    """
    Fetch a stretch of input audio, encode it and drop the extra margins.

    :param start_time: start of the requested span in seconds
    :param time_length: length of the requested span in seconds
    :param extra_time: margin in seconds fetched on both sides and removed
        again from the wave and from the encoded feature; a margin that rounds
        to zero (or below) leaves the data untrimmed
    :return: AcousticFeatureWrapper with the trimmed wave and the
        'f0', 'ap', 'mc' and 'voiced' features
    """
    keys = ['f0', 'ap', 'mc', 'voiced']
    wave = self.fetch(
        start_time=start_time,
        time_length=time_length,
        extra_time=extra_time,
        data_stream=self._data_stream,
        rate=self.sampling_rate,
        pad_function=lambda length: numpy.zeros(shape=length, dtype=self.in_dtype),
        pick_function=lambda segment, first, last: segment.wave.wave[first:last],
        concat_function=numpy.concatenate,
    )
    in_wave = Wave(wave=wave, sampling_rate=self.sampling_rate)
    in_feature = self.vocoder.encode(in_wave)

    # Trim the margin from the wave. `x[0:-0]` is `x[0:0]` (empty), so a
    # margin of zero must skip the slice instead of applying it.
    pad_wave = round(extra_time * self.sampling_rate)
    if pad_wave > 0:
        in_wave.wave = in_wave.wave[pad_wave:-pad_wave]

    # Trim the margin from the feature (frame_period is in milliseconds).
    # Same pitfall: with no margin, pick the whole range by an explicit end.
    pad_feature = round(extra_time / (self.vocoder.acoustic_param.frame_period / 1000))
    if pad_feature > 0:
        in_feature = in_feature.pick(pad_feature, -pad_feature, keys=keys)
    else:
        in_feature = in_feature.pick(0, len(in_feature.f0), keys=keys)

    feature_wrapper = AcousticFeatureWrapper(wave=in_wave, **in_feature.__dict__)
    return feature_wrapper
def silent_wrapper(
        length: int,
        sizes: Dict[str, int],
        keys: Iterable[str],
        frame_period: float,
        sampling_rate: int,
        wave_dtype,
):
    """
    Build a feature wrapper that represents silence.

    :param length: number of frames
    :param sizes: per-key sizes forwarded to ``AcousticFeatureWrapper.silent``
    :param keys: feature keys forwarded to ``AcousticFeatureWrapper.silent``
    :param frame_period: frame period in milliseconds, used to size the wave
    :param sampling_rate: sampling rate of the silent wave
    :param wave_dtype: dtype of the silent wave samples
    :return: AcousticFeatureWrapper with an all-zero wave and silent features
    """
    # frames -> samples (frame_period is in milliseconds)
    length_wave = round(length * frame_period / 1000 * sampling_rate)
    zero_samples = numpy.zeros(shape=length_wave, dtype=wave_dtype)
    silent_wave = Wave(wave=zero_samples, sampling_rate=sampling_rate)
    silent_feature = AcousticFeatureWrapper.silent(length, sizes=sizes, keys=keys)
    return AcousticFeatureWrapper(wave=silent_wave, **silent_feature.__dict__)
def post_convert(self, start_time: float, time_length: float):
    """
    Fetch converted features for a time span and decode them into audio.

    :param start_time: start of the requested span in seconds
    :param time_length: length of the requested span in seconds
    :return: decoded Wave with every NaN sample replaced by 0
    """
    sizes = AcousticFeature.get_sizes(sampling_rate=self.sampling_rate, order=self.order)
    keys = ['f0', 'ap', 'sp', 'voiced']

    def make_silence(length):
        # filler for spans that have no data
        return AcousticFeature.silent(length, sizes=sizes, keys=keys)

    def pick_from_segment(segment, first, last):
        return segment.feature.pick(first, last, keys=keys)

    def join(buffers):
        return AcousticFeature.concatenate(buffers, keys=keys)

    out_feature = self.fetch(
        start_time=start_time,
        time_length=time_length,
        data_stream=self._out_feature_stream,
        rate=1000 / self.frame_period,
        pad_function=make_silence,
        pick_function=pick_from_segment,
        concat_function=join,
    )

    decoded = self.vocoder.decode(acoustic_feature=out_feature)

    # zero the NaN samples in place, then re-wrap the same array
    samples = decoded.wave
    samples[numpy.isnan(samples)] = 0
    return Wave(wave=samples, sampling_rate=decoded.sampling_rate)
def astype_only_float_wrapper(self, dtype):
    """
    Return a copy of this wrapper with the wave and the float features cast.

    :param dtype: target dtype, applied to the wave samples and forwarded to
        ``self.astype_only_float``
    :return: a new AcousticFeatureWrapper
    """
    cast_samples = self.wave.wave.astype(dtype)
    cast_wave = Wave(wave=cast_samples, sampling_rate=self.wave.sampling_rate)
    cast_feature = self.astype_only_float(dtype)
    return AcousticFeatureWrapper(wave=cast_wave, **cast_feature.__dict__)
# Attach the converter and the vocoder to the stream, then wrap the stream.
voice_changer_stream.voice_changer = voice_changer
voice_changer_stream.vocoder = vocoder
wrapper = VoiceChangerStreamWrapper(
    voice_changer_stream=voice_changer_stream,
    extra_time_pre=0.2,
    extra_time=0.1,
)

# Load the test audio at the configured rate (the returned rate is discarded).
raw_wave, _ = librosa.load(str(test_data_path), sr=audio_config.rate)
wave_out_list = []

# Stage 1: feed the whole recording to the stream, one chunk at a time.
# The last chunk may be shorter, so the clock advances by the actual length.
start_time = 0
for i in range(0, len(raw_wave), audio_config.chunk):
    wave_in = Wave(wave=raw_wave[i:i + audio_config.chunk], sampling_rate=audio_config.rate)
    wrapper.voice_changer_stream.add_wave(start_time=start_time, wave=wave_in)
    start_time += len(wave_in.wave) / wave_in.sampling_rate

# Stage 2: pre-convert chunk by chunk and register each input feature.
# NOTE(review): this loop runs `len(raw_wave) // chunk + 1` times, which is one
# more than stage 1 when the length is an exact multiple of the chunk size —
# confirm that the extra iteration is intended.
start_time = 0
for i in range(len(raw_wave) // audio_config.chunk + 1):
    feature_in = wrapper.pre_convert_next(time_length=audio_config.chunk / audio_config.rate)
    wrapper.voice_changer_stream.add_in_feature(
        start_time=start_time,
        feature_wrapper=feature_in,
        frame_period=frame_period,
    )
    start_time += audio_config.chunk / audio_config.rate
    print('pre', i, flush=True)

# Reset the clock for whatever stage follows.
start_time = 0
def warm_up(self, time_length: float):
    """
    Run one encode/decode round trip on silence and discard the result.

    :param time_length: length of the silent signal in seconds
    """
    n_samples = int(time_length * self.out_sampling_rate)
    silence = Wave(wave=numpy.zeros(n_samples), sampling_rate=self.out_sampling_rate)
    self.decode(self.encode(silence))
def load_wave(self, path: Path):
    """
    Load an audio file at the sampling rate configured in ``self._param``.

    :param path: audio file to load
    :return: the result of ``Wave.load``
    """
    target_rate = self._param.sampling_rate
    return Wave.load(path, sampling_rate=target_rate)