def extract_input(
    sampling_length: int,
    wave_data: Wave,
    silence_data: SamplingData,
    local_data: SamplingData,
    local_sampling_rate: Optional[int],
    local_padding_size: int,
    local_mask_max_second: float,
    local_mask_num: int,
    padding_value=0,
):
    """
    Cut one random training window out of an utterance.

    The window start is drawn on the local-feature frame grid and redrawn
    (at most 10000 times) until the matching silence flags are not all set.
    The local features are cropped around the same window, extended by
    ``local_padding_size`` waveform samples on each side; frames that fall
    outside the utterance are filled with ``padding_value``.  Optionally,
    random spans of the cropped local features are zeroed.

    :param sampling_length: window length in waveform samples
    :param wave_data: object exposing ``wave`` and ``sampling_rate``
    :param silence_data: silence flags; read through
        ``resample(rate, index=..., length=...)``
    :param local_data: local features (``array``, ``rate``, ``resample``)
    :param local_sampling_rate: rate to resample the local features to,
        or None to use ``local_data.array`` at ``local_data.rate``
    :param local_padding_size: context on each side of the local crop, in
        waveform samples; must be a multiple of the wave/local scale
    :param local_mask_max_second: exclusive upper bound, in seconds, of one
        masked span; masking is skipped unless this is positive
    :param local_mask_num: number of masked spans to draw
    :param padding_value: fill value for local frames outside the utterance
    :return:
        wave: (sampling_length, )
        silence: (sampling_length, )
        local: (sampling_length // scale + pad, )
    :raises Exception: if no non-silent window is found in 10000 draws
    """
    sr = wave_data.sampling_rate
    sl = sampling_length

    if local_sampling_rate is None:
        l_rate = local_data.rate
        l_array = local_data.array
    else:
        l_rate = local_sampling_rate
        l_array = local_data.resample(l_rate)

    # number of waveform samples covered by one local frame
    l_scale = int(round(sr / l_rate))

    # wave and local features may disagree by a few frames; use the shorter
    length = min(len(l_array) * l_scale, len(wave_data.wave))
    assert abs(length - len(l_array) * l_scale) < l_scale * 4
    assert abs(length - len(wave_data.wave)) < l_scale * 4
    assert (
        local_padding_size % l_scale == 0
    ), f"local_padding_size: {local_padding_size}, l_scale: {l_scale}"

    l_pad = local_padding_size // l_scale
    l_length = length // l_scale
    l_sl = sl // l_scale

    for _ in range(10000):
        # randint's upper bound is exclusive, so "+ 1" makes the last full
        # window reachable; any utterance longer than one window is sampled
        if l_length > l_sl:
            l_offset = numpy.random.randint(l_length - l_sl + 1)
        else:
            l_offset = 0
        offset = l_offset * l_scale

        silence = numpy.squeeze(silence_data.resample(sr, index=offset, length=sl))
        if not silence.all():
            break
    else:
        raise Exception("cannot pick not silence data")

    wave = wave_data.wave[offset : offset + sl]

    # local
    l_start, l_end = l_offset - l_pad, l_offset + l_sl + l_pad
    if l_start < 0 or l_end > l_length:
        # the crop sticks out of the utterance: paste the available frames
        # into a buffer pre-filled with padding_value
        shape = list(l_array.shape)
        shape[0] = l_sl + l_pad * 2
        local = numpy.ones(shape=shape, dtype=l_array.dtype) * padding_value
        if l_start < 0:
            p_start = -l_start
            l_start = 0
        else:
            p_start = 0
        if l_end > l_length:
            p_end = l_sl + l_pad * 2 - (l_end - l_length)
            l_end = l_length
        else:
            p_end = l_sl + l_pad * 2
        local[p_start:p_end] = l_array[l_start:l_end]
    else:
        local = l_array[l_start:l_end]

    if local_mask_max_second > 0 and local_mask_num > 0:
        # the unpadded crop is a view of the stored feature array; mask a
        # copy so the zeros do not leak into later samples
        local = local.copy()
        # cap the span at the crop length; a window that rounds to zero
        # frames leaves nothing to mask
        max_mask_length = min(int(l_rate * local_mask_max_second), len(local) + 1)
        if max_mask_length > 0:
            for _ in range(local_mask_num):
                mask_length = numpy.random.randint(max_mask_length)
                mask_offset = numpy.random.randint(len(local) - mask_length + 1)
                local[mask_offset : mask_offset + mask_length] = 0

    return wave, silence, local
def extract_input(
    sampling_length: int,
    wave_data: Wave,
    silence_data: SamplingData,
    local_data: SamplingData,
    local_sampling_rate: Optional[int],
    local_padding_length: int,
    min_not_silence_length: int,
    mulaw: bool,
    padding_value=0,
):
    """
    Cut one random training window out of an utterance.

    The window start is drawn on the local-feature frame grid and redrawn
    (at most 10000 times) until at least ``min_not_silence_length`` of the
    matching silence flags are unset.  The local features are cropped
    around the same window with ``local_padding_length`` waveform samples
    of context on each side; frames outside the utterance are filled with
    ``padding_value``.

    :param sampling_length: window length in waveform samples
    :param wave_data: object exposing ``wave`` and ``sampling_rate``
    :param silence_data: silence flags; read through
        ``resample(rate, index=..., length=...)``
    :param local_data: local features (``array``, ``rate``, ``resample``)
    :param local_sampling_rate: rate to resample the local features to,
        or None to use ``local_data.array`` at ``local_data.rate``
    :param local_padding_length: context on each side of the local crop,
        in waveform samples; must be a multiple of the wave/local scale
    :param min_not_silence_length: minimum number of non-silent samples a
        window needs to be accepted
    :param mulaw: when true the wave crop is passed through ``encode_mulaw``
    :param padding_value: fill value for local frames outside the utterance
    :return: dict with
        wave: (sampling_length, )
        local: the local crop transposed (the time axis is moved last)
    :raises Exception: if no acceptable window is found in 10000 draws
    """
    sr = wave_data.sampling_rate
    sl = sampling_length

    if local_sampling_rate is None:
        l_rate = local_data.rate
        l_array = local_data.array
    else:
        l_rate = local_sampling_rate
        l_array = local_data.resample(l_rate)

    assert sr % l_rate == 0
    # number of waveform samples covered by one local frame
    l_scale = int(sr // l_rate)

    length = len(l_array) * l_scale
    assert (
        abs(length - len(wave_data.wave)) < l_scale * 4
    ), f"{abs(length - len(wave_data.wave))} {l_scale}"

    assert local_padding_length % l_scale == 0
    l_pad = local_padding_length // l_scale

    l_length = length // l_scale
    l_sl = sl // l_scale

    # The waveform may be a few frames shorter than the local features, so
    # window starts are limited to frames that both of them cover.
    l_valid = min(l_length, len(wave_data.wave) // l_scale)

    for _ in range(10000):
        if l_valid > l_sl:
            # randint's upper bound is exclusive; "+ 1" makes the last
            # full window reachable
            l_offset = numpy.random.randint(l_valid - l_sl + 1)
        else:
            l_offset = 0
        offset = l_offset * l_scale

        silence = numpy.squeeze(silence_data.resample(sr, index=offset, length=sl))
        if (~silence).sum() >= min_not_silence_length:
            break
    else:
        raise Exception("cannot pick not silence data")

    wave = wave_data.wave[offset : offset + sl]
    if mulaw:
        wave = encode_mulaw(wave)

    # local
    l_start, l_end = l_offset - l_pad, l_offset + l_sl + l_pad
    if l_start < 0 or l_end > l_length:
        # the crop sticks out of the utterance: paste the available frames
        # into a buffer pre-filled with padding_value
        shape = list(l_array.shape)
        shape[0] = l_sl + l_pad * 2
        local = numpy.ones(shape=shape, dtype=l_array.dtype) * padding_value
        if l_start < 0:
            p_start = -l_start
            l_start = 0
        else:
            p_start = 0
        if l_end > l_length:
            p_end = l_sl + l_pad * 2 - (l_end - l_length)
            l_end = l_length
        else:
            p_end = l_sl + l_pad * 2
        local[p_start:p_end] = l_array[l_start:l_end]
    else:
        local = l_array[l_start:l_end]

    return dict(
        wave=wave,
        local=local.T,  # (C, T)
    )
def extract_input(
    sampling_length: int,
    wave_data: Wave,
    silence_data: SamplingData,
    f0_data: SamplingData,
    phoneme_data: SamplingData,
    min_not_silence_length: int,
    with_mic_augment: bool,
    time_mask_max_second: float,
    time_mask_num: int,
):
    """
    Cut one random training window of wave, f0 and phoneme labels.

    f0 and phoneme features are merged with ``SamplingData.collect`` at the
    higher of their two rates.  A window start is drawn on that frame grid
    and redrawn (at most 10000 times) until at least
    ``min_not_silence_length`` silence flags in the window are unset.  If
    the utterance is shorter than ``sampling_length`` the outputs are
    zero-padded, with the padding split at random between front and back.
    Optionally the wave is passed through ``mic_augment`` and random spans
    of it are zeroed.

    :param sampling_length: window length in waveform samples; must be a
        multiple of the wave/feature scale
    :param wave_data: object exposing ``wave`` and ``sampling_rate``
    :param silence_data: silence flags; read through
        ``resample(rate, index=..., length=...)``
    :param f0_data: f0 features; becomes column 0 of the merged array
    :param phoneme_data: phoneme features; becomes the remaining columns,
        reduced to an index with ``argmax``
    :param min_not_silence_length: minimum number of non-silent samples a
        window needs to be accepted
    :param with_mic_augment: apply ``mic_augment`` to the wave crop
    :param time_mask_max_second: exclusive upper bound, in seconds, of one
        masked span; masking is skipped unless this is positive
    :param time_mask_num: number of masked spans to draw
    :return: dict with keys ``wave``, ``f0``, ``phoneme`` and ``padded``
        (bool, True on frames added as padding)
    :raises Exception: if no acceptable window is found in 10000 draws
    """
    rate = wave_data.sampling_rate
    sl = sampling_length

    l_rate = max(f0_data.rate, phoneme_data.rate)
    assert rate % l_rate == 0
    # number of waveform samples covered by one feature frame
    l_scale = int(rate // l_rate)
    assert sl % l_scale == 0

    local = SamplingData.collect(
        [f0_data, phoneme_data],
        rate=l_rate,
        mode="min",
        error_time_length=0.015,
    )
    f0_array = local[:, 0]
    phoneme_array = local[:, 1:]

    assert numpy.abs(len(local) * l_scale - len(wave_data.wave)) < l_scale * 4
    # usable length: covered by both streams and a whole number of frames
    length = min(len(local) * l_scale, len(wave_data.wave) // l_scale * l_scale)

    if sl > length:
        # utterance shorter than the requested window: take all of it and
        # pad the rest afterwards
        pad = sl - length
        sl = length
    else:
        pad = 0

    l_length = length // l_scale
    l_sl = sl // l_scale
    l_pad = pad // l_scale

    for _ in range(10000):
        if l_length > l_sl:
            # randint's upper bound is exclusive; "+ 1" makes the last
            # full window reachable
            l_offset = numpy.random.randint(l_length - l_sl + 1)
        else:
            l_offset = 0
        offset = l_offset * l_scale

        silence = numpy.squeeze(silence_data.resample(rate, index=offset, length=sl))
        if (~silence).sum() >= min_not_silence_length:
            break
    else:
        raise Exception("cannot pick not silence data")

    wave = wave_data.wave[offset : offset + sl]
    f0 = numpy.squeeze(f0_array[l_offset : l_offset + l_sl])
    phoneme = numpy.argmax(phoneme_array[l_offset : l_offset + l_sl], axis=1)
    padded = numpy.zeros_like(f0, dtype=bool)

    if l_pad > 0:
        l_pre = numpy.random.randint(l_pad + 1)
        l_post = l_pad - l_pre
        f0 = numpy.pad(f0, [l_pre, l_post])
        phoneme = numpy.pad(phoneme, [l_pre, l_post])
        padded = numpy.pad(padded, [l_pre, l_post], constant_values=True)

        pre, post = int(l_pre * l_scale), int(l_post * l_scale)
        wave = numpy.pad(wave, [pre, post])

    if with_mic_augment:
        wave = mic_augment(wave, sampling_rate=rate)

    if time_mask_max_second > 0 and time_mask_num > 0:
        # without padding/augmentation the crop is still a view of
        # wave_data.wave; mask a copy so the stored audio is not zeroed
        wave = wave.copy()
        # cap the span at the crop length; a window that rounds to zero
        # samples leaves nothing to mask
        max_mask_length = min(int(rate * time_mask_max_second), len(wave) + 1)
        if max_mask_length > 0:
            for _ in range(time_mask_num):
                mask_length = numpy.random.randint(max_mask_length)
                mask_offset = numpy.random.randint(len(wave) - mask_length + 1)
                wave[mask_offset : mask_offset + mask_length] = 0

    return dict(
        wave=wave,
        f0=f0,
        phoneme=phoneme,
        padded=padded,
    )
def extract_input(
    sampling_length: int,
    wave_data: Wave,
    silence_data: SamplingData,
    local_data: SamplingData,
    local_padding_length: int,
    min_not_silence_length: int,
    f0_index: int,
    volume_index: Optional[int],
    harmonic_num: int,
    only_noise_source: bool,
    padding_value=0,
):
    """
    Cut one random training window and build its excitation source signals.

    The window start is drawn on the local-feature frame grid and redrawn
    (at most 10000 times) until at least ``min_not_silence_length`` of the
    matching silence flags are unset.  The local features are cropped
    around the same window with ``local_padding_length`` waveform samples
    of context on each side; frames outside the utterance are filled with
    ``padding_value``.  The f0 column (and optionally the volume column) of
    the unpadded part of that crop is then fed to ``generate_source``.

    :param sampling_length: window length in waveform samples
    :param wave_data: object exposing ``wave`` and ``sampling_rate``
    :param silence_data: silence flags; read through
        ``resample(rate, index=..., length=...)``
    :param local_data: local features (``array`` and ``rate`` are read)
    :param local_padding_length: context on each side of the local crop,
        in waveform samples; must be a multiple of the wave/local scale
    :param min_not_silence_length: minimum number of non-silent samples a
        window needs to be accepted
    :param f0_index: column of the local features passed as ``log_f0``
    :param volume_index: column passed as ``volume``, or None for no volume
    :param harmonic_num: forwarded to ``generate_source``
    :param only_noise_source: replace ``log_f0`` by zeros before generating
    :param padding_value: fill value for local frames outside the utterance
    :return: dict with
        wave: (sampling_length, )
        silence: (sampling_length, )
        local: (sampling_length // scale + pad, )
        source, signal: the two values returned by ``generate_source``
        source2: first value of a second, identical ``generate_source`` call
    :raises Exception: if no acceptable window is found in 10000 draws
    """
    sr = wave_data.sampling_rate
    sl = sampling_length

    assert sr % local_data.rate == 0
    # number of waveform samples covered by one local frame
    l_scale = int(sr // local_data.rate)

    length = len(local_data.array) * l_scale
    assert (
        abs(length - len(wave_data.wave)) < l_scale * 4
    ), f"{abs(length - len(wave_data.wave))} {l_scale}"

    assert local_padding_length % l_scale == 0
    l_pad = local_padding_length // l_scale

    l_length = length // l_scale
    l_sl = sl // l_scale

    # The waveform may be a few frames shorter than the local features, so
    # window starts are limited to frames that both of them cover.
    l_valid = min(l_length, len(wave_data.wave) // l_scale)

    for _ in range(10000):
        if l_valid > l_sl:
            # randint's upper bound is exclusive; "+ 1" makes the last
            # full window reachable
            l_offset = numpy.random.randint(l_valid - l_sl + 1)
        else:
            l_offset = 0
        offset = l_offset * l_scale

        silence = numpy.squeeze(silence_data.resample(sr, index=offset, length=sl))
        if (~silence).sum() >= min_not_silence_length:
            break
    else:
        raise Exception("cannot pick not silence data")

    wave = wave_data.wave[offset : offset + sl]

    # local
    l_start, l_end = l_offset - l_pad, l_offset + l_sl + l_pad
    if l_start < 0 or l_end > l_length:
        # the crop sticks out of the utterance: paste the available frames
        # into a buffer pre-filled with padding_value
        shape = list(local_data.array.shape)
        shape[0] = l_sl + l_pad * 2
        local = (
            numpy.ones(shape=shape, dtype=local_data.array.dtype) * padding_value
        )
        if l_start < 0:
            p_start = -l_start
            l_start = 0
        else:
            p_start = 0
        if l_end > l_length:
            p_end = l_sl + l_pad * 2 - (l_end - l_length)
            l_end = l_length
        else:
            p_end = l_sl + l_pad * 2
        local[p_start:p_end] = local_data.array[l_start:l_end]
    else:
        local = local_data.array[l_start:l_end]

    # source module: drop the context frames so the source inputs line up
    # with the wave window
    core = local[l_pad:-l_pad] if l_pad > 0 else local

    log_f0 = core[:, f0_index]
    if only_noise_source:
        log_f0 = numpy.zeros_like(log_f0)

    volume = None
    if volume_index is not None:
        volume = core[:, volume_index]

    source, signal = generate_source(
        log_f0=log_f0,
        volume=volume,
        local_rate=int(local_data.rate),
        sampling_rate=sr,
        harmonic_num=harmonic_num,
    )
    # NOTE(review): same arguments as the call above; presumably
    # generate_source is stochastic so source2 differs from source - confirm
    source2, _ = generate_source(
        log_f0=log_f0,
        volume=volume,
        local_rate=int(local_data.rate),
        sampling_rate=sr,
        harmonic_num=harmonic_num,
    )

    return dict(
        wave=wave,
        silence=silence,
        local=local,
        source=source,
        source2=source2,
        signal=signal,
    )
def extract_input(
    sampling_length: int,
    f0_data: SamplingData,
    phoneme_data: SamplingData,
    spec_data: SamplingData,
    silence_data: SamplingData,
    phoneme_list_data: Optional[List[BasePhoneme]],
    volume_data: Optional[SamplingData],
    f0_process_mode: F0ProcessMode,
    time_mask_max_second: float,
    time_mask_num: int,
):
    """
    Cut one random training window of f0, phoneme, spectrogram and silence.

    All streams are resampled to ``spec_data.rate``.  Unless
    ``f0_process_mode`` is ``F0ProcessMode.normal``, f0 is first replaced
    by the result of ``f0_mean`` over segments split at phoneme ends (all
    phonemes, or only those in ``mora_phoneme_list``).  A window start is
    then drawn and redrawn (at most 10000 times) until the silence flags in
    the window are not all set.  If the utterance is shorter than
    ``sampling_length`` the outputs are padded, with the padding split at
    random between front and back.  Optionally random spans of f0 and
    phoneme are zeroed.

    :param sampling_length: window length in frames at ``spec_data.rate``
    :param f0_data: f0 features
    :param phoneme_data: phoneme features
    :param spec_data: spectrogram; its ``rate`` is the common frame rate
    :param silence_data: silence flags
    :param phoneme_list_data: phoneme segments (``start``, ``end``,
        ``phoneme``); required unless the mode is ``normal``
    :param volume_data: optional volume, used as the weight for ``f0_mean``
    :param f0_process_mode: how f0 is pre-processed
    :param time_mask_max_second: exclusive upper bound, in seconds, of one
        masked span; masking is skipped unless this is positive
    :param time_mask_num: number of masked spans to draw
    :return: dict with ``f0``, ``phoneme`` and ``spec`` as float32, plus
        ``silence`` and ``padded`` (set to True on frames added as padding)
    :raises Exception: if no non-silent window is found in 10000 draws
    """
    rate = spec_data.rate

    f0 = f0_data.resample(rate)
    phoneme = phoneme_data.resample(rate)
    silence = silence_data.resample(rate)
    volume = volume_data.resample(rate) if volume_data is not None else None
    spec = spec_data.array

    assert numpy.abs(len(spec) - len(f0)) < 5
    assert numpy.abs(len(spec) - len(phoneme)) < 5
    assert numpy.abs(len(spec) - len(silence)) < 5
    # check the volume stream itself, not silence a second time
    assert volume is None or numpy.abs(len(spec) - len(volume)) < 5

    length = min(len(spec), len(f0), len(phoneme), len(silence))
    if volume is not None:
        length = min(length, len(volume))

    if f0_process_mode != F0ProcessMode.normal:
        assert phoneme_list_data is not None
        weight = volume

        if f0_process_mode == F0ProcessMode.phoneme_mean:
            split_second_list = [p.end for p in phoneme_list_data[:-1]]
        else:
            split_second_list = [
                p.end
                for p in phoneme_list_data[:-1]
                if p.phoneme in mora_phoneme_list
            ]

        if f0_process_mode == F0ProcessMode.voiced_mora_mean:
            if weight is None:
                weight = numpy.ones_like(f0)

            # phonemes outside voiced_phoneme_list get zero weight
            for p in phoneme_list_data:
                if p.phoneme not in voiced_phoneme_list:
                    weight[int(p.start * rate) : int(p.end * rate)] = 0

        f0 = f0[:length]
        # without volume data (and outside voiced_mora_mean) there is no
        # weight array to trim; hand None on unchanged
        # NOTE(review): assumes f0_mean accepts weight=None - confirm
        if weight is not None:
            weight = weight[:length]

        f0 = f0_mean(
            f0=f0,
            rate=rate,
            split_second_list=split_second_list,
            weight=weight,
        )

    if sampling_length > length:
        # utterance shorter than the requested window: take all of it and
        # pad the rest afterwards
        padding_length = sampling_length - length
        sampling_length = length
    else:
        padding_length = 0

    for _ in range(10000):
        # randint's upper bound is exclusive, so "+ 1" makes the last full
        # window reachable; any utterance longer than one window is sampled
        if length > sampling_length:
            offset = numpy.random.randint(length - sampling_length + 1)
        else:
            offset = 0
        s = numpy.squeeze(silence[offset : offset + sampling_length])
        if not s.all():
            break
    else:
        raise Exception("cannot pick not silence data")

    if silence.ndim == 2:
        silence = numpy.squeeze(silence, axis=1)

    f0 = f0[offset : offset + sampling_length]
    phoneme = phoneme[offset : offset + sampling_length]
    spec = spec[offset : offset + sampling_length]
    silence = silence[offset : offset + sampling_length]
    padded = numpy.zeros_like(silence)

    if padding_length > 0:
        pre = numpy.random.randint(padding_length + 1)
        post = padding_length - pre
        f0 = numpy.pad(f0, [[pre, post], [0, 0]])
        phoneme = numpy.pad(phoneme, [[pre, post], [0, 0]])
        spec = numpy.pad(spec, [[pre, post], [0, 0]])
        silence = numpy.pad(silence, [pre, post], constant_values=True)
        padded = numpy.pad(padded, [pre, post], constant_values=True)

    if time_mask_max_second > 0 and time_mask_num > 0:
        # the crops may still be views of the resampled arrays; mask copies
        # so the zeros stay local to this sample
        f0 = f0.copy()
        phoneme = phoneme.copy()
        # cap the span at the crop length; a window that rounds to zero
        # frames leaves nothing to mask
        max_mask_length = min(int(rate * time_mask_max_second), len(f0) + 1)
        if max_mask_length > 0:
            for _ in range(time_mask_num):
                mask_length = numpy.random.randint(max_mask_length)
                mask_offset = numpy.random.randint(len(f0) - mask_length + 1)
                f0[mask_offset : mask_offset + mask_length] = 0
                phoneme[mask_offset : mask_offset + mask_length] = 0

    return dict(
        f0=f0.astype(numpy.float32),
        phoneme=phoneme.astype(numpy.float32),
        spec=spec.astype(numpy.float32),
        silence=silence,
        padded=padded,
    )
def extract_input(
    sampling_length: int,
    wave_data: Wave,
    silence_data: SamplingData,
    local_data: SamplingData,
    local_padding_size: int,
    padding_value=0,
):
    """
    Cut one random training window out of an utterance.

    The window start is drawn on the local-feature frame grid and redrawn
    (at most 10000 times) until the matching silence flags are not all set.
    The local features are cropped around the same window with
    ``local_padding_size`` waveform samples of context on each side; frames
    outside the utterance are filled with ``padding_value``.

    :param sampling_length: window length in waveform samples
    :param wave_data: object exposing ``wave`` and ``sampling_rate``
    :param silence_data: silence flags; read through
        ``resample(rate, index=..., length=...)``
    :param local_data: local features (``array`` and ``rate`` are read)
    :param local_padding_size: context on each side of the local crop, in
        waveform samples; must be a multiple of the wave/local scale
    :param padding_value: fill value for local frames outside the utterance
    :return:
        wave: (sampling_length, )
        silence: (sampling_length, )
        local: (sampling_length // scale + pad, )
    :raises Exception: if no non-silent window is found in 10000 draws
    """
    sr = wave_data.sampling_rate
    sl = sampling_length

    assert sr % local_data.rate == 0
    # number of waveform samples covered by one local frame
    l_scale = int(sr // local_data.rate)

    length = len(local_data.array) * l_scale
    assert (
        abs(length - len(wave_data.wave)) < l_scale * 4
    ), f"{abs(length - len(wave_data.wave))} {l_scale}"

    assert local_padding_size % l_scale == 0
    l_pad = local_padding_size // l_scale

    l_length = length // l_scale
    l_sl = sl // l_scale

    # The waveform may be a few frames shorter than the local features, so
    # window starts are limited to frames that both of them cover.
    l_valid = min(l_length, len(wave_data.wave) // l_scale)

    for _ in range(10000):
        if l_valid > l_sl:
            # randint's upper bound is exclusive; "+ 1" makes the last
            # full window reachable
            l_offset = np.random.randint(l_valid - l_sl + 1)
        else:
            # a single window (or less) fits: randint would raise on a
            # non-positive bound, so start at the beginning
            l_offset = 0
        offset = l_offset * l_scale

        silence = np.squeeze(silence_data.resample(sr, index=offset, length=sl))
        if not silence.all():
            break
    else:
        raise Exception("cannot pick not silence data")

    wave = wave_data.wave[offset : offset + sl]

    # local
    l_start, l_end = l_offset - l_pad, l_offset + l_sl + l_pad
    if l_start < 0 or l_end > l_length:
        # the crop sticks out of the utterance: paste the available frames
        # into a buffer pre-filled with padding_value
        shape = list(local_data.array.shape)
        shape[0] = l_sl + l_pad * 2
        local = np.ones(shape=shape, dtype=local_data.array.dtype) * padding_value
        if l_start < 0:
            p_start = -l_start
            l_start = 0
        else:
            p_start = 0
        if l_end > l_length:
            p_end = l_sl + l_pad * 2 - (l_end - l_length)
            l_end = l_length
        else:
            p_end = l_sl + l_pad * 2
        local[p_start:p_end] = local_data.array[l_start:l_end]
    else:
        local = local_data.array[l_start:l_end]

    return wave, silence, local