def generate(self):
    """Materialize every referenced file into a single Input bundle."""
    loaded_wave = Wave.load(self.path_wave)
    loaded_silence = SamplingData.load(self.path_silence)
    loaded_f0 = SamplingData.load(self.path_f0)
    loaded_phoneme = SamplingData.load(self.path_phoneme)
    return Input(
        wave=loaded_wave,
        silence=loaded_silence,
        f0=loaded_f0,
        phoneme=loaded_phoneme,
    )
def __getitem__(self, i: int):
    """Synthesize one random training example.

    Builds a sine wave whose frequency is drawn uniformly from
    ``self.frequency_range`` with a random phase, a local conditioning
    feature holding the constant log-frequency, and an all-False silence
    mask, then converts them through ``self.make_input``.
    """
    sampling_rate = self.sampling_rate
    length = self.sampling_length
    frequency = numpy.random.uniform(self.frequency_range[0], self.frequency_range[1])
    rand = numpy.random.rand()  # random phase offset, in cycles
    wave = numpy.sin(
        (2 * numpy.pi)
        * (numpy.arange(length, dtype=numpy.float32) * frequency / sampling_rate + rand)
    )
    local = numpy.log(
        numpy.ones(shape=(length // self.local_scale, 1), dtype=numpy.float32)
        * frequency
    )
    # BUGFIX: the alias ``numpy.bool`` was removed in NumPy 1.24; the
    # builtin ``bool`` is the documented replacement.
    silence = numpy.zeros(shape=(length,), dtype=bool)
    return default_convert(
        self.make_input(
            wave_data=Wave(wave=wave, sampling_rate=sampling_rate),
            silence_data=SamplingData(array=silence, rate=sampling_rate),
            local_data=SamplingData(array=local, rate=sampling_rate // self.local_scale),
        )
    )
def test_extract_input_with_dataset(
    sampling_length: int,
    f0_path: Path,
    phoneme_path: Path,
    phoneme_list_path: Path,
    silence_path: Path,
    spectrogram_path: Path,
    volume_path: Path,
    f0_process_mode: F0ProcessMode,
    time_mask_max_second: float,
    time_mask_num: int,
):
    """FeatureDataset.extract_input should run cleanly on real dataset files."""
    f0_data = SamplingData.load(f0_path)
    phoneme_data = SamplingData.load(phoneme_path)
    phoneme_list_data = JvsPhoneme.load_julius_list(phoneme_list_path)
    silence_data = SamplingData.load(silence_path)
    spec_data = SamplingData.load(spectrogram_path)
    loaded_volume = SamplingData.load(volume_path)
    FeatureDataset.extract_input(
        sampling_length=sampling_length,
        f0_data=f0_data,
        phoneme_data=phoneme_data,
        spec_data=spec_data,
        silence_data=silence_data,
        phoneme_list_data=phoneme_list_data,
        volume_data=loaded_volume,
        f0_process_mode=f0_process_mode,
        time_mask_max_second=time_mask_max_second,
        time_mask_num=time_mask_num,
    )
def test_extract_input(sampling_length: int, data_length: int, padding_length: int):
    """Padding slots must be NaN while the core keeps enough real values."""
    silence_data = SamplingData(array=numpy.zeros(data_length, dtype=bool), rate=1)
    spectrogram_data = SamplingData(
        array=numpy.linspace(start=1, stop=2, num=data_length)[:, numpy.newaxis],
        rate=1,
    )
    total_length = sampling_length + padding_length * 2
    minimum_real = min(sampling_length, data_length)
    for _ in range(100):
        spectrogram = extract_input(
            sampling_length=sampling_length,
            spectrogram_data=spectrogram_data,
            silence_data=silence_data,
            min_not_silence_length=minimum_real,
            padding_length=padding_length,
            padding_value=numpy.nan,
        )["spectrogram"]
        assert len(spectrogram) == total_length

        nan_count = numpy.isnan(spectrogram).sum()
        if sampling_length <= data_length:
            assert nan_count <= padding_length * 2
        else:
            assert nan_count == sampling_length - data_length + padding_length * 2

        if padding_length == 0:
            core = spectrogram
        else:
            core = spectrogram[padding_length:-padding_length]
        assert (~numpy.isnan(core)).sum() >= minimum_real
def test_convert_to_dict(self):
    """convert_input should emit per-key arrays of the documented lengths."""
    sampling_rate = 800
    local_sampling_rate = 200
    scale = sampling_rate // local_sampling_rate  # wave samples per local frame
    time_length = 10
    sampling_length = 16
    wave_data = Wave(
        wave=numpy.linspace(
            0,
            sampling_rate * time_length,
            sampling_rate * time_length,
            endpoint=False,
        ),
        sampling_rate=sampling_rate,
    )
    silence_data = SamplingData(
        array=numpy.zeros((sampling_rate * time_length, ), dtype=bool),
        rate=sampling_rate,
    )
    local_data = SamplingData(
        array=numpy.linspace(
            0,
            sampling_rate * time_length,
            local_sampling_rate * time_length,
            endpoint=False,
        ),
        rate=local_sampling_rate,
    )
    wave, silence, local = BaseWaveDataset.extract_input(
        sampling_length,
        wave_data=wave_data,
        silence_data=silence_data,
        local_data=local_data,
        local_sampling_rate=local_sampling_rate,
        local_padding_size=0,
        local_mask_max_second=0,
        local_mask_num=0,
    )
    dataset = BaseWaveDataset(
        sampling_rate=sampling_rate,
        sampling_length=sampling_length,
        bit=10,
        mulaw=False,
        wave_random_max_second=0,
        wave_random_num=0,
        local_sampling_rate=local_sampling_rate,
        local_padding_size=0,
        local_mask_max_second=0,
        local_mask_num=0,
    )
    d = dataset.convert_input(wave, silence, local)
    self.assertEqual(len(d["coarse"]), sampling_length)
    self.assertEqual(len(d["encoded_coarse"]), sampling_length)
    # silence is one element shorter than the wave window
    # (presumably aligned to sample transitions — confirm in convert_input)
    self.assertEqual(len(d["silence"]), sampling_length - 1)
    self.assertEqual(len(d["local"]), sampling_length // scale)
def test_extract_input(self):
    """extract_input must return aligned windows at wave and local rates."""
    for sampling_rate, local_sampling_rate, sampling_length, time_length in [
        [800, 200, 16, 10],
        [24000, 24000 / 256, 1024, 100],
    ]:
        with self.subTest(
            sampling_rate=sampling_rate,
            local_sampling_rate=local_sampling_rate,
            sampling_length=sampling_length,
            time_length=time_length,
        ):
            scale = sampling_rate // local_sampling_rate  # wave samples per local frame
            wave_data = Wave(
                wave=numpy.linspace(
                    0,
                    int(sampling_rate * time_length),
                    int(sampling_rate * time_length),
                    endpoint=False,
                ),
                sampling_rate=sampling_rate,
            )
            silence_data = SamplingData(
                array=numpy.zeros((sampling_rate * time_length, ), dtype=bool),
                rate=sampling_rate,
            )
            local_data = SamplingData(
                array=numpy.linspace(
                    0,
                    int(sampling_rate * time_length),
                    int(local_sampling_rate * time_length),
                    endpoint=False,
                ),
                rate=local_sampling_rate,
            )
            for _ in range(10):
                wave, silence, local = BaseWaveDataset.extract_input(
                    sampling_length,
                    wave_data=wave_data,
                    silence_data=silence_data,
                    local_data=local_data,
                    local_sampling_rate=local_sampling_rate,
                    local_padding_size=0,
                    local_mask_max_second=0,
                    local_mask_num=0,
                )
                self.assertEqual(len(wave), sampling_length)
                self.assertEqual(len(silence), sampling_length)
                self.assertEqual(len(local), sampling_length // scale)

                # both signals are the same ramp, so collapsing each
                # local-frame chunk of the wave by min must reproduce local
                wave_as_local = wave.reshape(int(sampling_length // scale), -1).min(axis=1)
                self.assertTrue(numpy.all(wave_as_local == local))
def generate(self):
    """Load all feature files into an Input; optional fields become None."""
    if self.phoneme_list_path is not None:
        phoneme_list = self.phoneme_class.load_julius_list(self.phoneme_list_path)
    else:
        phoneme_list = None

    if self.volume_path is not None:
        volume = SamplingData.load(self.volume_path)
    else:
        volume = None

    return Input(
        f0=SamplingData.load(self.f0_path),
        phoneme=SamplingData.load(self.phoneme_path),
        spec=SamplingData.load(self.spec_path),
        silence=SamplingData.load(self.silence_path),
        phoneme_list=phoneme_list,
        volume=volume,
    )
def setUp(self):
    """Prepare two constant half-length waves (-1 and +1) as test inputs."""
    half = self.num // 2
    waves = [np.ones(half) * -1, np.ones(half)]

    self.inputs = []
    for w in waves:
        self.inputs.append(
            Input(
                wave=Wave(wave=w, sampling_rate=self.sampling_rate),
                local=SamplingData(array=np.empty((len(w), 0)), rate=self.sampling_rate),
                silence=SamplingData(
                    array=np.zeros((len(w),), dtype=bool),
                    rate=self.sampling_rate,
                ),
            )
        )
def generate_dataset(
    dataset_directory: Path,
    data_num: int,
    f0_rate: int,
    phoneme_rate: int,
    phoneme_size: int,
    speaker_size: int,
):
    """Create a synthetic f0/phoneme/phoneme-list dataset on disk.

    Writes ``f0/``, ``phoneme/`` and ``phoneme_list/`` subdirectories plus a
    ``speaker_dict.json`` mapping speaker number to its data names. Existing
    files in the directory are removed first.
    """
    if dataset_directory.exists():
        # wipe stale files but keep the directory tree itself
        for p in dataset_directory.rglob("*"):
            if not p.is_dir():
                p.unlink()
    else:
        dataset_directory.mkdir()

    f0_dir = dataset_directory.joinpath("f0")
    phoneme_dir = dataset_directory.joinpath("phoneme")
    phoneme_list_dir = dataset_directory.joinpath("phoneme_list")
    f0_dir.mkdir(exist_ok=True)
    phoneme_dir.mkdir(exist_ok=True)
    phoneme_list_dir.mkdir(exist_ok=True)

    speaker_dict = defaultdict(list)
    for i_data in range(data_num):
        # round-robin speaker assignment
        speaker_num = i_data % speaker_size
        speaker_dict[str(speaker_num)].append(str(i_data))

        source_length = int(numpy.random.randint(low=10, high=20))
        phoneme_list = numpy.random.randint(
            low=0, high=phoneme_size, size=source_length, dtype=numpy.int32
        )
        phoneme_list_dir.joinpath(f"{i_data}.lab").write_text(
            "\n".join(f"0 0 {JvsPhoneme.phoneme_list[p]}" for p in phoneme_list)
        )

        # f0 depends on phoneme id and speaker so the mapping is learnable
        f0 = phoneme_list.astype(numpy.float32) / 10 + 0.2 + speaker_num / 100
        f0 = numpy.repeat(f0, (phoneme_list + 1) * (f0_rate // phoneme_rate))
        f0[::5] = 0  # inject unvoiced frames
        SamplingData(array=f0, rate=f0_rate).save(f0_dir.joinpath(f"{i_data}.npy"))

        phoneme = numpy.repeat(phoneme_list, phoneme_list + 1)
        phoneme = numpy.identity(phoneme_size, dtype=numpy.int32)[phoneme]  # one-hot
        SamplingData(array=phoneme, rate=phoneme_rate).save(
            phoneme_dir.joinpath(f"{i_data}.npy")
        )

    # BUGFIX: the handle from .open("w") was never closed; use a context
    # manager so the JSON is flushed and closed deterministically.
    with dataset_directory.joinpath("speaker_dict.json").open("w") as f:
        json.dump(speaker_dict, f)
def convert_f0(
    model_config: Path,
    input_glob: str,
    input_f0_statistics: Path,
    target_f0_statistics: Path,
    output_dir: Path,
):
    """Shift the f0 channel of every matched feature file by the difference
    between the target and input speakers' mean f0 statistics."""
    output_dir.mkdir(exist_ok=True)
    # NOTE: locals() here must be exactly the argument dict — keep this call
    # before any extra local variables are introduced.
    save_arguments(output_dir / "arguments.yaml", convert_f0, locals())

    config = Config.from_dict(yaml.safe_load(model_config.open()))

    # statistics files are pickled dicts containing at least a "mean" entry
    input_stat = numpy.load(input_f0_statistics, allow_pickle=True).item()
    target_stat = numpy.load(target_f0_statistics, allow_pickle=True).item()

    paths = list(map(Path, glob(input_glob)))
    for p in tqdm(paths, desc="convert_f0"):
        data = SamplingData.load(p)

        # locate the f0 column from one of the two known feature layouts
        if data.array.shape[1] == (config.network.voiced_feature_size + 1 +
                                   config.network.phoneme_feature_size):
            f0_index = config.network.voiced_feature_size
        elif data.array.shape[1] == (1 + 1 + 40):
            # legacy layout — presumably voiced + f0 + 40 phoneme dims; confirm
            f0_index = 1
        else:
            raise ValueError(data.array.shape[1])

        data.array[:, f0_index] += target_stat["mean"] - input_stat["mean"]
        data.save(output_dir / (p.stem + ".npy"))
def process_wo_context(
    local_paths: Sequence[Path],
    speaker_nums: Optional[Sequence[int]],
    generator: Generator,
    postfix="_woc",
):
    """Generate a wave per local-feature file and save it as a wav.

    Best-effort: any failure is printed and swallowed so a batch run can
    continue with the remaining items. Relies on ``time_length``,
    ``sampling_policy`` and ``output_dir`` from the enclosing scope.
    """
    try:
        local_datas = [
            SamplingData.load(local_path) for local_path in local_paths
        ]

        # trim or edge-pad every local feature to one common length
        size = int((time_length + 5) * local_datas[0].rate)
        local_arrays = [
            local_data.array[:size] if len(local_data.array) >= size else np.pad(
                local_data.array,
                ((0, size - len(local_data.array)), (0, 0)),
                mode="edge",
            ) for local_data in local_datas
        ]

        waves = generator.generate(
            time_length=time_length,
            sampling_policy=sampling_policy,
            num_generate=len(local_arrays),
            local_array=np.stack(local_arrays),
            speaker_nums=speaker_nums,
        )
        for wave, local_path in zip(waves, local_paths):
            wave.save(output_dir / (local_path.stem + postfix + ".wav"))
    except Exception:
        # BUGFIX: was a bare ``except:``, which also swallowed
        # KeyboardInterrupt/SystemExit; those now propagate.
        import traceback
        traceback.print_exc()
def __getitem__(self, i):
    """Return one randomly chosen vowel-frame feature with its speaker id."""
    data = self.datas[i]
    features = SamplingData.load(data.input_path).array
    vowel = numpy.squeeze(SamplingData.load(data.vowel_path).array)
    speaker_num = data.speaker_num

    assert len(vowel) <= len(
        features), f'{data.input_path.stem} cannot be processed.'
    if abs(len(vowel) - len(features)) >= 10:
        warn(f'{data.input_path.stem} is not matched.')

    # keep only frames flagged as vowel, then pick one uniformly at random
    vowel_features = features[:len(vowel)][vowel]
    index = numpy.random.randint(len(vowel_features))

    return default_convert(dict(
        input=vowel_features[index],
        target=speaker_num,
    ))
def generate_and_save_data(
    feature_dir: Path,
    silence_dir: Path,
    wavelength: float,
    exponent: float,
    amplitude: float,
    length=300,
):
    """Generate one (feature, silence) pair and save both under one name."""
    feature, silence = generate_data(
        wavelength=wavelength,
        exponent=exponent,
        amplitude=amplitude,
        length=length,
    )
    # file name encodes the generation parameters
    filename = f"{wavelength}_{exponent}_{amplitude}.npy"
    for array, directory in ((feature, feature_dir), (silence, silence_dir)):
        SamplingData(array=array, rate=100).save(directory / filename)
def test_extract_input():
    """FeatureDataset.extract_input should accept consistent multi-rate data."""
    sampling_length = 10
    wave_rate = 24000
    wave_length = 256 * sampling_length
    second = wave_length / wave_rate

    f0_rate = 200
    phoneme_rate = 100
    spec_rate = wave_rate / 256
    silence_rate = 24000

    def ramp(rate):
        # monotone column vector covering `second` seconds at `rate`
        return numpy.arange(int(second * rate)).reshape(-1, 1).astype(numpy.float32)

    f0_data = SamplingData(array=ramp(f0_rate), rate=f0_rate)
    phoneme_data = SamplingData(array=ramp(phoneme_rate), rate=phoneme_rate)
    spec_data = SamplingData(array=ramp(spec_rate), rate=spec_rate)
    silence_data = SamplingData(
        array=numpy.zeros(int(second * silence_rate)).astype(bool),
        rate=silence_rate,
    )

    FeatureDataset.extract_input(
        sampling_length=sampling_length,
        f0_data=f0_data,
        phoneme_data=phoneme_data,
        spec_data=spec_data,
        silence_data=silence_data,
        phoneme_list_data=None,
        volume_data=None,
        f0_process_mode=F0ProcessMode.normal,
        time_mask_max_second=0,
        time_mask_num=0,
    )
def process_local_data(local_paths: Sequence[Path], time_length: float):
    """Load local features and force each to a common length (trim or edge-pad)."""
    local_datas = [SamplingData.load(p) for p in local_paths]
    size = int((time_length + 1) * local_datas[0].rate)

    local_arrays = []
    for data in local_datas:
        array = data.array
        if len(array) >= size:
            array = array[:size]
        else:
            # repeat the last frame to reach the target length
            array = np.pad(array, ((0, size - len(array)), (0, 0)), mode="edge")
        local_arrays.append(array)
    return local_arrays
def process(args: Tuple[int, Path], sampling_lengths: Sequence[int]):
    """For each window length, return the maximum count of non-silence frames
    inside any window of that length (via a box-filter convolution)."""
    i_data, path = args

    data = SamplingData.load(path)
    not_silence = ~numpy.squeeze(data.array)

    vector = numpy.empty(len(sampling_lengths), dtype=numpy.int32)
    for i_length, sampling_length in enumerate(sampling_lengths):
        kernel = numpy.ones(sampling_length, dtype=numpy.int32)
        vector[i_length] = numpy.convolve(kernel, not_silence, mode='valid').max()
    return i_data, vector
def generate(self):
    """Load all feature files into an Input.

    The four accent files share one on-disk format — whitespace-separated
    0/1 flags — so the parsing is factored into one helper instead of being
    repeated four times.
    """

    def _load_bool_array(path):
        # "0" -> False, "1" -> True, one flag per whitespace-separated token
        return numpy.array([bool(int(s)) for s in path.read_text().split()])

    return Input(
        phoneme_list=self.phoneme_class.load_julius_list(self.phoneme_list_path),
        start_accent_list=_load_bool_array(self.start_accent_list_path),
        end_accent_list=_load_bool_array(self.end_accent_list_path),
        start_accent_phrase_list=_load_bool_array(self.start_accent_phrase_list_path),
        end_accent_phrase_list=_load_bool_array(self.end_accent_phrase_list_path),
        f0=SamplingData.load(self.f0_path),
        volume=(SamplingData.load(self.volume_path)
                if self.volume_path is not None else None),
    )
def process(
    generator: Generator,
    local_paths: Sequence[Path],
    local_sampling_rate: Optional[int],
    time_length: float,
    speaker_nums: Optional[Sequence[int]],
    sampling_policy: SamplingPolicy,
    output_dir: Path,
    postfix="",
):
    """Generate a wave for each local-feature file and save it as a wav.

    Local features are optionally resampled to ``local_sampling_rate``,
    then trimmed or edge-padded to a common length before batching.
    """
    local_datas = [SamplingData.load(local_path) for local_path in local_paths]
    if local_sampling_rate is None:
        rate = local_datas[0].rate
        local_arrays = [l.array for l in local_datas]
    else:
        rate = local_sampling_rate
        local_arrays = [l.resample(rate) for l in local_datas]

    # BUGFIX: the arrays are sampled at ``rate`` after the optional resample,
    # so the target size must be computed from ``rate``, not from the first
    # file's original rate (identical when no resampling happens).
    size = int((time_length + 5) * rate)
    local_arrays = [
        l[:size]
        if len(l) >= size
        else numpy.pad(
            l,
            ((0, size - len(l)), (0, 0)),
            mode="edge",
        )
        for l in local_arrays
    ]

    waves = generator.generate(
        time_length=time_length,
        sampling_policy=sampling_policy,
        num_generate=len(local_arrays),
        local_array=numpy.stack(local_arrays),
        speaker_nums=speaker_nums,
    )
    for wave, local_path in zip(waves, local_paths):
        wave.save(output_dir / (local_path.stem + postfix + ".wav"))
def generate(self):
    """Return an Input, computing and caching the log-melspectrogram when the
    local feature file is missing or fails to load."""
    wave = Wave.load(self.path_wave)

    try:
        local = SamplingData.load(self.path_local)
    except Exception:
        # BUGFIX: was a bare ``except:``, which also swallowed
        # KeyboardInterrupt/SystemExit; a failed load now falls back only on
        # ordinary exceptions.
        local_rate = 80
        local_array = to_log_melspectrogram(wave=wave, rate=local_rate)
        local = SamplingData(array=local_array, rate=local_rate)

        # cache to a temp file so later calls load instead of recompute
        with NamedTemporaryFile(suffix=".npy", delete=False) as f:
            self.path_local = Path(f.name)
            local.save(self.path_local)

    return Input(
        wave=wave,
        silence=SamplingData.load(self.path_silence),
        local=local,
    )
def main():
    """Debug/visualization script: run the predictor on one random sample and
    plot the predicted coarse/fine distributions against the true wave."""
    model_dir: Path = arguments.model_dir
    model_iteration: int = arguments.model_iteration
    model_config: Path = arguments.model_config
    time_length: float = arguments.time_length
    gpu: int = arguments.gpu

    config = create_config(model_config)
    model_path = _get_predictor_model_path(model_dir, model_iteration)

    sr = config.dataset.sampling_rate

    model = create_predictor(config.model)
    chainer.serializers.load_npz(str(model_path), model)
    if gpu is not None:
        model.to_gpu(gpu)
        cuda.get_device_from_id(gpu).use()

    # inference mode: no dropout/batchnorm updates, no gradient graph
    chainer.global_config.train = False
    chainer.global_config.enable_backprop = False

    wave_paths = sorted([Path(p) for p in glob.glob(str(config.dataset.input_wave_glob))])
    local_paths = sorted([Path(p) for p in glob.glob(str(config.dataset.input_local_glob))])
    assert len(wave_paths) == len(local_paths)

    # identical seeds keep wave/local pairs aligned after shuffling
    np.random.RandomState(config.dataset.seed).shuffle(wave_paths)
    np.random.RandomState(config.dataset.seed).shuffle(local_paths)
    wave_path = wave_paths[0]
    local_path = local_paths[0]

    w_data = Wave.load(wave_path, sampling_rate=sr)
    l_data = SamplingData.load(local_path)

    length = int(sr * time_length)
    l_scale = int(sr // l_data.rate)  # wave samples per local frame
    l_sl = length // l_scale
    length = l_sl * l_scale  # truncate to a whole number of local frames

    w = w_data.wave[:length]
    l = l_data.array[:l_sl]
    coarse, fine = encode_16bit(w)

    c, f, hc, hf = model(
        c_array=decode_single(model.xp.asarray(coarse)).astype(np.float32)[np.newaxis],
        # fine is shifted by one sample relative to coarse, hence [:-1]
        f_array=decode_single(model.xp.asarray(fine)).astype(np.float32)[:-1][np.newaxis],
        l_array=model.xp.asarray(l)[np.newaxis],
    )

    c = chainer.functions.softmax(c)
    c = chainer.cuda.to_cpu(c[0].data)
    f = chainer.cuda.to_cpu(f[0].data)

    fig = plt.figure(figsize=[32 * time_length, 10])
    plt.imshow(c, aspect='auto', interpolation='nearest')
    plt.colorbar()
    plt.plot((w + 1) * 127.5, 'g', linewidth=0.1, label='true')
    plt.plot(np.argmax(c, axis=0) + np.argmax(f, axis=0) / 256, 'r', linewidth=0.1, label='predicted')
    plt.legend()
    fig.savefig('output.eps')
def test_extract_input_with_local_padding(self):
    """extract_input must NaN-pad local only outside the cropped window."""
    for sampling_rate, local_sampling_rate, sampling_length, time_length, local_padding_size in [
        [800, 200, 16, 1, 100],
        [24000, 24000 / 256, 1024, 4, 1024],
    ]:
        with self.subTest(
            sampling_rate=sampling_rate,
            local_sampling_rate=local_sampling_rate,
            sampling_length=sampling_length,
            time_length=time_length,
            local_padding_size=local_padding_size,
        ):
            scale = sampling_rate // local_sampling_rate  # wave samples per local frame
            wave_data = Wave(
                wave=np.linspace(
                    0,
                    int(sampling_rate * time_length),
                    int(sampling_rate * time_length),
                    endpoint=False,
                ),
                sampling_rate=sampling_rate,
            )
            silence_data = SamplingData(
                array=np.zeros((sampling_rate * time_length, ), dtype=bool),
                rate=sampling_rate,
            )
            local_data = SamplingData(
                array=np.linspace(
                    0,
                    int(sampling_rate * time_length),
                    int(local_sampling_rate * time_length),
                    endpoint=False,
                ),
                rate=local_sampling_rate,
            )
            for _ in range(10000):
                wave, silence, local = BaseWaveDataset.extract_input(
                    sampling_length,
                    wave_data=wave_data,
                    silence_data=silence_data,
                    local_data=local_data,
                    local_padding_size=local_padding_size,
                    padding_value=np.nan,
                )
                self.assertEqual(len(wave), sampling_length)
                self.assertEqual(len(silence), sampling_length)
                self.assertEqual(
                    len(local),
                    (sampling_length + local_padding_size * 2) // scale)

                # padding may appear on at most one side of the window
                num_pad = np.isnan(local).sum()
                self.assertLessEqual(num_pad, local_padding_size)
                self.assertTrue(not np.isnan(local[0]) or not np.isnan(local[-1]))

                # the unpadded core must equal the wave collapsed to local rate
                wave_as_local = wave.reshape(int(sampling_length // scale), -1).min(axis=1)
                pad = int(local_padding_size // scale)
                local_wo_pad = local[pad:-pad]
                self.assertTrue(np.all(wave_as_local == local_wo_pad))
def extract_input(
    sampling_length: int,
    wave_data: Wave,
    silence_data: SamplingData,
    local_data: SamplingData,
    local_padding_length: int,
    min_not_silence_length: int,
    f0_index: int,
    volume_index: Optional[int],
    harmonic_num: int,
    only_noise_source: bool,
    padding_value=0,
):
    """
    Crop a random aligned window out of wave/silence/local and build two
    independently-sampled excitation sources from the window's f0 (and
    optional volume) columns.

    :return:
        wave: (sampling_length, )
        silence: (sampling_length, )
        local: (sampling_length // scale + pad, )
        plus source, source2, signal from generate_source
    """
    sr = wave_data.sampling_rate
    sl = sampling_length

    assert sr % local_data.rate == 0
    l_scale = int(sr // local_data.rate)  # wave samples per local frame

    length = len(local_data.array) * l_scale
    # wave and local lengths may disagree by a few frames at most
    assert (abs(length - len(wave_data.wave)) <
            l_scale * 4), f"{abs(length - len(wave_data.wave))} {l_scale}"

    assert local_padding_length % l_scale == 0
    l_pad = local_padding_length // l_scale

    l_length = length // l_scale
    l_sl = sl // l_scale

    # rejection-sample a window offset until it has enough non-silence
    for _ in range(10000):
        if l_length > l_sl:
            l_offset = numpy.random.randint(l_length - l_sl)
        else:
            l_offset = 0
        offset = l_offset * l_scale

        silence = numpy.squeeze(
            silence_data.resample(sr, index=offset, length=sl))
        if (~silence).sum() >= min_not_silence_length:
            break
    else:
        raise Exception("cannot pick not silence data")

    wave = wave_data.wave[offset:offset + sl]

    # local: copy the window plus l_pad frames on each side, filling
    # out-of-range frames with padding_value
    l_start, l_end = l_offset - l_pad, l_offset + l_sl + l_pad
    if l_start < 0 or l_end > l_length:
        shape = list(local_data.array.shape)
        shape[0] = l_sl + l_pad * 2
        local = (numpy.ones(shape=shape, dtype=local_data.array.dtype) *
                 padding_value)
        if l_start < 0:
            p_start = -l_start
            l_start = 0
        else:
            p_start = 0
        if l_end > l_length:
            p_end = l_sl + l_pad * 2 - (l_end - l_length)
            l_end = l_length
        else:
            p_end = l_sl + l_pad * 2
        local[p_start:p_end] = local_data.array[l_start:l_end]
    else:
        local = local_data.array[l_start:l_end]

    # source module: build excitation from the unpadded part of local
    if l_pad > 0:
        log_f0 = local[l_pad:-l_pad, f0_index]
    else:
        log_f0 = local[:, f0_index]
    if only_noise_source:
        # all-zero log-f0 — presumably treated as unvoiced everywhere by
        # generate_source, yielding a pure noise source; confirm
        log_f0 = numpy.zeros_like(log_f0)

    volume = None
    if volume_index is not None:
        if l_pad > 0:
            volume = local[l_pad:-l_pad, volume_index]
        else:
            volume = local[:, volume_index]

    source, signal = generate_source(
        log_f0=log_f0,
        volume=volume,
        local_rate=int(local_data.rate),
        sampling_rate=sr,
        harmonic_num=harmonic_num,
    )
    # second call with identical arguments — presumably generate_source is
    # stochastic and source2 is a second independent draw; confirm
    source2, _ = generate_source(
        log_f0=log_f0,
        volume=volume,
        local_rate=int(local_data.rate),
        sampling_rate=sr,
        harmonic_num=harmonic_num,
    )

    return dict(
        wave=wave,
        silence=silence,
        local=local,
        source=source,
        source2=source2,
        signal=signal,
    )
def test_convert_to_dict(self):
    """convert_to_dict must emit the documented keys for both bit modes."""
    sampling_rate = 800
    local_sampling_rate = 200
    scale = sampling_rate // local_sampling_rate  # wave samples per local frame
    time_length = 10
    sampling_length = 16
    wave_data = Wave(
        wave=np.linspace(0,
                         sampling_rate * time_length,
                         sampling_rate * time_length,
                         endpoint=False),
        sampling_rate=sampling_rate,
    )
    silence_data = SamplingData(
        array=np.zeros((sampling_rate * time_length, ), dtype=bool),
        rate=sampling_rate,
    )
    local_data = SamplingData(
        array=np.linspace(0,
                          sampling_rate * time_length,
                          local_sampling_rate * time_length,
                          endpoint=False),
        rate=local_sampling_rate,
    )
    wave, silence, local = BaseWaveDataset.extract_input(
        sampling_length,
        wave_data=wave_data,
        silence_data=silence_data,
        local_data=local_data,
        local_padding_size=0,
    )

    # to_double=True: 16-bit samples carry both coarse and fine parts
    dataset = BaseWaveDataset(
        sampling_length=sampling_length,
        to_double=True,
        bit=16,
        mulaw=False,
        local_padding_size=0,
    )
    d = dataset.convert_to_dict(wave, silence, local)
    self.assertEqual(len(d['coarse']), sampling_length)
    # fine and silence are one element shorter than the window — presumably
    # shifted by one sample against coarse; confirm in convert_to_dict
    self.assertEqual(len(d['fine']), sampling_length - 1)
    self.assertEqual(len(d['encoded_coarse']), sampling_length)
    self.assertEqual(len(d['encoded_fine']), sampling_length)
    self.assertEqual(len(d['silence']), sampling_length - 1)
    self.assertEqual(len(d['local']), sampling_length // scale)

    # to_double=False: single 10-bit quantization, no fine component
    dataset = BaseWaveDataset(
        sampling_length=sampling_length,
        to_double=False,
        bit=10,
        mulaw=False,
        local_padding_size=0,
    )
    d = dataset.convert_to_dict(wave, silence, local)
    self.assertEqual(len(d['coarse']), sampling_length)
    self.assertIsNone(d['fine'])
    self.assertEqual(len(d['encoded_coarse']), sampling_length)
    self.assertIsNone(d['encoded_fine'])
    self.assertEqual(len(d['silence']), sampling_length - 1)
    self.assertEqual(len(d['local']), sampling_length // scale)
def extract_input(
    sampling_length: int,
    wave_data: Wave,
    silence_data: SamplingData,
    local_data: SamplingData,
    local_sampling_rate: Optional[int],
    local_padding_size: int,
    local_mask_max_second: float,
    local_mask_num: int,
    padding_value=0,
):
    """
    Crop a random aligned (wave, silence, local) window, with optional
    padding and random time-masking of the local features.

    :return:
        wave: (sampling_length, )
        silence: (sampling_length, )
        local: (sampling_length // scale + pad, )
    """
    sr = wave_data.sampling_rate
    sl = sampling_length

    # optionally resample the local features to a caller-specified rate
    if local_sampling_rate is None:
        l_rate = local_data.rate
        l_array = local_data.array
    else:
        l_rate = local_sampling_rate
        l_array = local_data.resample(l_rate)

    l_scale = int(round(sr / l_rate))  # wave samples per local frame

    # wave and local lengths may disagree by a few frames at most
    length = min(len(l_array) * l_scale, len(wave_data.wave))
    assert abs(length - len(l_array) * l_scale) < l_scale * 4
    assert abs(length - len(wave_data.wave)) < l_scale * 4

    assert (
        local_padding_size % l_scale == 0
    ), f"local_padding_size: {local_padding_size}, l_scale: {l_scale}"
    l_pad = local_padding_size // l_scale

    l_length = length // l_scale
    l_sl = sl // l_scale

    # rejection-sample a window offset until it is not fully silent
    for _ in range(10000):
        if l_length > l_sl + 1:
            l_offset = numpy.random.randint(l_length - l_sl + 1)
        else:
            l_offset = 0
        offset = l_offset * l_scale

        silence = numpy.squeeze(silence_data.resample(sr, index=offset, length=sl))
        if not silence.all():
            break
    else:
        raise Exception("cannot pick not silence data")

    wave = wave_data.wave[offset : offset + sl]

    # local: copy the window plus l_pad frames on each side, filling
    # out-of-range frames with padding_value
    l_start, l_end = l_offset - l_pad, l_offset + l_sl + l_pad
    if l_start < 0 or l_end > l_length:
        shape = list(l_array.shape)
        shape[0] = l_sl + l_pad * 2
        local = numpy.ones(shape=shape, dtype=l_array.dtype) * padding_value
        if l_start < 0:
            p_start = -l_start
            l_start = 0
        else:
            p_start = 0
        if l_end > l_length:
            p_end = l_sl + l_pad * 2 - (l_end - l_length)
            l_end = l_length
        else:
            p_end = l_sl + l_pad * 2
        local[p_start:p_end] = l_array[l_start:l_end]
    else:
        local = l_array[l_start:l_end]

    # time-mask augmentation: zero out random spans of the local features
    if local_mask_max_second > 0 and local_mask_num > 0:
        for _ in range(local_mask_num):
            mask_length = numpy.random.randint(int(l_rate * local_mask_max_second))
            mask_offset = numpy.random.randint(len(local) - mask_length + 1)
            local[mask_offset : mask_offset + mask_length] = 0

    return wave, silence, local
def generate(self):
    """Load the wave, silence, and local feature files into an Input."""
    loaded_wave = Wave.load(self.path_wave)
    loaded_silence = SamplingData.load(self.path_silence)
    loaded_local = SamplingData.load(self.path_local)
    return Input(wave=loaded_wave, silence=loaded_silence, local=loaded_local)
def generate(self):
    """Load spectrogram and silence through the temp-file cache."""
    cached_spectrogram = str(TempCache(self.spectrogram_path))
    cached_silence = str(TempCache(self.silence_path))
    return InputData(
        spectrogram=SamplingData.load(cached_spectrogram),
        silence=SamplingData.load(cached_silence),
    )
def create_data(
    f0_dir: Path,
    phoneme_list_dir: Path,
    loudness_dir: Path,
    accent_start_dir: Path,
    accent_end_dir: Path,
    accent_phrase_start_dir: Path,
    accent_phrase_end_dir: Path,
    speaker_valid_filter: Optional[str],
    utterance_valid_filter: Optional[str],
    data_num: Optional[int],
):
    """Collect per-utterance feature files, load them into InputData, and
    split into (train, valid) lists by name substring filters.

    All seven directories must yield the same number of files after the
    optional ``data_num`` truncation.
    """

    def _globbed(directory: Path, pattern: str):
        # sorted recursive glob, optionally truncated to the first data_num
        paths = sorted(directory.rglob(pattern))
        if data_num is not None:
            paths = paths[:data_num]
        return paths

    def _bool_flags(path: Path):
        # accent files are whitespace-separated 0/1 flags
        return [bool(int(s)) for s in path.read_text().split()]

    f0_paths = _globbed(f0_dir, "*.npy")
    assert len(f0_paths) > 0

    phoneme_list_paths = _globbed(phoneme_list_dir, "*.lab")
    assert len(f0_paths) == len(phoneme_list_paths)

    loudness_paths = _globbed(loudness_dir, "*.npy")
    assert len(f0_paths) == len(loudness_paths)

    accent_start_paths = _globbed(accent_start_dir, "*.txt")
    assert len(f0_paths) == len(accent_start_paths)

    accent_end_paths = _globbed(accent_end_dir, "*.txt")
    assert len(f0_paths) == len(accent_end_paths)

    accent_phrase_start_paths = _globbed(accent_phrase_start_dir, "*.txt")
    assert len(f0_paths) == len(accent_phrase_start_paths)

    accent_phrase_end_paths = _globbed(accent_phrase_end_dir, "*.txt")
    assert len(f0_paths) == len(accent_phrase_end_paths)

    datas = [
        InputData(
            name=f0_path.stem,
            f0=SamplingData.load(f0_path),
            phoneme_list=JvsPhoneme.load_julius_list(phoneme_list_path),
            loudness=SamplingData.load(loudness_path),
            accent_start=_bool_flags(accent_start_path),
            accent_end=_bool_flags(accent_end_path),
            accent_phrase_start=_bool_flags(accent_phrase_start_path),
            accent_phrase_end=_bool_flags(accent_phrase_end_path),
        )
        for (
            f0_path,
            phoneme_list_path,
            loudness_path,
            accent_start_path,
            accent_end_path,
            accent_phrase_start_path,
            accent_phrase_end_path,
        ) in zip(
            f0_paths,
            phoneme_list_paths,
            loudness_paths,
            accent_start_paths,
            accent_end_paths,
            accent_phrase_start_paths,
            accent_phrase_end_paths,
        )
    ]

    train_datas: List[InputData] = []
    valid_datas: List[InputData] = []
    for d in datas:
        # an item goes to valid when its name matches either filter
        is_valid = (
            speaker_valid_filter is not None and speaker_valid_filter in d.name
        ) or (
            utterance_valid_filter is not None and utterance_valid_filter in d.name
        )
        (valid_datas if is_valid else train_datas).append(d)
    return train_datas, valid_datas
def extract_input(
    sampling_length: int,
    wave_data: Wave,
    silence_data: SamplingData,
    f0_data: SamplingData,
    phoneme_data: SamplingData,
    min_not_silence_length: int,
    with_mic_augment: bool,
    time_mask_max_second: float,
    time_mask_num: int,
):
    """Crop a random aligned window of (wave, f0, phoneme), padding when the
    data is shorter than sampling_length and optionally applying mic and
    time-mask augmentations. Returns dict(wave, f0, phoneme, padded)."""
    rate = wave_data.sampling_rate
    sl = sampling_length

    # f0 and phoneme are collected at the faster of their two rates
    l_rate = max(f0_data.rate, phoneme_data.rate)
    assert rate % l_rate == 0
    l_scale = int(rate // l_rate)  # wave samples per local frame

    assert sl % l_scale == 0

    local = SamplingData.collect([f0_data, phoneme_data],
                                 rate=l_rate,
                                 mode="min",
                                 error_time_length=0.015)
    f0_array = local[:, 0]
    phoneme_array = local[:, 1:]  # remaining columns are phoneme scores

    # wave and local lengths may disagree by a few frames at most
    assert numpy.abs(len(local) * l_scale - len(wave_data.wave)) < l_scale * 4
    length = min(
        len(local) * l_scale,
        len(wave_data.wave) // l_scale * l_scale)

    # when the data is shorter than the requested window, remember how much
    # padding will be needed and shrink the crop to the available length
    if sl > length:
        pad = sl - length
        sl = length
    else:
        pad = 0

    l_length = length // l_scale
    l_sl = sl // l_scale
    l_pad = pad // l_scale

    # rejection-sample a window offset until it has enough non-silence
    for _ in range(10000):
        if l_length > l_sl:
            l_offset = numpy.random.randint(l_length - l_sl)
        else:
            l_offset = 0
        offset = l_offset * l_scale

        silence = numpy.squeeze(
            silence_data.resample(rate, index=offset, length=sl))
        if (~silence).sum() >= min_not_silence_length:
            break
    else:
        raise Exception("cannot pick not silence data")

    wave = wave_data.wave[offset:offset + sl]
    f0 = numpy.squeeze(f0_array[l_offset:l_offset + l_sl])
    phoneme = numpy.argmax(phoneme_array[l_offset:l_offset + l_sl], axis=1)

    # `padded` marks which local frames are zero-padding, not real data
    padded = numpy.zeros_like(f0, dtype=bool)
    if l_pad > 0:
        # split the padding randomly between the front and the back
        l_pre = numpy.random.randint(l_pad + 1)
        l_post = l_pad - l_pre
        f0 = numpy.pad(f0, [l_pre, l_post])
        phoneme = numpy.pad(phoneme, [l_pre, l_post])
        padded = numpy.pad(padded, [l_pre, l_post], constant_values=True)

        pre, post = int(l_pre * l_scale), int(l_post * l_scale)
        wave = numpy.pad(wave, [pre, post])

    if with_mic_augment:
        wave = mic_augment(wave, sampling_rate=rate)

    # time-mask augmentation: zero out random spans of the wave
    if time_mask_max_second > 0 and time_mask_num > 0:
        for _ in range(time_mask_num):
            mask_length = numpy.random.randint(
                int(wave_data.sampling_rate * time_mask_max_second))
            mask_offset = numpy.random.randint(len(wave) - mask_length + 1)
            wave[mask_offset:mask_offset + mask_length] = 0

    return dict(
        wave=wave,
        f0=f0,
        phoneme=phoneme,
        padded=padded,
    )
def generate(
    model_dir: Path,
    model_iteration: Optional[int],
    model_config: Optional[Path],
    output_dir: Path,
    to_voiced_scaler: bool,
    to_f0_scaler: bool,
    to_phoneme_onehot: bool,
    batch_size: Optional[int],
    num_test: int,
    target_glob: Optional[str],
    use_gpu: bool,
):
    """Batch-run the trained generator over the test waves (plus optional
    extra targets) and save per-frame features as SamplingData .npy files."""
    if model_config is None:
        model_config = model_dir / "config.yaml"

    output_dir.mkdir(exist_ok=True)
    save_arguments(output_dir / "arguments.yaml", generate, locals())

    config = Config.from_dict(yaml.safe_load(model_config.open()))

    generator = Generator(
        config=config,
        predictor=_get_model_path(
            model_dir=model_dir,
            iteration=model_iteration,
            prefix="predictor_",
        ),
        # the three auxiliary networks are loaded only when requested
        voiced_network=(
            None
            if not to_voiced_scaler
            else _get_model_path(
                model_dir=model_dir,
                iteration=model_iteration,
                prefix="voiced_network_",
            )
        ),
        f0_network=(
            None
            if not to_f0_scaler
            else _get_model_path(
                model_dir=model_dir,
                iteration=model_iteration,
                prefix="f0_network_",
            )
        ),
        phoneme_network=(
            None
            if not to_phoneme_onehot
            else _get_model_path(
                model_dir=model_dir,
                iteration=model_iteration,
                prefix="phoneme_network_",
            )
        ),
        use_gpu=use_gpu,
    )

    dataset = create_dataset(config.dataset)["test"]
    scale = numpy.prod(config.network.scale_list)  # total wave->feature downsampling

    if batch_size is None:
        batch_size = config.train.batch_size

    if isinstance(dataset, SpeakerWavesDataset):
        wave_paths = [data.path_wave for data in dataset.wave_dataset.inputs[:num_test]]
    elif isinstance(dataset, WavesDataset):
        wave_paths = [data.path_wave for data in dataset.inputs[:num_test]]
    else:
        raise Exception()

    if target_glob is not None:
        wave_paths += list(map(Path, glob(target_glob)))

    for wps in tqdm(chunked(wave_paths, batch_size), desc="generate"):
        waves = [Wave.load(p) for p in wps]
        arrays = [w.wave for w in waves]

        # zero-pad every wave in the batch to the longest scale-aligned length
        pad_lengths = [int(numpy.ceil(len(w) / scale) * scale) for w in arrays]
        arrays = [numpy.r_[w, numpy.zeros(max(pad_lengths) - len(w))] for w in arrays]

        tensors = [torch.from_numpy(array.astype(numpy.float32)) for array in arrays]

        output = generator.generate(
            wave=concat_examples(tensors),
            to_voiced_scaler=to_voiced_scaler,
            to_f0_scaler=to_f0_scaler,
            to_phoneme_onehot=to_phoneme_onehot,
        )
        for feature, p, w, l in zip(output, wps, waves, pad_lengths):
            # drop the frames that came from this wave's own padding
            feature = feature.T[: l // scale]
            data = SamplingData(array=feature, rate=w.sampling_rate // scale)
            data.save(output_dir / (p.stem + ".npy"))
def extract_input(
    sampling_length: int,
    wave_data: Wave,
    silence_data: SamplingData,
    local_data: SamplingData,
    local_sampling_rate: Optional[int],
    local_padding_length: int,
    min_not_silence_length: int,
    mulaw: bool,
    padding_value=0,
):
    """
    Crop a random aligned (wave, local) window, optionally mu-law encoding
    the wave, and return local transposed to channel-first layout.

    :return:
        wave: (sampling_length, )
        local: (sampling_length // scale + pad, )
    """
    sr = wave_data.sampling_rate
    sl = sampling_length

    # optionally resample the local features to a caller-specified rate
    if local_sampling_rate is None:
        l_rate = local_data.rate
        l_array = local_data.array
    else:
        l_rate = local_sampling_rate
        l_array = local_data.resample(l_rate)

    assert sr % l_rate == 0
    l_scale = int(sr // l_rate)  # wave samples per local frame

    length = len(l_array) * l_scale
    # wave and local lengths may disagree by a few frames at most
    assert (abs(length - len(wave_data.wave)) <
            l_scale * 4), f"{abs(length - len(wave_data.wave))} {l_scale}"

    assert local_padding_length % l_scale == 0
    l_pad = local_padding_length // l_scale

    l_length = length // l_scale
    l_sl = sl // l_scale

    # rejection-sample a window offset until it has enough non-silence
    for _ in range(10000):
        if l_length > l_sl:
            l_offset = numpy.random.randint(l_length - l_sl)
        else:
            l_offset = 0
        offset = l_offset * l_scale

        silence = numpy.squeeze(
            silence_data.resample(sr, index=offset, length=sl))
        if (~silence).sum() >= min_not_silence_length:
            break
    else:
        raise Exception("cannot pick not silence data")

    wave = wave_data.wave[offset:offset + sl]
    if mulaw:
        wave = encode_mulaw(wave)

    # local: copy the window plus l_pad frames on each side, filling
    # out-of-range frames with padding_value
    l_start, l_end = l_offset - l_pad, l_offset + l_sl + l_pad
    if l_start < 0 or l_end > l_length:
        shape = list(l_array.shape)
        shape[0] = l_sl + l_pad * 2
        local = numpy.ones(shape=shape, dtype=l_array.dtype) * padding_value
        if l_start < 0:
            p_start = -l_start
            l_start = 0
        else:
            p_start = 0
        if l_end > l_length:
            p_end = l_sl + l_pad * 2 - (l_end - l_length)
            l_end = l_length
        else:
            p_end = l_sl + l_pad * 2
        local[p_start:p_end] = l_array[l_start:l_end]
    else:
        local = l_array[l_start:l_end]

    return dict(
        wave=wave,
        local=local.T,  # (C, T)
    )