def __getitem__(self, i: int):
    sampling_rate = self.sampling_rate
    length = self.sampling_length
    frequency = numpy.random.uniform(
        self.frequency_range[0], self.frequency_range[1]
    )
    rand = numpy.random.rand()
    wave = numpy.sin(
        (2 * numpy.pi)
        * (
            numpy.arange(length, dtype=numpy.float32) * frequency / sampling_rate
            + rand
        )
    )
    local = numpy.log(
        numpy.ones(shape=(length // self.local_scale, 1), dtype=numpy.float32)
        * frequency
    )
    silence = numpy.zeros(shape=(length,), dtype=bool)
    return default_convert(
        self.make_input(
            wave_data=Wave(wave=wave, sampling_rate=sampling_rate),
            silence_data=SamplingData(array=silence, rate=sampling_rate),
            local_data=SamplingData(
                array=local, rate=sampling_rate // self.local_scale
            ),
        )
    )
def test_extract_input(sampling_length: int, data_length: int, padding_length: int):
    silence_data = SamplingData(array=numpy.zeros(data_length, dtype=bool), rate=1)
    spectrogram_data = SamplingData(
        array=numpy.linspace(start=1, stop=2, num=data_length)[:, numpy.newaxis],
        rate=1,
    )

    for _ in range(100):
        spectrogram = extract_input(
            sampling_length=sampling_length,
            spectrogram_data=spectrogram_data,
            silence_data=silence_data,
            min_not_silence_length=min(sampling_length, data_length),
            padding_length=padding_length,
            padding_value=numpy.nan,
        )["spectrogram"]

        assert len(spectrogram) == sampling_length + padding_length * 2

        if sampling_length <= data_length:
            assert numpy.isnan(spectrogram).sum() <= padding_length * 2
        else:
            assert (
                numpy.isnan(spectrogram).sum()
                == sampling_length - data_length + padding_length * 2
            )

        if padding_length == 0:
            data = spectrogram
        else:
            data = spectrogram[padding_length:-padding_length]
        assert (~numpy.isnan(data)).sum() >= min(sampling_length, data_length)
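# The arguments of this test are presumably supplied by pytest parametrization.
# A hedged sketch of one way to drive it; the concrete values are illustrative,
# not taken from the original test suite:
#
#     @pytest.mark.parametrize("sampling_length", [8, 16])
#     @pytest.mark.parametrize("data_length", [8, 32])
#     @pytest.mark.parametrize("padding_length", [0, 4])
#     def test_extract_input(sampling_length, data_length, padding_length): ...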
def test_convert_to_dict(self):
    sampling_rate = 800
    local_sampling_rate = 200
    scale = sampling_rate // local_sampling_rate
    time_length = 10
    sampling_length = 16

    wave_data = Wave(
        wave=numpy.linspace(
            0,
            sampling_rate * time_length,
            sampling_rate * time_length,
            endpoint=False,
        ),
        sampling_rate=sampling_rate,
    )
    silence_data = SamplingData(
        array=numpy.zeros((sampling_rate * time_length,), dtype=bool),
        rate=sampling_rate,
    )
    local_data = SamplingData(
        array=numpy.linspace(
            0,
            sampling_rate * time_length,
            local_sampling_rate * time_length,
            endpoint=False,
        ),
        rate=local_sampling_rate,
    )

    wave, silence, local = BaseWaveDataset.extract_input(
        sampling_length,
        wave_data=wave_data,
        silence_data=silence_data,
        local_data=local_data,
        local_sampling_rate=local_sampling_rate,
        local_padding_size=0,
        local_mask_max_second=0,
        local_mask_num=0,
    )
    dataset = BaseWaveDataset(
        sampling_rate=sampling_rate,
        sampling_length=sampling_length,
        bit=10,
        mulaw=False,
        wave_random_max_second=0,
        wave_random_num=0,
        local_sampling_rate=local_sampling_rate,
        local_padding_size=0,
        local_mask_max_second=0,
        local_mask_num=0,
    )
    d = dataset.convert_input(wave, silence, local)
    self.assertEqual(len(d["coarse"]), sampling_length)
    self.assertEqual(len(d["encoded_coarse"]), sampling_length)
    self.assertEqual(len(d["silence"]), sampling_length - 1)
    self.assertEqual(len(d["local"]), sampling_length // scale)
def test_extract_input(self):
    for sampling_rate, local_sampling_rate, sampling_length, time_length in [
        [800, 200, 16, 10],
        [24000, 24000 / 256, 1024, 100],
    ]:
        with self.subTest(
            sampling_rate=sampling_rate,
            local_sampling_rate=local_sampling_rate,
            sampling_length=sampling_length,
            time_length=time_length,
        ):
            scale = sampling_rate // local_sampling_rate

            wave_data = Wave(
                wave=numpy.linspace(
                    0,
                    int(sampling_rate * time_length),
                    int(sampling_rate * time_length),
                    endpoint=False,
                ),
                sampling_rate=sampling_rate,
            )
            silence_data = SamplingData(
                array=numpy.zeros((sampling_rate * time_length,), dtype=bool),
                rate=sampling_rate,
            )
            local_data = SamplingData(
                array=numpy.linspace(
                    0,
                    int(sampling_rate * time_length),
                    int(local_sampling_rate * time_length),
                    endpoint=False,
                ),
                rate=local_sampling_rate,
            )

            for _ in range(10):
                wave, silence, local = BaseWaveDataset.extract_input(
                    sampling_length,
                    wave_data=wave_data,
                    silence_data=silence_data,
                    local_data=local_data,
                    local_sampling_rate=local_sampling_rate,
                    local_padding_size=0,
                    local_mask_max_second=0,
                    local_mask_num=0,
                )
                self.assertEqual(len(wave), sampling_length)
                self.assertEqual(len(silence), sampling_length)
                self.assertEqual(len(local), sampling_length // scale)

                wave_as_local = wave.reshape(
                    int(sampling_length // scale), -1
                ).min(axis=1)
                self.assertTrue(numpy.all(wave_as_local == local))
def setUp(self):
    waves = [
        np.ones(self.num // 2) * -1,
        np.ones(self.num // 2),
    ]
    self.inputs = [
        Input(
            wave=Wave(wave=w, sampling_rate=self.sampling_rate),
            local=SamplingData(array=np.empty((len(w), 0)), rate=self.sampling_rate),
            silence=SamplingData(
                array=np.zeros((len(w),), dtype=bool), rate=self.sampling_rate
            ),
        )
        for w in waves
    ]
def generate_dataset(
    dataset_directory: Path,
    data_num: int,
    f0_rate: int,
    phoneme_rate: int,
    phoneme_size: int,
    speaker_size: int,
):
    if dataset_directory.exists():
        for p in dataset_directory.rglob("*"):
            if not p.is_dir():
                p.unlink()
    else:
        dataset_directory.mkdir()

    f0_dir = dataset_directory.joinpath("f0")
    phoneme_dir = dataset_directory.joinpath("phoneme")
    phoneme_list_dir = dataset_directory.joinpath("phoneme_list")
    f0_dir.mkdir(exist_ok=True)
    phoneme_dir.mkdir(exist_ok=True)
    phoneme_list_dir.mkdir(exist_ok=True)

    speaker_dict = defaultdict(list)
    for i_data in range(data_num):
        speaker_num = i_data % speaker_size
        speaker_dict[str(speaker_num)].append(str(i_data))

        source_length = int(numpy.random.randint(low=10, high=20))

        phoneme_list = numpy.random.randint(
            low=0, high=phoneme_size, size=source_length, dtype=numpy.int32
        )
        phoneme_list_dir.joinpath(f"{i_data}.lab").write_text(
            "\n".join([f"0 0 {JvsPhoneme.phoneme_list[p]}" for p in phoneme_list])
        )

        f0 = phoneme_list.astype(numpy.float32) / 10 + 0.2 + speaker_num / 100
        f0 = numpy.repeat(f0, (phoneme_list + 1) * (f0_rate // phoneme_rate))
        f0[::5] = 0
        SamplingData(array=f0, rate=f0_rate).save(f0_dir.joinpath(f"{i_data}.npy"))

        phoneme = numpy.repeat(phoneme_list, phoneme_list + 1)
        phoneme = numpy.identity(phoneme_size, dtype=numpy.int32)[phoneme]
        SamplingData(array=phoneme, rate=phoneme_rate).save(
            phoneme_dir.joinpath(f"{i_data}.npy")
        )

    json.dump(speaker_dict, dataset_directory.joinpath("speaker_dict.json").open("w"))
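# Minimal usage sketch (values are hypothetical): build a tiny on-disk dataset
# for a smoke test. Assumes JvsPhoneme.phoneme_list has at least `phoneme_size`
# entries, since the generated phoneme ids index into it, and that f0_rate is a
# multiple of phoneme_rate, since f0_rate // phoneme_rate is used as a repeat
# factor.
#
#     generate_dataset(
#         dataset_directory=Path("test_dataset"),
#         data_num=6,
#         f0_rate=200,
#         phoneme_rate=100,
#         phoneme_size=10,
#         speaker_size=2,
#     )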
def generate_and_save_data(
    feature_dir: Path,
    silence_dir: Path,
    wavelength: float,
    exponent: float,
    amplitude: float,
    length=300,
):
    feature, silence = generate_data(
        wavelength=wavelength,
        exponent=exponent,
        amplitude=amplitude,
        length=length,
    )
    filename = f"{wavelength}_{exponent}_{amplitude}.npy"
    SamplingData(array=feature, rate=100).save(feature_dir / filename)
    SamplingData(array=silence, rate=100).save(silence_dir / filename)
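# Minimal usage sketch (parameter values are hypothetical): sweep a small grid
# so each (wavelength, exponent, amplitude) combination lands in its own file,
# since the filename encodes all three parameters.
#
#     for wavelength in [10.0, 20.0]:
#         for exponent in [1.0, 2.0]:
#             generate_and_save_data(
#                 feature_dir=Path("feature"),
#                 silence_dir=Path("silence"),
#                 wavelength=wavelength,
#                 exponent=exponent,
#                 amplitude=1.0,
#             )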
def test_extract_input():
    sampling_length = 10
    wave_length = 256 * sampling_length
    wave_rate = 24000
    second = wave_length / wave_rate
    f0_rate = 200
    phoneme_rate = 100
    spec_rate = wave_rate / 256
    silence_rate = 24000

    f0 = numpy.arange(int(second * f0_rate)).reshape(-1, 1).astype(numpy.float32)
    f0_data = SamplingData(array=f0, rate=f0_rate)

    phoneme = (
        numpy.arange(int(second * phoneme_rate)).reshape(-1, 1).astype(numpy.float32)
    )
    phoneme_data = SamplingData(array=phoneme, rate=phoneme_rate)

    spec = numpy.arange(int(second * spec_rate)).reshape(-1, 1).astype(numpy.float32)
    spec_data = SamplingData(array=spec, rate=spec_rate)

    silence = numpy.zeros(int(second * silence_rate)).astype(bool)
    silence_data = SamplingData(array=silence, rate=silence_rate)

    phoneme_list_data = None
    volume_data = None
    f0_process_mode = F0ProcessMode.normal
    time_mask_max_second = 0
    time_mask_num = 0

    FeatureDataset.extract_input(
        sampling_length=sampling_length,
        f0_data=f0_data,
        phoneme_data=phoneme_data,
        spec_data=spec_data,
        silence_data=silence_data,
        phoneme_list_data=phoneme_list_data,
        volume_data=volume_data,
        f0_process_mode=f0_process_mode,
        time_mask_max_second=time_mask_max_second,
        time_mask_num=time_mask_num,
    )
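# The rates above are chosen so one spectrogram frame covers 256 wave samples.
# A worked check of the frame counts for sampling_length = 10 (standalone
# arithmetic mirroring the setup, not part of the original test):
#
#     wave_length = 256 * 10            # 2560 samples
#     second = 2560 / 24000             # ~0.1067 s
#     int(second * 200)                 # 21 f0 frames at 200 Hz
#     int(second * 24000 / 256)         # 10 spec frames -> matches sampling_length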
def generate(self):
    wave = Wave.load(self.path_wave)

    try:
        local = SamplingData.load(self.path_local)
    except Exception:
        # Fall back to computing a log mel spectrogram and caching it to a
        # temporary .npy file when the local feature cannot be loaded.
        local_rate = 80
        local_array = to_log_melspectrogram(wave=wave, rate=local_rate)
        local = SamplingData(array=local_array, rate=local_rate)

        with NamedTemporaryFile(suffix=".npy", delete=False) as f:
            self.path_local = Path(f.name)
            local.save(self.path_local)

    return Input(
        wave=wave,
        silence=SamplingData.load(self.path_silence),
        local=local,
    )
def test_extract_input_with_local_padding(self):
    for (
        sampling_rate,
        local_sampling_rate,
        sampling_length,
        time_length,
        local_padding_size,
    ) in [
        [800, 200, 16, 1, 100],
        [24000, 24000 / 256, 1024, 4, 1024],
    ]:
        with self.subTest(
            sampling_rate=sampling_rate,
            local_sampling_rate=local_sampling_rate,
            sampling_length=sampling_length,
            time_length=time_length,
            local_padding_size=local_padding_size,
        ):
            scale = sampling_rate // local_sampling_rate

            wave_data = Wave(
                wave=np.linspace(
                    0,
                    int(sampling_rate * time_length),
                    int(sampling_rate * time_length),
                    endpoint=False,
                ),
                sampling_rate=sampling_rate,
            )
            silence_data = SamplingData(
                array=np.zeros((sampling_rate * time_length,), dtype=bool),
                rate=sampling_rate,
            )
            local_data = SamplingData(
                array=np.linspace(
                    0,
                    int(sampling_rate * time_length),
                    int(local_sampling_rate * time_length),
                    endpoint=False,
                ),
                rate=local_sampling_rate,
            )

            for _ in range(10000):
                wave, silence, local = BaseWaveDataset.extract_input(
                    sampling_length,
                    wave_data=wave_data,
                    silence_data=silence_data,
                    local_data=local_data,
                    local_padding_size=local_padding_size,
                    padding_value=np.nan,
                )
                self.assertEqual(len(wave), sampling_length)
                self.assertEqual(len(silence), sampling_length)
                self.assertEqual(
                    len(local),
                    (sampling_length + local_padding_size * 2) // scale,
                )

                num_pad = np.isnan(local).sum()
                self.assertLessEqual(num_pad, local_padding_size)
                self.assertTrue(not np.isnan(local[0]) or not np.isnan(local[-1]))

                wave_as_local = wave.reshape(
                    int(sampling_length // scale), -1
                ).min(axis=1)
                pad = int(local_padding_size // scale)
                local_wo_pad = local[pad:-pad]
                self.assertTrue(np.all(wave_as_local == local_wo_pad))
def test_convert_to_dict(self):
    sampling_rate = 800
    local_sampling_rate = 200
    scale = sampling_rate // local_sampling_rate
    time_length = 10
    sampling_length = 16

    wave_data = Wave(
        wave=np.linspace(
            0,
            sampling_rate * time_length,
            sampling_rate * time_length,
            endpoint=False,
        ),
        sampling_rate=sampling_rate,
    )
    silence_data = SamplingData(
        array=np.zeros((sampling_rate * time_length,), dtype=bool),
        rate=sampling_rate,
    )
    local_data = SamplingData(
        array=np.linspace(
            0,
            sampling_rate * time_length,
            local_sampling_rate * time_length,
            endpoint=False,
        ),
        rate=local_sampling_rate,
    )

    wave, silence, local = BaseWaveDataset.extract_input(
        sampling_length,
        wave_data=wave_data,
        silence_data=silence_data,
        local_data=local_data,
        local_padding_size=0,
    )

    dataset = BaseWaveDataset(
        sampling_length=sampling_length,
        to_double=True,
        bit=16,
        mulaw=False,
        local_padding_size=0,
    )
    d = dataset.convert_to_dict(wave, silence, local)
    self.assertEqual(len(d["coarse"]), sampling_length)
    self.assertEqual(len(d["fine"]), sampling_length - 1)
    self.assertEqual(len(d["encoded_coarse"]), sampling_length)
    self.assertEqual(len(d["encoded_fine"]), sampling_length)
    self.assertEqual(len(d["silence"]), sampling_length - 1)
    self.assertEqual(len(d["local"]), sampling_length // scale)

    dataset = BaseWaveDataset(
        sampling_length=sampling_length,
        to_double=False,
        bit=10,
        mulaw=False,
        local_padding_size=0,
    )
    d = dataset.convert_to_dict(wave, silence, local)
    self.assertEqual(len(d["coarse"]), sampling_length)
    self.assertIsNone(d["fine"])
    self.assertEqual(len(d["encoded_coarse"]), sampling_length)
    self.assertIsNone(d["encoded_fine"])
    self.assertEqual(len(d["silence"]), sampling_length - 1)
    self.assertEqual(len(d["local"]), sampling_length // scale)
def generate(
    model_dir: Path,
    model_iteration: Optional[int],
    model_config: Optional[Path],
    output_dir: Path,
    to_voiced_scaler: bool,
    to_f0_scaler: bool,
    to_phoneme_onehot: bool,
    batch_size: Optional[int],
    num_test: int,
    target_glob: Optional[str],
    use_gpu: bool,
):
    if model_config is None:
        model_config = model_dir / "config.yaml"

    output_dir.mkdir(exist_ok=True)
    save_arguments(output_dir / "arguments.yaml", generate, locals())

    config = Config.from_dict(yaml.safe_load(model_config.open()))

    generator = Generator(
        config=config,
        predictor=_get_model_path(
            model_dir=model_dir,
            iteration=model_iteration,
            prefix="predictor_",
        ),
        voiced_network=(
            None
            if not to_voiced_scaler
            else _get_model_path(
                model_dir=model_dir,
                iteration=model_iteration,
                prefix="voiced_network_",
            )
        ),
        f0_network=(
            None
            if not to_f0_scaler
            else _get_model_path(
                model_dir=model_dir,
                iteration=model_iteration,
                prefix="f0_network_",
            )
        ),
        phoneme_network=(
            None
            if not to_phoneme_onehot
            else _get_model_path(
                model_dir=model_dir,
                iteration=model_iteration,
                prefix="phoneme_network_",
            )
        ),
        use_gpu=use_gpu,
    )

    dataset = create_dataset(config.dataset)["test"]
    scale = numpy.prod(config.network.scale_list)

    if batch_size is None:
        batch_size = config.train.batch_size

    if isinstance(dataset, SpeakerWavesDataset):
        wave_paths = [data.path_wave for data in dataset.wave_dataset.inputs[:num_test]]
    elif isinstance(dataset, WavesDataset):
        wave_paths = [data.path_wave for data in dataset.inputs[:num_test]]
    else:
        raise Exception()

    if target_glob is not None:
        wave_paths += list(map(Path, glob(target_glob)))

    for wps in tqdm(chunked(wave_paths, batch_size), desc="generate"):
        waves = [Wave.load(p) for p in wps]
        arrays = [w.wave for w in waves]

        pad_lengths = [int(numpy.ceil(len(w) / scale) * scale) for w in arrays]
        arrays = [numpy.r_[w, numpy.zeros(max(pad_lengths) - len(w))] for w in arrays]
        tensors = [torch.from_numpy(array.astype(numpy.float32)) for array in arrays]

        output = generator.generate(
            wave=concat_examples(tensors),
            to_voiced_scaler=to_voiced_scaler,
            to_f0_scaler=to_f0_scaler,
            to_phoneme_onehot=to_phoneme_onehot,
        )

        for feature, p, w, l in zip(output, wps, waves, pad_lengths):
            feature = feature.T[: l // scale]
            data = SamplingData(array=feature, rate=w.sampling_rate // scale)
            data.save(output_dir / (p.stem + ".npy"))
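# Padding arithmetic in the batch loop above: every wave is zero-padded to the
# longest padded length in its batch, where each per-wave target is the next
# multiple of `scale`. A worked example with hypothetical numbers:
#
#     scale = 256
#     len_w = 1000
#     int(numpy.ceil(len_w / scale) * scale)   # -> 1024 (ceil(3.906) * 256)
#
# The trailing slice `feature.T[: l // scale]` then drops the frames that came
# from this padding.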
async def to_feature(text: str = Form(...), wave: UploadFile = File(...)):
    with TemporaryDirectory() as d:
        tmp_dir = Path(d)
        input_audio_path = tmp_dir.joinpath("input.wav")
        input_audio_path.write_bytes(await wave.read())

        # openjtalk
        phonemes = [
            p.label
            for p in openjtalk_label_getter(
                text,
                openjtalk_command="open_jtalk",
                dict_path=Path("/var/lib/mecab/dic/open-jtalk/naist-jdic"),
                htsvoice_path=Path(
                    "/usr/share/hts-voice/nitech-jp-atr503-m001/nitech_jp_atr503_m001.htsvoice"
                ),
                output_wave_path=tmp_dir.joinpath("wave.wav"),
                output_log_path=tmp_dir.joinpath("log.txt"),
                output_type=OutputType.phoneme,
                without_span=False,
            )
        ]

        # julius
        julius_audio_path = tmp_dir.joinpath("julius.wav")
        subprocess.check_call(
            f"sox {input_audio_path} -r 16000 -b 16 {julius_audio_path}".split()
        )

        julius_phonemes = [
            p if p not in _jvs_to_julius else _jvs_to_julius[p]
            for p in phonemes
            if p != "sil"
        ]

        julius_dict_path = tmp_dir.joinpath("2nd.dict")
        julius_dict = sp_inserter.gen_julius_dict_2nd(
            " ".join(julius_phonemes), model_type=sp_inserter.ModelType.gmm
        )
        julius_dict_path.write_text(julius_dict)

        julius_dfa_path = tmp_dir.joinpath("2nd.dfa")
        julius_dfa = sp_inserter.gen_julius_aliment_dfa(julius_dict.count("\n"))
        julius_dfa_path.write_text(julius_dfa)

        julius_output = sp_inserter.julius_phone_alignment(
            str(julius_audio_path),
            str(tmp_dir.joinpath("2nd")),
            _hmm_model,
            model_type=sp_inserter.ModelType.gmm,
            options=None,
        )

        time_alignment_list = sp_inserter.frame_to_second(
            sp_inserter.get_time_alimented_list(julius_output)
        )

        i_phoneme = 0
        new_phonemes = []
        for p in phonemes:
            if p == "pau" and time_alignment_list[i_phoneme][2] != "sp":
                continue
            i_phoneme += 1
            new_phonemes.append(p)

        aligned = JvsPhoneme.convert(
            [
                JvsPhoneme(start=float(o[0]), end=float(o[1]), phoneme=p)
                for p, o in zip(new_phonemes, time_alignment_list)
            ]
        )
        for p in aligned:
            p.verify()

        # world
        f0 = F0.from_wave(
            Wave.load(input_audio_path, sampling_rate=24000, dtype=numpy.float64),
            frame_period=5.0,
            f0_floor=71.0,
            f0_ceil=800,
            with_vuv=False,
            f0_type=F0Type.world,
        )
        converted_f0 = f0.convert(
            input_mean=f0.valid_f0_log.mean(),
            input_var=f0.valid_f0_log.var(),
            target_mean=_voiro_mean,
            target_var=f0.valid_f0_log.var(),
        )
        converted_f0.array = converted_f0.array.astype(numpy.float32).reshape(-1, 1)

        # feature
        phoneme_array = LinguisticFeature(
            phonemes=aligned,
            phoneme_class=JvsPhoneme,
            rate=_feature_rate,
            feature_types=[LinguisticFeature.FeatureType.PHONEME],
        ).make_array()
        phoneme = SamplingData(array=phoneme_array, rate=_feature_rate)

        feature = SamplingData.collect(
            [converted_f0, phoneme],
            rate=_feature_rate,
            mode="min",
            error_time_length=0.015,
        )

        return StreamingResponse(BytesIO(feature.astype(numpy.float32).tobytes()))
def run(text: str, speaker_id: int):
    rate = 200

    # phoneme
    utterance = extract_full_context_label(text)
    # utterance.breath_groups[0].accent_phrases[2].accent = 2
    # utterance.breath_groups[1].accent_phrases[1].accent = 6
    # utterance.breath_groups[1].accent_phrases[3].accent = 5

    x, sr = pyopenjtalk.synthesize(utterance.labels, speed=1, half_tone=0)
    x /= 2**16
    soundfile.write("hiho_openjtalk_wave.wav", x, sr)

    label_data_list = utterance.phonemes
    json.dump(
        [p.label for p in label_data_list], open("hiho_label_list.json", mode="w")
    )

    is_type1 = False
    phoneme_str_list = []
    start_accent_list = numpy.ones(len(label_data_list), dtype=numpy.int64) * numpy.nan
    end_accent_list = numpy.ones(len(label_data_list), dtype=numpy.int64) * numpy.nan
    start_accent_phrase_list = (
        numpy.ones(len(label_data_list), dtype=numpy.int64) * numpy.nan
    )
    end_accent_phrase_list = (
        numpy.ones(len(label_data_list), dtype=numpy.int64) * numpy.nan
    )
    for i, label in enumerate(label_data_list):
        is_end_accent = label.contexts["a1"] == "0"

        if label.contexts["a2"] == "1":
            is_type1 = is_end_accent

        if label.contexts["a2"] == "1" and is_type1:
            is_start_accent = True
        elif label.contexts["a2"] == "2" and not is_type1:
            is_start_accent = True
        else:
            is_start_accent = False

        phoneme_str_list.append(label.phoneme)
        start_accent_list[i] = is_start_accent
        end_accent_list[i] = is_end_accent
        start_accent_phrase_list[i] = label.contexts["a2"] == "1"
        end_accent_phrase_list[i] = label.contexts["a3"] == "1"

    start_accent_list = numpy.array(start_accent_list, dtype=numpy.int64)
    end_accent_list = numpy.array(end_accent_list, dtype=numpy.int64)
    start_accent_phrase_list = numpy.array(start_accent_phrase_list, dtype=numpy.int64)
    end_accent_phrase_list = numpy.array(end_accent_phrase_list, dtype=numpy.int64)

    json.dump(phoneme_str_list, open("hiho_phoneme_list.json", mode="w"))

    # yukarin_s
    with open("data/yukarin_s/check-bs128-hs32/config.yaml") as f:
        d = yaml.safe_load(f)

    generator_s = GeneratorS(
        config=ConfigS.from_dict(d),
        predictor=Path("data/yukarin_s/check-bs128-hs32/predictor_50000.pth"),
        use_gpu=False,
    )

    phoneme_data_list = [
        JvsPhoneme(phoneme=p, start=i, end=i + 1)
        for i, p in enumerate(phoneme_str_list)
    ]
    phoneme_data_list = JvsPhoneme.convert(phoneme_data_list)
    phoneme_list_s = numpy.array([p.phoneme_id for p in phoneme_data_list])

    phoneme_length = generator_s.generate(
        phoneme_list=phoneme_list_s,
        speaker_id=speaker_id,
    )
    phoneme_length[0] = phoneme_length[-1] = 0.1
    phoneme_length = numpy.round(phoneme_length * rate) / rate

    numpy.save("hiho_phoneme_length.npy", phoneme_length)

    # yukarin_sa
    model_dir = Path(
        "data/yukarin_sa/withjsss-lr1.0e-03-ehs32-aehs32-pl2-pn8-fl2-fn2-try1"
    )
    with (model_dir / "config.yaml").open() as f:
        d = yaml.safe_load(f)

    generator_sa = GeneratorSa(
        config=ConfigSa.from_dict(d),
        predictor=_get_predictor_model_path(model_dir),
        use_gpu=False,
    )
    assert generator_sa.config.dataset.f0_process_mode == "voiced_mora"

    (
        consonant_phoneme_data_list,
        vowel_phoneme_data_list,
        vowel_indexes_data,
    ) = split_mora(phoneme_data_list)

    vowel_indexes = numpy.array(vowel_indexes_data)
    vowel_phoneme_list = numpy.array([p.phoneme_id for p in vowel_phoneme_data_list])
    consonant_phoneme_list = numpy.array(
        [p.phoneme_id if p is not None else -1 for p in consonant_phoneme_data_list]
    )
    phoneme_length_sa = numpy.array(
        [a.sum() for a in numpy.split(phoneme_length, vowel_indexes[:-1] + 1)]
    )

    f0_list = generator_sa.generate(
        vowel_phoneme_list=vowel_phoneme_list[numpy.newaxis],
        consonant_phoneme_list=consonant_phoneme_list[numpy.newaxis],
        start_accent_list=start_accent_list[vowel_indexes][numpy.newaxis],
        end_accent_list=end_accent_list[vowel_indexes][numpy.newaxis],
        start_accent_phrase_list=start_accent_phrase_list[vowel_indexes][numpy.newaxis],
        end_accent_phrase_list=end_accent_phrase_list[vowel_indexes][numpy.newaxis],
        speaker_id=speaker_id,
    )[0]

    for i, p in enumerate(vowel_phoneme_data_list):
        if p.phoneme in unvoiced_mora_phoneme_list:
            f0_list[i] = 0

    numpy.save("hiho_f0_list.npy", f0_list)

    phoneme = numpy.repeat(
        phoneme_list_s, numpy.round(phoneme_length * rate).astype(numpy.int32)
    )
    f0 = numpy.repeat(
        f0_list, numpy.round(phoneme_length_sa * rate).astype(numpy.int32)
    )

    numpy.save("hiho_f0.npy", f0)

    # yukarin_soso
    with open(
        "data/yukarin_soso/f0mean-wei_voicedmora-sl1280-bs128-lr1.0e-03-mt0.2-mn32-try1/config.yaml"
    ) as f:
        d = yaml.safe_load(f)

    generator_soso = GeneratorSoso(
        config=ConfigSoso.from_dict(d),
        predictor=Path(
            "data/yukarin_soso/f0mean-wei_voicedmora-sl1280-bs128-lr1.0e-03-mt0.2-mn32-try1/predictor_220000.pth"
        ),
        use_gpu=False,
    )
    assert generator_soso.config.dataset.f0_process_mode == "voiced_mora_mean"

    array = numpy.zeros((len(phoneme), JvsPhoneme.num_phoneme), dtype=numpy.float32)
    array[numpy.arange(len(phoneme)), phoneme] = 1
    phoneme = array

    f0 = SamplingData(array=f0, rate=rate).resample(24000 / 256)
    phoneme = SamplingData(array=phoneme, rate=rate).resample(24000 / 256)

    spec = generator_soso.generate(
        f0=f0[numpy.newaxis, :, numpy.newaxis],
        phoneme=phoneme[numpy.newaxis],
        speaker_id=numpy.array(speaker_id).reshape(-1),
    )[0]
    numpy.save("hiho_spec.npy", spec)

    # hifi-gan
    wave = inference_hifigan(
        x=spec.T,
        checkpoint_file="data/hifigan/g_03080000",
        config_file="data/hifigan/config.json",
    )

    # save
    soundfile.write("hiho_output.wav", data=wave, samplerate=24000)
    soundfile.write(f"{text}-{speaker_id}.wav", data=wave, samplerate=24000)
def generate_dataset(
    dataset_directory: Path,
    data_num: int,
    sampling_rate: int,
    local_rate: int,
    phoneme_size: int,
    speaker_size: int,
):
    if dataset_directory.exists():
        for p in dataset_directory.rglob("*"):
            if not p.is_dir():
                p.unlink()
    else:
        dataset_directory.mkdir()

    f0_dir = dataset_directory.joinpath("f0")
    phoneme_dir = dataset_directory.joinpath("phoneme")
    wave_dir = dataset_directory.joinpath("wave")
    silence_dir = dataset_directory.joinpath("silence")
    f0_dir.mkdir(exist_ok=True)
    phoneme_dir.mkdir(exist_ok=True)
    wave_dir.mkdir(exist_ok=True)
    silence_dir.mkdir(exist_ok=True)

    for i_data in range(data_num):
        local_length = int(numpy.random.randint(low=100, high=200))
        sampling_length = int(local_length / local_rate * sampling_rate)

        f0 = numpy.random.rand(local_length, 1).astype(numpy.float32)
        f0[f0 < 0.2] = 0
        f0 *= 7
        SamplingData(array=f0, rate=local_rate).save(f0_dir.joinpath(f"{i_data}.npy"))

        phoneme = numpy.random.randint(0, phoneme_size, size=local_length).astype(
            numpy.int32
        )
        phoneme = numpy.identity(phoneme_size)[phoneme].astype(numpy.int32)
        SamplingData(array=phoneme, rate=local_rate).save(
            phoneme_dir.joinpath(f"{i_data}.npy")
        )

        rand = numpy.random.rand()
        wave = numpy.concatenate(
            [
                numpy.sin(
                    (2 * numpy.pi)
                    * (
                        numpy.arange(sampling_length // len(f0), dtype=numpy.float32)
                        * numpy.exp(one_f0)
                        / sampling_rate
                        + rand
                    )
                )
                for one_f0 in f0.tolist()
            ]
        )
        Wave(wave=wave, sampling_rate=sampling_rate).save(
            wave_dir.joinpath(f"{i_data}.wav")
        )

        silence = numpy.zeros_like(wave).astype(bool)
        SamplingData(array=silence, rate=sampling_rate).save(
            silence_dir.joinpath(f"{i_data}.npy")
        )

    speaker_dict = defaultdict(list)
    for i_data in range(data_num):
        speaker_dict[str(i_data % speaker_size)].append(str(i_data))

    json.dump(speaker_dict, dataset_directory.joinpath("speaker_dict.json").open("w"))
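# Minimal usage sketch (values are hypothetical): the wave length is derived as
# local_length / local_rate * sampling_rate, so choosing sampling_rate as an
# integer multiple of local_rate keeps the per-frame sample count exact.
#
#     generate_dataset(
#         dataset_directory=Path("test_dataset"),
#         data_num=4,
#         sampling_rate=24000,
#         local_rate=100,
#         phoneme_size=10,
#         speaker_size=2,
#     )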