def objective(
    trial: optuna.Trial,
    name: str,
    config_yaml_path: Path,
    optuna_config_path: Path,
    root_output: Path,
):
    with config_yaml_path.open() as f:
        config = Config.from_dict(yaml.safe_load(f))

    config = modify_config(
        config=config, optuna_config_path=optuna_config_path, trial=trial
    )

    # Name the run after the sampled hyperparameters so trials are distinguishable.
    postfix = param_dict_to_name(trial.params)
    config.project.name = f"{name}-" + postfix
    output = root_output.joinpath(f"{trial.number}-" + config.project.name)

    trainer = create_trainer(config=config, output=output)
    trainer.extend(
        PruningExtension(
            trial=trial,
            observation_key=config.train.optuna["key"],
            pruner_trigger=(config.train.optuna["iteration"], "iteration"),
        ),
    )
    trainer.run()

    # Report the last logged value of the monitored key as the trial objective.
    log_last = trainer.get_extension("LogReport").log[-1]
    return log_last[config.train.optuna["key"]]
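# Hypothetical usage sketch (not in the original source): how `objective`
# would typically be wired into an Optuna study. The study direction, pruner,
# paths, and n_trials below are assumptions; align them with config.train.optuna.
if __name__ == "__main__":
    from functools import partial

    import optuna

    study = optuna.create_study(
        direction="minimize",  # assumes the monitored key is a loss-like value
        pruner=optuna.pruners.MedianPruner(),  # PruningExtension needs a pruner to act on
    )
    study.optimize(
        partial(
            objective,
            name="example",
            config_yaml_path=Path("config.yaml"),
            optuna_config_path=Path("optuna.yaml"),
            root_output=Path("optuna_output"),
        ),
        n_trials=100,
    )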
def generate(
    model_dir: Path,
    model_iteration: Optional[int],
    model_config: Optional[Path],
    time_second: float,
    num_test: int,
    output_dir: Path,
    use_gpu: bool,
):
    if model_config is None:
        model_config = model_dir / "config.yaml"

    output_dir.mkdir(exist_ok=True)
    save_arguments(output_dir / "arguments.yaml", generate, locals())

    config = Config.from_dict(yaml.safe_load(model_config.open()))

    model_path = _get_predictor_model_path(
        model_dir=model_dir,
        iteration=model_iteration,
    )
    generator = Generator(
        config=config,
        predictor=model_path,
        use_gpu=use_gpu,
    )

    # Feature frame rate: 24000 Hz audio with a hop size of 512 samples.
    sampling_rate = 24000 / 512
    config.dataset.sampling_length = int(sampling_rate * time_second)

    batch_size = config.train.batch_size

    dataset = create_dataset(config.dataset)["test"]
    if isinstance(dataset, ConcatDataset):
        dataset = dataset.datasets[0]

    if isinstance(dataset.dataset, FeatureDataset):
        f0_paths = [inp.f0_path for inp in dataset.dataset.inputs[:num_test]]
    elif isinstance(dataset.dataset, SpeakerFeatureDataset):
        f0_paths = [
            inp.f0_path for inp in dataset.dataset.dataset.inputs[:num_test]
        ]
    else:
        raise ValueError(dataset)

    # zip stops at the shorter sequence, so only the first num_test items are generated.
    for data, paths in zip(
        chunked(tqdm(dataset, desc="generate"), batch_size),
        chunked(f0_paths, batch_size),
    ):
        data = concat_examples(data)
        specs = generator.generate(
            f0=data["f0"],
            phoneme=data["phoneme"],
            speaker_id=data["speaker_id"] if "speaker_id" in data else None,
        )
        for spec, p in zip(specs, paths):
            numpy.save(output_dir.joinpath(p.stem + ".npy"), spec)
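# Hypothetical invocation sketch for `generate` (all paths and values are
# placeholders, not from the original repo). Passing None for model_iteration
# and model_config defers to _get_predictor_model_path and to the config.yaml
# stored alongside the model.
if __name__ == "__main__":
    generate(
        model_dir=Path("trained_model"),
        model_iteration=None,
        model_config=None,
        time_second=1.0,
        num_test=10,
        output_dir=Path("generated"),
        use_gpu=False,
    )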
def train(
    config_yaml_path: Path,
    output: Path,
):
    with config_yaml_path.open() as f:
        config = Config.from_dict(yaml.safe_load(f))

    trainer = create_trainer(config=config, output=output)
    trainer.run()
def extract_fn_list(
    config_yaml_path: Path,
    output_train_path: Path,
    output_test_path: Path,
):
    with config_yaml_path.open() as f:
        config = Config.from_dict(yaml.safe_load(f)).dataset

    f0_paths = {Path(p).stem: Path(p) for p in glob(config.f0_glob)}
    fn_list = sorted(f0_paths.keys())
    assert len(fn_list) > 0

    # Shuffle deterministically with the configured seed, then split off the
    # first test_num stems as the test set.
    numpy.random.RandomState(config.seed).shuffle(fn_list)

    test_num = config.test_num
    trains = fn_list[test_num:]
    tests = fn_list[:test_num]

    output_train_path.write_text("\n".join(sorted(trains)))
    output_test_path.write_text("\n".join(sorted(tests)))
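# Hypothetical invocation sketch for `extract_fn_list` (paths are
# placeholders): writes the seeded train/test split as newline-separated
# file stems, one list per output file.
if __name__ == "__main__":
    extract_fn_list(
        config_yaml_path=Path("config.yaml"),
        output_train_path=Path("train_fn_list.txt"),
        output_test_path=Path("test_fn_list.txt"),
    )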
def generate_all(
    model_dir: Path,
    model_iteration: Optional[int],
    model_config: Optional[Path],
    output_dir: Path,
    transpose: bool,
    use_gpu: bool,
):
    if model_config is None:
        model_config = model_dir / "config.yaml"

    output_dir.mkdir(exist_ok=True)
    save_arguments(output_dir / "arguments.yaml", generate_all, locals())

    config = Config.from_dict(yaml.safe_load(model_config.open()))

    model_path = _get_predictor_model_path(
        model_dir=model_dir,
        iteration=model_iteration,
    )
    generator = Generator(
        config=config,
        predictor=model_path,
        use_gpu=use_gpu,
    )

    # Disable the test split so the "train" dataset covers every input.
    config.dataset.test_num = 0
    dataset = create_dataset(config.dataset)["train"]

    if isinstance(dataset.dataset, FeatureDataset):
        inputs = dataset.dataset.inputs
        speaker_ids = [None] * len(inputs)
    elif isinstance(dataset.dataset, SpeakerFeatureDataset):
        inputs = dataset.dataset.dataset.inputs
        speaker_ids = dataset.dataset.speaker_ids
    else:
        raise ValueError(dataset)

    for input, speaker_id in tqdm(
        zip(inputs, speaker_ids), total=len(inputs), desc="generate_all"
    ):
        input_data = input.generate()
        data = FeatureDataset.extract_input(
            sampling_length=len(input_data.spec.array),
            f0_data=input_data.f0,
            phoneme_data=input_data.phoneme,
            spec_data=input_data.spec,
            silence_data=input_data.silence,
            phoneme_list_data=input_data.phoneme_list,
            f0_process_mode=F0ProcessMode(config.dataset.f0_process_mode),
            time_mask_max_second=0,
        )

        spec = generator.generate(
            f0=data["f0"][numpy.newaxis],
            phoneme=data["phoneme"][numpy.newaxis],
            speaker_id=(
                numpy.array(speaker_id)[numpy.newaxis]
                if speaker_id is not None
                else None
            ),
        )[0]
        if transpose:
            spec = spec.T

        name = input.f0_path.stem
        numpy.save(output_dir.joinpath(name + ".npy"), spec)
def test_equal_base_config_and_reconstructed(train_config_path: Path):
    with train_config_path.open() as f:
        d = yaml.load(f, SafeLoader)

    base = Config.from_dict(d)
    base_re = Config.from_dict(base.to_dict())
    assert base == base_re
def test_to_dict(train_config_path: Path):
    with train_config_path.open() as f:
        d = yaml.load(f, SafeLoader)

    Config.from_dict(d).to_dict()
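# Hypothetical conftest.py sketch supplying the `train_config_path` fixture
# the two tests above rely on; the fixture body and path are assumptions.
import pytest


@pytest.fixture()
def train_config_path() -> Path:
    return Path("tests/input_data/train_config.yaml")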
def run(text: str, speaker_id: int):
    rate = 200

    # phoneme
    utterance = extract_full_context_label(text)
    # utterance.breath_groups[0].accent_phrases[2].accent = 2
    # utterance.breath_groups[1].accent_phrases[1].accent = 6
    # utterance.breath_groups[1].accent_phrases[3].accent = 5
    x, sr = pyopenjtalk.synthesize(utterance.labels, speed=1, half_tone=0)
    x /= 2**16
    soundfile.write("hiho_openjtalk_wave.wav", x, sr)

    label_data_list = utterance.phonemes
    json.dump(
        [p.label for p in label_data_list],
        open("hiho_label_list.json", mode="w"),
    )

    is_type1 = False
    phoneme_str_list = []
    start_accent_list = numpy.ones(len(label_data_list), dtype=numpy.int64) * numpy.nan
    end_accent_list = numpy.ones(len(label_data_list), dtype=numpy.int64) * numpy.nan
    start_accent_phrase_list = (
        numpy.ones(len(label_data_list), dtype=numpy.int64) * numpy.nan
    )
    end_accent_phrase_list = (
        numpy.ones(len(label_data_list), dtype=numpy.int64) * numpy.nan
    )
    for i, label in enumerate(label_data_list):
        # a1 == "0" marks the accent nucleus; a2 is the mora position within
        # the accent phrase, so a nucleus at a2 == "1" means a type-1 accent
        # phrase (pitch falls after the first mora).
        is_end_accent = label.contexts["a1"] == "0"
        if label.contexts["a2"] == "1":
            is_type1 = is_end_accent

        if label.contexts["a2"] == "1" and is_type1:
            is_start_accent = True
        elif label.contexts["a2"] == "2" and not is_type1:
            is_start_accent = True
        else:
            is_start_accent = False

        phoneme_str_list.append(label.phoneme)
        start_accent_list[i] = is_start_accent
        end_accent_list[i] = is_end_accent
        start_accent_phrase_list[i] = label.contexts["a2"] == "1"
        end_accent_phrase_list[i] = label.contexts["a3"] == "1"

    start_accent_list = numpy.array(start_accent_list, dtype=numpy.int64)
    end_accent_list = numpy.array(end_accent_list, dtype=numpy.int64)
    start_accent_phrase_list = numpy.array(start_accent_phrase_list, dtype=numpy.int64)
    end_accent_phrase_list = numpy.array(end_accent_phrase_list, dtype=numpy.int64)

    json.dump(phoneme_str_list, open("hiho_phoneme_list.json", mode="w"))

    # yukarin_s
    with open("data/yukarin_s/check-bs128-hs32/config.yaml") as f:
        d = yaml.safe_load(f)
    generator_s = GeneratorS(
        config=ConfigS.from_dict(d),
        predictor=Path("data/yukarin_s/check-bs128-hs32/predictor_50000.pth"),
        use_gpu=False,
    )

    phoneme_data_list = [
        JvsPhoneme(phoneme=p, start=i, end=i + 1)
        for i, p in enumerate(phoneme_str_list)
    ]
    phoneme_data_list = JvsPhoneme.convert(phoneme_data_list)
    phoneme_list_s = numpy.array([p.phoneme_id for p in phoneme_data_list])

    phoneme_length = generator_s.generate(
        phoneme_list=phoneme_list_s,
        speaker_id=speaker_id,
    )
    phoneme_length[0] = phoneme_length[-1] = 0.1
    phoneme_length = numpy.round(phoneme_length * rate) / rate
    numpy.save("hiho_phoneme_length.npy", phoneme_length)

    # yukarin_sa
    model_dir = Path(
        "data/yukarin_sa/withjsss-lr1.0e-03-ehs32-aehs32-pl2-pn8-fl2-fn2-try1"
    )
    with (model_dir / "config.yaml").open() as f:
        d = yaml.safe_load(f)
    generator_sa = GeneratorSa(
        config=ConfigSa.from_dict(d),
        predictor=_get_predictor_model_path(model_dir),
        use_gpu=False,
    )
    assert generator_sa.config.dataset.f0_process_mode == "voiced_mora"

    (
        consonant_phoneme_data_list,
        vowel_phoneme_data_list,
        vowel_indexes_data,
    ) = split_mora(phoneme_data_list)

    vowel_indexes = numpy.array(vowel_indexes_data)
    vowel_phoneme_list = numpy.array([p.phoneme_id for p in vowel_phoneme_data_list])
    consonant_phoneme_list = numpy.array(
        [p.phoneme_id if p is not None else -1 for p in consonant_phoneme_data_list]
    )
    # Sum the phoneme lengths within each mora to get per-mora durations.
    phoneme_length_sa = numpy.array(
        [a.sum() for a in numpy.split(phoneme_length, vowel_indexes[:-1] + 1)]
    )

    f0_list = generator_sa.generate(
        vowel_phoneme_list=vowel_phoneme_list[numpy.newaxis],
        consonant_phoneme_list=consonant_phoneme_list[numpy.newaxis],
        start_accent_list=start_accent_list[vowel_indexes][numpy.newaxis],
        end_accent_list=end_accent_list[vowel_indexes][numpy.newaxis],
        start_accent_phrase_list=start_accent_phrase_list[vowel_indexes][numpy.newaxis],
        end_accent_phrase_list=end_accent_phrase_list[vowel_indexes][numpy.newaxis],
        speaker_id=speaker_id,
    )[0]

    for i, p in enumerate(vowel_phoneme_data_list):
        if p.phoneme in unvoiced_mora_phoneme_list:
            f0_list[i] = 0
    numpy.save("hiho_f0_list.npy", f0_list)

    phoneme = numpy.repeat(
        phoneme_list_s, numpy.round(phoneme_length * rate).astype(numpy.int32)
    )
    f0 = numpy.repeat(
        f0_list, numpy.round(phoneme_length_sa * rate).astype(numpy.int32)
    )
    numpy.save("hiho_f0.npy", f0)

    # yukarin_soso
    with open(
        "data/yukarin_soso/f0mean-wei_voicedmora-sl1280-bs128-lr1.0e-03-mt0.2-mn32-try1/config.yaml"
    ) as f:
        d = yaml.safe_load(f)
    generator_soso = GeneratorSoso(
        config=ConfigSoso.from_dict(d),
        predictor=Path(
            "data/yukarin_soso/f0mean-wei_voicedmora-sl1280-bs128-lr1.0e-03-mt0.2-mn32-try1/predictor_220000.pth"
        ),
        use_gpu=False,
    )
    assert generator_soso.config.dataset.f0_process_mode == "voiced_mora_mean"

    # One-hot encode the frame-level phoneme sequence.
    array = numpy.zeros((len(phoneme), JvsPhoneme.num_phoneme), dtype=numpy.float32)
    array[numpy.arange(len(phoneme)), phoneme] = 1
    phoneme = array

    f0 = SamplingData(array=f0, rate=rate).resample(24000 / 256)
    phoneme = SamplingData(array=phoneme, rate=rate).resample(24000 / 256)

    spec = generator_soso.generate(
        f0=f0[numpy.newaxis, :, numpy.newaxis],
        phoneme=phoneme[numpy.newaxis],
        speaker_id=numpy.array(speaker_id).reshape(-1),
    )[0]
    numpy.save("hiho_spec.npy", spec)

    # hifi-gan
    wave = inference_hifigan(
        x=spec.T,
        checkpoint_file="data/hifigan/g_03080000",
        config_file="data/hifigan/config.json",
    )

    # save
    soundfile.write("hiho_output.wav", data=wave, samplerate=24000)
    soundfile.write(f"{text}-{speaker_id}.wav", data=wave, samplerate=24000)
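# Hypothetical invocation sketch for `run` (text and speaker_id are
# placeholders): writes f"{text}-{speaker_id}.wav" plus the intermediate
# hiho_*.npy / hiho_*.wav dumps into the working directory.
if __name__ == "__main__":
    run(text="こんにちは", speaker_id=0)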
def create_trainer(
    config: Config,
    output: Path,
):
    # config
    config.add_git_info()

    output.mkdir(exist_ok=True, parents=True)
    with output.joinpath("config.yaml").open(mode="w") as f:
        yaml.safe_dump(config.to_dict(), f)

    # model
    predictor = create_predictor(config.network)
    model = Model(model_config=config.model, predictor=predictor)
    if config.train.weight_initializer is not None:
        init_weights(model, name=config.train.weight_initializer)

    device = torch.device("cuda")
    model.to(device)

    # dataset
    _create_iterator = partial(
        create_iterator,
        batch_size=config.train.batch_size,
        num_processes=config.train.num_processes,
        use_multithread=config.train.use_multithread,
    )

    datasets = create_dataset(config.dataset)
    train_iter = _create_iterator(datasets["train"], for_train=True)
    test_iter = _create_iterator(datasets["test"], for_train=False)
    eval_iter = _create_iterator(datasets["test"], for_train=False, for_eval=True)

    valid_iter = None
    if datasets["valid"] is not None:
        valid_iter = _create_iterator(datasets["valid"], for_train=False, for_eval=True)

    warnings.simplefilter("error", MultiprocessIterator.TimeoutWarning)

    # optimizer
    optimizer = make_optimizer(config_dict=config.train.optimizer, model=model)

    # updater
    if not config.train.use_amp:
        updater = StandardUpdater(
            iterator=train_iter,
            optimizer=optimizer,
            model=model,
            device=device,
        )
    else:
        updater = AmpUpdater(
            iterator=train_iter,
            optimizer=optimizer,
            model=model,
            device=device,
        )

    # trainer
    trigger_log = (config.train.log_iteration, "iteration")
    trigger_eval = (config.train.eval_iteration, "iteration")
    trigger_snapshot = (config.train.snapshot_iteration, "iteration")
    trigger_stop = (
        (config.train.stop_iteration, "iteration")
        if config.train.stop_iteration is not None
        else None
    )

    trainer = Trainer(updater, stop_trigger=trigger_stop, out=output)

    if config.train.step_shift is not None:
        ext = extensions.StepShift(**config.train.step_shift)
        trainer.extend(ext)

    ext = extensions.Evaluator(test_iter, model, device=device)
    trainer.extend(ext, name="test", trigger=trigger_log)

    generator = Generator(config=config, predictor=predictor, use_gpu=True)
    generate_evaluator = GenerateEvaluator(generator=generator)
    ext = extensions.Evaluator(eval_iter, generate_evaluator, device=device)
    trainer.extend(ext, name="eval", trigger=trigger_eval)

    if valid_iter is not None:
        ext = extensions.Evaluator(valid_iter, generate_evaluator, device=device)
        trainer.extend(ext, name="valid", trigger=trigger_eval)

    ext = extensions.snapshot_object(
        predictor,
        filename="predictor_{.updater.iteration}.pth",
        n_retains=5,
    )
    trainer.extend(
        ext,
        trigger=LowValueTrigger("eval/main/mcd", trigger=trigger_eval),
    )

    trainer.extend(extensions.FailOnNonNumber(), trigger=trigger_log)
    trainer.extend(extensions.observe_lr(), trigger=trigger_log)
    trainer.extend(extensions.LogReport(trigger=trigger_log))
    trainer.extend(
        extensions.PrintReport(["iteration", "main/loss", "test/main/loss"]),
        trigger=trigger_log,
    )

    ext = TensorboardReport(writer=SummaryWriter(Path(output)))
    trainer.extend(ext, trigger=trigger_log)

    if config.project.category is not None:
        ext = WandbReport(
            config_dict=config.to_dict(),
            project_category=config.project.category,
            project_name=config.project.name,
            output_dir=output.joinpath("wandb"),
        )
        trainer.extend(ext, trigger=trigger_log)

    (output / "struct.txt").write_text(repr(model))

    if trigger_stop is not None:
        trainer.extend(extensions.ProgressBar(trigger_stop))

    ext = extensions.snapshot_object(
        trainer,
        filename="trainer_{.updater.iteration}.pth",
        n_retains=1,
        autoload=True,
    )
    trainer.extend(ext, trigger=trigger_snapshot)

    return trainer