Esempio n. 1
0
def objective(
    trial: optuna.Trial,
    name: str,
    config_yaml_path: Path,
    optuna_config_path: Path,
    root_output: Path,
):
    """Run a single Optuna trial and return its final observed metric.

    The base config is read from ``config_yaml_path``, passed through
    ``modify_config`` together with ``optuna_config_path`` and ``trial``,
    and then used to build and run a trainer with a ``PruningExtension``
    attached.

    Args:
        trial: The Optuna trial being evaluated.
        name: Prefix of the project name; a postfix derived from
            ``trial.params`` is appended to it.
        config_yaml_path: YAML file holding the base configuration.
        optuna_config_path: Path forwarded to ``modify_config``.
        root_output: Directory under which the per-trial output directory
            ``"<trial.number>-<project name>"`` is placed.

    Returns:
        The value stored under ``config.train.optuna["key"]`` in the last
        entry of the trainer's ``LogReport`` log.
    """
    with config_yaml_path.open() as stream:
        raw_config = yaml.safe_load(stream)
    config = Config.from_dict(raw_config)

    config = modify_config(
        config=config,
        optuna_config_path=optuna_config_path,
        trial=trial,
    )

    # The run name (and therefore the output directory) encodes the trial
    # parameters, so trials can be told apart on disk.
    config.project.name = f"{name}-" + param_dict_to_name(trial.params)
    output = root_output.joinpath(f"{trial.number}-" + config.project.name)

    trainer = create_trainer(config=config, output=output)

    metric_key = config.train.optuna["key"]
    pruning = PruningExtension(
        trial=trial,
        observation_key=metric_key,
        pruner_trigger=(config.train.optuna["iteration"], "iteration"),
    )
    trainer.extend(pruning)
    trainer.run()

    final_entry = trainer.get_extension("LogReport").log[-1]
    return final_entry[metric_key]
Esempio n. 2
0
def generate(
    model_dir: Path,
    model_iteration: Optional[int],
    model_config: Optional[Path],
    time_second: float,
    num_test: int,
    output_dir: Path,
    use_gpu: bool,
):
    """Generate outputs for the first ``num_test`` test items and save them.

    For every processed item the generator output is written to
    ``output_dir/<f0 file stem>.npy``.

    Args:
        model_dir: Directory containing the trained predictor (and, by
            default, its ``config.yaml``).
        model_iteration: Iteration forwarded to
            ``_get_predictor_model_path`` to select the predictor file.
        model_config: Config YAML path; when ``None``,
            ``model_dir / "config.yaml"`` is used.
        time_second: Multiplied by the fixed rate ``24000 / 512`` to set
            ``config.dataset.sampling_length``.
        num_test: Maximum number of test inputs whose f0 paths are used.
        output_dir: Directory receiving ``arguments.yaml`` and the ``.npy``
            files. Created if missing (its parent must already exist).
        use_gpu: Forwarded to ``Generator``.

    Raises:
        ValueError: If the test dataset wraps neither a ``FeatureDataset``
            nor a ``SpeakerFeatureDataset``.
    """
    if model_config is None:
        model_config = model_dir / "config.yaml"

    output_dir.mkdir(exist_ok=True)
    # locals() is taken here, before any further local is bound, so it
    # holds exactly the (defaulted) call arguments.
    save_arguments(output_dir / "arguments.yaml", generate, locals())

    # Use a context manager so the config file handle is closed promptly
    # instead of being left to the garbage collector.
    with model_config.open() as f:
        config = Config.from_dict(yaml.safe_load(f))

    model_path = _get_predictor_model_path(
        model_dir=model_dir,
        iteration=model_iteration,
    )
    generator = Generator(
        config=config,
        predictor=model_path,
        use_gpu=use_gpu,
    )

    # NOTE(review): 24000 / 512 is presumably sampling rate / hop size of
    # the features -- confirm against the dataset preprocessing.
    sampling_rate = 24000 / 512
    config.dataset.sampling_length = int(sampling_rate * time_second)

    batch_size = config.train.batch_size

    dataset = create_dataset(config.dataset)["test"]
    if isinstance(dataset, ConcatDataset):
        dataset = dataset.datasets[0]

    if isinstance(dataset.dataset, FeatureDataset):
        f0_paths = [inp.f0_path for inp in dataset.dataset.inputs[:num_test]]
    elif isinstance(dataset.dataset, SpeakerFeatureDataset):
        f0_paths = [
            inp.f0_path for inp in dataset.dataset.dataset.inputs[:num_test]
        ]
    else:
        raise ValueError(dataset)

    # zip() stops at the shorter iterable, so at most ``num_test`` items
    # (the length of ``f0_paths``) are generated even though the dataset
    # itself is not truncated.
    for batch, f0_path_batch in zip(
            chunked(tqdm(dataset, desc="generate"), batch_size),
            chunked(f0_paths, batch_size),
    ):
        batch = concat_examples(batch)
        specs = generator.generate(
            f0=batch["f0"],
            phoneme=batch["phoneme"],
            speaker_id=batch["speaker_id"] if "speaker_id" in batch else None,
        )

        for spec, f0_path in zip(specs, f0_path_batch):
            numpy.save(output_dir.joinpath(f0_path.stem + ".npy"), spec)
Esempio n. 3
0
def train(
    config_yaml_path: Path,
    output: Path,
):
    """Load a training config from YAML, build a trainer and run it.

    Args:
        config_yaml_path: YAML file describing the training configuration.
        output: Output directory passed to ``create_trainer``.
    """
    with config_yaml_path.open() as stream:
        raw_config = yaml.safe_load(stream)

    create_trainer(config=Config.from_dict(raw_config), output=output).run()
Esempio n. 4
0
def extract_fn_list(
    config_yaml_path: Path,
    output_train_path: Path,
    output_test_path: Path,
):
    """Split the f0 file stems into train/test lists and write them out.

    The stems of all files matching the dataset's ``f0_glob`` are sorted,
    shuffled with a ``RandomState`` seeded by the dataset's ``seed``, and
    the first ``test_num`` become the test set. Each list is written
    sorted, one stem per line.

    Args:
        config_yaml_path: YAML config whose ``dataset`` section is used.
        output_train_path: File receiving the training stems.
        output_test_path: File receiving the test stems.
    """
    with config_yaml_path.open() as stream:
        dataset_config = Config.from_dict(yaml.safe_load(stream)).dataset

    # Unique stems, sorted first so the seeded shuffle below always starts
    # from the same order regardless of what glob() returns.
    stems = sorted({Path(p).stem for p in glob(dataset_config.f0_glob)})
    assert len(stems) > 0

    rng = numpy.random.RandomState(dataset_config.seed)
    rng.shuffle(stems)

    n_test = dataset_config.test_num
    test_stems, train_stems = stems[:n_test], stems[n_test:]

    output_train_path.write_text("\n".join(sorted(train_stems)))
    output_test_path.write_text("\n".join(sorted(test_stems)))
Esempio n. 5
0
def generate_all(
    model_dir: Path,
    model_iteration: Optional[int],
    model_config: Optional[Path],
    output_dir: Path,
    transpose: bool,
    use_gpu: bool,
):
    """Generate and save an output for every input of the dataset.

    ``config.dataset.test_num`` is forced to 0 and the ``"train"`` split is
    used, so no input is held out. Each input is processed at its full
    length (``sampling_length`` is the length of its spec array) and the
    result is saved to ``output_dir/<f0 file stem>.npy``.

    Args:
        model_dir: Directory containing the trained predictor (and, by
            default, its ``config.yaml``).
        model_iteration: Iteration forwarded to
            ``_get_predictor_model_path`` to select the predictor file.
        model_config: Config YAML path; when ``None``,
            ``model_dir / "config.yaml"`` is used.
        output_dir: Directory receiving ``arguments.yaml`` and the ``.npy``
            files. Created if missing (its parent must already exist).
        transpose: When true, each generated array is transposed before
            being saved.
        use_gpu: Forwarded to ``Generator``.

    Raises:
        ValueError: If the dataset wraps neither a ``FeatureDataset`` nor a
            ``SpeakerFeatureDataset``.
    """
    if model_config is None:
        model_config = model_dir / "config.yaml"

    output_dir.mkdir(exist_ok=True)
    # locals() is taken here, before any further local is bound, so it
    # holds exactly the (defaulted) call arguments.
    save_arguments(output_dir / "arguments.yaml", generate_all, locals())

    # Use a context manager so the config file handle is closed promptly
    # instead of being left to the garbage collector.
    with model_config.open() as f:
        config = Config.from_dict(yaml.safe_load(f))

    model_path = _get_predictor_model_path(
        model_dir=model_dir,
        iteration=model_iteration,
    )
    generator = Generator(
        config=config,
        predictor=model_path,
        use_gpu=use_gpu,
    )

    config.dataset.test_num = 0
    dataset = create_dataset(config.dataset)["train"]

    if isinstance(dataset.dataset, FeatureDataset):
        inputs = dataset.dataset.inputs
        # No speaker information: pair every input with None.
        speaker_ids = [None] * len(inputs)
    elif isinstance(dataset.dataset, SpeakerFeatureDataset):
        inputs = dataset.dataset.dataset.inputs
        speaker_ids = dataset.dataset.speaker_ids
    else:
        raise ValueError(dataset)

    # ``feature_input`` rather than ``input`` to avoid shadowing the builtin.
    for feature_input, speaker_id in tqdm(zip(inputs, speaker_ids),
                                          total=len(inputs),
                                          desc="generate_all"):
        input_data = feature_input.generate()
        data = FeatureDataset.extract_input(
            sampling_length=len(input_data.spec.array),
            f0_data=input_data.f0,
            phoneme_data=input_data.phoneme,
            spec_data=input_data.spec,
            silence_data=input_data.silence,
            phoneme_list_data=input_data.phoneme_list,
            f0_process_mode=F0ProcessMode(config.dataset.f0_process_mode),
            time_mask_max_second=0,
        )

        # The generator is called with a leading axis of size one (added
        # via numpy.newaxis); [0] removes it again from the result.
        spec = generator.generate(
            f0=data["f0"][numpy.newaxis],
            phoneme=data["phoneme"][numpy.newaxis],
            speaker_id=(numpy.array(speaker_id)[numpy.newaxis]
                        if speaker_id is not None else None),
        )[0]

        if transpose:
            spec = spec.T

        name = feature_input.f0_path.stem
        numpy.save(output_dir.joinpath(name + ".npy"), spec)
Esempio n. 6
0
def test_equal_base_config_and_reconstructed(train_config_path: Path):
    """A config must be unchanged by a ``to_dict`` / ``from_dict`` round trip."""
    with train_config_path.open() as stream:
        raw_config = yaml.load(stream, Loader=SafeLoader)

    original = Config.from_dict(raw_config)
    round_tripped = Config.from_dict(original.to_dict())
    assert original == round_tripped
Esempio n. 7
0
def test_to_dict(train_config_path: Path):
    """Smoke test: a loaded config can be serialized with ``to_dict``."""
    with train_config_path.open() as stream:
        raw_config = yaml.load(stream, Loader=SafeLoader)

    config = Config.from_dict(raw_config)
    config.to_dict()
Esempio n. 8
0
def run(text: str, speaker_id: int):
    """Synthesize ``text`` for ``speaker_id`` through the full model chain.

    Stages, in order: full-context label extraction and an OpenJTalk
    reference waveform, phoneme lengths (yukarin_s), per-mora f0
    (yukarin_sa), spectrogram (yukarin_soso) and waveform (HiFi-GAN).
    Model and config paths are hard-coded relative to the working
    directory, and every model runs on CPU (``use_gpu=False``).

    Intermediate and final results are written to the current working
    directory: ``hiho_openjtalk_wave.wav``, ``hiho_label_list.json``,
    ``hiho_phoneme_list.json``, ``hiho_phoneme_length.npy``,
    ``hiho_f0_list.npy``, ``hiho_f0.npy``, ``hiho_spec.npy``,
    ``hiho_output.wav`` and ``<text>-<speaker_id>.wav``.

    Args:
        text: Text to synthesize; also used verbatim in the final wav name.
        speaker_id: Speaker id passed to each generator.
    """
    # Frame rate used for phoneme lengths, the frame-level phoneme/f0
    # sequences, and as the source rate when resampling below.
    rate = 200

    # phoneme
    utterance = extract_full_context_label(text)

    # Accents can be overridden by hand at this point, for example:
    # utterance.breath_groups[0].accent_phrases[2].accent = 2
    # utterance.breath_groups[1].accent_phrases[1].accent = 6
    # utterance.breath_groups[1].accent_phrases[3].accent = 5

    x, sr = pyopenjtalk.synthesize(utterance.labels, speed=1, half_tone=0)
    # NOTE(review): scales the OpenJTalk samples by 2**16 -- presumably to
    # bring integer-range samples into a float range; confirm the factor.
    x /= 2**16
    soundfile.write("hiho_openjtalk_wave.wav", x, sr)

    label_data_list = utterance.phonemes

    # Build the payload first, then write inside a context manager so the
    # file is flushed and closed deterministically.
    label_str_list = [p.label for p in label_data_list]
    with open("hiho_label_list.json", mode="w") as f:
        json.dump(label_str_list, f)

    # Per-phoneme 0/1 flags. Allocated as int64 directly: every element is
    # assigned a bool in the loop below, which is stored as 0 or 1.
    num_labels = len(label_data_list)
    is_type1 = False
    phoneme_str_list = []
    start_accent_list = numpy.zeros(num_labels, dtype=numpy.int64)
    end_accent_list = numpy.zeros(num_labels, dtype=numpy.int64)
    start_accent_phrase_list = numpy.zeros(num_labels, dtype=numpy.int64)
    end_accent_phrase_list = numpy.zeros(num_labels, dtype=numpy.int64)
    for i, label in enumerate(label_data_list):
        is_end_accent = label.contexts["a1"] == "0"

        # On a label with a2 == "1" (presumably the first mora of an accent
        # phrase -- TODO confirm), remember whether it is also the accent
        # end; that state is kept for the following labels.
        if label.contexts["a2"] == "1":
            is_type1 = is_end_accent

        # The accent start is flagged at a2 == "1" when ``is_type1`` is
        # set, otherwise at a2 == "2".
        if label.contexts["a2"] == "1" and is_type1:
            is_start_accent = True
        elif label.contexts["a2"] == "2" and not is_type1:
            is_start_accent = True
        else:
            is_start_accent = False

        phoneme_str_list.append(label.phoneme)
        start_accent_list[i] = is_start_accent
        end_accent_list[i] = is_end_accent
        start_accent_phrase_list[i] = label.contexts["a2"] == "1"
        end_accent_phrase_list[i] = label.contexts["a3"] == "1"

    with open("hiho_phoneme_list.json", mode="w") as f:
        json.dump(phoneme_str_list, f)

    # yukarin_s
    with open("data/yukarin_s/check-bs128-hs32/config.yaml") as f:
        d = yaml.safe_load(f)

    generator_s = GeneratorS(
        config=ConfigS.from_dict(d),
        predictor=Path("data/yukarin_s/check-bs128-hs32/predictor_50000.pth"),
        use_gpu=False,
    )

    # start/end are placeholder positions (the running index); only the
    # phoneme identity is read from these objects in this function.
    phoneme_data_list = [
        JvsPhoneme(phoneme=p, start=i, end=i + 1)
        for i, p in enumerate(phoneme_str_list)
    ]
    phoneme_data_list = JvsPhoneme.convert(phoneme_data_list)
    phoneme_list_s = numpy.array([p.phoneme_id for p in phoneme_data_list])

    phoneme_length = generator_s.generate(
        phoneme_list=phoneme_list_s,
        speaker_id=speaker_id,
    )
    # Fix the first and last phoneme lengths to 0.1 (presumably the
    # leading/trailing silence, in seconds -- TODO confirm).
    phoneme_length[0] = phoneme_length[-1] = 0.1
    # Snap lengths to multiples of 1 / rate so they map to whole frames.
    phoneme_length = numpy.round(phoneme_length * rate) / rate
    numpy.save("hiho_phoneme_length.npy", phoneme_length)

    # yukarin_sa
    model_dir = Path(
        "data/yukarin_sa/withjsss-lr1.0e-03-ehs32-aehs32-pl2-pn8-fl2-fn2-try1")
    with (model_dir / "config.yaml").open() as f:
        d = yaml.safe_load(f)

    generator_sa = GeneratorSa(
        config=ConfigSa.from_dict(d),
        predictor=_get_predictor_model_path(model_dir),
        use_gpu=False,
    )

    assert generator_sa.config.dataset.f0_process_mode == "voiced_mora"
    (
        consonant_phoneme_data_list,
        vowel_phoneme_data_list,
        vowel_indexes_data,
    ) = split_mora(phoneme_data_list)

    vowel_indexes = numpy.array(vowel_indexes_data)

    vowel_phoneme_list = numpy.array(
        [p.phoneme_id for p in vowel_phoneme_data_list])
    # Entries without a consonant are encoded as -1.
    consonant_phoneme_list = numpy.array([
        p.phoneme_id if p is not None else -1
        for p in consonant_phoneme_data_list
    ])
    # Split the phoneme lengths right after each vowel index (except the
    # last) and sum each piece: one total length per vowel entry.
    phoneme_length_sa = numpy.array(
        [a.sum() for a in numpy.split(phoneme_length, vowel_indexes[:-1] + 1)])

    # Accent flags are sampled at the vowel positions; inputs get a leading
    # axis of size one, and [0] takes the single result.
    f0_list = generator_sa.generate(
        vowel_phoneme_list=vowel_phoneme_list[numpy.newaxis],
        consonant_phoneme_list=consonant_phoneme_list[numpy.newaxis],
        start_accent_list=start_accent_list[vowel_indexes][numpy.newaxis],
        end_accent_list=end_accent_list[vowel_indexes][numpy.newaxis],
        start_accent_phrase_list=start_accent_phrase_list[vowel_indexes][
            numpy.newaxis],
        end_accent_phrase_list=end_accent_phrase_list[vowel_indexes][
            numpy.newaxis],
        speaker_id=speaker_id,
    )[0]

    # Zero the f0 of entries whose phoneme is listed as unvoiced.
    for i, p in enumerate(vowel_phoneme_data_list):
        if p.phoneme in unvoiced_mora_phoneme_list:
            f0_list[i] = 0

    numpy.save("hiho_f0_list.npy", f0_list)

    # Expand per-phoneme ids and per-vowel-entry f0 values to frame level
    # by repeating each value for its length in frames.
    phoneme = numpy.repeat(
        phoneme_list_s,
        numpy.round(phoneme_length * rate).astype(numpy.int32))
    f0 = numpy.repeat(
        f0_list,
        numpy.round(phoneme_length_sa * rate).astype(numpy.int32))

    numpy.save("hiho_f0.npy", f0)

    # yukarin_soso
    with open(
            "data/yukarin_soso/f0mean-wei_voicedmora-sl1280-bs128-lr1.0e-03-mt0.2-mn32-try1/config.yaml"
    ) as f:
        d = yaml.safe_load(f)

    generator_soso = GeneratorSoso(
        config=ConfigSoso.from_dict(d),
        predictor=Path(
            "data/yukarin_soso/f0mean-wei_voicedmora-sl1280-bs128-lr1.0e-03-mt0.2-mn32-try1/predictor_220000.pth"
        ),
        use_gpu=False,
    )
    assert generator_soso.config.dataset.f0_process_mode == "voiced_mora_mean"

    # One-hot encode the frame-level phoneme ids.
    array = numpy.zeros((len(phoneme), JvsPhoneme.num_phoneme),
                        dtype=numpy.float32)
    array[numpy.arange(len(phoneme)), phoneme] = 1
    phoneme = array

    # Resample both sequences from ``rate`` to 24000 / 256 (presumably the
    # frame rate expected by the spectrogram model -- TODO confirm).
    f0 = SamplingData(array=f0, rate=rate).resample(24000 / 256)
    phoneme = SamplingData(array=phoneme, rate=rate).resample(24000 / 256)

    spec = generator_soso.generate(
        f0=f0[numpy.newaxis, :, numpy.newaxis],
        phoneme=phoneme[numpy.newaxis],
        speaker_id=numpy.array(speaker_id).reshape(-1),
    )[0]
    numpy.save("hiho_spec.npy", spec)

    # hifi-gan
    wave = inference_hifigan(
        x=spec.T,
        checkpoint_file="data/hifigan/g_03080000",
        config_file="data/hifigan/config.json",
    )

    # save
    soundfile.write("hiho_output.wav", data=wave, samplerate=24000)
    soundfile.write(f"{text}-{speaker_id}.wav", data=wave, samplerate=24000)
Esempio n. 9
0
def create_trainer(
    config: Config,
    output: Path,
):
    """Build a fully configured ``Trainer`` for ``config``.

    Side effects: adds git info to ``config``, creates ``output`` (with
    parents), writes ``config.yaml`` and ``struct.txt`` into it, and moves
    the model to the ``"cuda"`` device.

    Args:
        config: Training configuration; mutated by ``add_git_info``.
        output: Directory for the config dump, logs and snapshots.

    Returns:
        The trainer with all evaluators, reporters and snapshot extensions
        registered. It is not started here.
    """
    # config
    config.add_git_info()

    output.mkdir(exist_ok=True, parents=True)
    config_dump_path = output.joinpath("config.yaml")
    with config_dump_path.open(mode="w") as stream:
        yaml.safe_dump(config.to_dict(), stream)

    # model
    predictor = create_predictor(config.network)
    model = Model(model_config=config.model, predictor=predictor)
    initializer_name = config.train.weight_initializer
    if initializer_name is not None:
        init_weights(model, name=initializer_name)

    device = torch.device("cuda")
    model.to(device)

    # dataset
    # Options shared by every iterator created below.
    iterator_options = dict(
        batch_size=config.train.batch_size,
        num_processes=config.train.num_processes,
        use_multithread=config.train.use_multithread,
    )

    datasets = create_dataset(config.dataset)
    train_iter = create_iterator(datasets["train"],
                                 for_train=True,
                                 **iterator_options)
    test_iter = create_iterator(datasets["test"],
                                for_train=False,
                                **iterator_options)
    eval_iter = create_iterator(datasets["test"],
                                for_train=False,
                                for_eval=True,
                                **iterator_options)

    # The validation split is optional.
    if datasets["valid"] is not None:
        valid_iter = create_iterator(datasets["valid"],
                                     for_train=False,
                                     for_eval=True,
                                     **iterator_options)
    else:
        valid_iter = None

    # Escalate iterator timeout warnings to errors.
    warnings.simplefilter("error", MultiprocessIterator.TimeoutWarning)

    # optimizer
    optimizer = make_optimizer(config_dict=config.train.optimizer, model=model)

    # updater
    updater_class = AmpUpdater if config.train.use_amp else StandardUpdater
    updater = updater_class(
        iterator=train_iter,
        optimizer=optimizer,
        model=model,
        device=device,
    )

    # trainer
    trigger_log = (config.train.log_iteration, "iteration")
    trigger_eval = (config.train.eval_iteration, "iteration")
    trigger_snapshot = (config.train.snapshot_iteration, "iteration")
    trigger_stop = None
    if config.train.stop_iteration is not None:
        trigger_stop = (config.train.stop_iteration, "iteration")

    trainer = Trainer(updater, stop_trigger=trigger_stop, out=output)

    step_shift_options = config.train.step_shift
    if step_shift_options is not None:
        trainer.extend(extensions.StepShift(**step_shift_options))

    # Evaluation of the model itself on the test split, at every log step.
    trainer.extend(
        extensions.Evaluator(test_iter, model, device=device),
        name="test",
        trigger=trigger_log,
    )

    # Generation-based evaluation on the test (and optional valid) split.
    generator = Generator(config=config, predictor=predictor, use_gpu=True)
    generate_evaluator = GenerateEvaluator(generator=generator)
    trainer.extend(
        extensions.Evaluator(eval_iter, generate_evaluator, device=device),
        name="eval",
        trigger=trigger_eval,
    )
    if valid_iter is not None:
        trainer.extend(
            extensions.Evaluator(valid_iter,
                                 generate_evaluator,
                                 device=device),
            name="valid",
            trigger=trigger_eval,
        )

    # Predictor snapshots are triggered on "eval/main/mcd" via
    # LowValueTrigger; n_retains is set to 5.
    predictor_snapshot = extensions.snapshot_object(
        predictor,
        filename="predictor_{.updater.iteration}.pth",
        n_retains=5,
    )
    trainer.extend(
        predictor_snapshot,
        trigger=LowValueTrigger("eval/main/mcd", trigger=trigger_eval),
    )

    trainer.extend(extensions.FailOnNonNumber(), trigger=trigger_log)
    trainer.extend(extensions.observe_lr(), trigger=trigger_log)
    trainer.extend(extensions.LogReport(trigger=trigger_log))
    report_keys = ["iteration", "main/loss", "test/main/loss"]
    trainer.extend(extensions.PrintReport(report_keys), trigger=trigger_log)

    tensorboard_report = TensorboardReport(writer=SummaryWriter(Path(output)))
    trainer.extend(tensorboard_report, trigger=trigger_log)

    # Weights & Biases reporting is enabled only when a category is set.
    if config.project.category is not None:
        wandb_report = WandbReport(
            config_dict=config.to_dict(),
            project_category=config.project.category,
            project_name=config.project.name,
            output_dir=output.joinpath("wandb"),
        )
        trainer.extend(wandb_report, trigger=trigger_log)

    (output / "struct.txt").write_text(repr(model))

    if trigger_stop is not None:
        trainer.extend(extensions.ProgressBar(trigger_stop))

    # Trainer snapshot, registered with autoload=True and n_retains=1.
    trainer_snapshot = extensions.snapshot_object(
        trainer,
        filename="trainer_{.updater.iteration}.pth",
        n_retains=1,
        autoload=True,
    )
    trainer.extend(trainer_snapshot, trigger=trigger_snapshot)

    return trainer