Example #1
def calc_mcd(
    path1: Optional[Path] = None,
    path2: Optional[Path] = None,
    wave1: Optional[Wave] = None,
    wave2: Optional[Wave] = None,
):
    wave1 = Wave.load(path1) if wave1 is None else wave1
    wave2 = Wave.load(path2) if wave2 is None else wave2
    assert wave1.sampling_rate == wave2.sampling_rate

    sampling_rate = wave1.sampling_rate

    # Trim both signals to a common length before feature extraction.
    min_length = min(len(wave1.wave), len(wave2.wave))
    wave1.wave = wave1.wave[:min_length]
    wave2.wave = wave2.wave[:min_length]

    mc1 = to_melcepstrum(
        x=wave1.wave,
        sampling_rate=sampling_rate,
        n_fft=2048,
        win_length=1024,
        hop_length=256,
        order=24,
    )
    mc2 = to_melcepstrum(
        x=wave2.wave,
        sampling_rate=sampling_rate,
        n_fft=2048,
        win_length=1024,
        hop_length=256,
        order=24,
    )
    return _mcd(mc1, mc2)
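
A minimal usage sketch for calc_mcd above, assuming two WAV files with matching sampling rates; the file names are hypothetical:

from pathlib import Path

# Hypothetical paths; calc_mcd loads both files and returns the mel-cepstral distortion.
mcd = calc_mcd(path1=Path("reference.wav"), path2=Path("generated.wav"))
print(f"MCD: {mcd:.3f}")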
Example #2
def generate(self):
    return Input(
        wave=Wave.load(self.path_wave),
        silence=SamplingData.load(self.path_silence),
        f0=SamplingData.load(self.path_f0),
        phoneme=SamplingData.load(self.path_phoneme),
    )
Example #3
def _process(path: Path, bit: int, gaussian_noise_sigma: float):
    wave = Wave.load(path).wave

    if gaussian_noise_sigma > 0:
        wave += numpy.random.randn(*wave.shape) * gaussian_noise_sigma

    # Mu-law encode to 2**bit levels, then count occurrences of each code.
    encoded = encode_single(encode_mulaw(wave, mu=2 ** bit), bit=bit)
    return numpy.histogram(encoded, bins=2 ** bit, range=(0, 2 ** bit))[0].astype(
        numpy.uint64
    )
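
A sketch of aggregating the per-file histograms from _process into a corpus-level distribution; the paths below are hypothetical:

from pathlib import Path

paths = [Path("a.wav"), Path("b.wav")]  # hypothetical corpus files
total = sum(_process(p, bit=8, gaussian_noise_sigma=0.0) for p in paths)
probabilities = total / total.sum()  # empirical distribution over the 256 mu-law codes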
Example #4
def calc_silence_rate(
    path1: Optional[Path] = None,
    path2: Optional[Path] = None,
    wave1: Optional[Wave] = None,
    wave2: Optional[Wave] = None,
):
    wave1 = Wave.load(path1) if wave1 is None else wave1
    wave2 = Wave.load(path2) if wave2 is None else wave2
    assert wave1.sampling_rate == wave2.sampling_rate

    # librosa's private helper marks non-silent frames; invert to get silence masks.
    silence1 = ~librosa.effects._signal_to_frame_nonsilent(wave1.wave)
    silence2 = ~librosa.effects._signal_to_frame_nonsilent(wave2.wave)

    tp = numpy.logical_and(silence1, silence2).sum(dtype=float)
    tn = numpy.logical_and(~silence1, ~silence2).sum(dtype=float)
    fn = numpy.logical_and(silence1, ~silence2).sum(dtype=float)
    fp = numpy.logical_and(~silence1, silence2).sum(dtype=float)

    accuracy = (tp + tn) / (tp + tn + fn + fp)
    return accuracy
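
A usage sketch for calc_silence_rate, again with hypothetical file names; the returned accuracy is the fraction of frames on which the two silence masks agree:

rate = calc_silence_rate(path1=Path("reference.wav"), path2=Path("generated.wav"))
print(f"silence agreement: {rate:.1%}")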
Example #5
def generate(self):
    wave = Wave.load(self.path_wave)

    try:
        local = SamplingData.load(self.path_local)
    except Exception:
        # Fall back to computing the local feature (log mel-spectrogram)
        # and cache it to a temporary .npy file for later calls.
        local_rate = 80
        local_array = to_log_melspectrogram(wave=wave, rate=local_rate)
        local = SamplingData(array=local_array, rate=local_rate)

        with NamedTemporaryFile(suffix=".npy", delete=False) as f:
            self.path_local = Path(f.name)
            local.save(self.path_local)

    return Input(
        wave=wave,
        silence=SamplingData.load(self.path_silence),
        local=local,
    )
Example #6
def process(
    input_paths: Tuple[Path, Path],
    output_dir: Path,
):
    input_wave, input_f0 = input_paths

    wave_data = Wave.load(input_wave)
    f0_data = F0.load(input_f0)

    y = wave_data.wave.astype(np.float64)
    sr = wave_data.sampling_rate

    f0 = np.exp(f0_data.array[:, 0].astype(np.float64))
    if f0_data.with_vuv:
        # Zero out unvoiced frames; cast the vuv column to bool before inverting.
        f0[~f0_data.array[:, 1].astype(bool)] = 0

    t = np.arange(0, len(f0), dtype=np.float64) / f0_data.rate
    sp = pyworld.cheaptrick(y, f0, t, sr)
    ap = pyworld.d4c(y, f0, t, sr)

    y = pyworld.synthesize(f0, sp, ap, sr)

    out = output_dir / f"{input_f0.stem}.wav"
    # Note: librosa.output.write_wav was removed in librosa 0.8; use soundfile.write on newer versions.
    librosa.output.write_wav(str(out), y.astype(np.float32), sr)
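
A hedged invocation sketch for the WORLD resynthesis above; both input paths and the output directory are hypothetical, and the F0 file is assumed to be in the project's own format:

from pathlib import Path

process(
    input_paths=(Path("speech.wav"), Path("speech_f0.npy")),
    output_dir=Path("resynthesized"),
)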
Example #7
def generate(self):
    return Input(
        wave=Wave.load(self.path_wave),
        silence=SamplingData.load(self.path_silence),
        local=SamplingData.load(self.path_local),
    )
Example #8
def generate(
    model_dir: Path,
    model_iteration: Optional[int],
    model_config: Optional[Path],
    output_dir: Path,
    to_voiced_scaler: bool,
    to_f0_scaler: bool,
    to_phoneme_onehot: bool,
    batch_size: Optional[int],
    num_test: int,
    target_glob: Optional[str],
    use_gpu: bool,
):
    if model_config is None:
        model_config = model_dir / "config.yaml"

    output_dir.mkdir(exist_ok=True)
    save_arguments(output_dir / "arguments.yaml", generate, locals())

    config = Config.from_dict(yaml.safe_load(model_config.open()))

    generator = Generator(
        config=config,
        predictor=_get_model_path(
            model_dir=model_dir,
            iteration=model_iteration,
            prefix="predictor_",
        ),
        voiced_network=(
            None
            if not to_voiced_scaler
            else _get_model_path(
                model_dir=model_dir,
                iteration=model_iteration,
                prefix="voiced_network_",
            )
        ),
        f0_network=(
            None
            if not to_f0_scaler
            else _get_model_path(
                model_dir=model_dir,
                iteration=model_iteration,
                prefix="f0_network_",
            )
        ),
        phoneme_network=(
            None
            if not to_phoneme_onehot
            else _get_model_path(
                model_dir=model_dir,
                iteration=model_iteration,
                prefix="phoneme_network_",
            )
        ),
        use_gpu=use_gpu,
    )

    dataset = create_dataset(config.dataset)["test"]
    scale = numpy.prod(config.network.scale_list)

    if batch_size is None:
        batch_size = config.train.batch_size

    if isinstance(dataset, SpeakerWavesDataset):
        wave_paths = [data.path_wave for data in dataset.wave_dataset.inputs[:num_test]]
    elif isinstance(dataset, WavesDataset):
        wave_paths = [data.path_wave for data in dataset.inputs[:num_test]]
    else:
        raise ValueError(f"unexpected dataset type: {type(dataset)}")

    if target_glob is not None:
        wave_paths += list(map(Path, glob(target_glob)))

    for wps in tqdm(chunked(wave_paths, batch_size), desc="generate"):
        waves = [Wave.load(p) for p in wps]
        arrays = [w.wave for w in waves]

        # Pad each wave up to the longest batch member, rounded to a multiple of
        # the network's total downsampling scale so frames align with features.
        pad_lengths = [int(numpy.ceil(len(w) / scale) * scale) for w in arrays]
        arrays = [numpy.r_[w, numpy.zeros(max(pad_lengths) - len(w))] for w in arrays]

        tensors = [torch.from_numpy(array.astype(numpy.float32)) for array in arrays]
        output = generator.generate(
            wave=concat_examples(tensors),
            to_voiced_scaler=to_voiced_scaler,
            to_f0_scaler=to_f0_scaler,
            to_phoneme_onehot=to_phoneme_onehot,
        )

        for feature, p, w, l in zip(output, wps, waves, pad_lengths):
            feature = feature.T[: l // scale]
            data = SamplingData(array=feature, rate=w.sampling_rate // scale)
            data.save(output_dir / (p.stem + ".npy"))
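
A minimal invocation sketch, grounded only in the signature above; every path and flag is hypothetical:

from pathlib import Path

generate(
    model_dir=Path("trained_model"),
    model_iteration=None,   # None: let _get_model_path choose a snapshot
    model_config=None,      # falls back to trained_model/config.yaml
    output_dir=Path("generated_features"),
    to_voiced_scaler=False,
    to_f0_scaler=False,
    to_phoneme_onehot=False,
    batch_size=None,        # falls back to the training batch size
    num_test=10,
    target_glob=None,
    use_gpu=False,
)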
Example #9
def main():
    model_dir: Path = arguments.model_dir
    model_iteration: int = arguments.model_iteration
    model_config: Path = arguments.model_config
    time_length: float = arguments.time_length
    gpu: int = arguments.gpu

    config = create_config(model_config)
    model_path = _get_predictor_model_path(model_dir, model_iteration)

    sr = config.dataset.sampling_rate

    model = create_predictor(config.model)
    chainer.serializers.load_npz(str(model_path), model)
    if gpu is not None:
        model.to_gpu(gpu)
        cuda.get_device_from_id(gpu).use()

    chainer.global_config.train = False
    chainer.global_config.enable_backprop = False

    wave_paths = sorted([Path(p) for p in glob.glob(str(config.dataset.input_wave_glob))])
    local_paths = sorted([Path(p) for p in glob.glob(str(config.dataset.input_local_glob))])
    assert len(wave_paths) == len(local_paths)

    # Shuffle both lists with the same seed so wave/local pairs stay aligned.
    np.random.RandomState(config.dataset.seed).shuffle(wave_paths)
    np.random.RandomState(config.dataset.seed).shuffle(local_paths)
    wave_path = wave_paths[0]
    local_path = local_paths[0]
    w_data = Wave.load(wave_path, sampling_rate=sr)
    l_data = SamplingData.load(local_path)

    length = int(sr * time_length)
    l_scale = int(sr // l_data.rate)
    l_sl = length // l_scale
    length = l_sl * l_scale

    w = w_data.wave[:length]
    l = l_data.array[:l_sl]
    coarse, fine = encode_16bit(w)

    c, f, hc, hf = model(
        c_array=decode_single(model.xp.asarray(coarse)).astype(np.float32)[np.newaxis],
        f_array=decode_single(model.xp.asarray(fine)).astype(np.float32)[:-1][np.newaxis],
        l_array=model.xp.asarray(l)[np.newaxis],
    )

    c = chainer.functions.softmax(c)

    c = chainer.cuda.to_cpu(c[0].data)
    f = chainer.cuda.to_cpu(f[0].data)

    fig = plt.figure(figsize=[32 * time_length, 10])

    plt.imshow(c, aspect='auto', interpolation='nearest')
    plt.colorbar()

    plt.plot((w + 1) * 127.5, 'g', linewidth=0.1, label='true')
    plt.plot(np.argmax(c, axis=0) + np.argmax(f, axis=0) / 256, 'r', linewidth=0.1, label='predicted')
    plt.legend()

    fig.savefig('output.eps')
Example #10
def collect_to_tfevents(
    input_dir: Path,
    output_dir: Optional[Path],
    filename_suffix: str,
    audio_tag_format: str,
    diff_tag: str,
    iteration_format: str,
    remove_exist: bool,
    expected_wave_dir: Optional[Path],
):
    if output_dir is None:
        output_dir = input_dir

    if remove_exist:
        for p in output_dir.glob(f"*tfevents*{filename_suffix}"):
            p.unlink()

    flag_calc_diff = expected_wave_dir is not None

    summary_writer = SummaryWriter(logdir=str(output_dir),
                                   filename_suffix=filename_suffix)

    diffs: DefaultDict[int, List[float]] = defaultdict(list)
    for p in tqdm(sorted(input_dir.rglob("*"), key=_to_nums),
                  desc=input_dir.stem):
        if p.is_dir():
            continue

        if "tfevents" in p.name:
            continue

        rp = p.relative_to(input_dir)
        iteration = int(iteration_format.format(p=p, rp=rp))

        # audio
        if p.suffix in [".wav"]:
            wave, sr = librosa.load(str(p), sr=None)
            summary_writer.add_audio(
                tag=audio_tag_format.format(p=p, rp=rp),
                snd_tensor=wave,
                sample_rate=sr,
                global_step=iteration,
            )

        # diff
        if flag_calc_diff and p.name.endswith("_woc.wav"):
            wave_id = p.name[:-8]
            expected = expected_wave_dir.joinpath(f"{wave_id}.wav")

            wo = Wave.load(p)
            wi = Wave.load(expected, sampling_rate=wo.sampling_rate)

            diff = calc_mcd(wave1=wi, wave2=wo)
            diffs[iteration].append(diff)

    if flag_calc_diff:
        for iteration, values in sorted(diffs.items()):
            summary_writer.add_scalar(
                tag=diff_tag,
                scalar_value=numpy.mean(values),
                global_step=iteration,
            )

    summary_writer.close()
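
A hedged invocation sketch for collect_to_tfevents; all directories and format strings below are hypothetical, but iteration_format must yield an integer when formatted with the file path as the code above expects:

from pathlib import Path

collect_to_tfevents(
    input_dir=Path("generated"),
    output_dir=None,                     # write the event file next to the inputs
    filename_suffix=".collected",
    audio_tag_format="audio/{rp.stem}",
    diff_tag="mcd",
    iteration_format="{p.parent.name}",  # hypothetical layout: one directory per iteration
    remove_exist=True,
    expected_wave_dir=Path("expected"),  # enables the MCD diff pass
)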
Example #11
async def to_feature(text: str = Form(...), wave: UploadFile = File(...)):
    with TemporaryDirectory() as d:
        tmp_dir = Path(d)
        input_audio_path = tmp_dir.joinpath("input.wav")
        input_audio_path.write_bytes(await wave.read())

        # openjtalk
        phonemes = [
            p.label
            for p in openjtalk_label_getter(
                text,
                openjtalk_command="open_jtalk",
                dict_path=Path("/var/lib/mecab/dic/open-jtalk/naist-jdic"),
                htsvoice_path=Path(
                    "/usr/share/hts-voice/nitech-jp-atr503-m001/nitech_jp_atr503_m001.htsvoice"
                ),
                output_wave_path=tmp_dir.joinpath("wave.wav"),
                output_log_path=tmp_dir.joinpath("log.txt"),
                output_type=OutputType.phoneme,
                without_span=False,
            )
        ]

        # julius
        julius_audio_path = tmp_dir.joinpath("julius.wav")
        subprocess.check_call(
            f"sox {input_audio_path} -r 16000 -b 16 {julius_audio_path}".split()
        )

        julius_phonemes = [
            p if p not in _jvs_to_julius else _jvs_to_julius[p]
            for p in phonemes
            if p != "sil"
        ]

        julius_dict_path = tmp_dir.joinpath("2nd.dict")
        julius_dict = sp_inserter.gen_julius_dict_2nd(
            " ".join(julius_phonemes), model_type=sp_inserter.ModelType.gmm
        )
        julius_dict_path.write_text(julius_dict)

        julius_dfa_path = tmp_dir.joinpath("2nd.dfa")
        julius_dfa = sp_inserter.gen_julius_aliment_dfa(julius_dict.count("\n"))
        julius_dfa_path.write_text(julius_dfa)

        julius_output = sp_inserter.julius_phone_alignment(
            str(julius_audio_path),
            str(tmp_dir.joinpath("2nd")),
            _hmm_model,
            model_type=sp_inserter.ModelType.gmm,
            options=None,
        )

        time_alignment_list = sp_inserter.frame_to_second(
            sp_inserter.get_time_alimented_list(julius_output)
        )

        i_phoneme = 0
        new_phonemes = []
        for p in phonemes:
            if p == "pau" and time_alignment_list[i_phoneme][2] != "sp":
                continue
            i_phoneme += 1
            new_phonemes.append(p)

        aligned = JvsPhoneme.convert(
            [
                JvsPhoneme(start=float(o[0]), end=float(o[1]), phoneme=p)
                for p, o in zip(new_phonemes, time_alignment_list)
            ]
        )
        for p in aligned:
            p.verify()

        # world
        f0 = F0.from_wave(
            Wave.load(input_audio_path, sampling_rate=24000, dtype=numpy.float64),
            frame_period=5.0,
            f0_floor=71.0,
            f0_ceil=800,
            with_vuv=False,
            f0_type=F0Type.world,
        )
        converted_f0 = f0.convert(
            input_mean=f0.valid_f0_log.mean(),
            input_var=f0.valid_f0_log.var(),
            target_mean=_voiro_mean,
            target_var=f0.valid_f0_log.var(),
        )
        converted_f0.array = converted_f0.array.astype(numpy.float32).reshape(-1, 1)

        # feature
        phoneme_array = LinguisticFeature(
            phonemes=aligned,
            phoneme_class=JvsPhoneme,
            rate=_feature_rate,
            feature_types=[LinguisticFeature.FeatureType.PHONEME],
        ).make_array()

        phoneme = SamplingData(array=phoneme_array, rate=_feature_rate)

        feature = SamplingData.collect(
            [converted_f0, phoneme],
            rate=_feature_rate,
            mode="min",
            error_time_length=0.015,
        )

    return StreamingResponse(BytesIO(feature.astype(numpy.float32).tobytes()))
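
A client-side sketch for calling this endpoint, assuming it is mounted at a hypothetical /to_feature route (the FastAPI route decorator is not shown above):

import requests

with open("input.wav", "rb") as f:
    response = requests.post(
        "http://localhost:8000/to_feature",  # hypothetical host and route
        data={"text": "こんにちは"},
        files={"wave": f},
    )
feature_bytes = response.content  # float32 bytes of the collected feature array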