Esempio n. 1
0
 def generate(self):
     """Load the wave, silence, f0 and phoneme files and bundle them."""
     wave = Wave.load(self.path_wave)
     silence = SamplingData.load(self.path_silence)
     f0 = SamplingData.load(self.path_f0)
     phoneme = SamplingData.load(self.path_phoneme)
     return Input(wave=wave, silence=silence, f0=f0, phoneme=phoneme)
Esempio n. 2
0
    def __getitem__(self, i: int):
        """Synthesize one training example: a sine wave with a random
        frequency and phase, its constant log-frequency local feature, and
        an all-false silence mask.

        :param i: dataset index (unused; every item is freshly randomized)
        """
        sampling_rate = self.sampling_rate
        length = self.sampling_length
        # Random frequency in the configured range and a random phase offset.
        frequency = numpy.random.uniform(self.frequency_range[0],
                                         self.frequency_range[1])
        rand = numpy.random.rand()

        wave = numpy.sin(
            (2 * numpy.pi) * (numpy.arange(length, dtype=numpy.float32) *
                              frequency / sampling_rate + rand))

        # Local conditioning: constant log-frequency, one value per
        # `local_scale` waveform samples.
        local = numpy.log(
            numpy.ones(shape=(length // self.local_scale, 1),
                       dtype=numpy.float32) * frequency)

        # `numpy.bool` was deprecated in NumPy 1.20 and removed in 1.24;
        # the builtin `bool` is the supported spelling.
        silence = numpy.zeros(shape=(length, ), dtype=bool)

        return default_convert(
            self.make_input(
                wave_data=Wave(wave=wave, sampling_rate=sampling_rate),
                silence_data=SamplingData(array=silence, rate=sampling_rate),
                local_data=SamplingData(array=local,
                                        rate=sampling_rate //
                                        self.local_scale),
            ))
Esempio n. 3
0
def test_extract_input_with_dataset(
    sampling_length: int,
    f0_path: Path,
    phoneme_path: Path,
    phoneme_list_path: Path,
    silence_path: Path,
    spectrogram_path: Path,
    volume_path: Path,
    f0_process_mode: F0ProcessMode,
    time_mask_max_second: float,
    time_mask_num: int,
):
    """Smoke test: FeatureDataset.extract_input runs on real dataset files."""
    loaded_f0 = SamplingData.load(f0_path)
    loaded_phoneme = SamplingData.load(phoneme_path)
    loaded_phoneme_list = JvsPhoneme.load_julius_list(phoneme_list_path)
    loaded_silence = SamplingData.load(silence_path)
    loaded_spectrogram = SamplingData.load(spectrogram_path)
    loaded_volume = SamplingData.load(volume_path)

    FeatureDataset.extract_input(
        sampling_length=sampling_length,
        f0_data=loaded_f0,
        phoneme_data=loaded_phoneme,
        spec_data=loaded_spectrogram,
        silence_data=loaded_silence,
        phoneme_list_data=loaded_phoneme_list,
        volume_data=loaded_volume,
        f0_process_mode=f0_process_mode,
        time_mask_max_second=time_mask_max_second,
        time_mask_num=time_mask_num,
    )
Esempio n. 4
0
def test_extract_input(sampling_length: int, data_length: int,
                       padding_length: int):
    """Check NaN/padding accounting of extract_input over repeated draws."""
    silence_data = SamplingData(array=numpy.zeros(data_length, dtype=bool),
                                rate=1)
    ramp = numpy.linspace(start=1, stop=2, num=data_length)
    spectrogram_data = SamplingData(array=ramp[:, numpy.newaxis], rate=1)

    total_length = sampling_length + padding_length * 2
    for _ in range(100):
        result = extract_input(
            sampling_length=sampling_length,
            spectrogram_data=spectrogram_data,
            silence_data=silence_data,
            min_not_silence_length=min(sampling_length, data_length),
            padding_length=padding_length,
            padding_value=numpy.nan,
        )
        spectrogram = result["spectrogram"]

        assert len(spectrogram) == total_length

        # NaNs mark padded positions.
        nan_count = numpy.isnan(spectrogram).sum()
        if sampling_length <= data_length:
            assert nan_count <= padding_length * 2
        else:
            assert nan_count == sampling_length - data_length + padding_length * 2

        # The unpadded center must hold enough real (non-NaN) values.
        if padding_length == 0:
            core = spectrogram
        else:
            core = spectrogram[padding_length:-padding_length]
        assert (~numpy.isnan(core)).sum() >= min(sampling_length, data_length)
Esempio n. 5
0
    def test_convert_to_dict(self):
        """convert_input should emit entries sized by sampling_length:
        waveform-rate entries full length (silence one sample shorter),
        local entries scaled down by the rate ratio."""
        sampling_rate = 800
        local_sampling_rate = 200
        # Waveform samples per local-feature frame.
        scale = sampling_rate // local_sampling_rate
        time_length = 10
        sampling_length = 16

        # Monotonic ramp so cropped positions stay distinguishable.
        wave_data = Wave(
            wave=numpy.linspace(
                0,
                sampling_rate * time_length,
                sampling_rate * time_length,
                endpoint=False,
            ),
            sampling_rate=sampling_rate,
        )
        silence_data = SamplingData(
            array=numpy.zeros((sampling_rate * time_length, ), dtype=bool),
            rate=sampling_rate,
        )
        local_data = SamplingData(
            array=numpy.linspace(
                0,
                sampling_rate * time_length,
                local_sampling_rate * time_length,
                endpoint=False,
            ),
            rate=local_sampling_rate,
        )

        wave, silence, local = BaseWaveDataset.extract_input(
            sampling_length,
            wave_data=wave_data,
            silence_data=silence_data,
            local_data=local_data,
            local_sampling_rate=local_sampling_rate,
            local_padding_size=0,
            local_mask_max_second=0,
            local_mask_num=0,
        )

        dataset = BaseWaveDataset(
            sampling_rate=sampling_rate,
            sampling_length=sampling_length,
            bit=10,
            mulaw=False,
            wave_random_max_second=0,
            wave_random_num=0,
            local_sampling_rate=local_sampling_rate,
            local_padding_size=0,
            local_mask_max_second=0,
            local_mask_num=0,
        )
        d = dataset.convert_input(wave, silence, local)
        self.assertEqual(len(d["coarse"]), sampling_length)
        self.assertEqual(len(d["encoded_coarse"]), sampling_length)
        # Silence comes out one sample shorter than the wave entries.
        self.assertEqual(len(d["silence"]), sampling_length - 1)
        self.assertEqual(len(d["local"]), sampling_length // scale)
Esempio n. 6
0
    def test_extract_input(self):
        """extract_input should return wave/silence of sampling_length and a
        local feature of sampling_length // scale, with wave and local
        aligned (checked by down-sampling the ramp wave to the local rate)."""
        for sampling_rate, local_sampling_rate, sampling_length, time_length in [
            [800, 200, 16, 10],
            [24000, 24000 / 256, 1024, 100],
        ]:
            with self.subTest(
                    sampling_rate=sampling_rate,
                    local_sampling_rate=local_sampling_rate,
                    sampling_length=sampling_length,
                    time_length=time_length,
            ):
                # Waveform samples per local frame (rate may be a float).
                scale = sampling_rate // local_sampling_rate

                # Ramp waveform: value equals sample index.
                wave_data = Wave(
                    wave=numpy.linspace(
                        0,
                        int(sampling_rate * time_length),
                        int(sampling_rate * time_length),
                        endpoint=False,
                    ),
                    sampling_rate=sampling_rate,
                )
                silence_data = SamplingData(
                    array=numpy.zeros((sampling_rate * time_length, ),
                                      dtype=bool),
                    rate=sampling_rate,
                )
                local_data = SamplingData(
                    array=numpy.linspace(
                        0,
                        int(sampling_rate * time_length),
                        int(local_sampling_rate * time_length),
                        endpoint=False,
                    ),
                    rate=local_sampling_rate,
                )

                # Repeat to cover several random crop offsets.
                for _ in range(10):
                    wave, silence, local = BaseWaveDataset.extract_input(
                        sampling_length,
                        wave_data=wave_data,
                        silence_data=silence_data,
                        local_data=local_data,
                        local_sampling_rate=local_sampling_rate,
                        local_padding_size=0,
                        local_mask_max_second=0,
                        local_mask_num=0,
                    )

                    self.assertEqual(len(wave), sampling_length)
                    self.assertEqual(len(silence), sampling_length)
                    self.assertEqual(len(local), sampling_length // scale)

                    # Down-sample the wave to the local rate and compare.
                    wave_as_local = wave.reshape(int(sampling_length // scale),
                                                 -1).min(axis=1)
                    self.assertTrue(numpy.all(wave_as_local == local))
Esempio n. 7
0
 def generate(self):
     """Load every feature file for one item; optional ones may be None."""
     f0 = SamplingData.load(self.f0_path)
     phoneme = SamplingData.load(self.phoneme_path)
     spec = SamplingData.load(self.spec_path)
     silence = SamplingData.load(self.silence_path)

     if self.phoneme_list_path is not None:
         phoneme_list = self.phoneme_class.load_julius_list(
             self.phoneme_list_path)
     else:
         phoneme_list = None

     if self.volume_path is not None:
         volume = SamplingData.load(self.volume_path)
     else:
         volume = None

     return Input(
         f0=f0,
         phoneme=phoneme,
         spec=spec,
         silence=silence,
         phoneme_list=phoneme_list,
         volume=volume,
     )
Esempio n. 8
0
 def setUp(self):
     """Build two constant half-length waves (-1 and +1) as test inputs."""
     half = self.num // 2
     waves = [np.ones(half) * -1, np.ones(half)]

     self.inputs = []
     for w in waves:
         wave = Wave(wave=w, sampling_rate=self.sampling_rate)
         # Empty local feature: zero channels, one row per sample.
         local = SamplingData(array=np.empty((len(w), 0)),
                              rate=self.sampling_rate)
         silence = SamplingData(array=np.zeros((len(w), ), dtype=bool),
                                rate=self.sampling_rate)
         self.inputs.append(Input(wave=wave, local=local, silence=silence))
Esempio n. 9
0
def generate_dataset(
    dataset_directory: Path,
    data_num: int,
    f0_rate: int,
    phoneme_rate: int,
    phoneme_size: int,
    speaker_size: int,
):
    """Generate a synthetic f0/phoneme dataset on disk.

    Creates ``f0``, ``phoneme`` and ``phoneme_list`` subdirectories with one
    file per item, plus a ``speaker_dict.json`` mapping speaker number to the
    item names assigned to it.  Existing files in the directory are removed
    first.

    :param data_num: number of items to generate
    :param f0_rate: rate of the f0 arrays; assumed to be a multiple of
        ``phoneme_rate`` (the code uses ``f0_rate // phoneme_rate``)
    :param speaker_size: items are assigned to speakers round-robin
    """
    # Clear previous contents (files only; subdirectories are kept).
    if dataset_directory.exists():
        for p in dataset_directory.rglob("*"):
            if not p.is_dir():
                p.unlink()
    else:
        dataset_directory.mkdir()

    f0_dir = dataset_directory.joinpath("f0")
    phoneme_dir = dataset_directory.joinpath("phoneme")
    phoneme_list_dir = dataset_directory.joinpath("phoneme_list")

    f0_dir.mkdir(exist_ok=True)
    phoneme_dir.mkdir(exist_ok=True)
    phoneme_list_dir.mkdir(exist_ok=True)

    speaker_dict = defaultdict(list)
    for i_data in range(data_num):
        speaker_num = i_data % speaker_size
        speaker_dict[str(speaker_num)].append(str(i_data))

        # Random phoneme id sequence of random length.
        source_length = int(numpy.random.randint(low=10, high=20))
        phoneme_list = numpy.random.randint(low=0,
                                            high=phoneme_size,
                                            size=source_length,
                                            dtype=numpy.int32)
        phoneme_list_dir.joinpath(f"{i_data}.lab").write_text("\n".join(
            [f"0 0 {JvsPhoneme.phoneme_list[p]}" for p in phoneme_list]))

        # f0 derived from phoneme ids, shifted per speaker, repeated so its
        # length matches the per-phoneme durations at f0_rate.
        f0 = phoneme_list.astype(numpy.float32) / 10 + 0.2 + speaker_num / 100
        f0 = numpy.repeat(f0, (phoneme_list + 1) * (f0_rate // phoneme_rate))
        f0[::5] = 0  # punch periodic unvoiced holes
        SamplingData(array=f0,
                     rate=f0_rate).save(f0_dir.joinpath(f"{i_data}.npy"))

        # One-hot phoneme frames at phoneme_rate.
        phoneme = numpy.repeat(phoneme_list, phoneme_list + 1)
        phoneme = numpy.identity(phoneme_size, dtype=numpy.int32)[phoneme]
        SamplingData(array=phoneme, rate=phoneme_rate).save(
            phoneme_dir.joinpath(f"{i_data}.npy"))

    # Use a context manager so the file is flushed and closed deterministically
    # (the original relied on garbage collection to close the handle).
    with dataset_directory.joinpath("speaker_dict.json").open("w") as f:
        json.dump(speaker_dict, f)
Esempio n. 10
0
def convert_f0(
    model_config: Path,
    input_glob: str,
    input_f0_statistics: Path,
    target_f0_statistics: Path,
    output_dir: Path,
):
    """Shift the f0 channel of every matched feature file by the difference
    of the target and input mean f0 statistics, saving results to
    ``output_dir``.

    :raises ValueError: if a file's feature width matches neither known layout.
    """
    output_dir.mkdir(exist_ok=True)
    save_arguments(output_dir / "arguments.yaml", convert_f0, locals())

    # read_text() instead of open(): the original left the config file
    # handle open with no explicit close.
    config = Config.from_dict(yaml.safe_load(model_config.read_text()))

    input_stat = numpy.load(input_f0_statistics, allow_pickle=True).item()
    target_stat = numpy.load(target_f0_statistics, allow_pickle=True).item()

    paths = list(map(Path, glob(input_glob)))

    for p in tqdm(paths, desc="convert_f0"):
        data = SamplingData.load(p)

        # Locate the f0 column from the feature width.
        if data.array.shape[1] == (config.network.voiced_feature_size + 1 +
                                   config.network.phoneme_feature_size):
            f0_index = config.network.voiced_feature_size
        elif data.array.shape[1] == (1 + 1 + 40):
            f0_index = 1
        else:
            raise ValueError(data.array.shape[1])

        data.array[:, f0_index] += target_stat["mean"] - input_stat["mean"]
        data.save(output_dir / (p.stem + ".npy"))
Esempio n. 11
0
def process_wo_context(
    local_paths: Sequence[Path],
    speaker_nums: Optional[Sequence[int]],
    generator: Generator,
    postfix="_woc",
):
    """Generate waves from local features (without context) and save them as
    ``<stem><postfix>.wav``; failures are printed, not raised.

    NOTE(review): reads module-level ``time_length``, ``sampling_policy``
    and ``output_dir`` — confirm they are defined where this is used.
    """
    try:
        local_datas = [
            SamplingData.load(local_path) for local_path in local_paths
        ]
        # Crop or edge-pad every local array to (time_length + 5) seconds.
        size = int((time_length + 5) * local_datas[0].rate)
        local_arrays = [
            local_data.array[:size]
            if len(local_data.array) >= size else np.pad(
                local_data.array,
                ((0, size - len(local_data.array)), (0, 0)),
                mode="edge",
            ) for local_data in local_datas
        ]

        waves = generator.generate(
            time_length=time_length,
            sampling_policy=sampling_policy,
            num_generate=len(local_arrays),
            local_array=np.stack(local_arrays),
            speaker_nums=speaker_nums,
        )
        for wave, local_path in zip(waves, local_paths):
            wave.save(output_dir / (local_path.stem + postfix + ".wav"))
    except Exception:
        # Was a bare `except:`; keep the best-effort behavior but let
        # KeyboardInterrupt / SystemExit propagate.
        import traceback

        traceback.print_exc()
Esempio n. 12
0
    def __getitem__(self, i):
        """Pick one random voiced frame of item ``i`` and return it together
        with the speaker number as classification target.

        :param i: dataset index
        :raises AssertionError: if the vowel mask is longer than the input.
        """
        data = self.datas[i]
        # `input_array` instead of `input`: don't shadow the builtin.
        input_array = SamplingData.load(data.input_path).array
        vowel = numpy.squeeze(SamplingData.load(data.vowel_path).array)
        speaker_num = data.speaker_num

        assert len(vowel) <= len(
            input_array), f'{data.input_path.stem} cannot be processed.'
        # Warn (but continue) when the two lengths disagree noticeably.
        if abs(len(vowel) - len(input_array)) >= 10:
            warn(f'{data.input_path.stem} is not matched.')

        # Keep only frames where the vowel mask is True.
        input_vowel = input_array[:len(vowel)][vowel]
        # Separate name: the original re-bound the parameter `i` here.
        frame_index = numpy.random.randint(len(input_vowel))

        return default_convert(dict(
            input=input_vowel[frame_index],
            target=speaker_num,
        ))
Esempio n. 13
0
def generate_and_save_data(
    feature_dir: Path,
    silence_dir: Path,
    wavelength: float,
    exponent: float,
    amplitude: float,
    length=300,
):
    """Generate one feature/silence pair and save both arrays under the
    same parameter-derived filename in their respective directories."""
    feature, silence = generate_data(
        wavelength=wavelength,
        exponent=exponent,
        amplitude=amplitude,
        length=length,
    )

    filename = f"{wavelength}_{exponent}_{amplitude}.npy"
    for array, directory in ((feature, feature_dir), (silence, silence_dir)):
        SamplingData(array=array, rate=100).save(directory / filename)
Esempio n. 14
0
def test_extract_input():
    """Smoke test FeatureDataset.extract_input with synthetic ramp features."""
    sampling_length = 10
    wave_length = 256 * sampling_length
    wave_rate = 24000
    second = wave_length / wave_rate

    f0_rate = 200
    phoneme_rate = 100
    spec_rate = wave_rate / 256
    silence_rate = 24000

    def ramp(rate):
        # Column vector 0..n-1 covering `second` seconds at `rate`.
        count = int(second * rate)
        return numpy.arange(count).reshape(-1, 1).astype(numpy.float32)

    f0_data = SamplingData(array=ramp(f0_rate), rate=f0_rate)
    phoneme_data = SamplingData(array=ramp(phoneme_rate), rate=phoneme_rate)
    spec_data = SamplingData(array=ramp(spec_rate), rate=spec_rate)

    silence = numpy.zeros(int(second * silence_rate)).astype(bool)
    silence_data = SamplingData(array=silence, rate=silence_rate)

    FeatureDataset.extract_input(
        sampling_length=sampling_length,
        f0_data=f0_data,
        phoneme_data=phoneme_data,
        spec_data=spec_data,
        silence_data=silence_data,
        phoneme_list_data=None,
        volume_data=None,
        f0_process_mode=F0ProcessMode.normal,
        time_mask_max_second=0,
        time_mask_num=0,
    )
Esempio n. 15
0
def process_local_data(local_paths: Sequence[Path], time_length: float):
    """Load each local feature file and crop/edge-pad it to a common size
    of (time_length + 1) seconds at the first file's rate."""
    local_datas = [SamplingData.load(p) for p in local_paths]
    size = int((time_length + 1) * local_datas[0].rate)

    local_arrays = []
    for local_data in local_datas:
        array = local_data.array
        if len(array) >= size:
            array = array[:size]
        else:
            # Repeat the last row to reach the target length.
            array = np.pad(array, ((0, size - len(array)), (0, 0)),
                           mode="edge")
        local_arrays.append(array)
    return local_arrays
def process(args: Tuple[int, Path], sampling_lengths: Sequence[int]):
    """For each window length, compute the maximum number of inverted-mask
    samples inside any window of that length over the loaded array."""
    i_data, path = args

    data = SamplingData.load(path)
    inverted = ~numpy.squeeze(data.array)

    vector = numpy.empty(len(sampling_lengths), dtype=numpy.int32)
    for i_length, sampling_length in enumerate(sampling_lengths):
        # Sliding-window count via convolution with a box kernel.
        kernel = numpy.ones(sampling_length, dtype=numpy.int32)
        vector[i_length] = numpy.convolve(kernel, inverted,
                                          mode='valid').max()

    return i_data, vector
Esempio n. 17
0
 def generate(self):
     """Load the phoneme list, the four accent flag lists and f0/volume."""

     def read_flags(path):
         # Each file holds whitespace-separated 0/1 flags.
         return numpy.array([bool(int(s)) for s in path.read_text().split()])

     if self.volume_path is not None:
         volume = SamplingData.load(self.volume_path)
     else:
         volume = None

     return Input(
         phoneme_list=self.phoneme_class.load_julius_list(
             self.phoneme_list_path),
         start_accent_list=read_flags(self.start_accent_list_path),
         end_accent_list=read_flags(self.end_accent_list_path),
         start_accent_phrase_list=read_flags(
             self.start_accent_phrase_list_path),
         end_accent_phrase_list=read_flags(self.end_accent_phrase_list_path),
         f0=SamplingData.load(self.f0_path),
         volume=volume,
     )
Esempio n. 18
0
def process(
    generator: Generator,
    local_paths: Sequence[Path],
    local_sampling_rate: Optional[int],
    time_length: float,
    speaker_nums: Optional[Sequence[int]],
    sampling_policy: SamplingPolicy,
    output_dir: Path,
    postfix="",
):
    """Generate one wave per local feature file and save each as
    ``<stem><postfix>.wav`` in ``output_dir``."""
    local_datas = [SamplingData.load(p) for p in local_paths]

    # Optionally resample every local feature to a shared rate.
    if local_sampling_rate is None:
        rate = local_datas[0].rate
        local_arrays = [data.array for data in local_datas]
    else:
        rate = local_sampling_rate
        local_arrays = [data.resample(rate) for data in local_datas]

    # Crop or edge-pad every array to (time_length + 5) seconds.
    size = int((time_length + 5) * local_datas[0].rate)
    adjusted = []
    for array in local_arrays:
        if len(array) >= size:
            adjusted.append(array[:size])
        else:
            adjusted.append(
                numpy.pad(array, ((0, size - len(array)), (0, 0)),
                          mode="edge"))
    local_arrays = adjusted

    waves = generator.generate(
        time_length=time_length,
        sampling_policy=sampling_policy,
        num_generate=len(local_arrays),
        local_array=numpy.stack(local_arrays),
        speaker_nums=speaker_nums,
    )
    for wave, local_path in zip(waves, local_paths):
        wave.save(output_dir / (local_path.stem + postfix + ".wav"))
Esempio n. 19
0
    def generate(self):
        """Load wave, silence and local features; if the local feature file
        cannot be loaded, compute a log-melspectrogram fallback and cache it
        to a temporary .npy file for subsequent calls."""
        wave = Wave.load(self.path_wave)

        try:
            local = SamplingData.load(self.path_local)
        except Exception:
            # Was a bare `except:`; narrowed so KeyboardInterrupt and
            # SystemExit are not swallowed by the fallback path.
            local_rate = 80
            local_array = to_log_melspectrogram(wave=wave, rate=local_rate)
            local = SamplingData(array=local_array, rate=local_rate)

            # delete=False: the cache file must outlive this call; later
            # calls reload it via the updated self.path_local.
            with NamedTemporaryFile(suffix=".npy", delete=False) as f:
                self.path_local = Path(f.name)
                local.save(self.path_local)

        return Input(
            wave=wave,
            silence=SamplingData.load(self.path_silence),
            local=local,
        )
Esempio n. 20
0
def main():
    """Run the predictor on the first (seed-shuffled) wave/local pair and
    save a plot comparing true and predicted coarse values to output.eps.

    NOTE(review): reads a module-level ``arguments`` namespace and helpers
    (``create_config``, ``_get_predictor_model_path``, ``create_predictor``,
    ``encode_16bit``, ``decode_single``) defined elsewhere in the file.
    """
    model_dir: Path = arguments.model_dir
    model_iteration: int = arguments.model_iteration
    model_config: Path = arguments.model_config
    time_length: float = arguments.time_length
    gpu: int = arguments.gpu

    config = create_config(model_config)
    model_path = _get_predictor_model_path(model_dir, model_iteration)

    sr = config.dataset.sampling_rate

    model = create_predictor(config.model)
    chainer.serializers.load_npz(str(model_path), model)
    if gpu is not None:
        model.to_gpu(gpu)
        cuda.get_device_from_id(gpu).use()

    # Inference mode: disable training behavior and gradient tracking.
    chainer.global_config.train = False
    chainer.global_config.enable_backprop = False

    wave_paths = sorted([Path(p) for p in glob.glob(str(config.dataset.input_wave_glob))])
    local_paths = sorted([Path(p) for p in glob.glob(str(config.dataset.input_local_glob))])
    assert len(wave_paths) == len(local_paths)

    # Shuffling both lists with the same seed keeps wave/local pairs aligned.
    np.random.RandomState(config.dataset.seed).shuffle(wave_paths)
    np.random.RandomState(config.dataset.seed).shuffle(local_paths)
    wave_path = wave_paths[0]
    local_path = local_paths[0]
    w_data = Wave.load(wave_path, sampling_rate=sr)
    l_data = SamplingData.load(local_path)

    # Round the crop length down to a whole number of local frames.
    length = int(sr * time_length)
    l_scale = int(sr // l_data.rate)
    l_sl = length // l_scale
    length = l_sl * l_scale

    w = w_data.wave[:length]
    l = l_data.array[:l_sl]
    coarse, fine = encode_16bit(w)

    # f_array is shifted by one sample relative to c_array ([:-1]).
    c, f, hc, hf = model(
        c_array=decode_single(model.xp.asarray(coarse)).astype(np.float32)[np.newaxis],
        f_array=decode_single(model.xp.asarray(fine)).astype(np.float32)[:-1][np.newaxis],
        l_array=model.xp.asarray(l)[np.newaxis],
    )

    c = chainer.functions.softmax(c)

    c = chainer.cuda.to_cpu(c[0].data)
    f = chainer.cuda.to_cpu(f[0].data)

    fig = plt.figure(figsize=[32 * time_length, 10])

    # Heatmap of the coarse softmax distribution over time.
    plt.imshow(c, aspect='auto', interpolation='nearest')
    plt.colorbar()

    plt.plot((w + 1) * 127.5, 'g', linewidth=0.1, label='true')
    plt.plot(np.argmax(c, axis=0) + np.argmax(f, axis=0) / 256, 'r', linewidth=0.1, label='predicted')
    plt.legend()

    fig.savefig('output.eps')
Esempio n. 21
0
    def test_extract_input_with_local_padding(self):
        """With local_padding_size > 0, extract_input should pad the local
        crop with padding_value (NaN here) only at the edges, while the
        unpadded center still matches the down-sampled wave crop."""
        for sampling_rate, local_sampling_rate, sampling_length, time_length, local_padding_size in [
            [800, 200, 16, 1, 100],
            [24000, 24000 / 256, 1024, 4, 1024],
        ]:
            with self.subTest(
                    sampling_rate=sampling_rate,
                    local_sampling_rate=local_sampling_rate,
                    sampling_length=sampling_length,
                    time_length=time_length,
                    local_padding_size=local_padding_size,
            ):
                # Waveform samples per local frame (rate may be a float).
                scale = sampling_rate // local_sampling_rate

                # Ramp waveform: value equals sample index.
                wave_data = Wave(
                    wave=np.linspace(
                        0,
                        int(sampling_rate * time_length),
                        int(sampling_rate * time_length),
                        endpoint=False,
                    ),
                    sampling_rate=sampling_rate,
                )
                silence_data = SamplingData(
                    array=np.zeros((sampling_rate * time_length, ),
                                   dtype=bool),
                    rate=sampling_rate,
                )
                local_data = SamplingData(
                    array=np.linspace(
                        0,
                        int(sampling_rate * time_length),
                        int(local_sampling_rate * time_length),
                        endpoint=False,
                    ),
                    rate=local_sampling_rate,
                )
                # Many iterations so random offsets hit both edges.
                for _ in range(10000):
                    wave, silence, local = BaseWaveDataset.extract_input(
                        sampling_length,
                        wave_data=wave_data,
                        silence_data=silence_data,
                        local_data=local_data,
                        local_padding_size=local_padding_size,
                        padding_value=np.nan,
                    )

                    self.assertEqual(len(wave), sampling_length)
                    self.assertEqual(len(silence), sampling_length)
                    self.assertEqual(
                        len(local),
                        (sampling_length + local_padding_size * 2) // scale)

                    # NaNs mark padding; at most one side's worth appears.
                    num_pad = np.isnan(local).sum()
                    self.assertLessEqual(num_pad, local_padding_size)

                    # Padding never covers both edges simultaneously.
                    self.assertTrue(not np.isnan(local[0])
                                    or not np.isnan(local[-1]))

                    # Center of the local crop must equal the down-sampled wave.
                    wave_as_local = wave.reshape(int(sampling_length // scale),
                                                 -1).min(axis=1)
                    pad = int(local_padding_size // scale)
                    local_wo_pad = local[pad:-pad]
                    self.assertTrue(np.all(wave_as_local == local_wo_pad))
Esempio n. 22
0
    def extract_input(
        sampling_length: int,
        wave_data: Wave,
        silence_data: SamplingData,
        local_data: SamplingData,
        local_padding_length: int,
        min_not_silence_length: int,
        f0_index: int,
        volume_index: Optional[int],
        harmonic_num: int,
        only_noise_source: bool,
        padding_value=0,
    ):
        """Crop aligned (wave, silence, local) windows at a random offset and
        build source-excitation signals from the local features.

        The offset is re-drawn (up to 10000 times) until the window contains
        at least ``min_not_silence_length`` non-silent samples.

        :param local_padding_length: extra local context on each side, in
            waveform samples; must be a multiple of the local scale.
        :param f0_index: column of the local array holding log-f0.
        :param volume_index: optional column of the local array holding volume.
        :param only_noise_source: if True, zero out log-f0 before generating
            the source.
        :return: dict with
            wave: (sampling_length, )
            silence: (sampling_length, )
            local: (sampling_length // scale + pad, )
            source, source2, signal: outputs of ``generate_source``
        :raises Exception: if no sufficiently non-silent window is found.
        """
        sr = wave_data.sampling_rate
        sl = sampling_length

        # Waveform samples per local frame; rates must divide evenly.
        assert sr % local_data.rate == 0
        l_scale = int(sr // local_data.rate)

        # Wave and local lengths must agree to within a few frames.
        length = len(local_data.array) * l_scale
        assert (abs(length - len(wave_data.wave)) <
                l_scale * 4), f"{abs(length - len(wave_data.wave))} {l_scale}"

        assert local_padding_length % l_scale == 0
        l_pad = local_padding_length // l_scale

        l_length = length // l_scale
        l_sl = sl // l_scale

        # Rejection-sample a crop offset until enough non-silence is found.
        for _ in range(10000):
            if l_length > l_sl:
                l_offset = numpy.random.randint(l_length - l_sl)
            else:
                l_offset = 0
            offset = l_offset * l_scale

            silence = numpy.squeeze(
                silence_data.resample(sr, index=offset, length=sl))
            if (~silence).sum() >= min_not_silence_length:
                break
        else:
            raise Exception("cannot pick not silence data")

        wave = wave_data.wave[offset:offset + sl]

        # local: crop [l_offset - l_pad, l_offset + l_sl + l_pad); fill with
        # padding_value wherever the request runs past either end of the data.
        l_start, l_end = l_offset - l_pad, l_offset + l_sl + l_pad
        if l_start < 0 or l_end > l_length:
            shape = list(local_data.array.shape)
            shape[0] = l_sl + l_pad * 2
            local = (numpy.ones(shape=shape, dtype=local_data.array.dtype) *
                     padding_value)
            if l_start < 0:
                p_start = -l_start
                l_start = 0
            else:
                p_start = 0
            if l_end > l_length:
                p_end = l_sl + l_pad * 2 - (l_end - l_length)
                l_end = l_length
            else:
                p_end = l_sl + l_pad * 2
            local[p_start:p_end] = local_data.array[l_start:l_end]
        else:
            local = local_data.array[l_start:l_end]

        # source module: read log-f0 (and optional volume) from the unpadded
        # center of the local crop.
        if l_pad > 0:
            log_f0 = local[l_pad:-l_pad, f0_index]
        else:
            log_f0 = local[:, f0_index]

        if only_noise_source:
            log_f0 = numpy.zeros_like(log_f0)

        volume = None
        if volume_index is not None:
            if l_pad > 0:
                volume = local[l_pad:-l_pad, volume_index]
            else:
                volume = local[:, volume_index]

        source, signal = generate_source(
            log_f0=log_f0,
            volume=volume,
            local_rate=int(local_data.rate),
            sampling_rate=sr,
            harmonic_num=harmonic_num,
        )
        # Second call with identical arguments — presumably generate_source
        # is stochastic and this yields an independent source; TODO confirm.
        source2, _ = generate_source(
            log_f0=log_f0,
            volume=volume,
            local_rate=int(local_data.rate),
            sampling_rate=sr,
            harmonic_num=harmonic_num,
        )

        return dict(
            wave=wave,
            silence=silence,
            local=local,
            source=source,
            source2=source2,
            signal=signal,
        )
Esempio n. 23
0
    def test_convert_to_dict(self):
        """convert_to_dict should emit entries sized by sampling_length; the
        fine-resolution entries are present with to_double=True and None
        with to_double=False."""
        sampling_rate = 800
        local_sampling_rate = 200
        # Waveform samples per local-feature frame.
        scale = sampling_rate // local_sampling_rate
        time_length = 10
        sampling_length = 16

        # Monotonic ramp so cropped positions stay distinguishable.
        wave_data = Wave(
            wave=np.linspace(0,
                             sampling_rate * time_length,
                             sampling_rate * time_length,
                             endpoint=False),
            sampling_rate=sampling_rate,
        )
        silence_data = SamplingData(
            array=np.zeros((sampling_rate * time_length, ), dtype=bool),
            rate=sampling_rate,
        )
        local_data = SamplingData(
            array=np.linspace(0,
                              sampling_rate * time_length,
                              local_sampling_rate * time_length,
                              endpoint=False),
            rate=local_sampling_rate,
        )

        wave, silence, local = BaseWaveDataset.extract_input(
            sampling_length,
            wave_data=wave_data,
            silence_data=silence_data,
            local_data=local_data,
            local_padding_size=0,
        )

        # to_double=True: fine entries are populated.
        dataset = BaseWaveDataset(
            sampling_length=sampling_length,
            to_double=True,
            bit=16,
            mulaw=False,
            local_padding_size=0,
        )
        d = dataset.convert_to_dict(wave, silence, local)
        self.assertEqual(len(d['coarse']), sampling_length)
        self.assertEqual(len(d['fine']), sampling_length - 1)
        self.assertEqual(len(d['encoded_coarse']), sampling_length)
        self.assertEqual(len(d['encoded_fine']), sampling_length)
        self.assertEqual(len(d['silence']), sampling_length - 1)
        self.assertEqual(len(d['local']), sampling_length // scale)

        # to_double=False: fine entries are None.
        dataset = BaseWaveDataset(
            sampling_length=sampling_length,
            to_double=False,
            bit=10,
            mulaw=False,
            local_padding_size=0,
        )
        d = dataset.convert_to_dict(wave, silence, local)
        self.assertEqual(len(d['coarse']), sampling_length)
        self.assertIsNone(d['fine'])
        self.assertEqual(len(d['encoded_coarse']), sampling_length)
        self.assertIsNone(d['encoded_fine'])
        self.assertEqual(len(d['silence']), sampling_length - 1)
        self.assertEqual(len(d['local']), sampling_length // scale)
Esempio n. 24
0
    def extract_input(
        sampling_length: int,
        wave_data: Wave,
        silence_data: SamplingData,
        local_data: SamplingData,
        local_sampling_rate: Optional[int],
        local_padding_size: int,
        local_mask_max_second: float,
        local_mask_num: int,
        padding_value=0,
    ):
        """Crop aligned (wave, silence, local) windows at a random offset.

        The local feature is optionally resampled to ``local_sampling_rate``,
        padded on both sides by ``local_padding_size`` waveform samples
        (filled with ``padding_value`` where the data runs out), and then
        optionally time-masked: up to ``local_mask_num`` random spans of at
        most ``local_mask_max_second`` seconds are zeroed.

        :return:
            wave: (sampling_length, )
            silence: (sampling_length, )
            local: (sampling_length // scale + pad, )
        :raises Exception: if every sampled window is fully silent.
        """
        sr = wave_data.sampling_rate
        sl = sampling_length

        if local_sampling_rate is None:
            l_rate = local_data.rate
            l_array = local_data.array
        else:
            l_rate = local_sampling_rate
            l_array = local_data.resample(l_rate)

        # Waveform samples per local frame.
        l_scale = int(round(sr / l_rate))

        # Wave and local lengths must agree to within a few frames.
        length = min(len(l_array) * l_scale, len(wave_data.wave))
        assert abs(length - len(l_array) * l_scale) < l_scale * 4
        assert abs(length - len(wave_data.wave)) < l_scale * 4

        assert (
            local_padding_size % l_scale == 0
        ), f"local_padding_size: {local_padding_size}, l_scale: {l_scale}"
        l_pad = local_padding_size // l_scale

        l_length = length // l_scale
        l_sl = sl // l_scale

        # Rejection-sample an offset until the window is not all silence.
        for _ in range(10000):
            if l_length > l_sl + 1:
                l_offset = numpy.random.randint(l_length - l_sl + 1)
            else:
                l_offset = 0
            offset = l_offset * l_scale

            silence = numpy.squeeze(silence_data.resample(sr, index=offset, length=sl))
            if not silence.all():
                break
        else:
            raise Exception("cannot pick not silence data")

        wave = wave_data.wave[offset : offset + sl]

        # local: crop [l_offset - l_pad, l_offset + l_sl + l_pad); fill with
        # padding_value wherever the request runs past either end of the data.
        l_start, l_end = l_offset - l_pad, l_offset + l_sl + l_pad
        if l_start < 0 or l_end > l_length:
            shape = list(l_array.shape)
            shape[0] = l_sl + l_pad * 2
            local = numpy.ones(shape=shape, dtype=l_array.dtype) * padding_value
            if l_start < 0:
                p_start = -l_start
                l_start = 0
            else:
                p_start = 0
            if l_end > l_length:
                p_end = l_sl + l_pad * 2 - (l_end - l_length)
                l_end = l_length
            else:
                p_end = l_sl + l_pad * 2
            local[p_start:p_end] = l_array[l_start:l_end]
        else:
            local = l_array[l_start:l_end]

        # Random time-masking augmentation on the local feature.
        if local_mask_max_second > 0 and local_mask_num > 0:
            for _ in range(local_mask_num):
                mask_length = numpy.random.randint(int(l_rate * local_mask_max_second))
                mask_offset = numpy.random.randint(len(local) - mask_length + 1)
                local[mask_offset : mask_offset + mask_length] = 0

        return wave, silence, local
Esempio n. 25
0
 def generate(self):
     """Load the wave plus its silence and local feature files."""
     wave = Wave.load(self.path_wave)
     silence = SamplingData.load(self.path_silence)
     local = SamplingData.load(self.path_local)
     return Input(wave=wave, silence=silence, local=local)
Esempio n. 26
0
 def generate(self):
     """Load spectrogram and silence through the temporary-file cache."""
     spectrogram_path = str(TempCache(self.spectrogram_path))
     silence_path = str(TempCache(self.silence_path))
     return InputData(
         spectrogram=SamplingData.load(spectrogram_path),
         silence=SamplingData.load(silence_path),
     )
Esempio n. 27
0
def create_data(
    f0_dir: Path,
    phoneme_list_dir: Path,
    loudness_dir: Path,
    accent_start_dir: Path,
    accent_end_dir: Path,
    accent_phrase_start_dir: Path,
    accent_phrase_end_dir: Path,
    speaker_valid_filter: Optional[str],
    utterance_valid_filter: Optional[str],
    data_num: Optional[int],
):
    """Load aligned feature files from the given directories into InputData
    objects and split them into train/valid sets by name filters.

    All directories must contain the same number of files; files are paired
    by sorted order. Raises AssertionError when counts mismatch or no f0
    files are found. Returns ``(train_datas, valid_datas)``.
    """

    def _glob_paths(directory: Path, pattern: str) -> List[Path]:
        # Recursively collect files, sorted so the six feature directories
        # pair up deterministically; optionally keep only the first data_num.
        paths = sorted(directory.rglob(pattern))
        if data_num is not None:
            paths = paths[:data_num]
        return paths

    def _load_flags(path: Path) -> List[bool]:
        # Accent label files are whitespace-separated "0"/"1" flags.
        return [bool(int(s)) for s in path.read_text().split()]

    f0_paths = _glob_paths(f0_dir, "*.npy")
    assert len(f0_paths) > 0

    phoneme_list_paths = _glob_paths(phoneme_list_dir, "*.lab")
    assert len(f0_paths) == len(phoneme_list_paths)

    loudness_paths = _glob_paths(loudness_dir, "*.npy")
    assert len(f0_paths) == len(loudness_paths)

    accent_start_paths = _glob_paths(accent_start_dir, "*.txt")
    assert len(f0_paths) == len(accent_start_paths)

    accent_end_paths = _glob_paths(accent_end_dir, "*.txt")
    assert len(f0_paths) == len(accent_end_paths)

    accent_phrase_start_paths = _glob_paths(accent_phrase_start_dir, "*.txt")
    assert len(f0_paths) == len(accent_phrase_start_paths)

    accent_phrase_end_paths = _glob_paths(accent_phrase_end_dir, "*.txt")
    assert len(f0_paths) == len(accent_phrase_end_paths)

    datas = [
        InputData(
            name=f0_path.stem,
            f0=SamplingData.load(f0_path),
            phoneme_list=JvsPhoneme.load_julius_list(phoneme_list_path),
            loudness=SamplingData.load(loudness_path),
            accent_start=_load_flags(accent_start_path),
            accent_end=_load_flags(accent_end_path),
            accent_phrase_start=_load_flags(accent_phrase_start_path),
            accent_phrase_end=_load_flags(accent_phrase_end_path),
        )
        for (
            f0_path,
            phoneme_list_path,
            loudness_path,
            accent_start_path,
            accent_end_path,
            accent_phrase_start_path,
            accent_phrase_end_path,
        ) in zip(
            f0_paths,
            phoneme_list_paths,
            loudness_paths,
            accent_start_paths,
            accent_end_paths,
            accent_phrase_start_paths,
            accent_phrase_end_paths,
        )
    ]

    # An item goes to validation when either filter matches its name.
    train_datas: List[InputData] = []
    valid_datas: List[InputData] = []
    for data in datas:
        is_valid = (
            speaker_valid_filter is not None and speaker_valid_filter in data.name
        ) or (
            utterance_valid_filter is not None and utterance_valid_filter in data.name
        )
        (valid_datas if is_valid else train_datas).append(data)

    return train_datas, valid_datas
Esempio n. 28
0
    def extract_input(
        sampling_length: int,
        wave_data: Wave,
        silence_data: SamplingData,
        f0_data: SamplingData,
        phoneme_data: SamplingData,
        min_not_silence_length: int,
        with_mic_augment: bool,
        time_mask_max_second: float,
        time_mask_num: int,
    ):
        """Randomly crop an aligned (wave, f0, phoneme) training example.

        Rejection-samples a random offset until the cropped silence mask
        contains at least ``min_not_silence_length`` non-silent samples,
        pads clips shorter than ``sampling_length``, then optionally applies
        mic augmentation and random time masking to the wave.
        Returns a dict with keys ``wave``, ``f0``, ``phoneme``, ``padded``.
        """
        rate = wave_data.sampling_rate
        sl = sampling_length

        # Common frame-level ("local") rate; the wave rate must be an
        # integer multiple of it so samples map cleanly onto frames.
        l_rate = max(f0_data.rate, phoneme_data.rate)

        assert rate % l_rate == 0
        l_scale = int(rate // l_rate)

        assert sl % l_scale == 0

        # Resample f0 and phoneme to the shared rate and stack column-wise:
        # column 0 is f0, remaining columns are the phoneme features.
        local = SamplingData.collect([f0_data, phoneme_data],
                                     rate=l_rate,
                                     mode="min",
                                     error_time_length=0.015)
        f0_array = local[:, 0]
        phoneme_array = local[:, 1:]

        # Wave and local features must agree in length up to a few frames.
        assert numpy.abs(len(local) * l_scale -
                         len(wave_data.wave)) < l_scale * 4

        # Usable length: the shorter of the two, truncated to whole frames.
        length = min(
            len(local) * l_scale,
            len(wave_data.wave) // l_scale * l_scale)

        # Clip shorter than the requested window: crop what exists and
        # remember how much padding must be added afterwards.
        if sl > length:
            pad = sl - length
            sl = length
        else:
            pad = 0

        l_length = length // l_scale
        l_sl = sl // l_scale
        l_pad = pad // l_scale

        # Rejection-sample an offset with enough non-silence (bounded tries).
        # NOTE(review): randint(l_length - l_sl) never yields the final valid
        # offset; the sibling implementation uses l_length - l_sl + 1 —
        # confirm which behavior is intended.
        for _ in range(10000):
            if l_length > l_sl:
                l_offset = numpy.random.randint(l_length - l_sl)
            else:
                l_offset = 0
            offset = l_offset * l_scale

            silence = numpy.squeeze(
                silence_data.resample(rate, index=offset, length=sl))
            if (~silence).sum() >= min_not_silence_length:
                break
        else:
            raise Exception("cannot pick not silence data")

        # Slice all streams at the chosen offset (sample vs. frame indices).
        wave = wave_data.wave[offset:offset + sl]
        f0 = numpy.squeeze(f0_array[l_offset:l_offset + l_sl])
        phoneme = numpy.argmax(phoneme_array[l_offset:l_offset + l_sl], axis=1)
        padded = numpy.zeros_like(f0, dtype=bool)  # True marks padded frames

        # Distribute required padding randomly between head and tail so the
        # model does not always see padding on the same side.
        if l_pad > 0:
            l_pre = numpy.random.randint(l_pad + 1)
            l_post = l_pad - l_pre
            f0 = numpy.pad(f0, [l_pre, l_post])
            phoneme = numpy.pad(phoneme, [l_pre, l_post])
            padded = numpy.pad(padded, [l_pre, l_post], constant_values=True)

            pre, post = int(l_pre * l_scale), int(l_post * l_scale)
            wave = numpy.pad(wave, [pre, post])

        if with_mic_augment:
            wave = mic_augment(wave, sampling_rate=rate)

        # Zero out random spans of the wave (SpecAugment-style time masking).
        if time_mask_max_second > 0 and time_mask_num > 0:
            for _ in range(time_mask_num):
                mask_length = numpy.random.randint(
                    int(wave_data.sampling_rate * time_mask_max_second))
                mask_offset = numpy.random.randint(len(wave) - mask_length + 1)
                wave[mask_offset:mask_offset + mask_length] = 0

        return dict(
            wave=wave,
            f0=f0,
            phoneme=phoneme,
            padded=padded,
        )
Esempio n. 29
0
def generate(
    model_dir: Path,
    model_iteration: Optional[int],
    model_config: Optional[Path],
    output_dir: Path,
    to_voiced_scaler: bool,
    to_f0_scaler: bool,
    to_phoneme_onehot: bool,
    batch_size: Optional[int],
    num_test: int,
    target_glob: Optional[str],
    use_gpu: bool,
):
    """Run a trained predictor over test waves and save features to disk.

    Loads the model (plus optional voiced/f0/phoneme sub-networks) from
    ``model_dir``, batches the first ``num_test`` test-set waves together
    with any files matching ``target_glob``, and writes one
    ``<stem>.npy`` SamplingData file per wave into ``output_dir``.
    """
    # Fall back to the config stored alongside the model checkpoints.
    if model_config is None:
        model_config = model_dir / "config.yaml"

    output_dir.mkdir(exist_ok=True)
    save_arguments(output_dir / "arguments.yaml", generate, locals())

    config = Config.from_dict(yaml.safe_load(model_config.open()))

    # Optional sub-networks are only loaded when their output is requested.
    generator = Generator(
        config=config,
        predictor=_get_model_path(
            model_dir=model_dir,
            iteration=model_iteration,
            prefix="predictor_",
        ),
        voiced_network=(
            None
            if not to_voiced_scaler
            else _get_model_path(
                model_dir=model_dir,
                iteration=model_iteration,
                prefix="voiced_network_",
            )
        ),
        f0_network=(
            None
            if not to_f0_scaler
            else _get_model_path(
                model_dir=model_dir,
                iteration=model_iteration,
                prefix="f0_network_",
            )
        ),
        phoneme_network=(
            None
            if not to_phoneme_onehot
            else _get_model_path(
                model_dir=model_dir,
                iteration=model_iteration,
                prefix="phoneme_network_",
            )
        ),
        use_gpu=use_gpu,
    )

    dataset = create_dataset(config.dataset)["test"]
    # Total downsampling factor of the network; features are emitted at
    # one frame per `scale` samples.
    scale = numpy.prod(config.network.scale_list)

    if batch_size is None:
        batch_size = config.train.batch_size

    # Resolve wave paths from whichever dataset wrapper is in use.
    if isinstance(dataset, SpeakerWavesDataset):
        wave_paths = [data.path_wave for data in dataset.wave_dataset.inputs[:num_test]]
    elif isinstance(dataset, WavesDataset):
        wave_paths = [data.path_wave for data in dataset.inputs[:num_test]]
    else:
        raise Exception()

    if target_glob is not None:
        wave_paths += list(map(Path, glob(target_glob)))

    for wps in tqdm(chunked(wave_paths, batch_size), desc="generate"):
        waves = [Wave.load(p) for p in wps]
        arrays = [w.wave for w in waves]

        # Zero-pad every wave in the batch to the batch's maximum
        # scale-aligned length so they can be stacked into one tensor.
        pad_lengths = [int(numpy.ceil(len(w) / scale) * scale) for w in arrays]
        arrays = [numpy.r_[w, numpy.zeros(max(pad_lengths) - len(w))] for w in arrays]

        tensors = [torch.from_numpy(array.astype(numpy.float32)) for array in arrays]
        output = generator.generate(
            wave=concat_examples(tensors),
            to_voiced_scaler=to_voiced_scaler,
            to_f0_scaler=to_f0_scaler,
            to_phoneme_onehot=to_phoneme_onehot,
        )

        for feature, p, w, l in zip(output, wps, waves, pad_lengths):
            # Trim each feature back to its own (unpadded) frame count.
            feature = feature.T[: l // scale]
            data = SamplingData(array=feature, rate=w.sampling_rate // scale)
            data.save(output_dir / (p.stem + ".npy"))
Esempio n. 30
0
    def extract_input(
        sampling_length: int,
        wave_data: Wave,
        silence_data: SamplingData,
        local_data: SamplingData,
        local_sampling_rate: Optional[int],
        local_padding_length: int,
        min_not_silence_length: int,
        mulaw: bool,
        padding_value=0,
    ):
        """Randomly crop an aligned (wave, local) training example.

        Rejection-samples an offset until the cropped silence mask keeps at
        least ``min_not_silence_length`` non-silent samples; the local
        features are cropped with ``local_padding_length`` extra context on
        each side, filled with ``padding_value`` past the clip edges.

        :return:
            wave: (sampling_length, )
            local: (sampling_length // scale + pad, )
        """
        sr = wave_data.sampling_rate
        sl = sampling_length

        # Optionally resample local features to an explicit rate.
        if local_sampling_rate is None:
            l_rate = local_data.rate
            l_array = local_data.array
        else:
            l_rate = local_sampling_rate
            l_array = local_data.resample(l_rate)

        # Wave rate must be an integer multiple of the local rate.
        assert sr % l_rate == 0
        l_scale = int(sr // l_rate)

        # Wave and local features must agree in length up to a few frames.
        length = len(l_array) * l_scale
        assert (abs(length - len(wave_data.wave)) <
                l_scale * 4), f"{abs(length - len(wave_data.wave))} {l_scale}"

        assert local_padding_length % l_scale == 0
        l_pad = local_padding_length // l_scale

        l_length = length // l_scale
        l_sl = sl // l_scale

        # Rejection-sample an offset with enough non-silence (bounded tries).
        # NOTE(review): randint(l_length - l_sl) never yields the final valid
        # offset; a sibling implementation uses l_length - l_sl + 1 — confirm
        # which behavior is intended.
        for _ in range(10000):
            if l_length > l_sl:
                l_offset = numpy.random.randint(l_length - l_sl)
            else:
                l_offset = 0
            offset = l_offset * l_scale

            silence = numpy.squeeze(
                silence_data.resample(sr, index=offset, length=sl))
            if (~silence).sum() >= min_not_silence_length:
                break
        else:
            raise Exception("cannot pick not silence data")

        wave = wave_data.wave[offset:offset + sl]
        if mulaw:
            wave = encode_mulaw(wave)

        # local: crop [l_offset - l_pad, l_offset + l_sl + l_pad); where the
        # window extends past the data, fill with padding_value.
        l_start, l_end = l_offset - l_pad, l_offset + l_sl + l_pad
        if l_start < 0 or l_end > l_length:
            shape = list(l_array.shape)
            shape[0] = l_sl + l_pad * 2
            local = numpy.ones(shape=shape,
                               dtype=l_array.dtype) * padding_value
            # p_start/p_end locate the real data inside the padded buffer.
            if l_start < 0:
                p_start = -l_start
                l_start = 0
            else:
                p_start = 0
            if l_end > l_length:
                p_end = l_sl + l_pad * 2 - (l_end - l_length)
                l_end = l_length
            else:
                p_end = l_sl + l_pad * 2
            local[p_start:p_end] = l_array[l_start:l_end]
        else:
            local = l_array[l_start:l_end]

        return dict(
            wave=wave,
            local=local.T,  # (C, T)
        )