Ejemplo n.º 1
0
 def separate_effective(self,
                        wave: Wave,
                        feature: AcousticFeature,
                        threshold=None):
     """
     Keep only the effective frames of ``feature`` and return the mask used.

     With ``threshold=None`` the mask comes from ``wave.get_effective_frame``
     driven by ``self._param.threshold_db``; when that is None as well, every
     frame is flagged effective and ``feature`` is returned as given.
     Otherwise the mask is computed here from the frame-wise power of the
     waveform: a frame is effective when its power in dB exceeds
     ``-threshold``. In that case the mask is aligned to ``len(feature.f0)``
     (padded with ``False`` or truncated) before it is applied.

     :param wave: waveform used to compute the frame mask
     :param feature: acoustic feature to filter
     :param threshold: optional level; frames with power (dB) above
         ``-threshold`` are kept
     :return: (effective feature, effective flags)
     """
     hop, length = wave.get_hop_and_length(
         frame_period=self._param.frame_period)

     if threshold is None:
         if self._param.threshold_db is None:
             # no thresholding configured: keep everything
             return feature, numpy.ones(length, dtype=bool)
         effective = wave.get_effective_frame(
             threshold_db=self._param.threshold_db,
             fft_length=self._param.fft_length,
             frame_period=self._param.frame_period,
         )
         return feature.indexing(effective), effective

     # librosa renamed ``feature.rmse`` to ``feature.rms``; use whichever
     # the installed version provides.
     rms = getattr(librosa.feature, 'rms', None)
     if rms is None:
         rms = librosa.feature.rmse
     mse = rms(y=wave.wave,
               frame_length=self._param.fft_length,
               hop_length=hop)**2
     effective = (librosa.core.power_to_db(mse.squeeze()) > -threshold)

     # The framing here and the one behind ``feature`` may disagree by a few
     # frames. Align the mask to the feature length: missing frames count as
     # non-effective, surplus frames are dropped.
     n_frames = len(feature.f0)
     shortage = n_frames - len(effective)
     if shortage > 0:
         effective = numpy.r_[effective, numpy.zeros(shortage, dtype=bool)]
     elif shortage < 0:
         effective = effective[:n_frames]

     return feature.indexing(effective), effective
Ejemplo n.º 2
0
 def combine_silent(self, effective: numpy.ndarray, feature: AcousticFeature):
     """
     Put ``feature`` back into a silent feature spanning ``len(effective)`` frames.

     A silent feature with keys ``mc``, ``ap``, ``f0`` and ``voiced`` is
     created, then ``indexing_set(effective, feature)`` writes ``feature``
     into it.

     :param effective: per-frame flags; its length sets the output length
     :param feature: feature to write into the silent one
     :return: the silent feature after ``indexing_set``
     """
     param = self._param
     feature_sizes = AcousticFeature.get_sizes(
         sampling_rate=param.sampling_rate,
         order=param.order,
     )
     combined = AcousticFeature.silent(
         len(effective),
         sizes=feature_sizes,
         keys=('mc', 'ap', 'f0', 'voiced'),
     )
     combined.indexing_set(effective, feature)
     return combined
Ejemplo n.º 3
0
    def get_example(self, i):
        """
        Build one aligned (input, target, mask) example for entry ``i``.

        Both features are loaded, re-indexed with the stored alignment
        indexes, encoded, then padded and cropped. The three arrays share
        one seed for padding and another for cropping (presumably so the
        same region is selected in each - confirm in ``random_pad`` /
        ``random_crop``). Noise is added to input and target only when
        ``chainer.config.train`` is true.

        :param i: index into ``self.inputs``
        :return: dict with keys ``input``, ``target`` and ``mask``
        """
        is_training = chainer.config.train
        config = self.config

        entry = self.inputs[i]
        path_in = entry.in_feature_path
        path_out = entry.out_feature_path
        path_align = entry.indexes_path

        align = AlignIndexes.load(path_align)

        # input side
        source_feature = AcousticFeature.load(path_in).indexing(align.indexes1)
        in_data = encode_feature(source_feature, targets=config.in_features)

        # target side; the mask is derived from the aligned target feature
        target_feature = AcousticFeature.load(path_out).indexing(align.indexes2)
        out_data = encode_feature(target_feature, targets=config.out_features)
        mask = encode_feature(make_mask(target_feature),
                              targets=config.out_features)

        crop_size = config.train_crop_size
        arrays = [in_data, out_data, mask]

        # padding: one seed for all three arrays
        pad_seed = numpy.random.randint(2**31)
        arrays = [
            random_pad(array, seed=pad_seed, min_size=crop_size)
            for array in arrays
        ]

        # crop: a fresh seed, again shared by all three arrays
        crop_seed = numpy.random.randint(2**31)
        arrays = [
            random_crop(array, seed=crop_seed, crop_size=crop_size)
            for array in arrays
        ]
        in_data, out_data, mask = arrays

        if is_training:
            in_data = add_noise(in_data,
                                p_global=config.input_global_noise,
                                p_local=config.input_local_noise)
            out_data = add_noise(out_data,
                                 p_global=config.target_global_noise,
                                 p_local=config.target_local_noise)

        return {
            'input': in_data,
            'target': out_data,
            'mask': mask,
        }
Ejemplo n.º 4
0
    def get_example(self, i):
        """
        Build one unpaired (x, y) example with masks.

        ``i`` is not used: an x path and a y path are each drawn at random
        from ``self.x_paths`` / ``self.y_paths``. Every array is padded and
        cropped with the same seed as its mask, while x and y use separate
        seeds. Noise is added to x and y only when ``chainer.config.train``
        is true.

        :param i: ignored
        :return: dict with keys ``x``, ``y``, ``mask_x`` and ``mask_y``
        """
        is_training = chainer.config.train
        config = self.config

        path_x = self.x_paths[numpy.random.randint(len(self.x_paths))]
        path_y = self.y_paths[numpy.random.randint(len(self.y_paths))]

        feature_x = AcousticFeature.load(path_x)
        x = encode_feature(feature_x, targets=config.in_features)

        feature_y = AcousticFeature.load(path_y)
        y = encode_feature(feature_y, targets=config.out_features)

        mask_x = encode_feature(make_mask(feature_x),
                                targets=config.in_features)
        mask_y = encode_feature(make_mask(feature_y),
                                targets=config.out_features)

        crop_size = config.train_crop_size

        def with_shared_seed(transform, data, mask, **options):
            # draw one seed and apply the transform to the array, then its mask
            seed = numpy.random.randint(2**31)
            return (
                transform(data, seed=seed, **options),
                transform(mask, seed=seed, **options),
            )

        # padding
        x, mask_x = with_shared_seed(random_pad, x, mask_x, min_size=crop_size)
        y, mask_y = with_shared_seed(random_pad, y, mask_y, min_size=crop_size)

        # crop
        x, mask_x = with_shared_seed(random_crop, x, mask_x,
                                     crop_size=crop_size)
        y, mask_y = with_shared_seed(random_crop, y, mask_y,
                                     crop_size=crop_size)

        if is_training:
            x = add_noise(x,
                          p_global=config.input_global_noise,
                          p_local=config.input_local_noise)
            y = add_noise(y,
                          p_global=config.target_global_noise,
                          p_local=config.target_local_noise)

        return {
            'x': x,
            'y': y,
            'mask_x': mask_x,
            'mask_y': mask_y,
        }
Ejemplo n.º 5
0
def generate_align_indexes(pair_path: Tuple[Path, Path]):
    """
    Compute alignment indexes for a pair of wave files and save them.

    For each file the wave is loaded, padded and low-cut filtered, then its
    mel-cepstrum is extracted and optionally reduced to the effective
    frames. On the first file only, every coefficient but the first is
    converted with ``mcepgmm`` before alignment. Nothing is done when the
    output already exists and ``arguments.enable_overwrite`` is not set.

    :param pair_path: (path of the first wave, path of the second wave)
    """
    source_path, target_path = pair_path
    if source_path.stem != target_path.stem:
        print('warning: the file names are different', source_path, target_path)

    out = Path(arguments.output, source_path.stem + '.npy')
    already_done = out.exists()
    if already_done and not arguments.enable_overwrite:
        return

    # original
    source_wave = Wave.load(path=source_path, sampling_rate=sconf1.wav_fs).pad(
        pre_second=arguments.pad_second1,
        post_second=arguments.pad_second1,
    )
    feat1.analyze(
        low_cut_filter(source_wave.wave, source_wave.sampling_rate, cutoff=70))
    source_mcep = feat1.mcep(dim=sconf1.mcep_dim, alpha=sconf1.mcep_alpha)

    if arguments.threshold_db1 is not None:
        keep = source_wave.get_effective_frame(
            threshold_db=arguments.threshold_db1,
            fft_length=sconf1.wav_fftl,
            frame_period=sconf1.wav_shiftms,
        )
        source_mcep = source_mcep[keep]

    # convert everything except the first coefficient, then put it back
    converted = mcepgmm.convert(static_delta(source_mcep[:, 1:]),
                                cvtype=pconf.GMM_mcep_cvtype)
    source_mcep = numpy.c_[source_mcep[:, 0], converted]

    # target
    target_wave = Wave.load(path=target_path, sampling_rate=sconf2.wav_fs).pad(
        pre_second=arguments.pad_second2,
        post_second=arguments.pad_second2,
    )
    feat2.analyze(
        low_cut_filter(target_wave.wave, target_wave.sampling_rate, cutoff=70))
    target_mcep = feat2.mcep(dim=sconf2.mcep_dim, alpha=sconf2.mcep_alpha)

    if arguments.threshold_db2 is not None:
        keep = target_wave.get_effective_frame(
            threshold_db=arguments.threshold_db2,
            fft_length=sconf2.wav_fftl,
            frame_period=sconf2.wav_shiftms,
        )
        target_mcep = target_mcep[keep]

    # align
    aligned = AlignIndexes.extract(
        AcousticFeature(mc=source_mcep),
        AcousticFeature(mc=target_mcep),
        dtype=arguments.dtype,
    )
    aligned.save(path=out, validate=True, ignores=arguments.ignore_feature)
Ejemplo n.º 6
0
def generate_align_indexes(pair_path: Tuple[Path, Path]):
    """
    Compute alignment indexes for a pair of saved features and save them.

    Nothing is done when the output already exists and
    ``arguments.enable_overwrite`` is not set.

    :param pair_path: (path of the first feature, path of the second feature)
    """
    first_path, second_path = pair_path
    if first_path.stem != second_path.stem:
        print('warning: the file names are different', first_path, second_path)

    out = Path(arguments.output, first_path.stem + '.npy')
    already_done = out.exists()
    if already_done and not arguments.enable_overwrite:
        return

    first = AcousticFeature.load(path=first_path)
    second = AcousticFeature.load(path=second_path)
    aligned = AlignIndexes.extract(first, second, dtype=arguments.dtype)

    # save
    aligned.save(
        path=out,
        validate=True,
        ignores=arguments.ignore_feature,
    )
Ejemplo n.º 7
0
    def convert(self, in_feature: AcousticFeature):
        """
        Map the non-zero f0 values from the input to the target statistics.

        The transform is applied in the log domain:
        ``exp((tv / iv) * (log(f0) - im) + tm)``. Zero entries of f0 are
        left at zero. The input feature is not modified; a copy of its f0
        is converted.

        :param in_feature: feature whose ``f0`` is converted
        :return: a new AcousticFeature holding only the converted ``f0``
        """
        in_mean, in_var = self.input_statistics.mean, self.input_statistics.var
        out_mean, out_var = self.target_statistics.mean, self.target_statistics.var

        converted = numpy.copy(in_feature.f0)
        nonzero = converted.nonzero()
        log_f0 = numpy.log(converted[nonzero])
        # NOTE(review): the scale is the ratio of the ``var`` attributes as
        # stored; whether they hold a variance or a standard deviation is
        # not visible here - confirm against the statistics producer.
        converted[nonzero] = numpy.exp(
            (out_var / in_var) * (log_f0 - in_mean) + out_mean)
        return AcousticFeature(f0=converted)
Ejemplo n.º 8
0
def generate_feature(path: Path):
    """
    Extract the acoustic feature of one wave file and save it as ``.npy``.

    The wave is padded before extraction. When ``arguments.threshold_db``
    is set, only the effective frames are kept. Nothing is done when the
    output already exists and ``arguments.enable_overwrite`` is not set.

    :param path: wave file to process
    """
    out = Path(arguments.output, path.stem + '.npy')
    already_done = out.exists()
    if already_done and not arguments.enable_overwrite:
        return

    # load wave and padding
    padded = Wave.load(path=path, sampling_rate=arguments.sampling_rate).pad(
        pre_second=arguments.pad_second,
        post_second=arguments.pad_second,
    )

    # make acoustic feature
    extracted = AcousticFeature.extract(
        wave=padded,
        frame_period=arguments.frame_period,
        f0_floor=arguments.f0_floor,
        f0_ceil=arguments.f0_ceil,
        fft_length=arguments.fft_length,
        order=arguments.order,
        alpha=arguments.alpha,
        dtype=arguments.dtype,
    )

    if arguments.threshold_db is not None:
        keep = padded.get_effective_frame(
            threshold_db=arguments.threshold_db,
            fft_length=arguments.fft_length,
            frame_period=arguments.frame_period,
        )
        extracted = extracted.indexing(keep)

    # save
    extracted.save(
        path=out,
        validate=True,
        ignores=arguments.ignore_feature,
    )
Ejemplo n.º 9
0
    def convert_loop(self, in_feature: AcousticFeature, n_len: int = 512, n_wrap: int = 128):
        """
        Convert a feature chunk by chunk and concatenate the results.

        Each chunk covers ``n_len`` frames and is converted together with up
        to ``n_wrap`` extra frames on both sides; only the frames of the
        chunk itself are kept from the converted output.

        :param in_feature: feature to convert; its length is ``len(in_feature.f0)``
        :param n_len: number of frames per chunk
        :param n_wrap: number of overlapping frames added on each side
        :return: concatenation of the converted chunks
        """
        total = len(in_feature.f0)
        n_chunks = int(numpy.ceil(total / n_len))

        pieces: List[AcousticFeature] = []
        for k in range(n_chunks):
            begin = k * n_len

            # convert the chunk widened by the overlap, clipped to the data
            lo = max(begin - n_wrap, 0)
            hi = min(begin + n_len + n_wrap, total)
            widened = self.convert(in_feature.indexing(numpy.arange(lo, hi)))

            # drop the overlap again
            offset = begin - lo
            keep = min(hi - begin, n_len)
            pieces.append(
                widened.indexing(numpy.arange(offset, offset + keep)))

        return AcousticFeature.concatenate(pieces)
Ejemplo n.º 10
0
 def decode_spectrogram(self, feature: AcousticFeature):
     """
     Fill ``feature.sp`` from ``feature.mc`` and return the same object.

     The FFT size and the alpha value are both derived from
     ``self.out_sampling_rate``. ``feature`` is modified in place.

     :param feature: feature providing ``mc``; receives ``sp``
     :return: the given ``feature``
     """
     rate = self.out_sampling_rate
     fft_size = pyworld.get_cheaptrick_fft_size(rate)
     mel_cepstrum = feature.mc.astype(numpy.float32)
     feature.sp = pysptk.mc2sp(
         mel_cepstrum,
         alpha=pysptk.util.mcepalpha(rate),
         fftlen=fft_size,
     )
     return feature
Ejemplo n.º 11
0
 def _decode_feature(self, data):
     """
     Decode ``data`` into a feature using the configured output targets.

     :param data: encoded array passed on to ``decode_feature``
     :return: result of ``decode_feature``
     """
     param = self._param
     feature_sizes = AcousticFeature.get_sizes(
         sampling_rate=param.sampling_rate,
         order=param.order,
     )
     out_targets = self.config.dataset.out_features
     return decode_feature(data, targets=out_targets, sizes=feature_sizes)
Ejemplo n.º 12
0
def make_mask(feature: AcousticFeature):
    """
    Build a float32 mask feature for ``feature``.

    The ``f0`` mask is ``feature.voiced``; every other field (``sp``,
    ``ap``, ``coded_ap``, ``mc``, ``voiced``) gets an all-ones mask shaped
    like the corresponding field.

    :param feature: feature the mask is built for
    :return: AcousticFeature of masks, cast with ``astype(numpy.float32)``
    """

    def all_true(array):
        # builtin ``bool`` replaces the ``numpy.bool`` alias, which was
        # deprecated in NumPy 1.20 and removed in 1.24
        return numpy.ones_like(array, dtype=bool)

    return AcousticFeature(
        f0=feature.voiced,
        sp=all_true(feature.sp),
        ap=all_true(feature.ap),
        coded_ap=all_true(feature.coded_ap),
        mc=all_true(feature.mc),
        voiced=all_true(feature.voiced),
    ).astype(numpy.float32)
Ejemplo n.º 13
0
def decode_feature(data: numpy.ndarray, targets: List[str], sizes: Dict[str, int]):
    """
    Split an encoded array back into the named fields of a feature.

    ``data`` is transposed first; its columns are then cut into consecutive
    slices, one per entry of ``targets``, each ``sizes[target]`` wide.

    :param data: encoded array; after transposing, its second axis must
        have exactly the summed size of all targets
    :param targets: field names, in the order they were concatenated
    :param sizes: width of every field
    :return: AcousticFeature built from the slices
    """
    frames = data.T

    ends = numpy.cumsum([sizes[name] for name in targets]).tolist()
    assert frames.shape[1] == ends[-1]
    starts = [0] + ends[:-1]

    fields = {}
    for name, begin, end in zip(targets, starts, ends):
        fields[name] = frames[:, begin:end]
    return AcousticFeature(**fields)
Ejemplo n.º 14
0
 def extract_acoustic_feature(self, wave: Wave):
     """
     Extract the acoustic feature of ``wave`` with the settings in ``self._param``.

     :param wave: wave to analyse
     :return: result of ``AcousticFeature.extract``
     """
     param = self._param
     option_names = (
         'frame_period',
         'f0_floor',
         'f0_ceil',
         'fft_length',
         'order',
         'alpha',
         'dtype',
     )
     # every option is forwarded under the name it has on the parameter object
     options = {name: getattr(param, name) for name in option_names}
     return AcousticFeature.extract(wave, **options)
Ejemplo n.º 15
0
 def pad(self, width: int):
     """
     Create a silent wrapped feature of ``width`` frames, cast to float32.

     :param width: length passed to ``AcousticFeatureWrapper.silent_wrapper``
     :return: the silent wrapper after ``astype_only_float_wrapper(numpy.float32)``
     """
     rate = self.wave_sampling_rate
     feature_sizes = AcousticFeature.get_sizes(sampling_rate=rate,
                                               order=self.order)
     silence = AcousticFeatureWrapper.silent_wrapper(
         width,
         sizes=feature_sizes,
         keys=self._keys,
         frame_period=self.frame_period,
         sampling_rate=rate,
         wave_dtype=numpy.float32,
     )
     return silence.astype_only_float_wrapper(numpy.float32)
Ejemplo n.º 16
0
    def post_convert(self, start_time: float, time_length: float):
        """
        Fetch the converted feature for a time range and decode it to a wave.

        The feature is gathered from ``self._out_feature_stream`` through
        ``self.fetch`` and decoded with ``self.vocoder``. NaN samples in the
        decoded wave are set to 0; this overwrites the decoded array in
        place.

        :param start_time: start of the requested range, passed to ``fetch``
        :param time_length: length of the requested range, passed to ``fetch``
        :return: Wave with the cleaned samples at the decoder's sampling rate
        """
        feature_sizes = AcousticFeature.get_sizes(
            sampling_rate=self.sampling_rate, order=self.order)
        keys = ['f0', 'ap', 'sp', 'voiced']

        def make_silence(length):
            return AcousticFeature.silent(length, sizes=feature_sizes, keys=keys)

        def pick_range(segment, first, last):
            return segment.feature.pick(first, last, keys=keys)

        def join(buffers):
            return AcousticFeature.concatenate(buffers, keys=keys)

        out_feature = self.fetch(
            start_time=start_time,
            time_length=time_length,
            data_stream=self._out_feature_stream,
            rate=1000 / self.frame_period,
            pad_function=make_silence,
            pick_function=pick_range,
            concat_function=join,
        )

        decoded = self.vocoder.decode(acoustic_feature=out_feature)

        samples = decoded.wave
        samples[numpy.isnan(samples)] = 0
        return Wave(wave=samples, sampling_rate=decoded.sampling_rate)
Ejemplo n.º 17
0
 def decode(self, acoustic_feature: AcousticFeature):
     """
     Synthesize a wave from an acoustic feature with ``pyworld.synthesize``.

     The feature is first cast with ``astype_only_float(numpy.float64)``.

     :param acoustic_feature: feature providing ``f0``, ``spectrogram``
         and ``aperiodicity``
     :return: Wave at ``self.out_sampling_rate``
     """
     feature = acoustic_feature.astype_only_float(numpy.float64)
     rate = self.out_sampling_rate
     samples = pyworld.synthesize(
         f0=feature.f0.ravel(),
         spectrogram=feature.spectrogram,
         aperiodicity=feature.aperiodicity,
         fs=rate,
         frame_period=self.acoustic_param.frame_period,
     )
     return Wave(samples, sampling_rate=rate)
Ejemplo n.º 18
0
def generate_feature(path: Path):
    """
    Extract the acoustic feature of one wave file and save it as ``.npy``.

    When ``arguments.threshold_db`` is set, only the effective frames are
    kept. The effective frames are computed on a reference wave: the same
    file loaded at ``arguments.sampling_rate_for_thresholding`` when that is
    given, otherwise the wave used for extraction. Nothing is done when the
    output already exists and ``arguments.enable_overwrite`` is not set.

    :param path: wave file to process
    """
    out = Path(arguments.output, path.stem + '.npy')
    already_done = out.exists()
    if already_done and not arguments.enable_overwrite:
        return

    # load wave and padding
    wave = Wave.load(path=path, sampling_rate=arguments.sampling_rate).pad(
        pre_second=arguments.pad_second,
        post_second=arguments.pad_second,
    )

    # make acoustic feature
    extracted = AcousticFeature.extract(
        wave=wave,
        frame_period=arguments.frame_period,
        f0_floor=arguments.f0_floor,
        f0_ceil=arguments.f0_ceil,
        fft_length=arguments.fft_length,
        order=arguments.order,
        alpha=arguments.alpha,
        dtype=arguments.dtype,
    )

    if arguments.threshold_db is not None:
        if arguments.sampling_rate_for_thresholding is None:
            wave_ref = wave
        else:
            wave_ref = Wave.load(
                path=path,
                sampling_rate=arguments.sampling_rate_for_thresholding,
            ).pad(
                pre_second=arguments.pad_second,
                post_second=arguments.pad_second,
            )

        keep = wave_ref.get_effective_frame(
            threshold_db=arguments.threshold_db,
            fft_length=arguments.fft_length,
            frame_period=arguments.frame_period,
        )

        # the two waves may differ by one frame in length, see
        # https://github.com/mmorise/World/blob/c41e580c24c8d360f322ba6e2092ad4785d2d5b9/src/harvest.cpp#L1220
        n_frames = wave.get_hop_and_length(arguments.frame_period)[1]
        n_frames_ref = wave_ref.get_hop_and_length(arguments.frame_period)[1]
        if n_frames == n_frames_ref - 1:
            keep = keep[:-1]

        extracted = extracted.indexing(keep)

    # save
    extracted.save(path=out, ignores=arguments.ignore_feature)
Ejemplo n.º 19
0
def generate_aligned_wave(
    pair_path: Tuple[Path, Path, Path],
    sampling_rate: int,
    frame_period: float,
    alpha: float,
):
    """
    Decode both features of a pair along their alignment into one wav file.

    For each feature, ``sp`` is rebuilt from ``mc`` and ``ap`` from
    ``coded_ap``. Both features are then attached to the loaded alignment
    indexes, decoded to waves, stacked with ``numpy.vstack`` and written to
    ``<arguments.output>/<indexes stem>.wav``.

    An existing output file is left untouched when
    ``arguments.disable_overwrite`` is set.

    :param pair_path: (feature 1 path, feature 2 path, alignment indexes path)
    :param sampling_rate: rate used for decoding and for the written file
    :param frame_period: frame period passed to the feature decoder
    :param alpha: alpha value passed to ``AcousticFeature.mc2sp``
    """
    path_feature1, path_feature2, path_indexes = pair_path

    for other in (path_feature2, path_indexes):
        if path_feature1.stem != other.stem:
            print('warning: the file names are different', path_feature1,
                  other)

    out = Path(arguments.output, path_indexes.stem + '.wav')
    # Skip only when there is something to overwrite; the flag on its own
    # must not prevent generating a file that does not exist yet.
    if out.exists() and arguments.disable_overwrite:
        return

    feature1 = AcousticFeature.load(path=path_feature1)
    feature2 = AcousticFeature.load(path=path_feature2)
    for feature in (feature1, feature2):
        feature.sp = AcousticFeature.mc2sp(feature.mc,
                                           sampling_rate=sampling_rate,
                                           alpha=alpha)
        feature.ap = AcousticFeature.decode_ap(feature.coded_ap,
                                               sampling_rate=sampling_rate)

    align_indexes = AlignIndexes.load(path=path_indexes)
    align_indexes.feature1 = feature1
    align_indexes.feature2 = feature2

    wave1 = align_indexes.get_aligned_feature1().decode(
        sampling_rate=sampling_rate, frame_period=frame_period)
    wave2 = align_indexes.get_aligned_feature2().decode(
        sampling_rate=sampling_rate, frame_period=frame_period)

    # save
    # NOTE(review): ``librosa.output`` was removed in librosa 0.8, so this
    # call needs an older librosa - confirm the pinned version.
    y = numpy.vstack([wave1.wave, wave2.wave])
    librosa.output.write_wav(str(out), y, sr=sampling_rate)
Ejemplo n.º 20
0
    def convert(self, start_time: float, time_length: float, extra_time: float):
        """
        Fetch the input feature for a time range and convert it.

        The range is fetched from ``self._in_feature_stream`` together with
        ``extra_time`` of context, converted by ``self.voice_changer``, and
        the frames corresponding to ``extra_time`` are removed from both
        ends of the result.

        :param start_time: start of the requested range, passed to ``fetch``
        :param time_length: length of the requested range, passed to ``fetch``
        :param extra_time: additional context, passed to ``fetch`` and
            trimmed from the converted feature
        :return: converted feature restricted to ``f0``, ``ap``, ``sp``
            and ``voiced``
        """
        feature_sizes = AcousticFeature.get_sizes(
            sampling_rate=self.sampling_rate, order=self.order)
        keys = ['f0', 'ap', 'mc', 'voiced']

        def make_silence(length):
            silence = AcousticFeatureWrapper.silent_wrapper(
                length,
                sizes=feature_sizes,
                keys=keys,
                frame_period=self.frame_period,
                sampling_rate=self.sampling_rate,
                wave_dtype=self.in_dtype,
            )
            return silence.astype_only_float_wrapper(self.in_dtype)

        def pick_range(segment: FeatureWrapperSegment, first, last):
            return segment.feature.pick_wrapper(
                first,
                last,
                keys=keys,
                frame_period=self.frame_period,
            )

        def join(buffers):
            return AcousticFeatureWrapper.concatenate_wrapper(buffers, keys=keys)

        in_feature = self.fetch(
            start_time=start_time,
            time_length=time_length,
            extra_time=extra_time,
            data_stream=self._in_feature_stream,
            rate=1000 / self.frame_period,
            pad_function=make_silence,
            pick_function=pick_range,
            concat_function=join,
        )
        converted = self.voice_changer.convert_from_acoustic_feature(in_feature)

        # NOTE(review): when this rounds to 0 the call below is
        # pick(0, 0, ...); if ``pick`` slices [first:last] the result would
        # be empty - confirm how ``pick`` treats that case.
        n_extra = round(extra_time * 1000 / self.frame_period)
        return converted.pick(n_extra, -n_extra,
                              keys=['f0', 'ap', 'sp', 'voiced'])
Ejemplo n.º 21
0
def load_f0(path: Path):
    """
    Load a saved acoustic feature and return its ``f0``.

    :param path: file to load with ``AcousticFeature.load``
    :return: the ``f0`` attribute of the loaded feature
    """
    return AcousticFeature.load(path=path).f0
Ejemplo n.º 22
0
 def concat(self, datas: Iterable[AcousticFeatureWrapper]):
     """
     Concatenate the given features over the keys in ``self._keys``.

     :param datas: features to join; materialised into a list first
     :return: result of ``AcousticFeature.concatenate``
     """
     buffers = list(datas)
     return AcousticFeature.concatenate(buffers, keys=self._keys)
Ejemplo n.º 23
0
 def pad(self, width: int):
     """
     Create a silent feature of ``width`` frames for the keys in ``self._keys``.

     :param width: length passed to ``AcousticFeature.silent``
     :return: the silent feature
     """
     feature_sizes = AcousticFeature.get_sizes(
         sampling_rate=self.wave_sampling_rate,
         order=self.order,
     )
     silence = AcousticFeature.silent(width,
                                      sizes=feature_sizes,
                                      keys=self._keys)
     return silence
Ejemplo n.º 24
0
 def load_acoustic_feature(self, path: Path):
     """
     Load a saved acoustic feature.

     :param path: file to load
     :return: result of ``AcousticFeature.load(path)``
     """
     loaded = AcousticFeature.load(path)
     return loaded