Ejemplo n.º 1
0
    def extract_input(
        sampling_length: int,
        wave_data: Wave,
        silence_data: SamplingData,
        local_data: SamplingData,
        local_sampling_rate: Optional[int],
        local_padding_size: int,
        local_mask_max_second: float,
        local_mask_num: int,
        padding_value=0,
    ):
        """
        Cut one random window out of a waveform together with the matching,
        padded slice of its local features.

        A start position is drawn on the local-feature grid and redrawn (up to
        10000 times) until the silence segment for that window is not entirely
        truthy.

        :param sampling_length: window length in waveform samples
        :param wave_data: provides ``wave`` and ``sampling_rate``
        :param silence_data: resampled to the waveform rate for each candidate window
        :param local_data: local features; ``array`` / ``rate`` are used directly
            when ``local_sampling_rate`` is None, otherwise ``resample`` is called
        :param local_sampling_rate: rate to resample the local features to, or None
        :param local_padding_size: padding on each side of the local window, in
            waveform samples; must be a multiple of the wave/local scale
        :param local_mask_max_second: exclusive upper bound (seconds) of each
            zeroed span of ``local``; masking is skipped when not positive
        :param local_mask_num: number of spans to zero
        :param padding_value: fill value for local frames outside the data
        :return:
            wave: (sampling_length, )
            silence: (sampling_length, )
            local: (sampling_length // scale + pad, )
        :raises Exception: when no acceptable window is found in 10000 draws
        """
        sr = wave_data.sampling_rate
        sl = sampling_length

        if local_sampling_rate is None:
            l_rate = local_data.rate
            l_array = local_data.array
        else:
            l_rate = local_sampling_rate
            l_array = local_data.resample(l_rate)

        # Number of waveform samples spanned by one local frame.
        l_scale = int(round(sr / l_rate))

        # Use the shorter of the two streams; they may differ by a few frames.
        length = min(len(l_array) * l_scale, len(wave_data.wave))
        assert abs(length - len(l_array) * l_scale) < l_scale * 4
        assert abs(length - len(wave_data.wave)) < l_scale * 4

        assert (
            local_padding_size % l_scale == 0
        ), f"local_padding_size: {local_padding_size}, l_scale: {l_scale}"
        l_pad = local_padding_size // l_scale

        l_length = length // l_scale
        l_sl = sl // l_scale

        for _ in range(10000):
            if l_length > l_sl + 1:
                l_offset = numpy.random.randint(l_length - l_sl + 1)
            else:
                l_offset = 0
            offset = l_offset * l_scale

            silence = numpy.squeeze(silence_data.resample(sr, index=offset, length=sl))
            if not silence.all():
                break
        else:
            raise Exception("cannot pick not silence data")

        wave = wave_data.wave[offset : offset + sl]

        # local
        l_start, l_end = l_offset - l_pad, l_offset + l_sl + l_pad
        if l_start < 0 or l_end > l_length:
            # The padded window sticks out of the data: allocate a buffer filled
            # with padding_value and copy the available frames into place.
            shape = list(l_array.shape)
            shape[0] = l_sl + l_pad * 2
            local = numpy.ones(shape=shape, dtype=l_array.dtype) * padding_value
            if l_start < 0:
                p_start = -l_start
                l_start = 0
            else:
                p_start = 0
            if l_end > l_length:
                p_end = l_sl + l_pad * 2 - (l_end - l_length)
                l_end = l_length
            else:
                p_end = l_sl + l_pad * 2
            local[p_start:p_end] = l_array[l_start:l_end]
        else:
            local = l_array[l_start:l_end]

        if local_mask_max_second > 0 and local_mask_num > 0:
            # Mask a private copy: without padding, ``local`` is a slice of the
            # source array and zeroing it in place could alter the stored
            # features for every later sample.
            local = numpy.array(local)

            # randint needs a positive exclusive bound, and a span can never be
            # longer than the window itself.
            mask_bound = min(int(l_rate * local_mask_max_second), len(local) + 1)
            if mask_bound > 0:
                for _ in range(local_mask_num):
                    mask_length = numpy.random.randint(mask_bound)
                    mask_offset = numpy.random.randint(len(local) - mask_length + 1)
                    local[mask_offset : mask_offset + mask_length] = 0

        return wave, silence, local
Ejemplo n.º 2
0
    def extract_input(
        sampling_length: int,
        wave_data: Wave,
        silence_data: SamplingData,
        local_data: SamplingData,
        local_sampling_rate: Optional[int],
        local_padding_length: int,
        min_not_silence_length: int,
        mulaw: bool,
        padding_value=0,
    ):
        """
        Pick a random waveform window and the padded local features around it.

        Candidate windows are redrawn (up to 10000 times) until the inverted
        silence segment sums to at least ``min_not_silence_length``.

        :return: dict with
            wave: (sampling_length, ), passed through ``encode_mulaw`` when ``mulaw`` is set
            local: the local window, transposed; (sampling_length // scale + pad, ) before transposing
        :raises Exception: when no acceptable window is found
        """
        wave_rate = wave_data.sampling_rate

        if local_sampling_rate is not None:
            frame_rate = local_sampling_rate
            frames = local_data.resample(frame_rate)
        else:
            frame_rate = local_data.rate
            frames = local_data.array

        # Waveform samples per local frame; the rates must divide evenly.
        assert wave_rate % frame_rate == 0
        hop = int(wave_rate // frame_rate)

        covered = len(frames) * hop
        assert (
            abs(covered - len(wave_data.wave)) < hop * 4
        ), f"{abs(covered - len(wave_data.wave))} {hop}"

        assert local_padding_length % hop == 0
        pad_frames = local_padding_length // hop

        total_frames = covered // hop
        window_frames = sampling_length // hop

        # Frames left over once the window is placed; zero or less pins the start at 0.
        spare_frames = total_frames - window_frames
        for _ in range(10000):
            start_frame = numpy.random.randint(spare_frames) if spare_frames > 0 else 0
            start = start_frame * hop

            silence = numpy.squeeze(
                silence_data.resample(wave_rate, index=start, length=sampling_length)
            )
            not_silent = (~silence).sum()
            if not_silent >= min_not_silence_length:
                break
        else:
            raise Exception("cannot pick not silence data")

        wave = wave_data.wave[start:start + sampling_length]
        if mulaw:
            wave = encode_mulaw(wave)

        # Local window including padding on both sides, in frame indices.
        padded_frames = window_frames + pad_frames * 2
        first = start_frame - pad_frames
        last = start_frame + window_frames + pad_frames

        if 0 <= first and last <= total_frames:
            local = frames[first:last]
        else:
            # Part of the window lies outside the data: start from a buffer of
            # padding_value and drop the frames that do exist into position.
            missing_head = max(-first, 0)
            missing_tail = max(last - total_frames, 0)
            shape = [padded_frames, *frames.shape[1:]]
            local = numpy.ones(shape=shape, dtype=frames.dtype) * padding_value
            local[missing_head:padded_frames - missing_tail] = frames[
                max(first, 0):min(last, total_frames)
            ]

        return {
            "wave": wave,
            "local": local.T,  # (C, T)
        }
Ejemplo n.º 3
0
    def extract_input(
        sampling_length: int,
        wave_data: Wave,
        silence_data: SamplingData,
        f0_data: SamplingData,
        phoneme_data: SamplingData,
        min_not_silence_length: int,
        with_mic_augment: bool,
        time_mask_max_second: float,
        time_mask_num: int,
    ):
        """
        Cut a random window of waveform with frame-aligned f0 and phoneme labels.

        f0 and phoneme features are collected at the higher of their two rates.
        When the data is shorter than ``sampling_length`` the whole usable span
        is taken and the shortfall is zero-padded, split randomly between the
        front and the back.

        :param sampling_length: window length in waveform samples; must be a
            multiple of the wave/local scale
        :param wave_data: provides ``wave`` and ``sampling_rate``
        :param silence_data: resampled to the waveform rate for each candidate window
        :param f0_data: f0 features (column 0 of the collected array)
        :param phoneme_data: phoneme features (remaining columns; reduced with argmax)
        :param min_not_silence_length: minimum sum of the inverted silence
            segment for a window to be accepted
        :param with_mic_augment: pass the waveform through ``mic_augment``
        :param time_mask_max_second: exclusive upper bound (seconds) of each
            zeroed waveform span; masking is skipped when not positive
        :param time_mask_num: number of spans to zero
        :return: dict with ``wave``, ``f0``, ``phoneme`` and ``padded``
            (boolean, True on padded frames)
        :raises Exception: when no acceptable window is found in 10000 draws
        """
        rate = wave_data.sampling_rate
        sl = sampling_length

        l_rate = max(f0_data.rate, phoneme_data.rate)

        assert rate % l_rate == 0
        # Number of waveform samples spanned by one local frame.
        l_scale = int(rate // l_rate)

        assert sl % l_scale == 0

        local = SamplingData.collect([f0_data, phoneme_data],
                                     rate=l_rate,
                                     mode="min",
                                     error_time_length=0.015)
        f0_array = local[:, 0]
        phoneme_array = local[:, 1:]

        assert numpy.abs(len(local) * l_scale -
                         len(wave_data.wave)) < l_scale * 4

        # Usable length: the shorter stream, rounded down to whole frames.
        length = min(
            len(local) * l_scale,
            len(wave_data.wave) // l_scale * l_scale)

        if sl > length:
            pad = sl - length
            sl = length
        else:
            pad = 0

        l_length = length // l_scale
        l_sl = sl // l_scale
        l_pad = pad // l_scale

        for _ in range(10000):
            if l_length > l_sl:
                l_offset = numpy.random.randint(l_length - l_sl)
            else:
                l_offset = 0
            offset = l_offset * l_scale

            silence = numpy.squeeze(
                silence_data.resample(rate, index=offset, length=sl))
            if (~silence).sum() >= min_not_silence_length:
                break
        else:
            raise Exception("cannot pick not silence data")

        wave = wave_data.wave[offset:offset + sl]
        f0 = numpy.squeeze(f0_array[l_offset:l_offset + l_sl])
        phoneme = numpy.argmax(phoneme_array[l_offset:l_offset + l_sl], axis=1)
        padded = numpy.zeros_like(f0, dtype=bool)

        if l_pad > 0:
            # Split the padding randomly between the front and the back.
            l_pre = numpy.random.randint(l_pad + 1)
            l_post = l_pad - l_pre
            f0 = numpy.pad(f0, [l_pre, l_post])
            phoneme = numpy.pad(phoneme, [l_pre, l_post])
            padded = numpy.pad(padded, [l_pre, l_post], constant_values=True)

            pre, post = int(l_pre * l_scale), int(l_post * l_scale)
            wave = numpy.pad(wave, [pre, post])

        if with_mic_augment:
            wave = mic_augment(wave, sampling_rate=rate)

        if time_mask_max_second > 0 and time_mask_num > 0:
            # Mask a private copy: without padding or augmentation ``wave`` is
            # a slice of ``wave_data.wave`` and zeroing it in place could
            # alter the stored waveform for every later sample.
            wave = numpy.array(wave)

            # randint needs a positive exclusive bound, and a span can never be
            # longer than the window itself.
            mask_bound = min(int(rate * time_mask_max_second), len(wave) + 1)
            if mask_bound > 0:
                for _ in range(time_mask_num):
                    mask_length = numpy.random.randint(mask_bound)
                    mask_offset = numpy.random.randint(
                        len(wave) - mask_length + 1)
                    wave[mask_offset:mask_offset + mask_length] = 0

        return dict(
            wave=wave,
            f0=f0,
            phoneme=phoneme,
            padded=padded,
        )
Ejemplo n.º 4
0
    def extract_input(
        sampling_length: int,
        wave_data: Wave,
        silence_data: SamplingData,
        local_data: SamplingData,
        local_padding_length: int,
        min_not_silence_length: int,
        f0_index: int,
        volume_index: Optional[int],
        harmonic_num: int,
        only_noise_source: bool,
        padding_value=0,
    ):
        """
        Pick a random waveform window, its padded local features, and the
        source signals generated from the un-padded part of those features.

        Candidate windows are redrawn (up to 10000 times) until the inverted
        silence segment sums to at least ``min_not_silence_length``.

        :return: dict with
            wave: (sampling_length, )
            silence: (sampling_length, )
            local: (sampling_length // scale + pad, )
            source, signal: outputs of the first ``generate_source`` call
            source2: first output of a second ``generate_source`` call with the same arguments
        :raises Exception: when no acceptable window is found
        """
        wave_rate = wave_data.sampling_rate

        # Waveform samples per local frame; the rates must divide evenly.
        assert wave_rate % local_data.rate == 0
        hop = int(wave_rate // local_data.rate)

        covered = len(local_data.array) * hop
        assert (
            abs(covered - len(wave_data.wave)) < hop * 4
        ), f"{abs(covered - len(wave_data.wave))} {hop}"

        assert local_padding_length % hop == 0
        pad_frames = local_padding_length // hop

        total_frames = covered // hop
        window_frames = sampling_length // hop

        # Frames left over once the window is placed; zero or less pins the start at 0.
        spare_frames = total_frames - window_frames
        for _ in range(10000):
            start_frame = numpy.random.randint(spare_frames) if spare_frames > 0 else 0
            start = start_frame * hop

            silence = numpy.squeeze(
                silence_data.resample(wave_rate, index=start, length=sampling_length)
            )
            not_silent = (~silence).sum()
            if not_silent >= min_not_silence_length:
                break
        else:
            raise Exception("cannot pick not silence data")

        wave = wave_data.wave[start:start + sampling_length]

        # Local window including padding on both sides, in frame indices.
        padded_frames = window_frames + pad_frames * 2
        first = start_frame - pad_frames
        last = start_frame + window_frames + pad_frames

        if 0 <= first and last <= total_frames:
            local = local_data.array[first:last]
        else:
            # Part of the window lies outside the data: start from a buffer of
            # padding_value and drop the frames that do exist into position.
            missing_head = max(-first, 0)
            missing_tail = max(last - total_frames, 0)
            shape = [padded_frames, *local_data.array.shape[1:]]
            local = (
                numpy.ones(shape=shape, dtype=local_data.array.dtype) * padding_value
            )
            local[missing_head:padded_frames - missing_tail] = local_data.array[
                max(first, 0):min(last, total_frames)
            ]

        # Source module: work on the frames between the two padded margins.
        inner = slice(pad_frames, -pad_frames) if pad_frames > 0 else slice(None)

        log_f0 = local[inner, f0_index]
        if only_noise_source:
            log_f0 = numpy.zeros_like(log_f0)

        volume = local[inner, volume_index] if volume_index is not None else None

        source_kwargs = dict(
            log_f0=log_f0,
            volume=volume,
            local_rate=int(local_data.rate),
            sampling_rate=wave_rate,
            harmonic_num=harmonic_num,
        )
        source, signal = generate_source(**source_kwargs)
        source2, _ = generate_source(**source_kwargs)

        return {
            "wave": wave,
            "silence": silence,
            "local": local,
            "source": source,
            "source2": source2,
            "signal": signal,
        }
Ejemplo n.º 5
0
    def extract_input(
        sampling_length: int,
        f0_data: SamplingData,
        phoneme_data: SamplingData,
        spec_data: SamplingData,
        silence_data: SamplingData,
        phoneme_list_data: Optional[List[BasePhoneme]],
        volume_data: Optional[SamplingData],
        f0_process_mode: F0ProcessMode,
        time_mask_max_second: float,
        time_mask_num: int,
    ):
        """
        Cut a random window of f0 / phoneme / spectrogram frames.

        All streams are resampled to the spectrogram rate. Unless
        ``f0_process_mode`` is ``F0ProcessMode.normal``, f0 is first replaced
        by the output of ``f0_mean`` over segments split at phoneme ends. When
        the data is shorter than ``sampling_length`` the whole span is taken
        and the shortfall is padded, split randomly between front and back.

        :param sampling_length: window length in frames at the spectrogram rate
        :param f0_data: f0 features
        :param phoneme_data: phoneme features
        :param spec_data: spectrogram; its ``rate`` is the common frame rate
        :param silence_data: silence flags; a window is rejected while its
            segment is entirely truthy
        :param phoneme_list_data: phoneme timing list, required for every mode
            other than ``F0ProcessMode.normal``
        :param volume_data: optional volume, used as the ``f0_mean`` weight
        :param f0_process_mode: how f0 is processed before windowing
        :param time_mask_max_second: exclusive upper bound (seconds) of each
            zeroed span of f0 / phoneme; masking is skipped when not positive
        :param time_mask_num: number of spans to zero
        :return: dict with ``f0``, ``phoneme``, ``spec`` (float32), ``silence``
            and ``padded``
        :raises Exception: when no acceptable window is found in 10000 draws
        """
        rate = spec_data.rate

        f0 = f0_data.resample(rate)
        phoneme = phoneme_data.resample(rate)
        silence = silence_data.resample(rate)
        volume = volume_data.resample(
            rate) if volume_data is not None else None
        spec = spec_data.array

        assert numpy.abs(len(spec) - len(f0)) < 5
        assert numpy.abs(len(spec) - len(phoneme)) < 5
        assert numpy.abs(len(spec) - len(silence)) < 5
        # Check the volume stream itself (previously re-checked silence).
        assert volume is None or numpy.abs(len(spec) - len(volume)) < 5

        length = min(len(spec), len(f0), len(phoneme), len(silence))
        if volume is not None:
            length = min(length, len(volume))

        if f0_process_mode == F0ProcessMode.normal:
            pass
        else:
            assert phoneme_list_data is not None
            weight = volume

            if f0_process_mode == F0ProcessMode.phoneme_mean:
                split_second_list = [p.end for p in phoneme_list_data[:-1]]
            else:
                split_second_list = [
                    p.end for p in phoneme_list_data[:-1]
                    if p.phoneme in mora_phoneme_list
                ]

            if f0_process_mode == F0ProcessMode.voiced_mora_mean:
                if weight is None:
                    weight = numpy.ones_like(f0)

                # Give zero weight to every phoneme not listed as voiced.
                for p in phoneme_list_data:
                    if p.phoneme not in voiced_phoneme_list:
                        weight[int(p.start * rate):int(p.end * rate)] = 0

            f0 = f0[:length]
            weight = weight[:length]

            f0 = f0_mean(
                f0=f0,
                rate=rate,
                split_second_list=split_second_list,
                weight=weight,
            )

        if sampling_length > length:
            padding_length = sampling_length - length
            sampling_length = length
        else:
            padding_length = 0

        for _ in range(10000):
            if length > sampling_length + 1:
                offset = numpy.random.randint(length - sampling_length + 1)
            else:
                offset = 0
            s = numpy.squeeze(silence[offset:offset + sampling_length])
            if not s.all():
                break
        else:
            raise Exception("cannot pick not silence data")

        if silence.ndim == 2:
            silence = numpy.squeeze(silence, axis=1)

        f0 = f0[offset:offset + sampling_length]
        phoneme = phoneme[offset:offset + sampling_length]
        spec = spec[offset:offset + sampling_length]
        silence = silence[offset:offset + sampling_length]
        padded = numpy.zeros_like(silence)

        if padding_length > 0:
            # Split the padding randomly between the front and the back.
            pre = numpy.random.randint(padding_length + 1)
            post = padding_length - pre
            f0 = numpy.pad(f0, [[pre, post], [0, 0]])
            phoneme = numpy.pad(phoneme, [[pre, post], [0, 0]])
            spec = numpy.pad(spec, [[pre, post], [0, 0]])
            silence = numpy.pad(silence, [pre, post], constant_values=True)
            padded = numpy.pad(padded, [pre, post], constant_values=True)

        if time_mask_max_second > 0 and time_mask_num > 0:
            # randint needs a positive exclusive bound, and a span can never be
            # longer than the window itself.
            mask_bound = min(int(rate * time_mask_max_second), len(f0) + 1)
            if mask_bound > 0:
                for _ in range(time_mask_num):
                    mask_length = numpy.random.randint(mask_bound)
                    mask_offset = numpy.random.randint(
                        len(f0) - mask_length + 1)
                    f0[mask_offset:mask_offset + mask_length] = 0
                    phoneme[mask_offset:mask_offset + mask_length] = 0

        return dict(
            f0=f0.astype(numpy.float32),
            phoneme=phoneme.astype(numpy.float32),
            spec=spec.astype(numpy.float32),
            silence=silence,
            padded=padded,
        )
Ejemplo n.º 6
0
    def extract_input(
        sampling_length: int,
        wave_data: Wave,
        silence_data: SamplingData,
        local_data: SamplingData,
        local_padding_size: int,
        padding_value=0,
    ):
        """
        Cut one random window out of a waveform together with the matching,
        padded slice of its local features.

        A start position is drawn on the local-feature grid and redrawn (up to
        10000 times) until the silence segment for that window is not entirely
        truthy. When the data is no longer than the requested window the start
        is fixed at 0.

        :param sampling_length: window length in waveform samples
        :param wave_data: provides ``wave`` and ``sampling_rate``
        :param silence_data: resampled to the waveform rate for each candidate window
        :param local_data: local features (``array`` and ``rate``)
        :param local_padding_size: padding on each side of the local window, in
            waveform samples; must be a multiple of the wave/local scale
        :param padding_value: fill value for local frames outside the data
        :return:
            wave: (sampling_length, )
            silence: (sampling_length, )
            local: (sampling_length // scale + pad, )
        :raises Exception: when no acceptable window is found in 10000 draws
        """
        sr = wave_data.sampling_rate
        sl = sampling_length

        assert sr % local_data.rate == 0
        # Number of waveform samples spanned by one local frame.
        l_scale = int(sr // local_data.rate)

        length = len(local_data.array) * l_scale
        assert (abs(length - len(wave_data.wave)) <
                l_scale * 4), f"{abs(length - len(wave_data.wave))} {l_scale}"

        assert local_padding_size % l_scale == 0
        l_pad = local_padding_size // l_scale

        l_length = length // l_scale
        l_sl = sl // l_scale

        for _ in range(10000):
            # randint raises on a non-positive bound, so only draw when the
            # data is longer than the window.
            if l_length > l_sl:
                l_offset = np.random.randint(l_length - l_sl)
            else:
                l_offset = 0
            offset = l_offset * l_scale

            silence = np.squeeze(
                silence_data.resample(sr, index=offset, length=sl))
            if not silence.all():
                break
        else:
            raise Exception("cannot pick not silence data")

        wave = wave_data.wave[offset:offset + sl]

        # local
        l_start, l_end = l_offset - l_pad, l_offset + l_sl + l_pad
        if l_start < 0 or l_end > l_length:
            # The padded window sticks out of the data: allocate a buffer filled
            # with padding_value and copy the available frames into place.
            shape = list(local_data.array.shape)
            shape[0] = l_sl + l_pad * 2
            local = np.ones(shape=shape,
                            dtype=local_data.array.dtype) * padding_value
            if l_start < 0:
                p_start = -l_start
                l_start = 0
            else:
                p_start = 0
            if l_end > l_length:
                p_end = l_sl + l_pad * 2 - (l_end - l_length)
                l_end = l_length
            else:
                p_end = l_sl + l_pad * 2
            local[p_start:p_end] = local_data.array[l_start:l_end]
        else:
            local = local_data.array[l_start:l_end]

        return wave, silence, local