Example 1
def main():
    save_arguments(arguments, output / 'arguments.json')

    # f0 converter
    if f0_trans_model_dir is not None:
        model = _get_predictor_model_path(f0_trans_model_dir,
                                          f0_trans_model_iteration)
        f0_converter = AcousticConverter(create_config(f0_trans_config),
                                         model,
                                         gpu=gpu)
    elif input_statistics is not None:
        f0_converter = F0Converter(input_statistics=input_statistics,
                                   target_statistics=target_statistics)
    else:
        f0_converter = None

    # acoustic converter
    config = create_config(voice_changer_config)
    model = _get_predictor_model_path(voice_changer_model_dir,
                                      voice_changer_model_iteration)
    acoustic_converter = AcousticConverter(
        config,
        model,
        gpu=gpu,
        f0_converter=f0_converter,
        out_sampling_rate=arguments.out_sampling_rate,
    )
    print(f'Loaded acoustic converter model "{model}"')

    # super resolution
    sr_config = create_sr_config(super_resolution_config)
    super_resolution = SuperResolution(sr_config,
                                       super_resolution_model,
                                       gpu=gpu)
    print(f'Loaded super resolution model "{super_resolution_model}"')

    # dataset's test
    if not disable_dataset_test:
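        # rebuild the dataset file list, shuffle it deterministically with
        # the dataset seed, and keep the last num_test paths as the test split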
        input_paths = list(
            sorted(
                [Path(p) for p in glob.glob(str(config.dataset.input_glob))]))
        numpy.random.RandomState(config.dataset.seed).shuffle(input_paths)
        paths_test = input_paths[-config.dataset.num_test:]
    else:
        paths_test = []

    # test data
    if test_wave_dir is not None:
        paths_test += list(test_wave_dir.glob('*.wav'))

    process_partial = partial(process,
                              acoustic_converter=acoustic_converter,
                              super_resolution=super_resolution)
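    # with no GPU, fan the test files out over a process pool; with a GPU,
    # convert serially in-process (a CUDA context is generally not fork-safe)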
    if gpu is None:
        list(multiprocessing.Pool().map(process_partial, paths_test))
    else:
        list(map(process_partial, paths_test))
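Examples 1 and 8 call `_get_predictor_model_path`, which none of the excerpts define; Example 3 inlines what appears to be the same logic, so a sketch consistent with that logic might look as follows (the `_extract_number` helper is an assumption, reconstructed to parse the iteration number out of `predictor_<n>.npz`):

import re
from pathlib import Path
from typing import Optional


def _extract_number(path: Path) -> int:
    # assumed helper: pull the iteration count out of 'predictor_<n>.npz'
    match = re.search(r'\d+', path.stem)
    return int(match.group()) if match else -1


def _get_predictor_model_path(model_dir: Path,
                              iteration: Optional[int] = None) -> Path:
    # mirrors the inline logic in Example 3: pick a specific iteration if
    # given, otherwise the newest predictor checkpoint in the directory
    if iteration is None:
        paths = model_dir.glob('predictor_*.npz')
        return sorted(paths, key=_extract_number)[-1]
    return model_dir / 'predictor_{}.npz'.format(iteration)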
Example 2
def convert_feature(path: Path, acoustic_converter: AcousticConverter):
    out = Path(arguments.output, path.stem + '.npy')
    if out.exists() and not arguments.enable_overwrite:
        return

    in_feature = acoustic_converter.load_acoustic_feature(path)
    out_feature = acoustic_converter.convert(in_feature)

    # save
    out_feature.save(path=out, ignores=arguments.ignore_feature)
Example 3
def main():
    if arguments.voice_changer_model_iteration is None:
        paths = voice_changer_model_dir.glob('predictor_*.npz')
        voice_changer_model = list(sorted(paths, key=_extract_number))[-1]
    else:
        voice_changer_model = voice_changer_model_dir / 'predictor_{}.npz'.format(
            arguments.voice_changer_model_iteration)

    config = create_config(arguments.voice_changer_config)
    acoustic_converter = AcousticConverter(config,
                                           voice_changer_model,
                                           gpu=arguments.gpu)

    sr_config = create_sr_config(arguments.super_resolution_config)
    super_resolution = SuperResolution(sr_config,
                                       super_resolution_model,
                                       gpu=arguments.gpu)

    # test data
    input_paths = list(
        sorted([Path(p) for p in glob.glob(str(config.dataset.input_glob))]))
    numpy.random.RandomState(config.dataset.seed).shuffle(input_paths)
    paths_test = input_paths[-config.dataset.num_test:]

    process_partial = partial(process,
                              acoustic_converter=acoustic_converter,
                              super_resolution=super_resolution)
    if arguments.gpu is None:
        pool = multiprocessing.Pool()
        pool.map(process_partial, paths_test)
    else:
        list(map(process_partial, paths_test))
Example 4
    def __init__(
        self,
        voice_changer_model: Path,
        voice_changer_config: Path,
        super_resolution_model: Path,
        super_resolution_config: Path,
        input_statistics: Path,
        target_statistics: Path,
        gpu: int,
    ):
        # f0 converter
        if input_statistics is not None:
            f0_converter = F0Converter(input_statistics=input_statistics,
                                       target_statistics=target_statistics)
        else:
            f0_converter = None

        # acoustic converter
        config = create_config(voice_changer_config)
        acoustic_converter = AcousticConverter(
            config,
            voice_changer_model,
            gpu=gpu,
            f0_converter=f0_converter,
        )

        # super resolution
        sr_config = create_sr_config(super_resolution_config)
        super_resolution = SuperResolution(sr_config,
                                           super_resolution_model,
                                           gpu=gpu)

        self.acoustic_converter = acoustic_converter
        self.super_resolution = super_resolution
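Example 4 shows only the constructor of an unnamed wrapper class, so a hypothetical instantiation may help; the class name `VoiceChangerWrapper` and every path below are placeholders rather than names from the original project, and the keyword arguments simply mirror the signature above.

from pathlib import Path

# hypothetical wrapper name and placeholder paths; the keywords mirror
# the __init__ signature shown in Example 4
wrapper = VoiceChangerWrapper(
    voice_changer_model=Path('trained/vc/predictor_100000.npz'),
    voice_changer_config=Path('trained/vc/config.json'),
    super_resolution_model=Path('trained/sr/predictor_100000.npz'),
    super_resolution_config=Path('trained/sr/config.json'),
    input_statistics=Path('stats/input_f0stat.npy'),
    target_statistics=Path('stats/target_f0stat.npy'),
    gpu=0,
)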
Example 5
    def _convert(p_in: Path, acoustic_converter: AcousticConverter,
                 super_resolution: SuperResolution):
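        # pipeline: wave -> acoustic feature -> keep only the effective
        # (non-silent) frames -> neural conversion -> reinsert silence ->
        # decode spectrogram -> super-resolution -> synthesized waveform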
        w_in = acoustic_converter.load_wave(p_in)
        f_in = acoustic_converter.extract_acoustic_feature(w_in)
        f_in_effective, effective = acoustic_converter.separate_effective(
            wave=w_in, feature=f_in)
        f_low = acoustic_converter.convert(f_in_effective)
        f_low = acoustic_converter.combine_silent(effective=effective,
                                                  feature=f_low)
        f_low = acoustic_converter.decode_spectrogram(f_low)
        s_high = super_resolution.convert(f_low.sp.astype(numpy.float32))

        f_low_sr = BYAcousticFeature(
            f0=f_low.f0,
            spectrogram=f_low.sp,
            aperiodicity=f_low.ap,
            mfcc=f_low.mc,
            voiced=f_low.voiced,
        )

        rate = acoustic_converter.out_sampling_rate
        wave = super_resolution(s_high,
                                acoustic_feature=f_low_sr,
                                sampling_rate=rate)
        return wave
Example 6
def process(p_in: Path, acoustic_converter: AcousticConverter):
    try:
        if p_in.suffix in ['.npy', '.npz']:
            p_in = Path(glob.glob(str(dataset_wave_dir / p_in.stem) + '.*')[0])

        # input wave
        w_in = acoustic_converter.load_wave(p_in)
        w_in.wave *= input_scale

        # input feature
        f_in = acoustic_converter.extract_acoustic_feature(w_in)
        f_in_effective, effective = acoustic_converter.separate_effective(
            wave=w_in, feature=f_in, threshold=threshold)

        # convert
        f_out = acoustic_converter.convert_loop(f_in_effective)
        f_out = acoustic_converter.combine_silent(effective=effective,
                                                  feature=f_out)
        f_out = acoustic_converter.decode_spectrogram(f_out)

        # save
        sampling_rate = acoustic_converter.out_sampling_rate
        frame_period = acoustic_converter.config.dataset.acoustic_param.frame_period
        wave = f_out.decode(sampling_rate=sampling_rate,
                            frame_period=frame_period)
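        # note: librosa.output.write_wav was removed in librosa 0.8;
        # soundfile.write(path, wave.wave, wave.sampling_rate) is the
        # usual replacement today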
        librosa.output.write_wav(y=wave.wave,
                                 path=str(output_dir / (p_in.stem + '.wav')),
                                 sr=wave.sampling_rate)
    except Exception:
        import traceback
        traceback.print_exc()
Example 7
def main():
    arguments.output.mkdir(exist_ok=True)
    save_arguments(arguments, arguments.output / 'arguments.json')

    config = create_config(arguments.vc_config)
    acoustic_converter = AcousticConverter(config,
                                           arguments.vc_model,
                                           gpu=arguments.gpu)

    paths = [Path(p) for p in glob.glob(arguments.input_glob)]

    pool = multiprocessing.Pool()
    it = pool.imap(
        partial(convert_feature, acoustic_converter=acoustic_converter), paths)
    list(tqdm.tqdm(it, total=len(paths)))
Example 8
def main():
    save_arguments(arguments, output_dir / 'arguments.json')

    # f0 converter
    if input_statistics is not None:
        f0_converter = F0Converter(input_statistics=input_statistics,
                                   target_statistics=target_statistics)
    else:
        f0_converter = None

    # acoustic converter
    config = create_config(config_path)
    model = _get_predictor_model_path(model_dir, model_iteration)
    acoustic_converter = AcousticConverter(
        config,
        model,
        gpu=gpu,
        f0_converter=f0_converter,
        out_sampling_rate=output_sampling_rate,
    )
    print(f'Loaded acoustic converter model "{model}"')

    # dataset test
    if not disable_dataset_test:
        input_paths = list(
            sorted(
                [Path(p) for p in glob.glob(str(config.dataset.input_glob))]))
        numpy.random.RandomState(config.dataset.seed).shuffle(input_paths)
        paths_test = input_paths[-config.dataset.num_test:]
    else:
        paths_test = []

    # additional test
    if test_wave_dir is not None:
        paths_test += list(test_wave_dir.glob('*.wav'))

    process_partial = partial(process, acoustic_converter=acoustic_converter)
    if gpu is None:
        list(multiprocessing.Pool().map(process_partial, paths_test))
    else:
        list(map(process_partial, paths_test))
Example 9
    def models(self):
        if self._models is None:
            f0_converter = F0Converter(
                input_statistics=self.input_statistics_path,
                target_statistics=self.target_statistics_path,
            )

            ac_config = self.ac_config
            sr_config = self.sr_config

            acoustic_converter = AcousticConverter(
                ac_config,
                self.stage1_model_path,
                f0_converter=f0_converter,
                out_sampling_rate=self.out_sampling_rate,
            )
            super_resolution = SuperResolution(
                sr_config,
                self.stage2_model_path,
            )
            self._models = acoustic_converter, super_resolution
        return self._models
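Example 9 reads like the body of a lazily evaluated property; the surrounding class is not shown, so the skeleton below is a reconstruction of the caching pattern it implies, with factory callables standing in for the real construction code.

class LazyModelPair:
    # Sketch of the pattern implied by Example 9: build the two expensive
    # converter objects once, on first access, then reuse the cached pair.
    def __init__(self, make_acoustic_converter, make_super_resolution):
        self._make_acoustic_converter = make_acoustic_converter
        self._make_super_resolution = make_super_resolution
        self._models = None

    @property
    def models(self):
        if self._models is None:
            self._models = (self._make_acoustic_converter(),
                            self._make_super_resolution())
        return self._models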
Example 10
    def make_yukarin_converter(
        input_statistics_path: Path,
        target_statistics_path: Path,
        stage1_model_path: Path,
        stage1_config_path: Path,
        stage2_model_path: Path,
        stage2_config_path: Path,
    ):
        logger = logging.getLogger('encode')
        init_logger(logger)
        logger.info('make_yukarin_converter')

        f0_converter = F0Converter(
            input_statistics=input_statistics_path,
            target_statistics=target_statistics_path,
        )

        config = create_config(stage1_config_path)
        acoustic_converter = AcousticConverter(
            config=config,
            model_path=stage1_model_path,
            gpu=0,
            f0_converter=f0_converter,
            out_sampling_rate=24000,
        )
        logger.info('model 1 loaded!')

        sr_config = create_sr_config(stage2_config_path)
        super_resolution = SuperResolution(
            config=sr_config,
            model_path=stage2_model_path,
            gpu=0,
        )
        logger.info('model 2 loaded!')
        return YukarinConverter(
            acoustic_converter=acoustic_converter,
            super_resolution=super_resolution,
        )
Example 11
def check(
    input_path: Path,
    input_time_length: int,
    output_path: Path,
    input_statistics_path: Path,
    target_statistics_path: Path,
    stage1_model_path: Path,
    stage1_config_path: Path,
    stage2_model_path: Path,
    stage2_config_path: Path,
):
    ac_config = create_config(stage1_config_path)
    sr_config = create_sr_config(stage2_config_path)
    input_rate = ac_config.dataset.acoustic_param.sampling_rate
    output_rate = sr_config.dataset.param.voice_param.sample_rate

    realtime_vocoder = RealtimeVocoder(
        acoustic_param=ac_config.dataset.acoustic_param,
        out_sampling_rate=output_rate,
        extract_f0_mode=VocodeMode.WORLD,
    )
    realtime_vocoder.create_synthesizer(
        buffer_size=1024,
        number_of_pointers=16,
    )

    f0_converter = F0Converter(
        input_statistics=input_statistics_path,
        target_statistics=target_statistics_path,
    )

    acoustic_converter = AcousticConverter(
        ac_config,
        stage1_model_path,
        f0_converter=f0_converter,
        out_sampling_rate=output_rate,
    )
    super_resolution = SuperResolution(
        sr_config,
        stage2_model_path,
    )

    voice_changer = VoiceChanger(
        acoustic_converter=acoustic_converter,
        super_resolution=super_resolution,
        output_sampling_rate=output_rate,
    )

    encode_stream = EncodeStream(vocoder=realtime_vocoder)
    convert_stream = ConvertStream(voice_changer=voice_changer)
    decode_stream = DecodeStream(vocoder=realtime_vocoder)

    num_data = input_time_length
    time_length = 1

    def _load_wave_and_split(time_length: float = 1):
        length = round(time_length * input_rate)
        wave, _ = librosa.load(str(input_path), sr=input_rate)
        return [
            wave[i * length:(i + 1) * length]
            for i in range(len(wave) // length)
        ]

    def _add(_stream: BaseStream, _datas):
        for i, data in zip(range(num_data), _datas):
            _stream.add(start_time=i * time_length, data=data)

    def _split_process(_stream: BaseStream, _extra_time: float):
        return [
            _stream.process(start_time=i * time_length,
                            time_length=time_length,
                            extra_time=_extra_time) for i in range(num_data)
        ]

    def _join_process(_stream: BaseStream, _extra_time: float):
        return _stream.process(start_time=0,
                               time_length=time_length * num_data,
                               extra_time=_extra_time)

    def _process_all_stream(
        _streams: Tuple[BaseStream, BaseStream, BaseStream],
        _datas,
        _split_flags: Tuple[bool, bool, bool],
        _extra_times: Tuple[float, float, float],
    ):
        for stream, split_flag, extra_time in zip(_streams, _split_flags,
                                                  _extra_times):
            _add(stream, _datas)
            if split_flag:
                _datas = _split_process(stream, _extra_time=extra_time)
            else:
                _datas = [_join_process(stream, _extra_time=extra_time)]
        return _datas

    def _concat_and_save(_waves, _path: Path):
        wave = numpy.concatenate(_waves).astype(numpy.float32)
        librosa.output.write_wav(str(_path), wave, output_rate)

    def _remove(_streams: Tuple[BaseStream, BaseStream, BaseStream]):
        for stream in _streams:
            stream.remove(end_time=num_data)

    waves = _load_wave_and_split(time_length=time_length)[:num_data]

    streams = (encode_stream, convert_stream, decode_stream)

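    # run all three stages chunk-by-chunk; only the convert stage gets an
    # extra second of context around each chunk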
    datas = _process_all_stream(streams,
                                waves,
                                _split_flags=(True, True, True),
                                _extra_times=(0, 1, 0))
    _concat_and_save(datas, output_path)
    _remove(streams)
Example 12
def process(p_in: Path, acoustic_converter: AcousticConverter,
            super_resolution: SuperResolution):
    try:
        if p_in.suffix in ['.npy', '.npz']:
            p_in = Path(
                glob.glob(str(dataset_input_wave_dir / p_in.stem) + '.*')[0])

        w_in = acoustic_converter.load_wave(p_in)
        f_in = acoustic_converter.extract_acoustic_feature(w_in)
        f_in_effective, effective = acoustic_converter.separate_effective(
            wave=w_in, feature=f_in)
        f_low = acoustic_converter.convert(f_in_effective)
        f_low = acoustic_converter.combine_silent(effective=effective,
                                                  feature=f_low)
        if filter_size is not None:
            f_low.f0 = AcousticConverter.filter_f0(f_low.f0,
                                                   filter_size=filter_size)
        f_low = acoustic_converter.decode_spectrogram(f_low)
        s_high = super_resolution.convert(f_low.sp.astype(numpy.float32))

        # target
        paths = glob.glob(str(dataset_target_wave_dir / p_in.stem) + '.*')
        has_true = len(paths) > 0
        if has_true:
            p_true = Path(paths[0])
            w_true = acoustic_converter.load_wave(p_true)
            f_true = acoustic_converter.extract_acoustic_feature(w_true)

        # save figure
        fig = plt.figure(figsize=[36, 22])

        plt.subplot(4, 1, 1)
        plt.imshow(numpy.log(f_in.sp).T, aspect='auto', origin='lower')
        plt.plot(f_in.f0, 'w')
        plt.colorbar()

        plt.subplot(4, 1, 2)
        plt.imshow(numpy.log(f_low.sp).T, aspect='auto', origin='lower')
        plt.plot(f_low.f0, 'w')
        plt.colorbar()

        plt.subplot(4, 1, 3)
        plt.imshow(numpy.log(s_high).T, aspect='auto', origin='lower')
        plt.colorbar()

        if has_true:
            plt.subplot(4, 1, 4)
            plt.imshow(numpy.log(f_true.sp).T, aspect='auto', origin='lower')
            plt.plot(f_true.f0, 'w')
            plt.colorbar()

        fig.savefig(output / (p_in.stem + '.png'))

        # save wave
        f_low_sr = BYAcousticFeature(
            f0=f_low.f0,
            spectrogram=f_low.sp,
            aperiodicity=f_low.ap,
            mfcc=f_low.mc,
            voiced=f_low.voiced,
        )

        rate = acoustic_converter.out_sampling_rate
        wave = super_resolution(s_high,
                                acoustic_feature=f_low_sr,
                                sampling_rate=rate)
        librosa.output.write_wav(y=wave.wave,
                                 path=str(output / (p_in.stem + '.wav')),
                                 sr=rate)
    except Exception:
        import traceback
        traceback.print_exc()
Example 13

model_base_path = Path('./trained/').expanduser()
test_data_path = Path('tests/test-deep-learning-yuduki-yukari.wav')
test_output_path = Path('output.wav')
input_statistics_path = model_base_path / 'f0_statistics/hiho_f0stat.npy'
target_statistics_path = model_base_path / 'f0_statistics/yukari_f0stat.npy'

print('model loading...', flush=True)

f0_converter = F0Converter(input_statistics=input_statistics_path, target_statistics=target_statistics_path)

model_path = model_base_path / Path('pp-el8-wof0/predictor_2260000.npz')
config_path = model_base_path / Path('pp-el8-wof0/config.json')
config = create_config(config_path)
acoustic_converter = AcousticConverter(config, model_path, f0_converter=f0_converter)
print('model 1 loaded!', flush=True)

model_path = model_base_path / Path('sr-noise3/predictor_180000.npz')
config_path = model_base_path / Path('sr-noise3/config.json')
sr_config = create_sr_config(config_path)
super_resolution = SuperResolution(sr_config, model_path)
print('model 2 loaded!', flush=True)

audio_config = AudioConfig(
    rate=config.dataset.acoustic_param.sampling_rate,
    chunk=config.dataset.acoustic_param.sampling_rate,
    vocoder_buffer_size=config.dataset.acoustic_param.sampling_rate // 16,
    out_norm=4.5,
)
frame_period = config.dataset.acoustic_param.frame_period
Example 14
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-odn', '--output_device_name')
    args = parser.parse_args()

    print('model loading...', flush=True)

    queue_input_wave = Queue()
    queue_input_feature = Queue()
    queue_output_feature = Queue()
    queue_output_wave = Queue()

    input_statistics_path = Path('./dat/out_1st_my_npy/')
    target_statistics_path = Path('./dat/out_1st_yukari_npy/')
    f0_converter = F0Converter(input_statistics=input_statistics_path,
                               target_statistics=target_statistics_path)
    # model_path = Path('./trained/f0trans-wmc-multi-ref-el8-woD/predictor_13840000.npz')
    # config_path = Path('./trained/f0trans-wmc-multi-ref-el8-woD/config.json')
    # f0_converter = AcousticConverter(create_config(config_path), model_path, gpu=0)

    model_path = Path(
        './trained/multi-16k-ref24k-el8-woD-gbc8/predictor_2910000.npz')
    config_path = Path('./trained/multi-16k-ref24k-el8-woD-gbc8/config.json')
    # model_path = Path('./trained/akane-multi-ref-el8-woD-gbc8/predictor_5130000.npz')
    # config_path = Path('./trained/akane-multi-ref-el8-woD-gbc8/config.json')
    # model_path = Path('./trained/aoi-multi-ref-el8-woD-gbc8/predictor_5720000.npz')
    # config_path = Path('./trained/aoi-multi-ref-el8-woD-gbc8/config.json')
    # model_path = Path('./trained/zunko-multi-ref-el8-woD-gbc8/predictor_5710000.npz')
    # config_path = Path('./trained/zunko-multi-ref-el8-woD-gbc8/config.json')
    config = create_config(config_path)
    acoustic_converter = AcousticConverter(
        config,
        model_path,
        gpu=0,
        f0_converter=f0_converter,
        out_sampling_rate=24000,
    )
    print('model 1 loaded!', flush=True)

    model_path = Path('./dat/model/yukari_2nd/predictor_120000.npz')
    config_path = Path('./dat/model/yukari_2nd/config.json')
    # model_path = Path('./trained/akane-super-resolution/predictor_240000.npz')
    # config_path = Path('./trained/akane-super-resolution/config.json')
    sr_config = create_sr_config(config_path)
    super_resolution = SuperResolution(sr_config, model_path, gpu=0)
    print('model 2 loaded!', flush=True)

    audio_instance = pyaudio.PyAudio()
    audio_config = AudioConfig(
        in_rate=config.dataset.acoustic_param.sampling_rate,
        out_rate=24000,
        frame_period=config.dataset.acoustic_param.frame_period,
        in_audio_chunk=config.dataset.acoustic_param.sampling_rate,
        out_audio_chunk=24000,
        vocoder_buffer_size=config.dataset.acoustic_param.sampling_rate // 16,
        in_norm=1 / 8,
        out_norm=2.0,
        silent_threshold=-80.0,
    )

    conversion_flag = True

    voice_changer_stream = VoiceChangerStream(
        in_sampling_rate=audio_config.in_rate,
        frame_period=config.dataset.acoustic_param.frame_period,
        order=config.dataset.acoustic_param.order,
        in_dtype=numpy.float32,
    )

    wrapper = VoiceChangerStreamWrapper(
        voice_changer_stream=voice_changer_stream,
        extra_time_pre=0.2,
        extra_time=0.5,
    )

    process_encoder = Process(target=encode_worker,
                              kwargs=dict(
                                  config=config,
                                  wrapper=wrapper,
                                  audio_config=audio_config,
                                  queue_input=queue_input_wave,
                                  queue_output=queue_input_feature,
                              ))
    process_encoder.start()

    process_converter = Process(target=convert_worker,
                                kwargs=dict(
                                    config=config,
                                    wrapper=wrapper,
                                    acoustic_converter=acoustic_converter,
                                    super_resolution=super_resolution,
                                    audio_config=audio_config,
                                    queue_input=queue_input_feature,
                                    queue_output=queue_output_feature,
                                ))
    process_converter.start()

    process_decoder = Process(target=decode_worker,
                              kwargs=dict(
                                  config=config,
                                  wrapper=wrapper,
                                  audio_config=audio_config,
                                  queue_input=queue_output_feature,
                                  queue_output=queue_output_wave,
                              ))
    process_decoder.start()

    # output device
    name = args.output_device_name
    if name is None:
        info = audio_instance.get_default_output_device_info()
        output_device_index = info['index']
    else:
        for i in range(audio_instance.get_device_count()):
            if name in str(audio_instance.get_device_info_by_index(i)['name']):
                output_device_index = i
                break
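        # for/else: the else branch runs only when no device name matched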
        else:
            print('device not found')
            exit(1)

    # audio stream
    print('output_device_index', output_device_index)
    audio_input_stream = audio_instance.open(
        format=pyaudio.paFloat32,
        channels=1,
        rate=audio_config.in_rate,
        frames_per_buffer=audio_config.in_audio_chunk,
        input=True,
    )

    audio_output_stream = audio_instance.open(
        format=pyaudio.paFloat32,
        channels=1,
        rate=audio_config.out_rate,
        frames_per_buffer=audio_config.out_audio_chunk,
        output=True,
        output_device_index=output_device_index,
    )

    # signal
    def signal_handler(*args, **kwargs):
        process_encoder.terminate()
        process_converter.terminate()
        process_decoder.terminate()
        sys.exit(0)

    signal.signal(signal.SIGINT, signal_handler)

    # key event
    def key_handler(key):
        nonlocal conversion_flag
        if key == pynput.keyboard.Key.space:  # switch
            conversion_flag = not conversion_flag

    key_listener = pynput.keyboard.Listener(on_press=key_handler)
    key_listener.start()

    index_input = 0
    index_output = 0
    while True:
        # input audio
        in_data = audio_input_stream.read(audio_config.in_audio_chunk)
        wave = numpy.frombuffer(in_data,
                                dtype=numpy.float32) * audio_config.in_norm

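        # carry both the raw input (for pass-through) and the wave to be
        # converted, so the space-key toggle can pick either at output time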
        item = Item(
            original=wave * 5,
            item=wave,
            index=index_input,
            conversion_flag=conversion_flag,
        )
        queue_input_wave.put(item)
        index_input += 1

        print('queue_input_wave', queue_input_wave.qsize(), flush=True)
        print('queue_input_feature', queue_input_feature.qsize(), flush=True)
        print('queue_output_feature', queue_output_feature.qsize(), flush=True)
        print('queue_output_wave', queue_output_wave.qsize(), flush=True)

        # output
        wave: Optional[numpy.ndarray] = None
        popped_list: List[Item] = []

        while True:
            try:
                while True:
                    item: Item = queue_output_wave.get_nowait()
                    popped_list.append(item)
            except queue.Empty:
                pass

            print('index_output', index_output)
            item = next(
                filter(lambda ii: ii.index == index_output, popped_list), None)
            if item is None:
                break

            popped_list.remove(item)

            index_output += 1
            if item.item is None:
                continue

            wave = item.item if item.conversion_flag else item.original
            break

        if wave is not None:
            wave *= audio_config.out_norm
            b = wave.astype(numpy.float32).tobytes()
            audio_output_stream.write(b)
Example 15
def process(p_in: Path, acoustic_converter: AcousticConverter,
            super_resolution: SuperResolution):
    try:
        if p_in.suffix in ['.npy', '.npz']:
            p_in = Path(
                glob.glob(str(dataset_input_wave_dir / p_in.stem) + '.*')[0])

        w_in = acoustic_converter.load_wave(p_in)
        f_in = acoustic_converter.extract_acoustic_feature(w_in)
        f_low = acoustic_converter.convert(f_in)
        # f_low = AcousticFeature(
        #     aperiodicity=f_low.aperiodicity,
        #     mfcc=f_low.mfcc,
        #     voiced=f_low.voiced,
        #     spectrogram=f_low.spectrogram,
        #     f0=scipy.ndimage.uniform_filter(f_low.f0, size=(5, 1)).astype(numpy.float32),
        # )
        s_high = super_resolution.convert(f_low.sp.astype(numpy.float32))

        # target
        p_true = Path(
            glob.glob(str(dataset_target_wave_dir / p_in.stem) + '.*')[0])
        w_true = acoustic_converter.load_wave(p_true)
        f_true = acoustic_converter.extract_acoustic_feature(w_true)

        # save figure
        fig = plt.figure(figsize=[18, 8])

        plt.subplot(4, 1, 1)
        plt.imshow(numpy.log(f_in.sp).T, aspect='auto', origin='lower')
        plt.plot(f_in.f0, 'w')
        plt.colorbar()

        plt.subplot(4, 1, 2)
        plt.imshow(numpy.log(f_low.sp).T, aspect='auto', origin='lower')
        plt.plot(f_low.f0, 'w')
        plt.colorbar()

        plt.subplot(4, 1, 3)
        plt.imshow(numpy.log(s_high).T, aspect='auto', origin='lower')
        plt.colorbar()

        plt.subplot(4, 1, 4)
        plt.imshow(numpy.log(f_true.sp).T, aspect='auto', origin='lower')
        plt.plot(f_true.f0, 'w')
        plt.colorbar()

        fig.savefig(output / (p_in.stem + '.png'))

        # save wave
        f_low_sr = BYAcousticFeature(
            f0=f_low.f0,
            spectrogram=f_low.sp,
            aperiodicity=f_low.ap,
            mfcc=f_low.mc,
            voiced=f_low.voiced,
        )

        rate = acoustic_converter.out_sampling_rate
        wave = super_resolution(s_high,
                                acoustic_feature=f_low_sr,
                                sampling_rate=rate)
        librosa.output.write_wav(y=wave.wave,
                                 path=str(output / (p_in.stem + '.wav')),
                                 sr=rate)
    except Exception:
        import traceback
        print('error!', str(p_in))
        traceback.print_exc()
Example 16
def main():
    print('model loading...', flush=True)

    queue_input_wave = Queue()
    queue_input_feature = Queue()
    queue_output_feature = Queue()
    queue_output_wave = Queue()

    input_statistics_path = Path('./trained/f0_statistics/hiho_f0stat.npy')
    target_statistics_path = Path('./trained/f0_statistics/yukari_f0stat.npy')
    f0_converter = F0Converter(input_statistics=input_statistics_path,
                               target_statistics=target_statistics_path)

    model_path = Path('./trained/pp-el8-wof0/predictor_2260000.npz')
    config_path = Path('./trained/pp-el8-wof0/config.json')
    config = create_config(config_path)
    acoustic_converter = AcousticConverter(config,
                                           model_path,
                                           gpu=0,
                                           f0_converter=f0_converter)
    print('model 1 loaded!', flush=True)

    model_path = Path('./trained/sr-noise3/predictor_180000.npz')
    config_path = Path('./trained/sr-noise3/config.json')
    sr_config = create_sr_config(config_path)
    super_resolution = SuperResolution(sr_config, model_path, gpu=0)
    print('model 2 loaded!', flush=True)

    audio_instance = pyaudio.PyAudio()
    audio_config = AudioConfig(
        rate=config.dataset.acoustic_param.sampling_rate,
        frame_period=config.dataset.acoustic_param.frame_period,
        audio_chunk=config.dataset.acoustic_param.sampling_rate,
        convert_chunk=config.dataset.acoustic_param.sampling_rate,
        vocoder_buffer_size=config.dataset.acoustic_param.sampling_rate // 16,
        in_norm=1 / 8,
        out_norm=4.0,
        silent_threshold=-80.0,
    )

    voice_changer_stream = VoiceChangerStream(
        sampling_rate=audio_config.rate,
        frame_period=config.dataset.acoustic_param.frame_period,
        order=config.dataset.acoustic_param.order,
        in_dtype=numpy.float32,
    )

    wrapper = VoiceChangerStreamWrapper(
        voice_changer_stream=voice_changer_stream,
        extra_time_pre=0.2,
        extra_time=0.5,
    )

    process_encoder = Process(target=encode_worker,
                              kwargs=dict(
                                  config=config,
                                  wrapper=wrapper,
                                  audio_config=audio_config,
                                  queue_input=queue_input_wave,
                                  queue_output=queue_input_feature,
                              ))
    process_encoder.start()

    process_converter = Process(target=convert_worker,
                                kwargs=dict(
                                    config=config,
                                    wrapper=wrapper,
                                    acoustic_converter=acoustic_converter,
                                    super_resolution=super_resolution,
                                    audio_config=audio_config,
                                    queue_input=queue_input_feature,
                                    queue_output=queue_output_feature,
                                ))
    process_converter.start()

    process_decoder = Process(target=decode_worker,
                              kwargs=dict(
                                  config=config,
                                  wrapper=wrapper,
                                  audio_config=audio_config,
                                  queue_input=queue_output_feature,
                                  queue_output=queue_output_wave,
                              ))
    process_decoder.start()

    audio_stream = audio_instance.open(
        format=pyaudio.paFloat32,
        channels=1,
        rate=audio_config.rate,
        frames_per_buffer=audio_config.audio_chunk,
        input=True,
        output=True,
    )

    while True:
        # input audio
        in_data = audio_stream.read(audio_config.audio_chunk)
        wave = numpy.frombuffer(in_data,
                                dtype=numpy.float32) * audio_config.in_norm
        queue_input_wave.put(wave)

        print('queue_input_wave', queue_input_wave.qsize(), flush=True)
        print('queue_input_feature', queue_input_feature.qsize(), flush=True)
        print('queue_output_feature', queue_output_feature.qsize(), flush=True)
        print('queue_output_wave', queue_output_wave.qsize(), flush=True)

        # output
        try:
            wave = queue_output_wave.get_nowait()
        except Exception:
            wave = None

        if wave is not None:
            wave *= audio_config.out_norm
            b = wave.astype(numpy.float32).tobytes()
            audio_stream.write(b)
Example 17
input_statistics_path = model_base_path / 'f0_statistics/hiho_f0stat.npy'
target_statistics_path = model_base_path / 'f0_statistics/yukari_f0stat.npy'

print('model loading...', flush=True)

f0_converter = F0Converter(input_statistics=input_statistics_path,
                           target_statistics=target_statistics_path)

model_path = Path(
    './trained/multi-16k-ref24k-el8-woD-gbc8/predictor_2910000.npz')
config_path = Path('./trained/multi-16k-ref24k-el8-woD-gbc8/config.json')
config = create_config(config_path)
acoustic_converter = AcousticConverter(
    config,
    model_path,
    gpu=0,
    f0_converter=f0_converter,
    out_sampling_rate=24000,
)
print('model 1 loaded!', flush=True)

model_path = model_base_path / Path('sr-noise3/predictor_180000.npz')
config_path = model_base_path / Path('sr-noise3/config.json')
sr_config = create_sr_config(config_path)
super_resolution = SuperResolution(sr_config, model_path)
print('model 2 loaded!', flush=True)

audio_config = AudioConfig(
    in_rate=config.dataset.acoustic_param.sampling_rate,
    out_rate=24000,
    chunk=config.dataset.acoustic_param.sampling_rate,