Example 1
def main():
    save_arguments(arguments, output / 'arguments.json')

    # f0 converter
    if f0_trans_model_dir is not None:
        model = _get_predictor_model_path(f0_trans_model_dir,
                                          f0_trans_model_iteration)
        f0_converter = AcousticConverter(create_config(f0_trans_config),
                                         model,
                                         gpu=gpu)
    elif input_statistics is not None:
        f0_converter = F0Converter(input_statistics=input_statistics,
                                   target_statistics=target_statistics)
    else:
        f0_converter = None

    # acoustic converter
    config = create_config(voice_changer_config)
    model = _get_predictor_model_path(voice_changer_model_dir,
                                      voice_changer_model_iteration)
    acoustic_converter = AcousticConverter(
        config,
        model,
        gpu=gpu,
        f0_converter=f0_converter,
        out_sampling_rate=arguments.out_sampling_rate,
    )
    print(f'Loaded acoustic converter model "{model}"')

    # super resolution
    sr_config = create_sr_config(super_resolution_config)
    super_resolution = SuperResolution(sr_config,
                                       super_resolution_model,
                                       gpu=gpu)
    print(f'Loaded super resolution model "{super_resolution_model}"')

    # dataset's test
    if not disable_dataset_test:
        input_paths = list(
            sorted(
                [Path(p) for p in glob.glob(str(config.dataset.input_glob))]))
        numpy.random.RandomState(config.dataset.seed).shuffle(input_paths)
        paths_test = input_paths[-config.dataset.num_test:]
    else:
        paths_test = []

    # test data
    if test_wave_dir is not None:
        paths_test += list(test_wave_dir.glob('*.wav'))

    process_partial = partial(process,
                              acoustic_converter=acoustic_converter,
                              super_resolution=super_resolution)
    if gpu is None:
        list(multiprocessing.Pool().map(process_partial, paths_test))
    else:
        list(map(process_partial, paths_test))
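
save_arguments is not defined in any of these snippets. A minimal sketch, assuming it simply persists the argparse namespace as JSON (the file name arguments.json suggests as much):

import json
from pathlib import Path


def save_arguments(arguments, path: Path):
    # dump the argparse.Namespace as JSON; default=str handles Path values
    path.write_text(json.dumps(vars(arguments), default=str, indent=2))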
Example 2
def main():
    if arguments.voice_changer_model_iteration is None:
        paths = voice_changer_model_dir.glob('predictor_*.npz')
        voice_changer_model = list(sorted(paths, key=_extract_number))[-1]
    else:
        voice_changer_model = voice_changer_model_dir / 'predictor_{}.npz'.format(
            arguments.voice_changer_model_iteration)

    config = create_config(arguments.voice_changer_config)
    acoustic_converter = AcousticConverter(config,
                                           voice_changer_model,
                                           gpu=arguments.gpu)

    sr_config = create_sr_config(arguments.super_resolution_config)
    super_resolution = SuperResolution(sr_config,
                                       super_resolution_model,
                                       gpu=arguments.gpu)

    # test data
    input_paths = list(
        sorted([Path(p) for p in glob.glob(str(config.dataset.input_glob))]))
    numpy.random.RandomState(config.dataset.seed).shuffle(input_paths)
    paths_test = input_paths[-config.dataset.num_test:]

    process_partial = partial(process,
                              acoustic_converter=acoustic_converter,
                              super_resolution=super_resolution)
    if arguments.gpu is None:
        pool = multiprocessing.Pool()
        pool.map(process_partial, paths_test)
    else:
        list(map(process_partial, paths_test))
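
_extract_number, used above as the sort key for snapshot files, is also not shown. Assuming snapshot names like predictor_2260000.npz, a plausible sketch pulls the iteration number out of the file stem:

import re
from pathlib import Path


def _extract_number(path: Path) -> int:
    # predictor_2260000.npz -> 2260000; fall back to -1 for odd names
    match = re.search(r'\d+', path.stem)
    return int(match.group()) if match else -1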
Example 3
    def __init__(
        self,
        voice_changer_model: Path,
        voice_changer_config: Path,
        super_resolution_model: Path,
        super_resolution_config: Path,
        input_statistics: Path,
        target_statistics: Path,
        gpu: int,
    ):
        # f0 converter
        if input_statistics is not None:
            f0_converter = F0Converter(input_statistics=input_statistics,
                                       target_statistics=target_statistics)
        else:
            f0_converter = None

        # acoustic converter
        config = create_config(voice_changer_config)
        acoustic_converter = AcousticConverter(
            config,
            voice_changer_model,
            gpu=gpu,
            f0_converter=f0_converter,
        )

        # super resolution
        sr_config = create_sr_config(super_resolution_config)
        super_resolution = SuperResolution(sr_config,
                                           super_resolution_model,
                                           gpu=gpu)

        self.acoustic_converter = acoustic_converter
        self.super_resolution = super_resolution
Example 4
def main():
    arguments.output.mkdir(exist_ok=True)
    save_arguments(arguments, arguments.output / 'arguments.json')

    config = create_config(arguments.vc_config)
    acoustic_converter = AcousticConverter(config,
                                           arguments.vc_model,
                                           gpu=arguments.gpu)

    paths = [Path(p) for p in glob.glob(arguments.input_glob)]

    pool = multiprocessing.Pool()
    it = pool.imap(
        partial(convert_feature, acoustic_converter=acoustic_converter), paths)
    list(tqdm.tqdm(it, total=len(paths)))
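
The imap/tqdm combination above is worth isolating: pool.imap yields results lazily, so tqdm can advance once per finished file, and the final list() simply drains the iterator. A self-contained sketch of the same idiom with a stand-in worker:

import multiprocessing
from functools import partial

import tqdm


def work(x, scale=1):
    # stand-in for convert_feature; must be top-level to be picklable
    return x * scale


if __name__ == '__main__':
    items = list(range(100))
    with multiprocessing.Pool() as pool:
        it = pool.imap(partial(work, scale=2), items)
        results = list(tqdm.tqdm(it, total=len(items)))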
Example 5
def main():
    save_arguments(arguments, output_dir / 'arguments.json')

    # f0 converter
    if input_statistics is not None:
        f0_converter = F0Converter(input_statistics=input_statistics,
                                   target_statistics=target_statistics)
    else:
        f0_converter = None

    # acoustic converter
    config = create_config(config_path)
    model = _get_predictor_model_path(model_dir, model_iteration)
    acoustic_converter = AcousticConverter(
        config,
        model,
        gpu=gpu,
        f0_converter=f0_converter,
        out_sampling_rate=output_sampling_rate,
    )
    print(f'Loaded acoustic converter model "{model}"')

    # dataset test
    if not disable_dataset_test:
        input_paths = list(
            sorted(
                [Path(p) for p in glob.glob(str(config.dataset.input_glob))]))
        numpy.random.RandomState(config.dataset.seed).shuffle(input_paths)
        paths_test = input_paths[-config.dataset.num_test:]
    else:
        paths_test = []

    # additional test
    if test_wave_dir is not None:
        paths_test += list(test_wave_dir.glob('*.wav'))

    process_partial = partial(process, acoustic_converter=acoustic_converter)
    if gpu is None:
        list(multiprocessing.Pool().map(process_partial, paths_test))
    else:
        list(map(process_partial, paths_test))
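
The process worker itself is not included in any of these examples. Its contract can be read off the partial call: it takes the input path positionally and the converters as keyword arguments. A hypothetical skeleton (the actual conversion calls are project-specific and omitted):

from pathlib import Path


def process(path: Path, acoustic_converter=None, super_resolution=None):
    # hypothetical outline: load the wave at `path`, convert it with
    # acoustic_converter (and super_resolution when given), then save
    # the converted result next to the output directory.
    ...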
Example 6
    def make_yukarin_converter(
        input_statistics_path: Path,
        target_statistics_path: Path,
        stage1_model_path: Path,
        stage1_config_path: Path,
        stage2_model_path: Path,
        stage2_config_path: Path,
    ):
        logger = logging.getLogger('encode')
        init_logger(logger)
        logger.info('make_yukarin_converter')

        f0_converter = F0Converter(
            input_statistics=input_statistics_path,
            target_statistics=target_statistics_path,
        )

        config = create_config(stage1_config_path)
        acoustic_converter = AcousticConverter(
            config=config,
            model_path=stage1_model_path,
            gpu=0,
            f0_converter=f0_converter,
            out_sampling_rate=24000,
        )
        logger.info('model 1 loaded!')

        sr_config = create_sr_config(stage2_config_path)
        super_resolution = SuperResolution(
            config=sr_config,
            model_path=stage2_model_path,
            gpu=0,
        )
        logger.info('model 2 loaded!')
        return YukarinConverter(
            acoustic_converter=acoustic_converter,
            super_resolution=super_resolution,
        )
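
A usage sketch, assuming make_yukarin_converter is reachable from the caller (in the snippet it appears indented, e.g. inside a class); all paths below are placeholders modeled on those in the other snippets:

from pathlib import Path

converter = make_yukarin_converter(
    input_statistics_path=Path('f0_statistics/input_f0stat.npy'),
    target_statistics_path=Path('f0_statistics/target_f0stat.npy'),
    stage1_model_path=Path('stage1/predictor.npz'),
    stage1_config_path=Path('stage1/config.json'),
    stage2_model_path=Path('stage2/predictor.npz'),
    stage2_config_path=Path('stage2/config.json'),
)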
Example 7
def main():
    print('model loading...', flush=True)

    queue_input_wave = Queue()
    queue_input_feature = Queue()
    queue_output_feature = Queue()
    queue_output_wave = Queue()

    input_statistics_path = Path('./trained/f0_statistics/hiho_f0stat.npy')
    target_statistics_path = Path('./trained/f0_statistics/yukari_f0stat.npy')
    f0_converter = F0Converter(input_statistics=input_statistics_path,
                               target_statistics=target_statistics_path)

    model_path = Path('./trained/pp-el8-wof0/predictor_2260000.npz')
    config_path = Path('./trained/pp-el8-wof0/config.json')
    config = create_config(config_path)
    acoustic_converter = AcousticConverter(config,
                                           model_path,
                                           gpu=0,
                                           f0_converter=f0_converter)
    print('model 1 loaded!', flush=True)

    model_path = Path('./trained/sr-noise3/predictor_180000.npz')
    config_path = Path('./trained/sr-noise3/config.json')
    sr_config = create_sr_config(config_path)
    super_resolution = SuperResolution(sr_config, model_path, gpu=0)
    print('model 2 loaded!', flush=True)

    audio_instance = pyaudio.PyAudio()
    audio_config = AudioConfig(
        rate=config.dataset.acoustic_param.sampling_rate,
        frame_period=config.dataset.acoustic_param.frame_period,
        audio_chunk=config.dataset.acoustic_param.sampling_rate,
        convert_chunk=config.dataset.acoustic_param.sampling_rate,
        vocoder_buffer_size=config.dataset.acoustic_param.sampling_rate // 16,
        in_norm=1 / 8,
        out_norm=4.0,
        silent_threshold=-80.0,
    )

    voice_changer_stream = VoiceChangerStream(
        sampling_rate=audio_config.rate,
        frame_period=config.dataset.acoustic_param.frame_period,
        order=config.dataset.acoustic_param.order,
        in_dtype=numpy.float32,
    )

    wrapper = VoiceChangerStreamWrapper(
        voice_changer_stream=voice_changer_stream,
        extra_time_pre=0.2,
        extra_time=0.5,
    )

    process_encoder = Process(target=encode_worker,
                              kwargs=dict(
                                  config=config,
                                  wrapper=wrapper,
                                  audio_config=audio_config,
                                  queue_input=queue_input_wave,
                                  queue_output=queue_input_feature,
                              ))
    process_encoder.start()

    process_converter = Process(target=convert_worker,
                                kwargs=dict(
                                    config=config,
                                    wrapper=wrapper,
                                    acoustic_converter=acoustic_converter,
                                    super_resolution=super_resolution,
                                    audio_config=audio_config,
                                    queue_input=queue_input_feature,
                                    queue_output=queue_output_feature,
                                ))
    process_converter.start()

    process_decoder = Process(target=decode_worker,
                              kwargs=dict(
                                  config=config,
                                  wrapper=wrapper,
                                  audio_config=audio_config,
                                  queue_input=queue_output_feature,
                                  queue_output=queue_output_wave,
                              ))
    process_decoder.start()

    audio_stream = audio_instance.open(
        format=pyaudio.paFloat32,
        channels=1,
        rate=audio_config.rate,
        frames_per_buffer=audio_config.audio_chunk,
        input=True,
        output=True,
    )

    while True:
        # input audio
        in_data = audio_stream.read(audio_config.audio_chunk)
        # numpy.fromstring is deprecated for binary input; frombuffer replaces it
        wave = numpy.frombuffer(in_data,
                                dtype=numpy.float32) * audio_config.in_norm
        queue_input_wave.put(wave)

        print('queue_input_wave', queue_input_wave.qsize(), flush=True)
        print('queue_input_feature', queue_input_feature.qsize(), flush=True)
        print('queue_output_feature', queue_output_feature.qsize(), flush=True)
        print('queue_output_wave', queue_output_wave.qsize(), flush=True)

        # output
        try:
            wave = queue_output_wave.get_nowait()
        except queue.Empty:
            wave = None

        if wave is not None:
            wave *= audio_config.out_norm
            b = wave.astype(numpy.float32).tobytes()
            audio_stream.write(b)
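
The raw-audio convention in the loop above, shown in isolation: PyAudio's read() returns raw bytes that numpy.frombuffer reinterprets as float32 samples, and tobytes() reverses the trip for write(); in_norm and out_norm only rescale amplitude. No audio hardware is needed to verify the round trip:

import numpy

chunk = numpy.random.uniform(-1, 1, 1024).astype(numpy.float32)
raw = chunk.tobytes()                                  # what write() consumes
decoded = numpy.frombuffer(raw, dtype=numpy.float32)   # what read() yields
assert numpy.array_equal(chunk, decoded)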
Example 8
def check(
    input_path: Path,
    input_time_length: int,
    output_path: Path,
    input_statistics_path: Path,
    target_statistics_path: Path,
    stage1_model_path: Path,
    stage1_config_path: Path,
    stage2_model_path: Path,
    stage2_config_path: Path,
):
    ac_config = create_config(stage1_config_path)
    sr_config = create_sr_config(stage2_config_path)
    input_rate = ac_config.dataset.acoustic_param.sampling_rate
    output_rate = sr_config.dataset.param.voice_param.sample_rate

    realtime_vocoder = RealtimeVocoder(
        acoustic_param=ac_config.dataset.acoustic_param,
        out_sampling_rate=output_rate,
        extract_f0_mode=VocodeMode.WORLD,
    )
    realtime_vocoder.create_synthesizer(
        buffer_size=1024,
        number_of_pointers=16,
    )

    f0_converter = F0Converter(
        input_statistics=input_statistics_path,
        target_statistics=target_statistics_path,
    )

    acoustic_converter = AcousticConverter(
        ac_config,
        stage1_model_path,
        f0_converter=f0_converter,
        out_sampling_rate=output_rate,
    )
    super_resolution = SuperResolution(
        sr_config,
        stage2_model_path,
    )

    voice_changer = VoiceChanger(
        acoustic_converter=acoustic_converter,
        super_resolution=super_resolution,
        output_sampling_rate=output_rate,
    )

    encode_stream = EncodeStream(vocoder=realtime_vocoder)
    convert_stream = ConvertStream(voice_changer=voice_changer)
    decode_stream = DecodeStream(vocoder=realtime_vocoder)

    num_data = input_time_length
    time_length = 1

    def _load_wave_and_split(time_length: float = 1):
        length = round(time_length * input_rate)
        wave, _ = librosa.load(str(input_path), sr=input_rate)
        return [
            wave[i * length:(i + 1) * length]
            for i in range(len(wave) // length)
        ]

    def _add(_stream: BaseStream, _datas):
        for i, data in zip(range(num_data), _datas):
            _stream.add(start_time=i * time_length, data=data)

    def _split_process(_stream: BaseStream, _extra_time: float):
        return [
            _stream.process(start_time=i * time_length,
                            time_length=time_length,
                            extra_time=_extra_time) for i in range(num_data)
        ]

    def _join_process(_stream: BaseStream, _extra_time: float):
        return _stream.process(start_time=0,
                               time_length=time_length * num_data,
                               extra_time=_extra_time)

    def _process_all_stream(
        _streams: Tuple[BaseStream, BaseStream, BaseStream],
        _datas,
        _split_flags: Tuple[bool, bool, bool],
        _extra_times: Tuple[float, float, float],
    ):
        for stream, split_flag, extra_time in zip(_streams, _split_flags,
                                                  _extra_times):
            _add(stream, _datas)
            if split_flag:
                _datas = _split_process(stream, _extra_time=extra_time)
            else:
                _datas = [_join_process(stream, _extra_time=extra_time)]
        return _datas

    def _concat_and_save(_waves, _path: Path):
        wave = numpy.concatenate(_waves).astype(numpy.float32)
        # librosa.output.write_wav was removed in librosa 0.8;
        # with newer librosa, write the file via the soundfile package instead
        librosa.output.write_wav(str(_path), wave, output_rate)

    def _remove(_streams: Tuple[BaseStream, BaseStream, BaseStream]):
        for stream in _streams:
            stream.remove(end_time=num_data)

    waves = _load_wave_and_split(time_length=time_length)[:num_data]

    streams = (encode_stream, convert_stream, decode_stream)

    datas = _process_all_stream(streams,
                                waves,
                                _split_flags=(True, True, True),
                                _extra_times=(0, 1, 0))
    _concat_and_save(datas, output_path)
    _remove(streams)
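
What _load_wave_and_split does to the signal, demonstrated without librosa: the wave is cut into fixed-length segments of time_length seconds and any trailing remainder is dropped:

import numpy

rate = 16000                              # stands in for input_rate
wave = numpy.arange(rate * 3 + 123)       # 3 s of samples plus a remainder
length = round(1.0 * rate)                # 1-second segments
chunks = [
    wave[i * length:(i + 1) * length] for i in range(len(wave) // length)
]
assert len(chunks) == 3 and all(len(c) == length for c in chunks)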
Example 9
    out_norm: float


model_base_path = Path('./trained/').expanduser()
test_data_path = Path('tests/test-deep-learning-yuduki-yukari.wav')
test_output_path = Path('output.wav')
input_statistics_path = model_base_path / 'f0_statistics/hiho_f0stat.npy'
target_statistics_path = model_base_path / 'f0_statistics/yukari_f0stat.npy'

print('model loading...', flush=True)

f0_converter = F0Converter(input_statistics=input_statistics_path, target_statistics=target_statistics_path)

model_path = model_base_path / Path('pp-el8-wof0/predictor_2260000.npz')
config_path = model_base_path / Path('pp-el8-wof0/config.json')
config = create_config(config_path)
acoustic_converter = AcousticConverter(config, model_path, f0_converter=f0_converter)
print('model 1 loaded!', flush=True)

model_path = model_base_path / Path('sr-noise3/predictor_180000.npz')
config_path = model_base_path / Path('sr-noise3/config.json')
sr_config = create_sr_config(config_path)
super_resolution = SuperResolution(sr_config, model_path)
print('model 2 loaded!', flush=True)

audio_config = AudioConfig(
    rate=config.dataset.acoustic_param.sampling_rate,
    chunk=config.dataset.acoustic_param.sampling_rate,
    vocoder_buffer_size=config.dataset.acoustic_param.sampling_rate // 16,
    out_norm=4.5,
)
Example 10
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-odn', '--output_device_name')
    args = parser.parse_args()

    print('model loading...', flush=True)

    queue_input_wave = Queue()
    queue_input_feature = Queue()
    queue_output_feature = Queue()
    queue_output_wave = Queue()

    input_statistics_path = Path('./dat/out_1st_my_npy/')
    target_statistics_path = Path('./dat/out_1st_yukari_npy/')
    f0_converter = F0Converter(input_statistics=input_statistics_path,
                               target_statistics=target_statistics_path)
    # model_path = Path('./trained/f0trans-wmc-multi-ref-el8-woD/predictor_13840000.npz')
    # config_path = Path('./trained/f0trans-wmc-multi-ref-el8-woD/config.json')
    # f0_converter = AcousticConverter(create_config(config_path), model_path, gpu=0)

    model_path = Path(
        './trained/multi-16k-ref24k-el8-woD-gbc8/predictor_2910000.npz')
    config_path = Path('./trained/multi-16k-ref24k-el8-woD-gbc8/config.json')
    # model_path = Path('./trained/akane-multi-ref-el8-woD-gbc8/predictor_5130000.npz')
    # config_path = Path('./trained/akane-multi-ref-el8-woD-gbc8/config.json')
    # model_path = Path('./trained/aoi-multi-ref-el8-woD-gbc8/predictor_5720000.npz')
    # config_path = Path('./trained/aoi-multi-ref-el8-woD-gbc8/config.json')
    # model_path = Path('./trained/zunko-multi-ref-el8-woD-gbc8/predictor_5710000.npz')
    # config_path = Path('./trained/zunko-multi-ref-el8-woD-gbc8/config.json')
    config = create_config(config_path)
    acoustic_converter = AcousticConverter(
        config,
        model_path,
        gpu=0,
        f0_converter=f0_converter,
        out_sampling_rate=24000,
    )
    print('model 1 loaded!', flush=True)

    model_path = Path('./dat/model/yukari_2nd/predictor_120000.npz')
    config_path = Path('./dat/model/yukari_2nd/config.json')
    # model_path = Path('./trained/akane-super-resolution/predictor_240000.npz')
    # config_path = Path('./trained/akane-super-resolution/config.json')
    sr_config = create_sr_config(config_path)
    super_resolution = SuperResolution(sr_config, model_path, gpu=0)
    print('model 2 loaded!', flush=True)

    audio_instance = pyaudio.PyAudio()
    audio_config = AudioConfig(
        in_rate=config.dataset.acoustic_param.sampling_rate,
        out_rate=24000,
        frame_period=config.dataset.acoustic_param.frame_period,
        in_audio_chunk=config.dataset.acoustic_param.sampling_rate,
        out_audio_chunk=24000,
        vocoder_buffer_size=config.dataset.acoustic_param.sampling_rate // 16,
        in_norm=1 / 8,
        out_norm=2.0,
        silent_threshold=-80.0,
    )

    conversion_flag = True

    voice_changer_stream = VoiceChangerStream(
        in_sampling_rate=audio_config.in_rate,
        frame_period=config.dataset.acoustic_param.frame_period,
        order=config.dataset.acoustic_param.order,
        in_dtype=numpy.float32,
    )

    wrapper = VoiceChangerStreamWrapper(
        voice_changer_stream=voice_changer_stream,
        extra_time_pre=0.2,
        extra_time=0.5,
    )

    process_encoder = Process(target=encode_worker,
                              kwargs=dict(
                                  config=config,
                                  wrapper=wrapper,
                                  audio_config=audio_config,
                                  queue_input=queue_input_wave,
                                  queue_output=queue_input_feature,
                              ))
    process_encoder.start()

    process_converter = Process(target=convert_worker,
                                kwargs=dict(
                                    config=config,
                                    wrapper=wrapper,
                                    acoustic_converter=acoustic_converter,
                                    super_resolution=super_resolution,
                                    audio_config=audio_config,
                                    queue_input=queue_input_feature,
                                    queue_output=queue_output_feature,
                                ))
    process_converter.start()

    process_decoder = Process(target=decode_worker,
                              kwargs=dict(
                                  config=config,
                                  wrapper=wrapper,
                                  audio_config=audio_config,
                                  queue_input=queue_output_feature,
                                  queue_output=queue_output_wave,
                              ))
    process_decoder.start()

    # output device
    name = args.output_device_name
    if name is None:
        output_device_index = audio_instance.get_default_output_device_info(
        )['index']
    else:
        for i in range(audio_instance.get_device_count()):
            if name in str(audio_instance.get_device_info_by_index(i)['name']):
                output_device_index = i
                break
        else:
            print('device not found')
            exit(1)

    # audio stream
    print('output_device_index', output_device_index)
    audio_input_stream = audio_instance.open(
        format=pyaudio.paFloat32,
        channels=1,
        rate=audio_config.in_rate,
        frames_per_buffer=audio_config.in_audio_chunk,
        input=True,
    )

    audio_output_stream = audio_instance.open(
        format=pyaudio.paFloat32,
        channels=1,
        rate=audio_config.out_rate,
        frames_per_buffer=audio_config.out_audio_chunk,
        output=True,
        output_device_index=output_device_index,
    )

    # signal
    def signal_handler(*args, **kwargs):
        process_encoder.terminate()
        process_converter.terminate()
        process_decoder.terminate()
        sys.exit(0)

    signal.signal(signal.SIGINT, signal_handler)

    # key event
    def key_handler(key):
        nonlocal conversion_flag
        if key == pynput.keyboard.Key.space:  # switch
            conversion_flag = not conversion_flag

    key_listener = pynput.keyboard.Listener(on_press=key_handler)
    key_listener.start()

    index_input = 0
    index_output = 0
    while True:
        # input audio
        in_data = audio_input_stream.read(audio_config.in_audio_chunk)
        wave = numpy.frombuffer(in_data,
                                dtype=numpy.float32) * audio_config.in_norm

        item = Item(
            original=wave * 5,
            item=wave,
            index=index_input,
            conversion_flag=conversion_flag,
        )
        queue_input_wave.put(item)
        index_input += 1

        print('queue_input_wave', queue_input_wave.qsize(), flush=True)
        print('queue_input_feature', queue_input_feature.qsize(), flush=True)
        print('queue_output_feature', queue_output_feature.qsize(), flush=True)
        print('queue_output_wave', queue_output_wave.qsize(), flush=True)

        # output
        wave: numpy.ndarray = None
        popped_list: List[Item] = []

        while True:
            try:
                while True:
                    item: Item = queue_output_wave.get_nowait()
                    popped_list.append(item)
            except queue.Empty:
                pass

            print('index_output', index_output)
            item = next(
                filter(lambda ii: ii.index == index_output, popped_list), None)
            if item is None:
                break

            popped_list.remove(item)

            index_output += 1
            if item.item is None:
                continue

            wave = item.item if item.conversion_flag else item.original
            break

        if wave is not None:
            wave *= audio_config.out_norm
            b = wave.astype(numpy.float32).tobytes()
            audio_output_stream.write(b)
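
The device matching above searches names by substring; when the name is unknown, the same PyAudio calls can first enumerate the available output devices:

import pyaudio

audio = pyaudio.PyAudio()
for i in range(audio.get_device_count()):
    info = audio.get_device_info_by_index(i)
    if info['maxOutputChannels'] > 0:     # output-capable devices only
        print(i, info['name'])
audio.terminate()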
Example 11
    def ac_config(self):
        if self._ac_config is None:
            self._ac_config = create_config(self.stage1_config_path)
        return self._ac_config
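
The None-sentinel pattern above can also be written with the standard library on Python 3.8+: functools.cached_property computes the config once on first access and caches it on the instance. Checker is a stand-in class name; create_config is the same project helper used throughout these examples:

from functools import cached_property
from pathlib import Path


class Checker:
    def __init__(self, stage1_config_path: Path):
        self.stage1_config_path = stage1_config_path

    @cached_property
    def ac_config(self):
        # computed once on first access, then cached on the instance
        return create_config(self.stage1_config_path)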