Example #1
def decode_worker(
    realtime_vocoder: RealtimeVocoder,
    time_length: float,
    extra_time: float,
    vocoder_buffer_size: int,
    out_audio_chunk: int,
    output_silent_threshold: float,
    queue_input: Queue,
    queue_output: Queue,
    acquired_lock: Lock,
):
    logger = logging.getLogger('decode')
    init_logger(logger)
    logger.info('decode worker')

    realtime_vocoder.create_synthesizer(
        buffer_size=vocoder_buffer_size,
        number_of_pointers=16,
    )
    stream = DecodeStream(vocoder=realtime_vocoder)
    stream_wrapper = StreamWrapper(stream=stream, extra_time=extra_time)

    acquired_lock.release()
    start_time = extra_time
    wave_fragment = numpy.empty(0)
    while True:
        item: Item = queue_input.get()
        start = time.time()
        feature: AcousticFeature = item.item
        stream.add(
            start_time=start_time,
            data=feature,
        )
        start_time += time_length

        wave = stream_wrapper.process_next(time_length=time_length)

        wave_fragment = numpy.concatenate([wave_fragment, wave])
        if len(wave_fragment) >= out_audio_chunk:
            wave, wave_fragment = (
                wave_fragment[:out_audio_chunk],
                wave_fragment[out_audio_chunk:],
            )

            # mean spectral power of the chunk in dB
            power = librosa.core.power_to_db(
                numpy.abs(librosa.stft(wave)) ** 2).mean()
            if power < -output_silent_threshold:
                wave = None  # silent chunk: signal downstream to play silence
        else:
            wave = None

        item.item = wave
        queue_output.put(item)

        logger.debug(f'{item.index}: {time.time() - start}')
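
Note: the worker above reads only two attributes from the queued items, item.index and item.item, and signals readiness to its parent by releasing acquired_lock once the synthesizer is built (the parent side of that handshake appears in Example #6). A minimal container compatible with this loop might look like the sketch below; the dataclass itself is an assumption for illustration, since the project's own Item definition is not shown here.

from dataclasses import dataclass
from typing import Any


@dataclass
class Item:
    item: Any   # payload: input wave, AcousticFeature, output wave, or None for a silent chunk
    index: int  # monotonically increasing sequence number used to reorder worker output
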
Example #2
@property
def realtime_vocoder(self):
    if self._realtime_vocoder is None:
        self._realtime_vocoder = RealtimeVocoder(
            acoustic_param=self.ac_config.dataset.acoustic_param,
            out_sampling_rate=self.out_sampling_rate,
            extract_f0_mode=VocodeMode.WORLD,
        )
        self._realtime_vocoder.create_synthesizer(
            buffer_size=1024,
            number_of_pointers=16,
        )
    return self._realtime_vocoder
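
Note: this is the usual lazy-initialization property pattern: the vocoder and its WORLD synthesizer are built on first access and memoized in self._realtime_vocoder (the full class this was extracted from appears in Example #7). On Python 3.8+, functools.cached_property expresses the same behavior without the sentinel attribute. A hypothetical equivalent, assuming the same RealtimeVocoder and VocodeMode imports and the same host-class attributes as the example:

from functools import cached_property


class VocoderHolder:
    @cached_property
    def realtime_vocoder(self):
        # Body runs once on first access; the result is cached on the instance.
        vocoder = RealtimeVocoder(
            acoustic_param=self.ac_config.dataset.acoustic_param,
            out_sampling_rate=self.out_sampling_rate,
            extract_f0_mode=VocodeMode.WORLD,
        )
        vocoder.create_synthesizer(buffer_size=1024, number_of_pointers=16)
        return vocoder
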
Example #3
def decode_worker(
    config: Config,
    wrapper: VoiceChangerStreamWrapper,
    audio_config: AudioConfig,
    queue_input: Queue,
    queue_output: Queue,
):
    wrapper.voice_changer_stream.vocoder = RealtimeVocoder(
        acoustic_param=config.dataset.acoustic_param,
        out_sampling_rate=audio_config.out_rate,
        buffer_size=audio_config.vocoder_buffer_size,
        number_of_pointers=16,
    )

    start_time = 0
    time_length = audio_config.out_audio_chunk / audio_config.out_rate
    wave_fragment = numpy.empty(0)
    while True:
        item: Item = queue_input.get()
        feature: AcousticFeature = item.item
        wrapper.voice_changer_stream.add_out_feature(
            start_time=start_time,
            feature=feature,
            frame_period=audio_config.frame_period,
        )
        start_time += time_length

        wave = wrapper.post_convert_next(time_length=time_length).wave

        wave_fragment = numpy.concatenate([wave_fragment, wave])
        if len(wave_fragment) >= audio_config.out_audio_chunk:
            wave, wave_fragment = (
                wave_fragment[:audio_config.out_audio_chunk],
                wave_fragment[audio_config.out_audio_chunk:],
            )

            # mean spectral power of the chunk in dB
            power = librosa.core.power_to_db(
                numpy.abs(librosa.stft(wave)) ** 2).mean()
            if power < audio_config.silent_threshold:
                wave = None  # silent chunk: signal downstream to play silence
        else:
            wave = None

        item.item = wave
        queue_output.put(item)
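
Note: both decode workers share the same buffer-and-gate pattern: synthesized audio accumulates in wave_fragment, a fixed-size chunk is split off once enough samples exist, and a chunk whose mean spectral power (in dB) falls below a threshold is replaced by None so the audio loop plays silence instead. A standalone sketch of that pattern with illustrative values (the chunk size, threshold, and fake vocoder output are assumptions):

import numpy
import librosa

out_audio_chunk = 4096
silent_threshold = -80.0  # dB; Example #1 negates its threshold, Example #3 uses it directly

wave_fragment = numpy.empty(0)
for wave in (numpy.random.randn(1600) * 0.1 for _ in range(10)):  # stand-in vocoder output
    wave_fragment = numpy.concatenate([wave_fragment, wave])
    if len(wave_fragment) < out_audio_chunk:
        continue  # not enough samples for a full output chunk yet

    chunk, wave_fragment = (
        wave_fragment[:out_audio_chunk],
        wave_fragment[out_audio_chunk:],
    )
    # mean spectral power of the chunk in dB, as in the workers above
    power = librosa.core.power_to_db(numpy.abs(librosa.stft(chunk)) ** 2).mean()
    print('emit' if power >= silent_threshold else 'drop (silent)', round(power, 1))
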
Example #4
def check(
    input_path: Path,
    input_time_length: int,
    output_path: Path,
    input_statistics_path: Path,
    target_statistics_path: Path,
    stage1_model_path: Path,
    stage1_config_path: Path,
    stage2_model_path: Path,
    stage2_config_path: Path,
):
    ac_config = create_config(stage1_config_path)
    sr_config = create_sr_config(stage2_config_path)
    input_rate = ac_config.dataset.acoustic_param.sampling_rate
    output_rate = sr_config.dataset.param.voice_param.sample_rate

    realtime_vocoder = RealtimeVocoder(
        acoustic_param=ac_config.dataset.acoustic_param,
        out_sampling_rate=output_rate,
        extract_f0_mode=VocodeMode.WORLD,
    )
    realtime_vocoder.create_synthesizer(
        buffer_size=1024,
        number_of_pointers=16,
    )

    f0_converter = F0Converter(
        input_statistics=input_statistics_path,
        target_statistics=target_statistics_path,
    )


    acoustic_converter = AcousticConverter(
        ac_config,
        stage1_model_path,
        f0_converter=f0_converter,
        out_sampling_rate=output_rate,
    )
    super_resolution = SuperResolution(
        sr_config,
        stage2_model_path,
    )

    voice_changer = VoiceChanger(
        acoustic_converter=acoustic_converter,
        super_resolution=super_resolution,
        output_sampling_rate=output_rate,
    )

    encode_stream = EncodeStream(vocoder=realtime_vocoder)
    convert_stream = ConvertStream(voice_changer=voice_changer)
    decode_stream = DecodeStream(vocoder=realtime_vocoder)

    num_data = input_time_length
    time_length = 1

    def _load_wave_and_split(time_length: float = 1):
        length = round(time_length * input_rate)
        wave, _ = librosa.load(str(input_path), sr=input_rate)
        return [
            wave[i * length:(i + 1) * length]
            for i in range(len(wave) // length)
        ]

    def _add(_stream: BaseStream, _datas):
        for i, data in zip(range(num_data), _datas):
            _stream.add(start_time=i * time_length, data=data)

    def _split_process(_stream: BaseStream, _extra_time: float):
        return [
            _stream.process(start_time=i * time_length,
                            time_length=time_length,
                            extra_time=_extra_time) for i in range(num_data)
        ]

    def _join_process(_stream: BaseStream, _extra_time: float):
        return _stream.process(start_time=0,
                               time_length=time_length * num_data,
                               extra_time=_extra_time)

    def _process_all_stream(
        _streams: Tuple[BaseStream, BaseStream, BaseStream],
        _datas,
        _split_flags: Tuple[bool, bool, bool],
        _extra_times: Tuple[float, float, float],
    ):
        for stream, split_flag, extra_time in zip(_streams, _split_flags,
                                                  _extra_times):
            _add(stream, _datas)
            if split_flag:
                _datas = _split_process(stream, _extra_time=extra_time)
            else:
                _datas = [_join_process(stream, _extra_time=extra_time)]
        return _datas

    def _concat_and_save(_waves, _path: Path):
        wave = numpy.concatenate(_waves).astype(numpy.float32)
        librosa.output.write_wav(str(_path), wave, output_rate)

    def _remove(_streams: Tuple[BaseStream, BaseStream, BaseStream]):
        for stream in _streams:
            stream.remove(end_time=num_data)

    waves = _load_wave_and_split(time_length=time_length)[:num_data]

    streams = (encode_stream, convert_stream, decode_stream)

    datas = _process_all_stream(streams,
                                waves,
                                _split_flags=(True, True, True),
                                _extra_times=(0, 1, 0))
    _concat_and_save(datas, output_path)
    _remove(streams)
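
Note: librosa.output.write_wav was removed in librosa 0.8, so _concat_and_save as written requires an older librosa. On newer versions, soundfile (a librosa dependency) offers an equivalent; a sketch of the same helper under that assumption:

import numpy
import soundfile


def _concat_and_save(_waves, _path, _rate):
    # Same behavior as the helper above, but via soundfile for librosa >= 0.8.
    wave = numpy.concatenate(_waves).astype(numpy.float32)
    soundfile.write(str(_path), wave, _rate)
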
Example #5
config_path = model_base_path / Path('sr-noise3/config.json')
sr_config = create_sr_config(config_path)
super_resolution = SuperResolution(sr_config, model_path)
print('model 2 loaded!', flush=True)

audio_config = AudioConfig(
    rate=config.dataset.acoustic_param.sampling_rate,
    chunk=config.dataset.acoustic_param.sampling_rate,
    vocoder_buffer_size=config.dataset.acoustic_param.sampling_rate // 16,
    out_norm=4.5,
)
frame_period = config.dataset.acoustic_param.frame_period

vocoder = RealtimeVocoder(
    acoustic_param=config.dataset.acoustic_param,
    out_sampling_rate=audio_config.rate,
    buffer_size=audio_config.vocoder_buffer_size,
    number_of_pointers=16,
)

voice_changer = VoiceChanger(
    super_resolution=super_resolution,
    acoustic_converter=acoustic_converter,
)

voice_changer_stream = VoiceChangerStream(
    sampling_rate=audio_config.rate,
    frame_period=acoustic_converter._param.frame_period,
    order=acoustic_converter._param.order,
    in_dtype=numpy.float32,
)
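
Note: here chunk equals the sampling rate, so every audio block spans exactly one second, and vocoder_buffer_size is one sixteenth of that. The per-block duration follows the same arithmetic Example #3 computes as time_length; a quick illustration assuming a 24000 Hz acoustic_param (the real rate is not visible in this snippet):

sampling_rate = 24000
chunk = sampling_rate                       # one block == one second of audio
vocoder_buffer_size = sampling_rate // 16   # 1500 samples per synthesizer buffer
time_length = chunk / sampling_rate         # 1.0 second per block, as in Example #3
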
Example #6
File: run.py Project: RaiJPch/test
def run(config_path: Path):
    logger = logging.getLogger('root')
    init_logger(logger)

    logger.info('model loading...')

    config = Config.from_yaml(config_path)

    converter = YukarinConverter.make_yukarin_converter(
        input_statistics_path=config.input_statistics_path,
        target_statistics_path=config.target_statistics_path,
        stage1_model_path=config.stage1_model_path,
        stage1_config_path=config.stage1_config_path,
        stage2_model_path=config.stage2_model_path,
        stage2_config_path=config.stage2_config_path,
    )

    realtime_vocoder = RealtimeVocoder(
        acoustic_param=converter.acoustic_converter.config.dataset.acoustic_param,
        out_sampling_rate=config.output_rate,
        extract_f0_mode=config.extract_f0_mode,
    )

    audio_instance = pyaudio.PyAudio()

    queue_input_wave: Queue[Item] = Queue()
    queue_input_feature: Queue[Item] = Queue()
    queue_output_feature: Queue[Item] = Queue()
    queue_output_wave: Queue[Item] = Queue()

    lock_encoder = Lock()
    lock_converter = Lock()
    lock_decoder = Lock()

    lock_encoder.acquire()
    process_encoder = Process(target=encode_worker,
                              kwargs=dict(
                                  realtime_vocoder=realtime_vocoder,
                                  time_length=config.buffer_time,
                                  extra_time=config.encode_extra_time,
                                  queue_input=queue_input_wave,
                                  queue_output=queue_input_feature,
                                  acquired_lock=lock_encoder,
                              ))
    process_encoder.start()

    lock_converter.acquire()
    process_converter = Process(
        target=convert_worker,
        kwargs=dict(
            acoustic_converter=converter.acoustic_converter,
            super_resolution=converter.super_resolution,
            time_length=config.buffer_time,
            extra_time=config.convert_extra_time,
            input_silent_threshold=config.input_silent_threshold,
            queue_input=queue_input_feature,
            queue_output=queue_output_feature,
            acquired_lock=lock_converter,
        ))
    process_converter.start()

    lock_decoder.acquire()
    process_decoder = Process(
        target=decode_worker,
        kwargs=dict(
            realtime_vocoder=realtime_vocoder,
            time_length=config.buffer_time,
            extra_time=config.decode_extra_time,
            vocoder_buffer_size=config.vocoder_buffer_size,
            out_audio_chunk=config.out_audio_chunk,
            output_silent_threshold=config.output_silent_threshold,
            queue_input=queue_output_feature,
            queue_output=queue_output_wave,
            acquired_lock=lock_decoder,
        ))
    process_decoder.start()

    with lock_encoder, lock_converter, lock_decoder:
        pass  # block until every worker has released its lock, i.e. finished loading

    # input device
    if config.input_device_name is None:
        input_device_index = audio_instance.get_default_input_device_info()['index']
    else:
        for i in range(audio_instance.get_device_count()):
            if config.input_device_name in str(
                    audio_instance.get_device_info_by_index(i)['name']):
                input_device_index = i
                break
        else:
            raise ValueError('input device not found')

    # output device
    if config.output_device_name is None:
        output_device_index = audio_instance.get_default_output_device_info()['index']
    else:
        for i in range(audio_instance.get_device_count()):
            if config.output_device_name in str(
                    audio_instance.get_device_info_by_index(i)['name']):
                output_device_index = i
                break
        else:
            raise ValueError('output device not found')

    # audio stream
    audio_input_stream = audio_instance.open(
        format=pyaudio.paFloat32,
        channels=1,
        rate=config.input_rate,
        frames_per_buffer=config.in_audio_chunk,
        input=True,
        input_device_index=input_device_index,
    )

    audio_output_stream = audio_instance.open(
        format=pyaudio.paFloat32,
        channels=1,
        rate=config.output_rate,
        frames_per_buffer=config.out_audio_chunk,
        output=True,
        output_device_index=output_device_index,
    )

    # signal
    def signal_handler(sig, frame):
        # terminate the worker processes and exit cleanly on SIGINT (Ctrl-C)
        process_encoder.terminate()
        process_converter.terminate()
        process_decoder.terminate()
        sys.exit(0)

    signal.signal(signal.SIGINT, signal_handler)

    logger.debug('audio loop')

    index_input = 0
    index_output = 0
    popped_list: List[Item] = []
    while True:
        # input audio
        in_data = audio_input_stream.read(config.in_audio_chunk)
        in_wave = numpy.frombuffer(in_data,
                                   dtype=numpy.float32) * config.input_scale

        in_item = Item(
            item=in_wave,
            index=index_input,
        )
        queue_input_wave.put(in_item)

        logger.debug(f'input {index_input}')
        index_input += 1

        # output
        out_wave: Optional[numpy.ndarray] = None
        while True:
            try:
                while True:  # drain every queued item so out-of-order chunks can "cut in line"
                    item: Item = queue_output_wave.get_nowait()
                    popped_list.append(item)
            except queue.Empty:
                pass

            out_item = next(
                filter(lambda ii: ii.index == index_output, popped_list), None)
            if out_item is None:
                break

            popped_list.remove(out_item)

            logger.debug(f'output {index_output}')
            index_output += 1

            out_wave = out_item.item
            if out_wave is None:  # silent chunk: keep searching for the next playable index
                continue

            break

        if out_wave is None:
            out_wave = numpy.zeros(config.out_audio_chunk)
        out_wave *= config.output_scale

        b = out_wave[:config.out_audio_chunk].astype(numpy.float32).tobytes()
        audio_output_stream.write(b)
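
Note: the output half of the loop above implements a small reorder buffer: the workers can finish out of order, so every item currently in queue_output_wave is drained into popped_list, and chunks are released strictly by index, skipping None payloads (silent chunks). A distilled sketch of that logic, draining once per call (the function name and signature are hypothetical):

import queue
from typing import List


def pop_in_order(q: queue.Queue, popped_list: List, index_output: int):
    try:
        while True:  # drain everything currently available
            popped_list.append(q.get_nowait())
    except queue.Empty:
        pass

    out_wave = None
    while True:
        out_item = next((i for i in popped_list if i.index == index_output), None)
        if out_item is None:
            break  # the next chunk in sequence has not arrived yet
        popped_list.remove(out_item)
        index_output += 1
        if out_item.item is not None:
            out_wave = out_item.item  # found a playable chunk
            break
    return out_wave, index_output
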
Example #7
class AllStreamTest(TestCase):
    def setUp(self):
        self.input_statistics_path = os.getenv('INPUT_STATISTICS')
        self.target_statistics_path = os.getenv('TARGET_STATISTICS')
        self.stage1_model_path = os.getenv('ACOUSTIC_CONVERT_MODEL')
        self.stage1_config_path = os.getenv('ACOUSTIC_CONVERT_CONFIG')
        self.stage2_model_path = os.getenv('SUPER_RESOLUTION_MODEL')
        self.stage2_config_path = os.getenv('SUPER_RESOLUTION_CONFIG')

        if self.input_statistics_path is None:
            raise ValueError('INPUT_STATISTICS is not found.')
        if self.target_statistics_path is None:
            raise ValueError('TARGET_STATISTICS is not found.')
        if self.stage1_model_path is None:
            raise ValueError('ACOUSTIC_CONVERT_MODEL is not found.')
        if self.stage1_config_path is None:
            raise ValueError('ACOUSTIC_CONVERT_CONFIG is not found.')
        if self.stage2_model_path is None:
            raise ValueError('SUPER_RESOLUTION_MODEL is not found.')
        if self.stage2_config_path is None:
            raise ValueError('SUPER_RESOLUTION_CONFIG is not found.')

        self._ac_config = None
        self._sr_config = None
        self._input_rate = None
        self._out_sampling_rate = None
        self._vocoder = None
        self._realtime_vocoder = None
        self._models = None
        self._voice_changer = None
        self._encode_stream = None
        self._convert_stream = None
        self._decode_stream = None

    @property
    def ac_config(self):
        if self._ac_config is None:
            self._ac_config = create_config(self.stage1_config_path)
        return self._ac_config

    @property
    def sr_config(self):
        if self._sr_config is None:
            self._sr_config = create_sr_config(self.stage2_config_path)
        return self._sr_config

    @property
    def input_rate(self):
        if self._input_rate is None:
            self._input_rate = self.ac_config.dataset.acoustic_param.sampling_rate
        return self._input_rate

    @property
    def out_sampling_rate(self):
        if self._out_sampling_rate is None:
            self._out_sampling_rate = self.sr_config.dataset.param.voice_param.sample_rate
        return self._out_sampling_rate

    @property
    def realtime_vocoder(self):
        if self._realtime_vocoder is None:
            self._realtime_vocoder = RealtimeVocoder(
                acoustic_param=self.ac_config.dataset.acoustic_param,
                out_sampling_rate=self.out_sampling_rate,
                extract_f0_mode=VocodeMode.WORLD,
            )
            self._realtime_vocoder.create_synthesizer(
                buffer_size=1024,
                number_of_pointers=16,
            )
        return self._realtime_vocoder

    @property
    def models(self):
        if self._models is None:
            f0_converter = F0Converter(
                input_statistics=self.input_statistics_path,
                target_statistics=self.target_statistics_path,
            )

            ac_config = self.ac_config
            sr_config = self.sr_config

            acoustic_converter = AcousticConverter(
                ac_config,
                self.stage1_model_path,
                f0_converter=f0_converter,
                out_sampling_rate=self.out_sampling_rate,
            )
            super_resolution = SuperResolution(
                sr_config,
                self.stage2_model_path,
            )
            self._models = acoustic_converter, super_resolution
        return self._models

    @property
    def voice_changer(self):
        if self._voice_changer is None:
            acoustic_converter, super_resolution = self.models
            self._voice_changer = VoiceChanger(
                acoustic_converter=acoustic_converter,
                super_resolution=super_resolution,
                output_sampling_rate=self.out_sampling_rate,
            )
        return self._voice_changer

    @property
    def encode_stream(self):
        if self._encode_stream is None:
            self._encode_stream = EncodeStream(vocoder=self.realtime_vocoder)
        return self._encode_stream

    @property
    def convert_stream(self):
        if self._convert_stream is None:
            self._convert_stream = ConvertStream(voice_changer=self.voice_changer)
        return self._convert_stream

    @property
    def decode_stream(self):
        if self._decode_stream is None:
            self._decode_stream = DecodeStream(vocoder=self.realtime_vocoder)
        return self._decode_stream

    def _load_wave_and_split(self, time_length: float = 1):
        rate = self.ac_config.dataset.acoustic_param.sampling_rate
        length = round(time_length * rate)
        wave, _ = librosa.load(Path('tests/data/audioA.wav'), sr=rate)
        return [
            wave[i * length:(i + 1) * length]
            for i in range(len(wave) // length)
        ]

    def _encode(self, w: numpy.ndarray):
        wave = Wave(wave=w, sampling_rate=self.input_rate)
        feature_wrapper = self.realtime_vocoder.encode(wave)
        return feature_wrapper

    def _convert(self, feature_wrapper: AcousticFeatureWrapper):
        feature = self.voice_changer.convert_from_acoustic_feature(
            feature_wrapper)
        return feature

    def test_initialize(self):
        pass

    def test_load_model(self):
        acoustic_converter, super_resolution = self.models
        self.assertNotEqual(acoustic_converter, None)
        self.assertNotEqual(super_resolution, None)

    def test_load_wave(self):
        wave_segments = self._load_wave_and_split()
        self.assertEqual(len(wave_segments[0]),
                         self.ac_config.dataset.acoustic_param.sampling_rate)

    def test_encode_stream(self):
        waves = self._load_wave_and_split()
        encode_stream = self.encode_stream

        encode_stream.add(start_time=0, data=waves[0])
        encode_stream.add(start_time=1, data=waves[1])

        # pick
        output = encode_stream.process(start_time=0,
                                       time_length=1,
                                       extra_time=0)
        target = self._encode(waves[0])
        self.assertEqual(output, target)

        # concat
        output = encode_stream.process(start_time=0.3,
                                       time_length=1,
                                       extra_time=0)
        target = self._encode(
            numpy.concatenate([
                waves[0][self.input_rate * 3 // 10:],
                waves[1][:self.input_rate * 3 // 10],
            ]))
        self.assertEqual(output, target)

        # pad
        output = encode_stream.process(start_time=1.3,
                                       time_length=1,
                                       extra_time=0)
        target = self._encode(
            numpy.concatenate([
                waves[1][self.input_rate * 3 // 10:],
                numpy.zeros(self.input_rate * 3 // 10),
            ]))
        self.assertEqual(output, target)

    def test_convert_stream(self):
        waves = self._load_wave_and_split()
        convert_stream = self.convert_stream

        convert_stream.add(start_time=0, data=self._encode(waves[0]))
        convert_stream.add(start_time=1, data=self._encode(waves[1]))

        # pick
        output = convert_stream.process(start_time=0,
                                        time_length=1,
                                        extra_time=0)
        target = self._convert(self._encode(waves[0]))
        self.assertTrue(equal_feature(output, target))

    def test_all_stream(self):
        num_data = 10
        time_length = 0.3

        def _add(_stream: BaseStream, _datas):
            for i, data in zip(range(num_data), _datas):
                _stream.add(start_time=i * time_length, data=data)

        def _split_process(_stream: BaseStream, _extra_time: float):
            return [
                _stream.process(start_time=i * time_length,
                                time_length=time_length,
                                extra_time=_extra_time)
                for i in range(num_data)
            ]

        def _join_process(_stream: BaseStream, _extra_time: float):
            return _stream.process(start_time=0,
                                   time_length=time_length * num_data,
                                   extra_time=_extra_time)

        def _process_all_stream(
            _streams: Tuple[BaseStream, BaseStream, BaseStream],
            _datas,
            _split_flags: Tuple[bool, bool, bool],
            _extra_times: Tuple[float, float, float],
        ):
            for stream, split_flag, extra_time in zip(_streams, _split_flags,
                                                      _extra_times):
                _add(stream, _datas)
                if split_flag:
                    _datas = _split_process(stream, _extra_time=extra_time)
                else:
                    _datas = [_join_process(stream, _extra_time=extra_time)]
            return _datas

        def _concat_and_save(_waves, _path: str):
            wave = numpy.concatenate(_waves).astype(numpy.float32)
            librosa.output.write_wav(_path, wave, self.out_sampling_rate)

        def _remove(_streams: Tuple[BaseStream, BaseStream, BaseStream]):
            for stream in _streams:
                stream.remove(end_time=num_data)

        waves = self._load_wave_and_split(time_length=time_length)[:num_data]
        encode_stream = self.encode_stream
        convert_stream = self.convert_stream
        decode_stream = self.decode_stream

        streams = (encode_stream, convert_stream, decode_stream)

        # datas = _process_all_stream(streams, waves, _split_flags=(True, True, True), _extra_times=(0, 0, 0))
        # _concat_and_save(datas, '../test_all_split.wav')
        # _remove(streams)
        #
        # datas= _process_all_stream(streams, waves, _split_flags=(False, True, True), _extra_times=(0, 0, 0))
        # _concat_and_save(datas, '../test_encode_join.wav')
        # _remove(streams)
        #
        # datas = _process_all_stream(streams, waves, _split_flags=(True, False, True), _extra_times=(0, 0, 0))
        # _concat_and_save(datas, '../test_convert_join.wav')
        # _remove(streams)
        #
        datas = _process_all_stream(streams,
                                    waves,
                                    _split_flags=(True, True, True),
                                    _extra_times=(0, 1, 0))
        _concat_and_save(datas, '../test_convert_extra05.wav')
        _remove(streams)
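
Note: the three assertions in test_encode_stream pin down the stream buffer semantics: process can read a window starting at any time, concatenates across the blocks added with add, and zero-pads once the window runs past the buffered data. A minimal re-creation of that windowing over a plain numpy buffer (the helper name is hypothetical; blocks are assumed to be one second each, as in the test):

import numpy


def read_window(blocks, start_time, time_length, rate):
    # blocks[i] covers [i, i + 1) seconds; a window may span block boundaries
    # ("concat") or extend past the buffered data ("pad" with zeros).
    buffer = numpy.concatenate(blocks)
    start = round(start_time * rate)
    stop = start + round(time_length * rate)
    window = buffer[start:stop]
    if len(window) < stop - start:
        window = numpy.concatenate([window, numpy.zeros(stop - start - len(window))])
    return window
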