def calc_mcd(
    path1: Optional[Path] = None,
    path2: Optional[Path] = None,
    wave1: Optional[Wave] = None,
    wave2: Optional[Wave] = None,
):
    wave1 = Wave.load(path1) if wave1 is None else wave1
    wave2 = Wave.load(path2) if wave2 is None else wave2
    assert wave1.sampling_rate == wave2.sampling_rate

    sampling_rate = wave1.sampling_rate

    min_length = min(len(wave1.wave), len(wave2.wave))
    wave1.wave = wave1.wave[:min_length]
    wave2.wave = wave2.wave[:min_length]

    mc1 = to_melcepstrum(
        x=wave1.wave,
        sampling_rate=sampling_rate,
        n_fft=2048,
        win_length=1024,
        hop_length=256,
        order=24,
    )
    mc2 = to_melcepstrum(
        x=wave2.wave,
        sampling_rate=sampling_rate,
        n_fft=2048,
        win_length=1024,
        hop_length=256,
        order=24,
    )
    return _mcd(mc1, mc2)

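# `_mcd` is referenced above but not shown here. A minimal sketch, assuming
# frame-aligned (T, order + 1) cepstra with the 0th (energy) coefficient
# excluded, of the usual mel-cepstral distortion in dB; the real helper may
# differ (the name `_mcd_sketch` is hypothetical):
import numpy


def _mcd_sketch(mc1: numpy.ndarray, mc2: numpy.ndarray) -> float:
    # (10 / ln 10) * sqrt(2 * sum of squared coefficient differences),
    # averaged over frames
    diff = mc1[:, 1:] - mc2[:, 1:]
    return float(
        (10.0 / numpy.log(10.0)) * numpy.mean(numpy.sqrt(2.0 * (diff ** 2).sum(axis=1)))
    )
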
def generate(self):
    return Input(
        wave=Wave.load(self.path_wave),
        silence=SamplingData.load(self.path_silence),
        f0=SamplingData.load(self.path_f0),
        phoneme=SamplingData.load(self.path_phoneme),
    )

def _process(path: Path, bit: int, gaussian_noise_sigma: float):
    wave = Wave.load(path).wave
    if gaussian_noise_sigma > 0:
        wave += numpy.random.randn(*wave.shape) * gaussian_noise_sigma

    # mu-law compress, quantize to 2 ** bit levels, then count how often each
    # level occurs
    encoded = encode_single(encode_mulaw(wave, mu=2 ** bit), bit=bit)
    return numpy.histogram(encoded, bins=2 ** bit, range=(0, 2 ** bit))[0].astype(
        numpy.uint64
    )

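# `encode_mulaw` / `encode_single` are not defined in this snippet. A minimal
# sketch of the standard mu-law companding and uniform quantization they
# presumably implement (the `_sketch` names are hypothetical):
import numpy


def encode_mulaw_sketch(x: numpy.ndarray, mu: float) -> numpy.ndarray:
    # compress audio in [-1, 1]: sign(x) * log(1 + mu * |x|) / log(1 + mu)
    return numpy.sign(x) * numpy.log1p(mu * numpy.abs(x)) / numpy.log1p(mu)


def encode_single_sketch(y: numpy.ndarray, bit: int) -> numpy.ndarray:
    # map companded values in [-1, 1] onto integer bins 0 .. 2 ** bit - 1
    return numpy.clip((y + 1) / 2 * (2 ** bit), 0, 2 ** bit - 1).astype(numpy.int64)
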
def calc_silence_rate(
    path1: Optional[Path] = None,
    path2: Optional[Path] = None,
    wave1: Optional[Wave] = None,
    wave2: Optional[Wave] = None,
):
    wave1 = Wave.load(path1) if wave1 is None else wave1
    wave2 = Wave.load(path2) if wave2 is None else wave2
    assert wave1.sampling_rate == wave2.sampling_rate

    # NOTE: this uses a private librosa helper, which may change between
    # librosa versions
    silence1 = ~librosa.effects._signal_to_frame_nonsilent(wave1.wave)
    silence2 = ~librosa.effects._signal_to_frame_nonsilent(wave2.wave)

    tp = numpy.logical_and(silence1, silence2).sum(dtype=float)
    tn = numpy.logical_and(~silence1, ~silence2).sum(dtype=float)
    fn = numpy.logical_and(silence1, ~silence2).sum(dtype=float)
    fp = numpy.logical_and(~silence1, silence2).sum(dtype=float)

    accuracy = (tp + tn) / (tp + tn + fn + fp)
    return accuracy

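# Toy check of the agreement arithmetic above (hypothetical frame masks):
# (tp + tn) / (tp + tn + fn + fp) is just the fraction of frames on which the
# two silence masks agree; here frames 0, 2, 3 agree, so accuracy is 0.75.
import numpy

silence_a = numpy.array([True, True, False, False])
silence_b = numpy.array([True, False, False, False])
assert (silence_a == silence_b).mean() == 0.75
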
def generate(self):
    wave = Wave.load(self.path_wave)

    try:
        local = SamplingData.load(self.path_local)
    except Exception:
        # fall back to computing the local feature (a log-melspectrogram) on
        # the fly and caching it to a temporary file
        local_rate = 80
        local_array = to_log_melspectrogram(wave=wave, rate=local_rate)
        local = SamplingData(array=local_array, rate=local_rate)
        with NamedTemporaryFile(suffix=".npy", delete=False) as f:
            self.path_local = Path(f.name)
            local.save(self.path_local)

    return Input(
        wave=wave,
        silence=SamplingData.load(self.path_silence),
        local=local,
    )

def process(
    input_paths: Tuple[Path, Path],
    output_dir: Path,
):
    input_wave, input_f0 = input_paths

    wave_data = Wave.load(input_wave)
    f0_data = F0.load(input_f0)

    y = wave_data.wave.astype(np.float64)
    sr = wave_data.sampling_rate

    # the stored F0 is log-scaled; recover Hz and zero out unvoiced frames
    f0 = np.exp(f0_data.array[:, 0].astype(np.float64))
    if f0_data.with_vuv:
        # column 1 holds the voiced/unvoiced flag; cast to bool before negating
        f0[~f0_data.array[:, 1].astype(bool)] = 0

    t = np.arange(0, len(f0), dtype=np.float64) / f0_data.rate

    # WORLD analysis/synthesis: spectral envelope and aperiodicity from the
    # original wave, resynthesized with the loaded F0
    sp = pyworld.cheaptrick(y, f0, t, sr)
    ap = pyworld.d4c(y, f0, t, sr)
    y = pyworld.synthesize(f0, sp, ap, sr)

    out = output_dir / f"{input_f0.stem}.wav"
    librosa.output.write_wav(out, y.astype(np.float32), sr)  # requires librosa<0.8

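# Usage sketch (hypothetical paths): resynthesize one utterance through WORLD
# using its stored F0 file.
# process(
#     input_paths=(Path("data/utt_0001.wav"), Path("data/utt_0001_f0.npy")),
#     output_dir=Path("resynthesized"),
# )
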
def generate(self):
    return Input(
        wave=Wave.load(self.path_wave),
        silence=SamplingData.load(self.path_silence),
        local=SamplingData.load(self.path_local),
    )

def generate(
    model_dir: Path,
    model_iteration: Optional[int],
    model_config: Optional[Path],
    output_dir: Path,
    to_voiced_scaler: bool,
    to_f0_scaler: bool,
    to_phoneme_onehot: bool,
    batch_size: Optional[int],
    num_test: int,
    target_glob: Optional[str],
    use_gpu: bool,
):
    if model_config is None:
        model_config = model_dir / "config.yaml"

    output_dir.mkdir(exist_ok=True)
    save_arguments(output_dir / "arguments.yaml", generate, locals())

    config = Config.from_dict(yaml.safe_load(model_config.open()))

    generator = Generator(
        config=config,
        predictor=_get_model_path(
            model_dir=model_dir,
            iteration=model_iteration,
            prefix="predictor_",
        ),
        voiced_network=(
            None
            if not to_voiced_scaler
            else _get_model_path(
                model_dir=model_dir,
                iteration=model_iteration,
                prefix="voiced_network_",
            )
        ),
        f0_network=(
            None
            if not to_f0_scaler
            else _get_model_path(
                model_dir=model_dir,
                iteration=model_iteration,
                prefix="f0_network_",
            )
        ),
        phoneme_network=(
            None
            if not to_phoneme_onehot
            else _get_model_path(
                model_dir=model_dir,
                iteration=model_iteration,
                prefix="phoneme_network_",
            )
        ),
        use_gpu=use_gpu,
    )

    dataset = create_dataset(config.dataset)["test"]
    scale = numpy.prod(config.network.scale_list)

    if batch_size is None:
        batch_size = config.train.batch_size

    if isinstance(dataset, SpeakerWavesDataset):
        wave_paths = [data.path_wave for data in dataset.wave_dataset.inputs[:num_test]]
    elif isinstance(dataset, WavesDataset):
        wave_paths = [data.path_wave for data in dataset.inputs[:num_test]]
    else:
        raise Exception()

    if target_glob is not None:
        wave_paths += list(map(Path, glob(target_glob)))

    for wps in tqdm(chunked(wave_paths, batch_size), desc="generate"):
        waves = [Wave.load(p) for p in wps]
        arrays = [w.wave for w in waves]

        # zero-pad every waveform in the batch to the longest, scale-aligned
        # length so the network's downsampling divides evenly
        pad_lengths = [int(numpy.ceil(len(w) / scale) * scale) for w in arrays]
        arrays = [numpy.r_[w, numpy.zeros(max(pad_lengths) - len(w))] for w in arrays]

        tensors = [torch.from_numpy(array.astype(numpy.float32)) for array in arrays]
        output = generator.generate(
            wave=concat_examples(tensors),
            to_voiced_scaler=to_voiced_scaler,
            to_f0_scaler=to_f0_scaler,
            to_phoneme_onehot=to_phoneme_onehot,
        )

        for feature, p, w, l in zip(output, wps, waves, pad_lengths):
            feature = feature.T[: l // scale]  # drop frames that came from padding
            data = SamplingData(array=feature, rate=w.sampling_rate // scale)
            data.save(output_dir / (p.stem + ".npy"))

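# Usage sketch (hypothetical paths; the two annotated `None` fallbacks are the
# ones handled explicitly above):
# generate(
#     model_dir=Path("trained_model"),
#     model_iteration=None,
#     model_config=None,  # falls back to trained_model/config.yaml
#     output_dir=Path("output_features"),
#     to_voiced_scaler=True,
#     to_f0_scaler=True,
#     to_phoneme_onehot=False,
#     batch_size=None,  # falls back to config.train.batch_size
#     num_test=10,
#     target_glob=None,
#     use_gpu=True,
# )
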
def main():
    model_dir: Path = arguments.model_dir
    model_iteration: int = arguments.model_iteration
    model_config: Path = arguments.model_config
    time_length: float = arguments.time_length
    gpu: int = arguments.gpu

    config = create_config(model_config)
    model_path = _get_predictor_model_path(model_dir, model_iteration)

    sr = config.dataset.sampling_rate

    model = create_predictor(config.model)
    chainer.serializers.load_npz(str(model_path), model)
    if gpu is not None:
        model.to_gpu(gpu)
        cuda.get_device_from_id(gpu).use()

    chainer.global_config.train = False
    chainer.global_config.enable_backprop = False

    wave_paths = sorted([Path(p) for p in glob.glob(str(config.dataset.input_wave_glob))])
    local_paths = sorted([Path(p) for p in glob.glob(str(config.dataset.input_local_glob))])
    assert len(wave_paths) == len(local_paths)

    # shuffle both lists with the same seed so wave/local pairs stay aligned
    np.random.RandomState(config.dataset.seed).shuffle(wave_paths)
    np.random.RandomState(config.dataset.seed).shuffle(local_paths)

    wave_path = wave_paths[0]
    local_path = local_paths[0]

    w_data = Wave.load(wave_path, sampling_rate=sr)
    l_data = SamplingData.load(local_path)

    # trim the waveform and local features to a whole number of local frames
    length = int(sr * time_length)
    l_scale = int(sr // l_data.rate)
    l_sl = length // l_scale
    length = l_sl * l_scale

    w = w_data.wave[:length]
    l = l_data.array[:l_sl]

    coarse, fine = encode_16bit(w)
    c, f, hc, hf = model(
        c_array=decode_single(model.xp.asarray(coarse)).astype(np.float32)[np.newaxis],
        f_array=decode_single(model.xp.asarray(fine)).astype(np.float32)[:-1][np.newaxis],
        l_array=model.xp.asarray(l)[np.newaxis],
    )
    c = chainer.functions.softmax(c)
    c = chainer.cuda.to_cpu(c[0].data)
    f = chainer.cuda.to_cpu(f[0].data)

    fig = plt.figure(figsize=[32 * time_length, 10])
    plt.imshow(c, aspect='auto', interpolation='nearest')
    plt.colorbar()
    plt.plot((w + 1) * 127.5, 'g', linewidth=0.1, label='true')
    plt.plot(np.argmax(c, axis=0) + np.argmax(f, axis=0) / 256, 'r', linewidth=0.1, label='predicted')
    plt.legend()
    fig.savefig('output.eps')

def collect_to_tfevents(
    input_dir: Path,
    output_dir: Optional[Path],
    filename_suffix: str,
    audio_tag_format: str,
    diff_tag: str,
    iteration_format: str,
    remove_exist: bool,
    expected_wave_dir: Optional[Path],
):
    if output_dir is None:
        output_dir = input_dir

    if remove_exist:
        for p in output_dir.glob(f"*tfevents*{filename_suffix}"):
            p.unlink()

    flag_calc_diff = expected_wave_dir is not None

    summary_writer = SummaryWriter(logdir=str(output_dir), filename_suffix=filename_suffix)

    diffs: DefaultDict[int, List[float]] = defaultdict(list)
    for p in tqdm(sorted(input_dir.rglob("*"), key=_to_nums), desc=input_dir.stem):
        if p.is_dir():
            continue
        if "tfevents" in p.name:
            continue

        rp = p.relative_to(input_dir)
        iteration = int(iteration_format.format(p=p, rp=rp))

        # audio
        if p.suffix in [".wav"]:
            wave, sr = librosa.load(str(p), sr=None)
            summary_writer.add_audio(
                tag=audio_tag_format.format(p=p, rp=rp),
                snd_tensor=wave,
                sample_rate=sr,
                global_step=iteration,
            )

        # diff
        if flag_calc_diff and p.name.endswith("_woc.wav"):
            wave_id = p.name[:-8]
            expected = expected_wave_dir.joinpath(f"{wave_id}.wav")

            wo = Wave.load(p)
            wi = Wave.load(expected, sampling_rate=wo.sampling_rate)
            diff = calc_mcd(wave1=wi, wave2=wo)
            diffs[iteration].append(diff)

    if flag_calc_diff:
        for iteration, values in sorted(diffs.items()):
            summary_writer.add_scalar(
                tag=diff_tag,
                scalar_value=numpy.mean(values),
                global_step=iteration,
            )

    summary_writer.close()

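# Usage sketch (hypothetical arguments). `iteration_format` is expanded with
# format(p=p, rp=rp) and must yield an integer string, e.g. the name of the
# directory that holds the iteration number:
# collect_to_tfevents(
#     input_dir=Path("generated"),
#     output_dir=None,  # falls back to input_dir
#     filename_suffix=".collect",
#     audio_tag_format="audio/{rp.stem}",
#     diff_tag="mcd",
#     iteration_format="{p.parent.name}",
#     remove_exist=False,
#     expected_wave_dir=Path("dataset/wave"),  # enables the MCD diff
# )
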
async def to_feature(text: str = Form(...), wave: UploadFile = File(...)):
    with TemporaryDirectory() as d:
        tmp_dir = Path(d)

        input_audio_path = tmp_dir.joinpath("input.wav")
        input_audio_path.write_bytes(await wave.read())

        # openjtalk
        phonemes = [
            p.label
            for p in openjtalk_label_getter(
                text,
                openjtalk_command="open_jtalk",
                dict_path=Path("/var/lib/mecab/dic/open-jtalk/naist-jdic"),
                htsvoice_path=Path(
                    "/usr/share/hts-voice/nitech-jp-atr503-m001/nitech_jp_atr503_m001.htsvoice"
                ),
                output_wave_path=tmp_dir.joinpath("wave.wav"),
                output_log_path=tmp_dir.joinpath("log.txt"),
                output_type=OutputType.phoneme,
                without_span=False,
            )
        ]

        # julius
        julius_audio_path = tmp_dir.joinpath("julius.wav")
        subprocess.check_call(
            f"sox {input_audio_path} -r 16000 -b 16 {julius_audio_path}".split()
        )

        julius_phonemes = [
            p if p not in _jvs_to_julius else _jvs_to_julius[p] for p in phonemes if p != "sil"
        ]

        julius_dict_path = tmp_dir.joinpath("2nd.dict")
        julius_dict = sp_inserter.gen_julius_dict_2nd(
            " ".join(julius_phonemes), model_type=sp_inserter.ModelType.gmm
        )
        julius_dict_path.write_text(julius_dict)

        julius_dfa_path = tmp_dir.joinpath("2nd.dfa")
        julius_dfa = sp_inserter.gen_julius_aliment_dfa(julius_dict.count("\n"))
        julius_dfa_path.write_text(julius_dfa)

        julius_output = sp_inserter.julius_phone_alignment(
            str(julius_audio_path),
            str(tmp_dir.joinpath("2nd")),
            _hmm_model,
            model_type=sp_inserter.ModelType.gmm,
            options=None,
        )

        time_alignment_list = sp_inserter.frame_to_second(
            sp_inserter.get_time_alimented_list(julius_output)
        )

        # drop "pau" phonemes that julius did not align to a short pause
        i_phoneme = 0
        new_phonemes = []
        for p in phonemes:
            if p == "pau" and time_alignment_list[i_phoneme][2] != "sp":
                continue
            i_phoneme += 1
            new_phonemes.append(p)

        aligned = JvsPhoneme.convert(
            [
                JvsPhoneme(start=float(o[0]), end=float(o[1]), phoneme=p)
                for p, o in zip(new_phonemes, time_alignment_list)
            ]
        )
        for p in aligned:
            p.verify()

        # world
        f0 = F0.from_wave(
            Wave.load(input_audio_path, sampling_rate=24000, dtype=numpy.float64),
            frame_period=5.0,
            f0_floor=71.0,
            f0_ceil=800,
            with_vuv=False,
            f0_type=F0Type.world,
        )
        converted_f0 = f0.convert(
            input_mean=f0.valid_f0_log.mean(),
            input_var=f0.valid_f0_log.var(),
            target_mean=_voiro_mean,
            target_var=f0.valid_f0_log.var(),
        )
        converted_f0.array = converted_f0.array.astype(numpy.float32).reshape(-1, 1)

        # feature
        phoneme_array = LinguisticFeature(
            phonemes=aligned,
            phoneme_class=JvsPhoneme,
            rate=_feature_rate,
            feature_types=[LinguisticFeature.FeatureType.PHONEME],
        ).make_array()
        phoneme = SamplingData(array=phoneme_array, rate=_feature_rate)

        feature = SamplingData.collect(
            [converted_f0, phoneme],
            rate=_feature_rate,
            mode="min",
            error_time_length=0.015,
        )

        return StreamingResponse(BytesIO(feature.astype(numpy.float32).tobytes()))

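# Client-side sketch for the endpoint above. The route and port are
# assumptions (the route decorator is not shown); the response body is raw
# float32 bytes, so it decodes with numpy.frombuffer. The flat array
# presumably concatenates the converted F0 column and the phoneme one-hot
# columns per frame, so reshaping requires knowing the phoneme feature width.
import numpy
import requests

response = requests.post(
    "http://localhost:8000/to_feature",  # hypothetical URL
    data={"text": "こんにちは"},
    files={"wave": open("input.wav", "rb")},
)
feature = numpy.frombuffer(response.content, dtype=numpy.float32)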