def test_linguistic_and_duration_features_for_duration_model():
    qs_file_name = join(DATA_DIR, "questions-radio_dnn_416.hed")
    binary_dict, continuous_dict = hts.load_question_set(qs_file_name)

    # Phone-level linguistic features
    input_state_label = join(DATA_DIR, "label_state_align", "arctic_a0001.lab")
    labels = hts.load(input_state_label)
    assert labels.is_state_alignment_label()
    x = fe.linguistic_features(labels, binary_dict, continuous_dict,
                               add_frame_features=False,
                               subphone_features=None)
    y = np.fromfile(join(DATA_DIR, "binary_label_416", "arctic_a0001.lab"),
                    dtype=np.float32).reshape(-1, x.shape[-1])
    assert np.allclose(x, y)

    # Duration features
    labels = hts.load(input_state_label)
    x = fe.duration_features(labels, feature_type="numerical",
                             unit_size="state", feature_size="phoneme")
    y = np.fromfile(join(DATA_DIR, "duration_untrimmed", "arctic_a0001.dur"),
                    dtype=np.float32).reshape(-1, x.shape[-1])
    assert np.allclose(x, y)
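# A minimal usage sketch of the two extractors exercised by the test above.
# The label path is a placeholder, and the 5-column duration shape assumes the
# usual 5-state HMM topology of HTS state-alignment labels.
from nnmnkwii.io import hts
from nnmnkwii.frontend import merlin as fe

labels = hts.load("arctic_a0001.lab")  # state-level alignment label
dur = fe.duration_features(labels, feature_type="numerical",
                           unit_size="state", feature_size="phoneme")
print(dur.shape)  # (num_phonemes, 5) for 5-state labels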
def collect_features(self, label_score_path, label_align_path):
    label_score = hts.load(label_score_path)
    label_align = hts.load(label_align_path)
    timelag = np.asarray(label_align.start_times) \
        - np.asarray(label_score.start_times)
    # 100ns -> num frames
    timelag = timelag.astype(np.float32) / 50000
    return timelag.reshape(-1, 1)
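# A quick check of the unit conversion above (illustrative values): start times
# are in 100 ns units and one frame is 50000 such units (a 5 ms frame shift),
# so a 0.25 s time-lag corresponds to 50 frames.
import numpy as np

lag_100ns = np.array([2500000, -500000])
print(lag_100ns.astype(np.float32) / 50000)  # [ 50. -10.]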
def collect_features(self, path):
    labels = hts.load(path)
    features = fe.duration_features(labels)
    indices = labels.silence_phone_indices()
    features = np.delete(features, indices, axis=0)
    return features.astype(np.float32)
def _process_utterance(out_dir, index, wav_path, text):
    sr = hparams.sample_rate

    # Load the audio to a numpy array:
    wav = dv3.audio.load_wav(wav_path)

    lab_path = wav_path.replace("wav/", "lab/").replace(".wav", ".lab")

    # Trim silence from hts labels if available
    if exists(lab_path):
        labels = hts.load(lab_path)
        assert labels[0][-1] == "silB"
        assert labels[-1][-1] == "silE"
        b = int(labels[0][1] * 1e-7 * sr)
        e = int(labels[-1][0] * 1e-7 * sr)
        wav = wav[b:e]
    else:
        wav, _ = librosa.effects.trim(wav, top_db=30)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = dv3.audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = dv3.audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = 'jsut-spec-%05d.npy' % index
    mel_filename = 'jsut-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text)
def collect_features(self, wav_path, label_path):
    fs, x = wavfile.read(wav_path)
    x = x.astype(np.float64)
    f0, timeaxis = pyworld.dio(x, fs, frame_period=frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

    bap = pyworld.code_aperiodicity(aperiodicity, fs)
    mgc = pysptk.sp2mc(spectrogram, order=order,
                       alpha=pysptk.util.mcepalpha(fs))
    f0 = f0[:, None]
    lf0 = f0.copy()
    nonzero_indices = np.nonzero(f0)
    lf0[nonzero_indices] = np.log(f0[nonzero_indices])
    vuv = (lf0 != 0).astype(np.float32)
    lf0 = interp1d(lf0, kind="slinear")

    mgc = apply_delta_windows(mgc, windows)
    lf0 = apply_delta_windows(lf0, windows)
    bap = apply_delta_windows(bap, windows)

    features = np.hstack((mgc, lf0, vuv, bap))

    # Cut silence frames by HTS alignment
    labels = hts.load(label_path)
    features = features[:labels.num_frames()]
    indices = labels.silence_frame_indices()
    features = np.delete(features, indices, axis=0)

    return features.astype(np.float32)
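# The function above relies on module-level names (`frame_period`, `order`,
# `windows`, `interp1d`, `apply_delta_windows`). Below is a sketch of plausible
# definitions following common nnmnkwii recipes; the exact values are
# assumptions, and `apply_delta_windows` is taken to be a thin wrapper around
# nnmnkwii's delta_features.
import numpy as np
from nnmnkwii.preprocessing import interp1d, delta_features

frame_period = 5  # ms
order = 59        # mel-cepstrum order

# static, delta and delta-delta windows
windows = [
    (0, 0, np.array([1.0])),
    (1, 1, np.array([-0.5, 0.0, 0.5])),
    (1, 1, np.array([1.0, -2.0, 1.0])),
]


def apply_delta_windows(x, windows):
    # append dynamic features to the static stream
    return delta_features(x, windows)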
def gen_duration(self, utt_id, label_path):
    # prepare phoneme-level linguistic features
    # (note: the original loaded an undefined `lab_path`; the `label_path`
    # argument is what is meant here)
    labels = hts.load(label_path)
    feature = fe.linguistic_features(
        labels, self.binary_dict, self.continuous_dict,
        add_frame_features=False,
        subphone_features=None).astype(np.float32)

    # normalize
    feature = self.scaler['X']['duration'].transform(feature)

    # add speaker information
    feature = self.add_speaker_code(utt_id, feature)

    # predict phoneme durations
    feature = torch.from_numpy(feature).to(device)
    duration = self.duration_model.predict(feature)['mean'].data.cpu().numpy()

    # denormalize
    duration = self.scaler['Y']['duration'].inverse_transform(duration)
    duration = np.round(duration)

    # set minimum duration to 1
    duration[duration <= 0] = 1
    labels.set_durations(duration)

    return labels
def _process_audio(out_dir, index, wav_path):
    sr = hparams.sample_rate

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    lab_path = wav_path.replace("wav/", "lab/").replace(".wav", ".lab")

    # Trim silence from hts labels if available
    if os.path.exists(lab_path):
        labels = hts.load(lab_path)
        assert labels[0][-1] == "silB"
        assert labels[-1][-1] == "silE"
        begin = int(labels[0][1] * 1e-7 * sr)
        end = int(labels[-1][0] * 1e-7 * sr)
        wav = wav[begin:end]
    else:
        wav, _ = librosa.effects.trim(wav, top_db=30)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    filename = 'jsut-target-%05d.tfrecords' % index
    write_preprocessed_target_data(index, spectrogram.T, mel_spectrogram.T,
                                   os.path.join(out_dir, filename))

    # Return a tuple describing this training example:
    return TargetMetaData(index, filename, n_frames)
def get_linguistic_feature(lab_path, question_path, level='phone'):
    if level == 'phone':
        add_frame_features = False
        subphone_features = None
    elif level == 'frame':
        add_frame_features = True
        subphone_features = 'coarse_coding'
    else:
        raise ValueError(
            f'phone and frame are supported, but level={level} is given.')

    binary_dict, continuous_dict = hts.load_question_set(question_path)
    labels = hts.load(lab_path)
    feature = fe.linguistic_features(labels, binary_dict, continuous_dict,
                                     add_frame_features=add_frame_features,
                                     subphone_features=subphone_features)

    if add_frame_features:
        indices = labels.silence_frame_indices().astype(int)
    else:
        indices = labels.silence_phone_indices()
    feature = np.delete(feature, indices, axis=0)

    return feature.astype(np.float32)
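# Hypothetical usage of get_linguistic_feature (both paths are placeholders):
# phone-level features feed a duration model, frame-level features feed an
# acoustic model.
X_dur = get_linguistic_feature("arctic_a0001.lab", "questions_jp.hed",
                               level="phone")
X_aco = get_linguistic_feature("arctic_a0001.lab", "questions_jp.hed",
                               level="frame")
print(X_dur.shape, X_aco.shape)  # (num_phonemes, D) vs. (num_frames, D')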
def gen_duration(device, label_path, binary_dict, continuous_dict,
                 X_min, X_max, Y_mean, Y_scale, duration_model):
    # Linguistic features for duration
    hts_labels = hts.load(label_path)
    duration_linguistic_features = fe.linguistic_features(
        hts_labels, binary_dict, continuous_dict,
        add_frame_features=False,
        subphone_features=None).astype(np.float32)

    # Apply normalization
    ty = "duration"
    duration_linguistic_features = minmax_scale(
        duration_linguistic_features, X_min[ty], X_max[ty],
        feature_range=(0.01, 0.99))

    # Apply model
    duration_model.eval()
    x = torch.FloatTensor(duration_linguistic_features)
    duration_predicted = duration_model(x.unsqueeze(0)).data.numpy()
    print("duration_predicted shape: {}".format(duration_predicted.shape))

    # Apply denormalization
    duration_predicted = duration_predicted * Y_scale[ty] + Y_mean[ty]
    duration_predicted = np.round(duration_predicted)

    # Set minimum state duration to 1
    duration_predicted[duration_predicted <= 0] = 1
    hts_labels.set_durations(duration_predicted)

    return hts_labels
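# A minimal sketch of the min-max scaling used above, assuming the
# nnmnkwii.preprocessing implementation of minmax_scale (values illustrative):
# the per-dimension minimum maps to 0.01 and the maximum to 0.99.
import numpy as np
from nnmnkwii.preprocessing import minmax_scale

x = np.array([[0.0], [5.0], [10.0]], dtype=np.float32)
print(minmax_scale(x, x.min(0), x.max(0), feature_range=(0.01, 0.99)).ravel())
# [0.01 0.5  0.99]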
def _process_feature(out_dir, index, label_path, add_frame_features=False,
                     subphone_features=None, question_path=None):
    labels = hts.load(label_path)
    binary_dict, continuous_dict = hts.load_question_set(question_path)
    features = fe.linguistic_features(labels, binary_dict, continuous_dict,
                                      add_frame_features=add_frame_features,
                                      subphone_features=subphone_features)
    n_frames = len(features)

    if add_frame_features:
        # np.int was removed in NumPy 1.24; use the builtin int
        indices = labels.silence_frame_indices().astype(int)
    else:
        indices = labels.silence_phone_indices()
    features = np.delete(features, indices, axis=0)
    voiced_frames = features.shape[0]

    # Write the linguistic features to disk:
    linguistic_filename = 'arctic_%05d.npy' % index
    np.save(os.path.join(out_dir, linguistic_filename),
            features.astype(np.float32), allow_pickle=False)

    # Return a tuple describing this training example:
    return (linguistic_filename, n_frames, voiced_frames)
def test_one_utt(txt, duration_model, acoustic_model, post_filter=True):
    # Predict durations
    # e.g. txt = '中华人民共和国中央人民政府今天成立了'
    label = txt2label(txt)
    # labels can also be loaded from a file: hts.load(path=label_path)
    hts_labels = hts.load(lines=label)
    duration_modified_hts_labels = gen_duration(hts_labels, duration_model)

    # Linguistic features
    linguistic_features = fe.linguistic_features(
        duration_modified_hts_labels, binary_dict, continuous_dict,
        add_frame_features=True,
        subphone_features="coarse_coding")

    # Trim silences
    indices = duration_modified_hts_labels.silence_frame_indices()
    linguistic_features = np.delete(linguistic_features, indices, axis=0)
    linguistic_features = X_acoustic_mms.transform(linguistic_features)

    if len(acoustic_model.inputs[0].shape) == 3:
        # RNN: add a batch axis
        n1, n2 = linguistic_features.shape
        linguistic_features = linguistic_features.reshape(1, n1, n2)
        acoustic_predicted = acoustic_model.predict(linguistic_features)
        acoustic_predicted = acoustic_predicted.reshape(
            acoustic_predicted.shape[1], acoustic_predicted.shape[2])
    else:
        acoustic_predicted = acoustic_model.predict(linguistic_features)
    acoustic_predicted = Y_acoustic_std.inverse_transform(acoustic_predicted)

    out = gen_waveform(acoustic_predicted, post_filter)
    out = out.astype(np.int16)
    return out
def process_lab(lab_files, out_dir, shift_in_cent):
    # use the function argument rather than the global `args`
    shift_in_note = shift_in_cent // 100
    for lab_file in tqdm(lab_files):
        labels = hts.load(lab_file)
        name = basename(lab_file)
        new_contexts = []
        for label in labels:
            context = label[-1]
            for pre, post in [("/D:", "!"), ("/E:", "]"), ("/F:", "#")]:
                match = re.search(f"{pre}([A-Z][b]?[0-9]+){post}", context)
                # if not "xx"
                if match is not None:
                    assert len(match.groups()) == 1
                    note = match.group(0)[3:-1]
                    note_index = NOTE_MAPPING[note]
                    note_shifted = MIDI_MAPPING[note_index + shift_in_note]
                    context = context.replace(
                        match.group(0), f"{pre}{note_shifted}{post}", 1)
            new_contexts.append(context)
        labels.contexts = new_contexts

        postfix = str(shift_in_cent).replace("-", "minus") + "cent_aug"
        dst_lab_file = join(out_dir, name.replace(".lab", f"_{postfix}.lab"))
        with open(dst_lab_file, "w") as of:
            of.write(str(labels))
def test_hts_append():
    lab_path = join(DATA_DIR, "BASIC5000_0001.lab")
    test_labels = hts.load(lab_path)
    print("\n{}".format(test_labels))

    # should get same string representation
    labels = hts.HTSLabelFile()
    assert str(labels) == ""
    for label in test_labels:
        labels.append(label)
    assert str(test_labels) == str(labels)

    @raises(ValueError)
    def test_invalid_start_time():
        l = hts.HTSLabelFile()
        l.append((100000, 0, "NG"))

    def test_succeeding_times():
        l = hts.HTSLabelFile()
        l.append((0, 1000000, "OK"))
        l.append((1000000, 2000000, "OK"))

    @raises(ValueError)
    def test_non_succeeding_times():
        l = hts.HTSLabelFile()
        l.append((0, 1000000, "OK"))
        l.append((1500000, 2000000, "NG"))

    test_invalid_start_time()
    test_succeeding_times()
    test_non_succeeding_times()
def test_singing_voice_question():
    # Test SVS case
    """
    QS "L-Phone_Yuusei_Boin" {*^a-*,*^i-*,*^u-*,*^e-*,*^o-*}
    CQS "e1" {/E:(\\NOTE)]}
    """
    binary_dict, continuous_dict = hts.load_question_set(
        join(DATA_DIR, "test_jp_svs.hed"),
        append_hat_for_LL=False, convert_svs_pattern=True)
    input_phone_label = join(DATA_DIR, "song070_f00001_063.lab")
    labels = hts.load(input_phone_label)
    feats = fe.linguistic_features(labels, binary_dict, continuous_dict)
    assert feats.shape == (74, 3)

    # CQS e1: get the current MIDI note number
    C_e1 = continuous_dict[0]
    for idx, lab in enumerate(labels):
        context = lab[-1]
        if C_e1.search(context) is not None:
            from nnmnkwii.frontend import NOTE_MAPPING
            assert NOTE_MAPPING[C_e1.findall(context)[0]] == feats[idx, 1]

    # CQS e57: get the pitch difference
    # In contrast to other continuous features, the pitch difference has a
    # prefix "m" or "p" to indicate the sign of the number.
    C_e57 = continuous_dict[1]
    for idx, lab in enumerate(labels):
        context = lab[-1]
        if "~p2+" in context:
            assert C_e57.search(context).group(1) == "p2"
            assert feats[idx, 2] == 2
        if "~m2+" in context:
            assert C_e57.search(context).group(1) == "m2"
            assert feats[idx, 2] == -2
def tts_from_label(models, label_path, X_min, X_max, Y_mean, Y_std,
                   post_filter=False, apply_duration_model=True,
                   coef=1.4, fs=16000):
    duration_model, acoustic_model = models["duration"], models["acoustic"]
    if use_cuda:
        duration_model = duration_model.cuda()
        acoustic_model = acoustic_model.cuda()

    # Predict durations
    if apply_duration_model:
        duration_modified_hts_labels = gen_duration(
            label_path, duration_model, X_min, X_max, Y_mean, Y_std)
    else:
        duration_modified_hts_labels = hts.load(label_path)

    # Linguistic features
    linguistic_features = fe.linguistic_features(
        duration_modified_hts_labels, binary_dict, continuous_dict,
        add_frame_features=hp_acoustic.add_frame_features,
        subphone_features=hp_acoustic.subphone_features)

    # Trim silences
    indices = duration_modified_hts_labels.silence_frame_indices()
    linguistic_features = np.delete(linguistic_features, indices, axis=0)

    # Apply normalization
    ty = "acoustic"
    linguistic_features = P.minmax_scale(
        linguistic_features, X_min[ty], X_max[ty], feature_range=(0.01, 0.99))

    # Predict acoustic features
    acoustic_model.eval()
    x = Variable(torch.from_numpy(linguistic_features)).float()
    xl = len(x)
    x = x.view(1, -1, x.size(-1))
    x = _generator_input(hp_duration, x)
    x = x.cuda() if use_cuda else x
    acoustic_predicted = acoustic_model(x, [xl]).data.cpu().numpy()
    acoustic_predicted = acoustic_predicted.reshape(
        -1, acoustic_predicted.shape[-1])

    return gen_waveform(acoustic_predicted, Y_mean, Y_std, post_filter,
                        coef=coef, fs=fs)
def _process_feature(out_dir, index, wav_path, label_path):
    # Sanity check: the wav directory must contain wav files
    wav_files = os.listdir(os.path.dirname(wav_path))
    assert len(wav_files) != 0 and wav_files[0][-4:] == '.wav', \
        "no wav files found!"

    fs, x = wavfile.read(wav_path)
    x = x.astype(np.float64)
    f0, timeaxis = pyworld.dio(x, fs, frame_period=frame_period)
    n_frames = len(f0)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

    bap = pyworld.code_aperiodicity(aperiodicity, fs)
    mgc = pysptk.sp2mc(spectrogram, order=order,
                       alpha=pysptk.util.mcepalpha(fs))
    f0 = f0[:, None]
    lf0 = f0.copy()
    nonzero_indices = np.nonzero(f0)
    lf0[nonzero_indices] = np.log(f0[nonzero_indices])
    vuv = (lf0 != 0).astype(np.float32)
    lf0 = interp1d(lf0, kind="slinear")

    mgc = apply_delta_windows(mgc, windows)
    lf0 = apply_delta_windows(lf0, windows)
    bap = apply_delta_windows(bap, windows)

    features = np.hstack((mgc, lf0, vuv, bap))

    # Sanity check: the label directory must contain lab files
    lab_files = os.listdir(os.path.dirname(label_path))
    assert len(lab_files) != 0 and lab_files[0][-4:] == '.lab', \
        "no lab files found!"

    # Cut silence frames by HTS alignment
    labels = hts.load(label_path)
    features = features[:labels.num_frames()]
    indices = labels.silence_frame_indices()
    features = np.delete(features, indices, axis=0)
    voiced_frames = features.shape[0]

    # Write the acoustic features to disk:
    acoustic_filename = 'arctic_%05d.npy' % index
    np.save(os.path.join(out_dir, acoustic_filename),
            features.astype(np.float32), allow_pickle=False)

    # Track dataset ids (dataset_ids is a module-level list)
    dataset_ids.append(acoustic_filename[:-4])
    with open(os.path.join(os.path.dirname(out_dir), 'dataset_ids.pkl'),
              'wb') as pklFile:
        pickle.dump(dataset_ids, pklFile)

    # Return a tuple describing this training example:
    return (acoustic_filename, n_frames, voiced_frames)
def test_invalid_linguistic_features():
    binary_dict, continuous_dict = hts.load_question_set(
        example_question_file())
    phone_labels = hts.load(example_label_file(phone_level=True))
    state_labels = hts.load(example_label_file(phone_level=False))

    @raises(ValueError)
    def __test(labels, subphone_features, add_frame_features):
        fe.linguistic_features(labels, binary_dict, continuous_dict,
                               subphone_features=subphone_features,
                               add_frame_features=add_frame_features)

    yield __test, phone_labels, "full", True
    yield __test, phone_labels, "full", False
    yield __test, state_labels, "full", False
def collect_features(self, path):
    # 1. Load labels -> 2. load dicts from questions -> 3. parse linguistic features
    labels = hts.load(path)
    features = fe.linguistic_features(
        labels, self.binary_dict, self.continuous_dict,
        add_frame_features=True,
        # subphone_features may be None or 'coarse_coding'; coarse coding adds
        # the extra columns 416-419
        subphone_features='coarse_coding')
    return features.astype(np.float32)
def test_labels_number_of_frames():
    # https://github.com/r9y9/nnmnkwii/issues/85
    binary_dict, continuous_dict = hts.load_question_set(
        join(DATA_DIR, "jp.hed"))
    labels = hts.load(join(DATA_DIR, "BASIC5000_0619.lab"))
    linguistic_features = fe.linguistic_features(
        labels, binary_dict, continuous_dict, add_frame_features=True)
    assert labels.num_frames() == linguistic_features.shape[0]
def collect_features(self, path):
    labels = hts.load(path)
    features = fe.linguistic_features(
        labels, self.binary_dict, self.continuous_dict,
        add_frame_features=self.add_frame_features,
        subphone_features=self.subphone_features)
    if self.log_f0_conditioning:
        for idx in self.pitch_idx:
            features[:, idx] = interp1d(
                _midi_to_hz(features, idx, True), kind="slinear")
    return features.astype(np.float32)
def test_invalid_duration_features():
    phone_labels = hts.load(example_label_file(phone_level=True))

    @raises(ValueError)
    def __test(labels, unit_size, feature_size):
        fe.duration_features(labels, unit_size=unit_size,
                             feature_size=feature_size)

    yield __test, phone_labels, None, "frame"
def _process_utterance_single(out_dir, text, wav_path, hparams=hparams):
    # modified version of the LJSpeech _process_utterance
    audio.set_hparams(hparams)

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)
    sr = hparams.sample_rate

    # Added from the multispeaker version
    lab_path = wav_path.replace("wav48/", "lab/").replace(".wav", ".lab")
    if not exists(lab_path):
        lab_path = os.path.splitext(wav_path)[0] + '.lab'

    # Trim silence from hts labels if available
    if exists(lab_path):
        labels = hts.load(lab_path)
        wav = clean_by_phoneme(labels, wav, sr)
        wav, _ = librosa.effects.trim(wav, top_db=25)
    else:
        if hparams.process_only_htk_aligned:
            return None
        wav, _ = librosa.effects.trim(wav, top_db=15)
    # End added from the multispeaker version

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max
    if hparams.max_audio_length != 0 and librosa.core.get_duration(
            y=wav, sr=sr) > hparams.max_audio_length:
        return None
    if hparams.min_audio_length != 0 and librosa.core.get_duration(
            y=wav, sr=sr) < hparams.min_audio_length:
        return None

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk, named after the wav file:
    wav_name = os.path.basename(wav_path)
    wav_name = os.path.splitext(wav_name)[0]
    spectrogram_filename = 'spec-{}.npy'.format(wav_name)
    mel_filename = 'mel-{}.npy'.format(wav_name)
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text)
def _process_utterance(out_dir, text, wav_path, speaker_id=None):
    # Fall back to the single-speaker version if no speaker id is given
    if speaker_id is None:
        return _process_utterance_single(out_dir, text, wav_path)

    # modified version of the VCTK _process_utterance
    sr = hparams.sample_rate

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    lab_path = wav_path.replace("wav48/", "lab/").replace(".wav", ".lab")
    if not exists(lab_path):
        lab_path = os.path.splitext(wav_path)[0] + '.lab'

    # Trim silence from hts labels if available
    if exists(lab_path):
        labels = hts.load(lab_path)
        b = int(start_at(labels) * 1e-7 * sr)
        e = int(end_at(labels) * 1e-7 * sr)
        wav = wav[b:e]
        wav, _ = librosa.effects.trim(wav, top_db=25)
    else:
        if hparams.process_only_htk_aligned:
            return None
        wav, _ = librosa.effects.trim(wav, top_db=15)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk, named after the wav file.
    # The speaker id is included in case wav files across different speakers
    # share the same naming format, e.g. Recording0.wav.
    wav_name = os.path.basename(wav_path)
    wav_name = os.path.splitext(wav_name)[0]
    spectrogram_filename = 'spec-{}-{}.npy'.format(speaker_id, wav_name)
    mel_filename = 'mel-{}-{}.npy'.format(speaker_id, wav_name)
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text, speaker_id)
def synthesis(config, device, label_path, question_path,
              timelag_model, timelag_config, timelag_in_scaler,
              timelag_out_scaler, duration_model, duration_config,
              duration_in_scaler, duration_out_scaler, acoustic_model,
              acoustic_config, acoustic_in_scaler, acoustic_out_scaler):
    # load labels and question set
    labels = hts.load(label_path).round_()
    binary_dict, continuous_dict = hts.load_question_set(
        question_path, append_hat_for_LL=False)

    # pitch indices in the input features
    # TODO: make this configurable
    pitch_idx = len(binary_dict) + 1
    pitch_indices = np.arange(len(binary_dict), len(binary_dict) + 3)
    log_f0_conditioning = config.log_f0_conditioning

    if config.ground_truth_duration:
        # Use the provided alignment
        duration_modified_labels = labels
    else:
        # Predict time-lag
        lag = predict_timelag(device, labels, timelag_model, timelag_config,
                              timelag_in_scaler, timelag_out_scaler,
                              binary_dict, continuous_dict, pitch_indices,
                              log_f0_conditioning,
                              config.timelag.allowed_range)
        # Predict phoneme durations
        durations = predict_duration(device, labels, duration_model,
                                     duration_config, duration_in_scaler,
                                     duration_out_scaler, lag, binary_dict,
                                     continuous_dict, pitch_indices,
                                     log_f0_conditioning)
        # Normalize phoneme durations
        duration_modified_labels = postprocess_duration(labels, durations, lag)

    # Predict acoustic features
    acoustic_features = predict_acoustic(
        device, duration_modified_labels, acoustic_model, acoustic_config,
        acoustic_in_scaler, acoustic_out_scaler, binary_dict, continuous_dict,
        config.acoustic.subphone_features, pitch_indices, log_f0_conditioning)

    # Waveform generation
    generated_waveform = gen_waveform(
        duration_modified_labels, acoustic_features, binary_dict,
        continuous_dict, acoustic_config.stream_sizes,
        acoustic_config.has_dynamic_features,
        config.acoustic.subphone_features, log_f0_conditioning, pitch_idx,
        acoustic_config.num_windows, config.acoustic.post_filter,
        config.sample_rate, config.frame_period, config.acoustic.relative_f0)

    return generated_waveform
def test_htk_style_question_basics():
    binary_dict, continuous_dict = hts.load_question_set(
        join(DATA_DIR, "test_question.hed"))
    # sil k o n i ch i w a sil
    input_phone_label = join(DATA_DIR, "hts-nit-atr503", "phrase01.lab")
    labels = hts.load(input_phone_label)

    # Test if we can handle wildcards correctly
    # also test basic phone contexts (LL, L, C, R, RR)
    """
    QS "LL-Phone_Muon1" {sil^,pau^}    # without wildcards (*)
    QS "LL-Phone_Muon2" {sil^*,pau^*}  # with *, should be equivalent with above
    QS "L-Phone_Muon1"  {*^sil-*,*^pau-*}
    QS "C-Phone_sil"    {*-sil+*}
    QS "R-Phone_o"      {*+o=*}
    QS "RR-Phone_o"     {*=o/A:*}
    """
    LL_muon1 = binary_dict[0][0]
    LL_muon2 = binary_dict[1][0]
    L_muon1 = binary_dict[2][0]
    C_sil = binary_dict[3][0]
    R_phone_o = binary_dict[4][0]
    RR_phone_o = binary_dict[5][0]

    # xx^xx-sil+k=o
    label = labels[0][-1]
    assert LL_muon1.search(label) is None
    assert LL_muon2.search(label) is None
    assert L_muon1.search(label) is None
    assert C_sil.search(label)
    assert R_phone_o.search(label) is None
    assert RR_phone_o.search(label)

    # xx^sil-k+o=N
    label = labels[1][-1]
    assert LL_muon1.search(label) is None
    assert LL_muon2.search(label) is None
    assert L_muon1.search(label)
    assert C_sil.search(label) is None
    assert R_phone_o.search(label)
    assert RR_phone_o.search(label) is None

    # sil^k-o+N=n
    label = labels[2][-1]
    assert LL_muon1.search(label)
    assert LL_muon2.search(label)
    assert L_muon1.search(label) is None
    assert C_sil.search(label) is None
    assert R_phone_o.search(label) is None
    assert RR_phone_o.search(label) is None

    # Slice and list indexing should be equivalent
    assert str(labels[:2]) == str(labels[[0, 1]])
def test_correct_vuv_by_phone():
    wav_path = Path(__file__).parent / "data" / "nitech_jp_song070_f001_004.wav"
    lab_path = Path(__file__).parent / "data" / "nitech_jp_song070_f001_004.lab"
    binary_dict, numeric_dict = hts.load_question_set(
        Path(__file__).parent / "data" / "jp_test.hed")
    labels = hts.load(lab_path)

    sr, wav = wavfile.read(wav_path)
    wav = wav.astype(np.float64)
    assert sr == 48000

    out_feats, stream_sizes = _extract_static_feats(wav, sr)
    has_dynamic_features = [False] * len(stream_sizes)
    pitch_idx = len(binary_dict) + 1

    linguistic_features = fe.linguistic_features(
        labels,
        binary_dict,
        numeric_dict,
        add_frame_features=True,
        subphone_features="coarse_coding",
    )

    params = {
        "labels": labels,
        "acoustic_features": out_feats,
        "binary_dict": binary_dict,
        "numeric_dict": numeric_dict,
        "stream_sizes": stream_sizes,
        "has_dynamic_features": has_dynamic_features,
        "pitch_idx": pitch_idx,
        "relative_f0": False,
        "frame_period": 5,
    }

    out_vuv_idx = 61
    vuv = out_feats[:, out_vuv_idx : out_vuv_idx + 1]
    vuv_corrected = correct_vuv_by_phone(vuv, binary_dict, linguistic_features)

    # correcting VUV should make a difference
    _, _, vuv_fixed, _ = gen_spsvs_static_features(
        **{**params, "force_fix_vuv": True})
    assert np.any(vuv_corrected != vuv)

    # 0: Rest 1: Voiced 2: Unvoiced
    rest_idx = 0
    voiced_idx = 1
    unvoiced_idx = 2
    assert np.all(vuv_corrected[linguistic_features[:, rest_idx] > 0] < 0.5)
    assert np.all(vuv_corrected[linguistic_features[:, voiced_idx] > 0] > 0.5)
    assert np.all(vuv_corrected[linguistic_features[:, unvoiced_idx] > 0] < 0.5)
def collect_features(self, path):
    labels = hts.load(path)
    features = fe.linguistic_features(
        labels, self.binary_dict, self.continuous_dict,
        add_frame_features=self.add_frame_features,
        subphone_features=self.subphone_features)
    if self.add_frame_features:
        # np.int was removed in NumPy 1.24; use the builtin int
        indices = labels.silence_frame_indices().astype(int)
    else:
        indices = labels.silence_phone_indices()
    features = np.delete(features, indices, axis=0)
    return features.astype(np.float32)
def test_mono():
    lab_path = join(DATA_DIR, "BASIC5000_0001.lab")
    labels = hts.load(lab_path)
    assert not labels.is_state_alignment_label()

    # Should detect begin/end sil regions
    sil_regex = re.compile("sil")
    for indices in [
            labels.silence_label_indices(sil_regex),
            labels.silence_phone_indices(sil_regex)]:
        assert len(indices) == 2
        assert indices[0] == 0
        assert indices[1] == len(labels) - 1
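# A minimal in-memory variant of the silence lookup above (label content is
# illustrative; times are in 100 ns units).
import re
from nnmnkwii.io import hts

labels = hts.HTSLabelFile()
labels.append((0, 1000000, "sil"))
labels.append((1000000, 2000000, "a"))
labels.append((2000000, 3000000, "sil"))
assert list(labels.silence_phone_indices(re.compile("sil"))) == [0, 2]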
def collect_features(self, wav_path, label_path):
    fs, x = wavfile.read(wav_path)
    x = x.astype(np.float64)
    if hp_acoustic.use_harvest:
        f0, timeaxis = pyworld.harvest(
            x, fs, frame_period=hp_acoustic.frame_period,
            f0_floor=hp_acoustic.f0_floor, f0_ceil=hp_acoustic.f0_ceil)
    else:
        f0, timeaxis = pyworld.dio(
            x, fs, frame_period=hp_acoustic.frame_period,
            f0_floor=hp_acoustic.f0_floor, f0_ceil=hp_acoustic.f0_ceil)
        f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
    bap = pyworld.code_aperiodicity(aperiodicity, fs)

    if self.alpha is None:
        self.alpha = pysptk.util.mcepalpha(fs)
    mgc = pysptk.sp2mc(spectrogram, order=hp_acoustic.order, alpha=self.alpha)

    f0 = f0[:, None]
    lf0 = f0.copy()
    nonzero_indices = np.nonzero(f0)
    lf0[nonzero_indices] = np.log(f0[nonzero_indices])
    if hp_acoustic.use_harvest:
        # https://github.com/mmorise/World/issues/35#issuecomment-306521887
        vuv = (aperiodicity[:, 0] < 0.5).astype(np.float32)[:, None]
    else:
        vuv = (lf0 != 0).astype(np.float32)
    lf0 = P.interp1d(lf0, kind=hp_acoustic.f0_interpolation_kind)

    # Parameter trajectory smoothing
    if hp_acoustic.mod_spec_smoothing:
        hop_length = int(fs * (hp_acoustic.frame_period * 0.001))
        modfs = fs / hop_length
        mgc = P.modspec_smoothing(
            mgc, modfs, cutoff=hp_acoustic.mod_spec_smoothing_cutoff)

    mgc = P.delta_features(mgc, hp_acoustic.windows)
    lf0 = P.delta_features(lf0, hp_acoustic.windows)
    bap = P.delta_features(bap, hp_acoustic.windows)

    features = np.hstack((mgc, lf0, vuv, bap))

    # Cut silence frames by HTS alignment
    labels = hts.load(label_path)
    features = features[:labels.num_frames()]
    indices = labels.silence_frame_indices()
    features = np.delete(features, indices, axis=0)

    return features.astype(np.float32)
def _process_utterance(out_dir, index, speaker_id, wav_path, text):
    sr = hparams.sample_rate
    filename = os.path.basename(wav_path).replace('.wav', '')

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    lab_path = wav_path.replace("wav48/", "lab/").replace(".wav", ".lab")

    # Trim silence from hts labels if available
    if exists(lab_path):
        labels = hts.load(lab_path)
        b = int(start_at(labels) * 1e-7 * sr)
        e = int(end_at(labels) * 1e-7 * sr)
        wav = wav[b:e]
        # librosa trim seems to cut off the ending part of speech
        wav, _ = librosa.effects.trim(wav, top_db=25)
    else:
        wav, _ = librosa.effects.trim(wav, top_db=15)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Save the trimmed wav
    save_wav_path = re.sub('wav48', 'wav_trim_22050', wav_path)
    save_dir = os.path.dirname(save_wav_path)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    audio.save_wav(wav, save_wav_path)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = '{}-spec.npy'.format(filename)
    mel_filename = '{}-mel.npy'.format(filename)
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text, speaker_id)
def tts_from_label(models, label_path, X_min, X_max, Y_mean, Y_std,
                   post_filter=False, apply_duration_model=True,
                   coef=1.4, fs=16000, mge_training=True):
    duration_model, acoustic_model = models["duration"], models["acoustic"]
    if use_cuda:
        duration_model = duration_model.cuda()
        acoustic_model = acoustic_model.cuda()

    # Predict durations
    if apply_duration_model:
        duration_modified_hts_labels = gen_duration(
            label_path, duration_model, X_min, X_max, Y_mean, Y_std)
    else:
        duration_modified_hts_labels = hts.load(label_path)

    # Linguistic features
    linguistic_features = fe.linguistic_features(
        duration_modified_hts_labels, binary_dict, continuous_dict,
        add_frame_features=hp_acoustic.add_frame_features,
        subphone_features=hp_acoustic.subphone_features)

    # Trim silences
    indices = duration_modified_hts_labels.silence_frame_indices()
    linguistic_features = np.delete(linguistic_features, indices, axis=0)

    # Apply normalization
    ty = "acoustic"
    linguistic_features = P.minmax_scale(
        linguistic_features, X_min[ty], X_max[ty], feature_range=(0.01, 0.99))

    # Predict acoustic features
    acoustic_model.eval()
    x = Variable(torch.from_numpy(linguistic_features)).float()
    xl = len(x)
    x = x.view(1, -1, x.size(-1))
    x = _generator_input(hp_duration, x)
    x = x.cuda() if use_cuda else x
    acoustic_predicted = acoustic_model(x, [xl]).data.cpu().numpy()
    acoustic_predicted = acoustic_predicted.reshape(
        -1, acoustic_predicted.shape[-1])

    return gen_waveform(acoustic_predicted, Y_mean, Y_std, post_filter,
                        coef=coef, fs=fs, mge_training=mge_training)
def gen_duration(label_path, duration_model, X_min, X_max, Y_mean, Y_std):
    # Linguistic features for duration
    hts_labels = hts.load(label_path)
    duration_linguistic_features = fe.linguistic_features(
        hts_labels, binary_dict, continuous_dict,
        add_frame_features=hp_duration.add_frame_features,
        subphone_features=hp_duration.subphone_features).astype(np.float32)

    # Apply normalization
    ty = "duration"
    duration_linguistic_features = P.minmax_scale(
        duration_linguistic_features,
        X_min[ty], X_max[ty], feature_range=(0.01, 0.99))

    # Apply model
    duration_model.eval()
    x = Variable(torch.from_numpy(duration_linguistic_features)).float()
    xl = len(x)
    x = x.view(1, -1, x.size(-1))
    x = _generator_input(hp_duration, x)
    x = x.cuda() if use_cuda else x
    duration_predicted = duration_model(x, [xl]).data.cpu().numpy()
    duration_predicted = duration_predicted.reshape(
        -1, duration_predicted.shape[-1])

    # Apply denormalization
    duration_predicted = P.inv_scale(duration_predicted, Y_mean[ty], Y_std[ty])
    duration_predicted = np.round(duration_predicted)

    # Set minimum state duration to 1
    duration_predicted[duration_predicted <= 0] = 1
    hts_labels.set_durations(duration_predicted)

    return hts_labels
def collect_features(self, path):
    labels = hts.load(path)
    features = fe.duration_features(labels)
    indices = labels.silence_phone_indices()
    features = np.delete(features, indices, axis=0)
    return features.astype(np.float32)