def __init__(self, add_frame_features=False, subphone_features=None,
             use_phone_alignment=False, question_path=None):
    self.add_frame_features = add_frame_features
    self.subphone_features = subphone_features
    self.test_paths = None
    self.use_phone_alignment = use_phone_alignment
    if question_path is None:
        self.binary_dict, self.continuous_dict = hts.load_question_set(
            join(DATA_ROOT, "questions-radio_dnn_416.hed"))
    else:
        self.binary_dict, self.continuous_dict = hts.load_question_set(
            question_path)
def __init__(self, wav_root, label_root, question_path, use_harvest=True,
             f0_floor=150, f0_ceil=700, frame_period=5, mgc_order=59):
    self.wav_root = wav_root
    self.label_root = label_root
    self.binary_dict, self.continuous_dict = hts.load_question_set(
        question_path, append_hat_for_LL=False)
    self.pitch_idx = len(self.binary_dict) + 1
    self.use_harvest = use_harvest
    self.f0_floor = f0_floor
    self.f0_ceil = f0_ceil
    self.frame_period = frame_period
    self.mgc_order = mgc_order
    self.windows = [
        (0, 0, np.array([1.0])),
        (1, 1, np.array([-0.5, 0.0, 0.5])),
        (1, 1, np.array([1.0, -2.0, 1.0])),
    ]
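The three window triplets above are the standard static, delta, and delta-delta windows used for dynamic-feature modeling. A minimal sketch of how such windows are typically consumed with nnmnkwii's MLPG parameter generation; the random arrays below are stand-ins for predicted means and variances, not part of the original code:

import numpy as np
from nnmnkwii import paramgen as G

windows = [
    (0, 0, np.array([1.0])),             # static
    (1, 1, np.array([-0.5, 0.0, 0.5])),  # delta
    (1, 1, np.array([1.0, -2.0, 1.0])),  # delta-delta
]
T, static_dim = 100, 60  # e.g. mgc_order=59 -> 60 static coefficients
means = np.random.rand(T, static_dim * len(windows))
variances = np.ones_like(means)  # dummy frame-wise variances
static = G.mlpg(means, variances, windows)
assert static.shape == (T, static_dim)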
def _process_feature(out_dir, index, label_path, add_frame_features=False,
                     subphone_features=None, question_path=None):
    labels = hts.load(label_path)
    binary_dict, continuous_dict = hts.load_question_set(question_path)
    features = fe.linguistic_features(labels, binary_dict, continuous_dict,
                                      add_frame_features=add_frame_features,
                                      subphone_features=subphone_features)
    n_frames = len(features)
    if add_frame_features:
        # np.int is a removed NumPy alias; use the builtin int instead
        indices = labels.silence_frame_indices().astype(int)
    else:
        indices = labels.silence_phone_indices()
    features = np.delete(features, indices, axis=0)
    voiced_frames = features.shape[0]

    # Write the linguistic features to disk:
    linguistic_filename = 'arctic_%05d.npy' % index
    np.save(os.path.join(out_dir, linguistic_filename),
            features.astype(np.float32), allow_pickle=False)

    # Return a tuple describing this training example:
    return (linguistic_filename, n_frames, voiced_frames)
def get_linguistic_feature(lab_path, question_path, level='phone'):
    if level == 'phone':
        add_frame_features = False
        subphone_features = None
    elif level == 'frame':
        add_frame_features = True
        subphone_features = 'coarse_coding'
    else:
        raise ValueError(
            f'phone and frame are supported, but level={level} is given.')
    binary_dict, continuous_dict = hts.load_question_set(question_path)
    labels = hts.load(lab_path)
    feature = fe.linguistic_features(labels, binary_dict, continuous_dict,
                                     add_frame_features=add_frame_features,
                                     subphone_features=subphone_features)
    if add_frame_features:
        indices = labels.silence_frame_indices().astype(int)
    else:
        indices = labels.silence_phone_indices()
    feature = np.delete(feature, indices, axis=0)
    return feature.astype(np.float32)
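A minimal usage sketch for the helper above; the label and question file names are hypothetical, and any HTS full-context .lab file paired with a Merlin-style .hed question file should work:

# Phone-level features: one row per non-silence phone.
phone_feats = get_linguistic_feature("arctic_a0001.lab",
                                     "questions-radio_dnn_416.hed",
                                     level='phone')
# Frame-level features: one row per non-silence frame, with coarse-coded
# subphone position features appended.
frame_feats = get_linguistic_feature("arctic_a0001.lab",
                                     "questions-radio_dnn_416.hed",
                                     level='frame')
print(phone_feats.shape, frame_feats.shape)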
def test_linguistic_and_duration_features_for_duration_model():
    qs_file_name = join(DATA_DIR, "questions-radio_dnn_416.hed")
    binary_dict, continuous_dict = hts.load_question_set(qs_file_name)

    # Phone-level linguistic features
    input_state_label = join(DATA_DIR, "label_state_align", "arctic_a0001.lab")
    labels = hts.load(input_state_label)
    assert labels.is_state_alignment_label()
    x = fe.linguistic_features(labels, binary_dict, continuous_dict,
                               add_frame_features=False,
                               subphone_features=None)
    y = np.fromfile(join(DATA_DIR, "binary_label_416", "arctic_a0001.lab"),
                    dtype=np.float32).reshape(-1, x.shape[-1])
    assert np.allclose(x, y)

    # Duration features
    labels = hts.load(input_state_label)
    x = fe.duration_features(labels, feature_type="numerical",
                             unit_size="state", feature_size="phoneme")
    y = np.fromfile(join(DATA_DIR, "duration_untrimmed", "arctic_a0001.dur"),
                    dtype=np.float32).reshape(-1, x.shape[-1])
    assert np.allclose(x, y)
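With state-alignment labels, unit_size="state" and feature_size="phoneme" produce one row per phone whose columns are the per-state durations (typically 5 emitting states in the HTS topology). A toy illustration of that layout, with made-up numbers rather than the test data:

import numpy as np

# Hypothetical per-state durations (in frames) for two phones.
durations = np.array([
    [3., 5., 10., 4., 2.],  # phone 1: durations of its 5 HMM states
    [6., 2.,  7., 3., 4.],  # phone 2
], dtype=np.float32)
assert durations.shape == (2, 5)  # (num_phones, num_states)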
def test_singing_voice_question():
    # Test SVS case
    """
    QS "L-Phone_Yuusei_Boin" {*^a-*,*^i-*,*^u-*,*^e-*,*^o-*}
    CQS "e1" {/E:(\\NOTE)]}
    """
    binary_dict, continuous_dict = hts.load_question_set(
        join(DATA_DIR, "test_jp_svs.hed"), append_hat_for_LL=False,
        convert_svs_pattern=True)
    input_phone_label = join(DATA_DIR, "song070_f00001_063.lab")
    labels = hts.load(input_phone_label)
    feats = fe.linguistic_features(labels, binary_dict, continuous_dict)
    assert feats.shape == (74, 3)

    # CQS e1: get the current MIDI number
    C_e1 = continuous_dict[0]
    for idx, lab in enumerate(labels):
        context = lab[-1]
        if C_e1.search(context) is not None:
            from nnmnkwii.frontend import NOTE_MAPPING
            assert NOTE_MAPPING[C_e1.findall(context)[0]] == feats[idx, 1]

    # CQS e57: get the pitch difference
    # In contrast to the other continuous features, the pitch difference has
    # a prefix "m" or "p" to indicate the sign of the number.
    C_e57 = continuous_dict[1]
    for idx, lab in enumerate(labels):
        context = lab[-1]
        if "~p2+" in context:
            assert C_e57.search(context).group(1) == "p2"
            assert feats[idx, 2] == 2
        if "~m2+" in context:
            assert C_e57.search(context).group(1) == "m2"
            assert feats[idx, 2] == -2
def __init__(self, utt_list, wav_root, label_root, question_path,
             use_harvest=True, f0_floor=150, f0_ceil=700, frame_period=5,
             mgc_order=59, num_windows=3, relative_f0=True,
             interp_unvoiced_aperiodicity=True):
    self.utt_list = utt_list
    self.wav_root = wav_root
    self.label_root = label_root
    self.binary_dict, self.continuous_dict = hts.load_question_set(
        question_path, append_hat_for_LL=False)
    self.pitch_idx = len(self.binary_dict) + 1
    self.use_harvest = use_harvest
    self.f0_floor = f0_floor
    self.f0_ceil = f0_ceil
    self.frame_period = frame_period
    self.mgc_order = mgc_order
    self.relative_f0 = relative_f0
    self.interp_unvoiced_aperiodicity = interp_unvoiced_aperiodicity
    self.windows = get_windows(num_windows)
def __init__(self, data_root, max_files=None, add_frame_features=False,
             subphone_features=None):
    self.data_root = data_root
    self.max_files = max_files
    self.add_frame_features = add_frame_features
    self.subphone_features = subphone_features
    self.binary_dict, self.continuous_dict = hts.load_question_set(
        hp_acoustic.question_path)
def test_labels_number_of_frames():
    # https://github.com/r9y9/nnmnkwii/issues/85
    binary_dict, continuous_dict = hts.load_question_set(
        join(DATA_DIR, "jp.hed"))
    labels = hts.load(join(DATA_DIR, "BASIC5000_0619.lab"))
    linguistic_features = fe.linguistic_features(
        labels, binary_dict, continuous_dict, add_frame_features=True)
    assert labels.num_frames() == linguistic_features.shape[0]
def load_qst(question_path, append_hat_for_LL=False) -> tuple:
    """Read a question.hed file and return
    binary_dict, continuous_dict, pitch_indices, and pitch_idx.
    """
    binary_dict, continuous_dict = hts.load_question_set(
        question_path, append_hat_for_LL=append_hat_for_LL)
    pitch_indices = np.arange(len(binary_dict), len(binary_dict) + 3)
    pitch_idx = len(binary_dict) + 1
    return (binary_dict, continuous_dict, pitch_indices, pitch_idx)
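A minimal usage sketch; the .hed path is hypothetical. Binary questions occupy the leading feature columns and the continuous questions follow, so in NNSVS-style question sets the three consecutive pitch features, assumed here to describe the previous, current, and next note, start right at len(binary_dict):

# Hypothetical question file path.
binary_dict, continuous_dict, pitch_indices, pitch_idx = load_qst("jp_qst.hed")
print(pitch_indices)  # [len(binary_dict), len(binary_dict) + 1, len(binary_dict) + 2]
print(pitch_idx)      # len(binary_dict) + 1, i.e. the current-note pitch column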
def __init__(self, utt_list, data_root, question_path,
             add_frame_features=False, subphone_features=None,
             log_f0_conditioning=True):
    self.data_root = data_root
    self.utt_list = utt_list
    self.add_frame_features = add_frame_features
    self.subphone_features = subphone_features
    self.binary_dict, self.continuous_dict = hts.load_question_set(
        question_path, append_hat_for_LL=False)
    self.log_f0_conditioning = log_f0_conditioning
    self.pitch_idx = np.arange(len(self.binary_dict),
                               len(self.binary_dict) + 3)
def __init__(self, data_root=None, question_path=None, max_num_files=-1):
    # Build the *.lab file list
    self.input_lab_files = sorted(
        glob.glob(DATA_ROOT + "/label_phone_align/*.lab"))
    # `is not` on an int literal is unreliable; compare with != instead
    if max_num_files != -1:
        self.input_lab_files = self.input_lab_files[:max_num_files]
    # File names without extensions:
    # self.file_ids = [path.split('/')[-1][:-4] for path in input_lab_files]

    # Build dictionaries from the *.hed question file
    self.binary_dict, self.continuous_dict = hts.load_question_set(
        question_path)
def synthesis(config, device, label_path, question_path,
              timelag_model, timelag_config, timelag_in_scaler,
              timelag_out_scaler, duration_model, duration_config,
              duration_in_scaler, duration_out_scaler, acoustic_model,
              acoustic_config, acoustic_in_scaler, acoustic_out_scaler):
    # Load labels and questions
    labels = hts.load(label_path).round_()
    binary_dict, continuous_dict = hts.load_question_set(
        question_path, append_hat_for_LL=False)

    # Pitch indices in the input features
    # TODO: make this configurable
    pitch_idx = len(binary_dict) + 1
    pitch_indices = np.arange(len(binary_dict), len(binary_dict) + 3)

    log_f0_conditioning = config.log_f0_conditioning

    if config.ground_truth_duration:
        # Use the provided alignment
        duration_modified_labels = labels
    else:
        # Time-lag predictions
        lag = predict_timelag(device, labels, timelag_model, timelag_config,
                              timelag_in_scaler, timelag_out_scaler,
                              binary_dict, continuous_dict, pitch_indices,
                              log_f0_conditioning,
                              config.timelag.allowed_range)
        # Duration predictions
        durations = predict_duration(device, labels, duration_model,
                                     duration_config, duration_in_scaler,
                                     duration_out_scaler, lag, binary_dict,
                                     continuous_dict, pitch_indices,
                                     log_f0_conditioning)
        # Normalize phoneme durations
        duration_modified_labels = postprocess_duration(labels, durations, lag)

    # Predict acoustic features
    acoustic_features = predict_acoustic(
        device, duration_modified_labels, acoustic_model, acoustic_config,
        acoustic_in_scaler, acoustic_out_scaler, binary_dict, continuous_dict,
        config.acoustic.subphone_features, pitch_indices, log_f0_conditioning)

    # Waveform generation
    generated_waveform = gen_waveform(
        duration_modified_labels, acoustic_features, binary_dict,
        continuous_dict, acoustic_config.stream_sizes,
        acoustic_config.has_dynamic_features,
        config.acoustic.subphone_features, log_f0_conditioning, pitch_idx,
        acoustic_config.num_windows, config.acoustic.post_filter,
        config.sample_rate, config.frame_period, config.acoustic.relative_f0)

    return generated_waveform
def test_htk_style_question_basics():
    binary_dict, continuous_dict = hts.load_question_set(
        join(DATA_DIR, "test_question.hed"))
    # sil k o n i ch i w a sil
    input_phone_label = join(DATA_DIR, "hts-nit-atr503", "phrase01.lab")
    labels = hts.load(input_phone_label)

    # Test if we can handle wildcards correctly.
    # Also test the basic phone contexts (LL, L, C, R, RR):
    """
    QS "LL-Phone_Muon1" {sil^,pau^}    # without wildcards (*)
    QS "LL-Phone_Muon2" {sil^*,pau^*}  # with *, should be equivalent to the above
    QS "L-Phone_Muon1" {*^sil-*,*^pau-*}
    QS "C-Phone_sil" {*-sil+*}
    QS "R-Phone_o" {*+o=*}
    QS "RR-Phone_o" {*=o/A:*}
    """
    LL_muon1 = binary_dict[0][0]
    LL_muon2 = binary_dict[1][0]
    L_muon1 = binary_dict[2][0]
    C_sil = binary_dict[3][0]
    R_phone_o = binary_dict[4][0]
    RR_phone_o = binary_dict[5][0]

    # xx^xx-sil+k=o
    label = labels[0][-1]
    assert LL_muon1.search(label) is None
    assert LL_muon2.search(label) is None
    assert L_muon1.search(label) is None
    assert C_sil.search(label)
    assert R_phone_o.search(label) is None
    assert RR_phone_o.search(label)

    # xx^sil-k+o=N
    label = labels[1][-1]
    assert LL_muon1.search(label) is None
    assert LL_muon2.search(label) is None
    assert L_muon1.search(label)
    assert C_sil.search(label) is None
    assert R_phone_o.search(label)
    assert RR_phone_o.search(label) is None

    # sil^k-o+N=n
    label = labels[2][-1]
    assert LL_muon1.search(label)
    assert LL_muon2.search(label)
    assert L_muon1.search(label) is None
    assert C_sil.search(label) is None
    assert R_phone_o.search(label) is None
    assert RR_phone_o.search(label) is None

    # Slice/list indexing
    assert str(labels[:2]) == str(labels[[0, 1]])
def test_correct_vuv_by_phone():
    wav_path = Path(__file__).parent / "data" / "nitech_jp_song070_f001_004.wav"
    lab_path = Path(__file__).parent / "data" / "nitech_jp_song070_f001_004.lab"
    binary_dict, numeric_dict = hts.load_question_set(
        Path(__file__).parent / "data" / "jp_test.hed"
    )
    labels = hts.load(lab_path)

    sr, wav = wavfile.read(wav_path)
    wav = wav.astype(np.float64)
    assert sr == 48000

    out_feats, stream_sizes = _extract_static_feats(wav, sr)
    has_dynamic_features = [False] * len(stream_sizes)
    pitch_idx = len(binary_dict) + 1

    linguistic_features = fe.linguistic_features(
        labels,
        binary_dict,
        numeric_dict,
        add_frame_features=True,
        subphone_features="coarse_coding",
    )

    params = {
        "labels": labels,
        "acoustic_features": out_feats,
        "binary_dict": binary_dict,
        "numeric_dict": numeric_dict,
        "stream_sizes": stream_sizes,
        "has_dynamic_features": has_dynamic_features,
        "pitch_idx": pitch_idx,
        "relative_f0": False,
        "frame_period": 5,
    }

    out_vuv_idx = 61
    vuv = out_feats[:, out_vuv_idx : out_vuv_idx + 1]
    vuv_corrected = correct_vuv_by_phone(vuv, binary_dict, linguistic_features)

    # Correcting V/UV should make a difference
    _, _, vuv_fixed, _ = gen_spsvs_static_features(**{**params, "force_fix_vuv": True})
    assert np.any(vuv_corrected != vuv)

    # 0: Rest, 1: Voiced, 2: Unvoiced
    rest_idx = 0
    voiced_idx = 1
    unvoiced_idx = 2
    assert np.all(vuv_corrected[linguistic_features[:, rest_idx] > 0] < 0.5)
    assert np.all(vuv_corrected[linguistic_features[:, voiced_idx] > 0] > 0.5)
    assert np.all(vuv_corrected[linguistic_features[:, unvoiced_idx] > 0] < 0.5)
def test_invalid_linguistic_features():
    binary_dict, continuous_dict = hts.load_question_set(
        example_question_file())
    phone_labels = hts.load(example_label_file(phone_level=True))
    state_labels = hts.load(example_label_file(phone_level=False))

    @raises(ValueError)
    def __test(labels, subphone_features, add_frame_features):
        fe.linguistic_features(labels, binary_dict, continuous_dict,
                               subphone_features=subphone_features,
                               add_frame_features=add_frame_features)

    yield __test, phone_labels, "full", True
    yield __test, phone_labels, "full", False
    yield __test, state_labels, "full", False
def __init__(self, duration_model, acoustic_model, question_file, config,
             duration_dataset=None, acoustic_dataset=None, device='cpu'):
    self.duration_model = duration_model
    self.acoustic_model = acoustic_model
    self.duration_dataset = duration_dataset
    self.acoustic_dataset = acoustic_dataset
    self.bin_dict, self.con_dict = hts.load_question_set(question_file)
    self.config = config
    self.device = device
def test_linguistic_features_for_acoustic_model():
    qs_file_name = join(DATA_DIR, "questions-radio_dnn_416.hed")
    binary_dict, continuous_dict = hts.load_question_set(qs_file_name)

    # Linguistic features
    # To train an acoustic model paired with linguistic features,
    # we need a frame-level linguistic feature representation.
    input_state_label = join(DATA_DIR, "label_state_align", "arctic_a0001.lab")
    labels = hts.load(input_state_label)
    assert labels.is_state_alignment_label()
    x = fe.linguistic_features(labels, binary_dict, continuous_dict,
                               add_frame_features=True,
                               subphone_features="full")
    y = np.fromfile(join(DATA_DIR, "binary_label_425", "arctic_a0001.lab"),
                    dtype=np.float32).reshape(-1, x.shape[-1])
    assert np.allclose(x, y)
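Why the reference file has 425 columns while the question file yields 416 features: subphone_features="full" appends frame-level position features, 9 of them in the Merlin convention, as the directory names binary_label_416 and binary_label_425 reflect. A trivial check of the arithmetic:

n_questions = 416    # QS + CQS entries in questions-radio_dnn_416.hed
n_subphone_full = 9  # frame/state position features added by "full"
assert n_questions + n_subphone_full == 425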
def __init__(
    self,
    utt_list,
    wav_root,
    label_root,
    question_path,
    use_harvest=True,
    f0_floor=150,
    f0_ceil=700,
    frame_period=5,
    mgc_order=59,
    num_windows=3,
    relative_f0=True,
    interp_unvoiced_aperiodicity=True,
    vibrato_mode="none",  # "none", "diff" or "sine"
    sample_rate=48000,
    d4c_threshold=0.85,
    trajectory_smoothing=False,
    trajectory_smoothing_cutoff=50,
    correct_vuv=False,
):
    self.utt_list = utt_list
    self.wav_root = wav_root
    self.label_root = label_root
    self.binary_dict, self.continuous_dict = hts.load_question_set(
        question_path, append_hat_for_LL=False
    )
    self.pitch_idx = len(self.binary_dict) + 1
    self.use_harvest = use_harvest
    self.f0_floor = f0_floor
    self.f0_ceil = f0_ceil
    self.frame_period = frame_period
    self.mgc_order = mgc_order
    self.relative_f0 = relative_f0
    self.interp_unvoiced_aperiodicity = interp_unvoiced_aperiodicity
    self.vibrato_mode = vibrato_mode
    self.windows = get_windows(num_windows)
    self.sample_rate = sample_rate
    self.d4c_threshold = d4c_threshold
    self.trajectory_smoothing = trajectory_smoothing
    self.trajectory_smoothing_cutoff = trajectory_smoothing_cutoff
    self.correct_vuv = correct_vuv
def test_singing_voice_question():
    # Test SVS case
    """
    QS "L-Phone_Yuusei_Boin" {*^a-*,*^i-*,*^u-*,*^e-*,*^o-*}
    CQS "e1" {/E:(\\NOTE)]}
    """
    binary_dict, continuous_dict = hts.load_question_set(
        join(DATA_DIR, "test_jp_svs.hed"), append_hat_for_LL=False)
    input_phone_label = join(DATA_DIR, "song070_f00001_063.lab")
    labels = hts.load(input_phone_label)
    feats = fe.linguistic_features(labels, binary_dict, continuous_dict)
    assert feats.shape == (74, 2)

    # CQS e1: get the current MIDI number
    C_e1 = continuous_dict[0]
    for idx, lab in enumerate(labels):
        context = lab[-1]
        if C_e1.search(context) is not None:
            from nnmnkwii.frontend import NOTE_MAPPING
            assert NOTE_MAPPING[C_e1.findall(context)[0]] == feats[idx, 1]
def test_silence_frame_removal_given_hts_labels():
    qs_file_name = join(DATA_DIR, "questions-radio_dnn_416.hed")
    binary_dict, continuous_dict = hts.load_question_set(qs_file_name)

    input_state_label = join(DATA_DIR, "label_state_align", "arctic_a0001.lab")
    labels = hts.load(input_state_label)
    features = fe.linguistic_features(labels, binary_dict, continuous_dict,
                                      add_frame_features=True,
                                      subphone_features="full")

    # Remove silence frames
    indices = labels.silence_frame_indices()
    features = np.delete(features, indices, axis=0)

    y = np.fromfile(join(DATA_DIR, "nn_no_silence_lab_425", "arctic_a0001.lab"),
                    dtype=np.float32).reshape(-1, features.shape[-1])
    assert features.shape == y.shape
    assert np.allclose(features, y)
def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x):
    # NOTE: the executor and futures are set up but unused;
    # the loop below processes utterances synchronously.
    executor = ProcessPoolExecutor(max_workers=num_workers)
    futures = []

    speakers = available_speakers
    wd = WavFileDataSource(in_dir, speakers=speakers)
    td = TranscriptionFileDataSource(in_dir, speakers=speakers)
    wav_paths = wd.collect_files()
    lab_paths = td.collect_files()
    speaker_ids = wd.labels

    binary_dict, continuous_dict = hts.load_question_set(
        join(in_dir, "questions", hparams.question_fn))

    result = []
    for index, (speaker_id, wav_path, lab_path) in enumerate(
            zip(speaker_ids, wav_paths, lab_paths)):
        result.append(
            _process_utterance(out_dir, index + 1, speaker_id, wav_path,
                               lab_path, binary_dict, continuous_dict, "N/A"))
    return result
def test_gen_spsvs_static_features():
    wav_path = Path(__file__).parent / "data" / "nitech_jp_song070_f001_004.wav"
    lab_path = Path(__file__).parent / "data" / "nitech_jp_song070_f001_004.lab"
    binary_dict, numeric_dict = hts.load_question_set(
        Path(__file__).parent / "data" / "jp_test.hed"
    )
    labels = hts.load(lab_path)

    sr, wav = wavfile.read(wav_path)
    wav = wav.astype(np.float64)
    assert sr == 48000

    out_feats, stream_sizes = _extract_static_feats(wav, sr)
    has_dynamic_features = [False] * len(stream_sizes)
    pitch_idx = len(binary_dict) + 1

    params = {
        "labels": labels,
        "acoustic_features": out_feats,
        "binary_dict": binary_dict,
        "numeric_dict": numeric_dict,
        "stream_sizes": stream_sizes,
        "has_dynamic_features": has_dynamic_features,
        "pitch_idx": pitch_idx,
        "relative_f0": False,
        "frame_period": 5,
        "force_fix_vuv": False,
    }
    mgc, lf0, vuv, bap = gen_spsvs_static_features(**params)

    assert mgc.shape[1] == 60
    assert lf0.shape[1] == 1
    assert vuv.shape[1] == 1
    assert bap.shape[1] == 5

    # Without V/UV correction, the V/UV stream shouldn't change
    out_vuv_idx = 61
    N = np.abs(vuv - out_feats[:, out_vuv_idx : out_vuv_idx + 1]).sum()
    assert int(N) == 0
def __init__(
    self,
    duration_model,
    acoustic_model,
    scaler,
    config,
    speakers,
    question_path,
    device,
):
    self.duration_model = duration_model
    self.acoustic_model = acoustic_model
    self.scaler = scaler
    self.binary_dict, self.continuous_dict = hts.load_question_set(question_path)
    self.device = device
    self.config = config

    # Prepare delta windows
    if config['n_delta'] in (0, 1, 2):
        windows = [
            (0, 0, np.array([1.0])),
            (1, 1, np.array([-0.5, 0.0, 0.5])),
            (1, 1, np.array([1.0, -2.0, 1.0])),
        ]
        self.windows = windows[:config['n_delta'] + 1]
    else:
        raise ValueError('only n_delta = 0, 1, 2 is supported')

    self.lf0_start_idx = (config['mcep_order'] + 1) * (config['n_delta'] + 1)
    self.vuv_start_idx = self.lf0_start_idx + (config['n_delta'] + 1)
    self.bap_start_idx = self.vuv_start_idx + 1

    # Prepare the speaker-code dictionary (one-hot vector per speaker)
    with open(speakers, 'r') as f:
        speakers = [item.strip() for item in f]
    speaker_codes = np.eye(len(speakers), dtype=np.float32)
    self.speaker_codes = {speakers[i]: speaker_codes[i]
                          for i in range(len(speakers))}
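A quick worked example of the stream-index arithmetic above, assuming mcep_order=59 and n_delta=2 (values that match the other snippets on this page): the mel-cepstrum stream holds (59 + 1) static coefficients times 3 windows, so lf0 starts at column 180, V/UV at 183, and band aperiodicity at 184:

mcep_order, n_delta = 59, 2  # assumed, matching the other examples
lf0_start_idx = (mcep_order + 1) * (n_delta + 1)  # 60 * 3 = 180
vuv_start_idx = lf0_start_idx + (n_delta + 1)     # 180 + 3 = 183
bap_start_idx = vuv_start_idx + 1                 # 184; V/UV is a single static value
assert (lf0_start_idx, vuv_start_idx, bap_start_idx) == (180, 183, 184)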
def test_phone_alignment_label():
    qs_file_name = join(DATA_DIR, "questions-radio_dnn_416.hed")
    binary_dict, continuous_dict = hts.load_question_set(qs_file_name)

    input_state_label = join(DATA_DIR, "label_phone_align", "arctic_a0001.lab")
    labels = hts.load(input_state_label)
    x = fe.linguistic_features(labels, binary_dict, continuous_dict,
                               add_frame_features=False,
                               subphone_features=None)
    assert not labels.is_state_alignment_label()
    assert np.all(np.isfinite(x))

    for subphone_features in ["coarse_coding", "minimal_phoneme"]:
        x = fe.linguistic_features(labels, binary_dict, continuous_dict,
                                   add_frame_features=True,
                                   subphone_features=subphone_features)
        assert np.all(np.isfinite(x))

    x = fe.duration_features(labels)
    assert np.all(np.isfinite(x))
from nnmnkwii.frontend import merlin as fe
from nnmnkwii.postfilters import merlin_post_filter

import gantts
from gantts.multistream import multi_stream_mlpg, get_static_features
from gantts.multistream import get_static_stream_sizes, select_streams
from gantts.seqloss import MaskedMSELoss, sequence_mask

from hparams import tts_acoustic as hp_acoustic
from hparams import tts_duration as hp_duration
from train import NPYDataSource

use_cuda = torch.cuda.is_available()

binary_dict, continuous_dict = hts.load_question_set(hp_acoustic.question_path)


def gen_parameters(y_predicted, Y_mean, Y_std, mge_training=True):
    mgc_dim, lf0_dim, vuv_dim, bap_dim = hp_acoustic.stream_sizes

    mgc_start_idx = 0
    lf0_start_idx = mgc_dim
    vuv_start_idx = lf0_start_idx + lf0_dim
    bap_start_idx = vuv_start_idx + vuv_dim

    windows = hp_acoustic.windows

    ty = "acoustic"

    # MGE training
def acoustic2world(config: DictConfig, path_timing, path_acoustic,
                   path_f0, path_spcetrogram, path_aperiodicity):
    """Read the acoustic-feature matrix from CSV and write the WORLD
    parameters (F0, spectrogram, aperiodicity) as CSV files.
    """
    # Set up the logger
    global logger  # pylint: disable=global-statement
    logger = getLogger(config.verbose)
    logger.info(OmegaConf.to_yaml(config))

    # Load the timing labels
    duration_modified_labels = hts.load(path_timing).round_()

    # Whether CUDA is available
    # device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Load the model configuration
    typ = 'acoustic'
    model_config = OmegaConf.load(to_absolute_path(config[typ].model_yaml))

    # Resolve the question (.hed) file path
    question_path = to_absolute_path(config.question_path)
    # hts2wav.py does this instead, which allows a separate hed file
    # per model:
    # if config[typ].question_path is None:
    #     config[typ].question_path = config.question_path

    # Load the hed file as dictionaries
    binary_dict, continuous_dict = hts.load_question_set(
        question_path, append_hat_for_LL=False)

    # Pitch indices in the input features
    pitch_idx = len(binary_dict) + 1
    # pitch_indices = np.arange(len(binary_dict), len(binary_dict) + 3)  # pylint: disable=no-member

    # Read the acoustic-feature matrix
    acoustic_features = np.loadtxt(path_acoustic, delimiter=',',
                                   dtype=np.float64)

    # Extract the WORLD parameters from the acoustic features
    f0, spectrogram, aperiodicity = gen_world_params(
        duration_modified_labels, acoustic_features, binary_dict,
        continuous_dict, model_config.stream_sizes,
        model_config.has_dynamic_features,
        subphone_features=config.acoustic.subphone_features,
        pitch_idx=pitch_idx, num_windows=model_config.num_windows,
        post_filter=config.acoustic.post_filter,
        sample_rate=config.sample_rate, frame_period=config.frame_period,
        relative_f0=config.acoustic.relative_f0, vibrato_scale=1.0,
        vuv_threshold=0.3)

    # Write each WORLD parameter matrix as a CSV file
    for path, array in ((path_f0, f0), (path_spcetrogram, spectrogram),
                        (path_aperiodicity, aperiodicity)):
        np.savetxt(path, array, fmt='%.16f', delimiter=',')
def test_load_question_set():
    binary_dict, continuous_dict = hts.load_question_set(
        example_question_file())
    assert len(binary_dict) + len(continuous_dict) == 416
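A sketch of inspecting the loaded question set; the file name is assumed, and the per-entry layout with a compiled regex at index [0] matches how binary_dict is used in test_htk_style_question_basics above:

from nnmnkwii.io import hts

binary_dict, continuous_dict = hts.load_question_set("test_question.hed")
print(len(binary_dict), "binary (QS) questions,",
      len(continuous_dict), "continuous (CQS) questions")
pattern = binary_dict[3][0]                   # e.g. QS "C-Phone_sil" {*-sil+*}
print(bool(pattern.search("xx^xx-sil+k=o")))  # True: the current phone is sil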
def _score2duration(config: DictConfig, labels):
    """Generate duration labels from full_score and timelag labels."""
    # ---------------------------------------------------------
    # From here: contents of nnsvs.bin.synthesis.my_app() ------
    # ---------------------------------------------------------
    # Set up the logger
    global logger  # pylint: disable=global-statement
    logger = getLogger(config.verbose)
    logger.info(OmegaConf.to_yaml(config))

    typ = 'duration'

    # Whether CUDA is available
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # In place of maybe_set_checkpoints_(config)
    set_checkpoint(config, typ)
    # In place of maybe_set_normalization_stats_(config)
    set_normalization_stat(config, typ)

    # Load the model configuration and weights
    model_config = OmegaConf.load(to_absolute_path(config[typ].model_yaml))
    model = hydra.utils.instantiate(model_config.netG).to(device)
    checkpoint = torch.load(config[typ].checkpoint,
                            map_location=lambda storage, loc: storage)
    model.load_state_dict(checkpoint['state_dict'])
    in_scaler = joblib.load(config[typ].in_scaler_path)
    out_scaler = joblib.load(config[typ].out_scaler_path)
    model.eval()
    # ---------------------------------------------------------
    # Up to here: contents of nnsvs.bin.synthesis.my_app() -----
    # ---------------------------------------------------------

    # ---------------------------------------------------------
    # From here: contents of nnsvs.bin.synthesis.synthesis() ---
    # ---------------------------------------------------------
    # Read the full_score labels:
    # labels = hts.load(score_path).round_()
    # The current duration model does not use the time-lag labels:
    # timelag = hts.load(timelag_path).round_()

    # Resolve the question (.hed) file path
    question_path = to_absolute_path(config.question_path)
    # hts2wav.py does this instead, which allows a separate hed file
    # per model:
    # if config[typ].question_path is None:
    #     config[typ].question_path = config.question_path

    # Load the hed file as dictionaries
    binary_dict, continuous_dict = hts.load_question_set(
        question_path, append_hat_for_LL=False)

    # Pitch indices in the input features
    # pitch_idx = len(binary_dict) + 1
    pitch_indices = np.arange(len(binary_dict), len(binary_dict) + 3)

    # Read the F0 settings
    log_f0_conditioning = config.log_f0_conditioning

    # Apply the duration model
    duration = predict_duration(device, labels, model, model_config,
                                in_scaler, out_scaler, binary_dict,
                                continuous_dict, pitch_indices,
                                log_f0_conditioning,
                                force_clip_input_features=False)

    # Return the duration tuple or ndarray
    return duration
def timing2acoustic(config: DictConfig, timing_path, acoustic_path):
    """Read a full-context timing label and write the acoustic-feature
    matrix as a CSV file.
    """
    # ---------------------------------------------------------
    # From here: contents of nnsvs.bin.synthesis.my_app() ------
    # ---------------------------------------------------------
    # Set up the logger
    global logger  # pylint: disable=global-statement
    logger = getLogger(config.verbose)
    logger.info(OmegaConf.to_yaml(config))

    typ = 'acoustic'

    # Whether CUDA is available
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # In place of maybe_set_checkpoints_(config)
    set_checkpoint(config, typ)
    # In place of maybe_set_normalization_stats_(config)
    set_normalization_stat(config, typ)

    # Load the model configuration and weights
    model_config = OmegaConf.load(to_absolute_path(config[typ].model_yaml))
    model = hydra.utils.instantiate(model_config.netG).to(device)
    checkpoint = torch.load(config[typ].checkpoint,
                            map_location=lambda storage, loc: storage)
    model.load_state_dict(checkpoint['state_dict'])
    in_scaler = joblib.load(config[typ].in_scaler_path)
    out_scaler = joblib.load(config[typ].out_scaler_path)
    model.eval()
    # ---------------------------------------------------------
    # Up to here: contents of nnsvs.bin.synthesis.my_app() -----
    # ---------------------------------------------------------

    # ---------------------------------------------------------
    # From here: contents of nnsvs.bin.synthesis.synthesis() ---
    # ---------------------------------------------------------
    # Read the timing labels
    duration_modified_labels = hts.load(timing_path).round_()

    # Resolve the question (.hed) file path
    question_path = to_absolute_path(config.question_path)
    # hts2wav.py does this instead, which allows a separate hed file
    # per model:
    # if config[typ].question_path is None:
    #     config[typ].question_path = config.question_path

    # Load the hed file as dictionaries
    binary_dict, continuous_dict = hts.load_question_set(
        question_path, append_hat_for_LL=False)

    # Pitch indices in the input features
    # pitch_idx = len(binary_dict) + 1
    pitch_indices = np.arange(len(binary_dict), len(binary_dict) + 3)

    # Read the F0 settings
    log_f0_conditioning = config.log_f0_conditioning

    acoustic_features = predict_acoustic(device, duration_modified_labels,
                                         model, model_config, in_scaler,
                                         out_scaler, binary_dict,
                                         continuous_dict,
                                         config.acoustic.subphone_features,
                                         pitch_indices, log_f0_conditioning)

    # Write the acoustic-feature matrix as a CSV file
    np.savetxt(acoustic_path, acoustic_features, delimiter=',')