def gen_waveform(labels, acoustic_features, acoustic_out_scaler,
                 binary_dict, continuous_dict, stream_sizes,
                 has_dynamic_features,
                 subphone_features="coarse_coding",
                 log_f0_conditioning=True,
                 pitch_idx=None,
                 num_windows=3,
                 post_filter=True,
                 sample_rate=48000,
                 frame_period=5,
                 relative_f0=True):
    windows = get_windows(num_windows)

    # Apply MLPG if necessary
    if np.any(has_dynamic_features):
        acoustic_features = multi_stream_mlpg(
            acoustic_features, acoustic_out_scaler.var_, windows,
            stream_sizes, has_dynamic_features)
        static_stream_sizes = get_static_stream_sizes(
            stream_sizes, has_dynamic_features, len(windows))
    else:
        static_stream_sizes = stream_sizes

    # Split multi-stream features
    mgc, target_f0, vuv, bap = split_streams(acoustic_features,
                                             static_stream_sizes)

    # Generate waveform with the WORLD vocoder
    fftlen = pyworld.get_cheaptrick_fft_size(sample_rate)
    alpha = pysptk.util.mcepalpha(sample_rate)

    if post_filter:
        mgc = merlin_post_filter(mgc, alpha)

    spectrogram = pysptk.mc2sp(mgc, fftlen=fftlen, alpha=alpha)
    aperiodicity = pyworld.decode_aperiodicity(
        bap.astype(np.float64), sample_rate, fftlen)

    ### F0 ###
    if relative_f0:
        diff_lf0 = target_f0
        # Need to extract the pitch sequence from the musical score
        linguistic_features = fe.linguistic_features(
            labels, binary_dict, continuous_dict,
            add_frame_features=True,
            subphone_features=subphone_features)
        f0_score = _midi_to_hz(linguistic_features, pitch_idx, False)[:, None]
        lf0_score = f0_score.copy()
        nonzero_indices = np.nonzero(lf0_score)
        lf0_score[nonzero_indices] = np.log(f0_score[nonzero_indices])
        lf0_score = interp1d(lf0_score, kind="slinear")

        f0 = diff_lf0 + lf0_score
        f0[vuv < 0.5] = 0
        f0[np.nonzero(f0)] = np.exp(f0[np.nonzero(f0)])
    else:
        f0 = target_f0

    generated_waveform = pyworld.synthesize(
        f0.flatten().astype(np.float64),
        spectrogram.astype(np.float64),
        aperiodicity.astype(np.float64),
        sample_rate, frame_period)

    return generated_waveform
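# NOTE (editor's sketch): `get_windows` is used above but not defined in this
# excerpt. A minimal version consistent with its usage, following the common
# Merlin/nnmnkwii convention of (left, right, coefficients) regression windows
# for static, delta, and delta-delta features, might look like the following.
# This is an assumption for illustration, not necessarily the exact
# implementation used here.
def _example_get_windows(num_windows=3):
    windows = [
        (0, 0, np.array([1.0])),             # static
        (1, 1, np.array([-0.5, 0.0, 0.5])),  # delta
        (1, 1, np.array([1.0, -2.0, 1.0])),  # delta-delta
    ]
    return windows[:num_windows]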
def _gen_static_features(model, model_config, in_feats, out_scaler):
    if model.prediction_type() == PredictionType.PROBABILISTIC:
        max_mu, max_sigma = model.inference(in_feats, [in_feats.shape[1]])
        if np.any(model_config.has_dynamic_features):
            # Apply denormalization
            # (B, T, D_out) -> (T, D_out)
            max_sigma_sq = (max_sigma.squeeze(0).cpu().data.numpy() ** 2
                            * out_scaler.var_)
            max_mu = out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy())

            # Apply MLPG
            # (T, D_out) -> (T, static_dim)
            out_feats = multi_stream_mlpg(
                max_mu,
                max_sigma_sq,
                get_windows(model_config.num_windows),
                model_config.stream_sizes,
                model_config.has_dynamic_features,
            )
        else:
            # (T, D_out)
            out_feats = max_mu.squeeze(0).cpu().data.numpy()
            out_feats = out_scaler.inverse_transform(out_feats)
    else:
        out_feats = (model.inference(
            in_feats, [in_feats.shape[1]]).squeeze(0).cpu().data.numpy())
        out_feats = out_scaler.inverse_transform(out_feats)

        # Apply MLPG if necessary
        if np.any(model_config.has_dynamic_features):
            out_feats = multi_stream_mlpg(
                out_feats,
                out_scaler.var_,
                get_windows(model_config.num_windows),
                model_config.stream_sizes,
                model_config.has_dynamic_features,
            )

    return out_feats.astype(np.float32)
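# NOTE (editor's sketch): conceptually, `multi_stream_mlpg` splits the
# (T, D_out) feature matrix into streams (e.g. mgc/lf0/vuv/bap), runs maximum
# likelihood parameter generation (MLPG) per stream that has delta features,
# and concatenates the resulting static trajectories. A runnable
# single-stream sketch using nnmnkwii's reference implementation (stream
# splitting omitted; `windows` as in `_example_get_windows` above):
from nnmnkwii import paramgen


def _example_single_stream_mlpg(means, variances, windows):
    # means: (T, static_dim * num_windows), denormalized static+delta means
    # variances: per-dimension variances; tile to (T, D) if given as (D,)
    if variances.ndim == 1:
        variances = np.tile(variances, (means.shape[0], 1))
    # Returns the most likely (T, static_dim) static trajectory
    return paramgen.mlpg(means, variances, windows)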
def my_app(config: DictConfig) -> None:
    global logger
    logger = getLogger(config.verbose)
    logger.info(OmegaConf.to_yaml(config))

    device = torch.device("cuda" if use_cuda else "cpu")
    in_dir = to_absolute_path(config.in_dir)
    out_dir = to_absolute_path(config.out_dir)
    os.makedirs(out_dir, exist_ok=True)

    model_config = OmegaConf.load(to_absolute_path(config.model.model_yaml))
    model = hydra.utils.instantiate(model_config.netG).to(device)
    checkpoint = torch.load(
        to_absolute_path(config.model.checkpoint),
        map_location=lambda storage, loc: storage,
    )
    model.load_state_dict(checkpoint["state_dict"])

    scaler = joblib.load(to_absolute_path(config.out_scaler_path))

    in_feats = FileSourceDataset(NpyFileSource(in_dir))

    with torch.no_grad():
        for idx in tqdm(range(len(in_feats))):
            feats = torch.from_numpy(in_feats[idx]).unsqueeze(0).to(device)
            out = model(feats, [feats.shape[1]]).squeeze(0).cpu().data.numpy()
            out = scaler.inverse_transform(out)

            # Apply MLPG if necessary
            if np.any(model_config.has_dynamic_features):
                windows = get_windows(3)
                out = multi_stream_mlpg(
                    out, scaler.var_, windows,
                    model_config.stream_sizes,
                    model_config.has_dynamic_features)

            name = basename(in_feats.collected_files[idx][0])
            out_path = join(out_dir, name)
            np.save(out_path, out, allow_pickle=False)
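# NOTE (editor's sketch): `NpyFileSource` is not shown in this excerpt. Given
# how it is wrapped in nnmnkwii's FileSourceDataset above, a plausible
# implementation is a FileDataSource that collects .npy files and loads them
# as float32 arrays (the class name and details here are assumptions):
from glob import glob

from nnmnkwii.datasets import FileDataSource


class _ExampleNpyFileSource(FileDataSource):
    def __init__(self, data_root):
        self.data_root = data_root

    def collect_files(self):
        return sorted(glob(join(self.data_root, "*.npy")))

    def collect_features(self, path):
        return np.load(path).astype(np.float32)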
def predict_acoustic(
    device,
    labels,
    acoustic_model,
    acoustic_config,
    acoustic_in_scaler,
    acoustic_out_scaler,
    binary_dict,
    numeric_dict,
    subphone_features="coarse_coding",
    pitch_indices=None,
    log_f0_conditioning=True,
    force_clip_input_features=False,
):
    """Predict acoustic features from HTS labels

    MLPG is applied to the predicted features if the output features
    have dynamic features.

    Args:
        device (torch.device): device to use
        labels (HTSLabelFile): HTS labels
        acoustic_model (nn.Module): acoustic model
        acoustic_config (AcousticConfig): acoustic configuration
        acoustic_in_scaler (sklearn.preprocessing.StandardScaler): input scaler
        acoustic_out_scaler (sklearn.preprocessing.StandardScaler): output scaler
        binary_dict (dict): binary feature dictionary
        numeric_dict (dict): numeric feature dictionary
        subphone_features (str): subphone feature type
        pitch_indices (list): indices of pitch features
        log_f0_conditioning (bool): whether to use log f0 conditioning
        force_clip_input_features (bool): whether to force clip input features

    Returns:
        ndarray: predicted acoustic features
    """
    # Musical/linguistic features
    linguistic_features = fe.linguistic_features(
        labels,
        binary_dict,
        numeric_dict,
        add_frame_features=True,
        subphone_features=subphone_features,
    )

    if log_f0_conditioning:
        for idx in pitch_indices:
            linguistic_features[:, idx] = interp1d(
                _midi_to_hz(linguistic_features, idx, log_f0_conditioning),
                kind="slinear",
            )

    # Apply normalization
    linguistic_features = acoustic_in_scaler.transform(linguistic_features)
    if force_clip_input_features and isinstance(acoustic_in_scaler, MinMaxScaler):
        # Clip to feature range (except for pitch-related features)
        non_pitch_indices = [
            idx
            for idx in range(linguistic_features.shape[1])
            if idx not in pitch_indices
        ]
        linguistic_features[:, non_pitch_indices] = np.clip(
            linguistic_features[:, non_pitch_indices],
            acoustic_in_scaler.feature_range[0],
            acoustic_in_scaler.feature_range[1],
        )

    # Predict acoustic features
    x = torch.from_numpy(linguistic_features).float().to(device)
    x = x.view(1, -1, x.size(-1))

    if acoustic_model.prediction_type() == PredictionType.PROBABILISTIC:
        # (B, T, D_out)
        max_mu, max_sigma = acoustic_model.inference(x, [x.shape[1]])
        if np.any(acoustic_config.has_dynamic_features):
            # Apply denormalization
            # (B, T, D_out) -> (T, D_out)
            max_sigma_sq = (
                max_sigma.squeeze(0).cpu().data.numpy() ** 2
                * acoustic_out_scaler.var_
            )
            max_sigma_sq = np.maximum(max_sigma_sq, 1e-14)
            max_mu = acoustic_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy()
            )
            # (T, D_out) -> (T, static_dim)
            pred_acoustic = multi_stream_mlpg(
                max_mu,
                max_sigma_sq,
                get_windows(acoustic_config.num_windows),
                acoustic_config.stream_sizes,
                acoustic_config.has_dynamic_features,
            )
        else:
            # Apply denormalization
            pred_acoustic = acoustic_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy()
            )
    else:
        # (T, D_out)
        pred_acoustic = (
            acoustic_model.inference(x, [x.shape[1]]).squeeze(0).cpu().data.numpy()
        )
        # Apply denormalization
        pred_acoustic = acoustic_out_scaler.inverse_transform(pred_acoustic)
        if np.any(acoustic_config.has_dynamic_features):
            # (T, D_out) -> (T, static_dim)
            pred_acoustic = multi_stream_mlpg(
                pred_acoustic,
                acoustic_out_scaler.var_,
                get_windows(acoustic_config.num_windows),
                acoustic_config.stream_sizes,
                acoustic_config.has_dynamic_features,
            )

    return pred_acoustic
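# NOTE (editor's sketch): `_midi_to_hz` is referenced but not defined in this
# excerpt. Based on how it is called (feature matrix, pitch-feature column
# index, and a flag selecting log-scale output), a plausible implementation
# uses the standard MIDI-to-Hz formula while keeping unvoiced (0) frames at
# zero so that `interp1d` can fill them afterwards:
def _example_midi_to_hz(features, idx, log_scale=False):
    midi = features[:, idx]
    f0 = np.zeros_like(midi)
    voiced = midi > 0
    f0[voiced] = 440.0 * 2.0 ** ((midi[voiced] - 69.0) / 12.0)  # A4 = 440 Hz
    if log_scale:
        f0[voiced] = np.log(f0[voiced])
    return f0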
def predict_timelag(
    device,
    labels,
    timelag_model,
    timelag_config,
    timelag_in_scaler,
    timelag_out_scaler,
    binary_dict,
    numeric_dict,
    pitch_indices=None,
    log_f0_conditioning=True,
    allowed_range=None,
    allowed_range_rest=None,
    force_clip_input_features=False,
):
    """Predict time-lag from HTS labels

    Args:
        device (torch.device): device
        labels (nnmnkwii.io.hts.HTSLabelFile): HTS-style labels
        timelag_model (nn.Module): time-lag model
        timelag_config (dict): time-lag model config
        timelag_in_scaler (sklearn.preprocessing.MinMaxScaler): input scaler
        timelag_out_scaler (sklearn.preprocessing.MinMaxScaler): output scaler
        binary_dict (dict): binary feature dict
        numeric_dict (dict): numeric feature dict
        pitch_indices (list): indices of pitch features
        log_f0_conditioning (bool): whether to condition on log f0
        allowed_range (list): allowed range of time-lag
        allowed_range_rest (list): allowed range of time-lag for rest
        force_clip_input_features (bool): whether to clip input features

    Returns:
        ndarray: time-lag predictions
    """
    if allowed_range is None:
        allowed_range = [-20, 20]
    if allowed_range_rest is None:
        allowed_range_rest = [-40, 40]

    # Round start/end times just in case
    labels.round_()

    # Extract note-level labels
    note_indices = get_note_indices(labels)
    note_labels = labels[note_indices]

    # Extract musical/linguistic context
    timelag_linguistic_features = fe.linguistic_features(
        note_labels,
        binary_dict,
        numeric_dict,
        add_frame_features=False,
        subphone_features=None,
    ).astype(np.float32)

    # Adjust input features if we use log-f0 conditioning
    if log_f0_conditioning:
        if pitch_indices is None:
            raise ValueError("Pitch feature indices must be specified!")
        for idx in pitch_indices:
            timelag_linguistic_features[:, idx] = interp1d(
                _midi_to_hz(timelag_linguistic_features, idx, log_f0_conditioning),
                kind="slinear",
            )

    # Normalization
    timelag_linguistic_features = timelag_in_scaler.transform(
        timelag_linguistic_features
    )
    if force_clip_input_features and isinstance(timelag_in_scaler, MinMaxScaler):
        # Clip to feature range (except for pitch-related features)
        non_pitch_indices = [
            idx
            for idx in range(timelag_linguistic_features.shape[1])
            if idx not in pitch_indices
        ]
        timelag_linguistic_features[:, non_pitch_indices] = np.clip(
            timelag_linguistic_features[:, non_pitch_indices],
            timelag_in_scaler.feature_range[0],
            timelag_in_scaler.feature_range[1],
        )

    # Run model
    x = torch.from_numpy(timelag_linguistic_features).unsqueeze(0).to(device)

    if timelag_model.prediction_type() == PredictionType.PROBABILISTIC:
        # (B, T, D_out)
        max_mu, max_sigma = timelag_model.inference(x, [x.shape[1]])
        if np.any(timelag_config.has_dynamic_features):
            # Apply denormalization
            # (B, T, D_out) -> (T, D_out)
            max_sigma_sq = (
                max_sigma.squeeze(0).cpu().data.numpy() ** 2
                * timelag_out_scaler.var_
            )
            max_sigma_sq = np.maximum(max_sigma_sq, 1e-14)
            max_mu = timelag_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy()
            )
            # (T, D_out) -> (T, static_dim)
            pred_timelag = multi_stream_mlpg(
                max_mu,
                max_sigma_sq,
                get_windows(timelag_config.num_windows),
                timelag_config.stream_sizes,
                timelag_config.has_dynamic_features,
            )
        else:
            # Apply denormalization
            pred_timelag = timelag_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy()
            )
    else:
        # (T, D_out)
        pred_timelag = (
            timelag_model.inference(x, [x.shape[1]]).squeeze(0).cpu().data.numpy()
        )
        # Apply denormalization
        pred_timelag = timelag_out_scaler.inverse_transform(pred_timelag)
        if np.any(timelag_config.has_dynamic_features):
            # (T, D_out) -> (T, static_dim)
            pred_timelag = multi_stream_mlpg(
                pred_timelag,
                timelag_out_scaler.var_,
                get_windows(timelag_config.num_windows),
                timelag_config.stream_sizes,
                timelag_config.has_dynamic_features,
            )

    # Rounding
    pred_timelag = np.round(pred_timelag)

    # Clip to the allowed range
    for idx in range(len(pred_timelag)):
        if _is_silence(note_labels.contexts[idx]):
            pred_timelag[idx] = np.clip(
                pred_timelag[idx], allowed_range_rest[0], allowed_range_rest[1]
            )
        else:
            pred_timelag[idx] = np.clip(
                pred_timelag[idx], allowed_range[0], allowed_range[1]
            )

    # Convert frames to 100 ns units (one 5 ms frame = 50,000 * 100 ns)
    pred_timelag *= 50000

    return pred_timelag
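# NOTE (editor's sketch): `_is_silence` is not shown in this excerpt. Since it
# is applied to full-context label strings to decide whether a note is a rest
# (and hence gets the wider time-lag range), a plausible check looks for
# silence/pause phonemes in the context (the exact phoneme set is an
# assumption):
def _example_is_silence(context):
    return "-sil" in context or "-pau" in context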
def predict_duration(
    device,
    labels,
    duration_model,
    duration_config,
    duration_in_scaler,
    duration_out_scaler,
    binary_dict,
    numeric_dict,
    pitch_indices=None,
    log_f0_conditioning=True,
    force_clip_input_features=False,
):
    """Predict phoneme durations from HTS labels

    Args:
        device (torch.device): device to run the model on
        labels (nnmnkwii.io.hts.HTSLabelFile): labels
        duration_model (nn.Module): duration model
        duration_config (dict): duration config
        duration_in_scaler (sklearn.preprocessing.MinMaxScaler): duration input scaler
        duration_out_scaler (sklearn.preprocessing.MinMaxScaler): duration output scaler
        binary_dict (dict): binary feature dictionary
        numeric_dict (dict): numeric feature dictionary
        pitch_indices (list): indices of pitch features
        log_f0_conditioning (bool): whether to use log-f0 conditioning
        force_clip_input_features (bool): whether to clip input features

    Returns:
        np.ndarray: predicted durations, or a tuple of (mu, sigma^2) when the
            duration model is probabilistic
    """
    # Extract musical/linguistic features
    duration_linguistic_features = fe.linguistic_features(
        labels,
        binary_dict,
        numeric_dict,
        add_frame_features=False,
        subphone_features=None,
    ).astype(np.float32)

    if log_f0_conditioning:
        for idx in pitch_indices:
            duration_linguistic_features[:, idx] = interp1d(
                _midi_to_hz(duration_linguistic_features, idx, log_f0_conditioning),
                kind="slinear",
            )

    # Apply normalization
    duration_linguistic_features = duration_in_scaler.transform(
        duration_linguistic_features
    )
    if force_clip_input_features and isinstance(duration_in_scaler, MinMaxScaler):
        # Clip to feature range (except for pitch-related features)
        non_pitch_indices = [
            idx
            for idx in range(duration_linguistic_features.shape[1])
            if idx not in pitch_indices
        ]
        duration_linguistic_features[:, non_pitch_indices] = np.clip(
            duration_linguistic_features[:, non_pitch_indices],
            duration_in_scaler.feature_range[0],
            duration_in_scaler.feature_range[1],
        )

    # Apply model
    x = torch.from_numpy(duration_linguistic_features).float().to(device)
    x = x.view(1, -1, x.size(-1))

    if duration_model.prediction_type() == PredictionType.PROBABILISTIC:
        # (B, T, D_out)
        max_mu, max_sigma = duration_model.inference(x, [x.shape[1]])
        if np.any(duration_config.has_dynamic_features):
            raise RuntimeError(
                "Dynamic features are not supported for duration modeling"
            )
        # Apply denormalization
        max_sigma_sq = (
            max_sigma.squeeze(0).cpu().data.numpy() ** 2 * duration_out_scaler.var_
        )
        max_sigma_sq = np.maximum(max_sigma_sq, 1e-14)
        max_mu = duration_out_scaler.inverse_transform(
            max_mu.squeeze(0).cpu().data.numpy()
        )
        return max_mu, max_sigma_sq
    else:
        # (T, D_out)
        pred_durations = (
            duration_model.inference(x, [x.shape[1]]).squeeze(0).cpu().data.numpy()
        )
        # Apply denormalization
        pred_durations = duration_out_scaler.inverse_transform(pred_durations)
        if np.any(duration_config.has_dynamic_features):
            # (T, D_out) -> (T, static_dim)
            pred_durations = multi_stream_mlpg(
                pred_durations,
                duration_out_scaler.var_,
                get_windows(duration_config.num_windows),
                duration_config.stream_sizes,
                duration_config.has_dynamic_features,
            )

    pred_durations[pred_durations <= 0] = 1
    pred_durations = np.round(pred_durations)

    return pred_durations
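# NOTE (editor's sketch): the `interp1d` used throughout these functions is
# nnmnkwii's helper that linearly interpolates an F0-like contour across
# unvoiced (zero) frames, so models never see hard zeros in pitch inputs.
# A tiny usage example (assuming `from nnmnkwii.preprocessing import interp1d`):
def _example_interp1d_usage():
    lf0 = np.array([5.0, 0.0, 0.0, 5.3], dtype=np.float64)[:, None]
    # The two interior zeros are replaced by values interpolated
    # between 5.0 and 5.3
    return interp1d(lf0, kind="slinear")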
def predict_timelag(device, labels, timelag_model, timelag_config,
                    timelag_in_scaler, timelag_out_scaler, binary_dict,
                    continuous_dict, pitch_indices=None,
                    log_f0_conditioning=True,
                    allowed_range=[-20, 20],
                    allowed_range_rest=[-40, 40]):
    # Round start/end times just in case
    labels.round_()

    # Extract note-level labels
    note_indices = get_note_indices(labels)
    note_labels = labels[note_indices]

    # Extract musical/linguistic context
    timelag_linguistic_features = fe.linguistic_features(
        note_labels, binary_dict, continuous_dict,
        add_frame_features=False,
        subphone_features=None).astype(np.float32)

    # Adjust input features if we use log-f0 conditioning
    if log_f0_conditioning:
        if pitch_indices is None:
            raise ValueError("Pitch feature indices must be specified!")
        for idx in pitch_indices:
            timelag_linguistic_features[:, idx] = interp1d(
                _midi_to_hz(timelag_linguistic_features, idx,
                            log_f0_conditioning),
                kind="slinear")

    # Normalization
    timelag_linguistic_features = timelag_in_scaler.transform(
        timelag_linguistic_features)
    if isinstance(timelag_in_scaler, MinMaxScaler):
        # Clip to feature range
        timelag_linguistic_features = np.clip(
            timelag_linguistic_features,
            timelag_in_scaler.feature_range[0],
            timelag_in_scaler.feature_range[1])

    # Run model
    x = torch.from_numpy(timelag_linguistic_features).unsqueeze(0).to(device)

    if timelag_model.prediction_type() == PredictionType.PROBABILISTIC:
        # (B, T, D_out)
        log_pi, log_sigma, mu = timelag_model.inference(x, [x.shape[1]])
        if np.any(timelag_config.has_dynamic_features):
            max_sigma, max_mu = mdn_get_most_probable_sigma_and_mu(
                log_pi, log_sigma, mu)
            # Apply denormalization
            # (B, T, D_out) -> (T, D_out)
            max_sigma_sq = (max_sigma.squeeze(0).cpu().data.numpy() ** 2
                            * timelag_out_scaler.var_)
            max_mu = timelag_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy())
            # (T, D_out) -> (T, static_dim)
            pred_timelag = multi_stream_mlpg(
                max_mu, max_sigma_sq,
                get_windows(timelag_config.num_windows),
                timelag_config.stream_sizes,
                timelag_config.has_dynamic_features)
        else:
            _, max_mu = mdn_get_most_probable_sigma_and_mu(
                log_pi, log_sigma, mu)
            # Apply denormalization
            pred_timelag = timelag_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy())
    else:
        # (T, D_out)
        pred_timelag = timelag_model.inference(
            x, [x.shape[1]]).squeeze(0).cpu().data.numpy()
        # Apply denormalization
        pred_timelag = timelag_out_scaler.inverse_transform(pred_timelag)
        if np.any(timelag_config.has_dynamic_features):
            # (T, D_out) -> (T, static_dim)
            pred_timelag = multi_stream_mlpg(
                pred_timelag, timelag_out_scaler.var_,
                get_windows(timelag_config.num_windows),
                timelag_config.stream_sizes,
                timelag_config.has_dynamic_features)

    # Rounding
    pred_timelag = np.round(pred_timelag)

    # Clip to the allowed range
    for idx in range(len(pred_timelag)):
        if _is_silence(note_labels.contexts[idx]):
            pred_timelag[idx] = np.clip(pred_timelag[idx],
                                        allowed_range_rest[0],
                                        allowed_range_rest[1])
        else:
            pred_timelag[idx] = np.clip(pred_timelag[idx],
                                        allowed_range[0],
                                        allowed_range[1])

    # Convert frames to 100 ns units (one 5 ms frame = 50,000 * 100 ns)
    pred_timelag *= 50000

    return pred_timelag
def predict_acoustic(device, labels, acoustic_model, acoustic_config,
                     acoustic_in_scaler, acoustic_out_scaler, binary_dict,
                     continuous_dict, subphone_features="coarse_coding",
                     pitch_indices=None, log_f0_conditioning=True):
    # Musical/linguistic features
    linguistic_features = fe.linguistic_features(
        labels, binary_dict, continuous_dict,
        add_frame_features=True,
        subphone_features=subphone_features)

    if log_f0_conditioning:
        for idx in pitch_indices:
            linguistic_features[:, idx] = interp1d(
                _midi_to_hz(linguistic_features, idx, log_f0_conditioning),
                kind="slinear")

    # Apply normalization
    linguistic_features = acoustic_in_scaler.transform(linguistic_features)
    if isinstance(acoustic_in_scaler, MinMaxScaler):
        # Clip to feature range
        linguistic_features = np.clip(linguistic_features,
                                      acoustic_in_scaler.feature_range[0],
                                      acoustic_in_scaler.feature_range[1])

    # Predict acoustic features
    x = torch.from_numpy(linguistic_features).float().to(device)
    x = x.view(1, -1, x.size(-1))

    if acoustic_model.prediction_type() == PredictionType.PROBABILISTIC:
        log_pi, log_sigma, mu = acoustic_model.inference(x, [x.shape[1]])
        if np.any(acoustic_config.has_dynamic_features):
            # (B, T, D_out)
            max_sigma, max_mu = mdn_get_most_probable_sigma_and_mu(
                log_pi, log_sigma, mu)
            # Apply denormalization
            # (B, T, D_out) -> (T, D_out)
            max_sigma_sq = (max_sigma.squeeze(0).cpu().data.numpy() ** 2
                            * acoustic_out_scaler.var_)
            max_mu = acoustic_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy())
            # (T, D_out) -> (T, static_dim)
            pred_acoustic = multi_stream_mlpg(
                max_mu, max_sigma_sq,
                get_windows(acoustic_config.num_windows),
                acoustic_config.stream_sizes,
                acoustic_config.has_dynamic_features)
        else:
            _, max_mu = mdn_get_most_probable_sigma_and_mu(
                log_pi, log_sigma, mu)
            # Apply denormalization
            pred_acoustic = acoustic_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy())
    else:
        # (T, D_out)
        pred_acoustic = acoustic_model.inference(
            x, [x.shape[1]]).squeeze(0).cpu().data.numpy()
        # Apply denormalization
        pred_acoustic = acoustic_out_scaler.inverse_transform(pred_acoustic)
        if np.any(acoustic_config.has_dynamic_features):
            # (T, D_out) -> (T, static_dim)
            pred_acoustic = multi_stream_mlpg(
                pred_acoustic, acoustic_out_scaler.var_,
                get_windows(acoustic_config.num_windows),
                acoustic_config.stream_sizes,
                acoustic_config.has_dynamic_features)

    return pred_acoustic
def predict_duration(device, labels, duration_model, duration_config,
                     duration_in_scaler, duration_out_scaler, lag,
                     binary_dict, continuous_dict, pitch_indices=None,
                     log_f0_conditioning=True):
    # NOTE: `lag` is accepted for interface compatibility but unused here
    # Extract musical/linguistic features
    duration_linguistic_features = fe.linguistic_features(
        labels, binary_dict, continuous_dict,
        add_frame_features=False,
        subphone_features=None).astype(np.float32)

    if log_f0_conditioning:
        for idx in pitch_indices:
            duration_linguistic_features[:, idx] = interp1d(
                _midi_to_hz(duration_linguistic_features, idx,
                            log_f0_conditioning),
                kind="slinear")

    # Apply normalization
    duration_linguistic_features = duration_in_scaler.transform(
        duration_linguistic_features)
    if isinstance(duration_in_scaler, MinMaxScaler):
        # Clip to feature range
        duration_linguistic_features = np.clip(
            duration_linguistic_features,
            duration_in_scaler.feature_range[0],
            duration_in_scaler.feature_range[1])

    # Apply model
    x = torch.from_numpy(duration_linguistic_features).float().to(device)
    x = x.view(1, -1, x.size(-1))

    if duration_model.prediction_type() == PredictionType.PROBABILISTIC:
        # (B, T, D_out)
        log_pi, log_sigma, mu = duration_model.inference(x, [x.shape[1]])
        if np.any(duration_config.has_dynamic_features):
            max_sigma, max_mu = mdn_get_most_probable_sigma_and_mu(
                log_pi, log_sigma, mu)
            # Apply denormalization
            # (B, T, D_out) -> (T, D_out)
            max_sigma_sq = (max_sigma.squeeze(0).cpu().data.numpy() ** 2
                            * duration_out_scaler.var_)
            max_mu = duration_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy())
            # (T, D_out) -> (T, static_dim)
            pred_durations = multi_stream_mlpg(
                max_mu, max_sigma_sq,
                get_windows(duration_config.num_windows),
                duration_config.stream_sizes,
                duration_config.has_dynamic_features)
        else:
            _, max_mu = mdn_get_most_probable_sigma_and_mu(
                log_pi, log_sigma, mu)
            # Apply denormalization
            pred_durations = duration_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy())
    else:
        # (T, D_out)
        pred_durations = duration_model.inference(
            x, [x.shape[1]]).squeeze(0).cpu().data.numpy()
        # Apply denormalization
        pred_durations = duration_out_scaler.inverse_transform(pred_durations)
        if np.any(duration_config.has_dynamic_features):
            # (T, D_out) -> (T, static_dim)
            pred_durations = multi_stream_mlpg(
                pred_durations, duration_out_scaler.var_,
                get_windows(duration_config.num_windows),
                duration_config.stream_sizes,
                duration_config.has_dynamic_features)

    pred_durations[pred_durations <= 0] = 1
    pred_durations = np.round(pred_durations)

    return pred_durations
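# NOTE (editor's sketch): `mdn_get_most_probable_sigma_and_mu` picks, per
# frame, the mixture component with the largest weight and returns its
# (sigma, mu). A simplified version of the idea; the shapes are assumptions
# (log_pi: (B, T, G), log_sigma and mu: (B, T, G, D_out)):
def _example_most_probable_sigma_and_mu(log_pi, log_sigma, mu):
    B, T, _, D = mu.shape
    # Index of the most probable mixture component per frame: (B, T)
    max_idx = torch.argmax(log_pi, dim=2)
    idx = max_idx.unsqueeze(-1).unsqueeze(-1).expand(B, T, 1, D)
    max_mu = torch.gather(mu, 2, idx).squeeze(2)  # (B, T, D)
    max_sigma = torch.gather(torch.exp(log_sigma), 2, idx).squeeze(2)  # (B, T, D)
    return max_sigma, max_mu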
def my_app(config: DictConfig) -> None:
    global logger
    logger = getLogger(config.verbose)
    logger.info(OmegaConf.to_yaml(config))

    device = torch.device("cuda" if use_cuda else "cpu")
    in_dir = to_absolute_path(config.in_dir)
    out_dir = to_absolute_path(config.out_dir)
    os.makedirs(out_dir, exist_ok=True)

    model_config = OmegaConf.load(to_absolute_path(config.model.model_yaml))
    model = hydra.utils.instantiate(model_config.netG).to(device)
    checkpoint = torch.load(
        to_absolute_path(config.model.checkpoint),
        map_location=lambda storage, loc: storage,
    )
    model.load_state_dict(checkpoint["state_dict"])

    scaler = joblib.load(to_absolute_path(config.out_scaler_path))

    in_feats = FileSourceDataset(NpyFileSource(in_dir))

    with torch.no_grad():
        for idx in tqdm(range(len(in_feats))):
            feats = torch.from_numpy(in_feats[idx]).unsqueeze(0).to(device)
            if model.prediction_type() == PredictionType.PROBABILISTIC:
                max_mu, max_sigma = model.inference(feats, [feats.shape[1]])
                if np.any(model_config.has_dynamic_features):
                    # Apply denormalization
                    # (B, T, D_out) -> (T, D_out)
                    max_sigma_sq = (max_sigma.squeeze(0).cpu().data.numpy() ** 2
                                    * scaler.var_)
                    max_mu = scaler.inverse_transform(
                        max_mu.squeeze(0).cpu().data.numpy())

                    # Apply MLPG
                    # (T, D_out) -> (T, static_dim)
                    out = multi_stream_mlpg(
                        max_mu, max_sigma_sq,
                        get_windows(model_config.num_windows),
                        model_config.stream_sizes,
                        model_config.has_dynamic_features)
                else:
                    # (T, D_out)
                    out = max_mu.squeeze(0).cpu().data.numpy()
                    out = scaler.inverse_transform(out)
            else:
                out = model.inference(
                    feats, [feats.shape[1]]).squeeze(0).cpu().data.numpy()
                out = scaler.inverse_transform(out)

                # Apply MLPG if necessary
                if np.any(model_config.has_dynamic_features):
                    out = multi_stream_mlpg(
                        out, scaler.var_,
                        get_windows(model_config.num_windows),
                        model_config.stream_sizes,
                        model_config.has_dynamic_features)

            name = basename(in_feats.collected_files[idx][0])
            out_path = join(out_dir, name)
            np.save(out_path, out, allow_pickle=False)
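# NOTE (editor's sketch): both versions of `my_app` reference a module-level
# `use_cuda` that is not defined in this excerpt; the conventional definition
# would be:
use_cuda = torch.cuda.is_available()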
def eval_spss_model(
    step,
    netG,
    in_feats,
    out_feats,
    lengths,
    model_config,
    out_scaler,
    writer,
    sr,
    trajectory_smoothing=True,
    trajectory_smoothing_cutoff=50,
):
    # Make sure to be in eval mode
    netG.eval()
    is_autoregressive = (
        netG.module.is_autoregressive()
        if isinstance(netG, nn.DataParallel)
        else netG.is_autoregressive()
    )
    prediction_type = (
        netG.module.prediction_type()
        if isinstance(netG, nn.DataParallel)
        else netG.prediction_type()
    )
    utt_indices = [-1, -2, -3]
    utt_indices = utt_indices[: min(3, len(in_feats))]

    if np.any(model_config.has_dynamic_features):
        static_stream_sizes = get_static_stream_sizes(
            model_config.stream_sizes,
            model_config.has_dynamic_features,
            model_config.num_windows,
        )
    else:
        static_stream_sizes = model_config.stream_sizes

    for utt_idx in utt_indices:
        out_feats_denorm_ = out_scaler.inverse_transform(
            out_feats[utt_idx, : lengths[utt_idx]].unsqueeze(0)
        )
        mgc, lf0, vuv, bap = get_static_features(
            out_feats_denorm_,
            model_config.num_windows,
            model_config.stream_sizes,
            model_config.has_dynamic_features,
        )[:4]
        mgc = mgc.squeeze(0).cpu().numpy()
        lf0 = lf0.squeeze(0).cpu().numpy()
        vuv = vuv.squeeze(0).cpu().numpy()
        bap = bap.squeeze(0).cpu().numpy()

        f0, spectrogram, aperiodicity = gen_world_params(mgc, lf0, vuv, bap, sr)
        wav = pyworld.synthesize(f0, spectrogram, aperiodicity, sr, 5)
        group = f"utt{np.abs(utt_idx)}_reference"
        wav = wav / np.abs(wav).max() if np.abs(wav).max() > 1.0 else wav
        writer.add_audio(group, wav, step, sr)

        # Run forward
        if is_autoregressive:
            outs = netG(
                in_feats[utt_idx, : lengths[utt_idx]].unsqueeze(0),
                [lengths[utt_idx]],
                out_feats[utt_idx, : lengths[utt_idx]].unsqueeze(0),
            )
        else:
            outs = netG(
                in_feats[utt_idx, : lengths[utt_idx]].unsqueeze(0),
                [lengths[utt_idx]],
            )

        # ResF0 case
        if isinstance(outs, tuple) and len(outs) == 2:
            outs, _ = outs

        if prediction_type == PredictionType.PROBABILISTIC:
            pi, sigma, mu = outs
            pred_out_feats = mdn_get_most_probable_sigma_and_mu(pi, sigma, mu)[1]
        else:
            pred_out_feats = outs

        # NOTE: multiple outputs
        if isinstance(pred_out_feats, list):
            pred_out_feats = pred_out_feats[-1]
        if isinstance(pred_out_feats, tuple):
            pred_out_feats = pred_out_feats[0]
        if not isinstance(pred_out_feats, list):
            pred_out_feats = [pred_out_feats]

        # Run inference
        if prediction_type == PredictionType.PROBABILISTIC:
            inference_out_feats, _ = netG.inference(
                in_feats[utt_idx, : lengths[utt_idx]].unsqueeze(0),
                [lengths[utt_idx]],
            )
        else:
            inference_out_feats = netG.inference(
                in_feats[utt_idx, : lengths[utt_idx]].unsqueeze(0),
                [lengths[utt_idx]],
            )
        pred_out_feats.append(inference_out_feats)

        # Plot normalized input/output
        in_feats_ = in_feats[utt_idx, : lengths[utt_idx]].cpu().numpy()
        out_feats_ = out_feats[utt_idx, : lengths[utt_idx]].cpu().numpy()
        fig, ax = plt.subplots(3, 1, figsize=(8, 8))
        ax[0].set_title("Reference features")
        ax[1].set_title("Input features")
        ax[2].set_title("Predicted features")
        mesh = librosa.display.specshow(
            out_feats_.T, x_axis="frames", y_axis="frames", ax=ax[0], cmap="viridis"
        )
        # NOTE: assuming features are normalized to N(0, 1)
        mesh.set_clim(-4, 4)
        fig.colorbar(mesh, ax=ax[0])
        mesh = librosa.display.specshow(
            in_feats_.T, x_axis="frames", y_axis="frames", ax=ax[1], cmap="viridis"
        )
        mesh.set_clim(-4, 4)
        fig.colorbar(mesh, ax=ax[1])
        mesh = librosa.display.specshow(
            inference_out_feats.squeeze(0).cpu().numpy().T,
            x_axis="frames",
            y_axis="frames",
            ax=ax[2],
            cmap="viridis",
        )
        mesh.set_clim(-4, 4)
        fig.colorbar(mesh, ax=ax[2])
        for ax_ in ax:
            ax_.set_ylabel("Feature")
        plt.tight_layout()
        group = f"utt{np.abs(utt_idx)}_inference"
        writer.add_figure(f"{group}/Input-Output", fig, step)
        plt.close()

        assert len(pred_out_feats) == 2
        for idx, pred_out_feats_ in enumerate(pred_out_feats):
            pred_out_feats_ = pred_out_feats_.squeeze(0).cpu().numpy()
            pred_out_feats_denorm = (
                out_scaler.inverse_transform(
                    torch.from_numpy(pred_out_feats_).to(in_feats.device)
                )
                .cpu()
                .numpy()
            )
            if np.any(model_config.has_dynamic_features):
                # (T, D_out) -> (T, static_dim)
                pred_out_feats_denorm = multi_stream_mlpg(
                    pred_out_feats_denorm,
                    (out_scaler.scale_ ** 2).cpu().numpy(),
                    get_windows(model_config.num_windows),
                    model_config.stream_sizes,
                    model_config.has_dynamic_features,
                )
            pred_mgc, pred_lf0, pred_vuv, pred_bap = split_streams(
                pred_out_feats_denorm, static_stream_sizes
            )[:4]

            # Remove high-frequency components of mgc/bap
            # NOTE: this seems effective at suppressing artifacts of
            # GAN-based post-filtering
            if trajectory_smoothing:
                modfs = int(1 / 0.005)
                for d in range(pred_mgc.shape[1]):
                    pred_mgc[:, d] = lowpass_filter(
                        pred_mgc[:, d], modfs, cutoff=trajectory_smoothing_cutoff
                    )
                for d in range(pred_bap.shape[1]):
                    pred_bap[:, d] = lowpass_filter(
                        pred_bap[:, d], modfs, cutoff=trajectory_smoothing_cutoff
                    )

            # Generated sample
            f0, spectrogram, aperiodicity = gen_world_params(
                pred_mgc, pred_lf0, pred_vuv, pred_bap, sr
            )
            wav = pyworld.synthesize(f0, spectrogram, aperiodicity, sr, 5)
            wav = wav / np.abs(wav).max() if np.abs(wav).max() > 1.0 else wav
            if idx == 1:
                group = f"utt{np.abs(utt_idx)}_inference"
            else:
                group = f"utt{np.abs(utt_idx)}_forward"
            writer.add_audio(group, wav, step, sr)
            plot_spsvs_params(
                step,
                writer,
                mgc,
                lf0,
                vuv,
                bap,
                pred_mgc,
                pred_lf0,
                pred_vuv,
                pred_bap,
                group=group,
                sr=sr,
            )
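# NOTE (editor's sketch): `lowpass_filter` above smooths each feature
# trajectory along the time axis (at a 5 ms frame shift, the trajectory
# "sampling rate" is modfs = 200 Hz). A plausible zero-phase Butterworth
# implementation with scipy; the filter order and exact behavior are
# assumptions:
from scipy.signal import butter, filtfilt


def _example_lowpass_filter(x, fs, cutoff=50, order=5):
    nyquist = 0.5 * fs
    b, a = butter(order, cutoff / nyquist, btype="low")
    # filtfilt applies the filter forward and backward for zero phase shift
    return filtfilt(b, a, x)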