def test_real_metrics():
    """Smoke-test the metric functions on the example acoustic-model data.

    Every continuous stream is shifted by 0.01 to build a "target", so the
    mel-cepstral distortions and the lf0 MSE must be strictly positive, while
    the unchanged V/UV flags must yield a V/UV error of exactly zero.
    """
    _, source = example_file_data_sources_for_acoustic_model()
    X = FileSourceDataset(source)
    # Per-utterance lengths must be collected before converting to an array.
    lengths = [len(x) for x in X]
    X = X.asarray()

    # Slice the individual streams out of the last axis.
    # NOTE(review): `mgc_dim // 3` presumably keeps only the static part of a
    # static + delta + delta-delta layout -- confirm against the data source.
    mgc = X[:, :, :source.mgc_dim // 3]
    lf0 = X[:, :, source.lf0_start_idx]
    # `np.int` was removed in NumPy 1.24; the builtin `int` is the same dtype
    # the alias used to resolve to.
    vuv = (X[:, :, source.vuv_start_idx] > 0).astype(int)
    bap = X[:, :, source.bap_start_idx]

    # Targets: a small constant offset on continuous streams, identical V/UV.
    mgc_tgt = mgc + 0.01
    lf0_tgt = lf0 + 0.01
    vuv_tgt = vuv.copy()
    bap_tgt = bap + 0.01

    mcd = metrics.melcd(mgc, mgc_tgt, lengths)
    bap_mcd = metrics.melcd(bap, bap_tgt, lengths)
    lf0_mse = metrics.lf0_mean_squared_error(lf0, vuv, lf0_tgt, vuv_tgt, lengths)
    vuv_err = metrics.vuv_error(vuv, vuv_tgt)

    assert mcd > 0
    assert bap_mcd > 0
    assert lf0_mse > 0
    assert vuv_err == 0.0
def compute_distortions(y_static, y_hat_static, Y_data_mean, Y_data_std, lengths=None):
    """Compute objective distortion measures for the model type in ``hp.name``.

    Args:
        y_static: reference static features (still normalized).
        y_hat_static: predicted static features (still normalized).
        Y_data_mean: mean used to undo the normalization.
        Y_data_std: standard deviation used to undo the normalization.
        lengths: optional per-sequence lengths forwarded to the metrics.

    Returns:
        dict: metric name -> value. The keys depend on ``hp.name``:
            * ``"acoustic"``: ``mcd``, ``bap_mcd``, ``f0_rmse``, ``vuv_err``
            * ``"duration"``: ``dur_rmse``
            * ``"vc"``: ``mcd``

    Raises:
        AssertionError: if ``hp.name`` is none of the supported model types.
    """
    if hp.name == "acoustic":
        mgc, lf0, vuv, bap = split_streams(y_static, Y_data_mean, Y_data_std)
        mgc_hat, lf0_hat, vuv_hat, bap_hat = split_streams(
            y_hat_static, Y_data_mean, Y_data_std)
        try:
            f0_mse = metrics.lf0_mean_squared_error(
                lf0, vuv, lf0_hat, vuv_hat,
                lengths=lengths, linear_domain=True)
        except ZeroDivisionError:
            # Best effort: report NaN instead of failing the whole evaluation.
            # NOTE(review): presumably raised when no frame is voiced in both
            # reference and prediction -- confirm against the metrics module.
            f0_mse = np.nan
        distortions = {
            # Index 0 of the last axis is excluded from the MCD.
            # NOTE(review): presumably the 0th (energy) coefficient -- confirm.
            "mcd": metrics.melcd(mgc[:, :, 1:], mgc_hat[:, :, 1:],
                                 lengths=lengths),
            # The 1/10 scaling is kept from the original; its rationale is
            # not visible in this block.
            "bap_mcd": metrics.melcd(bap, bap_hat, lengths=lengths) / 10.0,
            "f0_rmse": np.sqrt(f0_mse),
            "vuv_err": metrics.vuv_error(vuv, vuv_hat, lengths=lengths),
        }
    elif hp.name == "duration":
        y_static_invscale = P.inv_scale(y_static, Y_data_mean, Y_data_std)
        y_hat_static_invscale = P.inv_scale(
            y_hat_static, Y_data_mean, Y_data_std)
        distortions = {
            "dur_rmse": math.sqrt(
                metrics.mean_squared_error(
                    y_static_invscale, y_hat_static_invscale,
                    lengths=lengths)),
        }
    elif hp.name == "vc":
        # Only the first `hp.order` statistics are used for the static part.
        static_dim = hp.order
        y_static_invscale = P.inv_scale(
            y_static, Y_data_mean[:static_dim], Y_data_std[:static_dim])
        y_hat_static_invscale = P.inv_scale(
            y_hat_static, Y_data_mean[:static_dim], Y_data_std[:static_dim])
        distortions = {
            "mcd": metrics.melcd(
                y_static_invscale, y_hat_static_invscale, lengths=lengths),
        }
    else:
        # Raise explicitly: a bare `assert False` is stripped under
        # `python -O`, which would let execution fall through to the return
        # below and fail with an unrelated UnboundLocalError.
        raise AssertionError(
            "unsupported hp.name: {!r}".format(hp.name))
    return distortions
def test_f0_mse():
    """Check ``metrics.lf0_mean_squared_error`` on single and batched inputs.

    Covers three properties:
      * identical contours give exactly zero error, in both domains;
      * passing explicit full lengths matches omitting ``lengths``;
      * independent random contours give a strictly positive error,
        for both NumPy arrays and torch tensors.
    """
    # Seed both RNGs so the NumPy and torch fixtures are reproducible.
    np.random.seed(1234)
    torch.manual_seed(1234)
    T = 100
    f = metrics.lf0_mean_squared_error

    # Single utterance: target is a copy of the source, masks differ only at
    # the two leading / two trailing frames.
    x = np.random.rand(T, 1)
    y = x.copy()
    x_vuv = np.hstack((np.zeros(2), np.ones(T - 2)))
    y_vuv = np.hstack((np.ones(T - 2), np.zeros(2)))
    # The original asserted the default-domain case twice; exercise both
    # domains instead of repeating the identical call.
    for linear_domain in (False, True):
        assert f(x, x_vuv, y, y_vuv, linear_domain=linear_domain) == 0

    # batch
    x1 = np.random.rand(32, T, 1)
    y1 = np.random.rand(32, T, 1)
    x1_vuv = np.tile(x_vuv, (32, 1))
    # NOTE(review): the target mask tiles `x_vuv`, not `y_vuv`; kept as in
    # the original -- confirm this is intended.
    y1_vuv = np.tile(x_vuv, (32, 1))

    x2 = torch.rand(32, T, 1)
    y2 = torch.rand(32, T, 1)
    x2_vuv = torch.from_numpy(x1_vuv).clone()
    y2_vuv = torch.from_numpy(y1_vuv).clone()

    cases = [(x1, x1_vuv, y1, y1_vuv), (x2, x2_vuv, y2, y2_vuv)]
    for linear_domain in [True, False]:
        # Distinct loop names so the single-utterance fixtures above are not
        # shadowed.
        for src, src_vuv, tgt, tgt_vuv in cases:
            lengths = [src.shape[1]] * len(src)
            with_lengths = f(src, src_vuv, tgt, tgt_vuv, lengths,
                             linear_domain=linear_domain)
            without_lengths = f(src, src_vuv, tgt, tgt_vuv,
                                linear_domain=linear_domain)
            np.testing.assert_almost_equal(
                with_lengths, without_lengths, decimal=5)
            assert without_lengths > 0
def compute_distortions(pred_out_feats, out_feats, lengths, out_scaler, model_config):
    """Evaluate objective distortions of predicted vs. reference features.

    Both inputs are denormalized with ``out_scaler.inverse_transform`` and
    split into static streams with ``get_static_features``. The first four
    streams are treated as mgc, lf0, vuv and bap, in that order.

    Args:
        pred_out_feats: predicted acoustic features (normalized).
        out_feats: ground-truth acoustic features (normalized).
        lengths: sequence lengths, forwarded to every metric.
        out_scaler: object providing ``inverse_transform``.
        model_config: configuration object exposing ``num_windows``,
            ``stream_sizes`` and ``has_dynamic_features`` as attributes.

    Returns:
        dict: always contains ``ObjEval_MGC_MCD``, ``ObjEval_BAP_MCD`` and
        ``ObjEval_VUV_ERR``; ``ObjEval_F0_RMSE`` is added unless the F0
        computation raises ``ZeroDivisionError``.
    """

    def _static_streams(feats):
        # Split denormalized features into their per-stream static parts.
        return get_static_features(
            feats,
            model_config.num_windows,
            model_config.stream_sizes,
            model_config.has_dynamic_features,
        )

    out_feats = out_scaler.inverse_transform(out_feats)
    pred_out_feats = out_scaler.inverse_transform(pred_out_feats)

    ref_streams = _static_streams(out_feats)
    hyp_streams = _static_streams(pred_out_feats)
    assert len(ref_streams) >= 4

    # Only the first four streams are used; any extra ones are ignored.
    mgc, lf0, vuv, bap = (ref_streams[i] for i in range(4))
    pred_mgc, pred_lf0, pred_vuv, pred_bap = (
        hyp_streams[i] for i in range(4))

    # Threshold the V/UV streams at 0.5 to get hard 0/1 flags.
    vuv = (vuv > 0.5).float()
    pred_vuv = (pred_vuv > 0.5).float()

    dist = {}
    # Index 0 of the last mgc axis is left out of the MCD.
    dist["ObjEval_MGC_MCD"] = metrics.melcd(
        mgc[:, :, 1:], pred_mgc[:, :, 1:], lengths=lengths)
    dist["ObjEval_BAP_MCD"] = (
        metrics.melcd(bap, pred_bap, lengths=lengths) / 10.0)
    dist["ObjEval_VUV_ERR"] = metrics.vuv_error(
        vuv, pred_vuv, lengths=lengths)

    try:
        f0_mse = metrics.lf0_mean_squared_error(
            lf0, vuv, pred_lf0, pred_vuv,
            lengths=lengths, linear_domain=True)
        dist["ObjEval_F0_RMSE"] = np.sqrt(f0_mse)
    except ZeroDivisionError:
        # Best effort: the F0 entry is simply omitted in this case.
        pass

    return dist