def gen_parameters(y_predicted, Y_mean, Y_std):
    """Generate denormalized static streams from a normalized prediction.

    MLPG is run on the still-normalized mgc/lf0/bap streams with unit
    variances; afterwards every stream is passed through ``P.inv_scale``
    using the ``"acoustic"`` entries of ``Y_mean`` and ``Y_std``.

    Args:
        y_predicted: 2-d array whose columns follow
            ``hp_acoustic.stream_sizes`` (mgc, lf0, vuv, bap).
        Y_mean: mapping whose ``"acoustic"`` entry holds per-column means.
        Y_std: mapping whose ``"acoustic"`` entry holds per-column scales.

    Returns:
        tuple: ``(mgc, lf0, vuv, bap)``.
    """
    mgc_dim, lf0_dim, vuv_dim, bap_dim = hp_acoustic.stream_sizes
    windows = hp_acoustic.windows

    # Column offsets of each stream inside the stacked feature matrix.
    lf0_begin = mgc_dim
    vuv_begin = lf0_begin + lf0_dim
    bap_begin = vuv_begin + vuv_dim

    def run_mlpg(stream):
        # Unit variances: the features are still normalized at this point.
        return paramgen.mlpg(stream, np.ones(stream.shape[-1]), windows)

    # Split acoustic features
    mgc_raw = y_predicted[:, :lf0_begin]
    lf0_raw = y_predicted[:, lf0_begin:vuv_begin]
    vuv = y_predicted[:, vuv_begin]
    bap_raw = y_predicted[:, bap_begin:]

    # Perform MLPG on normalized features
    mgc = run_mlpg(mgc_raw)
    lf0 = run_mlpg(lf0_raw)
    bap = run_mlpg(bap_raw)

    # When we use MGE training, denormalization should be done after MLPG.
    mean, std = Y_mean["acoustic"], Y_std["acoustic"]
    num_windows = len(windows)

    def denormalize(feats, begin, dim):
        # Only the static part (dim // num_windows columns) of the
        # statistics applies to the MLPG output.
        end = begin + dim // num_windows
        return P.inv_scale(feats, mean[begin:end], std[begin:end])

    mgc = denormalize(mgc, 0, mgc_dim)
    lf0 = denormalize(lf0, lf0_begin, lf0_dim)
    bap = denormalize(bap, bap_begin, bap_dim)
    vuv = P.inv_scale(vuv, mean[vuv_begin], std[vuv_begin])
    return mgc, lf0, vuv, bap
def test_mlpg():
    """Check ``mlpg`` output shape, dtype and global-variance expansion."""
    from nnmnkwii import paramgen as G

    static_dim, T = 2, 10
    windows_set = _get_windows_set()

    # One static vector per frame must come out, whatever the window set.
    for windows in windows_set:
        feature_dim = static_dim * len(windows)
        means = np.random.rand(T, feature_dim)
        variances = np.tile(np.random.rand(feature_dim), (T, 1))
        assert G.mlpg(means, variances, windows).shape == (T, static_dim)

    # Test variances correctly expanded: a 1-d variance vector must give
    # the same result as that vector repeated for every frame.
    for windows in windows_set:
        feature_dim = static_dim * len(windows)
        for dtype in (np.float32, np.float64):
            means = np.random.rand(T, feature_dim).astype(dtype)
            global_variances = np.random.rand(feature_dim).astype(dtype)
            per_frame_variances = np.tile(global_variances, (T, 1))

            from_per_frame = G.mlpg(means, per_frame_variances, windows)
            from_global = G.mlpg(means, global_variances, windows)

            assert from_per_frame.dtype == dtype
            assert np.allclose(from_per_frame, from_global)
def test_functional_mlpg():
    """Compare the autograd MLPG wrappers against the NumPy ``G.mlpg``.

    For every window set the forward result of ``AF.mlpg`` and
    ``AF.unit_variance_mlpg`` (2-d and 3-d input) must match the NumPy
    reference, and a backward pass through an MSE loss must run.
    """
    static_dim, T = 2, 5
    for windows in _get_windows_set():
        torch.manual_seed(1234)
        feature_dim = static_dim * len(windows)
        means = torch.rand(T, feature_dim)
        variances = torch.ones(feature_dim)

        # NumPy reference, wrapped as a constant target.
        reference = G.mlpg(means.numpy(), variances.numpy(), windows)
        target = Variable(torch.from_numpy(reference), requires_grad=False)
        means = Variable(means, requires_grad=True)

        # mlpg
        generated = AF.mlpg(means, variances, windows)
        assert np.allclose(target.data.numpy(), generated.data.numpy())
        # Test backward pass
        nn.MSELoss()(generated, target).backward()

        # unit_variance_mlpg
        R = torch.from_numpy(G.unit_variance_mlpg_matrix(windows, T))
        generated = AF.unit_variance_mlpg(R, means)
        assert np.allclose(target.data.numpy(), generated.data.numpy())
        nn.MSELoss()(generated, target).backward()

        # Test 3D tensor inputs
        batched = AF.unit_variance_mlpg(
            R, means.view(1, -1, means.size(-1)))
        flattened = batched.data.view(-1, static_dim)
        assert np.allclose(target.data.numpy(), flattened.numpy())
        nn.MSELoss()(batched.view(-1, static_dim), target).backward()
def multi_stream_mlpg(
    inputs,
    variances,
    windows,
    stream_sizes=None,
    has_dynamic_features=None,
    streams=None,
):
    """Split streams and apply MLPG to those that have dynamic features.

    Args:
        inputs (array like): 2-d input of shape ``(T, D)`` where ``D`` is
            ``sum(stream_sizes)``.
        variances (array like): variances of the input features; either the
            same shape as ``inputs`` (per-frame) or 1-d, in which case each
            stream's slice is tiled over the ``T`` frames.
        windows (list): windows for parameter generation.
        stream_sizes (list): stream sizes. Defaults to ``[180, 3, 1, 3]``.
        has_dynamic_features (list): binary flags that indicate whether each
            stream has dynamic features. Defaults to
            ``[True, True, False, True]``.
        streams (list, optional): flags selecting the streams of interest.
            All streams are returned if None. Defaults to None.

    Raises:
        RuntimeError: if ``D`` does not match ``sum(stream_sizes)``.

    Returns:
        numpy.ndarray: the selected streams concatenated along the last
        axis; streams with dynamic features are replaced by the MLPG output.
    """
    if stream_sizes is None:
        stream_sizes = [180, 3, 1, 3]
    if has_dynamic_features is None:
        has_dynamic_features = [True, True, False, True]
    if streams is None:
        streams = [True] * len(stream_sizes)

    # Accept any array-like (lists included); ndarrays pass through uncopied.
    inputs = np.asarray(inputs)
    variances = np.asarray(variances)

    T, D = inputs.shape
    if D != sum(stream_sizes):
        raise RuntimeError("You probably have specified wrong dimension params.")

    # Stream indices for static+delta features
    # [0, 180, 183, 184]
    start_indices = np.hstack(([0], np.cumsum(stream_sizes)[:-1]))
    # [180, 183, 184, 187]
    end_indices = np.cumsum(stream_sizes)

    ret = []
    for in_start_idx, in_end_idx, v, enabled in zip(
        start_indices,
        end_indices,
        has_dynamic_features,
        streams,
    ):
        if not enabled:
            continue
        x = inputs[:, in_start_idx:in_end_idx]
        if not v:
            # No dynamic features: the stream is passed through untouched,
            # so its variances are never needed.
            ret.append(x)
            continue
        if inputs.shape == variances.shape:
            var_ = variances[:, in_start_idx:in_end_idx]
        else:
            var_ = np.tile(variances[in_start_idx:in_end_idx], (T, 1))
        ret.append(paramgen.mlpg(x, var_, windows))

    return np.concatenate(ret, -1)
def gen_parameters(y_predicted):
    """Split a predicted acoustic sequence and run MLPG on each stream.

    Relies on the module-level ``lf0_start_idx``, ``vuv_start_idx``,
    ``bap_start_idx``, ``windows`` and ``Y_acoustic_std`` (its ``var_``
    attribute supplies the variances).

    Args:
        y_predicted: 2-d array, one row per time frame.

    Returns:
        tuple: ``(mgc, lf0, vuv, bap)``; ``vuv`` is the raw predicted column,
        the other three are MLPG outputs.
    """
    num_frames = y_predicted.shape[0]

    # The V/UV column is not passed through MLPG.
    vuv = y_predicted[:, vuv_start_idx]

    def smooth(columns):
        # Repeat the stream's variances for every frame, then run MLPG.
        frame_variances = np.tile(Y_acoustic_std.var_[columns], (num_frames, 1))
        return paramgen.mlpg(y_predicted[:, columns], frame_variances, windows)

    mgc = smooth(slice(None, lf0_start_idx))
    lf0 = smooth(slice(lf0_start_idx, vuv_start_idx))
    bap = smooth(slice(bap_start_idx, None))
    return mgc, lf0, vuv, bap
def gen_parameters(self, utt_id, labels):
    """Predict acoustic features for an utterance and run MLPG on them.

    Args:
        utt_id: utterance identifier, forwarded to ``add_speaker_code``.
        labels: label object passed to ``fe.linguistic_features``.

    Returns:
        numpy.ndarray: ``[mgc, lf0, vuv, bap]`` stacked column-wise, where
        mgc/lf0/bap are MLPG outputs and vuv is the predicted mean column.
    """
    linguistic = fe.linguistic_features(
        labels,
        self.binary_dict,
        self.continuous_dict,
        add_frame_features=True,
        subphone_features='coarse_coding',
    ).astype(np.float32)

    # normalize
    # NOTE(review): this uses a bare ``scaler`` name while the output side
    # below uses ``self.scaler`` -- confirm a module-level ``scaler`` exists.
    linguistic = scaler['X']['acoustic'].transform(linguistic)

    # add speaker information
    model_input = self.add_speaker_code(utt_id, linguistic)

    # predict acoustic features
    model_input = torch.from_numpy(model_input).to(device)
    pred = self.acoustic_model.predict(model_input)
    pred_mean = pred['mean'].data.cpu().numpy()
    pred_var = pred['var'].data.cpu().numpy()

    # denormalize: means via the scaler, variances by the squared scale
    out_scaler = self.scaler['Y']['acoustic']
    scale = out_scaler.scale_
    pred_mean = out_scaler.inverse_transform(pred_mean)
    pred_var *= scale ** 2

    lf0_at, vuv_at, bap_at = (
        self.lf0_start_idx, self.vuv_start_idx, self.bap_start_idx)

    # The V/UV column is taken from the means as-is (no MLPG).
    vuv = pred_mean[:, vuv_at]

    # perform MLPG per stream to calculate static features
    stream_columns = (
        slice(None, lf0_at),
        slice(lf0_at, vuv_at),
        slice(bap_at, None),
    )
    mgc, lf0, bap = [
        mlpg(pred_mean[:, cols], pred_var[:, cols], self.windows)
        for cols in stream_columns
    ]

    return np.hstack([mgc, lf0, vuv.reshape(-1, 1), bap])
def _generate_parameters(self, path, var):
    """Generate a feature sequence for ``path`` and run MLPG per stream.

    Args:
        path: argument forwarded to ``self.parameter_generator.generate``.
        var: 1-d per-dimension variances; each stream's slice is repeated
            for every frame before MLPG.

    Returns:
        tuple: ``(mgc, lf0, vuv, bap)``; ``vuv`` is the raw column, the
        others are MLPG outputs.
    """
    seq = trim_zeros_frames(self.parameter_generator.generate(path))
    num_frames = seq.shape[0]

    indices = self.feature_config.get_indices()
    lf0_at, vuv_at, bap_at = indices['lf0'], indices['vuv'], indices['bap']
    window = self.analysis_config.window

    vuv = seq[:, vuv_at]

    def smooth(columns):
        frame_var = np.tile(var[columns], (num_frames, 1))
        return paramgen.mlpg(seq[:, columns], frame_var, window)

    mgc = smooth(slice(None, lf0_at))
    lf0 = smooth(slice(lf0_at, vuv_at))
    bap = smooth(slice(bap_at, None))
    return mgc, lf0, vuv, bap
def test_unit_variance_mlpg():
    """The unit-variance MLPG matrix must reproduce ``G.mlpg`` with ones."""
    static_dim, T = 2, 10
    for windows in _get_windows_set():
        feature_dim = static_dim * len(windows)
        means = np.random.rand(T, feature_dim)

        # Reference: regular MLPG with all variances equal to one.
        expected = G.mlpg(means, np.ones(feature_dim), windows)

        # Same result through a single matrix product.
        mlpg_matrix = G.unit_variance_mlpg_matrix(windows, T)
        actual = mlpg_matrix.dot(G.reshape_means(means, static_dim))

        assert np.allclose(actual, expected)
def gen_parameters(y_predicted, Y_var):
    """Split a predicted acoustic sequence and run MLPG on each stream.

    Stream boundaries come from ``hp`` (``lf0_start_idx``, ``vuv_start_idx``,
    ``bap_start_idx``); ``windows`` is read from module scope.

    Args:
        y_predicted: 2-d array, one row per time frame.
        Y_var: mapping whose ``"acoustic"`` entry holds 1-d per-column
            variances.

    Returns:
        tuple: ``(mgc, lf0, vuv, bap)``; ``vuv`` is the raw predicted column.
    """
    # Number of time frames
    num_frames = y_predicted.shape[0]

    vuv = y_predicted[:, hp.vuv_start_idx]

    # Column ranges of the streams that go through MLPG (mgc, lf0, bap).
    stream_columns = (
        slice(None, hp.lf0_start_idx),
        slice(hp.lf0_start_idx, hp.vuv_start_idx),
        slice(hp.bap_start_idx, None),
    )
    mgc, lf0, bap = [
        paramgen.mlpg(
            y_predicted[:, cols],
            # Global variances repeated for every frame.
            np.tile(Y_var["acoustic"][cols], (num_frames, 1)),
            windows,
        )
        for cols in stream_columns
    ]
    return mgc, lf0, vuv, bap
def forward(self, means):
    """Run NumPy MLPG on ``means`` with the variances stored on ``self``.

    Args:
        means: 2-d tensor with the same size as ``self.variances``.

    Returns:
        torch.Tensor: float32 tensor built from the ``G.mlpg`` result.
    """
    assert means.dim() == 2  # we cannot do MLPG on minibatch
    variances = self.variances
    self.save_for_backward(means)

    num_frames, feature_dim = means.size()
    assert (num_frames, feature_dim) == variances.size()

    # The computation itself happens in NumPy, outside of autograd.
    generated = G.mlpg(
        means.detach().numpy(), variances.detach().numpy(), self.windows)
    return torch.from_numpy(generated.astype(np.float32))
def gen_parameters(y_predicted, verbose=True):
    """Split a predicted acoustic sequence and run MLPG on each stream.

    Stream boundaries (``lf0_start_idx``, ``vuv_start_idx``,
    ``bap_start_idx``), ``windows`` and ``y_stats`` are read from module
    scope; ``y_stats['var']`` supplies the per-column variances.

    Args:
        y_predicted: 2-d array, one row per time frame.
        verbose: accepted for interface compatibility; not used here.

    Returns:
        tuple: ``(mgc, lf0, vuv, bap)``; ``vuv`` is the raw predicted column.
    """
    # Number of time frames
    num_frames = y_predicted.shape[0]
    tile_shape = (num_frames, 1)

    # Split acoustic features
    mgc_in = y_predicted[:, :lf0_start_idx]
    lf0_in = y_predicted[:, lf0_start_idx:vuv_start_idx]
    vuv = y_predicted[:, vuv_start_idx]
    bap_in = y_predicted[:, bap_start_idx:]

    # Perform MLPG with the global variances repeated for every frame
    mgc_var = np.tile(y_stats['var'][:lf0_start_idx], tile_shape)
    mgc = paramgen.mlpg(mgc_in, mgc_var, windows)

    lf0_var = np.tile(y_stats['var'][lf0_start_idx:vuv_start_idx], tile_shape)
    lf0 = paramgen.mlpg(lf0_in, lf0_var, windows)

    bap_var = np.tile(y_stats['var'][bap_start_idx:], tile_shape)
    bap = paramgen.mlpg(bap_in, bap_var, windows)

    return mgc, lf0, vuv, bap
def multi_stream_mlpg(inputs, variances, windows,
                      stream_sizes=None,
                      has_dynamic_features=None,
                      streams=None):
    """Split streams and apply MLPG to those that have dynamic features.

    Args:
        inputs (array like): 2-d input of shape ``(T, D)`` where ``D`` is
            ``sum(stream_sizes)``.
        variances (array like): variances of the input features; either the
            same shape as ``inputs`` (per-frame) or 1-d, in which case each
            stream's slice is tiled over the ``T`` frames.
        windows (list): windows for parameter generation.
        stream_sizes (list): stream sizes. Defaults to ``[180, 3, 1, 3]``.
        has_dynamic_features (list): flags telling whether each stream has
            dynamic features. Defaults to ``[True, True, False, True]``.
        streams (list): flags selecting which streams to return. Defaults to
            all streams.

    Raises:
        RuntimeError: if ``D`` does not match ``sum(stream_sizes)``.

    Returns:
        numpy.ndarray: the selected streams concatenated along the last
        axis; streams with dynamic features are replaced by the MLPG output.
    """
    # ``None`` sentinels instead of mutable list defaults.
    if stream_sizes is None:
        stream_sizes = [180, 3, 1, 3]
    if has_dynamic_features is None:
        has_dynamic_features = [True, True, False, True]
    if streams is None:
        streams = [True] * len(stream_sizes)

    # Accept any array-like (lists included); ndarrays pass through uncopied.
    inputs = np.asarray(inputs)
    variances = np.asarray(variances)

    T, D = inputs.shape
    if D != sum(stream_sizes):
        raise RuntimeError(
            "You probably have specified wrong dimension params.")

    # Stream indices for static+delta features
    # [0, 180, 183, 184]
    start_indices = np.hstack(([0], np.cumsum(stream_sizes)[:-1]))
    # [180, 183, 184, 187]
    end_indices = np.cumsum(stream_sizes)

    ret = []
    for in_start_idx, in_end_idx, v, enabled in zip(
            start_indices, end_indices, has_dynamic_features, streams):
        if not enabled:
            continue
        x = inputs[:, in_start_idx:in_end_idx]
        if not v:
            # No dynamic features: pass the stream through untouched.
            ret.append(x)
            continue
        if inputs.shape == variances.shape:
            var_ = variances[:, in_start_idx:in_end_idx]
        else:
            var_ = np.tile(variances[in_start_idx:in_end_idx], (T, 1))
        ret.append(paramgen.mlpg(x, var_, windows))

    return np.concatenate(ret, -1)
def gen_parameters(self, y_predicted, mge_training=False):
    """Turn a normalized acoustic prediction into static feature streams.

    ``stream_sizes`` and ``windows`` are read from module scope; the
    normalization statistics come from ``self.mean`` and ``self.std``.

    Args:
        y_predicted: 2-d array of normalized predictions.
        mge_training: if True, run MLPG on the normalized features with unit
            variances and denormalize afterwards; otherwise denormalize
            first and run MLPG with ``self.std ** 2`` as variances.

    Returns:
        tuple: ``(mgc, lf0, vuv, bap)``.
    """
    mgc_dim, lf0_dim, vuv_dim, bap_dim = stream_sizes
    lf0_at = mgc_dim
    vuv_at = lf0_at + lf0_dim
    bap_at = vuv_at + vuv_dim

    def split(features):
        # Cut the stacked feature matrix into (mgc, lf0, vuv, bap).
        return (
            features[:, :lf0_at],
            features[:, lf0_at:vuv_at],
            features[:, vuv_at],
            features[:, bap_at:],
        )

    if not mge_training:
        # Denormalization first, then MLPG with the data variances.
        mgc, lf0, vuv, bap = split(
            P.inv_scale(y_predicted, self.mean, self.std))
        variance = self.std * self.std
        mgc = paramgen.mlpg(mgc, variance[:lf0_at], windows)
        lf0 = paramgen.mlpg(lf0, variance[lf0_at:vuv_at], windows)
        bap = paramgen.mlpg(bap, variance[bap_at:], windows)
        return mgc, lf0, vuv, bap

    # MGE training: perform MLPG on normalized features.
    mgc, lf0, vuv, bap = split(y_predicted)
    mgc = paramgen.mlpg(mgc, np.ones(mgc.shape[-1]), windows)
    lf0 = paramgen.mlpg(lf0, np.ones(lf0.shape[-1]), windows)
    bap = paramgen.mlpg(bap, np.ones(bap.shape[-1]), windows)

    # When we use MGE training, denormalization should be done after MLPG.
    num_windows = len(windows)
    mgc_stop = mgc_dim // num_windows
    lf0_stop = lf0_at + lf0_dim // num_windows
    bap_stop = bap_at + bap_dim // num_windows
    mgc = P.inv_scale(mgc, self.mean[:mgc_stop], self.std[:mgc_stop])
    lf0 = P.inv_scale(
        lf0, self.mean[lf0_at:lf0_stop], self.std[lf0_at:lf0_stop])
    bap = P.inv_scale(
        bap, self.mean[bap_at:bap_stop], self.std[bap_at:bap_stop])
    vuv = P.inv_scale(vuv, self.mean[vuv_at], self.std[vuv_at])
    return mgc, lf0, vuv, bap
def gen_parameters(y_predicted, Y_mean, Y_std, mge_training=True):
    """Turn a normalized acoustic prediction into static feature streams.

    Stream sizes and windows are read from ``audio_world_config``.

    Args:
        y_predicted: 2-d array of normalized predictions.
        Y_mean: per-column means used by ``P.inv_scale``.
        Y_std: per-column scales used by ``P.inv_scale``.
        mge_training: if True, MLPG runs on the normalized features with
            unit variances and denormalization follows; otherwise the
            prediction is denormalized first and ``Y_std ** 2`` serves as
            the MLPG variances.

    Returns:
        tuple: ``(mgc, lf0, vuv, bap)``.
    """
    mgc_dim, lf0_dim, vuv_dim, bap_dim = audio_world_config.stream_sizes
    windows = audio_world_config.windows

    # Column offsets of the streams in the stacked feature matrix.
    lf0_begin = mgc_dim
    vuv_begin = lf0_begin + lf0_dim
    bap_begin = vuv_begin + vuv_dim

    mgc_cols = slice(None, lf0_begin)
    lf0_cols = slice(lf0_begin, vuv_begin)
    bap_cols = slice(bap_begin, None)

    if mge_training:
        # Split acoustic features
        mgc = y_predicted[:, mgc_cols]
        lf0 = y_predicted[:, lf0_cols]
        vuv = y_predicted[:, vuv_begin]
        bap = y_predicted[:, bap_cols]

        # Perform MLPG on normalized features
        mgc = paramgen.mlpg(mgc, np.ones(mgc.shape[-1]), windows)
        lf0 = paramgen.mlpg(lf0, np.ones(lf0.shape[-1]), windows)
        bap = paramgen.mlpg(bap, np.ones(bap.shape[-1]), windows)

        # When we use MGE training, denormalization should be done after
        # MLPG, using only the static part of each stream's statistics.
        num_windows = len(windows)
        mgc_static = slice(None, mgc_dim // num_windows)
        lf0_static = slice(lf0_begin, lf0_begin + lf0_dim // num_windows)
        bap_static = slice(bap_begin, bap_begin + bap_dim // num_windows)

        mgc = P.inv_scale(mgc, Y_mean[mgc_static], Y_std[mgc_static])
        lf0 = P.inv_scale(lf0, Y_mean[lf0_static], Y_std[lf0_static])
        bap = P.inv_scale(bap, Y_mean[bap_static], Y_std[bap_static])
        vuv = P.inv_scale(vuv, Y_mean[vuv_begin], Y_std[vuv_begin])
    else:
        # Denormalization first
        denormalized = P.inv_scale(y_predicted, Y_mean, Y_std)
        vuv = denormalized[:, vuv_begin]

        # Perform MLPG with the data variances
        Y_var = Y_std * Y_std
        mgc = paramgen.mlpg(denormalized[:, mgc_cols], Y_var[mgc_cols], windows)
        lf0 = paramgen.mlpg(denormalized[:, lf0_cols], Y_var[lf0_cols], windows)
        bap = paramgen.mlpg(denormalized[:, bap_cols], Y_var[bap_cols], windows)

    return mgc, lf0, vuv, bap
def gen_parameters(y_predicted, Y_mean, Y_std, mge_training=True):
    """Turn a normalized acoustic prediction into static feature streams.

    Stream sizes and windows are read from ``hp_acoustic``.

    Args:
        y_predicted: 2-d array of normalized predictions.
        Y_mean: mapping whose ``"acoustic"`` entry holds per-column means.
        Y_std: mapping whose ``"acoustic"`` entry holds per-column scales.
        mge_training: if True, MLPG runs on the normalized features with
            unit variances and denormalization follows; otherwise the
            prediction is denormalized first and the squared scales serve
            as the MLPG variances.

    Returns:
        tuple: ``(mgc, lf0, vuv, bap)``.
    """
    mgc_dim, lf0_dim, vuv_dim, bap_dim = hp_acoustic.stream_sizes
    lf0_start_idx = mgc_dim
    vuv_start_idx = lf0_start_idx + lf0_dim
    bap_start_idx = vuv_start_idx + vuv_dim
    windows = hp_acoustic.windows
    ty = "acoustic"

    if mge_training:
        # Split acoustic features
        mgc = y_predicted[:, :lf0_start_idx]
        lf0 = y_predicted[:, lf0_start_idx:vuv_start_idx]
        vuv = y_predicted[:, vuv_start_idx]
        bap = y_predicted[:, bap_start_idx:]

        # Perform MLPG on normalized features
        mgc = paramgen.mlpg(mgc, np.ones(mgc.shape[-1]), windows)
        lf0 = paramgen.mlpg(lf0, np.ones(lf0.shape[-1]), windows)
        bap = paramgen.mlpg(bap, np.ones(bap.shape[-1]), windows)

        # When we use MGE training, denormalization should be done after
        # MLPG, using only the static part of each stream's statistics.
        num_windows = len(windows)
        mean, std = Y_mean[ty], Y_std[ty]
        mgc_end = mgc_dim // num_windows
        lf0_end = lf0_start_idx + lf0_dim // num_windows
        bap_end = bap_start_idx + bap_dim // num_windows

        mgc = P.inv_scale(mgc, mean[:mgc_end], std[:mgc_end])
        lf0 = P.inv_scale(
            lf0, mean[lf0_start_idx:lf0_end], std[lf0_start_idx:lf0_end])
        bap = P.inv_scale(
            bap, mean[bap_start_idx:bap_end], std[bap_start_idx:bap_end])
        vuv = P.inv_scale(vuv, mean[vuv_start_idx], std[vuv_start_idx])
    else:
        # Denormalization first. The statistics are keyed by feature type,
        # so the "acoustic" entries must be selected here as well (the
        # whole mappings were passed before).
        y_predicted = P.inv_scale(y_predicted, Y_mean[ty], Y_std[ty])

        # Split acoustic features
        mgc = y_predicted[:, :lf0_start_idx]
        lf0 = y_predicted[:, lf0_start_idx:vuv_start_idx]
        vuv = y_predicted[:, vuv_start_idx]
        bap = y_predicted[:, bap_start_idx:]

        # Perform MLPG with the data variances
        Y_var = Y_std[ty] * Y_std[ty]
        mgc = paramgen.mlpg(mgc, Y_var[:lf0_start_idx], windows)
        lf0 = paramgen.mlpg(lf0, Y_var[lf0_start_idx:vuv_start_idx], windows)
        bap = paramgen.mlpg(bap, Y_var[bap_start_idx:], windows)

    return mgc, lf0, vuv, bap
def transform(self, src):
    """Map source features to target features with GMM-based MLPG.

    The target sequence is chosen to maximize the likelihood of the target
    given the source.

    Args:
        src (array): shape (`the number of frames`, `the order of spectral
            feature`); the source speaker's spectral feature sequence to
            be transformed.

    Returns:
        array: a sequence of transformed features
    """
    num_frames, feature_dim = src.shape[0], src.shape[1]

    # Input without dynamic features: defer to the parent implementation.
    if feature_dim == self.static_dim:
        return super(MLPG, self).transform(src)

    # A suboptimum mixture sequence (eq.37)
    mixture_seq = self.px.predict(src)

    cond_means = np.empty((num_frames, feature_dim))  # E, eq.(40)
    cond_vars = np.empty((num_frames, feature_dim))   # D, eq.(23)
    for t in range(num_frames):
        m = mixture_seq[t]  # estimated mixture index at time t

        # Eq. (22): conditional mean of the target given the source frame
        solved = np.linalg.solve(self.covarXX[m], src[t] - self.src_means[m])
        cond_means[t] = self.tgt_means[m] + np.dot(self.covarYX[m], solved)

        # Eq. (23), with covariances approximated by their diagonals so
        # that MLPG can be done efficiently dimension by dimension
        ratio = np.diag(self.covarYX[m]) / np.diag(self.covarXX[m])
        cond_vars[t] = np.diag(self.covarYY[m]) - ratio * np.diag(self.covarXY[m])

    # Once we have mean and variance over frames, then we can do MLPG
    return mlpg(cond_means, cond_vars, self.windows)
def test_minibatch_unit_variance_mlpg_gradcheck():
    """Check ``AF.unit_variance_mlpg`` on 2-d vs. batched 3-d inputs.

    For each window set, the same utterance is fed once as a 2-d tensor and
    once expanded ``batch_size`` times into a 3-d tensor, both for means
    pre-processed with ``G.reshape_means`` and for raw means. Outputs must
    match each other, and after an MSE backward pass every batch item's
    gradient must match the 2-d gradient.
    """
    static_dim = 2
    T = 5
    for windows in _get_windows_set():
        batch_size = 5
        torch.manual_seed(1234)
        # Prepare inputs: every batch item is the same utterance
        means = torch.rand(T, static_dim * len(windows))
        means_expanded = means.expand(
            batch_size, means.shape[0], means.shape[1])
        reshaped_means = torch.from_numpy(
            G.reshape_means(means.numpy(), static_dim))
        reshaped_means_expanded = reshaped_means.expand(
            batch_size, reshaped_means.shape[0], reshaped_means.shape[1])
        # Target: NumPy MLPG with unit variances
        y = G.mlpg(means.numpy(), np.ones(static_dim * len(windows)), windows)
        y = Variable(torch.from_numpy(y), requires_grad=False)
        y_expanded = y.expand(batch_size, y.size(0), y.size(1))
        # Pack into variables that record gradients
        means = Variable(means, requires_grad=True)
        means_expanded = Variable(means_expanded, requires_grad=True)
        reshaped_means = Variable(reshaped_means, requires_grad=True)
        reshaped_means_expanded = Variable(
            reshaped_means_expanded, requires_grad=True)
        # Case 1: 2d with reshaped means
        R = torch.from_numpy(G.unit_variance_mlpg_matrix(windows, T))
        y_hat1 = AF.unit_variance_mlpg(R, reshaped_means)
        # Case 2: 3d with reshaped means
        y_hat2 = AF.unit_variance_mlpg(R, reshaped_means_expanded)
        for i in range(batch_size):
            assert np.allclose(y_hat1.data.numpy(), y_hat2[i].data.numpy())
        nn.MSELoss()(y_hat1, y).backward()
        nn.MSELoss()(y_hat2, y_expanded).backward()
        # Check grad consistency between the 2d and 3d reshaped inputs
        for i in range(batch_size):
            grad1 = reshaped_means.grad.data.numpy()
            grad2 = reshaped_means_expanded.grad[i].data.numpy()
            assert np.allclose(grad1, grad2)
        # Case 3: 2d with non-reshaped input
        y_hat3 = AF.unit_variance_mlpg(R, means)
        # Case 4: 3d with non-reshaped input
        y_hat4 = AF.unit_variance_mlpg(R, means_expanded)
        for i in range(batch_size):
            # The first check does not depend on i; it is repeated per item.
            assert np.allclose(y_hat1.data.numpy(), y_hat3.data.numpy())
            assert np.allclose(y_hat3.data.numpy(), y_hat4[i].data.numpy())
        nn.MSELoss()(y_hat3, y).backward()
        nn.MSELoss()(y_hat4, y_expanded).backward()
        # Check grad consistency between the 2d and 3d raw inputs
        for i in range(batch_size):
            grad1 = means.grad.data.numpy()
            grad2 = means_expanded.grad[i].data.numpy()
            assert np.allclose(grad1, grad2)
### Get action mean, var = policy(inputs, length.tolist()) m = mean.detach().to('cpu').numpy() v = var.detach().to('cpu').numpy() dm = librosa.feature.delta(m, width=9, order=1, axis=1) ddm = librosa.feature.delta(m, width=9, order=2, axis=1) dv = librosa.feature.delta(v, width=9, order=1, axis=1) dv = 2 * v + dv dv = np.where(dv <= 0, 1e-10, dv) ddv = librosa.feature.delta(dv, width=9, order=1, axis=1) ddv = 2 * dv + ddv ddv = np.where(ddv <= 0, 1e-10, ddv) m = np.concatenate((m, dm, ddm), axis=2) v = np.concatenate((v, dv, ddv), axis=2) action = G.mlpg(m[0], v[0], windows) action = torch.from_numpy(np.asarray(action, dtype=np.float32)).to(device) action = utils.trans_param(action).unsqueeze(dim=0) ### Store parameters tractParams = action[0, :length[0], :24].reshape(1, length[0], 24) glottisParams = action[0, :length[0], -6:].reshape(1, length[0], 6) t_param = tractParams.to('cpu') t_tmp = torch.zeros(1, 5, 24) t_tmp[:, 2, :] = t_param[:, 0, :] / 3 t_tmp[:, 3, :] = t_param[:, 0, :] * 2 / 3 t_tmp[:, 4, :] = t_param[:, 0, :] t_param = torch.cat((t_tmp, t_param), dim=1) for i in range(5): t_param = torch.cat((t_param, t_param[:, -1, :].unsqueeze(dim=1)), dim=1)
# Per-sequence metric accumulators: raw prediction, MLPG, Kalman, low-pass.
tCC=[]
trMSE=[]
tCC_MLPG=[]
trMSE_MLPG=[]
tCC_KM=[]
trMSE_KM=[]
tCC_LPF=[]
trMSE_LPF=[]
for i in np.arange(0,len(Youttest)):
    s_in=X_testseq[i]
    # Raw model prediction for this test sequence.
    val=np.squeeze(model.predict(s_in));
    predSeq_wom[0,i]=val
    # Variance of the prediction over frames, repeated for every frame.
    Dvar=np.tile(np.var(val,axis=0),(val.shape[0],1))
    predSeq_wm[0,i]=mlpg(val, Dvar, windows) # MLPG-smoothed prediction
    k_smth = kalmansmooth(val.transpose()).transpose() # Kalman smoothing
    predSeq_kf[0,i]=k_smth
    # Filtered prediction using the fb/fa coefficients.
    yLPF = filtfilt(fb, fa, val.transpose()).transpose()
    predSeq_lpf[0,i]=yLPF
    YtestOrg[0,i]=Youttest[i]
    # Metrics of the raw prediction against the reference.
    iCC,irMSE=EvalMetric(val,np.squeeze(Youttest[i]))
    tCC.append(iCC)
    trMSE.append(irMSE)
    # NOTE(review): mlpg(val, Dvar, windows) is computed a second time here;
    # the value stored in predSeq_wm[0,i] could presumably be reused.
    iCC,irMSE=EvalMetric(mlpg(val, Dvar, windows),np.squeeze(Youttest[i]))
# Per-sequence metric accumulators: raw prediction, MLPG, Kalman, low-pass.
tCC=[]
trMSE=[]
tCC_MLPG=[]
trMSE_MLPG=[]
tCC_KM=[]
trMSE_KM=[]
tCC_LPF=[]
trMSE_LPF=[]
paramgen = GMM_M(gmm, windows=windows) # GMM_M instance built from the fitted gmm
for i in np.arange(0,len(Youttest)):
    s_in=np.squeeze(X_testseq[i])
    val, D, W=paramgen.transform(s_in) # val: Conditional Expectation; D: Conditional Variance; W: windows
    predSeq_wom[0,i]=val
    predSeq_wm[0,i]=mlpg(val, D, W) # MLPG-smoothed prediction
    k_smth = kalmansmooth(val.transpose()).transpose() # Kalman smoothing
    predSeq_kf[0,i]=k_smth
    # Filtered prediction using the fb/fa coefficients.
    yLPF = filtfilt(fb, fa, val.transpose()).transpose()
    predSeq_lpf[0,i]=yLPF
    YtestOrg[0,i]=Youttest[i]
    # Metrics of the raw conditional expectation against the reference.
    iCC,irMSE=EvalMetric(val,np.squeeze(Youttest[i]))
    tCC.append(iCC)
    trMSE.append(irMSE)
    # NOTE(review): mlpg(val, D, W) is computed a second time here; the
    # value stored in predSeq_wm[0,i] could presumably be reused.
    iCC,irMSE=EvalMetric(mlpg(val, D, W),np.squeeze(Youttest[i]))
    tCC_MLPG.append(iCC)
    trMSE_MLPG.append(irMSE)
def benchmark_mlpg(static_dim=59, T=100, batch_size=10, use_cuda=True):
    """Time ``AF.mlpg`` against ``AF.unit_variance_mlpg`` and print the ratio.

    Both variants run ``batch_size`` forward/backward passes on the same
    random means with unit variances. The second variant also pays for
    building the MLPG matrix and, with ``use_cuda``, for the host/device
    transfers inside the timed region.

    Args:
        static_dim: number of static feature dimensions.
        T: number of frames.
        batch_size: number of timed forward/backward iterations.
        use_cuda: run the unit-variance variant on the GPU; the benchmark
            returns silently when CUDA is requested but unavailable.
    """
    if use_cuda and not torch.cuda.is_available():
        return

    windows = _get_windows_set()[-1]
    np.random.seed(1234)
    torch.manual_seed(1234)
    means = np.random.rand(T, static_dim * len(windows)).astype(np.float32)
    variances = np.ones(static_dim * len(windows))

    # Pseudo target
    y = G.mlpg(means, variances, windows).astype(np.float32)

    # Pack into variables
    means = Variable(torch.from_numpy(means), requires_grad=True)
    y = Variable(torch.from_numpy(y), requires_grad=False)
    criterion = nn.MSELoss()

    # Case 1: MLPG
    # perf_counter is a monotonic, high-resolution clock meant for
    # measuring short intervals.
    since = time.perf_counter()
    for _ in range(batch_size):
        y_hat = AF.mlpg(means, torch.from_numpy(variances), windows)
        L = criterion(y_hat, y)
        assert np.allclose(y_hat.data.numpy(), y.data.numpy())
        L.backward()  # slow!
    elapsed_mlpg = time.perf_counter() - since

    # Case 2: UnitVarianceMLPG
    since = time.perf_counter()
    if use_cuda:
        y = y.cuda()
    R = G.unit_variance_mlpg_matrix(windows, T)
    R = torch.from_numpy(R)
    # Assuming minibatches are zero-padded, we only need to create the MLPG
    # matrix per minibatch, not per utterance.
    if use_cuda:
        R = R.cuda()
    for _ in range(batch_size):
        if use_cuda:
            # Round trip so that every iteration includes a transfer.
            means = means.cpu()
            means = means.cuda()

        y_hat = AF.unit_variance_mlpg(R, means)
        L = criterion(y_hat, y)
        assert np.allclose(y_hat.cpu().data.numpy(), y.cpu().data.numpy(),
                           atol=1e-5)
        L.backward()
    elapsed_unit_variance_mlpg = time.perf_counter() - since

    ratio = elapsed_mlpg / elapsed_unit_variance_mlpg

    print(
        "MLPG vs UnitVarianceMLPG (static_dim, T, batch_size, use_cuda) = ({}):"
        .format((static_dim, T, batch_size, use_cuda)))
    if ratio > 1:
        s = "faster"
        sys.stdout.write(OKGREEN)
    else:
        s = "slower"
        sys.stdout.write(FAIL)
    print(
        "UnitVarianceMLPG, {:4f} times {}. Elapsed times {:4f} / {:4f}".format(
            ratio, s, elapsed_mlpg, elapsed_unit_variance_mlpg))
    print(ENDC)
rmse = 0
# Evaluation: no gradients are recorded and the policy is in eval mode.
with torch.no_grad():
    policy.eval()
    mean, var = policy(inputs, length)
    m = mean.detach().to('cpu').numpy()
    v = var.detach().to('cpu').numpy()
    # First- and second-order deltas of the means along axis 1.
    dm = librosa.feature.delta(m, width=9, order=1, axis=1)
    ddm = librosa.feature.delta(m, width=9, order=2, axis=1)
    # Variances for the delta streams are built from v and its deltas;
    # non-positive values are replaced by a small positive constant.
    dv = librosa.feature.delta(v, width=9, order=1, axis=1)
    dv = 2 * v + dv
    dv = np.where(dv <= 0, 1e-10, dv)
    ddv = librosa.feature.delta(dv, width=9, order=1, axis=1)
    ddv = 2 * dv + ddv
    ddv = np.where(ddv <= 0, 1e-10, ddv)
    # Stack static, delta and delta-delta parts along the last axis.
    m = np.concatenate((m, dm, ddm), axis=2)
    v = np.concatenate((v, dv, ddv), axis=2)
    # MLPG is run separately for every item of the batch.
    action = np.zeros((target.shape[0], length[0], OUT_SIZE))
    for i in range(target.shape[0]):
        action[i] = G.mlpg(m[i], v[i], windows)
    action = torch.from_numpy(np.asarray(action, dtype=np.float32)).to(device)
    # Clamp to [0, 1] and keep the first 24 output columns.
    action = torch.clamp(action, min=0.0, max=1.0)[:, :, :24]
    # Element-wise squared error, averaged over dim 1 and then over dim 0,
    # followed by a square root: one RMSE value per output column.
    loss = F.mse_loss(action, target, reduction='none')
    loss = loss.mean(dim=1)
    act_dist = torch.sqrt(loss.mean(dim=0)).to('cpu').unsqueeze(dim=0).numpy()
    # Append the row of RMSE values to the evaluation log.
    with open('log/eval.csv', 'a') as f:
        np.savetxt(f, act_dist, delimiter=',')
# Per-sequence metric accumulators: raw prediction, MLPG, Kalman, low-pass.
tCC = []
trMSE = []
tCC_MLPG = []
trMSE_MLPG = []
tCC_KM = []
trMSE_KM = []
tCC_LPF = []
trMSE_LPF = []
for i in np.arange(0, len(Youttest)):
    s_in = X_testseq[i]
    # Raw model prediction for this test sequence.
    val = np.squeeze(model.predict(s_in))
    predSeq_wom[0, i] = val
    # Variance of the prediction over frames, repeated for every frame.
    Dvar = np.tile(np.var(val, axis=0), (val.shape[0], 1))
    predSeq_wm[0, i] = mlpg(val, Dvar, windows)  # MLPG-smoothed prediction
    k_smth = kalmansmooth(val.transpose()).transpose()  # Kalman smoothing
    predSeq_kf[0, i] = k_smth
    # Filtered prediction using the fb/fa coefficients.
    yLPF = filtfilt(fb, fa, val.transpose()).transpose()
    predSeq_lpf[0, i] = yLPF
    YtestOrg[0, i] = Youttest[i]
    # Metrics of the raw prediction against the reference.
    iCC, irMSE = EvalMetric(val, np.squeeze(Youttest[i]))
    tCC.append(iCC)
    trMSE.append(irMSE)
    # NOTE(review): mlpg(val, Dvar, windows) is computed a second time here;
    # the value stored in predSeq_wm[0, i] could presumably be reused.
    iCC, irMSE = EvalMetric(mlpg(val, Dvar, windows), np.squeeze(Youttest[i]))