import numpy as np
import theano.tensor as T

# NOTE: `logsumexp` and `l2distance` are helpers from the project's utility module
# (import path not shown in this excerpt).


def _compute_losses(self, model_output):
    mask = self.dataset.symb_mask

    # regression_outputs.shape : (batch_size, seq_len, regression_layer_size)
    regression_outputs = model_output

    # mixture_weights.shape : (batch_size, seq_len, n_gaussians)
    # means.shape : (batch_size, seq_len, n_gaussians, 3)
    # stds.shape : (batch_size, seq_len, n_gaussians, 3)
    mixture_weights, means, stds = self.model.get_mixture_parameters(regression_outputs, ndim=4)

    # targets.shape : (batch_size, seq_len, 1, 3)
    targets = self.dataset.symb_targets[:, :, None, :]

    # -2 * log-density of each component, split into a constant part and the
    # squared Mahalanobis distance (diagonal covariance).
    log_prefix = (-2 * T.log(mixture_weights)
                  + self.d * np.float32(np.log(2 * np.pi))
                  + 2 * T.sum(T.log(stds), axis=-1))
    square_mahalanobis_dist = T.sum(T.square((targets - means) / stds), axis=-1)

    # loss_per_time_step.shape : (batch_size, seq_len)
    self.loss_per_time_step = -logsumexp(-0.5 * (log_prefix + square_mahalanobis_dist), axis=2)

    # loss_per_seq.shape : (batch_size,)
    # loss_per_seq is the negative log-likelihood of each sequence.
    self.loss_per_seq = T.sum(self.loss_per_time_step * mask, axis=1)

    if not self.sum_over_timestep:
        # loss_per_seq becomes the average negative log-likelihood per time step.
        self.loss_per_seq /= T.sum(mask, axis=1)

    return self.loss_per_seq
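# The sketch below is a minimal NumPy reference (not part of the model) that mirrors the
# per-time-step mixture NLL computed above: for a diagonal-covariance Gaussian mixture,
#   -log p(x) = -logsumexp_k [ log w_k - (D/2) log(2*pi) - sum_d log sigma_{k,d}
#                              - 0.5 * sum_d ((x_d - mu_{k,d}) / sigma_{k,d})^2 ].
# Function and argument names here are illustrative assumptions.
def _numpy_gmm_nll_sketch(weights, means, stds, targets):
    """Negative log-likelihood of `targets` under a diagonal-covariance GMM.

    weights.shape : (..., n_gaussians)
    means.shape   : (..., n_gaussians, D)
    stds.shape    : (..., n_gaussians, D)
    targets.shape : (..., D)
    """
    d = means.shape[-1]
    diff = (targets[..., None, :] - means) / stds
    log_components = (np.log(weights)
                      - 0.5 * d * np.log(2 * np.pi)
                      - np.sum(np.log(stds), axis=-1)
                      - 0.5 * np.sum(diff ** 2, axis=-1))
    # Numerically stable logsumexp over the mixture components.
    m = np.max(log_components, axis=-1, keepdims=True)
    return -(np.squeeze(m, axis=-1) + np.log(np.sum(np.exp(log_components - m), axis=-1)))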
def _compute_losses(self, model_output):
    # model_output.shape : (batch_size, seq_len, K, M, target_size)
    # self.dataset.symb_targets.shape : (batch_size, seq_len+K-1, target_dims)
    # mask.shape : (batch_size, seq_len) or None
    mask = self.dataset.symb_mask

    # mu.shape : (batch_size, seq_len, K, M, target_dims)
    mu = model_output[:, :, :, :, 0:3]
    # sigma.shape : (batch_size, seq_len, K, M, target_dims)
    sigma = model_output[:, :, :, :, 3:6]

    # Stack K targets for each input (sliding-window style).
    # targets.shape : (batch_size, seq_len, K, target_dims)
    targets = T.stack(
        [self.dataset.symb_targets[:, i:(-self.model.k + i + 1) or None]
         for i in range(self.model.k)],
        axis=2)

    # Add a new axis for the sum over M.
    # targets.shape : (batch_size, seq_len, K, 1, target_dims)
    targets = targets[:, :, :, None, :]

    # For monitoring the L2 error of using mu as the predicted direction
    # (should be comparable to MICCAI's work).
    normalized_mu = mu[:, :, 0, 0] / l2distance(mu[:, :, 0, 0], keepdims=True, eps=1e-8)
    normalized_targets = targets[:, :, 0, 0] / l2distance(targets[:, :, 0, 0], keepdims=True, eps=1e-8)
    self.L2_error_per_item = T.sqrt(T.sum((normalized_mu - normalized_targets) ** 2, axis=2))
    if mask is not None:
        self.mean_sqr_error = T.sum(self.L2_error_per_item * mask, axis=1) / T.sum(mask, axis=1)
    else:
        self.mean_sqr_error = T.mean(self.L2_error_per_item, axis=1)

    # The likelihood of a multivariate Gaussian (D dimensions) is:
    #   ((2*pi)^D |Sigma|)^{-1/2} exp(-1/2 (x - mu)^T Sigma^{-1} (x - mu))
    # Assuming a diagonal covariance matrix:
    #   |Sigma| = prod_d sigma_d^2
    #   (x - mu)^T Sigma^{-1} (x - mu) = sum_d ((x_d - mu_d) / sigma_d)^2
    m_log_likelihoods = (-np.float32((self.target_dims / 2.) * np.log(2 * np.pi))
                         + T.sum(-T.log(sigma) - 0.5 * T.sqr((targets - mu) / sigma), axis=4))

    # k_losses_per_timestep.shape : (batch_size, seq_len, K)
    self.k_losses_per_timestep = T.log(self.m) - logsumexp(m_log_likelihoods, axis=3, keepdims=False)

    # loss_per_time_step.shape : (batch_size, seq_len)
    self.loss_per_time_step = T.mean(self.k_losses_per_timestep, axis=2)

    # Average over sequence steps.
    # k_losses_per_seq.shape : (batch_size, K)
    if mask is not None:
        self.k_losses_per_seq = (T.sum(self.k_losses_per_timestep * mask[:, :, None], axis=1)
                                 / T.sum(mask, axis=1, keepdims=True))
    else:
        self.k_losses_per_seq = T.mean(self.k_losses_per_timestep, axis=1)

    # Average over K.
    # loss_per_seq.shape : (batch_size,)
    self.loss_per_seq = T.mean(self.k_losses_per_seq, axis=1)

    return self.loss_per_seq
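# A minimal NumPy sketch (illustrative only) of the sliding-window target stacking used
# above: from targets of shape (batch_size, seq_len + K - 1, target_dims), build an array
# of shape (batch_size, seq_len, K, target_dims) where entry [:, t, i] is the target at
# step t + i. The name `stack_k_targets` is an assumption, not part of the model code.
def stack_k_targets(targets, k):
    slices = [targets[:, i:(-k + i + 1) or None] for i in range(k)]
    return np.stack(slices, axis=2)

# e.g. with seq_len = 4 and k = 2:
# stack_k_targets(np.zeros((8, 5, 3)), k=2).shape == (8, 4, 2, 3)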
def _compute_losses(self, model_output):
    mask = self.dataset.symb_mask

    # regression_outputs.shape : (batch_size, seq_len, regression_layer_size)
    regression_outputs = model_output

    # mixture_weights.shape : (batch_size, seq_len, n_gaussians)
    # means.shape : (batch_size, seq_len, n_gaussians, 3)
    # stds.shape : (batch_size, seq_len, n_gaussians, 3)
    mixture_weights, means, stds = self.model.get_mixture_parameters(regression_outputs, ndim=4)

    # mean_*.shape : (batch_size, seq_len, n_gaussians)
    mean_x = means[:, :, :, 0]
    mean_y = means[:, :, :, 1]
    mean_z = means[:, :, :, 2]

    # std_*.shape : (batch_size, seq_len, n_gaussians)
    std_x = stds[:, :, :, 0]
    std_y = stds[:, :, :, 1]
    std_z = stds[:, :, :, 2]

    # target_*.shape : (batch_size, seq_len, 1)
    target_x = self.dataset.symb_targets[:, :, 0, None]
    target_y = self.dataset.symb_targets[:, :, 1, None]
    target_z = self.dataset.symb_targets[:, :, 2, None]

    # Standardized residuals for each axis.
    tg_x_c = (target_x - mean_x) / std_x
    tg_y_c = (target_y - mean_y) / std_y
    tg_z_c = (target_z - mean_z) / std_z

    log_prefix = (T.log(mixture_weights)
                  - np.float32((self.d / 2.) * np.log(2 * np.pi))
                  - T.log(std_x) - T.log(std_y) - T.log(std_z))
    # Note: this term already includes the -1/2 factor, i.e. it is -0.5 times the
    # squared Mahalanobis distance, so it is simply added to log_prefix below.
    square_mahalanobis_dist = -0.5 * (tg_x_c ** 2 + tg_y_c ** 2 + tg_z_c ** 2)

    # loss_per_time_step.shape : (batch_size, seq_len)
    self.loss_per_time_step = -logsumexp(log_prefix + square_mahalanobis_dist, axis=2)

    # loss_per_seq.shape : (batch_size,)
    self.loss_per_seq = T.sum(self.loss_per_time_step * mask, axis=1) / T.sum(mask, axis=1)

    return self.loss_per_seq
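# Sanity-check sketch (NumPy only, illustrative): the per-axis formulation above is
# algebraically identical to the vectorized one used by the first loss, since for a
# diagonal covariance
#   log N(x; mu, diag(sigma^2)) = -(3/2) log(2*pi) - sum_d log(sigma_d)
#                                 - 0.5 * sum_d ((x_d - mu_d) / sigma_d)^2.
# The function name and random test values are assumptions for illustration.
def _check_per_axis_matches_vectorized(seed=0):
    rng = np.random.RandomState(seed)
    x, mu_, sd_ = rng.randn(3), rng.randn(3), rng.rand(3) + 0.5
    per_axis = (-1.5 * np.log(2 * np.pi) - np.sum(np.log(sd_))
                - 0.5 * np.sum(((x - mu_) / sd_) ** 2))
    vectorized = -0.5 * (3 * np.log(2 * np.pi) + 2 * np.sum(np.log(sd_))
                         + np.sum(((x - mu_) / sd_) ** 2))
    assert np.isclose(per_axis, vectorized)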
def _compute_losses(self, model_output):
    mask = self.dataset.symb_mask

    # stopping_criteria_outputs.shape : (batch_size, seq_len)
    stopping_criteria_outputs = model_output[0][:, :, 0]

    # regression_outputs.shape : (batch_size, seq_len, regression_layer_size)
    regression_outputs = model_output[1]

    # mixture_weights.shape : (batch_size, seq_len, n_gaussians)
    # means.shape : (batch_size, seq_len, n_gaussians, 3)
    # stds.shape : (batch_size, seq_len, n_gaussians, 3)
    mixture_weights, means, stds = self.model.get_mixture_parameters(regression_outputs, ndim=4)

    # targets.shape : (batch_size, seq_len, 1, 3)
    targets = self.dataset.symb_targets[:, :, None, :3]

    # stopping_criteria_targets.shape : (batch_size, seq_len)
    stopping_criteria_targets = self.dataset.symb_targets[:, :, 3]

    log_prefix = (-2 * T.log(mixture_weights)
                  + self.d * np.float32(np.log(2 * np.pi))
                  + 2 * T.sum(T.log(stds), axis=-1))
    square_mahalanobis_dist = T.sum(T.square((targets - means) / stds), axis=-1)
    gaussian_mixture_nll_per_time_step = -logsumexp(-0.5 * (log_prefix + square_mahalanobis_dist), axis=2)

    stopping_cross_entropy_per_time_step = T.nnet.binary_crossentropy(stopping_criteria_outputs, stopping_criteria_targets)

    # loss_per_time_step.shape : (batch_size, seq_len)
    # self.gamma balances the two loss terms; tune this hyperparameter if training is unstable.
    self.loss_per_time_step = gaussian_mixture_nll_per_time_step + self.gamma * stopping_cross_entropy_per_time_step

    # loss_per_seq.shape : (batch_size,)
    # loss_per_seq is the negative log-likelihood of each sequence plus the weighted stopping loss.
    self.loss_per_seq = T.sum(self.loss_per_time_step * mask, axis=1)

    if not self.sum_over_timestep:
        # loss_per_seq becomes the average loss per time step.
        self.loss_per_seq /= T.sum(mask, axis=1)

    return self.loss_per_seq
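# A minimal NumPy sketch (names and helper are illustrative assumptions) of the combined
# per-time-step loss above: the mixture NLL plus a gamma-weighted binary cross-entropy on
# the stopping criterion,
#   loss_t = NLL_t + gamma * BCE(p_t, y_t),
#   BCE(p, y) = -[y log(p) + (1 - y) log(1 - p)].
def _combined_loss_per_time_step_sketch(gmm_nll, stop_probs, stop_targets, gamma, eps=1e-7):
    # Clip probabilities to avoid log(0); Theano's binary_crossentropy handles this differently.
    p = np.clip(stop_probs, eps, 1.0 - eps)
    bce = -(stop_targets * np.log(p) + (1.0 - stop_targets) * np.log(1.0 - p))
    return gmm_nll + gamma * bce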