def sample_trajectories(self, initial_points: tf.Tensor, sim_steps: int) -> tf.Tensor:
    """
    Simulate trajectories starting at the initial points (one trajectory for each
    initial point) for 'sim_steps' simulation steps.

    Every step adds the first output of ``self.drift_svgp.predict_f`` and a
    pre-sampled Gaussian noise term to the previous state.

    :param initial_points: a matrix (nb_different_initial_points x dimension)
    :param sim_steps: number of simulation steps
    :return: SDE trajectories, one for each different initial point, with the
        initial point prepended along the time axis
    """
    # All noise terms are drawn up front: zero mean, scale given by the square
    # root of the expected diffusion.
    noise_distribution = tfp.distributions.MultivariateNormalDiag(
        tf.zeros(initial_points.shape, dtype=tf_floatx()),
        tf.sqrt(self.diffusion.expected_diffusion()),
    )
    noise_terms = noise_distribution.sample(sim_steps, dtype=tf_floatx())

    def advance(state_tm1, noise_term):
        # predict_f(...)[0] is presumably the predictive mean of the drift GP.
        drift = self.drift_svgp.predict_f(state_tm1)[0]
        return state_tm1 + drift + noise_term

    simulated = tf.scan(
        advance,
        elems=noise_terms,
        initializer=initial_points,
        name='sde_sim',
    )

    # Transpose so that the output has shape
    # (initial_points.shape[0], sim_steps + 1, input_dim).
    starting_points = initial_points[:, tf.newaxis, :]
    time_major_to_batch_major = tf.transpose(simulated, [1, 0, 2])
    return tf.concat([starting_points, time_major_to_batch_major], axis=1)
def _encode_and_decode(self, y_input, training=None, initial_state=None, use_mask=False):
    """
    Encode the input, draw latent samples with ``self.mpa.forward_backward`` and
    decode every sample.

    :param y_input: input passed to the encoder
    :param training: forwarded to the encoder and to the decoder
    :param initial_state: (encoder_initial_state, initial_dynamic_mean, initial_dynamic_prec)
    :param use_mask: not read by this method
    :return: four tuples:
        (samples, entropies, sampling_distro),
        (encoded_means, encoded_scales),
        (decoded_means, decoded_scales_diag),
        (encoder_states, final_mean, final_prec)
    """
    encoder_initial_state, x0_stats = self._unzip_initial_state(initial_state)

    encoded_means, encoded_scales, encoder_states = self.encoder(
        y_input, training=training, initial_state=encoder_initial_state)
    encoded_dist = (encoded_means, encoded_scales)

    initial_dynamic_mean, initial_dynamic_scale = self._handle_x0_stats(
        x0_stats, encoded_dist)
    initial_dist = (initial_dynamic_mean, initial_dynamic_scale)

    filter_output = self.mpa.forward_backward(initial_dist, encoded_dist)
    samples, entropies, sampling_distro, final_mean, final_prec = filter_output

    def decode(sample):
        return self.decoder(sample, training=training)

    # The decoder is applied separately to each slice along the leading axis
    # of `samples`.
    decoded_means, decoded_scales_diag = tf.map_fn(
        decode, samples, dtype=(tf_floatx(), tf_floatx()))

    return (
        (samples, entropies, sampling_distro),
        encoded_dist,
        (decoded_means, decoded_scales_diag),
        (encoder_states, final_mean, final_prec),
    )
def __init__(self,
             kernel: gpflow.kernels.SeparateIndependent,
             inducing_points: tf.Tensor,
             num_latent: int,
             prior_scale=1.0):
    """
    Build the drift GP with shared inducing points and a Gaussian likelihood.

    :param kernel: kernel forwarded to the parent constructor
    :param inducing_points: initial inducing point locations; its first
        dimension is stored as ``nb_inducing_variables``
    :param num_latent: number of latent GPs
    :param prior_scale: scale of the diagonal Gaussian prior. If any entry is
        infinite the prior is flagged as vague and no ``prior_distribution``
        attribute is created.
    """
    self.nb_inducing_variables = int(inducing_points.shape[0])

    points_variable = tf.Variable(inducing_points, name='inducing_points.py')
    shared_inducing = gpflow.inducing_variables.SharedIndependentInducingVariables(
        gpflow.inducing_variables.InducingPoints(points_variable))

    super().__init__(kernel,
                     gpflow.likelihoods.Gaussian(),
                     inducing_variable=shared_inducing,
                     num_latent_gps=num_latent)

    # Keep references to the variational parameters created by the parent.
    self._q_mu_0 = self.q_mu
    self._q_sqrt_0 = self.q_sqrt

    self._vague_prior = np.any(np.isinf(prior_scale))
    if self._vague_prior:
        return

    # Zero-mean diagonal Gaussian prior, with the scale repeated
    # q_mu.shape[0] times along a new trailing axis.
    prior_loc = tf.zeros_like(tf.transpose(self.q_mu), dtype=tf_floatx())
    scale_column = tf.convert_to_tensor(prior_scale, dtype=tf_floatx())[..., tf.newaxis]
    prior_scale_diag = tf.repeat(scale_column, self.q_mu.shape[0], axis=-1)
    self.prior_distribution = tfd.MultivariateNormalDiag(prior_loc, prior_scale_diag)
def __init__(self, diff_parameters: DiffParameters):
    """
    Create the positive ``alphas``/``betas`` parameters and the Gamma prior.

    :param diff_parameters: provides ``alphas`` and ``betas``, which must have
        the same length; they parameterise the Gamma prior
    :raises ValueError: if gpflow's default positive bijector is neither
        'exp' nor 'softplus'
    """
    super(Diffusion, self).__init__()

    def get_bijector():
        # Map gpflow's configured positive bijector name onto a fresh tfp bijector.
        available = {
            'exp': tfp.bijectors.Exp,
            'softplus': tfp.bijectors.Softplus,
        }
        bijector_cls = available.get(gpflow.config.default_positive_bijector())
        if bijector_cls is None:
            raise ValueError(
                "Unexpected value in default_positive_bijector()")
        return bijector_cls()

    alphas = diff_parameters.alphas
    betas = diff_parameters.betas
    assert len(alphas) == len(betas), "len(alphas) != len(betas)"
    self.dimension = len(alphas)

    # Both parameters start at one (not at the prior values); only the prior
    # uses the supplied alphas/betas.
    self._alphas = gpflow.Parameter(
        tf.ones_like(alphas, dtype=tf_floatx()),
        transform=get_bijector(),
        name='alphas')
    # TODO: carried over from the original betas initialisation
    self._betas = gpflow.Parameter(
        tf.ones_like(betas, dtype=tf_floatx()),
        transform=get_bijector(),
        name='betas')

    self.prior_distribution = tfd.Gamma(alphas, betas)
def __init__(self, initial_learning_rate, maximum_learning_rate, growth_steps, midpoint, name=None):
    """
    Store the scheduler settings as tensors of the default float type.

    :param initial_learning_rate: learning rate at the start
    :param maximum_learning_rate: upper learning rate; ``delta`` is its
        difference to the initial rate
    :param growth_steps: stored as ``self.growth_steps``
    :param midpoint: stored as ``self.midpoint``
    :param name: optional name
    """
    super(SigmoidScheduler, self).__init__()

    def as_float_tensor(value):
        return tf.convert_to_tensor(value, dtype=tf_floatx())

    self.initial_learning_rate = as_float_tensor(initial_learning_rate)
    self.maximum_learning_rate = as_float_tensor(maximum_learning_rate)
    self.delta = self.maximum_learning_rate - self.initial_learning_rate
    self.growth_steps = as_float_tensor(growth_steps)
    self.midpoint = as_float_tensor(midpoint)
    self.name = name
def draw_fast_samples(vae, initial_state, x_chunk):
    """
    Sample directly from the encoder's diagonal Gaussian.

    The entropies are those of Gaussians whose covariances are estimated
    empirically from the drawn samples.

    :param vae: model providing ``_handle_x0_state``, ``encoder`` and ``mpa.nb_samples``
    :param initial_state: state handed to ``vae._handle_x0_state``
    :param x_chunk: input chunk to encode
    :return: samples, entropies, (mean, scales), (mean0, scale0), encoder states
    """
    rnn_state, mean0, scale0 = vae._handle_x0_state(initial_state, x_chunk)
    mean, scales, states = vae.encoder(
        x_chunk, training=True, initial_state=rnn_state)

    samples = tfd.MultivariateNormalDiag(mean, scales).sample(vae.mpa.nb_samples)

    # Flatten everything after the first two axes and estimate the covariance
    # over the sample axis.
    flattened = tf.reshape(samples, (*samples.shape[:2], -1))
    covs = tfp.stats.covariance(flattened, sample_axis=0)

    # TODO: small diagonal jitter, carried over from the original
    jitter = 1e-8 * tf.eye(
        covs.shape[1], batch_shape=[covs.shape[0]], dtype=tf_floatx())
    covs = covs + jitter

    # Gaussian entropy: 0.5 * d * (1 + log(2 * pi)) + 0.5 * log|cov|
    log_two_pi = tf.math.log(2 * tf.constant(np.pi, dtype=tf_floatx()))
    constant_part = 0.5 * covs.shape[1] * (1 + log_two_pi)
    entropies = constant_part + 0.5 * tf.linalg.logdet(covs)

    return samples, entropies, (mean, scales), (mean0, scale0), states
def automatic_sde_nat_grads(self, y_input, y_target, samples, entropies, encoded_dist,
                            decoded_dist, initial_state, effective_nb_timesteps=None,
                            kl_weight=tf.convert_to_tensor(1.0, dtype=tf_floatx())):
    """
    Compute the natural gradients of the SDE variational parameters by
    automatic differentiation.

    The loss is differentiated with respect to the unconstrained variables of
    alphas, betas, q_mu and q_sqrt; the result is then pushed through the
    expectation -> xi mapping with a second ``tape.gradient`` call.

    :return: gradients of ``xis`` with respect to the expectation parameters,
        weighted by the loss gradients
    """
    diffusion = self.sde_model.diffusion
    drift = self.sde_model.drift_svgp
    watched = [
        diffusion._alphas.unconstrained_variable,
        diffusion._betas.unconstrained_variable,
        drift.q_mu.unconstrained_variable,
        drift.q_sqrt.unconstrained_variable,
    ]

    # The tape is persistent because gradient() is called on it twice.
    with tf.GradientTape(persistent=True, watch_accessed_variables=False) as tape:
        tape.watch(watched)
        expectations = self.sde_model.expectation_params()
        xis = self.sde_model.expectation_to_xi(expectations)
        loss = self._loss(y_input, y_target, samples, entropies, encoded_dist,
                          decoded_dist, initial_state, effective_nb_timesteps,
                          kl_weight)

    dL_dxi = tape.gradient(loss, watched)
    # Apply chain rule to get the natural gradients
    natural_gradients = tape.gradient(xis, expectations, output_gradients=dL_dxi)
    del tape
    return natural_gradients
def optimize_sde_standard_grad(y_input, y_target, gm: VAE, optimizer, initial_state=None,
                               effective_nb_timesteps=None,
                               kl_weight=tf.convert_to_tensor(1.0, dtype=tf_floatx()),
                               clip_value=100.):
    """
    Take one standard-gradient step on the SDE variational variables.

    The latent samples are produced outside the tape (with ``training=False``),
    so only ``gm.sde_model.variational_variables`` receive gradients.

    :param y_input: input chunk
    :param y_target: target chunk
    :param gm: the generative model
    :param optimizer: optimizer whose ``apply_gradients`` is called
    :param initial_state: state propagated from the previous chunk, if any
    :param effective_nb_timesteps: forwarded to ``gm._breaked_loss``
    :param kl_weight: forwarded to ``gm._breaked_loss``
    :param clip_value: if truthy, every gradient is clipped element-wise to
        [-clip_value, clip_value] before being applied
    :return: loss, breaked_loss, final_state
    """
    vvars = gm.sde_model.variational_variables
    (samples, entropies, _), encoded_dist, decoded_dist, final_state = gm._encode_and_decode(
        y_input, training=False, initial_state=initial_state)

    with tf.GradientTape(persistent=False, watch_accessed_variables=False) as tape:
        tape.watch(vvars)
        breaked_loss = gm._breaked_loss(y_input, y_target, samples, entropies,
                                        encoded_dist, decoded_dist, initial_state,
                                        effective_nb_timesteps, kl_weight)
        loss = tf.reduce_sum(breaked_loss)

    vgrads = tape.gradient(loss, vvars)
    if clip_value:
        # Previously clip_value was accepted but never used. Clip the same way
        # optimize_sde_with_nat_grad does; None gradients (variables not
        # connected to the loss) are passed through untouched.
        vgrads = [
            grad if grad is None else tf.clip_by_value(grad, -clip_value, clip_value)
            for grad in vgrads
        ]
    optimizer.apply_gradients(zip(vgrads, vvars))
    return loss, breaked_loss, final_state
def kullback_leibler_by_dimension(self, free_bits=tf.convert_to_tensor(0, dtype=tf_floatx())):
    """
    Kullback-Leibler term: Gaussian part for the drift plus Gamma part for the
    diffusion.

    :param free_bits: lower bound applied to the KL; only used when the drift
        prior is not vague
    :return: the KL tensor
    """
    # TODO do not permit not whiten representations!
    q_sqrt = self.drift_svgp.q_sqrt
    gamma_posterior = tfd.Gamma(self.diffusion.alphas(), self.diffusion.betas())
    gamma_kl = gamma_posterior.kl_divergence(self.diffusion.prior_distribution)

    if self.drift_svgp._vague_prior:
        # The kullback-leibler when using a vague prior, up to a constant
        # (theoretically, infinite).
        log_diagonal = tf.math.log(tf.linalg.diag_part(q_sqrt))
        gaussian_kl = -0.5 * q_sqrt.shape[-1] - tf.reduce_sum(log_diagonal, 1)
        # The free bits technique is not applied with the vague prior, since
        # the gaussian_kl already has infinite free bits.
        return gaussian_kl + gamma_kl

    precision_sqrt = tf.sqrt(self.diffusion.expected_precision())[tf.newaxis, ...]
    kl_mu = tf.transpose(precision_sqrt * self.drift_svgp.q_mu)
    gaussian_kl = tfd.MultivariateNormalTriL(kl_mu, q_sqrt).kl_divergence(
        self.drift_svgp.prior_distribution)
    return tf.math.maximum(free_bits, gaussian_kl + gamma_kl)
def __init__(self, initial_learning_rate, maximum_learning_rate, growth_steps, name=None):
    """
    Store the scheduler settings as tensors of the default float type and
    cache the logarithms of both learning rates.

    :param initial_learning_rate: learning rate at the start
    :param maximum_learning_rate: upper learning rate
    :param growth_steps: stored as ``self.growth_steps``
    :param name: optional name
    """
    super(NatGradScheduler, self).__init__()

    float_type = tf_floatx()
    self.initial_learning_rate = tf.convert_to_tensor(
        initial_learning_rate, dtype=float_type)
    self.maximum_learning_rate = tf.convert_to_tensor(
        maximum_learning_rate, dtype=float_type)
    self.growth_steps = tf.convert_to_tensor(growth_steps, dtype=float_type)
    self.name = name

    # Logs are precomputed once here rather than on every schedule call.
    self._log_initial_learning_rate = tf.math.log(self.initial_learning_rate)
    self._log_maximum_learning_rate = tf.math.log(self.maximum_learning_rate)
def tf_pairwise_distance(feature, squared: bool = False):
    """Computes the pairwise distance matrix with numerical stability.

    output[i, j] = || feature[i, :] - feature[j, :] ||_2

    All masks are cast to the dtype of ``feature``, so the function works for
    both float32 and float64 inputs. (The epsilon mask used to be cast to a
    hard-coded float32, which raised a dtype mismatch for float64 data.)

    Args:
      feature: 2-D floating point Tensor of size [number of data, feature dimension].
      squared: Boolean, whether or not to square the pairwise distances.

    Returns:
      pairwise_distances: 2-D Tensor of size [number of data, number of data].
    """
    # ||a - b||^2 = ||a||^2 + ||b||^2 - 2 <a, b>
    pairwise_distances_squared = tf.math.add(
        tf.math.reduce_sum(tf.math.square(feature), axis=[1], keepdims=True),
        tf.math.reduce_sum(
            tf.math.square(tf.transpose(feature)), axis=[0], keepdims=True
        ),
    ) - 2.0 * tf.matmul(feature, tf.transpose(feature))

    # Deal with numerical inaccuracies. Set small negatives to zero.
    pairwise_distances_squared = tf.math.maximum(pairwise_distances_squared, 0.0)
    # Get the mask where the zero distances are at.
    error_mask = tf.math.less_equal(pairwise_distances_squared, 0.0)

    dtype = pairwise_distances_squared.dtype

    # Optionally take the sqrt.
    if squared:
        pairwise_distances = pairwise_distances_squared
    else:
        # Add a tiny epsilon where the distance is exactly zero before taking
        # the square root.
        pairwise_distances = tf.math.sqrt(
            pairwise_distances_squared
            + tf.cast(error_mask, dtype=dtype) * 1e-16
        )

    # Undo conditionally adding 1e-16.
    pairwise_distances = tf.math.multiply(
        pairwise_distances,
        tf.cast(tf.math.logical_not(error_mask), dtype=dtype),
    )

    num_data = tf.shape(feature)[0]
    # Explicitly set diagonals to zero.
    mask_offdiagonals = tf.ones_like(pairwise_distances) - tf.linalg.diag(
        tf.ones([num_data], dtype=dtype)
    )
    pairwise_distances = tf.math.multiply(pairwise_distances, mask_offdiagonals)
    return pairwise_distances
def forward_step(particles, log_weights, encoding_potential):
    """
    One particle-filter step: propose, reweight, normalise and resample
    wherever the effective sample size falls below ``self.n_eff_threshold``.

    :return: the (possibly resampled) particles and log weights
    """
    proposal = self.proposal_builder(particles, encoding_potential)
    particles = proposal.sample()

    log_weights = self.weight_fn(log_weights, particles, encoding_potential)
    log_normaliser = tf.math.reduce_logsumexp(log_weights, axis=-1, keepdims=True)
    log_weights = log_weights - log_normaliser

    # Effective sample size of the weighting vector; it decides whether a
    # batch element gets resampled.
    weights = tf.exp(log_weights)
    n_eff = 1.0 / tf.reduce_sum(tf.square(weights), axis=1)

    def resample_if_needed(batch_element):
        ess, element_particles, element_log_weights = batch_element
        return tf.cond(
            ess < self.n_eff_threshold,
            lambda: forward_resampling(
                element_particles, element_log_weights, self.alpha),
            lambda: (element_particles, element_log_weights))

    # Resampling step, decided independently for each element along the
    # leading axis of (n_eff, particles, log_weights).
    particles, log_weights = tf.map_fn(
        resample_if_needed,
        elems=(n_eff, particles, log_weights),
        dtype=(tf_floatx(), tf_floatx()))
    return particles, log_weights
def synthetize(self, y_input, y_target, simulation_steps):
    """
    Run the model on the given data and continue the latent trajectory with the
    SDE model for ``simulation_steps`` steps.

    :return: the decoder output for the simulated trajectories, and the
        simulated latent trajectories themselves
    """
    unit_timesteps = tf.convert_to_tensor(1.0, dtype=tf_floatx())
    model_outputs = self.__call__(
        y_input, y_target,
        training=False,
        initial_state=None,
        effective_nb_timesteps=unit_timesteps)
    samples, _encoded, _decoded, _loss_value, _states = model_outputs

    # Use the last samples as the starting point; only the first sample of
    # each batch element is used.
    initial_points = samples[0, :, -1, :]
    predicted_samples = self.sde_model.sample_trajectories(
        initial_points, simulation_steps)
    decoded = self.decoder(predicted_samples)
    return decoded, predicted_samples
def _parse_dataset(self, folder, scaling_function, batch_size, shuff_buffer_size=0):
    """
    Create a batched Dataset object from the ``*.tfrecords`` files in ``folder``.

    :param folder: directory that is searched for TFRecord files
    :param scaling_function: optional callable applied to every example
    :param batch_size: batch size; incomplete batches are dropped
    :param shuff_buffer_size: if > 0, shuffling with this buffer size is used
    :return: the dataset, or None (with a warning) when the folder holds fewer
        files than ``batch_size``
    """
    record_files = glob.glob(os.path.join(folder, "*.tfrecords"))
    if len(record_files) < batch_size:
        warnings.warn(f"Insufficient number of files in {folder} to create batches of size {batch_size} (returning None)")
        return None

    autotune = tf.data.experimental.AUTOTUNE

    def parse_example(proto):
        # Parse the protobuf into a dictionary with the 'signal' feature.
        return tf.io.parse_single_example(proto, {
            'signal': tf.io.FixedLenFeature(
                [self.example_timesteps], tf.float32,
                default_value=np.zeros(self.example_timesteps))
        })

    dataset = tf.data.TFRecordDataset(record_files).map(parse_example)

    # Extract the 'signal' vector and add a trailing axis, for consistency
    # with the expected input of RNNs, where the last dimension is the feature
    # dimension.
    dataset = dataset.map(
        lambda x: x['signal'][..., tf.newaxis],
        num_parallel_calls=autotune)

    if tf_floatx() != tf.float32:
        dataset = dataset.map(
            lambda x: tf.cast(x, tf_floatx()),
            num_parallel_calls=autotune)

    if scaling_function is not None:
        dataset = dataset.map(
            lambda x: scaling_function(x),
            num_parallel_calls=autotune)

    dataset = dataset.cache()  # TODO
    if shuff_buffer_size > 0:
        dataset = dataset.shuffle(shuff_buffer_size)
    dataset = dataset.batch(batch_size, drop_remainder=True)
    dataset = dataset.prefetch(buffer_size=autotune)  # TODO
    return dataset
def __init__(self,
             input_dimension: int,
             output_dimension: int,
             len_tbptt: int,
             encoder_type,
             encoder_hidden_units,
             encoder_kernel_size,
             encoder_dilation_rate,
             phase_space_dimension: int,
             drift_parameters: DriftParameters,
             diff_parameters: DiffParameters,
             pseudo_inputs: np.array,
             nb_samples: int,
             initial_prec=INITIAL_PREC):
    """
    Assemble the encoder, decoder, SDE model and particle filter.

    :param input_dimension: not read by this constructor
    :param output_dimension: forwarded to the decoder
    :param len_tbptt: not read by this constructor
    :param encoder_type: only 'rnn' is supported
    :param encoder_hidden_units: hidden units of the RNN encoder
    :param encoder_kernel_size: not read by this constructor (CNN encoder is disabled)
    :param encoder_dilation_rate: not read by this constructor (CNN encoder is disabled)
    :param phase_space_dimension: dimension of the latent (phase) space
    :param drift_parameters: forwarded to the SDE model
    :param diff_parameters: forwarded to the SDE model
    :param pseudo_inputs: converted to a tensor and forwarded to the SDE model
    :param nb_samples: number of samples used by the filter; must be > 1
    :param initial_prec: not read by this constructor
    :raises ValueError: if ``encoder_type`` is not 'rnn'
    """
    # TODO:initial_prec
    self.phase_space_dim = phase_space_dimension

    if encoder_type == 'rnn':
        self.encoder = RnnEncoder(encoder_hidden_units,
                                  output_dim=phase_space_dimension)
    # TODO: a 'cnn' encoder (using encoder_kernel_size / encoder_dilation_rate)
    # is not available at the moment.
    else:
        raise ValueError('Invalid encoder type')

    self.decoder = Decoder(output_dimension)
    self.sde_model = SdeModel(
        tf.convert_to_tensor(pseudo_inputs, dtype=tf_floatx()),
        drift_parameters, diff_parameters)

    # Using -inf in case an improper prior is used. Note: `np.Inf` was removed
    # in NumPy 2.0; `np.inf` is the supported spelling in every NumPy version.
    self.minimum_nats = tf.convert_to_tensor(-np.inf, dtype=tf_floatx())

    # Since we are using TBPTT, the model is allowed to have a 'state': the
    # last means and precs of the current chunk can be propagated to the next
    # chunk as its initial values. This is the prior for state_0 (t=0).
    self.x0_prior = tfd.MultivariateNormalDiag(
        scale_diag=tf.ones(self.phase_space_dim, dtype=tf_floatx()))

    # TODO: permit to modify the filtering method without changing this
    self.mpa = VaeleParticleFilter(self.sde_model, nb_samples)
    assert nb_samples > 1, 'nb_samples should be > 1'
def __init__(self, proposal_builder, dynamics_fn, weight_fn, nb_samples,
             n_particles=100, n_eff_threshold=None):
    """
    Store the particle filter configuration.

    :param proposal_builder: callable building the proposal distribution
    :param dynamics_fn: stored as ``self.dynamics_fn``
    :param weight_fn: callable updating the log weights
    :param nb_samples: stored as ``self.nb_samples``
    :param n_particles: number of particles
    :param n_eff_threshold: effective-sample-size threshold below which
        resampling happens. Defaults to ``n_particles // 2`` when None. An
        explicit 0 is honoured (it used to be silently replaced by the
        default because of an ``or`` fallback).
    """
    self.nb_samples = nb_samples
    self.n_particles = n_particles
    if n_eff_threshold is None:
        n_eff_threshold = n_particles // 2
    self.n_eff_threshold = n_eff_threshold
    self.proposal_builder = proposal_builder
    self.dynamics_fn = dynamics_fn
    self.weight_fn = weight_fn
    self.alpha = tf.constant(0.99, dtype=tf_floatx())
def _init_filter(self, init_stats, init_potentials, batch_size):
    """
    Draw the initial particles and uniform weights.

    The particles come from a diagonal Gaussian given by ``init_stats``; if
    either of its entries is None, ``init_potentials`` is used instead. The two
    Gaussians are not multiplied together.

    :param init_stats: (mean, scale) of the initial state, possibly containing None
    :param init_potentials: (mean, scale) fallback
    :param batch_size: leading dimension of the returned weights
    :return: particles with the sample axis moved to position 1, and a
        (batch_size, n_particles) tensor filled with 1 / n_particles
    """
    prior_mean, prior_scale = init_stats
    potential_mean, potential_scale = init_potentials

    has_prior = prior_mean is not None and prior_scale is not None
    if has_prior:
        chosen_mean, chosen_scale = prior_mean, prior_scale
    else:
        chosen_mean, chosen_scale = potential_mean, potential_scale

    initial_distribution = tfd.MultivariateNormalDiag(chosen_mean, chosen_scale)
    particles = tf.transpose(
        initial_distribution.sample(self.n_particles), [1, 0, 2])
    uniform_weights = (
        tf.ones((batch_size, self.n_particles), dtype=tf_floatx())
        / self.n_particles)
    return particles, uniform_weights
def _loss(self, y_input, y_target, samples, entropies, encoded_dist, decoded_dist,
          initial_state, effective_nb_timesteps,
          kl_weight=tf.convert_to_tensor(1.0, dtype=tf_floatx())):
    """
    Scalar loss: the sum of all the terms returned by ``_breaked_loss``.

    All arguments are forwarded unchanged to ``_breaked_loss``.
    """
    loss_terms = self._breaked_loss(
        y_input, y_target, samples, entropies, encoded_dist, decoded_dist,
        initial_state, effective_nb_timesteps, kl_weight)
    return tf.reduce_sum(loss_terms)
def __call__(self, y_input, y_target, training=None, initial_state=None,
             effective_nb_timesteps=None,
             kl_weight=tf.convert_to_tensor(1.0, dtype=tf_floatx())):
    """
    Encode/decode the input and evaluate the loss.

    :return: samples, encoded_dist, decoded_dist, loss, states
    """
    sampling_output, encoded_dist, decoded_dist, states = self._encode_and_decode(
        y_input, training=training, initial_state=initial_state)
    # The sampling distribution (third entry) is not needed here.
    samples, entropies, _ = sampling_output

    loss = self._loss(
        y_input, y_target, samples, entropies, encoded_dist, decoded_dist,
        initial_state, effective_nb_timesteps, kl_weight)
    return samples, encoded_dist, decoded_dist, loss, states
def _forward_step(self, message_tm1, encoding_potentials_t):
    """
    One forward (filtering) step.

    :param message_tm1: The filtered means (N x D), covariances and precisions (N x D x D)
        of x_tm1|y_1:tm1. N is due to the batches; D is the dimension of the embedding space.
    :param encoding_potentials_t: The gaussian potentials relating x_t with y_t. Again,
        there is a batched dimension.
    :return: the filtered distribution x_t|y_1:t (means, covariances and precisions), the
        predicted distribution x_t|y_1:tm1, and the covariance matrices
        cov(x_t, x_tm1| y_1:tm1).
    """
    # Only the filtered statistics of the previous message are used; its
    # predicted distribution and conditional covariance are ignored.
    (means_tm1, covs_tm1, precs_tm1), _, _ = message_tm1
    encoding_means_t, encoding_precs_t = encoding_potentials_t

    # means_tm1 is N x D. Turn every (D, ) vector into a (1, D) row vector.
    row_means_tm1 = means_tm1[:, tf.newaxis, :]

    def predict(batch_element):
        mean_row, cov = batch_element
        return self.predict_xt_given_tm1(mean_tm1=mean_row, cov_tm1=cov)

    # predict_xt_given_tm1 has four outputs, hence the four dtypes:
    # mean_t_given_tm1, cov_t_t_given_tm1, its inverse, cov_t_tm1_given_tm1.
    predicted = tf.map_fn(
        predict,
        elems=(row_means_tm1, covs_tm1),
        dtype=(tf_floatx(), tf_floatx(), tf_floatx(), tf_floatx()),
        name='forward_predict_map'
    )
    means_t_given_tm1, covs_t_t_given_tm1, precs_t_t_given_tm1, covs_t_tm1_given_tm1 = predicted

    # TODO: there is a squeeze here. Sometimes we expand, sometimes we squeeze.
    # Unify the approach to avoid unnecessary operations.
    means_t_given_tm1 = tf.squeeze(means_t_given_tm1, axis=1)

    filtered = multiply_gaussians(means_t_given_tm1, precs_t_t_given_tm1,
                                  encoding_means_t, encoding_precs_t)
    predicted_distribution = (means_t_given_tm1, covs_t_t_given_tm1, precs_t_t_given_tm1)
    return filtered, predicted_distribution, covs_t_tm1_given_tm1
def get_breaked_loss(y_input, y_target, gm, gamma=1.0, initial_state=None,
                     effective_nb_timesteps=None,
                     kl_weight=tf.convert_to_tensor(1.0, dtype=tf_floatx()),
                     clip_value=None):
    """
    Evaluate the loss through ``gm.nat_grads`` without updating any parameter.

    ``gamma`` and ``clip_value`` are accepted but not read, and the natural
    gradients computed by ``gm.nat_grads`` are discarded.

    :return: loss, breaked_loss, final_state
    """
    nat_grads_output = gm.nat_grads(
        y_input, y_target,
        training=False,
        initial_state=initial_state,
        effective_nb_timesteps=effective_nb_timesteps,
        kl_weight=kl_weight)
    _unused_nat_grads, breaked_loss, loss, final_state = nat_grads_output
    return loss, breaked_loss, final_state
def loss(self, y_input, y_target, training=None, initial_state=None,
         effective_nb_of_timesteps=None,
         kl_weight=tf.convert_to_tensor(1.0, dtype=tf_floatx())):
    """
    Evaluate the model and keep only the loss and the final states.

    :param effective_nb_of_timesteps: defaults to ``y_target.shape[1]`` when None
    :return: (loss, states), i.e. the last two outputs of ``__call__``
    """
    if effective_nb_of_timesteps is None:
        effective_nb_of_timesteps = y_target.shape[1]

    outputs = self.__call__(
        y_input, y_target,
        training=training,
        initial_state=initial_state,
        effective_nb_timesteps=effective_nb_of_timesteps,
        kl_weight=kl_weight)
    return outputs[-2:]
def nat_grads(self, y_input, y_target, training, initial_state, effective_nb_timesteps,
              kl_weight=tf.convert_to_tensor(1.0, dtype=tf_floatx())):
    """
    Compute the SDE natural gradients together with the loss.

    :return: natgrads, breaked_loss, loss (the sum of breaked_loss), states
    """
    sampling_output, encoded_dist, decoded_dist, states = self._encode_and_decode(
        y_input, training=training, initial_state=initial_state)
    samples, entropies, _ = sampling_output

    breaked_loss = self._breaked_loss(
        y_input, y_target, samples, entropies, encoded_dist, decoded_dist,
        initial_state, effective_nb_timesteps, kl_weight)
    loss = tf.reduce_sum(breaked_loss)

    # The natural gradients are obtained from the samples only.
    natgrads = self.sde_nat_grads(samples, effective_nb_timesteps)
    return natgrads, breaked_loss, loss, states
def test_loss(self, epoch, kl_scheduler):
    """
    Average loss over the test dataset.

    Every test batch is split into TBPTT chunks; the chunk losses are added up
    and the state returned by one chunk is used as the initial state of the next.

    :param epoch: passed to ``kl_scheduler`` to obtain the KL weight
    :param kl_scheduler: callable returning the KL weight for an epoch
    :return: mean of the per-batch losses, or 0 if the experiment has no test set
    """
    if not self.experiment.has_test:
        return 0

    mean_loss = tf.keras.metrics.Mean(name='loss', dtype=tf_floatx())
    kl_weight = kl_scheduler(epoch)
    effective_len = self.experiment.effective_len

    for y in self.experiment.test_dataset:
        accumulated = 0
        state = None  # every batch starts without a propagated state
        for x_chunk, y_chunk in self.tbptt_chunks_generator(y):
            chunk_loss, state = self.model.loss(
                x_chunk, y_chunk,
                training=False,
                initial_state=state,
                effective_nb_of_timesteps=effective_len,
                kl_weight=kl_weight)
            accumulated += chunk_loss
        mean_loss.update_state(accumulated)

    return mean_loss.result()
def _compute_filtering_gammas(self, mean_tm1, cov_tm1, sqe_terms: _SqeAuxTems):
    """
    Deisenroth, 23 and 24
    Definition just after 4.29 of the PhD thesis

    :return: ``variances * |cov_tm1 @ inv_Lambdas + I|^(-1/2) * exp(-0.5 * quadratic form)``
    """
    # Determinant term: 1 / sqrt(det(cov_tm1 @ inv_Lambda + I)), one per
    # entry of inv_Lambdas.
    identity = tf.expand_dims(tf.eye(self.sde.dimension, dtype=tf_floatx()), 0)
    scaled_covs = tf.einsum('ij,ajk->aik', cov_tm1, sqe_terms.inv_Lambdas) + identity
    det_term = tf.map_fn(
        lambda matrix: 1 / tf.sqrt(tf.linalg.det(matrix)),
        scaled_covs
    )
    det_term = tf.expand_dims(det_term, 1)

    # Exponential term: quadratic form of the offsets between the previous
    # mean and the values returned by self.sde.iv_values().
    zeta = mean_tm1 - self.sde.iv_values()
    inverse_sum = tf.linalg.inv(cov_tm1 + sqe_terms.Lambdas)
    projected = tf.tensordot(zeta, inverse_sum, [[1], [1]])
    quadratic_form = tf.reduce_sum(projected * tf.expand_dims(zeta, 1), axis=2)
    exp_term = tf.exp(-0.5 * quadratic_form)

    return sqe_terms.variances * det_term * tf.transpose(exp_term)
def __init__(self, units_list, output_dim, bidirectional=BIDIRECTIONAL,
             use_scale_network=False, tie_scale=False):
    """
    Build the recurrent encoder.

    :param units_list: forwarded to ``DynEmbedding``
    :param output_dim: dimension of the encoded means (also used as embedding dimension)
    :param bidirectional: forwarded to ``DynEmbedding``
    :param use_scale_network: if True the scale is produced by an MLP with
        softplus activation; otherwise it is a positive trainable parameter
        initialised at 0.1
    :param tie_scale: if True the scale has a single entry instead of ``output_dim``
    """
    super().__init__()
    # TODO
    conv_units_list = [32]  # [32, 64]
    self.tie_scale = tie_scale
    self.use_scale_network = use_scale_network
    self.kernel_size = 0
    self.dilation_rate = 0
    self.output_dim = output_dim

    # TODO: tie scale not used
    embedding_dim = output_dim
    self.embedding = DynEmbedding(units_list, embedding_dim, bidirectional, None)

    # Note the dependency of the scale dimension on tie_scale.
    scale_dim = 1 if tie_scale else output_dim

    with tf.name_scope("output"):
        self.mean_output_layer = tf.keras.layers.TimeDistributed(
            Mlp([128], output_dim, dropout=0.0))

        if not self.use_scale_network:
            transform = gpflow.utilities.positive()
            self.scale = gpflow.Parameter(
                0.1 * tf.ones(scale_dim, dtype=tf_floatx()),
                transform=transform,
                name='encoder_scale')
        else:
            self.scale_output_layer = tf.keras.layers.TimeDistributed(
                Mlp([128], scale_dim, dropout=0.0, activation='softplus'))
def tbptt_chunks_generator(data_, len_tbptt, time_lag, kernel_size, dilation_rate,
                           noise_std=0, do_bursts=True):
    """
    Yield (inputs, output) chunks for truncated backpropagation through time.

    :param data_: raw data handed to ``build_delay_space``
    :param len_tbptt: chunk length (capped at the available number of timesteps)
    :param time_lag: lag handed to ``build_delay_space``
    :param kernel_size: with ``dilation_rate``, gives the number of extra
        timesteps appended to every chunk
    :param dilation_rate: see ``kernel_size``
    :param noise_std: if > 0, Gaussian noise with this stddev is added to the inputs
    :param do_bursts: currently overridden to False inside the function, so
        bursts are never added
    :raises RuntimeError: if not even one chunk can be generated
    """
    target, data = build_delay_space(data_, 5, time_lag)
    prediction_lag = 0
    len_tbptt = min(data.shape[1], len_tbptt)
    # NOTE: the argument is deliberately ignored here; bursts are disabled.
    do_bursts = False

    # Extra samples needed because the convolution removes the nb_drop latest ones.
    nb_drop = (kernel_size - 1) * dilation_rate
    # Length that can be used to generate both the lagged version and the
    # target signal from the original data.
    usable_length = data.shape[1] - max(prediction_lag + nb_drop, nb_drop)
    nb_chunks = int(np.floor(usable_length / len_tbptt))
    if nb_chunks < 1:
        raise RuntimeError('Cannot generate chunks in tbptt')

    for chunk_index in range(nb_chunks):
        start = chunk_index * len_tbptt
        stop = (chunk_index + 1) * len_tbptt + nb_drop
        inputs = data[:, start:stop, :]
        # FIXME: the target chunk covers the same window as the inputs
        output = target[:, start:stop, :]

        # TODO: noise
        if do_bursts:
            inputs = add_bursts(inputs)
        elif noise_std > 0:
            inputs = inputs + tf.random.normal(
                inputs.shape, stddev=noise_std, dtype=tf_floatx())
        yield inputs, output
def optimize_sde_with_nat_grad(y_input, y_target, gm, gamma=1.0, initial_state=None,
                               effective_nb_timesteps=None,
                               kl_weight=tf.convert_to_tensor(1.0, dtype=tf_floatx()),
                               clip_value=None):
    """
    Take one natural-gradient step on the SDE variational parameters.

    The step is taken in the natural parameterisation and the result is mapped
    back to the standard parameters, which are then assigned in place.

    :param gamma: step size
    :param clip_value: if truthy, natural gradients are clipped element-wise
        to [-clip_value, clip_value]
    :return: loss, breaked_loss, final_state
    """
    sde_nat_grads, breaked_loss, loss, final_state = gm.nat_grads(
        y_input, y_target,
        training=False,
        initial_state=initial_state,
        effective_nb_timesteps=effective_nb_timesteps,
        kl_weight=kl_weight)

    if clip_value:
        sde_nat_grads = [
            tf.clip_by_value(grad, -clip_value, clip_value)
            for grad in sde_nat_grads
        ]

    # Order matters: alphas, betas, q_mu, q_sqrt.
    parameters = [
        gm.sde_model.diffusion._alphas,
        gm.sde_model.diffusion._betas,
        gm.sde_model.drift_svgp.q_mu,
        gm.sde_model.drift_svgp.q_sqrt,
    ]
    thetas = SdeModel.standard_to_natural_params(parameters)

    stepped_thetas = tuple(
        thetas[i] - gamma * sde_nat_grads[i] for i in range(4))
    new_xis = SdeModel.natural_to_standard_params(stepped_thetas)

    for index, parameter in enumerate(parameters):
        parameter.assign(new_xis[index])

    return loss, breaked_loss, final_state
def optimize_nnets_and_hpars(y_input, y_target, optimizer, gm: VAE, initial_state=None,
                             effective_nb_timesteps=None,
                             kl_weight=tf.convert_to_tensor(1.0, dtype=tf_floatx())):
    """
    Take one gradient step on the encoder, the decoder and the SDE hyperparameters.

    Unlike the SDE-only optimisers, encoding and decoding happen inside the
    tape (with ``training=True``) so that the networks receive gradients.

    :return: loss, final_state
    """
    trainable = (gm.encoder.trainable_variables
                 + list(gm.decoder.trainable_variables)
                 + list(gm.sde_model.hyperpars))

    with tf.GradientTape(persistent=False, watch_accessed_variables=False) as tape:
        tape.watch(trainable)
        sampling_output, encoded_dist, decoded_dist, final_state = gm._encode_and_decode(
            y_input, training=True, initial_state=initial_state)
        samples, entropies, sampling_dist = sampling_output
        loss_terms = gm._breaked_loss(
            y_input, y_target, samples, entropies, encoded_dist, decoded_dist,
            initial_state, effective_nb_timesteps, kl_weight)
        loss = tf.reduce_sum(loss_terms)

    gradients = tape.gradient(loss, trainable)
    optimizer.apply_gradients(zip(gradients, trainable))
    return loss, final_state
def _breaked_loss(self, y_input, y_target, samples, entropies, encoded_dist, decoded_dist,
                  initial_state, effective_nb_timesteps,
                  kl_weight=tf.convert_to_tensor(1.0, dtype=tf_floatx())):
    """
    Loss split into its individual terms.

    This implements equation (4.17) from the thesis (a part of which is
    detailed in Eq. (4.18)).

    :param y_input: not read directly by this method
    :param y_target: targets; its second dimension is the chunk length
    :param samples: latent samples
    :param entropies: entropies of the sampling distribution (one per batch)
    :param encoded_dist: (encoded_means, encoded_scales)
    :param decoded_dist: (decoded_means, decoded_scales_diag)
    :param initial_state: state from which the x0 statistics are extracted
    :param effective_nb_timesteps: effective number of timesteps. None falls
        back to ``y_target.shape[1]`` (as the ``loss`` method does) instead of
        failing on ``None / int``.
    :param kl_weight: weight applied to every term except the likelihood of y
    :return: stacked tensor
        [-ly, -w * lx, -w * mpenalty, -w * alphaterm, -w * entropy, w * kl]
    """
    if effective_nb_timesteps is None:
        effective_nb_timesteps = y_target.shape[1]

    _, x0_stats = self._unzip_initial_state(initial_state)
    x0_mean, x0_scale = self._handle_x0_stats(x0_stats, encoded_dist)
    decoded_means, decoded_scales_diag = decoded_dist

    # Reduce all the entropies (one per batch) taking into account the
    # effective_nb_timesteps.
    timestep_ratio = effective_nb_timesteps / y_target.shape[1]
    reduced_entropy = timestep_ratio * tf.reduce_mean(entropies)

    ly = self._loglikelihood_y_given_x(y_target, decoded_means, decoded_scales_diag,
                                       effective_nb_timesteps)
    # The fourth output (per-term log-likelihoods) is not needed here.
    lx, mpenalty, alphaterm, [_lxs, _lx0s] = self._variational_loglikelihood_x(
        samples, x0_mean, x0_scale, effective_nb_timesteps)
    kl = self.sde_model.kullback_leibler(self.minimum_nats)

    # Note the minus signs, so that this is a loss (instead of a lower bound!).
    # TODO: kl is 0
    return tf.stack([
        -ly,
        -kl_weight * lx,
        -kl_weight * mpenalty,
        -kl_weight * alphaterm,
        -kl_weight * reduced_entropy,
        kl_weight * kl
    ])