def update_inference(self):
    self.save_params()
    if self.bayes:
        pqz = Normal(loc=0., scale=self.prior_std)
        self.inference.latent_vars[self.scope] = {pqz: self.qz}
        for w, b, qw, qb in zip(self.prev_eW, self.prev_eB, self.eW, self.eB):
            self.inference.latent_vars[self.scope].update({w: qw, b: qb})
        for w, b, qw, qb in zip(self.prev_dW, self.prev_dB, self.dW, self.dB):
            self.inference.latent_vars[self.scope].update({w: qw, b: qb})
        self.inference.latent_vars[self.scope].update({
            self.prev_sigma_w: self.sigma_w,
            self.prev_sigma_b: self.sigma_b
        })
    else:
        pqz = Normal(loc=self.prev_eH[-1],
                     scale=tf.maximum(self.prev_z_sigma, 1e-4))
        self.inference.latent_vars[self.scope] = {pqz: self.qz}
    self.inference.reinitialize(self)
def prepare_loss(self, global_step):
    self.global_step = global_step
    print("Preparing loss {}".format(self.id))
    self.state_value_batch = self.critic_batch
    # [Policy distribution]
    old_policy_distributions = []
    new_policy_distributions = []
    policy_loss_builder = []
    for h, policy_head in enumerate(self.policy_heads):
        if is_continuous_control(policy_head['depth']):
            # Old policy
            old_policy_batch = tf.transpose(self.old_policy_batch[h], [1, 0, 2])
            old_policy_distributions.append(Normal(old_policy_batch[0], old_policy_batch[1]))
            # New policy
            new_policy_batch = tf.transpose(self.actor_batch[h], [1, 0, 2])
            new_policy_distributions.append(Normal(new_policy_batch[0], new_policy_batch[1]))
        else:  # discrete control
            old_policy_distributions.append(Categorical(self.old_policy_batch[h]))  # Old policy
            new_policy_distributions.append(Categorical(self.actor_batch[h]))  # New policy
        builder = self._get_policy_loss_builder(
            new_policy_distributions[h], old_policy_distributions[h],
            self.old_action_batch[h],
            self.old_action_mask_batch[h] if self.has_masked_actions else None)
        policy_loss_builder.append(builder)
    # [Actor loss]
    self.policy_loss = sum(self._get_policy_loss(b) for b in policy_loss_builder)
    # [Debug variables]
    self.policy_kl_divergence = sum(b.approximate_kullback_leibler_divergence() for b in policy_loss_builder)
    # Take the average because the clipping frequency must stay in [0, 1].
    self.policy_clipping_frequency = sum(b.get_clipping_frequency() for b in policy_loss_builder) / len(policy_loss_builder)
    self.policy_entropy_regularization = sum(b.get_entropy_regularization() for b in policy_loss_builder)
    # [Critic loss]
    value_loss_builder = self._get_value_loss_builder()
    self.value_loss = self._get_value_loss(value_loss_builder)
    # [Entropy regularization]
    if flags.entropy_regularization:
        self.policy_loss += -self.policy_entropy_regularization
    # [Constraining Replay]
    if self.constrain_replay:
        constrain_loss = sum(
            0.5 * builder.reduce_function(tf.squared_difference(new_distribution.mean(), tf.stop_gradient(old_action)))
            for builder, new_distribution, old_action in zip(policy_loss_builder, new_policy_distributions, self.old_action_batch)
        )
        self.policy_loss += tf.cond(
            pred=self.is_replayed_batch[0],
            true_fn=lambda: constrain_loss,
            false_fn=lambda: tf.constant(0., dtype=self.parameters_type)
        )
    # [Total loss]
    self.total_loss = self.policy_loss + self.value_loss
    if flags.intrinsic_reward:
        self.total_loss += self.intrinsic_reward_loss
def prepare_loss(self):
    with tf.device(self.device):
        print(" [{}]Preparing loss".format(self.id))
        # [Policy distribution]
        if self.is_continuous_control():
            # Old policy
            old_policy_batch = tf.transpose(self.old_policy_batch, [1, 0, 2])
            old_policy_distributions = Normal(old_policy_batch[0], old_policy_batch[1])
            # New policy
            new_policy_batch = tf.transpose(self.policy_batch, [1, 0, 2])
            new_policy_distributions = Normal(new_policy_batch[0], new_policy_batch[1])
        else:  # discrete control
            old_policy_distributions = Categorical(self.old_policy_batch)  # Old policy
            new_policy_distributions = Categorical(self.policy_batch)  # New policy
        # [Actor loss]
        policy_loss_builder = PolicyLoss(
            cliprange=self.clip,
            cross_entropy=new_policy_distributions.cross_entropy(self.old_action_batch),
            old_cross_entropy=old_policy_distributions.cross_entropy(self.old_action_batch),
            advantage=self.advantage_batch,
            # entropy=self.fentropy,
            entropy=new_policy_distributions.entropy(),
            beta=self.beta)
        self.policy_loss = policy_loss_builder.get()
        # [Critic loss]
        value_loss_builder = ValueLoss(
            cliprange=self.clip,
            value=self.value_batch,
            old_value=self.old_value_batch,
            reward=self.cumulative_reward_batch)
        # usually the critic has a lower learning rate
        self.value_loss = flags.value_coefficient * value_loss_builder.get()
        # [Extra loss]
        self.extra_loss = tf.constant(0.)
        if self.predict_reward:
            self.extra_loss += self._reward_prediction_loss()
        # [Debug variables]
        self.policy_kl_divergence = policy_loss_builder.approximate_kullback_leibler_divergence()
        self.policy_clipping_frequency = policy_loss_builder.get_clipping_frequency()
        self.policy_entropy_contribution = policy_loss_builder.get_entropy_contribution()
        self.total_loss = self.policy_loss + self.value_loss + self.extra_loss
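# The internals of PolicyLoss are not shown in this module; the sketch below is an
# assumption about the clipped-surrogate objective its arguments suggest (cliprange,
# cross-entropies of the new/old policies on the taken action, advantage, and an
# entropy bonus weighted by beta). It is an illustration, not the project's code.
def clipped_surrogate_loss_sketch(cross_entropy, old_cross_entropy, advantage,
                                  cliprange, entropy, beta):
    # ratio = pi_new(a|s) / pi_old(a|s), recovered from the two cross-entropies
    # because each cross-entropy is a negative log-probability of the taken action.
    ratio = tf.exp(old_cross_entropy - cross_entropy)
    clipped_ratio = tf.clip_by_value(ratio, 1.0 - cliprange, 1.0 + cliprange)
    # Pessimistic (minimum) surrogate, negated because the total loss is minimized.
    surrogate = -tf.reduce_mean(tf.minimum(ratio * advantage, clipped_ratio * advantage))
    # Entropy bonus encourages exploration.
    return surrogate - beta * tf.reduce_mean(entropy)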
def gen_stein_coreset(init, core_y_data, qW, qB, n_samples, ac_fn,
                      conv_W=None, LR=False, noise_std=0.001, sess=None):
    stein_core_x = tf.get_variable('stein_cx', initializer=init.astype(np.float32), dtype=tf.float32)
    print('gen stein coreset')
    if LR:
        stein_core_y = Normal(loc=tf.matmul(stein_core_x, qW) + qB, scale=noise_std)
    elif conv_W is not None:
        ## to do: change to general function ##
        h = forward_cifar_model(stein_core_x, conv_W)
        stein_core_y = forward_nets(qW, qB, h, ac_fn=ac_fn, bayes=True, num_samples=10)[-1]
    else:
        stein_core_y = forward_nets(qW, qB, stein_core_x, ac_fn=ac_fn, bayes=True, num_samples=10)[-1]
    lnp = tf.reduce_mean(stein_core_y.log_prob(core_y_data), axis=0)
    dlnp = tf.gradients(lnp, stein_core_x)
    svgd = SVGD()
    # print('shape check', stein_core_x.shape)
    core_sgrad = svgd.gradients(stein_core_x, dlnp[0])
    return stein_core_x, stein_core_y, core_sgrad
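# A minimal driver sketch for gen_stein_coreset (an illustration, not part of the
# original training script): init_x, qW, qB and the data feed are assumed to be
# built elsewhere, and svgd.gradients is assumed to return the SVGD ascent
# direction, hence the negation before handing it to a minimizing optimizer.
def run_stein_coreset_example(init_x, qW, qB, core_y_batches, y_dim, num_steps=100):
    core_y_ph = tf.placeholder(tf.float32, shape=[None, y_dim])
    core_x, core_y, core_sgrad = gen_stein_coreset(init_x, core_y_ph, qW, qB,
                                                   n_samples=10, ac_fn=tf.nn.relu)
    train_op = tf.train.AdamOptimizer(1e-3).apply_gradients([(-core_sgrad, core_x)])
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for step in range(num_steps):
            sess.run(train_op, feed_dict={core_y_ph: core_y_batches[step % len(core_y_batches)]})
        return sess.run(core_x)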
def get_feature_entropy(self, input, scope, name=""):
    # Feature entropy measures how uncommon the input is with respect to the
    # running batch-normalization statistics.
    with tf.device(self.device):
        batch_norm, _ = self._batch_norm_layer(input=input, scope=scope, name=name, share_trainables=False)
        fentropy = Normal(batch_norm.moving_mean, tf.sqrt(batch_norm.moving_variance)).cross_entropy(input)
        fentropy = tf.layers.flatten(fentropy)
        if len(fentropy.get_shape()) > 1:
            fentropy = tf.reduce_mean(fentropy, axis=-1)
        return fentropy
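# The Normal wrapper above is applied directly to a tensor; a rough standalone
# equivalent of the same "surprise" measure, written with tensorflow_probability
# and a plain negative log-likelihood (an assumption about what cross_entropy
# computes on raw samples), might look like this.
def feature_surprise_sketch(x, moving_mean, moving_variance):
    import tensorflow_probability as tfp
    dist = tfp.distributions.Normal(loc=moving_mean, scale=tf.sqrt(moving_variance))
    nll = -dist.log_prob(x)              # element-wise surprise under the running stats
    return tf.reduce_mean(nll, axis=-1)  # average over the feature dimension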
def reward(i):
    global node_types
    sigma_val = {
        'V1': 5, 'V2': 10, 'V3': 20, 'V4': 40,
        'G1': 100, 'G2': 120, 'G3': 140, 'G4': 160, 'G5': 180
    }
    return Normal(mu=0, sigma=sigma_val[node_types[i]])
def create_env_structure(leaves):
    sigma_val = {'V1': 5, 'V2': 10, 'V3': 20, 'V4': 40,
                 'G1': 100, 'G2': 120, 'G3': 140, 'G4': 160, 'G5': 180}
    goal_structure = ['G1', 'G2', 'G3', 'G4', 'G5']
    x = 17  # number of nodes added per branch
    TREE = [[(i * x) + 1 for i in range(leaves)]]
    dist = ['V1']
    for i in range(leaves):
        TREE += [[(i * x) + 2, (i * x) + 8, (i * x) + 13],
                 [(i * x) + 3, (i * x) + 7],
                 [(i * x) + 4],
                 [],
                 [(i * x) + 4],
                 [(i * x) + 4],
                 [(i * x) + 5, (i * x) + 6],
                 [(i * x) + 9, (i * x) + 12],
                 [(i * x) + 4],
                 [(i * x) + 4],
                 [(i * x) + 4],
                 [(i * x) + 10, (i * x) + 11],
                 [(i * x) + 14, (i * x) + 17],
                 [(i * x) + 4],
                 [(i * x) + 4],
                 [(i * x) + 4],
                 [(i * x) + 15, (i * x) + 16]]
        # Rotate the goal types so consecutive branches get different goal nodes.
        g = goal_structure.pop(0)
        goal_structure.append(g)
        dist += ['V1', 'V2', 'V3', g, 'V4', 'V4', 'V3', 'V2', 'V3',
                 'V4', 'V4', 'V3', 'V2', 'V3', 'V4', 'V4', 'V3']
    INIT = tuple([Normal(mu=0, sigma=sigma_val[d]) for d in dist])
    return TREE, INIT
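# A small usage check for create_env_structure: every branch contributes 17 nodes
# in addition to the shared root, and TREE and INIT always have matching lengths.
TREE_EXAMPLE, INIT_EXAMPLE = create_env_structure(leaves=3)
assert len(TREE_EXAMPLE) == len(INIT_EXAMPLE) == 1 + 17 * 3
print(TREE_EXAMPLE[0])  # children of the root: [1, 18, 35]
print(INIT_EXAMPLE[0])  # the root is a 'V1' node, so its prior is Normal(mu=0, sigma=5)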
def sample_actions(self):
    with tf.device(self.device):
        if self.is_continuous_control():
            new_policy_batch = tf.transpose(self.policy_batch, [1, 0, 2])
            sample_batch = Normal(new_policy_batch[0], new_policy_batch[1]).sample()
            # Sample the action batch in the forward direction; the old action is used in the backward direction.
            action_batch = tf.clip_by_value(sample_batch, -1, 1)
        else:  # discrete control
            # Sample the action batch in the forward direction; the old action is used in the backward direction.
            action_batch = Categorical(self.policy_batch).sample()
        # Give the output a self-explanatory name so it can easily be retrieved from the frozen graph.
        tf.identity(action_batch, name="action")
        return action_batch
def sample_actions(self):
    action_batch = []
    hot_action_batch = []
    for h, actor_head in enumerate(self.actor_batch):
        if is_continuous_control(self.policy_heads[h]['depth']):
            new_policy_batch = tf.transpose(actor_head, [1, 0, 2])
            sample_batch = Normal(new_policy_batch[0], new_policy_batch[1]).sample()
            action = tf.clip_by_value(sample_batch, -1, 1)
            # Sample the action batch in the forward direction; the old action is used in the backward direction.
            action_batch.append(action)
            hot_action_batch.append(action)
        else:  # discrete control
            distribution = Categorical(actor_head)
            # Sample the action batch in the forward direction; the old action is used in the backward direction.
            action = distribution.sample(one_hot=False)
            action_batch.append(action)
            hot_action_batch.append(distribution.get_sample_one_hot(action))
    # Give the output a self-explanatory name so it can easily be retrieved from the frozen graph.
    # tf.identity(action_batch, name="action")
    return action_batch, hot_action_batch
def get_goal_reward(self, goal):
    """
    Returns the reward distribution for a goal state, taking the best path to the goal into account.
    """
    max_path_reward = self.get_max_path_reward(goal)
    # Shift the distribution by the reward accumulated along the path.
    goal_state = self._state[goal]
    if hasattr(goal_state, "sample"):
        if hasattr(goal_state, "mu") and hasattr(goal_state, "sigma"):
            return Normal(goal_state.mu + max_path_reward, goal_state.sigma)
        elif hasattr(goal_state, "vals") and hasattr(goal_state, "probs"):
            vals = tuple([value + max_path_reward for value in goal_state.vals])
            return Categorical(vals, goal_state.probs)
        else:
            print(f"Type {type(goal_state)} not supported.")
            raise NotImplementedError()
    else:
        return goal_state + max_path_reward
def __init__(self, ob_readings: dict, wound: Wound, priors: list = None, t_units='minutes'):
    """
    Perform inference on observed bias readings to infer the posterior distribution over the
    attractant dynamics parameters {q, D, τ, R0, κ, m, b0}.

    A dictionary specifying the observed bias readings must be provided, along with an
    instantiated wound (which can be a PointWound, a CellsOnWoundMargin or a CellsInsideWound).

    The observed bias readings should be a dictionary with elements of the following form:

    {(r1, t1): (mu1, sig1), (r2, t2): (mu2, sig2), ...}

    r and t specify the spatial and temporal location where the observed bias has been measured
    (this could be the mid-point of their respective bins), and mu and sig are the mean and
    standard deviation of the posterior of the observed bias at this location.

    DISTANCES SHOULD BE MEASURED IN MICRONS.

    Time can be measured in minutes or seconds: specify this with the t_units argument.

    The parameters are measured in the following units:

    q:     Mmol / min
    D:     µm^2 / min
    τ:     min
    R0:    Mmol / µm^2
    kappa: Mmol / µm^2
    m:     µm^2 / Mmol
    b0:    unitless

    Parameters
    ----------
    ob_readings
        The observed bias readings
    wound
        A Wound class, which the observed bias is assumed to be generated from
    priors
        A list of distributions, one element per parameter, specifying the priors
    t_units
        The units which time is measured in, in the ob_readings dictionary keys
    """
    super().__init__()

    self.wound = wound

    assert t_units in ['seconds', 'minutes'], \
        't_units must be either "seconds" or "minutes" but it is {}'.format(t_units)

    # the total number of readings
    self.TS = len(ob_readings)

    # extract a list of rs, ts, mus and sigs
    self.r = np.array([r for r, t in ob_readings.keys()])
    self.t = np.array([t for r, t in ob_readings.keys()])
    mus = np.array([mu for mu, sig in ob_readings.values()])
    sigs = np.array([sig for mu, sig in ob_readings.values()])

    # convert to minutes
    if t_units == 'seconds':
        self.t /= 60

    # this is our multivariate Gaussian observed bias distribution
    self.ob_dists = multivariate_normal(mus, sigs ** 2)

    # these are the default priors
    if priors is None:
        self.priors = [Normal(5 * 60, 4 * 60),
                       Normal(400, 300),
                       Normal(60, 16),
                       Normal(0.3, 0.2),
                       Normal(0.1, 0.2),
                       Normal(4, 4),
                       Normal(0.001, 0.0005)]
    else:
        assert isinstance(priors, list)
        assert len(priors) == 7
        self.priors = priors
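# A sketch of how this constructor's inputs might be assembled. The enclosing class
# is not shown above, so AttractantInferer and PointWound() are hypothetical names
# used purely for illustration.
def build_inferer_example():
    # Keys are (r, t): distance in microns, time in minutes; values are the
    # (mean, standard deviation) of the observed bias posterior in that bin.
    ob_readings = {
        (25.0, 10.0): (0.30, 0.05),
        (50.0, 30.0): (0.12, 0.04),
    }
    # Custom priors, if given, must be a list of exactly 7 distributions,
    # one per parameter in {q, D, tau, R0, kappa, m, b0}.
    return AttractantInferer(ob_readings, wound=PointWound(), t_units='minutes')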