def generate_signal(self, key, context):
    if key == 'action_values_all':
        utils = self.agent.get_utils(self, context)
        values = tf.identity(
            utils, name="{}_{}_values".format(self.agent.name, self.name))
        return values

    elif key == 'action_values':
        action_values = context.get_signal('action_values_all', self, gradient=True)
        actions = context.get_signal('actions')

        # Select the value of the action actually taken at each step.
        action_values = tf.reduce_sum(actions * action_values, axis=-1, keepdims=True)

        mask = context.get_signal('mask')
        label = "{}-estimated_action_value".format(self.display_name)
        context.add_recorded_value(label, masked_mean(action_values, mask))

        return action_values

    elif key == 'one_step_td_errors':
        rewards = context.get_signal('rewards')
        gamma = context.get_signal('gamma')
        action_values = context.get_signal('action_values', self, gradient=True)

        # Bootstrapped one-step target: r_t + gamma * Q_{t+1}.
        shifted_values = tf_roll(action_values, 1, fill=0.0, reverse=True, axis=0)
        one_step_estimate = rewards + gamma * shifted_values
        td_errors = one_step_estimate - action_values

        mask = context.get_signal('mask')
        label = "{}-one_step_td_error".format(self.display_name)
        context.add_recorded_value(label, masked_mean(td_errors, mask))

        return td_errors

    else:
        raise Exception("NotImplemented: signal key '{}'".format(key))
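# `masked_mean` and `tf_roll` are helpers assumed from the surrounding library;
# the following are minimal sketches consistent with how they are called here
# (signatures and semantics are inferred, not the library's definitive
# implementations):

import tensorflow as tf

def masked_mean(x, mask, axis=None, keepdims=False):
    # Mean over entries where mask == 1, so padded timesteps don't dilute stats.
    total = tf.reduce_sum(x * mask, axis=axis, keepdims=keepdims)
    count = tf.reduce_sum(mask * tf.ones_like(x), axis=axis, keepdims=keepdims)
    return total / tf.maximum(count, 1.0)

def tf_roll(x, shift, fill=0.0, reverse=False, axis=0):
    # Shift `x` along the time axis. With reverse=True, x[t] <- x[t + shift]
    # and `fill` enters at the end; this sketch assumes axis == 0.
    assert axis == 0
    if reverse:
        x = tf.reverse(x, axis=[0])
    pad = fill * tf.ones_like(x[:shift])
    rolled = tf.concat([pad, x[:-shift]], axis=0)
    if reverse:
        rolled = tf.reverse(rolled, axis=[0])
    return rolled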
def generate_signal(self, signal_key, context): if signal_key == "advantage": q = context.get_signal('action_values', self.q_estimator) v = context.get_signal('values', self.v_estimator) advantage = q - v advantage = self.post_process(advantage, context) mask = context.get_signal("mask") mean_advantage = masked_mean(advantage, mask) context.add_recorded_value("advantage", mean_advantage) mean_abs_advantage = masked_mean(tf.abs(advantage), mask) context.add_recorded_value("abs_advantage", mean_abs_advantage) return advantage elif signal_key == "advantage_all": q = context.get_signal('action_values_all', self.q_estimator) v = context.get_signal('values', self.v_estimator) v = v[..., None] advantage = q - v advantage = self.post_process(advantage, context) return advantage else: raise Exception("NotImplemented")
def post_process(self, advantage, context):
    if self.standardize:
        # Standardize advantages using statistics computed over the valid
        # (unmasked) entries only; masked-out entries keep their raw values.
        mask = context.get_signal('mask')
        mean = masked_mean(advantage, mask)
        _advantage = advantage - mean
        variance = masked_mean(_advantage**2, mask)
        std = tf.sqrt(variance)

        # Guard against division by zero when all advantages are equal.
        _advantage = tf.cond(
            std <= 0,
            lambda: _advantage,
            lambda: _advantage / std)

        advantage = mask * _advantage + (1 - mask) * advantage

    return advantage
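# For intuition, the masked standardization above in plain numpy (a
# hypothetical demo, not part of the library): padded entries (mask == 0)
# pass through unchanged.
import numpy as np

adv = np.array([2.0, 4.0, 99.0])   # last entry is padding
mask = np.array([1.0, 1.0, 0.0])

mean = (adv * mask).sum() / mask.sum()                        # 3.0
std = np.sqrt((((adv - mean)**2) * mask).sum() / mask.sum())  # 1.0
standardized = mask * (adv - mean) / std + (1 - mask) * adv
# -> [-1.0, 1.0, 99.0]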
def mean_kl(p, q, obs, mask):
    """ `p` and `q` are instances of `Policy`. """
    # from tensorflow.python.ops.rnn import dynamic_rnn
    # kl_cell = KLCell(policy, prev_policy)
    # batch_size = tf.shape(obs)[1]
    # initial_state = kl_cell.zero_state(batch_size, tf.float32)
    # kl, _ = dynamic_rnn(
    #     kl_cell, obs, initial_state=initial_state,
    #     parallel_iterations=1, swap_memory=False,
    #     time_major=True)
    # return tf.reduce_mean(kl)

    # The code below requires that we know T at graph-build time; we have to
    # unroll in a Python for loop, because taking Hessian-vector products of
    # an expression that contains a TensorFlow while loop is not currently
    # supported.
    batch_size = tf.shape(obs)[1]
    dtype = tf.float32
    p_state, q_state = p.zero_state(batch_size, dtype), q.zero_state(batch_size, dtype)

    kl = []
    T = int(obs.shape[0])
    for t in range(T):
        p_utils, p_state = p.build_update(obs[t, :, :], p_state)
        q_utils, q_state = q.build_update(obs[t, :, :], q_state)
        kl.append(p.build_kl(p_utils, q_utils))

    kl = tf.stack(kl)
    return masked_mean(kl, mask)
def build_update(self, context):
    self.delta = build_scheduled_value(self.delta_schedule, "delta")

    tvars = self.trainable_variables(for_opt=True)
    self.gradient = tf.gradients(context.objective, tvars)

    mask = context.get_signal('mask')
    kl = context.get_signal('kl', self.policy)
    mean_kl = masked_mean(kl, mask)

    # Fisher-vector products, computed as Hessian-vector products of the
    # mean KL divergence with respect to the policy parameters.
    self.fv_product = HessianVectorProduct(mean_kl, tvars)

    self.grad_norm_pure = tf.placeholder(tf.float32, shape=(), name="_grad_norm_pure")
    self.grad_norm_natural = tf.placeholder(tf.float32, shape=(), name="_grad_norm_natural")
    self.step_norm = tf.placeholder(tf.float32, shape=(), name="_step_norm")

    context.add_recorded_values(
        grad_norm_pure=self.grad_norm_pure,
        grad_norm_natural=self.grad_norm_natural,
        step_norm=self.step_norm,
        train_only=True)
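# `HessianVectorProduct` is defined elsewhere; the standard trick it is
# presumed to wrap computes H v = grad((grad f) . v), so the natural-gradient
# step never materializes the full Hessian. A minimal sketch (function and
# argument names are assumptions):
def hessian_vector_product(f, variables, vector):
    # First-order gradients of f w.r.t. each variable.
    grads = tf.gradients(f, variables)
    # Dot the gradient list with the given vector (a matching list of tensors).
    grad_dot_v = tf.add_n([
        tf.reduce_sum(g * v) for g, v in zip(grads, vector)])
    # Differentiating the scalar grad.v yields H v.
    return tf.gradients(grad_dot_v, variables)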
def build_graph(self, context):
    adv_times_ratio = context.get_signal("adv_times_ratio", self, gradient=True)
    mask = context.get_signal("mask")
    objective = masked_mean(adv_times_ratio, mask)

    label = "{}-policy_gradient_objective".format(self.policy.display_name)
    context.add_recorded_value(label, objective)

    return objective
def build_graph(self, context):
    loss = context.get_signal("loss", self, gradient=True)
    mask = context.get_signal("mask")

    # Objectives are maximized, so the loss enters with a negative sign.
    objective = -masked_mean(loss, mask)

    label = "{}-differentiable_objective".format(self.policy.display_name)
    context.add_recorded_value(label, objective)

    return objective
def build_graph(self, context):
    entropy = context.get_signal('entropy', self, gradient=True)
    mask = context.get_signal('mask')
    objective = masked_mean(entropy, mask)

    label = "{}-entropy".format(self.policy.display_name)
    context.add_recorded_value(label, objective)

    return objective
def build_graph(self, context):
    td_error = context.get_signal("td_error", self)
    squared_td_error = context.get_signal("squared_td_error", self, gradient=True)

    mask = context.get_signal('mask')
    weights = context.get_signal('weights')

    if self.use_weights:
        td_error *= weights
        squared_td_error *= weights

    mean_td_error = masked_mean(td_error, mask)
    label = "{}-opt-mean_abs_td_error".format(self.value_function.display_name)
    # Note: this records |mean TD error|, not the mean of |TD error|.
    context.add_recorded_value(label, tf.abs(mean_td_error))

    mean_squared_td_error = masked_mean(squared_td_error, mask)
    label = "{}-opt-mean_squared_td_error".format(self.value_function.display_name)
    context.add_recorded_value(label, mean_squared_td_error)

    # Objectives are maximized, so return the negative squared TD error.
    return -mean_squared_td_error
def generate_signal(self, key, context, **kwargs):
    if key == 'values':
        utils = self.agent.get_utils(self, context)
        values = tf.identity(
            utils, name="{}_{}_values".format(self.agent.name, self.name))

        mask = context.get_signal('mask')
        label = "{}-estimated_value".format(self.display_name)
        context.add_recorded_value(label, masked_mean(values, mask))

        return values

    elif key == 'one_step_td_errors':
        rewards = context.get_signal('rewards')
        gamma = context.get_signal('gamma')

        c = kwargs.get('c', None)
        rho = context.get_signal('rho', self.policy, c=c)

        values = context.get_signal('values', self, gradient=True)

        # Importance-weighted one-step target: rho_t * (r_t + gamma * V_{t+1}).
        shifted_values = tf_roll(values, 1, fill=0.0, reverse=True, axis=0)
        one_step_estimate = rho * (rewards + gamma * shifted_values)
        td_errors = one_step_estimate - values

        mask = context.get_signal('mask')
        label = "{}-one_step_td_error".format(self.display_name)
        context.add_recorded_value(label, masked_mean(td_errors, mask))

        return td_errors

    elif key == 'monte_carlo_td_errors':
        discounted_returns = context.get_signal('discounted_returns', self.policy)
        values = context.get_signal('values', self, gradient=True)
        return discounted_returns[:-1, ...] - values[:-1, ...]

    else:
        raise Exception("NotImplemented: signal key '{}'".format(key))
def generate_signal(self, key, context, **kwargs):
    if key == 'log_probs':
        utils = self.agent.get_utils(self, context)
        actions = context.get_signal('actions')
        return self.action_selection.log_probs(utils, actions, self.exploration)

    elif key == 'entropy':
        utils = self.agent.get_utils(self, context)
        return self.action_selection.entropy(utils, self.exploration)

    elif key == 'samples':
        utils = self.agent.get_utils(self, context)
        return self.action_selection.sample(utils, self.exploration)

    elif key == 'kl':
        raise Exception("NotImplemented")

    elif key in ['monte_carlo_values', 'monte_carlo_action_values']:
        c = kwargs.get('c', None)
        rho = context.get_signal('rho', self, c=c)
        rewards = context.get_signal('rewards')

        if key == 'monte_carlo_action_values':
            # For action values, the importance weight applies from the *next*
            # step onward, so shift rho back by one step.
            rho = tf_roll(rho, 1, fill=1.0, reverse=True)

        gamma = context.get_signal('gamma')

        # Accumulate discounted returns by scanning backwards over time.
        elems = (
            tf.reverse(rho, axis=[0]),
            tf.reverse(rewards, axis=[0]))

        initializer = tf.zeros_like(rewards[0, ...])

        if key == 'monte_carlo_action_values':
            func = _DoWeightingActionValue(gamma)
        else:
            func = _DoWeightingValue(gamma)

        returns = tf.scan(func, elems=elems, initializer=initializer)
        returns = tf.reverse(returns, axis=[0])

        return returns

    elif key == 'average_monte_carlo_values':
        values = context.get_signal('monte_carlo_values', self, **kwargs)
        average = tf.reduce_mean(values, axis=1, keepdims=True)
        average += tf.zeros_like(values)  # broadcast back to the full batch shape
        return average

    elif key == 'importance_weights':
        pi_log_probs = context.get_signal("log_probs", self)
        mu_log_probs = context.get_signal("mu_log_probs")
        importance_weights = tf.exp(pi_log_probs - mu_log_probs)

        label = "{}-mean_importance_weight".format(self.display_name)
        mask = context.get_signal("mask")
        context.add_recorded_value(label, masked_mean(importance_weights, mask))

        return importance_weights

    elif key == 'rho':
        c = kwargs.get('c', None)
        importance_weights = context.get_signal("importance_weights", self)

        if c is not None:
            if c <= 0:
                rho = importance_weights
            else:
                # Truncated importance weights.
                rho = tf.minimum(importance_weights, c)
        else:
            rho = tf.ones_like(importance_weights)

        label = "{}-mean_rho_c_{}".format(self.display_name, c)
        mask = context.get_signal("mask")
        context.add_recorded_value(label, masked_mean(rho, mask))

        return rho

    else:
        raise Exception("NotImplemented: signal key '{}'".format(key))
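# `_DoWeightingValue` / `_DoWeightingActionValue` are the tf.scan callables
# used above; these are sketches of the recursions they plausibly implement,
# inferred from the call sites (the exact definitions are assumptions):
class _DoWeightingValue:
    def __init__(self, gamma):
        self.gamma = gamma

    def __call__(self, acc, elems):
        # V_t = rho_t * (r_t + gamma * V_{t+1}); scanned over reversed time.
        rho, r = elems
        return rho * (r + self.gamma * acc)

class _DoWeightingActionValue:
    def __init__(self, gamma):
        self.gamma = gamma

    def __call__(self, acc, elems):
        # Q_t = r_t + gamma * rho_{t+1} * Q_{t+1}; rho was already rolled one
        # step, so the weight at index t is the original rho_{t+1}.
        rho, r = elems
        return r + self.gamma * rho * acc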
def generate_signal(self, signal_key, context): if signal_key == "action_values" and self.to_action_value: pass elif signal_key == "values" and not self.to_action_value: pass else: raise Exception("NotImplemented") rewards = context.get_signal("rewards") rho = context.get_signal("rho", self.policy, c=self.importance_c) if self.from_action_value: if isinstance(self.policy, DiscretePolicy): pi_log_probs_all = context.get_signal("log_probs_all", self.policy) pi_probs = tf.exp(pi_log_probs_all) action_values = context.get_signal("action_values", self.value_function) values = tf.reduce_sum(pi_probs * action_values, axis=-1, keepdims=True) else: action_values = context.get_signal("action_values", self.value_function) values = action_values * rho else: values = context.get_signal("values", self.value_function) R = rewards V = tf_roll(values, 1, fill=0.0, reverse=True) RHO = rho # if context.truncated_rollouts: # R = R[:-1, ...] # V = V[:-1, ...] # RHO = RHO[:-1, ...] if self.to_action_value: RHO = tf_roll(RHO, 1, fill=1.0, reverse=True) gamma = context.get_signal("gamma") retrace_cell = RetraceCell( rewards.shape[-1], gamma, self.lmbda, self.to_action_value) retrace_input = ( tf.reverse(RHO, axis=[0]), tf.reverse(R, axis=[0]), tf.reverse(V, axis=[0]), ) (retrace, one_step_estimate, adjustment), _ = dynamic_rnn( retrace_cell, retrace_input, initial_state=V[-1, ...], parallel_iterations=1, swap_memory=False, time_major=True) one_step_estimate = tf.reverse(one_step_estimate, axis=[0]) adjustment = tf.reverse(adjustment, axis=[0]) retrace = tf.reverse(retrace, axis=[0]) # if context.truncated_rollouts: # retrace = tf.concat([retrace, V[-1, ...]], axis=0) mask = context.get_signal("mask") label = "{}-one_step_estimate".format(self.name) context.add_recorded_value(label, masked_mean(one_step_estimate, mask)) label = "{}-adjustment".format(self.name) context.add_recorded_value(label, masked_mean(adjustment, mask)) label = "{}-retrace".format(self.name) context.add_recorded_value(label, masked_mean(retrace, mask)) return retrace
def build_core_signals(self):
    self._signals['mask'] = tf.placeholder(
        tf.float32, shape=(cfg.T, None, 1), name="_mask")
    self._signals['done'] = tf.placeholder(
        tf.float32, shape=(cfg.T, None, 1), name="_done")

    self._signals['all_obs'] = tf.placeholder(
        tf.float32,
        shape=(cfg.T + 1 if cfg.T is not None else None, None) + self.obs_shape,
        name="_all_obs")

    # Observations that we learn about.
    self._signals['obs'] = tf.identity(
        self._signals['all_obs'][:-1, ...], name="_obs")

    # Observations that we use as targets.
    self._signals['target_obs'] = tf.identity(
        self._signals['all_obs'][1:, ...], name="_target_obs")

    self._signals['actions'] = tf.placeholder(
        tf.float32, shape=(cfg.T, None) + self.action_shape, name="_actions")
    self._signals['gamma'] = tf.constant(self.gamma)
    self._signals['batch_size'] = tf.shape(self._signals['obs'])[1]
    self._signals['batch_size_float'] = tf.cast(
        self._signals['batch_size'], tf.float32)

    self._signals['rewards'] = tf.placeholder(
        tf.float32, shape=(cfg.T, None, 1), name="_rewards")
    self._signals['returns'] = tf.cumsum(
        self._signals['rewards'], axis=0, reverse=True, name="_returns")
    self._signals['reward_per_ep'] = tf.reduce_mean(
        tf.reduce_sum(self._signals['rewards'], axis=0), name="_reward_per_ep")

    self.add_recorded_values(reward_per_ep=self._signals['reward_per_ep'])

    self._signals['mode'] = tf.placeholder(tf.string, ())

    self._signals['weights'] = tf.placeholder(
        tf.float32, shape=(cfg.T, None, 1), name="_weights")

    T = tf.shape(self._signals['mask'])[0]
    discount_matrix = tf_discount_matrix(self.gamma, T)
    discounted_returns = tf.tensordot(
        discount_matrix, self._signals['rewards'], axes=1, name="_discounted_returns")
    self._signals['discounted_returns'] = discounted_returns

    mean_returns = masked_mean(
        discounted_returns, self._signals['mask'], axis=1, keepdims=True)
    mean_returns += tf.zeros_like(discounted_returns)  # broadcast to full shape
    self._signals['average_discounted_returns'] = mean_returns

    # Off-policy signals.
    self._signals['mu_utils'] = tf.placeholder(
        tf.float32, shape=(cfg.T, None) + self.mu.param_shape, name="_mu_utils")
    self._signals['mu_exploration'] = tf.placeholder(
        tf.float32, shape=(None,), name="_mu_exploration")
    self._signals['mu_log_probs'] = tf.placeholder(
        tf.float32, shape=(cfg.T, None, 1), name="_mu_log_probs")

    for obj in self.rl_objects:
        obj.build_core_signals(self)
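# `tf_discount_matrix` presumably builds the T x T upper-triangular matrix
# M[i, j] = gamma**(j - i) for j >= i (else 0), so that the tensordot with
# the rewards gives, at each step i, the discounted return from i onward.
# A minimal sketch (an assumption, not the library's definition):
def tf_discount_matrix(gamma, T):
    t = tf.cast(tf.range(T), tf.float32)
    exponents = t[None, :] - t[:, None]          # exponents[i, j] = j - i
    powers = tf.pow(tf.cast(gamma, tf.float32), exponents)
    return tf.where(exponents >= 0, powers, tf.zeros_like(powers))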
def build_graph(self, context):
    prev_values = context.get_signal('prev_values', self)
    values = context.get_signal('values', self.value_function, gradient=True)
    variance = context.get_signal('variance', self)
    targets = context.get_signal('values', self.target_generator)

    if self.direct:
        # Directly constrain the new values to lie within `epsilon` standard
        # deviations of the previous values.
        std = tf.sqrt(variance)
        constrained_values = tf.clip_by_value(
            values,
            prev_values - std * self.epsilon,
            prev_values + std * self.epsilon)
        objective = -(constrained_values - targets)**2
        divergence = tf.abs(constrained_values - prev_values)

        if self.use_weights:
            weights = context.get_signal('weights')
            objective *= weights

        mask = context.get_signal("mask")

        mean_divergence = masked_mean(
            tf.reduce_mean(divergence, axis=-1, keepdims=True), mask)
        label = "{}-opt-mean_ve_divergence".format(self.value_function.display_name)
        context.add_recorded_value(label, mean_divergence)

        objective = masked_mean(
            tf.reduce_mean(objective, axis=-1, keepdims=True), mask)
        label = "{}-opt-ve_direct_objective".format(self.value_function.display_name)
        context.add_recorded_value(label, objective)

        td_error = context.get_signal("td_error", self)
        squared_td_error = context.get_signal("squared_td_error", self)

        mean_td_error = masked_mean(td_error, mask)
        label = "{}-opt-mean_abs_td_error".format(self.value_function.display_name)
        context.add_recorded_value(label, tf.abs(mean_td_error))

        mean_squared_td_error = masked_mean(squared_td_error, mask)
        label = "{}-opt-mean_squared_td_error".format(self.value_function.display_name)
        context.add_recorded_value(label, mean_squared_td_error)

        return objective
    else:
        clipped_ratio = None

        if self.n_samples == 0:
            # Likelihood ratio of Gaussians with means `values` / `prev_values`
            # and shared `variance`, evaluated at the targets.
            ratio = tf.exp(
                0.5 * (2 * targets - values - prev_values)
                * (values - prev_values) / variance)
            # prev_advantage = (values - targets) ** 2 + variance
            prev_advantage = (prev_values - targets) ** 2 + variance

            if self.epsilon is None:
                adv_times_ratio = ratio * prev_advantage
            else:
                clipped_ratio = tf.clip_by_value(
                    ratio, 1 - self.epsilon, 1 + self.epsilon)
                adv_times_ratio = tf.minimum(
                    prev_advantage * ratio,
                    prev_advantage * clipped_ratio)
        else:
            T = tf.shape(values)[0]
            batch_size = tf.shape(values)[1]
            samples = (
                tf.random_normal((T, batch_size, self.n_samples)) * tf.sqrt(variance)
                + prev_values)

            ratio = tf.exp(
                0.5 * (2 * samples - values - prev_values)
                * (values - prev_values) / variance)
            prev_advantage = (
                (prev_values - targets) ** 2 + variance - (samples - targets)**2)

            if self.epsilon is None:
                adv_times_ratio = ratio * prev_advantage
            else:
                clipped_ratio = tf.clip_by_value(
                    ratio, 1 - self.epsilon, 1 + self.epsilon)
                adv_times_ratio = tf.minimum(
                    prev_advantage * ratio,
                    prev_advantage * clipped_ratio)

        if self.use_weights:
            weights = context.get_signal('weights')
            adv_times_ratio *= weights

        mask = context.get_signal("mask")

        mean_ratio = masked_mean(tf.reduce_mean(ratio, axis=-1, keepdims=True), mask)
        label = "{}-opt-mean_ve_ratio".format(self.value_function.display_name)
        context.add_recorded_value(label, mean_ratio)

        if clipped_ratio is not None:
            mean_clipped_ratio = masked_mean(
                tf.reduce_mean(clipped_ratio, axis=-1, keepdims=True), mask)
            label = "{}-opt-mean_ve_clipped_ratio".format(self.value_function.display_name)
            context.add_recorded_value(label, mean_clipped_ratio)

        mean_advantage = masked_mean(
            tf.reduce_mean(prev_advantage, axis=-1, keepdims=True), mask)
        label = "{}-opt-mean_ve_advantage".format(self.value_function.display_name)
        context.add_recorded_value(label, mean_advantage)

        objective = masked_mean(
            tf.reduce_mean(adv_times_ratio, axis=-1, keepdims=True), mask)
        label = "{}-opt-ve_objective".format(self.value_function.display_name)
        context.add_recorded_value(label, objective)

        td_error = context.get_signal("td_error", self)
        squared_td_error = context.get_signal("squared_td_error", self)

        mean_td_error = masked_mean(td_error, mask)
        label = "{}-opt-mean_abs_td_error".format(self.value_function.display_name)
        context.add_recorded_value(label, tf.abs(mean_td_error))

        mean_squared_td_error = masked_mean(squared_td_error, mask)
        label = "{}-opt-mean_squared_td_error".format(self.value_function.display_name)
        context.add_recorded_value(label, mean_squared_td_error)

        return objective
def generate_signal(self, signal_key, context, **kwargs): if signal_key == "prev_log_probs": self.log_probs = context.get_signal('log_probs', self.policy) self.prev_log_probs = tf.placeholder(tf.float32, shape=self.log_probs.shape, name="_prev_log_probs") return self.prev_log_probs elif signal_key == "prev_advantage": self.advantage = context.get_signal('advantage', self.advantage_estimator) self.prev_advantage = tf.placeholder(tf.float32, shape=self.advantage.shape, name="_prev_advantage") return self.prev_advantage elif signal_key == 'importance_weights': pi_log_probs = context.get_signal("prev_log_probs", self) mu_log_probs = context.get_signal("mu_log_probs") importance_weights = tf.exp(pi_log_probs - mu_log_probs) label = "{}-mean_importance_weight".format(self.name) mask = context.get_signal("mask") context.add_recorded_value(label, masked_mean(importance_weights, mask)) return importance_weights elif signal_key == "rho": c = kwargs.get('c', None) importance_weights = context.get_signal("importance_weights", self) if c is not None: if c <= 0: rho = importance_weights else: rho = tf.minimum(importance_weights, c) else: rho = tf.ones_like(importance_weights) label = "{}-mean_rho_c_{}".format(self.name, c) mask = context.get_signal("mask") context.add_recorded_value(label, masked_mean(rho, mask)) return rho elif signal_key == "adv_times_ratio": log_probs = context.get_signal('log_probs', self.policy, gradient=True) prev_log_probs = context.get_signal('prev_log_probs', self) ratio = tf.exp(log_probs - prev_log_probs) prev_advantage = context.get_signal('prev_advantage', self) if self.epsilon is None or self.epsilon <= 0: adv_times_ratio = ratio * prev_advantage else: adv_times_ratio = tf.minimum( prev_advantage * ratio, prev_advantage * tf.clip_by_value(ratio, 1-self.epsilon, 1+self.epsilon)) if self.use_weights: weights = context.get_signal('weights') adv_times_ratio *= weights rho = context.get_signal('rho', self, c=self.importance_c) adv_times_ratio *= rho return adv_times_ratio else: raise Exception("NotImplemented")