def entropy(self):
    """Return the entropy of the categorical distribution, computed from stabilized logits."""
    rescaled_logits = self.logits - tf.reduce_max(
        self.logits, axis=-1, keepdims=True)
    exp_logits = tf.exp(rescaled_logits)
    z = tf.reduce_sum(exp_logits, axis=-1, keepdims=True)
    p = exp_logits / z
    return tf.reduce_sum(p * (tf.log(z) - rescaled_logits),
                         axis=-1, keepdims=True)
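As a quick sanity check (not part of the original code), the stabilized expression above equals the textbook entropy -sum(p * log p), since log p = rescaled_logits - log z. A minimal NumPy sketch, assuming nothing beyond the formula itself:

import numpy as np

# Illustrative check: the stabilized entropy equals -sum(p * log p).
logits = np.array([2.0, 0.5, -1.0])
rescaled = logits - logits.max()
z = np.exp(rescaled).sum()
p = np.exp(rescaled) / z
stable = np.sum(p * (np.log(z) - rescaled))
naive = -np.sum(p * np.log(p))
assert np.isclose(stable, naive)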
def calc_pi_loss(logic_outs, actions, advantages):
    """Calculate policy gradient loss."""
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=actions, logits=logic_outs)
    advantages = tf.stop_gradient(advantages)
    pg_loss_per_step = cross_entropy * advantages
    return tf.reduce_sum(pg_loss_per_step)
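Since tf.nn.sparse_softmax_cross_entropy_with_logits returns -log pi(a_t|s_t), this loss is the usual sum of -log pi(a_t|s_t) * A_t, with gradients blocked through the advantages. A minimal usage sketch, assuming TF1-style placeholders (the names and the action count of 4 are illustrative, not from the original source):

# Hypothetical placeholders: [batch, n_actions] logits, [batch] action ids and advantages.
logits_ph = tf.placeholder(tf.float32, [None, 4])
actions_ph = tf.placeholder(tf.int32, [None])
advantages_ph = tf.placeholder(tf.float32, [None])
pi_loss = calc_pi_loss(logits_ph, actions_ph, advantages_ph)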
def kl(self, other):
    """Return the KL divergence KL(self || other) between two categorical distributions."""
    assert isinstance(other, CategoricalDist), 'Distribution type does not match.'
    rescaled_logits_self = self.logits - tf.reduce_max(
        self.logits, axis=-1, keepdims=True)
    rescaled_logits_other = other.logits - tf.reduce_max(
        other.logits, axis=-1, keepdims=True)
    exp_logits_self = tf.exp(rescaled_logits_self)
    exp_logits_other = tf.exp(rescaled_logits_other)
    z_self = tf.reduce_sum(exp_logits_self, axis=-1, keepdims=True)
    z_other = tf.reduce_sum(exp_logits_other, axis=-1, keepdims=True)
    p = exp_logits_self / z_self
    return tf.reduce_sum(
        p * (rescaled_logits_self - tf.log(z_self) -
             rescaled_logits_other + tf.log(z_other)),
        axis=-1, keepdims=True)
def kl(self, other):
    """Return the KL divergence KL(self || other) between two diagonal Gaussians."""
    assert isinstance(other, DiagGaussianDist), 'Distribution type does not match.'
    return tf.reduce_sum(
        (tf.square(self.std) + tf.square(self.mean - other.mean)) /
        (2.0 * tf.square(other.std)) +
        other.log_std - self.log_std - 0.5,
        axis=-1, keepdims=True)
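Per dimension this is the closed form KL(N(mu1, std1^2) || N(mu2, std2^2)) = log(std2/std1) + (std1^2 + (mu1 - mu2)^2) / (2 * std2^2) - 1/2, summed over dimensions. As an illustrative check (not part of the original code), the term agrees with a direct numerical integration of the KL integrand for one dimension:

import numpy as np
from scipy.integrate import quad
from scipy.stats import norm

# Hypothetical 1-D check of the per-dimension KL term used above.
mu1, std1 = 0.3, 1.2
mu2, std2 = -0.1, 0.8
analytic = ((std1**2 + (mu1 - mu2)**2) / (2.0 * std2**2)
            + np.log(std2) - np.log(std1) - 0.5)
numeric, _ = quad(lambda x: norm.pdf(x, mu1, std1) *
                  (norm.logpdf(x, mu1, std1) - norm.logpdf(x, mu2, std2)),
                  -20.0, 20.0)
assert np.isclose(analytic, numeric, atol=1e-6)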
def gather_custom(self, inputs, indices):
    """Select the value of each chosen action along the last axis via a one-hot mask."""
    indices = tf.cast(indices, tf.uint8)
    one_hot = tf.squeeze(
        tf.one_hot(indices=indices, depth=self.n_actions,
                   on_value=1., off_value=0.,
                   axis=-1, dtype=tf.float32),
        axis=-2)
    mul_test = tf.multiply(inputs, one_hot)
    # reduce_sum_val = tf.reduce_sum(mul_test, axis=-1, keep_dims=True)
    reduce_sum_val = tf.reduce_sum(mul_test, axis=-1)
    return reduce_sum_val
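For intuition (an illustrative sketch, not from the original code), the one-hot multiply-and-sum is just indexing the last axis with the chosen action ids; the shapes below are assumptions:

import numpy as np

# Hypothetical shapes: q_values [batch, n_agents, n_actions], actions [batch, n_agents, 1].
q_values = np.random.rand(2, 3, 5).astype(np.float32)
actions = np.random.randint(0, 5, size=(2, 3, 1))
via_one_hot = (q_values *
               np.squeeze(np.eye(5, dtype=np.float32)[actions], axis=-2)).sum(-1)
via_indexing = np.take_along_axis(q_values, actions, axis=-1).squeeze(-1)
assert np.allclose(via_one_hot, via_indexing)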
def build_train_graph(self):
    """
    Build the train graph.

    Because seq_max differs between exploring and training (1 vs. the
    episode limit), the train graph cannot connect to actor.graph directly.
    Hence, we build an explore sub-graph and a train sub-graph, and sync
    them with tf.assign between the two variable collections.
    """
    with self.graph.as_default():
        with tf.variable_scope("eval_agent"):
            trajectory_agent_outs, _ = self.build_agent_net(
                inputs_obs=self.ph_train_obs,
                seq_max=self.fix_seq_length + 1,  # important: full sequence
                obs_lengths=self.ph_train_obs_len,
                hidden_state_in=None,  # full trajectory, no need to carry hidden state
            )

        with tf.variable_scope("target_agent"):
            tar_agent_outs_tmp, _ = self.build_agent_net(
                inputs_obs=self.ph_train_obs,
                # fixed value, different between explore and train
                seq_max=self.fix_seq_length + 1,
                obs_lengths=self.ph_train_obs_len,
                hidden_state_in=None,
            )
            target_trajectory_agent_outs = tf.stop_gradient(tar_agent_outs_tmp)

        _eval_agent_paras = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope="eval_agent")
        _target_agent_paras = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope="target_agent")

        with tf.variable_scope("soft_replacement"):
            self.agent_train_replace_op = [
                tf.assign(t, e) for t, e in
                zip(_target_agent_paras, _eval_agent_paras)]
            self.agent_explore_replace_op = [
                tf.assign(t, e) for t, e in
                zip(self._explore_paras, _eval_agent_paras)]

        self._print_trainable_var_name(
            _eval_agent_paras=_eval_agent_paras,
            _target_agent_paras=_target_agent_paras,
            _explore_paras=self._explore_paras,
        )

        # Calculate estimated Q-values ----------------
        mac_out = tf.reshape(
            trajectory_agent_outs,
            [self.batch_size, self.fix_seq_length + 1, self.n_agents, -1],
        )
        logging.debug("mac_out: {}".format(mac_out))
        chosen_action_qvals = self.gather_custom(mac_out[:, :-1],
                                                 self.ph_actions)

        # Calculate the Q-values needed for the target -----------
        target_mac_out = tf.reshape(
            target_trajectory_agent_outs,
            [self.batch_size, self.fix_seq_length + 1, self.n_agents, -1],
        )
        target_mac_out = target_mac_out[:, 1:]

        # Mask out unavailable actions,
        # i.e. target_mac_out[avail_actions[:, 1:] == 0] = -9999999
        indices = tf.equal(self.ph_avail_action[:, 1:], 0)
        mask_val = tf.tile(
            [[[[-999999.0]]]],
            [
                self.batch_size,
                self.fix_seq_length,
                self.n_agents,
                self.avail_action_num,
            ],
        )
        logging.debug("indices:{}, mask_val:{}, target mac out:{}".format(
            indices, mask_val, target_mac_out))

        target_mac_out = tf.where(indices, mask_val, target_mac_out)

        if self.use_double_q:
            # Get actions that maximise the live Q (for double Q-learning)
            mac_out_detach = tf.stop_gradient(tf.identity(mac_out[:, 1:]))
            mac_out_detach = tf.where(indices, mask_val, mac_out_detach)
            cur_max_actions = tf.expand_dims(
                tf.argmax(mac_out_detach, axis=-1), -1)
            target_max_qvals = self.gather_custom(target_mac_out,
                                                  cur_max_actions)
        else:
            target_max_qvals = tf.reduce_max(target_mac_out, axis=[-1])

        # eval mixer ---------------
        with tf.variable_scope("eval_mixer"):
            self.q_tot = self._build_mix_net2(chosen_action_qvals,
                                              self.ph_train_states)

        with tf.variable_scope("target_mixer"):
            q_tot_tmp = self._build_mix_net2(target_max_qvals,
                                             self.ph_train_target_states)
            self.target_q_tot = tf.stop_gradient(q_tot_tmp)

        _eval_mix_paras = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope="eval_mixer")
        _target_mix_paras = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope="target_mixer")

        with tf.variable_scope("soft_replacement"):
            self.mix_train_replace_op = [
                tf.assign(t, e) for t, e in
                zip(_target_mix_paras, _eval_mix_paras)]
        self._print_trainable_var_name(_eval_mix_paras=_eval_mix_paras,
                                       _target_mix_paras=_target_mix_paras)

        # Calculate 1-step Q-learning targets
        targets = (self.ph_rewards +
                   self.gamma * (1.0 - self.ph_terminated) * self.target_q_tot)

        # TD-error
        td_error = self.q_tot - tf.stop_gradient(targets)

        # mask = mask.expand_as(td_error)  # FIXME: assumed to already have the same shape

        # Zero out the targets that came from padded data
        masked_td_error = tf.multiply(td_error, self.ph_mask)

        self.loss = (tf.reduce_sum(masked_td_error**2) /
                     tf.reduce_sum(self.ph_mask))

        # Optimise
        optimizer = tf.train.RMSPropOptimizer(
            self.lr, decay=0.95, epsilon=1.5e-7, centered=True)
        grads_and_vars = optimizer.compute_gradients(self.loss)
        capped_gvs = [(
            grad if grad is None else tf.clip_by_norm(
                grad, clip_norm=self.grad_norm_clip),
            var,
        ) for grad, var in grads_and_vars]
        self.grad_update = optimizer.apply_gradients(capped_gvs)
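In short, the training objective is the masked 1-step TD loss on the mixed value: y_t = r_t + gamma * (1 - done_t) * Q_tot_target(s_{t+1}), and L = sum((mask * (Q_tot - y))^2) / sum(mask). A minimal NumPy sketch of that loss (illustrative only; shapes, names, and values are assumptions, not from the original source):

import numpy as np

# Hypothetical flat tensors over batch * time steps.
q_tot = np.array([1.0, 0.5, 0.2])
target_q_tot = np.array([0.8, 0.7, 0.1])
rewards = np.array([0.0, 1.0, 0.0])
terminated = np.array([0.0, 0.0, 1.0])
mask = np.array([1.0, 1.0, 0.0])  # 0 marks padded steps
gamma = 0.99

targets = rewards + gamma * (1.0 - terminated) * target_q_tot
td_error = q_tot - targets        # targets are treated as constants
loss = np.sum((td_error * mask) ** 2) / np.sum(mask)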
def calc_entropy_loss(logic_outs):
    """Calculate entropy loss."""
    pi = tf.nn.softmax(logic_outs)
    log_pi = tf.nn.log_softmax(logic_outs)
    entropy_per_step = tf.reduce_sum(-pi * log_pi, axis=-1)
    return -tf.reduce_sum(entropy_per_step)
def calc_baseline_loss(advantages):
    """Calculate the baseline loss."""
    return 0.5 * tf.reduce_sum(tf.square(advantages))
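These three helpers are typically combined into a single actor-critic objective. A sketch of that wiring, assuming the caller already holds policy logits, sampled actions, advantages, and value-prediction errors; the tensor names and coefficients below are illustrative, not values from the original source:

# Hypothetical inputs; entropy_coef and baseline_coef are assumed hyperparameters.
entropy_coef, baseline_coef = 0.01, 0.5
total_loss = (calc_pi_loss(policy_logits, sampled_actions, advantages)
              + baseline_coef * calc_baseline_loss(value_targets - values)
              + entropy_coef * calc_entropy_loss(policy_logits))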
def entropy(self):
    """Return the entropy of the diagonal Gaussian: sum over dims of log_std + 0.5 * (log(2*pi) + 1)."""
    return tf.reduce_sum(self.log_std + 0.5 * (np.log(2.0 * np.pi) + 1.0),
                         axis=-1, keepdims=True)
def neglog_prob(self, x):
    """Return the negative log-density of x under the diagonal Gaussian."""
    return (0.5 * np.log(2.0 * np.pi) * tf.cast(tf.shape(x)[-1], tf.float32)
            + 0.5 * tf.reduce_sum(tf.square((x - self.mean) / self.std),
                                  axis=-1, keepdims=True)
            + tf.reduce_sum(self.log_std, axis=-1, keepdims=True))
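This is the standard negative log-density of a diagonal Gaussian, and for a single dimension it agrees with scipy.stats.norm. An illustrative check (not part of the original code; the sample values are arbitrary):

import numpy as np
from scipy.stats import norm

# Hypothetical 1-D check of the negative log-probability formula.
x, mu, std = 0.7, 0.2, 1.5
neglogp = 0.5 * np.log(2.0 * np.pi) + 0.5 * ((x - mu) / std) ** 2 + np.log(std)
assert np.isclose(neglogp, -norm.logpdf(x, loc=mu, scale=std))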