def _pMA_VPG_train(self, make_obs_ph_n, make_memory_ph_n, make_h_ph_n,
                   make_c_ph_n, make_act_ph_n, action_space_n,
                   make_return_ph_n, p_func, grad_norm_clipping=None,
                   scope="agent", reuse=None):
    with tf.compat.v1.variable_scope(scope, reuse=reuse):
        # create distributions
        act_pdtype_n = [
            make_pdtype(act_space, self.args.env_type)
            for act_space in action_space_n
        ]

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        memory_ph_n = make_memory_ph_n
        h_ph_n = make_h_ph_n
        c_ph_n = make_c_ph_n
        act_onehot_ph = make_act_ph_n[self.p_index]
        return_ph = make_return_ph_n[self.p_index]

        # Feed all inputs; let the model decide what to use.
        p_input = self._p_setup_placeholder(obs_ph_n, h_ph_n, c_ph_n, memory_ph_n)
        p, enc_state, memory_state, attention, value = p_func(
            p_input, int(act_pdtype_n[self.p_index].param_shape()[0]),
            self.p_index, self.n, self.n_start, self.n_end,
            scope="p_func", reuse=reuse)

        # wrap parameters in a distribution and sample
        act_pd = act_pdtype_n[self.p_index].pdfromflat(p)
        act_soft_sample = act_pd.sample(noise=False)
        # draw a discrete action from the last timestep's output
        act_onehot = tf.multinomial(act_soft_sample[-1, :, :], 1)

        # remove the time dimension from the value output before storing it in the buffer
        value_out = tf.squeeze(value, axis=0)
        # align the return placeholder with the value head's trailing singleton dimension
        return_ph_expd = tf.expand_dims(return_ph, axis=-1)

        # Value network optimization
        target = return_ph_expd - value
        loss_v = tf.reduce_mean(tf.math.squared_difference(value, return_ph_expd))
        optim_v = self.optimizer.minimize(loss_v, name='adam_optim_v')

        # Policy network optimization: advantage-weighted cross-entropy, with the
        # advantage treated as a constant via stop_gradient
        target_pi = tf.squeeze(target, axis=-1)
        loss_pi = tf.reduce_mean(
            tf.stop_gradient(target_pi) *
            tf.nn.sparse_softmax_cross_entropy_with_logits(logits=p,
                                                           labels=act_onehot_ph),
            name='loss_pi')
        optim_pi = self.optimizer.minimize(loss_pi, name='adam_optim_pi')

        # Create callable functions that run the optimizer ops with a feed dict
        update_pi = optim_pi
        update_v = optim_v
        train_v = U.function(inputs=p_input + [return_ph], outputs=update_v)
        train_pi = U.function(inputs=p_input + [act_onehot_ph] + [return_ph],
                              outputs=update_pi)
        act = U.function(inputs=p_input,
                         outputs=[act_onehot, act_soft_sample, enc_state,
                                  memory_state, attention, value_out])

        return act, train_pi, train_v
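# Illustration only (not part of the trainer): a NumPy sketch of the two losses assembled
# above, under an assumed (traj_len, batch, ...) layout. The value head regresses onto the
# empirical return, and the policy loss weights the per-action cross-entropy by the
# advantage (return - value), which tf.stop_gradient treats as a constant so gradients
# flow only through the policy logits. Names and shapes here are assumptions.
import numpy as np

def _vpg_losses_sketch(logits, actions, returns, values):
    """logits: (T, B, A); actions: (T, B) int; returns: (T, B); values: (T, B, 1)."""
    returns = returns[..., None]                          # align with the value head output
    loss_v = np.mean((values - returns) ** 2)             # value-network regression loss
    advantage = (returns - values)[..., 0]                # stop-gradient target in the graph
    z = logits - logits.max(axis=-1, keepdims=True)       # numerically stable log-softmax
    log_probs = z - np.log(np.exp(z).sum(axis=-1, keepdims=True))
    ce = -np.take_along_axis(log_probs, actions[..., None], axis=-1)[..., 0]
    loss_pi = np.mean(advantage * ce)                     # REINFORCE-with-baseline objective
    return loss_pi, loss_v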
def make_update_exp(vals, target_vals, polyak):
    # Soft (Polyak) target update: with the coefficient flipped below, each call moves the
    # target variables a small step towards the source variables.
    polyak = 1.0 - polyak
    expression = []
    for var, var_target in zip(sorted(vals, key=lambda v: v.name),
                               sorted(target_vals, key=lambda v: v.name)):
        expression.append(var_target.assign(polyak * var_target + (1.0 - polyak) * var))
    expression = tf.group(*expression)
    return U.function([], [], updates=[expression])
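# Illustration only: make_update_exp flips the coefficient it is given, so calling it with
# polyak=0.01 produces the usual slow-moving target, target <- 0.99 * target + 0.01 * source.
# A NumPy sketch of the same soft update on plain arrays (assumed names):
import numpy as np

def _soft_update_sketch(source_vars, target_vars, polyak=0.01):
    """Return softly updated copies of target_vars, mirroring make_update_exp."""
    return [(1.0 - polyak) * t + polyak * s
            for s, t in zip(source_vars, target_vars)]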
def _qMA_train(self, critic_index, make_obs_ph_n, make_q_gru_ph_n,
               make_act_ph_n, make_target_ph, importance_in, q_func,
               optimizer, grad_norm_clipping=None, scope="trainer", reuse=None):
    with tf.compat.v1.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_ph_n = make_obs_ph_n
        q_gru_ph_n = make_q_gru_ph_n
        act_ph_n = make_act_ph_n
        target_ph = make_target_ph

        q_input = self._q_setup_placeholder(obs_ph_n, q_gru_ph_n[self.p_index], act_ph_n)
        q, q_gru_state = q_func(q_input, self.n, self.args,
                                scope="q_func" + str(self.p_index) + str(critic_index),
                                reuse=reuse, p_index=self.p_index)
        q_func_vars = U.scope_vars(
            U.absolute_scope_name("q_func" + str(self.p_index) + str(critic_index)))

        q = q[:, :, 0]
        q_error = q - target_ph
        if self.args.PER_sampling:
            # weight the squared TD errors by the importance-sampling weights from the PER buffer
            q_loss = tf.reduce_mean(tf.multiply(tf.square(q_error), importance_in))
        else:
            # average over the trajectory dimension, then over the batch dimension
            q_loss_t = tf.reduce_sum(tf.square(q_error), axis=1) / tf.to_float(
                self.args.len_traj_update)
            q_loss = tf.reduce_sum(q_loss_t, axis=0) / tf.to_float(self.args.batch_size)

        # viscosity solution to the Bellman differential equation in place of an initial condition
        q_reg = tf.reduce_mean(tf.square(q))
        loss = q_loss + 1e-3 * q_reg

        optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars,
                                            grad_norm_clipping)

        # Create callable functions
        if self.args.PER_sampling:
            train = U.function(inputs=q_input + [importance_in] + [target_ph],
                               outputs=[loss, q_error],
                               updates=[optimize_expr])
        else:
            train = U.function(inputs=q_input + [target_ph],
                               outputs=[loss, q_error],
                               updates=[optimize_expr])
        q_values = U.function(q_input, [q, q_gru_state])

        # target network
        target_q, t_q_gru_state = q_func(
            q_input, self.n, self.args,
            scope="target_q_func" + str(self.p_index) + str(critic_index),
            reuse=reuse, p_index=self.p_index)
        target_q = target_q[:, :, 0]
        target_q_func_vars = U.scope_vars(
            U.absolute_scope_name("target_q_func" + str(self.p_index) + str(critic_index)))
        update_target_q = make_update_exp(q_func_vars, target_q_func_vars,
                                          self.args.polyak)

        target_q_values = U.function(q_input, [target_q, t_q_gru_state])

        return train, update_target_q, {
            'q_values': q_values,
            'target_q_values': target_q_values
        }
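# Illustration only: a NumPy sketch of the two critic-loss branches above (names and the
# (batch, traj_len) layout are assumptions). With PER, squared TD errors are weighted by the
# importance-sampling weights; otherwise they are averaged over the trajectory axis and then
# over the batch axis. Both branches add the small mean(Q^2) "viscosity" regularizer.
import numpy as np

def _critic_loss_sketch(q, target, importance=None, reg_coef=1e-3):
    """q, target: (batch, traj_len); importance: broadcastable to q, or None."""
    sq_err = (q - target) ** 2
    if importance is not None:
        q_loss = np.mean(sq_err * importance)                 # PER: importance-weighted mean
    else:
        q_loss_t = np.sum(sq_err, axis=1) / sq_err.shape[1]   # average over the trajectory
        q_loss = np.sum(q_loss_t, axis=0) / sq_err.shape[0]   # then over the batch
    return q_loss + reg_coef * np.mean(q ** 2)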
def _pMA_train(self, make_obs_ph_n, make_memory_ph_n, make_q_gru_ph_n,
               make_h_ph_n, make_c_ph_n, make_act_ph_n, action_space_n,
               importance_in, p_func, q_func, optimizer,
               grad_norm_clipping=None, scope="agent", reuse=None):
    with tf.compat.v1.variable_scope(scope, reuse=reuse):
        # create distributions
        act_pdtype_n = [
            make_pdtype(act_space, self.args.env_type)
            for act_space in action_space_n
        ]

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        memory_ph_n = make_memory_ph_n
        h_ph_n = make_h_ph_n
        c_ph_n = make_c_ph_n
        act_ph_n = make_act_ph_n
        q_gru_ph = make_q_gru_ph_n[self.p_index]

        # Feed all inputs; let the model decide what to use.
        p_input = self._p_setup_placeholder(obs_ph_n, h_ph_n, c_ph_n,
                                            memory_ph_n, q_gru_ph)
        p, enc_state, memory_state, attention = p_func(
            p_input, int(act_pdtype_n[self.p_index].param_shape()[0]),
            self.p_index, self.n, self.n_start, self.n_end,
            scope="p_func", reuse=reuse)

        # Get the parent/relative scope of the policy function
        p_func_vars = U.scope_vars(U.absolute_scope_name("p_func"))

        # wrap parameters in a distribution
        act_pd = act_pdtype_n[self.p_index].pdfromflat(p)
        if not (self.args.benchmark or self.args.display):
            # add Gumbel noise to the prediction for regularization
            act_sample = act_pd.sample()
        else:
            # softmax only, no noise
            act_sample = act_pd.sample(noise=False)

        # Calculate the policy regularizer, averaged over trajectory and batch
        p_reg_t = tf.reduce_sum(tf.square(act_pd.flatparam()),
                                axis=1) / tf.to_float(self.args.len_traj_update)
        p_reg = tf.reduce_sum(p_reg_t, axis=0) / tf.to_float(self.args.batch_size)

        act_input_n = act_ph_n + []
        # Use the Gumbel-sampled output for calculating the policy loss
        act_input_n[self.p_index] = act_pd.sample()
        q_input = self._qp_setup_placeholder(p_input, act_input_n)

        q, state = q_func(q_input, self.n, self.args,
                          scope="q_func" + str(self.p_index) + "1",
                          reuse=True, p_index=self.p_index)
        q = q[:, :, 0]

        # Calculate the policy loss, averaged over trajectory and batch
        pg_loss_t = tf.reduce_sum(q, axis=1) / tf.to_float(self.args.len_traj_update)
        pg_loss = -tf.reduce_sum(pg_loss_t, axis=0) / tf.to_float(self.args.batch_size)

        loss = pg_loss + p_reg * 1e-3

        optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars,
                                            grad_norm_clipping)

        # Create callable functions
        # policy network
        train = U.function(inputs=p_input + act_ph_n,
                           outputs=loss,
                           updates=[optimize_expr])
        act = U.function(inputs=p_input,
                         outputs=[act_sample, enc_state, memory_state, attention])
        p_values = U.function(p_input, p)

        # target network (use one-hot for discrete actions)
        target_p, t_enc_state, target_memory, _ = p_func(
            p_input, int(act_pdtype_n[self.p_index].param_shape()[0]),
            self.p_index, self.n, self.n_start, self.n_end,
            scope="target_p_func", reuse=reuse)
        target_p_func_vars = U.scope_vars(U.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars,
                                          self.polyak)

        target_act_sample = act_pdtype_n[self.p_index].pdfromflat(
            target_p).sample(noise=True)
        target_act = U.function(
            inputs=p_input,
            outputs=[target_act_sample, t_enc_state, target_memory])

        return act, train, update_target_p, {
            'p_values': p_values,
            'target_act': target_act
        }
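# Illustration only (assumed names and a (batch, traj_len) layout): a NumPy sketch of the
# actor objective above. The critic scores the policy's own sampled actions, the score is
# averaged over trajectory and batch and negated, and a small penalty on the raw policy
# parameters (approximated here as a mean of squares) is added for regularization.
import numpy as np

def _actor_loss_sketch(q_pi, policy_params, reg_coef=1e-3):
    """q_pi: (batch, traj_len) critic values of on-policy actions; policy_params: logits array."""
    pg_loss_t = np.sum(q_pi, axis=1) / q_pi.shape[1]      # average over the trajectory
    pg_loss = -np.sum(pg_loss_t, axis=0) / q_pi.shape[0]  # then over the batch, negated
    p_reg = np.mean(policy_params ** 2)                   # rough stand-in for the p_reg term
    return pg_loss + reg_coef * p_reg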