    def _pMA_VPG_train(self, make_obs_ph_n, make_memory_ph_n, make_h_ph_n, make_c_ph_n, make_act_ph_n, action_space_n, make_return_ph_n, p_func, grad_norm_clipping=None, scope="agent", reuse=None):
        with tf.compat.v1.variable_scope(scope, reuse=reuse):
            # create distributions
            act_pdtype_n = [make_pdtype(act_space, self.args.env_type) for act_space in action_space_n]

            # set up placeholders
            obs_ph_n = make_obs_ph_n
            memory_ph_n = make_memory_ph_n
            h_ph_n = make_h_ph_n
            c_ph_n = make_c_ph_n
            act_onehot_ph = make_act_ph_n[self.p_index]
            return_ph = make_return_ph_n[self.p_index]

            # Feed all inputs. Let the model decide what to choose.
            p_input = self._p_setup_placeholder(obs_ph_n, h_ph_n, c_ph_n, memory_ph_n)
            p, enc_state, memory_state, attention, value = p_func(p_input, int(act_pdtype_n[self.p_index].param_shape()[0]), self.p_index, self.n, self.n_start, self.n_end, scope="p_func", reuse=reuse)

            # wrap parameters in distribution and sample
            act_pd = act_pdtype_n[self.p_index].pdfromflat(p)
            act_soft_sample = act_pd.sample(noise=False)
            # Draw one discrete action index per batch element from the last timestep's output.
            # tf.random.categorical replaces the deprecated tf.multinomial with identical behaviour.
            act_onehot = tf.random.categorical(act_soft_sample[-1, :, :], 1)
            value_out = tf.squeeze(value, axis=0)  # remove the time dimension from the output for storing in the buffer

            return_ph_expd = tf.expand_dims(return_ph, axis=-1)
            # Value Network Optimization
            # value = tf.squeeze(value, axis=-1)  # remove the last single out dim, to align with return (#trajlen, #batch)
            target = return_ph_expd - value
            loss_v = tf.reduce_mean(tf.math.squared_difference(value, return_ph_expd))
            optim_v = self.optimizer.minimize(loss_v, name='adam_optim_v')

            # Policy Network Optimization
            target_pi = tf.squeeze(target, axis=-1)
            loss_pi = tf.reduce_mean(tf.stop_gradient(target_pi) * tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=p, labels=act_onehot_ph), name='loss_pi')
            optim_pi = self.optimizer.minimize(loss_pi, name='adam_optim_pi')

            # Create callable functions
            # policy network
            # Use sess.run to feed the dictionary, since we are not calling it anywhere else.
            update_pi = optim_pi
            update_v = optim_v
            train_v = U.function(inputs=p_input + [return_ph], outputs=update_v)
            train_pi = U.function(inputs=p_input + [act_onehot_ph] + [return_ph], outputs=update_pi)
            act = U.function(inputs=p_input, outputs=[act_onehot, act_soft_sample, enc_state, memory_state, attention, value_out])

            return act, train_pi, train_v
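
# A minimal, self-contained sketch (not part of the original trainer) of the
# policy-gradient surrogate built above: the advantage (return minus the value
# baseline) is treated as a constant weight on the per-step cross-entropy
# between the policy logits and the actions that were actually taken. All
# names, shapes and values below are made up for illustration, and it assumes
# TF 2.x eager execution instead of the placeholder/U.function setup above.
import tensorflow as tf

logits = tf.constant([[2.0, 0.5, -1.0],
                      [0.1, 0.2, 0.3]])      # (batch, n_actions) policy output
actions = tf.constant([0, 2])                # integer indices of the taken actions
returns = tf.constant([1.0, 0.2])            # empirical returns
values = tf.constant([0.4, 0.5])             # value-network baseline

advantage = returns - values                 # plays the role of `target` above
neg_log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(
    logits=logits, labels=actions)
loss_pi = tf.reduce_mean(tf.stop_gradient(advantage) * neg_log_prob)
loss_v = tf.reduce_mean(tf.math.squared_difference(values, returns))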
def make_update_exp(vals, target_vals, polyak):
    polyak = 1.0 - polyak
    expression = []
    for var, var_target in zip(sorted(vals, key=lambda v: v.name), sorted(target_vals, key=lambda v: v.name)):
        expression.append(var_target.assign(polyak * var_target + (1.0-polyak) * var))
    expression = tf.group(*expression)
    return U.function([], [], updates=[expression])
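
# Minimal sketch of the soft update that make_update_exp builds: with a value
# tau passed as `polyak`, each call moves every target variable a fraction tau
# of the way toward its online counterpart,
#     target <- (1 - tau) * target + tau * online.
# The example below uses plain TF 2.x variables and a made-up tau instead of
# the U.function wrapper; it only illustrates the assign expression itself.
import tensorflow as tf

tau = 0.01                      # hypothetical value for the polyak argument
online = tf.Variable(1.0)
target = tf.Variable(0.0)

target.assign((1.0 - tau) * target + tau * online)
print(float(target))            # 0.01: the target moved 1% toward the online value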
Example #3
    def _qMA_train(self,
                   critic_index,
                   make_obs_ph_n,
                   make_q_gru_ph_n,
                   make_act_ph_n,
                   make_target_ph,
                   importance_in,
                   q_func,
                   optimizer,
                   grad_norm_clipping=None,
                   scope="trainer",
                   reuse=None):
        with tf.compat.v1.variable_scope(scope, reuse=reuse):
            # set up placeholders
            obs_ph_n = make_obs_ph_n
            q_gru_ph_n = make_q_gru_ph_n
            act_ph_n = make_act_ph_n
            target_ph = make_target_ph

            q_input = self._q_setup_placeholder(obs_ph_n,
                                                q_gru_ph_n[self.p_index],
                                                act_ph_n)

            q, q_gru_state = q_func(q_input,
                                    self.n,
                                    self.args,
                                    scope="q_func" + str(self.p_index) +
                                    str(critic_index),
                                    reuse=reuse,
                                    p_index=self.p_index)
            q_func_vars = U.scope_vars(
                U.absolute_scope_name("q_func" + str(self.p_index) +
                                      str(critic_index)))
            q = q[:, :, 0]

            q_error = q - target_ph
            if self.args.PER_sampling:
                q_loss = tf.reduce_mean(
                    tf.multiply(tf.square(q_error), importance_in))
            else:
                q_loss_t = tf.reduce_sum(tf.square(q_error),
                                         axis=1) / tf.cast(
                                             self.args.len_traj_update,
                                             tf.float32)
                q_loss = tf.reduce_sum(q_loss_t, axis=0) / tf.cast(
                    self.args.batch_size, tf.float32)
            # viscosity solution to Bellman differential equation in place of an initial condition
            q_reg = tf.reduce_mean(tf.square(q))
            loss = q_loss + 1e-3 * q_reg

            optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars,
                                                grad_norm_clipping)

            # Create callable functions
            if self.args.PER_sampling:
                train = U.function(inputs=q_input + [importance_in] +
                                   [target_ph],
                                   outputs=[loss, q_error],
                                   updates=[optimize_expr])
            else:
                train = U.function(inputs=q_input + [target_ph],
                                   outputs=[loss, q_error],
                                   updates=[optimize_expr])
            q_values = U.function(q_input, [q, q_gru_state])

            # target network
            target_q, t_q_gru_state = q_func(
                q_input,
                self.n,
                self.args,
                scope="target_q_func" + str(self.p_index) + str(critic_index),
                reuse=reuse,
                p_index=self.p_index)
            target_q = target_q[:, :, 0]
            target_q_func_vars = U.scope_vars(
                U.absolute_scope_name("target_q_func" + str(self.p_index) +
                                      str(critic_index)))
            update_target_q = make_update_exp(q_func_vars, target_q_func_vars,
                                              self.args.polyak)
            target_q_values = U.function(q_input, [target_q, t_q_gru_state])

            return train, update_target_q, {
                'q_values': q_values,
                'target_q_values': target_q_values
            }
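
# Minimal, self-contained sketch (made-up shapes, TF 2.x eager) of the two
# critic losses built above. Because the divisors match the tensor dimensions,
# the nested sum/divide of the non-PER branch is just the mean squared TD error
# over all timesteps and trajectories; the PER branch instead weights each
# squared error by its importance-sampling weight before averaging.
import tensorflow as tf

traj_len, batch_size = 8, 4
q = tf.random.normal([traj_len, batch_size])        # critic output
target = tf.random.normal([traj_len, batch_size])   # bootstrapped targets
importance = tf.ones([traj_len, batch_size])        # PER importance weights

td_error = q - target
per_loss = tf.reduce_mean(tf.square(td_error) * importance)
plain_loss = tf.reduce_sum(
    tf.reduce_sum(tf.square(td_error), axis=1) / float(traj_len),
    axis=0) / float(batch_size)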
Example #4
    def _pMA_train(self,
                   make_obs_ph_n,
                   make_memory_ph_n,
                   make_q_gru_ph_n,
                   make_h_ph_n,
                   make_c_ph_n,
                   make_act_ph_n,
                   action_space_n,
                   importance_in,
                   p_func,
                   q_func,
                   optimizer,
                   grad_norm_clipping=None,
                   scope="agent",
                   reuse=None):
        with tf.compat.v1.variable_scope(scope, reuse=reuse):
            # create distributions
            act_pdtype_n = [
                make_pdtype(act_space, self.args.env_type)
                for act_space in action_space_n
            ]

            # set up placeholders
            obs_ph_n = make_obs_ph_n
            memory_ph_n = make_memory_ph_n
            h_ph_n = make_h_ph_n
            c_ph_n = make_c_ph_n
            act_ph_n = make_act_ph_n
            q_gru_ph = make_q_gru_ph_n[self.p_index]

            # Feed all inputs. Let the model decide what to choose.
            p_input = self._p_setup_placeholder(obs_ph_n, h_ph_n, c_ph_n,
                                                memory_ph_n, q_gru_ph)
            p, enc_state, memory_state, attention = p_func(
                p_input,
                int(act_pdtype_n[self.p_index].param_shape()[0]),
                self.p_index,
                self.n,
                self.n_start,
                self.n_end,
                scope="p_func",
                reuse=reuse)
            # Get parent/relative scope of the policy function
            p_func_vars = U.scope_vars(U.absolute_scope_name("p_func"))

            # wrap parameters in distribution
            act_pd = act_pdtype_n[self.p_index].pdfromflat(p)

            if not (self.args.benchmark or self.args.display):
                # Add Gumbel noise to the prediction for regularization
                act_sample = act_pd.sample()
            else:
                # Only the softmax output, no noise
                act_sample = act_pd.sample(noise=False)

            # Calculate loss
            # p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))
            p_reg_t = tf.reduce_sum(tf.square(act_pd.flatparam()),
                                    axis=1) / tf.cast(
                                        self.args.len_traj_update, tf.float32)
            p_reg = tf.reduce_sum(p_reg_t, axis=0) / tf.cast(
                self.args.batch_size, tf.float32)

            act_input_n = act_ph_n + []
            # Use Gumbel Out for calculating policy loss
            act_input_n[self.p_index] = act_pd.sample()
            q_input = self._qp_setup_placeholder(p_input, act_input_n)

            q, state = q_func(q_input,
                              self.n,
                              self.args,
                              scope="q_func" + str(self.p_index) + "1",
                              reuse=True,
                              p_index=self.p_index)
            q = q[:, :, 0]
            # Calculate policy loss
            # pg_loss = -tf.reduce_mean(q)
            pg_loss_t = tf.reduce_sum(q, axis=1) / tf.cast(
                self.args.len_traj_update, tf.float32)
            pg_loss = -tf.reduce_sum(pg_loss_t, axis=0) / tf.cast(
                self.args.batch_size, tf.float32)
            loss = pg_loss + p_reg * 1e-3
            optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars,
                                                grad_norm_clipping)

            # Create callable functions
            # policy network
            train = U.function(inputs=p_input + act_ph_n,
                               outputs=loss,
                               updates=[optimize_expr])
            act = U.function(
                inputs=p_input,
                outputs=[act_sample, enc_state, memory_state, attention])
            p_values = U.function(p_input, p)

            # target network (Use one hot for discrete)
            target_p, t_enc_state, target_memory, _ = p_func(
                p_input,
                int(act_pdtype_n[self.p_index].param_shape()[0]),
                self.p_index,
                self.n,
                self.n_start,
                self.n_end,
                scope="target_p_func",
                reuse=reuse)
            target_p_func_vars = U.scope_vars(
                U.absolute_scope_name("target_p_func"))
            update_target_p = make_update_exp(p_func_vars, target_p_func_vars,
                                              self.polyak)

            # if self.args.env_type == "ic3net": noise_target = False
            target_act_sample = act_pdtype_n[self.p_index].pdfromflat(
                target_p).sample(noise=True)
            target_act = U.function(
                inputs=p_input,
                outputs=[target_act_sample, t_enc_state, target_memory])

            return act, train, update_target_p, {
                'p_values': p_values,
                'target_act': target_act
            }
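
# Minimal sketch (toy dense networks, TF 2.x eager) of the actor objective in
# _pMA_train: the actor's relaxed action is fed back into the critic and the
# actor is updated to maximize the critic's value, i.e. to minimize -mean(Q),
# plus a small squared-logit regularizer analogous to p_reg above. The
# recurrent/attention architecture, Gumbel sampling and U.* helpers of the
# original are deliberately omitted; everything below is illustrative only.
import tensorflow as tf

actor = tf.keras.Sequential([tf.keras.layers.Dense(16, activation="relu"),
                             tf.keras.layers.Dense(2)])
critic = tf.keras.Sequential([tf.keras.layers.Dense(16, activation="relu"),
                              tf.keras.layers.Dense(1)])
optimizer = tf.keras.optimizers.Adam(1e-3)

obs = tf.random.normal([32, 4])
with tf.GradientTape() as tape:
    logits = actor(obs)
    action = tf.nn.softmax(logits)              # stand-in for the relaxed action sample
    q_value = critic(tf.concat([obs, action], axis=-1))
    p_reg = tf.reduce_mean(tf.square(logits))
    pg_loss = -tf.reduce_mean(q_value) + 1e-3 * p_reg
grads = tape.gradient(pg_loss, actor.trainable_variables)
optimizer.apply_gradients(zip(grads, actor.trainable_variables))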