Ejemplo n.º 1
0
def get_perturbed_actor_updates(actor, perturbed_actor, param_noise_stddev, verbose=0):
    """
    get the actor update, with noise.

    :param actor: (str) the actor
    :param perturbed_actor: (str) the pertubed actor
    :param param_noise_stddev: (float) the std of the parameter noise
    :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug
    :return: (TensorFlow Operation) the update function
    """
    # TODO: simplify this to this:
    # assert len(actor.vars) == len(perturbed_actor.vars)
    # assert len(actor.perturbable_vars) == len(perturbed_actor.perturbable_vars)

    assert len(tf_util.get_globals_vars(actor)) == len(tf_util.get_globals_vars(perturbed_actor))
    assert len([var for var in tf_util.get_trainable_vars(actor) if 'LayerNorm' not in var.name]) == \
        len([var for var in tf_util.get_trainable_vars(perturbed_actor) if 'LayerNorm' not in var.name])

    updates = []
    for var, perturbed_var in zip(tf_util.get_globals_vars(actor), tf_util.get_globals_vars(perturbed_actor)):
        if var in [var for var in tf_util.get_trainable_vars(actor) if 'LayerNorm' not in var.name]:
            if verbose >= 2:
                logger.info('  {} <- {} + noise'.format(perturbed_var.name, var.name))
            updates.append(tf.assign(perturbed_var,
                                     var + tf.random_normal(tf.shape(var), mean=0., stddev=param_noise_stddev)))
        else:
            if verbose >= 2:
                logger.info('  {} <- {}'.format(perturbed_var.name, var.name))
            updates.append(tf.assign(perturbed_var, var))
    assert len(updates) == len(tf_util.get_globals_vars(actor))
    return tf.group(*updates)
Ejemplo n.º 2
0
    def setup_model(self):
        # prevent import loops
        from stable_baselines.gail.adversary import TransitionClassifier

        with SetVerbosity(self.verbose):

            assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the TRPO model must be " \
                                                               "an instance of common.policies.ActorCriticPolicy."

            self.nworkers = MPI.COMM_WORLD.Get_size()
            self.rank = MPI.COMM_WORLD.Get_rank()
            np.set_printoptions(precision=3)

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.sess = tf_util.single_threaded_session(graph=self.graph)

                if self.using_gail:
                    self.reward_giver = TransitionClassifier(
                        self.observation_space,
                        self.action_space,
                        self.hidden_size_adversary,
                        entcoeff=self.adversary_entcoeff)

                # Construct network for new policy
                self.policy_pi = self.policy(self.sess,
                                             self.observation_space,
                                             self.action_space,
                                             self.n_envs,
                                             1,
                                             None,
                                             reuse=False,
                                             **self.policy_kwargs)

                # Network for old policy
                with tf.variable_scope("oldpi", reuse=False):
                    old_policy = self.policy(self.sess,
                                             self.observation_space,
                                             self.action_space,
                                             self.n_envs,
                                             1,
                                             None,
                                             reuse=False,
                                             **self.policy_kwargs)

                with tf.variable_scope("loss", reuse=False):
                    atarg = tf.placeholder(dtype=tf.float32, shape=[
                        None
                    ])  # Target advantage function (if applicable)
                    ret = tf.placeholder(dtype=tf.float32,
                                         shape=[None])  # Empirical return

                    observation = self.policy_pi.obs_ph
                    action = self.policy_pi.pdtype.sample_placeholder([None])

                    kloldnew = old_policy.proba_distribution.kl(
                        self.policy_pi.proba_distribution)
                    ent = self.policy_pi.proba_distribution.entropy()
                    meankl = tf.reduce_mean(kloldnew)
                    meanent = tf.reduce_mean(ent)
                    entbonus = self.entcoeff * meanent

                    vferr = tf.reduce_mean(
                        tf.square(self.policy_pi.value_fn[:, 0] - ret))

                    # advantage * pnew / pold
                    ratio = tf.exp(
                        self.policy_pi.proba_distribution.logp(action) -
                        old_policy.proba_distribution.logp(action))
                    surrgain = tf.reduce_mean(ratio * atarg)

                    optimgain = surrgain + entbonus
                    losses = [optimgain, meankl, entbonus, surrgain, meanent]
                    self.loss_names = [
                        "optimgain", "meankl", "entloss", "surrgain", "entropy"
                    ]

                    dist = meankl

                    all_var_list = tf_util.get_trainable_vars("model")
                    var_list = [
                        v for v in all_var_list
                        if "/vf" not in v.name and "/q/" not in v.name
                    ]
                    vf_var_list = [
                        v for v in all_var_list
                        if "/pi" not in v.name and "/logstd" not in v.name
                    ]

                    self.get_flat = tf_util.GetFlat(var_list, sess=self.sess)
                    self.set_from_flat = tf_util.SetFromFlat(var_list,
                                                             sess=self.sess)

                    klgrads = tf.gradients(dist, var_list)
                    flat_tangent = tf.placeholder(dtype=tf.float32,
                                                  shape=[None],
                                                  name="flat_tan")
                    shapes = [var.get_shape().as_list() for var in var_list]
                    start = 0
                    tangents = []
                    for shape in shapes:
                        var_size = tf_util.intprod(shape)
                        tangents.append(
                            tf.reshape(flat_tangent[start:start + var_size],
                                       shape))
                        start += var_size
                    gvp = tf.add_n([
                        tf.reduce_sum(grad * tangent)
                        for (grad, tangent) in zipsame(klgrads, tangents)
                    ])  # pylint: disable=E1111
                    fvp = tf_util.flatgrad(gvp, var_list)

                    tf.summary.scalar('entropy_loss', meanent)
                    tf.summary.scalar('policy_gradient_loss', optimgain)
                    tf.summary.scalar('value_function_loss', surrgain)
                    tf.summary.scalar('approximate_kullback-leiber', meankl)
                    tf.summary.scalar(
                        'loss',
                        optimgain + meankl + entbonus + surrgain + meanent)

                    self.assign_old_eq_new = \
                        tf_util.function([], [], updates=[tf.assign(oldv, newv) for (oldv, newv) in
                                                          zipsame(tf_util.get_globals_vars("oldpi"),
                                                                  tf_util.get_globals_vars("model"))])
                    self.compute_losses = tf_util.function(
                        [observation, old_policy.obs_ph, action, atarg],
                        losses)
                    self.compute_fvp = tf_util.function([
                        flat_tangent, observation, old_policy.obs_ph, action,
                        atarg
                    ], fvp)
                    self.compute_vflossandgrad = tf_util.function(
                        [observation, old_policy.obs_ph, ret],
                        tf_util.flatgrad(vferr, vf_var_list))

                    @contextmanager
                    def timed(msg):
                        if self.rank == 0 and self.verbose >= 1:
                            print(colorize(msg, color='magenta'))
                            start_time = time.time()
                            yield
                            print(
                                colorize("done in {:.3f} seconds".format(
                                    (time.time() - start_time)),
                                         color='magenta'))
                        else:
                            yield

                    def allmean(arr):
                        assert isinstance(arr, np.ndarray)
                        out = np.empty_like(arr)
                        MPI.COMM_WORLD.Allreduce(arr, out, op=MPI.SUM)
                        out /= self.nworkers
                        return out

                    tf_util.initialize(sess=self.sess)

                    th_init = self.get_flat()
                    MPI.COMM_WORLD.Bcast(th_init, root=0)
                    self.set_from_flat(th_init)

                with tf.variable_scope("Adam_mpi", reuse=False):
                    self.vfadam = MpiAdam(vf_var_list, sess=self.sess)
                    if self.using_gail:
                        self.d_adam = MpiAdam(
                            self.reward_giver.get_trainable_variables(),
                            sess=self.sess)
                        self.d_adam.sync()
                    self.vfadam.sync()

                with tf.variable_scope("input_info", reuse=False):
                    tf.summary.scalar('discounted_rewards',
                                      tf.reduce_mean(ret))
                    tf.summary.scalar('learning_rate',
                                      tf.reduce_mean(self.vf_stepsize))
                    tf.summary.scalar('advantage', tf.reduce_mean(atarg))
                    tf.summary.scalar('kl_clip_range',
                                      tf.reduce_mean(self.max_kl))

                    if self.full_tensorboard_log:
                        tf.summary.histogram('discounted_rewards', ret)
                        tf.summary.histogram('learning_rate', self.vf_stepsize)
                        tf.summary.histogram('advantage', atarg)
                        tf.summary.histogram('kl_clip_range', self.max_kl)
                        if tf_util.is_image(self.observation_space):
                            tf.summary.image('observation', observation)
                        else:
                            tf.summary.histogram('observation', observation)

                self.timed = timed
                self.allmean = allmean

                self.step = self.policy_pi.step
                self.proba_step = self.policy_pi.proba_step
                self.initial_state = self.policy_pi.initial_state

                self.params = find_trainable_variables("model")
                if self.using_gail:
                    self.params.extend(
                        self.reward_giver.get_trainable_variables())

                self.summary = tf.summary.merge_all()

                self.compute_lossandgrad = \
                    tf_util.function([observation, old_policy.obs_ph, action, atarg, ret],
                                     [self.summary, tf_util.flatgrad(optimgain, var_list)] + losses)
Ejemplo n.º 3
0
    def setup_model(self):
        with SetVerbosity(self.verbose):

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.set_random_seed(self.seed)
                self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess,
                                                 graph=self.graph)

                # Construct network for new policy
                self.policy_pi = self.policy(self.sess,
                                             self.observation_space,
                                             self.action_space,
                                             self.n_envs,
                                             1,
                                             None,
                                             reuse=False,
                                             **self.policy_kwargs)

                # Network for old policy
                with tf.compat.v1.variable_scope("oldpi", reuse=False):
                    old_pi = self.policy(self.sess,
                                         self.observation_space,
                                         self.action_space,
                                         self.n_envs,
                                         1,
                                         None,
                                         reuse=False,
                                         **self.policy_kwargs)

                with tf.compat.v1.variable_scope("loss", reuse=False):
                    # Target advantage function (if applicable)
                    atarg = tf.compat.v1.placeholder(dtype=tf.float32,
                                                     shape=[None])

                    # Empirical return
                    ret = tf.compat.v1.placeholder(dtype=tf.float32,
                                                   shape=[None])

                    # learning rate multiplier, updated with schedule
                    lrmult = tf.compat.v1.placeholder(name='lrmult',
                                                      dtype=tf.float32,
                                                      shape=[])

                    # Annealed cliping parameter epislon
                    clip_param = self.clip_param * lrmult

                    obs_ph = self.policy_pi.obs_ph
                    action_ph = self.policy_pi.pdtype.sample_placeholder(
                        [None])

                    kloldnew = old_pi.proba_distribution.kl(
                        self.policy_pi.proba_distribution)
                    ent = self.policy_pi.proba_distribution.entropy()
                    meankl = tf.reduce_mean(input_tensor=kloldnew)
                    meanent = tf.reduce_mean(input_tensor=ent)
                    pol_entpen = (-self.entcoeff) * meanent

                    # pnew / pold
                    ratio = tf.exp(
                        self.policy_pi.proba_distribution.logp(action_ph) -
                        old_pi.proba_distribution.logp(action_ph))

                    # surrogate from conservative policy iteration
                    surr1 = ratio * atarg
                    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param,
                                             1.0 + clip_param) * atarg

                    # PPO's pessimistic surrogate (L^CLIP)
                    pol_surr = -tf.reduce_mean(
                        input_tensor=tf.minimum(surr1, surr2))
                    vf_loss = tf.reduce_mean(
                        input_tensor=tf.square(self.policy_pi.value_flat -
                                               ret))
                    total_loss = pol_surr + pol_entpen + vf_loss
                    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
                    self.loss_names = [
                        "pol_surr", "pol_entpen", "vf_loss", "kl", "ent"
                    ]

                    tf.compat.v1.summary.scalar('entropy_loss', pol_entpen)
                    tf.compat.v1.summary.scalar('policy_gradient_loss',
                                                pol_surr)
                    tf.compat.v1.summary.scalar('value_function_loss', vf_loss)
                    tf.compat.v1.summary.scalar('approximate_kullback-leibler',
                                                meankl)
                    tf.compat.v1.summary.scalar('clip_factor', clip_param)
                    tf.compat.v1.summary.scalar('loss', total_loss)

                    self.params = tf_util.get_trainable_vars("model")

                    self.assign_old_eq_new = tf_util.function(
                        [], [],
                        updates=[
                            tf.compat.v1.assign(oldv, newv)
                            for (oldv, newv) in zipsame(
                                tf_util.get_globals_vars("oldpi"),
                                tf_util.get_globals_vars("model"))
                        ])

                with tf.compat.v1.variable_scope("Adam_mpi", reuse=False):
                    self.adam = MpiAdam(self.params,
                                        epsilon=self.adam_epsilon,
                                        sess=self.sess)

                with tf.compat.v1.variable_scope("input_info", reuse=False):
                    tf.compat.v1.summary.scalar(
                        'discounted_rewards', tf.reduce_mean(input_tensor=ret))
                    tf.compat.v1.summary.scalar(
                        'learning_rate',
                        tf.reduce_mean(input_tensor=self.optim_stepsize))
                    tf.compat.v1.summary.scalar(
                        'advantage', tf.reduce_mean(input_tensor=atarg))
                    tf.compat.v1.summary.scalar(
                        'clip_range',
                        tf.reduce_mean(input_tensor=self.clip_param))

                    if self.full_tensorboard_log:
                        tf.compat.v1.summary.histogram('discounted_rewards',
                                                       ret)
                        tf.compat.v1.summary.histogram('learning_rate',
                                                       self.optim_stepsize)
                        tf.compat.v1.summary.histogram('advantage', atarg)
                        tf.compat.v1.summary.histogram('clip_range',
                                                       self.clip_param)
                        if tf_util.is_image(self.observation_space):
                            tf.compat.v1.summary.image('observation', obs_ph)
                        else:
                            tf.compat.v1.summary.histogram(
                                'observation', obs_ph)

                self.step = self.policy_pi.step
                self.proba_step = self.policy_pi.proba_step
                self.initial_state = self.policy_pi.initial_state

                tf_util.initialize(sess=self.sess)

                self.summary = tf.compat.v1.summary.merge_all()

                self.lossandgrad = tf_util.function(
                    [obs_ph, old_pi.obs_ph, action_ph, atarg, ret, lrmult],
                    [self.summary,
                     tf_util.flatgrad(total_loss, self.params)] + losses)
                self.compute_losses = tf_util.function(
                    [obs_ph, old_pi.obs_ph, action_ph, atarg, ret, lrmult],
                    losses)
    def setup_model(self):
        # prevent import loops

        with SetVerbosity(self.verbose):

            assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the TRPO model must be " \
                                                               "an instance of common.policies.ActorCriticPolicy."

            self.nworkers = MPI.COMM_WORLD.Get_size()
            print("number of workers are", self.nworkers)
            self.rank = MPI.COMM_WORLD.Get_rank()
            np.set_printoptions(precision=3)

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.sess = tf_util.single_threaded_session(graph=self.graph)
                self._setup_learn(self.seed)

                # Construct network for new policy
                self.policy_pi = self.policy(self.sess,
                                             self.observation_space,
                                             self.action_space,
                                             self.n_envs,
                                             1,
                                             None,
                                             reuse=False,
                                             **self.policy_kwargs)

                # Network for old policy
                with tf.variable_scope("oldpi", reuse=False):
                    old_policy = self.policy(self.sess,
                                             self.observation_space,
                                             self.action_space,
                                             self.n_envs,
                                             1,
                                             None,
                                             reuse=False,
                                             **self.policy_kwargs)
                # Network for phi
                with tf.variable_scope("phi", reuse=False):
                    self.policy_phi = self.policy(self.sess,
                                                  self.observation_space,
                                                  self.action_space,
                                                  self.n_envs,
                                                  1,
                                                  None,
                                                  reuse=False,
                                                  **self.policy_kwargs)
                # Network for phi old
                with tf.variable_scope("oldphi", reuse=False):
                    self.policy_phi_old = self.policy(self.sess,
                                                      self.observation_space,
                                                      self.action_space,
                                                      self.n_envs,
                                                      1,
                                                      None,
                                                      reuse=False,
                                                      **self.policy_kwargs)

                with tf.variable_scope("loss", reuse=False):
                    atarg = tf.placeholder(dtype=tf.float32, shape=[
                        None
                    ])  # Target advantage function (if applicable)
                    ret = tf.placeholder(dtype=tf.float32,
                                         shape=[None])  # Empirical return

                    observation = self.policy_pi.obs_ph
                    action = self.policy_pi.pdtype.sample_placeholder([None])

                    kloldnew = old_policy.proba_distribution.kl(
                        self.policy_pi.proba_distribution)
                    #kloldnew = self.policy_pi.proba_distribution.kl(old_policy.proba_distribution)
                    ent = self.policy_pi.proba_distribution.entropy()
                    meankl = tf.reduce_mean(kloldnew)
                    meanent = tf.reduce_mean(ent)
                    entbonus = self.entcoeff * meanent

                    vferr = tf.reduce_mean(
                        tf.square(self.policy_pi.value_flat - ret))
                    vf_phi_err = tf.reduce_mean(
                        tf.square(self.policy_phi.value_flat - ret))
                    vf_phi_old_err = tf.reduce_mean(
                        tf.square(self.policy_phi_old.value_flat))

                    # advantage * pnew / pold
                    ratio = tf.exp(
                        self.policy_pi.proba_distribution.logp(action) -
                        old_policy.proba_distribution.logp(action))
                    surrgain = tf.reduce_mean(ratio * atarg)

                    optimgain = surrgain + entbonus
                    losses = [optimgain, meankl, entbonus, surrgain, meanent]
                    self.loss_names = [
                        "optimgain", "meankl", "entloss", "surrgain", "entropy"
                    ]

                    dist = meankl

                    all_var_list = tf_util.get_trainable_vars("model")
                    var_list = [
                        v for v in all_var_list
                        if "/vf" not in v.name and "/q/" not in v.name
                    ]
                    vf_var_list = [
                        v for v in all_var_list
                        if "/pi" not in v.name and "/logstd" not in v.name
                    ]
                    all_var_oldpi_list = tf_util.get_trainable_vars("oldpi")
                    var_oldpi_list = [
                        v for v in all_var_oldpi_list
                        if "/vf" not in v.name and "/q/" not in v.name
                    ]

                    all_var_phi_list = tf_util.get_trainable_vars("phi")
                    vf_phi_var_list = [
                        v for v in all_var_phi_list if "/pi" not in v.name
                        and "/logstd" not in v.name and "/q" not in v.name
                    ]
                    all_var_phi_old_list = tf_util.get_trainable_vars("oldphi")
                    vf_phi_old_var_list = [
                        v for v in all_var_phi_old_list if "/pi" not in v.name
                        and "/logstd" not in v.name and "/q" not in v.name
                    ]
                    #print("vars", vf_var_list)
                    self.policy_vars = all_var_list
                    self.oldpolicy_vars = all_var_oldpi_list
                    print("all var list", all_var_list)
                    print("phi vars", vf_phi_var_list)
                    print("phi old vars", vf_phi_old_var_list)

                    self.get_flat = tf_util.GetFlat(var_list, sess=self.sess)
                    self.set_from_flat = tf_util.SetFromFlat(var_list,
                                                             sess=self.sess)

                    klgrads = tf.gradients(dist, var_list)
                    flat_tangent = tf.placeholder(dtype=tf.float32,
                                                  shape=[None],
                                                  name="flat_tan")
                    shapes = [var.get_shape().as_list() for var in var_list]
                    start = 0
                    tangents = []
                    for shape in shapes:
                        var_size = tf_util.intprod(shape)
                        tangents.append(
                            tf.reshape(flat_tangent[start:start + var_size],
                                       shape))
                        start += var_size
                    gvp = tf.add_n([
                        tf.reduce_sum(grad * tangent)
                        for (grad, tangent) in zipsame(klgrads, tangents)
                    ])  # pylint: disable=E1111
                    fvp = tf_util.flatgrad(gvp, var_list)

                    tf.summary.scalar('entropy_loss', meanent)
                    tf.summary.scalar('policy_gradient_loss', optimgain)
                    tf.summary.scalar('value_function_loss', surrgain)
                    tf.summary.scalar('approximate_kullback-leibler', meankl)
                    tf.summary.scalar(
                        'loss',
                        optimgain + meankl + entbonus + surrgain + meanent)

                    self.assign_old_eq_new = \
                        tf_util.function([], [], updates=[tf.assign(oldv, newv) for (oldv, newv) in
                                                          zipsame(tf_util.get_globals_vars("oldpi"),
                                                                  tf_util.get_globals_vars("model"))])
                    self.compute_losses = tf_util.function(
                        [observation, old_policy.obs_ph, action, atarg],
                        losses)
                    self.compute_fvp = tf_util.function([
                        flat_tangent, observation, old_policy.obs_ph, action,
                        atarg
                    ], fvp)
                    self.compute_vflossandgrad = tf_util.function(
                        [observation, old_policy.obs_ph, ret],
                        tf_util.flatgrad(vferr, vf_var_list))
                    self.compute_vf_phi_lossandgrad = tf_util.function(
                        [observation, self.policy_phi.obs_ph, ret],
                        tf_util.flatgrad(vf_phi_err, vf_phi_var_list))
                    self.compute_vf_loss = tf_util.function(
                        [observation, old_policy.obs_ph, ret], vferr)
                    self.compute_vf_phi_loss = tf_util.function(
                        [observation, self.policy_phi.obs_ph, ret], vf_phi_err)
                    #self.compute_vf_phi_old_loss = tf_util.function([self.policy_phi_old.obs_ph], vf_phi_old_err)
                    #self.phi_old_obs = np.array([-0.012815  , -0.02076313,  0.07524705,  0.09407324,  0.0901745 , -0.09339058,  0.03544853, -0.03297224])
                    #self.phi_old_obs = self.phi_old_obs.reshape((1, 8))

                    update_phi_old_expr = []
                    for var, var_target in zip(
                            sorted(vf_phi_var_list, key=lambda v: v.name),
                            sorted(vf_phi_old_var_list, key=lambda v: v.name)):
                        update_phi_old_expr.append(var_target.assign(var))
                    update_phi_old_expr = tf.group(*update_phi_old_expr)

                    self.update_phi_old = tf_util.function(
                        [], [], updates=[update_phi_old_expr])

                    @contextmanager
                    def timed(msg):
                        if self.rank == 0 and self.verbose >= 1:
                            print(colorize(msg, color='magenta'))
                            start_time = time.time()
                            yield
                            print(
                                colorize("done in {:.3f} seconds".format(
                                    (time.time() - start_time)),
                                         color='magenta'))
                        else:
                            yield

                    @contextmanager
                    def temp_seed(seed):
                        state = np.random.get_state()
                        np.random.seed(seed)
                        try:
                            yield
                        finally:
                            np.random.set_state(state)

                    def allmean(arr):
                        assert isinstance(arr, np.ndarray)
                        out = np.empty_like(arr)
                        MPI.COMM_WORLD.Allreduce(arr, out, op=MPI.SUM)
                        out /= self.nworkers
                        return out

                    tf_util.initialize(sess=self.sess)

                    th_init = self.get_flat()
                    MPI.COMM_WORLD.Bcast(th_init, root=0)
                    self.set_from_flat(th_init)

                with tf.variable_scope("Adam_mpi", reuse=False):
                    self.vfadam = MpiAdam(vf_var_list, sess=self.sess)
                    self.vf_phi_adam = MpiAdam(vf_phi_var_list, sess=self.sess)
                    self.vfadam.sync()
                    self.vf_phi_adam.sync()

                with tf.variable_scope("input_info", reuse=False):
                    tf.summary.scalar('discounted_rewards',
                                      tf.reduce_mean(ret))
                    tf.summary.scalar('learning_rate',
                                      tf.reduce_mean(self.vf_stepsize))
                    tf.summary.scalar('advantage', tf.reduce_mean(atarg))
                    tf.summary.scalar('kl_clip_range',
                                      tf.reduce_mean(self.max_kl))

                self.timed = timed
                self.allmean = allmean
                self.temp_seed = temp_seed

                self.step = self.policy_pi.step
                self.proba_step = self.policy_pi.proba_step
                self.initial_state = self.policy_pi.initial_state

                self.params = tf_util.get_trainable_vars(
                    "model") + tf_util.get_trainable_vars("oldpi")

                self.summary = tf.summary.merge_all()

                self.compute_lossandgrad = \
                    tf_util.function([observation, old_policy.obs_ph, action, atarg, ret],
                                     [self.summary, tf_util.flatgrad(optimgain, var_list)] + losses)
Ejemplo n.º 5
0
    def setup_model(self):
        # prevent import loops
        from stable_baselines.gail.adversary import TransitionClassifier

        with SetVerbosity(self.verbose):

            assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the TRPO model must be " \
                                                               "an instance of common.policies.ActorCriticPolicy."

            self.nworkers = MPI.COMM_WORLD.Get_size()
            self.rank = MPI.COMM_WORLD.Get_rank()
            np.set_printoptions(precision=3)

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.set_random_seed(self.seed)
                self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph)

                if self.using_gail:
                    self.reward_giver = TransitionClassifier(self.observation_space, self.action_space,
                                                             self.hidden_size_adversary,
                                                             entcoeff=self.adversary_entcoeff)

                # Penalty related variable
                with tf.variable_scope('penalty'):
                    cur_cost_ph = tf.placeholder(dtype=tf.float32, shape=[None]) # episodic cost

                    param_init = np.log(max(np.exp(self.penalty_init) - 1, 1e-8))
                    penalty_param = tf.get_variable('penalty_param',
                                                    initializer=float(param_init),
                                                    trainable=True,
                                                    dtype=tf.float32)
                penalty = tf.nn.softplus(penalty_param)
                penalty_loss = tf.reduce_mean(-penalty_param * (cur_cost_ph - self.cost_lim))

                # Construct network for new policy
                self.policy_pi = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1,
                                             None, reuse=False, **self.policy_kwargs)

                # Network for old policy
                with tf.variable_scope("oldpi", reuse=False):
                    old_policy = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1,
                                             None, reuse=False, **self.policy_kwargs)
                
                # # Network for safety value function
                # with tf.variable_Scope("vc",reuse=False):
                #     self.cost_value = MLPValue(self.sess, self.observation_spacem, self.n_envs, 1, None)
                
                with tf.variable_scope("loss", reuse=False):
                    atarg = tf.placeholder(dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
                    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return
                    catarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target cost advantage function
                    cret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical cost

                    observation = self.policy_pi.obs_ph
                    action = self.policy_pi.pdtype.sample_placeholder([None])

                    kloldnew = old_policy.proba_distribution.kl(self.policy_pi.proba_distribution)
                    ent = self.policy_pi.proba_distribution.entropy()
                    meankl = tf.reduce_mean(kloldnew)
                    meanent = tf.reduce_mean(ent)
                    entbonus = self.entcoeff * meanent

                    vferr = tf.reduce_mean(tf.square(self.policy_pi.value_flat - ret))
                    vcerr = tf.reduce_mean(tf.square(self.policy_pi.vcf_flat - cret))
                    
                    # advantage * pnew / pold
                    ratio = tf.exp(self.policy_pi.proba_distribution.logp(action) -
                                   old_policy.proba_distribution.logp(action))
                    surrgain = tf.reduce_mean(ratio * atarg)
                    # Surrogate for cost function
                    surrcost = tf.reduce_mean(ratio * catarg)

                    optimgain = surrgain + entbonus
                    # Include surr_cost in pi_objective
                    optimgain -= penalty * surrcost
                    optimgain /= (1 + penalty)
                    # # Loss function for pi is negative of pi_objective
                    # optimgain = -optimgain # Should we??
                    
                    losses = [optimgain, meankl, entbonus, surrgain, meanent, surrcost]
                    self.loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy", "surrcost"]

                    dist = meankl

                    all_var_list = tf_util.get_trainable_vars("model")
                    var_list = [v for v in all_var_list if "/vf" not in v.name and "/q/" not in v.name and "/vcf" not in v.name] # policy parameters
                    vf_var_list = [v for v in all_var_list if "/pi" not in v.name and "/logstd" not in v.name and "/vcf" not in v.name] # value parameters
                    vcf_var_list = [v for v in all_var_list if "/pi" not in v.name and "/logstd" not in v.name and "/vf" not in v.name] # cost value parameters

                    self.get_flat = tf_util.GetFlat(var_list, sess=self.sess)
                    self.set_from_flat = tf_util.SetFromFlat(var_list, sess=self.sess)

                    klgrads = tf.gradients(dist, var_list)
                    flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan")
                    shapes = [var.get_shape().as_list() for var in var_list]
                    start = 0
                    tangents = []
                    for shape in shapes:
                        var_size = tf_util.intprod(shape)
                        tangents.append(tf.reshape(flat_tangent[start: start + var_size], shape))
                        start += var_size
                    gvp = tf.add_n([tf.reduce_sum(grad * tangent)
                                    for (grad, tangent) in zipsame(klgrads, tangents)])  # pylint: disable=E1111
                    # Fisher vector products
                    fvp = tf_util.flatgrad(gvp, var_list)

                    tf.summary.scalar('penalty_loss', penalty_loss)
                    tf.summary.scalar('entropy_loss', meanent)
                    tf.summary.scalar('policy_gradient_loss', optimgain)
                    tf.summary.scalar('value_function_loss', surrgain)
                    tf.summary.scalar('constraint_cost_function_loss', surrcost)
                    tf.summary.scalar('approximate_kullback-leibler', meankl)
                    tf.summary.scalar('loss', optimgain + meankl + entbonus + surrgain + meanent + surrcost + penalty_loss)

                    self.assign_old_eq_new = \
                        tf_util.function([], [], updates=[tf.assign(oldv, newv) for (oldv, newv) in
                                                          zipsame(tf_util.get_globals_vars("oldpi"),
                                                                  tf_util.get_globals_vars("model"))])
                    self.compute_losses = tf_util.function([observation, old_policy.obs_ph, action, atarg, catarg], losses)
                    self.compute_fvp = tf_util.function([flat_tangent, observation, old_policy.obs_ph, action, atarg, catarg],
                                                        fvp) # Why need all inputs? Might for implementation easiness
                    # self.compute_vflossandgrad = tf_util.function([observation, old_policy.obs_ph, ret],
                    #                                               tf_util.flatgrad(vferr, vf_var_list)) # Why need old_policy.obs_ph? Doesn't seem to be used
                    # self.compute_vcflossandgrad = tf_util.function([observation, old_policy.obs_ph, cret],
                    #                                               tf_util.flatgrad(vcerr, vcf_var_list))
                    self.compute_vflossandgrad = tf_util.function([observation, old_policy.obs_ph, ret, cret],
                                                                  [tf_util.flatgrad(vferr, vf_var_list), tf_util.flatgrad(vcerr, vcf_var_list)])
                    self.compute_lagrangiangrad = tf_util.function([cur_cost_ph],
                                                                   tf_util.flatgrad(penalty_loss, [penalty_param]))

                    @contextmanager
                    def timed(msg):
                        if self.rank == 0 and self.verbose >= 1:
                            print(colorize(msg, color='magenta'))
                            start_time = time.time()
                            yield
                            print(colorize("done in {:.3f} seconds".format((time.time() - start_time)),
                                           color='magenta'))
                        else:
                            yield

                    def allmean(arr):
                        assert isinstance(arr, np.ndarray)
                        out = np.empty_like(arr)
                        MPI.COMM_WORLD.Allreduce(arr, out, op=MPI.SUM)
                        out /= self.nworkers
                        return out

                    tf_util.initialize(sess=self.sess)

                    th_init = self.get_flat()
                    MPI.COMM_WORLD.Bcast(th_init, root=0)
                    self.set_from_flat(th_init)

                with tf.variable_scope("Adam_mpi", reuse=False):
                    self.vfadam = MpiAdam(vf_var_list, sess=self.sess)
                    if self.using_gail:
                        self.d_adam = MpiAdam(self.reward_giver.get_trainable_variables(), sess=self.sess)
                        self.d_adam.sync()
                    self.vfadam.sync()
                    # optimizer for constraint costs value function
                    self.vcadam = MpiAdam(vcf_var_list, sess=self.sess)
                    self.vcadam.sync()
                    # optimizer for lagragian value of safe RL
                    self.penaltyadam = MpiAdam([penalty_param], sess=self.sess)
                    self.penaltyadam.sync()

                with tf.variable_scope("input_info", reuse=False):
                    tf.summary.scalar('discounted_rewards', tf.reduce_mean(ret))
                    tf.summary.scalar('discounted_costs', tf.reduce_mean(cret))
                    tf.summary.scalar('learning_rate', tf.reduce_mean(self.vf_stepsize))
                    tf.summary.scalar('advantage', tf.reduce_mean(atarg))
                    tf.summary.scalar('cost_advantage', tf.reduce_mean(catarg))
                    tf.summary.scalar('kl_clip_range', tf.reduce_mean(self.max_kl))

                    if self.full_tensorboard_log:
                        tf.summary.histogram('discounted_rewards', ret)
                        tf.summary.histogram('discounted_rewards', cret)
                        tf.summary.histogram('learning_rate', self.vf_stepsize)
                        tf.summary.histogram('penalty_learning_rate', self.penalty_lr)
                        tf.summary.histogram('advantage', atarg)
                        tf.summary.histogram('cost_advantage', catarg)
                        tf.summary.histogram('kl_clip_range', self.max_kl)
                        if tf_util.is_image(self.observation_space):
                            tf.summary.image('observation', observation)
                        else:
                            tf.summary.histogram('observation', observation)

                self.timed = timed
                self.allmean = allmean

                self.step = self.policy_pi.step
                self.proba_step = self.policy_pi.proba_step
                self.initial_state = self.policy_pi.initial_state

                self.params = tf_util.get_trainable_vars("model") + tf_util.get_trainable_vars("oldpi")
                if self.using_gail:
                    self.params.extend(self.reward_giver.get_trainable_variables())

                self.summary = tf.summary.merge_all()

                self.compute_lossandgrad = \
                    tf_util.function([observation, old_policy.obs_ph, action, atarg, catarg, ret, cret, cur_cost_ph],
                                     [self.summary, tf_util.flatgrad(optimgain, var_list)] + losses)
Ejemplo n.º 6
0
    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=100,
              tb_log_name="TRPO"):
        with SetVerbosity(self.verbose), TensorboardWriter(
                self.graph, self.tensorboard_log, tb_log_name) as writer:
            self._setup_learn(seed)

            with self.sess.as_default():
                seg_gen = traj_segment_generator(
                    self.policy_pi,
                    self.env,
                    self.timesteps_per_batch,
                    reward_giver=self.reward_giver,
                    gail=self.using_gail)

                episodes_so_far = 0
                timesteps_so_far = 0
                iters_so_far = 0
                t_start = time.time()
                lenbuffer = deque(
                    maxlen=40)  # rolling buffer for episode lengths
                rewbuffer = deque(
                    maxlen=40)  # rolling buffer for episode rewards
                self.episode_reward = np.zeros((self.n_envs, ))

                true_rewbuffer = None
                if self.using_gail:
                    true_rewbuffer = deque(maxlen=40)
                    #  Stats not used for now
                    #  g_loss_stats = Stats(loss_names)
                    #  d_loss_stats = Stats(reward_giver.loss_name)
                    #  ep_stats = Stats(["True_rewards", "Rewards", "Episode_length"])

                    # if provide pretrained weight
                    if self.pretrained_weight is not None:
                        tf_util.load_state(
                            self.pretrained_weight,
                            var_list=tf_util.get_globals_vars("pi"),
                            sess=self.sess)

                while True:
                    if callback is not None:
                        # Only stop training if return value is False, not when it is None. This is for backwards
                        # compatibility with callbacks that have no return statement.
                        if callback(locals(), globals()) == False:
                            break
                    if total_timesteps and timesteps_so_far >= total_timesteps:
                        break

                    logger.log("********** Iteration %i ************" %
                               iters_so_far)

                    def fisher_vector_product(vec):
                        return self.allmean(
                            self.compute_fvp(
                                vec, *fvpargs,
                                sess=self.sess)) + self.cg_damping * vec

                    # ------------------ Update G ------------------
                    logger.log("Optimizing Policy...")
                    # g_step = 1 when not using GAIL
                    mean_losses = None
                    vpredbefore = None
                    tdlamret = None
                    observation = None
                    action = None
                    seg = None
                    for k in range(self.g_step):
                        with self.timed("sampling"):
                            seg = seg_gen.__next__()
                        add_vtarg_and_adv(seg, self.gamma, self.lam)
                        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
                        observation, action, atarg, tdlamret = seg["ob"], seg[
                            "ac"], seg["adv"], seg["tdlamret"]
                        vpredbefore = seg[
                            "vpred"]  # predicted value function before udpate
                        atarg = (atarg - atarg.mean()) / atarg.std(
                        )  # standardized advantage function estimate

                        # true_rew is the reward without discount
                        if writer is not None:
                            self.episode_reward = total_episode_reward_logger(
                                self.episode_reward, seg["true_rew"].reshape(
                                    (self.n_envs, -1)), seg["dones"].reshape(
                                        (self.n_envs, -1)), writer,
                                timesteps_so_far)

                        args = seg["ob"], seg["ob"], seg["ac"], atarg
                        fvpargs = [arr[::5] for arr in args]

                        self.assign_old_eq_new(sess=self.sess)

                        with self.timed("computegrad"):
                            steps = timesteps_so_far + (k + 1) * (
                                seg["total_timestep"] / self.g_step)
                            run_options = tf.RunOptions(
                                trace_level=tf.RunOptions.FULL_TRACE)
                            run_metadata = tf.RunMetadata()
                            # run loss backprop with summary, and save the metadata (memory, compute time, ...)
                            if writer is not None:
                                summary, grad, *lossbefore = self.compute_lossandgrad(
                                    *args,
                                    tdlamret,
                                    sess=self.sess,
                                    options=run_options,
                                    run_metadata=run_metadata)
                                writer.add_run_metadata(
                                    run_metadata, 'step%d' % steps)
                                writer.add_summary(summary, steps)
                            else:
                                _, grad, *lossbefore = self.compute_lossandgrad(
                                    *args,
                                    tdlamret,
                                    sess=self.sess,
                                    options=run_options,
                                    run_metadata=run_metadata)

                        lossbefore = self.allmean(np.array(lossbefore))
                        grad = self.allmean(grad)
                        if np.allclose(grad, 0):
                            logger.log("Got zero gradient. not updating")
                        else:
                            with self.timed("cg"):
                                stepdir = conjugate_gradient(
                                    fisher_vector_product,
                                    grad,
                                    cg_iters=self.cg_iters,
                                    verbose=self.rank == 0
                                    and self.verbose >= 1)
                            assert np.isfinite(stepdir).all()
                            shs = .5 * stepdir.dot(
                                fisher_vector_product(stepdir))
                            # abs(shs) to avoid taking square root of negative values
                            lagrange_multiplier = np.sqrt(
                                abs(shs) / self.max_kl)
                            # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
                            fullstep = stepdir / lagrange_multiplier
                            expectedimprove = grad.dot(fullstep)
                            surrbefore = lossbefore[0]
                            stepsize = 1.0
                            thbefore = self.get_flat()
                            thnew = None
                            for _ in range(10):
                                thnew = thbefore + fullstep * stepsize
                                self.set_from_flat(thnew)
                                mean_losses = surr, kl_loss, *_ = self.allmean(
                                    np.array(
                                        self.compute_losses(*args,
                                                            sess=self.sess)))
                                improve = surr - surrbefore
                                logger.log("Expected: %.3f Actual: %.3f" %
                                           (expectedimprove, improve))
                                if not np.isfinite(mean_losses).all():
                                    logger.log(
                                        "Got non-finite value of losses -- bad!"
                                    )
                                elif kl_loss > self.max_kl * 1.5:
                                    logger.log(
                                        "violated KL constraint. shrinking step."
                                    )
                                elif improve < 0:
                                    logger.log(
                                        "surrogate didn't improve. shrinking step."
                                    )
                                else:
                                    logger.log("Stepsize OK!")
                                    break
                                stepsize *= .5
                            else:
                                logger.log("couldn't compute a good step")
                                self.set_from_flat(thbefore)
                            if self.nworkers > 1 and iters_so_far % 20 == 0:
                                # list of tuples
                                paramsums = MPI.COMM_WORLD.allgather(
                                    (thnew.sum(), self.vfadam.getflat().sum()))
                                assert all(
                                    np.allclose(ps, paramsums[0])
                                    for ps in paramsums[1:])

                        with self.timed("vf"):
                            for _ in range(self.vf_iters):
                                for (mbob, mbret) in dataset.iterbatches(
                                    (seg["ob"], seg["tdlamret"]),
                                        include_final_partial_batch=False,
                                        batch_size=128):
                                    grad = self.allmean(
                                        self.compute_vflossandgrad(
                                            mbob, mbob, mbret, sess=self.sess))
                                    self.vfadam.update(grad, self.vf_stepsize)

                    for (loss_name, loss_val) in zip(self.loss_names,
                                                     mean_losses):
                        logger.record_tabular(loss_name, loss_val)

                    logger.record_tabular(
                        "ev_tdlam_before",
                        explained_variance(vpredbefore, tdlamret))

                    if self.using_gail:
                        # ------------------ Update D ------------------
                        logger.log("Optimizing Discriminator...")
                        logger.log(fmt_row(13, self.reward_giver.loss_name))
                        ob_expert, ac_expert = self.expert_dataset.get_next_batch(
                            len(observation))
                        batch_size = len(observation) // self.d_step
                        d_losses = [
                        ]  # list of tuples, each of which gives the loss for a minibatch
                        for ob_batch, ac_batch in dataset.iterbatches(
                            (observation, action),
                                include_final_partial_batch=False,
                                batch_size=batch_size):
                            ob_expert, ac_expert = self.expert_dataset.get_next_batch(
                                len(ob_batch))
                            # update running mean/std for reward_giver
                            if hasattr(self.reward_giver, "obs_rms"):
                                self.reward_giver.obs_rms.update(
                                    np.concatenate((ob_batch, ob_expert), 0))
                            *newlosses, grad = self.reward_giver.lossandgrad(
                                ob_batch, ac_batch, ob_expert, ac_expert)
                            self.d_adam.update(self.allmean(grad),
                                               self.d_stepsize)
                            d_losses.append(newlosses)
                        logger.log(fmt_row(13, np.mean(d_losses, axis=0)))

                        lrlocal = (seg["ep_lens"], seg["ep_rets"],
                                   seg["ep_true_rets"])  # local values
                        listoflrpairs = MPI.COMM_WORLD.allgather(
                            lrlocal)  # list of tuples
                        lens, rews, true_rets = map(flatten_lists,
                                                    zip(*listoflrpairs))
                        true_rewbuffer.extend(true_rets)
                    else:
                        lrlocal = (seg["ep_lens"], seg["ep_rets"]
                                   )  # local values
                        listoflrpairs = MPI.COMM_WORLD.allgather(
                            lrlocal)  # list of tuples
                        lens, rews = map(flatten_lists, zip(*listoflrpairs))
                    lenbuffer.extend(lens)
                    rewbuffer.extend(rews)

                    logger.record_tabular("EpLenMean", np.mean(lenbuffer))
                    logger.record_tabular("EpRewMean", np.mean(rewbuffer))
                    if self.using_gail:
                        logger.record_tabular("EpTrueRewMean",
                                              np.mean(true_rewbuffer))
                    logger.record_tabular("EpThisIter", len(lens))
                    episodes_so_far += len(lens)
                    timesteps_so_far += seg["total_timestep"]
                    iters_so_far += 1

                    logger.record_tabular("EpisodesSoFar", episodes_so_far)
                    logger.record_tabular("TimestepsSoFar", timesteps_so_far)
                    logger.record_tabular("TimeElapsed", time.time() - t_start)

                    if self.verbose >= 1 and self.rank == 0:
                        logger.dump_tabular()

        return self
Ejemplo n.º 7
0
    def setup_model(self):
        with SetVerbosity(self.verbose):

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.set_random_seed(self.seed)
                self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess,
                                                 graph=self.graph)

                # Construct network for new policy
                self.policy_pi = self.policy(self.sess,
                                             self.observation_space,
                                             self.action_space,
                                             self.n_envs,
                                             1,
                                             None,
                                             reuse=False,
                                             **self.policy_kwargs)

                # Network for old policy
                with tf.variable_scope("oldpi", reuse=False):
                    old_pi = self.policy(self.sess,
                                         self.observation_space,
                                         self.action_space,
                                         self.n_envs,
                                         1,
                                         None,
                                         reuse=False,
                                         **self.policy_kwargs)

                with tf.variable_scope("loss", reuse=False):
                    self.grad_inverter = grad_inverter(
                        [self.action_space.high, self.action_space.low])
                    # Target advantage function (if applicable)
                    atarg = tf.placeholder(dtype=tf.float32, shape=[None])

                    # Empirical return
                    ret = tf.placeholder(dtype=tf.float32, shape=[None])

                    # learning rate multiplier, updated with schedule
                    lrmult = tf.placeholder(name='lrmult',
                                            dtype=tf.float32,
                                            shape=[])

                    # Annealed cliping parameter epislon
                    clip_param = self.clip_param * lrmult

                    obs_ph = self.policy_pi.obs_ph
                    action_ph = self.policy_pi.pdtype.sample_placeholder(
                        [None])

                    if debug:
                        action_ph_val = tf.Print(
                            action_ph, [
                                action_ph,
                            ],
                            '\n\n ====================Unclipped action in: \n',
                            summarize=-1)
                        action_ph_val = tf.Print(
                            action_ph_val, [],
                            '\n ======================================== \n',
                            summarize=-1)

                    kloldnew = old_pi.proba_distribution.kl(
                        self.policy_pi.proba_distribution)
                    # old_logstd = old_pi.proba_distribution.logstd
                    # new_logstd = self.policy_pi.proba_distribution.logstd
                    # old_std = old_pi.proba_distribution.std
                    # new_std = self.policy_pi.proba_distribution.std
                    ent = self.policy_pi.proba_distribution.entropy()
                    meankl = tf.reduce_mean(kloldnew)
                    # meankl = tf.Print(meankl, [meankl,], "kl value: ")
                    # meankl_log = tf.Print(meankl, [old_logstd,], "high kl, old logstd value: ", summarize=-1)
                    # meankl_log = tf.Print(meankl_log, [new_logstd,], "high kl, new logstd value: ", summarize=-1)
                    # meankl_log = tf.Print(meankl_log, [old_std,], "high kl, old std value: ", summarize=-1)
                    # meankl_log = tf.Print(meankl_log, [new_std,], "high kl, new std value: ", summarize=-1)
                    # meanklvalue_ = tf.where(
                    #     tf.greater(meankl, tf.constant(1, dtype = tf.float32)),
                    #     meankl_log,
                    #     meankl)
                    meanent = tf.reduce_mean(ent)
                    pol_entpen = (-self.entcoeff) * meanent

                    # pnew / pold
                    if debug:
                        old_logp = old_pi.proba_distribution.logp(
                            action_ph_val)
                        old_mean = old_pi.proba_distribution.mode()
                        old_std = old_pi.proba_distribution.std
                        old_logp = tf.Print(old_logp, [
                            old_logp,
                        ],
                                            '======  OLD logp: \n',
                                            summarize=-1)
                        old_logp = tf.Print(old_logp, [], '\n', summarize=-1)
                        old_logp = tf.Print(old_logp, [
                            old_mean,
                        ],
                                            '======  OLD mean: \n',
                                            summarize=-1)
                        old_logp = tf.Print(old_logp, [], '\n', summarize=-1)
                        old_logp = tf.Print(old_logp, [
                            old_std,
                        ],
                                            '======  OLD std: \n',
                                            summarize=-1)
                        old_logp = tf.Print(old_logp, [], '\n', summarize=-1)
                        now_logp = self.policy_pi.proba_distribution.logp(
                            action_ph_val)
                        now_mean = self.policy_pi.proba_distribution.mode()
                        now_std = self.policy_pi.proba_distribution.std
                        now_logp = tf.Print(now_logp, [
                            now_logp,
                        ],
                                            '======  NOW logp: \n',
                                            summarize=-1)
                        now_logp = tf.Print(now_logp, [], '\n', summarize=-1)
                        now_logp = tf.Print(now_logp, [
                            now_mean,
                        ],
                                            '======  NOW mean: \n',
                                            summarize=-1)
                        now_logp = tf.Print(now_logp, [], '\n', summarize=-1)
                        now_logp = tf.Print(now_logp, [
                            now_std,
                        ],
                                            '======  NOW std: \n',
                                            summarize=-1)
                        now_logp = tf.Print(now_logp, [], '\n', summarize=-1)
                    else:
                        now_logp = self.policy_pi.proba_distribution.logp(
                            action_ph)
                        old_logp = old_pi.proba_distribution.logp(action_ph)

                    ratio = tf.exp(now_logp - old_logp)
                    if debug:
                        ratio = tf.Print(ratio, [
                            ratio,
                        ],
                                         'ratio: \n',
                                         summarize=-1)
                        ratio = tf.Print(ratio, [], '\n', summarize=-1)
                    # surrogate from conservative policy iteration
                    surr1 = ratio * atarg
                    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param,
                                             1.0 + clip_param) * atarg

                    # PPO's pessimistic surrogate (L^CLIP)
                    pol_surr = -tf.reduce_mean(tf.minimum(surr1, surr2))
                    vf_loss = tf.reduce_mean(
                        tf.square(self.policy_pi.value_flat - ret))
                    total_loss = pol_surr + pol_entpen + vf_loss
                    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
                    # losses = [pol_surr, pol_entpen, vf_loss, meanklvalue_, meanent]
                    self.loss_names = [
                        "pol_surr", "pol_entpen", "vf_loss", "kl", "ent"
                    ]

                    tf.summary.scalar('entropy_loss', pol_entpen)
                    tf.summary.scalar('policy_gradient_loss', pol_surr)
                    tf.summary.scalar('value_function_loss', vf_loss)
                    tf.summary.scalar('approximate_kullback-leibler', meankl)
                    tf.summary.scalar('clip_factor', clip_param)
                    tf.summary.scalar('loss', total_loss)

                    self.params = tf_util.get_trainable_vars("model")

                    self.assign_old_eq_new = tf_util.function(
                        [], [],
                        updates=[
                            tf.assign(oldv, newv) for (oldv, newv) in zipsame(
                                tf_util.get_globals_vars("oldpi"),
                                tf_util.get_globals_vars("model"))
                        ])

                with tf.variable_scope("Adam_mpi", reuse=False):
                    self.adam = MpiAdam(self.params,
                                        epsilon=self.adam_epsilon,
                                        sess=self.sess)

                with tf.variable_scope("input_info", reuse=False):
                    tf.summary.scalar('discounted_rewards',
                                      tf.reduce_mean(ret))
                    tf.summary.scalar('learning_rate',
                                      tf.reduce_mean(self.optim_stepsize))
                    tf.summary.scalar('advantage', tf.reduce_mean(atarg))
                    tf.summary.scalar('clip_range',
                                      tf.reduce_mean(self.clip_param))

                    if self.full_tensorboard_log:
                        tf.summary.histogram('discounted_rewards', ret)
                        tf.summary.histogram('learning_rate',
                                             self.optim_stepsize)
                        tf.summary.histogram('advantage', atarg)
                        tf.summary.histogram('clip_range', self.clip_param)
                        if tf_util.is_image(self.observation_space):
                            tf.summary.image('observation', obs_ph)
                        else:
                            tf.summary.histogram('observation', obs_ph)

                self.step = self.policy_pi.step
                self.proba_step = self.policy_pi.proba_step
                self.initial_state = self.policy_pi.initial_state

                tf_util.initialize(sess=self.sess)

                self.summary = tf.summary.merge_all()

                self.lossandgrad = tf_util.function(
                    [obs_ph, old_pi.obs_ph, action_ph, atarg, ret, lrmult],
                    [self.summary,
                     tf_util.flatgrad(total_loss, self.params)] + losses)
                self.compute_losses = tf_util.function(
                    [obs_ph, old_pi.obs_ph, action_ph, atarg, ret, lrmult],
                    losses)
Ejemplo n.º 8
0
    def setup_model(self):
        with SetVerbosity(self.verbose):

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.sess = tf_util.single_threaded_session(graph=self.graph)

                # Construct network for new policy
                with tf.variable_scope("pi", reuse=False):
                    self.policy_pi = self.policy(self.sess,
                                                 self.observation_space,
                                                 self.action_space,
                                                 self.n_envs,
                                                 1,
                                                 None,
                                                 reuse=False)

                # Network for old policy
                with tf.variable_scope("oldpi", reuse=False):
                    old_pi = self.policy(self.sess,
                                         self.observation_space,
                                         self.action_space,
                                         self.n_envs,
                                         1,
                                         None,
                                         reuse=False)

                # Target advantage function (if applicable)
                atarg = tf.placeholder(dtype=tf.float32, shape=[None])

                # Empirical return
                ret = tf.placeholder(dtype=tf.float32, shape=[None])

                # learning rate multiplier, updated with schedule
                lrmult = tf.placeholder(name='lrmult',
                                        dtype=tf.float32,
                                        shape=[])

                # Annealed cliping parameter epislon
                clip_param = self.clip_param * lrmult

                obs_ph = self.policy_pi.obs_ph
                action_ph = self.policy_pi.pdtype.sample_placeholder([None])

                kloldnew = old_pi.proba_distribution.kl(
                    self.policy_pi.proba_distribution)
                ent = self.policy_pi.proba_distribution.entropy()
                meankl = tf.reduce_mean(kloldnew)
                meanent = tf.reduce_mean(ent)
                pol_entpen = (-self.entcoeff) * meanent

                # pnew / pold
                ratio = tf.exp(
                    self.policy_pi.proba_distribution.logp(action_ph) -
                    old_pi.proba_distribution.logp(action_ph))

                # surrogate from conservative policy iteration
                surr1 = ratio * atarg
                surr2 = tf.clip_by_value(ratio, 1.0 - clip_param,
                                         1.0 + clip_param) * atarg

                # PPO's pessimistic surrogate (L^CLIP)
                pol_surr = -tf.reduce_mean(tf.minimum(surr1, surr2))
                vf_loss = tf.reduce_mean(
                    tf.square(self.policy_pi.value_fn[:, 0] - ret))
                total_loss = pol_surr + pol_entpen + vf_loss
                losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
                self.loss_names = [
                    "pol_surr", "pol_entpen", "vf_loss", "kl", "ent"
                ]

                self.params = tf_util.get_trainable_vars("pi")
                self.lossandgrad = tf_util.function(
                    [obs_ph, old_pi.obs_ph, action_ph, atarg, ret, lrmult],
                    losses + [tf_util.flatgrad(total_loss, self.params)])
                self.adam = MpiAdam(self.params,
                                    epsilon=self.adam_epsilon,
                                    sess=self.sess)

                self.assign_old_eq_new = tf_util.function(
                    [], [],
                    updates=[
                        tf.assign(oldv, newv) for (
                            oldv,
                            newv) in zipsame(tf_util.get_globals_vars("oldpi"),
                                             tf_util.get_globals_vars("pi"))
                    ])
                self.compute_losses = tf_util.function(
                    [obs_ph, old_pi.obs_ph, action_ph, atarg, ret, lrmult],
                    losses)

                self.step = self.policy_pi.step
                self.proba_step = self.policy_pi.proba_step
                self.initial_state = self.policy_pi.initial_state

                tf_util.initialize(sess=self.sess)
Ejemplo n.º 9
0
    def setup_model(self):
        # prevent import loops
        from stable_baselines.gail.adversary import TransitionClassifier
        from stable_baselines.mdal.adversary import TabularAdversaryTF, NeuralAdversaryTRPO


        with SetVerbosity(self.verbose):

            assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the MDPO model must be " \
                                                               "an instance of common.policies.ActorCriticPolicy."

            self.nworkers = MPI.COMM_WORLD.Get_size()
            self.rank = MPI.COMM_WORLD.Get_rank()
            np.set_printoptions(precision=3)

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.sess = tf_util.single_threaded_session(graph=self.graph)
                # self._setup_learn(self.seed)
                self._setup_learn()

                if self.using_gail:
                    self.reward_giver = TransitionClassifier(self.observation_space, self.action_space,
                                                             self.hidden_size_adversary,
                                                             entcoeff=self.adversary_entcoeff)
                elif self.using_mdal:
                    if self.neural:
                        self.reward_giver = NeuralAdversaryTRPO(self.sess, self.observation_space, self.action_space,
                                                                self.hidden_size_adversary,
                                                                entcoeff=self.adversary_entcoeff)
                    else:
                        self.reward_giver = TabularAdversaryTF(self.sess, self.observation_space, self.action_space,
                                                                 self.hidden_size_adversary,
                                                                 entcoeff=self.adversary_entcoeff,
                                                                 expert_features=self.expert_dataset.successor_features,
                                                                 exploration_bonus=self.exploration_bonus,
                                                                 bonus_coef=self.bonus_coef,
                                                                 t_c=self.t_c,
                                                                 is_action_features=self.is_action_features)
                # Construct network for new policy
                self.policy_pi = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1,
                                             None, reuse=False, **self.policy_kwargs)

                # Network for old policy
                with tf.variable_scope("oldpi", reuse=False):
                    self.old_policy = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1,
                                             None, reuse=False, **self.policy_kwargs)

                # Network for fitting closed form
                with tf.variable_scope("closedpi", reuse=False):
                    self.closed_policy = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1,
                                             None, reuse=False, **self.policy_kwargs)

                with tf.variable_scope("loss", reuse=False):
                    self.atarg = tf.placeholder(dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
                    self.vtarg = tf.placeholder(dtype=tf.float32, shape=[None])
                    self.ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return
                    self.learning_rate_ph = tf.placeholder(dtype=tf.float32, shape=[], name="learning_rate_ph")
                    self.outer_learning_rate_ph = tf.placeholder(dtype=tf.float32, shape=[], name="outer_learning_rate_ph")
                    self.old_vpred_ph = tf.placeholder(dtype=tf.float32, shape=[None], name="old_vpred_ph")
                    self.clip_range_vf_ph = tf.placeholder(dtype=tf.float32, shape=[], name="clip_range_ph")

                    observation = self.policy_pi.obs_ph
                    self.action = self.policy_pi.pdtype.sample_placeholder([None])

                    if self.tsallis_q == 1.0:
                        kloldnew = self.policy_pi.proba_distribution.kl(self.old_policy.proba_distribution)
                        ent = self.policy_pi.proba_distribution.entropy()
                        meankl = tf.reduce_mean(kloldnew)

                    else:
                        logp_pi = self.policy_pi.proba_distribution.logp(self.action)
                        logp_pi_old =  self.old_policy.proba_distribution.logp(self.action)
                        ent = self.policy_pi.proba_distribution.entropy()
                        #kloldnew = self.policy_pi.proba_distribution.kl_tsallis(self.old_policy.proba_distribution, self.tsallis_q)
                        tsallis_q = 2.0 - self.tsallis_q
                        meankl = tf.reduce_mean(tf_log_q(tf.exp(logp_pi), tsallis_q) - tf_log_q(tf.exp(logp_pi_old), tsallis_q)) #tf.reduce_mean(kloldnew)

                    meanent = tf.reduce_mean(ent)
                    entbonus = self.entcoeff * meanent

                    if self.cliprange_vf is None:
                        vpred_clipped = self.policy_pi.value_flat
                    else:
                        vpred_clipped = self.old_vpred_ph + \
                            tf.clip_by_value(self.policy_pi.value_flat - self.old_vpred_ph,
                                             - self.clip_range_vf_ph, self.clip_range_vf_ph)

                    vf_losses1 = tf.square(self.policy_pi.value_flat - self.ret)
                    vf_losses2 = tf.square(vpred_clipped - self.ret)
                    vferr = tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))

                    # advantage * pnew / pold
                    ratio = tf.exp(self.policy_pi.proba_distribution.logp(self.action) -
                                   self.old_policy.proba_distribution.logp(self.action))

                    if self.method == "multistep-SGD":
                        surrgain = tf.reduce_mean(ratio * self.atarg) - meankl / self.learning_rate_ph
                    elif self.method == "closedreverse-KL":
                        surrgain = tf.reduce_mean(tf.exp(self.atarg) * self.policy_pi.proba_distribution.logp(self.action))
                    else:
                        policygain = tf.reduce_mean(tf.exp(self.atarg) * tf.log(self.closed_policy.proba_distribution.mean))
                        surrgain = tf.reduce_mean(ratio * self.atarg) - tf.reduce_mean(self.learning_rate_ph * ratio * self.policy_pi.proba_distribution.logp(self.action))

                    optimgain = surrgain #+ entbonus - self.learning_rate_ph * meankl
                    losses = [optimgain, meankl, entbonus, surrgain, meanent]
                    self.loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]

                    dist = meankl

                    all_var_list = tf_util.get_trainable_vars("model")
                    var_list = [v for v in all_var_list if "/vf" not in v.name and "/q/" not in v.name]
                    vf_var_list = [v for v in all_var_list if "/pi" not in v.name and "/logstd" not in v.name]
                    print("policy vars", var_list)

                    all_closed_var_list = tf_util.get_trainable_vars("closedpi")
                    closed_var_list = [v for v in all_closed_var_list if "/vf" not in v.name and "/q" not in v.name]

                    self.get_flat = tf_util.GetFlat(var_list, sess=self.sess)
                    self.set_from_flat = tf_util.SetFromFlat(var_list, sess=self.sess)

                    klgrads = tf.gradients(dist, var_list)
                    flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan")
                    shapes = [var.get_shape().as_list() for var in var_list]
                    start = 0
                    tangents = []
                    for shape in shapes:
                        var_size = tf_util.intprod(shape)
                        tangents.append(tf.reshape(flat_tangent[start: start + var_size], shape))
                        start += var_size
                    gvp = tf.add_n([tf.reduce_sum(grad * tangent)
                                    for (grad, tangent) in zipsame(klgrads, tangents)])  # pylint: disable=E1111
                    fvp = tf_util.flatgrad(gvp, var_list)

                    # tf.summary.scalar('entropy_loss', meanent)
                    # tf.summary.scalar('policy_gradient_loss', optimgain)
                    # tf.summary.scalar('value_function_loss', surrgain)
                    # tf.summary.scalar('approximate_kullback-leibler', meankl)
                    # tf.summary.scalar('loss', optimgain + meankl + entbonus + surrgain + meanent)

                    self.assign_old_eq_new = \
                        tf_util.function([], [], updates=[tf.assign(oldv, newv) for (oldv, newv) in
                                                          zipsame(tf_util.get_globals_vars("oldpi"),
                                                                  tf_util.get_globals_vars("model"))])
                    self.compute_losses = tf_util.function([observation, self.old_policy.obs_ph, self.action, self.atarg, self.learning_rate_ph, self.vtarg], losses)
                    self.compute_fvp = tf_util.function([flat_tangent, observation, self.old_policy.obs_ph, self.action, self.atarg],
                                                        fvp)
                    self.compute_vflossandgrad = tf_util.function([observation, self.old_policy.obs_ph, self.ret, self.old_vpred_ph, self.clip_range_vf_ph],
                                                                  tf_util.flatgrad(vferr, vf_var_list))

                    grads = tf.gradients(-optimgain, var_list)
                    grads, _grad_norm = tf.clip_by_global_norm(grads, 0.5)
                    trainer = tf.train.AdamOptimizer(learning_rate=self.outer_learning_rate_ph, epsilon=1e-5)
                    # trainer = tf.train.AdamOptimizer(learning_rate=3e-4, epsilon=1e-5)
                    grads = list(zip(grads, var_list))
                    self._train = trainer.apply_gradients(grads)

                    @contextmanager
                    def timed(msg):
                        if self.rank == 0 and self.verbose >= 1:
                            # print(colorize(msg, color='magenta'))
                            # start_time = time.time()
                            yield
                            # print(colorize("done in {:.3f} seconds".format((time.time() - start_time)),
                            #                color='magenta'))
                        else:
                            yield

                    def allmean(arr):
                        assert isinstance(arr, np.ndarray)
                        out = np.empty_like(arr)
                        MPI.COMM_WORLD.Allreduce(arr, out, op=MPI.SUM)
                        out /= self.nworkers
                        return out

                    tf_util.initialize(sess=self.sess)

                    th_init = self.get_flat()
                    MPI.COMM_WORLD.Bcast(th_init, root=0)
                    self.set_from_flat(th_init)

                with tf.variable_scope("Adam_mpi", reuse=False):
                    self.vfadam = MpiAdam(vf_var_list, sess=self.sess)
                    if self.using_gail or self.using_mdal:
                        self.d_adam = MpiAdam(self.reward_giver.get_trainable_variables(), sess=self.sess)
                        self.d_adam.sync()
                    self.vfadam.sync()

                with tf.variable_scope("input_info", reuse=False):
                    tf.summary.scalar('discounted_rewards', tf.reduce_mean(self.ret))
                    tf.summary.scalar('learning_rate', tf.reduce_mean(self.vf_stepsize))
                    tf.summary.scalar('advantage', tf.reduce_mean(self.atarg))
                    tf.summary.scalar('kl_clip_range', tf.reduce_mean(self.max_kl))

                    if self.full_tensorboard_log:
                        tf.summary.histogram('discounted_rewards', self.ret)
                        tf.summary.histogram('learning_rate', self.vf_stepsize)
                        tf.summary.histogram('advantage', self.atarg)
                        tf.summary.histogram('kl_clip_range', self.max_kl)
                        if tf_util.is_image(self.observation_space):
                            tf.summary.image('observation', observation)
                        else:
                            tf.summary.histogram('observation', observation)

                self.timed = timed
                self.allmean = allmean

                self.step = self.policy_pi.step
                self.proba_step = self.policy_pi.proba_step
                self.initial_state = self.policy_pi.initial_state

                self.params = tf_util.get_trainable_vars("model") + tf_util.get_trainable_vars("oldpi")
                if self.using_gail:
                    self.params.extend(self.reward_giver.get_trainable_variables())

                self.summary = tf.summary.merge_all()

                self.compute_lossandgrad = \
                    tf_util.function([observation, self.old_policy.obs_ph, self.action, self.atarg, self.ret, self.learning_rate_ph, self.vtarg, self.closed_policy.obs_ph],
                                     [self.summary, tf_util.flatgrad(optimgain, var_list)] + losses)
Ejemplo n.º 10
0
def general_actor_critic(input_shape_vec,
                         act_output_shape,
                         comm,
                         learn_rate=[0.001, 0.001],
                         trainable=True,
                         label=""):

    sess = K.get_session()
    np.random.seed(0)
    tf.set_random_seed(0)

    # network 1 (new policy)
    with tf.variable_scope(label + "_pi_new", reuse=False):
        inp = Input(shape=input_shape_vec)  # [5,6,3]
        # rc_lyr = Lambda(lambda x:  ned_to_ripCoords_tf(x, 4000))(inp)
        trunk_x = Reshape([input_shape_vec[0], input_shape_vec[1] * 3])(inp)
        trunk_x = LSTM(128)(trunk_x)
        dist, sample_action_op, action_ph, value_output = ppo_continuous(
            3, trunk_x)

    # network 2 (old policy)
    with tf.variable_scope(label + "_pi_old", reuse=False):
        inp_old = Input(shape=input_shape_vec)  # [5,6,3]
        # rc_lyr = Lambda(lambda x:  ned_to_ripCoords_tf(x, 4000))(inp_old)
        trunk_x = Reshape([input_shape_vec[0],
                           input_shape_vec[1] * 3])(inp_old)
        trunk_x = LSTM(128)(trunk_x)
        dist_old, sample_action_op_old, action_ph_old, value_output_old = ppo_continuous(
            3, trunk_x)

    # additional placeholders
    adv_ph = tf.placeholder(tf.float32, [None], name="advantages_ph")
    alpha_ph = tf.placeholder(tf.float32, shape=(), name="alpha_ph")
    vtarg = tf.placeholder(tf.float32, [None])  # target value placeholder

    # loss
    loss = ppo_continuous_loss(dist, dist_old, value_output, action_ph,
                               alpha_ph, adv_ph, vtarg)

    # gradient
    with tf.variable_scope("grad", reuse=False):
        gradient = tf_util.flatgrad(
            loss, tf_util.get_trainable_vars(label + "_pi_new"))
        adam = MpiAdam(tf_util.get_trainable_vars(label + "_pi_new"),
                       epsilon=0.00001,
                       sess=sess,
                       comm=comm)

    # method for sync'ing the two policies
    assign_old_eq_new = tf_util.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(tf_util.get_globals_vars(label + "_pi_old"),
                                  tf_util.get_globals_vars(label + "_pi_new"))
        ])

    # initialize all the things
    init_op = tf.global_variables_initializer()
    sess.run(init_op)

    # methods for interacting with this model

    def sync_weights():
        assign_old_eq_new(sess=sess)

    def sample_action(states, logstd_override=None):
        a = sess.run(sample_action_op, feed_dict={inp: states})
        return a

    def sample_value(states):
        v = sess.run(value_output, feed_dict={inp: states})
        return v

    def train(states, actions, vtarget, advs, alpha):
        alpha = max(alpha, 0.0)
        adam_lr = learn_rate[0]

        g = sess.run(
            [gradient],
            feed_dict={
                inp: states,
                inp_old: states,
                action_ph: actions,
                adv_ph: advs,
                alpha_ph: alpha,
                vtarg: vtarget
            })

        adam.update(g[0], adam_lr * alpha)

    # initial sync
    adam.sync()
    sync_weights()

    return sync_weights, sample_action, sample_value, train