Example #1
    def __init__(self,
                 *,
                 policy,
                 ob_space,
                 ac_space,
                 nbatch_act,
                 nbatch_train,
                 nsteps,
                 ent_coef,
                 vf_coef,
                 max_grad_norm,
                 gamma=0.99,
                 sil_update=1,
                 fn_reward=None,
                 fn_obs=None,
                 sil_value=0.01,
                 sil_alpha=0.6,
                 sil_beta=0.1,
                 sil_loss=0.1,
                 mpi_rank_weight=1,
                 comm=None,
                 microbatch_size=None,
                 ppo=True,
                 prev_pi=None,
                 silm=None):

        self.sess = sess = get_session()
        if MPI is not None and comm is None:
            comm = MPI.COMM_WORLD

        if ppo:
            with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE):
                # CREATE OUR MODELS
                # act_model is used for sampling actions
                act_model = policy(nbatch_act, 1, sess)

                # train_model is used for training
                if microbatch_size is None:
                    train_model = policy(nbatch_train, nsteps, sess)
                else:
                    train_model = policy(microbatch_size, nsteps, sess)

                sil_model = policy(None, None, sess=sess)

            # CREATE THE PLACEHOLDERS
            self.A = A = train_model.pdtype.sample_placeholder([None])
            self.ADV = ADV = tf.placeholder(tf.float32, [None])
            self.R = R = tf.placeholder(tf.float32, [None])
            # Keep track of old actor
            self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(
                tf.float32, [None])
            # Keep track of old critic
            self.OLDVPRED = OLDVPRED = tf.placeholder(tf.float32, [None])
            self.LR = LR = tf.placeholder(tf.float32, [])
            # Cliprange
            self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, [])

            neglogpac = train_model.pd.neglogp(A)

            # Calculate the entropy
            # Entropy is used to improve exploration by limiting premature convergence to a suboptimal policy.
            entropy = tf.reduce_mean(train_model.pd.entropy())

            # CALCULATE THE LOSS
            # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss

            # Clip the value to reduce variability during Critic training
            # Get the predicted value
            vpred = train_model.vf
            vpredclipped = OLDVPRED + tf.clip_by_value(
                train_model.vf - OLDVPRED, -CLIPRANGE, CLIPRANGE)
            # Unclipped value
            vf_losses1 = tf.square(vpred - R)
            # Clipped value
            vf_losses2 = tf.square(vpredclipped - R)

            vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))

            # Calculate ratio (pi current policy / pi old policy)
            ratio = tf.exp(OLDNEGLOGPAC - neglogpac)

            # Defining loss = -J is equivalent to maximizing J
            pg_losses = -ADV * ratio

            pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE,
                                                 1.0 + CLIPRANGE)

            # Final PG loss
            pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
            approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
            clipfrac = tf.reduce_mean(
                tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))

            # Total loss
            loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

            # UPDATE THE PARAMETERS USING LOSS
            # 1. Get the model parameters
            params = tf.trainable_variables('ppo2_model')
            # 2. Build our trainer
            if comm is not None and comm.Get_size() > 1:
                self.trainer = MpiAdamOptimizer(
                    comm,
                    learning_rate=LR,
                    mpi_rank_weight=mpi_rank_weight,
                    epsilon=1e-5)
            else:
                self.trainer = tf.train.AdamOptimizer(learning_rate=LR,
                                                      epsilon=1e-5)
            # 3. Calculate the gradients
            grads_and_var = self.trainer.compute_gradients(loss, params)
            grads, var = zip(*grads_and_var)

            if max_grad_norm is not None:
                # Clip the gradients by global norm
                grads, _grad_norm = tf.clip_by_global_norm(
                    grads, max_grad_norm)
            grads_and_var = list(zip(grads, var))
            # zip pairs each gradient with its associated parameter
            # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

            self.grads = grads
            self.var = var
            self._train_op = self.trainer.apply_gradients(grads_and_var)
            self.loss_names = [
                'policy_loss', 'value_loss', 'policy_entropy', 'approxkl',
                'clipfrac'
            ]
            self.stats_list = [pg_loss, vf_loss, entropy, approxkl, clipfrac]

            self.train_model = train_model
            self.act_model = act_model
            self.sil_model = sil_model
            # Expose the SIL model as self.pi so the SelfImitation setup below
            # works in the PPO branch as well as in the non-PPO branch
            self.pi = sil_model
            self.step = act_model.step
            self.value = act_model.value
            self.initial_state = act_model.initial_state
        else:
            params = tf.trainable_variables(prev_pi)
            self.LR = LR = tf.placeholder(tf.float32, [])
            self.pi = silm
            if comm is not None and comm.Get_size() > 1:
                self.trainer = MpiAdamOptimizer(
                    comm,
                    learning_rate=LR,
                    mpi_rank_weight=mpi_rank_weight,
                    epsilon=1e-5)
            else:
                self.trainer = tf.train.AdamOptimizer(learning_rate=LR,
                                                      epsilon=1e-5)
        self.sil = SelfImitation(self.pi.X,
                                 self.pi.vf,
                                 self.pi.pd.entropy(),
                                 None,
                                 self.pi.pd.neglogp,
                                 ac_space,
                                 fn_reward=fn_reward,
                                 fn_obs=fn_obs,
                                 n_env=nbatch_act,
                                 n_update=sil_update,
                                 w_value=sil_value,
                                 w_entropy=ent_coef,
                                 gamma=gamma,
                                 max_steps=50000,
                                 max_nlogp=100,
                                 alpha=sil_alpha,
                                 beta=sil_beta)

        def sil_train(cur_lr):
            return self.sil.train(sess, cur_lr)

        self.sil.set_loss_weight(sil_loss)
        self.sil.build_train_op(params, self.trainer, LR)
        self.sil_train = sil_train

        self.save = functools.partial(save_variables, sess=sess)
        self.load = functools.partial(load_variables, sess=sess)
        initialize()
        global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                             scope="")
        if MPI is not None:
            sync_from_root(sess, global_variables, comm=comm)  #pylint: disable=E1101
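
For reference, here is a small standalone sketch (not part of the example) that mirrors the clipped PPO objective built above, with NumPy arrays standing in for the TensorFlow placeholders; all array values and the clip range are illustrative assumptions.

import numpy as np

# Toy stand-ins for the placeholders above (values are illustrative only)
neglogpac_old = np.array([1.2, 0.8, 2.0])   # OLDNEGLOGPAC
neglogpac_new = np.array([1.0, 0.9, 1.5])   # neglogpac from train_model
adv = np.array([0.5, -0.3, 1.0])            # ADV
returns = np.array([1.0, 0.2, 2.0])         # R
vpred_old = np.array([0.8, 0.3, 1.5])       # OLDVPRED
vpred = np.array([0.9, 0.1, 1.9])           # train_model.vf
cliprange = 0.2                             # CLIPRANGE

# Clipped surrogate policy loss, as in pg_loss above
ratio = np.exp(neglogpac_old - neglogpac_new)
pg_loss = np.mean(np.maximum(-adv * ratio,
                             -adv * np.clip(ratio, 1.0 - cliprange, 1.0 + cliprange)))

# Clipped value loss, as in vf_loss above
vpred_clipped = vpred_old + np.clip(vpred - vpred_old, -cliprange, cliprange)
vf_loss = 0.5 * np.mean(np.maximum(np.square(vpred - returns),
                                   np.square(vpred_clipped - returns)))
print(pg_loss, vf_loss)
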
Example #2
    def __init__(self, policy, ob_space, ac_space, nenvs, nsteps,
                 ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4,
                 alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear',
                 sil_update=4, sil_beta=0.0):

        sess = tf_util.make_session()
        nbatch = nenvs * nsteps

        A = tf.placeholder(tf.int32, [nbatch])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        LR = tf.placeholder(tf.float32, [])

        step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False)
        train_model = policy(sess, ob_space, ac_space,
                             nenvs * nsteps, nsteps, reuse=True)
        # The SIL and train models reuse the same parameters as the step model (reuse=True)
        sil_model = policy(sess, ob_space, ac_space, nenvs, nsteps, reuse=True)

        neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=train_model.pi, labels=A)
        pg_loss = tf.reduce_mean(ADV * neglogpac)
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
        entropy = tf.reduce_mean(cat_entropy(train_model.pi))
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef
        value_avg = tf.reduce_mean(train_model.vf)

        params = find_trainable_variables("model")
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        trainer = tf.train.RMSPropOptimizer(
            learning_rate=LR, decay=alpha, epsilon=epsilon)
        _train = trainer.apply_gradients(grads)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values):
            advs = rewards - values
            # Advance the learning-rate schedule once per environment step in the batch
            for step in range(len(obs)):
                cur_lr = lr.value()
            td_map = {train_model.X: obs, A: actions,
                      ADV: advs, R: rewards, LR: cur_lr}
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            policy_loss, value_loss, policy_entropy, v_avg, _ = sess.run(
                [pg_loss, vf_loss, entropy, value_avg, _train],
                td_map
            )
            return policy_loss, value_loss, policy_entropy, v_avg

        self.sil = SelfImitation(sil_model.X, sil_model.vf,
                                 sil_model.entropy, sil_model.value, sil_model.neg_log_prob,
                                 ac_space, np.sign, n_env=nenvs, n_update=sil_update, beta=sil_beta)
        self.sil.build_train_op(params, trainer, LR,
                                max_grad_norm=max_grad_norm)

        def sil_train():
            cur_lr = lr.value()
            return self.sil.train(sess, cur_lr)

        def save(save_path):
            ps = sess.run(params)
            make_path(osp.dirname(save_path))
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            sess.run(restores)

        self.train = train
        self.train_model = train_model
        self.sil_train = sil_train
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)
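
As a companion to this example, the following standalone NumPy sketch (not part of the source) reproduces the A2C loss terms computed above; the logits, actions, and targets are toy values, and mse is taken here as the plain squared error (the helper in the source may include a factor of 1/2).

import numpy as np

logits = np.array([[2.0, 0.5, -1.0],
                   [0.1, 0.1, 0.1]])   # train_model.pi for two samples
actions = np.array([0, 2])             # A
values = np.array([0.5, 0.8])          # predicted values (train_model.vf)
returns = np.array([2.0, 0.3])         # R
advs = returns - values                # ADV

# sparse_softmax_cross_entropy_with_logits = negative log softmax prob of the taken action
log_probs = logits - np.log(np.sum(np.exp(logits), axis=1, keepdims=True))
neglogpac = -log_probs[np.arange(len(actions)), actions]

pg_loss = np.mean(advs * neglogpac)
vf_loss = np.mean(np.square(values - returns))                 # mse (factor of 1/2 omitted)
entropy = np.mean(np.sum(-np.exp(log_probs) * log_probs, axis=1))
loss = pg_loss - entropy * 0.01 + vf_loss * 0.5                # ent_coef=0.01, vf_coef=0.5
print(loss)
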
Example #3
    def __init__(self,
                 *,
                 policy,
                 ob_space,
                 ac_space,
                 nbatch_act,
                 nbatch_train,
                 nsteps,
                 ent_coef,
                 vf_coef,
                 max_grad_norm,
                 gamma=0.99,
                 sil_update=1,
                 fn_reward=None,
                 fn_obs=None,
                 sil_value=0.01,
                 sil_alpha=0.6,
                 sil_beta=0.1):
        sess = tf.get_default_session()

        act_model = policy(sess,
                           ob_space,
                           ac_space,
                           nbatch_act,
                           1,
                           reuse=False)
        train_model = policy(sess,
                             ob_space,
                             ac_space,
                             nbatch_train,
                             nsteps,
                             reuse=True)
        sil_model = policy(sess, ob_space, ac_space, None, None, reuse=True)

        A = train_model.pdtype.sample_placeholder([None])
        ADV = tf.placeholder(tf.float32, [None])
        R = tf.placeholder(tf.float32, [None])
        OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
        OLDVPRED = tf.placeholder(tf.float32, [None])
        LR = tf.placeholder(tf.float32, [])
        CLIPRANGE = tf.placeholder(tf.float32, [])

        neglogpac = train_model.pd.neglogp(A)
        entropy = tf.reduce_mean(train_model.pd.entropy())

        vpred = train_model.vf
        vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED,
                                                   -CLIPRANGE, CLIPRANGE)
        vf_losses1 = tf.square(vpred - R)
        vf_losses2 = tf.square(vpredclipped - R)
        vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))
        ratio = tf.exp(OLDNEGLOGPAC - neglogpac)
        pg_losses = -ADV * ratio
        pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE,
                                             1.0 + CLIPRANGE)
        pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
        approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
        clipfrac = tf.reduce_mean(
            tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef
        with tf.variable_scope('model'):
            params = tf.trainable_variables()
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)
        _train = trainer.apply_gradients(grads)

        def train(lr,
                  cliprange,
                  obs,
                  returns,
                  masks,
                  actions,
                  values,
                  neglogpacs,
                  states=None):
            advs = returns - values
            advs = (advs - advs.mean()) / (advs.std() + 1e-8)
            td_map = {
                train_model.X: obs,
                A: actions,
                ADV: advs,
                R: returns,
                LR: lr,
                CLIPRANGE: cliprange,
                OLDNEGLOGPAC: neglogpacs,
                OLDVPRED: values
            }
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            return sess.run(
                [pg_loss, vf_loss, entropy, approxkl, clipfrac, _train],
                td_map)[:-1]

        self.loss_names = [
            'policy_loss', 'value_loss', 'policy_entropy', 'approxkl',
            'clipfrac'
        ]

        # Self-Imitation learning
        self.sil = SelfImitation(sil_model.X,
                                 sil_model.vf,
                                 sil_model.entropy,
                                 sil_model.value,
                                 sil_model.neg_log_prob,
                                 ac_space,
                                 fn_reward=fn_reward,
                                 fn_obs=fn_obs,
                                 n_env=nbatch_act,
                                 n_update=sil_update,
                                 w_value=sil_value,
                                 w_entropy=ent_coef,
                                 gamma=gamma,
                                 max_steps=50000,
                                 max_nlogp=100,
                                 alpha=sil_alpha,
                                 beta=sil_beta)

        self.sil.set_loss_weight(0.1)
        self.sil.build_train_op(params, trainer, LR)

        def sil_train(cur_lr):
            return self.sil.train(sess, cur_lr)

        def save(save_path):
            ps = sess.run(params)
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            sess.run(restores)
            # If you want to load weights, also save/load observation scaling inside VecNormalize

        self.train = train
        self.train_model = train_model
        self.sil_train = sil_train
        self.act_model = act_model
        self.step = act_model.step
        self.value = act_model.value
        self.initial_state = act_model.initial_state
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)  #pylint: disable=E1101
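
Finally, a standalone sketch (not from the example) of the discounted returns that targets like R and the self-imitation buffer rely on; the reward and done values are assumptions, and in the PPO case the actual returns are typically produced by a GAE runner that is not shown in these snippets.

import numpy as np

def discounted_returns(rewards, dones, gamma=0.99, last_value=0.0):
    # Accumulate gamma-discounted rewards backwards, resetting at episode ends
    returns = np.zeros(len(rewards))
    running = last_value
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running * (1.0 - dones[t])
        returns[t] = running
    return returns

rewards = np.array([0.0, 0.0, 1.0, 0.0, 1.0])
dones = np.array([0.0, 0.0, 1.0, 0.0, 0.0])
print(discounted_returns(rewards, dones))
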