    def __init__(self, *, ob, ob_length, ent_coef, vf_coef, max_grad_norm):
        sess = get_session()
        # sequential state: the encoder inputs (ob) and their lengths (ob_length) are passed in by the caller

        # sequential action
        decoder_input = tf.placeholder(tf.int32, [None, None])
        action = tf.placeholder(tf.int32, [None, None])
        action_length = tf.placeholder(tf.int32, [None])
        # sequential adv
        adv = tf.placeholder(tf.float32, [None, None])
        # sequential return (value target)
        r = tf.placeholder(tf.float32, [None, None])

        # keep track of the old actor (sequential decisions)
        oldneglogpac = tf.placeholder(tf.float32, [None, None])
        oldvpred = tf.placeholder(tf.float32, [None, None])
        lr = tf.placeholder(tf.float32, [])

        # Cliprange
        cliprange = tf.placeholder(tf.float32, [])

        train_model = Seq2seqPolicy("pi", hparams, reuse=True, encoder_inputs=ob,
                                     encoder_lengths=ob_length,
                                     decoder_inputs=decoder_input,
                                     decoder_full_length=action_length,
                                     decoder_targets=action)
        act_model = Seq2seqPolicy("oldpi", hparams, reuse=False, encoder_inputs=ob,
                                 encoder_lengths=ob_length,
                                 decoder_inputs=decoder_input,
                                 decoder_full_length=action_length,
                                 decoder_targets=action)

        assign_old_eq_new = U.function([], [], updates=[tf.assign(oldv, newv)
                                                        for (oldv, newv) in
                                                        zipsame(act_model.get_variables(), train_model.get_variables())])
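        # assign_old_eq_new copies the current policy ("pi") variables into the
        # old policy ("oldpi"), so the ratio below compares the updated policy
        # against a frozen snapshot of the pre-update policy.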

        # neglogpac = train_model.neglogp()
        # Calculate the entropy
        # Entropy is used to improve exploration by discouraging premature convergence to a suboptimal policy.
        entropy = tf.reduce_mean(train_model.entropy())
        # Calculate the loss
        # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss

        # Clip the value to reduce variability during Critic training.
        # Get the predicted value
        vpred = train_model.vf
        vpredclipped = oldvpred + tf.clip_by_value(train_model.vf - oldvpred, -cliprange, cliprange)
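        # vpredclipped keeps the new value prediction within +/- cliprange of the
        # old prediction, mirroring the ratio clipping applied to the policy below.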

        # Unclipped value
        vf_losses1 = tf.square(vpred - r)
        # Clipped value
        vf_losses2 = tf.square(vpredclipped - r)

        vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))

        # Calculate ratio (pi current policy / pi old policy)
        #ratio = tf.exp(oldneglogpac - neglogpac)

        ratio = tf.exp(train_model.logp() - act_model.logp())

        # Define the loss: minimizing -J is equivalent to maximizing J
        pg_losses = -adv * ratio
        pg_losses2 = -adv * tf.clip_by_value(ratio, 1.0 - cliprange, 1.0 + cliprange)

        # Final pg loss
        pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
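        # This is the PPO clipped surrogate objective,
        #   L_CLIP = E[ min(ratio * A, clip(ratio, 1 - cliprange, 1 + cliprange) * A) ],
        # written as a minimization: taking the element-wise max of the two
        # negated terms is the same as negating the min of the surrogates.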
        #approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - oldneglogpac))
        kloldnew = act_model.kl(train_model)
        approxkl = tf.reduce_mean(kloldnew)
        clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), cliprange)))
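        # Diagnostics: approxkl is the mean KL divergence between the old and
        # current policies (how far the update has moved the policy), and
        # clipfrac is the fraction of ratios that fall outside the clip range.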

        # total loss
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        # Update the parameters using loss
        # 1. get the model parameters
        params = tf.trainable_variables('pi')

        # 2. Build our trainer
        trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=lr, epsilon=1e-5)

        # 3. Calculate the gradients
        grads_and_var = trainer.compute_gradients(loss, params)
        grads, var = zip(*grads_and_var)

        if max_grad_norm is not None:
            # Clip the gradients (normalize)
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads_and_var = list(zip(grads, var))
        # zip pairs each gradient with its associated parameter,
        # e.g. zip(ABCD, xyza) => Ax, By, Cz, Da

        _train = trainer.apply_gradients(grads_and_var)

        # decoder_input and action_length are specialized for the training model
        def train(learning_rate, clip_range, obs, obs_length,
                  returns, advs, decoder_inputs, actions, decoder_full_length,
                  values, neglogpacs, states=None):
            # The advantage is estimated as A(s,a) = R + gamma * V(s') - V(s)
            # and the return is R + gamma * V(s')

            # Normalize the advantages along axis 0
            advs = (advs - np.mean(advs, axis=0)) / (np.std(advs, axis=0) + 1e-8)

            td_map = {train_model.encoder_inputs: obs, train_model.encoder_lengths: obs_length,
                      decoder_input: decoder_inputs, action: actions, action_length: decoder_full_length, adv: advs,
                      r: returns, lr: learning_rate, cliprange: clip_range, oldneglogpac: neglogpacs, oldvpred: values}

            return sess.run([pg_loss, vf_loss, entropy, approxkl, clipfrac, _train], td_map)[:-1]

        self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac']

        self.train = train
        self.train_model = train_model
        self.time_major = self.train_model.time_major
        self.act_model = act_model
        self.step = act_model.step
        self.greedy_predict = act_model.greedy_predict

        self.save = functools.partial(save_variables, sess=sess)
        self.load = functools.partial(load_variables, sess=sess)
        self.assign_old_eq_new = assign_old_eq_new

        if MPI.COMM_WORLD.Get_rank() == 0:
            initialize()
        global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="")
        sync_from_root(sess, global_variables)
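
# The snippet below is a self-contained NumPy sketch (not part of the example
# above) that mirrors the clipped-surrogate arithmetic built in the graph, on
# toy numbers; the function name and inputs are illustrative assumptions.
import numpy as np

def clipped_pg_loss(logp_new, logp_old, adv, cliprange=0.2):
    # ratio = pi_new / pi_old, recovered from log-probabilities
    ratio = np.exp(logp_new - logp_old)
    pg_losses = -adv * ratio
    pg_losses2 = -adv * np.clip(ratio, 1.0 - cliprange, 1.0 + cliprange)
    # max of the negated terms == negative of the min of the surrogates
    return np.mean(np.maximum(pg_losses, pg_losses2))

print(clipped_pg_loss(np.array([-0.5, -1.2, -0.1]),
                      np.array([-0.7, -1.0, -0.9]),
                      np.array([1.0, -0.5, 2.0])))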
Example 2
    def __init__(self, *, ob, ob_length, ent_coef, vf_coef, max_grad_norm):
        sess = get_session()
        # sequential state: the encoder inputs (ob) and their lengths (ob_length) are passed in by the caller

        # sequential action
        decoder_input = tf.placeholder(tf.int32, [None, None])
        action = tf.placeholder(tf.int32, [None, None])
        action_length = tf.placeholder(tf.int32, [None])
        # sequential adv
        adv = tf.placeholder(tf.float32, [None, None])
        # sequential return (value target)
        r = tf.placeholder(tf.float32, [None, None])

        # keep track of the old actor (sequential decisions)
        oldneglogpac = tf.placeholder(tf.float32, [None, None])
        oldvpred = tf.placeholder(tf.float32, [None, None])
        lr = tf.placeholder(tf.float32, [])

        # Cliprange
        cliprange = tf.placeholder(tf.float32, [])

        train_model = Seq2seqPolicy("pi",
                                    hparams,
                                    reuse=True,
                                    encoder_inputs=ob,
                                    encoder_lengths=ob_length,
                                    decoder_inputs=decoder_input,
                                    decoder_full_length=action_length,
                                    decoder_targets=action)

        # Define the vanilla policy-gradient loss for the seq2seq network.

        pg_loss = -tf.reduce_mean(train_model.logp() * adv)
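        # Vanilla (unclipped) policy gradient: minimize -E[log pi(a|s) * A],
        # i.e. the REINFORCE objective weighted by the advantage.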
        vpred = train_model.vf
        vpredclipped = oldvpred + tf.clip_by_value(train_model.vf - oldvpred,
                                                   -cliprange, cliprange)

        # Unclipped value
        vf_losses1 = tf.square(vpred - r)
        vf_loss = tf.reduce_mean(vf_losses1)

        # vanilla policy gradient loss function
        loss = pg_loss + vf_loss * vf_coef
        # Update the parameters using loss
        # 1. get the model parameters
        params = tf.trainable_variables('pi')

        # 2. Build our trainer
        trainer = MpiAdamOptimizer(MPI.COMM_WORLD,
                                   learning_rate=lr,
                                   epsilon=1e-5)

        # 3. Calculate the gradients
        grads_and_var = trainer.compute_gradients(loss, params)
        grads, var = zip(*grads_and_var)

        if max_grad_norm is not None:
            # Clip the gradients (normalize)
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads_and_var = list(zip(grads, var))
        # zip pairs each gradient with its associated parameter,
        # e.g. zip(ABCD, xyza) => Ax, By, Cz, Da

        _train = trainer.apply_gradients(grads_and_var)

        # decoder_input and action_length are specialized for the training model
        def train(learning_rate,
                  clip_range,
                  obs,
                  obs_length,
                  returns,
                  advs,
                  decoder_inputs,
                  actions,
                  decoder_full_length,
                  values,
                  neglogpacs,
                  states=None):
            # The advantage is estimated as A(s,a) = R + gamma * V(s') - V(s)
            # and the return is R + gamma * V(s')

            # Normalize the advantages along axis 0
            advs = (advs - np.mean(advs, axis=0)) / (np.std(advs, axis=0) +
                                                     1e-8)

            td_map = {
                train_model.encoder_inputs: obs,
                train_model.encoder_lengths: obs_length,
                decoder_input: decoder_inputs,
                action: actions,
                action_length: decoder_full_length,
                adv: advs,
                r: returns,
                lr: learning_rate,
                cliprange: clip_range,
                oldneglogpac: neglogpacs,
                oldvpred: values
            }

            return sess.run([pg_loss, vf_loss, _train], td_map)[:-1]

        self.loss_names = ['policy_loss', 'value_loss']

        self.train = train
        self.train_model = train_model
        self.time_major = self.train_model.time_major
        self.act_model = train_model

        self.step = self.act_model.step
        self.greedy_predict = self.act_model.greedy_predict

        self.save = functools.partial(save_variables, sess=sess)
        self.load = functools.partial(load_variables, sess=sess)

        if MPI.COMM_WORLD.Get_rank() == 0:
            initialize()
        global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                             scope="")
        sync_from_root(sess, global_variables)
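
# A self-contained sketch (an illustrative assumption, not from the source) of
# how the per-step returns and advantages fed to train() could be formed,
# following the comment A(s,a) = R + gamma * V(s') - V(s) above; real rollout
# code may use a different estimator such as GAE.
import numpy as np

def one_step_returns_and_advantages(rewards, values, next_values, gamma=0.99):
    # return target: R + gamma * V(s'); advantage: return target - V(s)
    returns = rewards + gamma * next_values
    advantages = returns - values
    return returns, advantages

print(one_step_returns_and_advantages(np.array([1.0, 0.0, 0.5]),
                                      np.array([0.8, 0.6, 0.4]),
                                      np.array([0.6, 0.4, 0.0])))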