Example #1
    def update_policy_params(self, comm, loss, mpi_rank_weight, LR, max_grad_norm):
        # UPDATE THE PARAMETERS USING LOSS
        # 1. Get the model parameters
        params = tf.trainable_variables('ppo2_model')
        # 2. Build our trainer
        if comm is not None and comm.Get_size() > 1:
            self.trainer = MpiAdamOptimizer(comm, learning_rate=LR, mpi_rank_weight=mpi_rank_weight, epsilon=1e-5)
        else:
            self.trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)
        # 3. Calculate the gradients
        grads_and_var = self.trainer.compute_gradients(loss, params)
        grads, var = zip(*grads_and_var)

        if max_grad_norm is not None:
            # Clip the gradients (normalize)
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads_and_var = list(zip(grads, var))
        # zip pairs each gradient with its associated parameter
        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

        self.grads = grads
        self.var = var
        self._train_op = self.trainer.apply_gradients(grads_and_var)

        return grads
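For reference, the compute / clip / apply pattern in Example #1 can be reduced to a minimal standalone TF1-style sketch; the variable and loss below are toy placeholders, not part of the original code:

import tensorflow as tf  # TF 1.x API (or tf.compat.v1)

w = tf.Variable([1.0, 2.0], name='w')
toy_loss = tf.reduce_sum(tf.square(w))  # stands in for the PPO loss
trainer = tf.train.AdamOptimizer(learning_rate=1e-3, epsilon=1e-5)
# 1. Compute (gradient, variable) pairs
grads_and_var = trainer.compute_gradients(toy_loss, [w])
grads, var = zip(*grads_and_var)
# 2. Clip the gradients by their global norm
grads, _grad_norm = tf.clip_by_global_norm(grads, 0.5)
# 3. Re-pair gradients with their variables and build the update op
train_op = trainer.apply_gradients(list(zip(grads, var)))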
Example #2
    def __init__(self,
                 *,
                 ac_space,
                 policy_network,
                 value_network=None,
                 ent_coef,
                 vf_coef,
                 max_grad_norm):
        super(Model, self).__init__(name='PPO2Model')
        self.train_model = PolicyWithValue(ac_space,
                                           policy_network,
                                           value_network,
                                           estimate_q=False)
        if MPI is not None:
            self.optimizer = MpiAdamOptimizer(
                MPI.COMM_WORLD, self.train_model.trainable_variables)
        else:
            self.optimizer = tf.keras.optimizers.Adam()
        self.ent_coef = ent_coef
        self.vf_coef = vf_coef
        self.max_grad_norm = max_grad_norm
        self.step = self.train_model.step
        self.mode = self.train_model.mode
        self.value = self.train_model.value
        self.initial_state = self.train_model.initial_state
        self.loss_names = [
            'policy_loss', 'value_loss', 'policy_entropy', 'approxkl',
            'clipfrac'
        ]
        if MPI is not None:
            sync_from_root(self.variables)
Example #3
    def update_discriminator_params(self, comm, discriminator_loss, mpi_rank_weight, LR, max_grad_norm):
        # UPDATE DISCRIMINATOR PARAMETERS USING DISCRIMINATOR_LOSS
        # 1. Get the model parameters
        disc_params = tf.trainable_variables('discriminator_model')
        # 2. Build our trainer
        if comm is not None and comm.Get_size() > 1:
            self.disc_trainer = MpiAdamOptimizer(comm, learning_rate=LR, mpi_rank_weight=mpi_rank_weight, epsilon=1e-5)
        else:
            self.disc_trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)
            # self.disc_trainer = tf.train.GradientDescentOptimizer(learning_rate=LR)
        # 3. Calculate gradients
        disc_grads_and_var = self.disc_trainer.compute_gradients(discriminator_loss, disc_params)

        self._disc_train_op = self.disc_trainer.apply_gradients(disc_grads_and_var)
Example #4
    def get_train_op(self, loss, params, comm):
        # 2. Build our trainer
        if comm is not None and comm.Get_size() > 1:
            trainer = MpiAdamOptimizer(comm, learning_rate=self.LR,
                                            mpi_rank_weight=self.mpi_rank_weight, epsilon=1e-5)
        else:
            trainer = tf.train.AdamOptimizer(learning_rate=self.LR, epsilon=1e-5)
        # 3. Calculate the gradients
        grads_and_var = trainer.compute_gradients(loss, params)
        grads, var = zip(*grads_and_var)

        if self.max_grad_norm is not None:
            # Clip the gradients (normalize)
            grads, _grad_norm = tf.clip_by_global_norm(grads, self.max_grad_norm)

        # zip pairs each gradient with its associated parameter
        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da
        grads_and_var = list(zip(grads, var))
        _train_op = trainer.apply_gradients(grads_and_var)
        return _train_op, grads_and_var
Example #5
    def update_vae_params(self, comm, loss, mpi_rank_weight, LR, max_grad_norm):
        params = tf.trainable_variables('vae') + tf.trainable_variables('ppo2_model/vae')
        if comm is not None and comm.Get_size() > 1:
            self.vae_trainer = MpiAdamOptimizer(comm, learning_rate=LR, mpi_rank_weight=mpi_rank_weight, epsilon=1e-5)
        else:
            self.vae_trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)
        grads_and_var = self.vae_trainer.compute_gradients(loss, params)
        grads, var = zip(*grads_and_var)

        if max_grad_norm is not None:
            # Clip the gradients (normalize)
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads_and_var = list(zip(grads, var))
        # zip pairs each gradient with its associated parameter
        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

        self.vae_grads = grads
        self.vae_var = var
        self.vae_train_op = self.vae_trainer.apply_gradients(grads_and_var)

        return grads
Example #6
    def update_all_params(self, comm, ppo_loss, disc_loss, mpi_rank_weight, LR, max_grad_norm):
        ppo_params = tf.trainable_variables('ppo2_model')
        disc_params = tf.trainable_variables('discriminator_model')

        if comm is not None and comm.Get_size() > 1:
            self.trainer = MpiAdamOptimizer(comm, learning_rate=LR, mpi_rank_weight=mpi_rank_weight, epsilon=1e-5)
        else:
            self.trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)

        ppo_var_and_grads = self.trainer.compute_gradients(ppo_loss, ppo_params)
        ppo_grads, ppo_var = zip(*ppo_var_and_grads)

        disc_var_and_grads = self.trainer.compute_gradients(disc_loss, disc_params)
        disc_grads, disc_var = zip(*disc_var_and_grads)

        if max_grad_norm is not None:
            grads, grad_norm = tf.clip_by_global_norm(ppo_grads + disc_grads, max_grad_norm)
        else:
            grads = ppo_grads + disc_grads

        grads_and_var = list(zip(grads, ppo_var + disc_var))

        self.all_train_op = self.trainer.apply_gradients(grads_and_var)
Example #7
    def optimize(self, learning_rate=6.25e-5, epsilon=1.5e-4, **adam_kwargs):
        """
        Create a TF Op that optimizes the objective.

        Args:
          learning_rate: the Adam learning rate.
          epsilon: the Adam epsilon.
        """
        if self.comm is not None and self.comm.Get_size() > 1:
            optim = MpiAdamOptimizer(self.comm, learning_rate=learning_rate,
                                     mpi_rank_weight=self.mpi_rank_weight, epsilon=epsilon,
                                     **adam_kwargs)
        else:
            optim = tf.train.AdamOptimizer(learning_rate=learning_rate, epsilon=epsilon,
                                           **adam_kwargs)
        if self.use_l2reg:
            params = tf.trainable_variables('online')
            weight_params = [v for v in params if '/bias' not in v.name]
            l2_loss = tf.reduce_sum([tf.nn.l2_loss(v) for v in weight_params])
            self.loss = self.loss + l2_loss * 1e-4
        return optim.minimize(self.loss)
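The L2 term in Example #7 only regularises weight matrices, i.e. variables whose name does not contain '/bias'. A small self-contained TF1-style sketch of that selection, using hypothetical variable names:

import tensorflow as tf  # TF 1.x API

w = tf.get_variable('online/dense/kernel', shape=[4, 2])
b = tf.get_variable('online/dense/bias', shape=[2])
params = [w, b]
weight_params = [v for v in params if '/bias' not in v.name]  # kernels only
l2_loss = tf.reduce_sum([tf.nn.l2_loss(v) for v in weight_params])
task_loss = tf.constant(0.0)  # stands in for the real objective
total_loss = task_loss + l2_loss * 1e-4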
Example #8
class Model(object):
    """
    We use this object to:
    __init__:
    - Creates the step_model
    - Creates the train_model
    train():
    - Runs the training part (feedforward and backpropagation of gradients)
    save/load():
    - Saves/loads the model
    """
    def __init__(self, ob_space, ac_space, ent_coef, vf_coef,
                max_grad_norm, mpi_rank_weight=1, comm=None,
                normalize_observations=True, normalize_returns=True,
                use_tensorboard=False, tb_log_dir=None):
        self.sess = sess = get_session()
        self.use_tensorboard = use_tensorboard

        if MPI is not None and comm is None:
            comm = MPI.COMM_WORLD

        # CREATE OUR TWO MODELS
        network_spec = [
            {
                'layer_type': 'dense',
                'units': 256,
                'activation': 'relu',
                'nodes_in': ['observation_self'],
                'nodes_out': ['main']
            },
            {
                'layer_type': 'dense',
                'units': 128,
                'activation': 'relu',
                'nodes_in': ['main'],
                'nodes_out': ['main']
            },
            {
                'layer_type': 'dense',
                'units': 128,
                'activation': 'relu',
                'nodes_in': ['main'],
                'nodes_out': ['main']
            },
            {
                'layer_type': 'dense',
                'units': 128,
                'activation': 'relu',
                'nodes_in': ['main'],
                'nodes_out': ['main']
            }
        ]
        vnetwork_spec = [
            {
                'layer_type': 'dense',
                'units': 256,
                'activation': 'relu',
                'nodes_in': ['observation_self'],
                'nodes_out': ['main']
            },
            {
                'layer_type': 'dense',
                'units': 128,
                'activation': 'relu',
                'nodes_in': ['main'],
                'nodes_out': ['main']
            },
            {
                'layer_type': 'dense',
                'units': 128,
                'activation': 'relu',
                'nodes_in': ['main'],
                'nodes_out': ['main']
            },
            {
                'layer_type': 'dense',
                'units': 128,
                'activation': 'relu',
                'nodes_in': ['main'],
                'nodes_out': ['main']
            }
        ]

        # act_model that is used for sampling
        act_model = PpoPolicy(scope='ppo', ob_space=ob_space, ac_space=ac_space, network_spec=network_spec, v_network_spec=vnetwork_spec,
                stochastic=True, reuse=False, build_act=True,
                trainable_vars=None, not_trainable_vars=None,
                gaussian_fixed_var=True, weight_decay=0.0, ema_beta=0.99999,
                normalize_observations=normalize_observations, normalize_returns=normalize_returns)

        # Train model for training
        train_model = PpoPolicy(scope='ppo', ob_space=ob_space, ac_space=ac_space, network_spec=network_spec, v_network_spec=vnetwork_spec,
                    stochastic=True, reuse=True, build_act=True,
                    trainable_vars=None, not_trainable_vars=None,
                    gaussian_fixed_var=True, weight_decay=0.0, ema_beta=0.99999,
                    normalize_observations=normalize_observations, normalize_returns=normalize_returns)
        
        # CREATE THE PLACEHOLDERS
        self.A = A = {k: v.sample_placeholder([None]) for k, v in train_model.pdtypes.items()}
        self.ADV = ADV = tf.placeholder(tf.float32, [None])
        self.R = R = tf.placeholder(tf.float32, [None])
        # Keep track of old actor
        self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
        # Keep track of old critic
        self.OLDVPRED = OLDVPRED = tf.placeholder(tf.float32, [None])
        self.LR = LR = tf.placeholder(tf.float32, [])
        # Cliprange
        self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, [])

        neglogpac = sum([train_model.pds[k].neglogp(A[k]) for k in train_model.pdtypes.keys()])

        # Calculate the entropy
        # Entropy is used to improve exploration by limiting premature convergence to a suboptimal policy.
        #entropy = tf.reduce_mean(train_model.entropy)
        entropy = tf.reduce_mean(sum([train_model.pds[k].entropy() for k in train_model.pdtypes.keys()]))

        # CALCULATE THE LOSS
        # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss

        # Clip the value to reduce variability during Critic training
        # Get the predicted value
        vpred = train_model.scaled_value_tensor
        vpredclipped = OLDVPRED + tf.clip_by_value(vpred - OLDVPRED, - CLIPRANGE, CLIPRANGE)
        # Unclipped value
        vf_losses1 = tf.square(vpred - R)
        # Clipped value
        vf_losses2 = tf.square(vpredclipped - R)

        vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))
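        # Taking the maximum of the unclipped and clipped squared errors keeps the larger
        # (more pessimistic) penalty, mirroring the clipping applied to the policy objective.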

        # Calculate ratio (pi current policy / pi old policy)
        ratio = tf.exp(OLDNEGLOGPAC - neglogpac)

        # Defining Loss = - J is equivalent to max J
        pg_losses = -ADV * ratio

        pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE)

        # Final PG loss
        pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
        approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
        clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))
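        # approxkl above is a simple quadratic estimate of the KL divergence between the old
        # and new policy; clipfrac is the fraction of samples whose ratio left the clip interval.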

        # Total loss
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        # UPDATE THE PARAMETERS USING LOSS
        # 1. Get the model parameters
        params = tf.trainable_variables(scope="ppo")
        # 2. Build our trainer
        if comm is not None and comm.Get_size() > 1:
            self.trainer = MpiAdamOptimizer(comm, learning_rate=LR, mpi_rank_weight=mpi_rank_weight, epsilon=1e-5)
        else:
            self.trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)
        # 3. Calculate the gradients
        grads_and_var = self.trainer.compute_gradients(loss, params)
        grads, var = zip(*grads_and_var)

        if max_grad_norm is not None:
            # Clip the gradients (normalize)
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads_and_var = list(zip(grads, var))
        # zip pairs each gradient with its associated parameter
        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

        self.grads = grads
        self.var = var
        self._train_op = self.trainer.apply_gradients(grads_and_var)
        self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac']
        self.stats_list = [pg_loss, vf_loss, entropy, approxkl, clipfrac]

        self.train_model = train_model
        self.act_model = act_model

        self.step = act_model.act
        self.value = act_model.value
        self.initial_state = act_model.zero_state

        self.save = functools.partial(save_variables, sess=sess)
        self.load = functools.partial(load_variables, sess=sess)

        initialize()
        global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
        if MPI is not None:
            sync_from_root(sess, global_variables, comm=comm) #pylint: disable=E1101

        if self.use_tensorboard:
            self.attach_tensorboard(tb_log_dir)
            self.tb_step = 0

    def train(self, lr, cliprange, obs, actions, returns, values, neglogpacs, states=None):
        # Here we calculate the advantage A(s,a) = R + gamma * V(s') - V(s)
        # Returns = R + gamma * V(s')
        advs = returns - values
        
        # Normalize the advantages
        advs = (advs - advs.mean()) / (advs.std() + 1e-8)

        # Turn the obs into correct format
        td_map = {
            self.ADV : advs,
            self.R : returns,
            self.LR : lr,
            self.CLIPRANGE : cliprange,
            self.OLDNEGLOGPAC : neglogpacs,
            self.OLDVPRED : values,
        }

        obs_map = {self.train_model.phs[k]: v for k, v in obs.items()}
        td_map.update(obs_map)
        actions_map = {self.A[k]: v for k, v in actions.items()}
        td_map.update(actions_map)

        if states is not None:
            pass
            #td_map[self.train_model.phs['policy_net_lstm2_state_c']] = np.repeat([states['policy_net_lstm2_state_c'][0]], len(obs), 0)
            #td_map[self.train_model.phs['policy_net_lstm2_state_h']] = np.repeat([states['policy_net_lstm2_state_h'][0]], len(obs), 0)
            #td_map[self.train_model.phs['vpred_net_lstm2_state_c']] = np.repeat([states['vpred_net_lstm2_state_c'][0]], len(obs), 0)
            #td_map[self.train_model.phs['vpred_net_lstm2_state_h']] = np.repeat([states['vpred_net_lstm2_state_h'][0]], len(obs), 0)

        if self.use_tensorboard:
            losses = self.sess.run(self.stats_list + [self._train_op, self.merged], td_map)
            self.tb_writer.add_summary(losses.pop(), self.tb_step)
            self.tb_step += 1
            losses = losses[:-1]
        else:
            losses = self.sess.run(self.stats_list + [self._train_op], td_map)[:-1]

        return losses
    
    def attach_tensorboard(self, logdir):
        for i in range(len(self.stats_list)):
            tf.summary.scalar(self.loss_names[i], self.stats_list[i])
        self.merged = tf.summary.merge_all()
        logdir = os.path.join(os.getcwd(), logdir)
        logdir = os.path.join(logdir, datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
        self.tb_writer = tf.summary.FileWriter(logdir, self.sess.graph)
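attach_tensorboard above wires the loss scalars into TF1 summaries. A minimal standalone sketch of that wiring (the scalar value and log directory are illustrative only):

import tensorflow as tf  # TF 1.x API

loss = tf.constant(0.0, name='policy_loss')
tf.summary.scalar('policy_loss', loss)
merged = tf.summary.merge_all()
with tf.Session() as sess:
    writer = tf.summary.FileWriter('./tb_logs', sess.graph)
    summary = sess.run(merged)
    writer.add_summary(summary, 0)  # second argument is the global step
    writer.flush()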
Example #9
    def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train,
                 nsteps, ent_coef, vf_coef, max_grad_norm):
        sess = get_session()

        with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE):
            # CREATE OUR TWO MODELS
            # act_model that is used for sampling
            act_model = policy(nbatch_act, 1, sess)

            # Train model for training
            train_model = policy(None, nsteps, sess)

        # CREATE THE PLACEHOLDERS
        A = train_model.pdtype.sample_placeholder([None])
        DIMSEL = train_model.pdtype.sample_placeholder([None])
        MEANNOW = train_model.pdtype.sample_placeholder([None])
        LOGSTDNOW = train_model.pdtype.sample_placeholder([None])
        ADV = tf.placeholder(tf.float32, [None])
        R = tf.placeholder(tf.float32, [None])
        # Keep track of old actor
        OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
        RHO_NOW = tf.placeholder(tf.float32, [None])
        # Keep track of old critic
        OLDVPRED = tf.placeholder(tf.float32, [None])
        LR = tf.placeholder(tf.float32, [])
        # Cliprange
        CLIPRANGE = tf.placeholder(tf.float32, [])
        KLCONST = tf.placeholder(tf.float32, [])
        KL_REST = tf.placeholder(tf.float32, [None])

        neglogpac = train_model.pd.neglogp(A)
        mean = train_model.pd.mean
        logstd = train_model.pd.logstd

        # Calculate the entropy
        # Entropy is used to improve exploration by limiting premature convergence to a suboptimal policy.
        entropy = tf.reduce_mean(train_model.pd.entropy())

        # CALCULATE THE LOSS
        # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss

        # Clip the value to reduce variability during Critic training
        # Get the predicted value
        vpred = train_model.vf
        vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED,
                                                   -CLIPRANGE, CLIPRANGE)
        # Unclipped value
        vf_losses1 = tf.square(vpred - R)
        # Clipped value
        vf_losses2 = tf.square(vpredclipped - R)

        vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))

        # Calculate ratio (pi current policy / pi old policy)
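        # (The per-dimension ratio below is written out in closed form for a diagonal Gaussian
        # policy: exp(log N(A | mean, exp(logstd)) - log N(A | MEANNOW, exp(LOGSTDNOW))),
        # masked per dimension by DIMSEL; the full ratio is the product over dimensions.)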
        ratio = tf.exp((-0.5 * tf.square(
            (A - mean) / tf.exp(logstd)) - logstd + 0.5 * tf.square(
                (A - MEANNOW) / tf.exp(LOGSTDNOW)) + LOGSTDNOW) *
                       DIMSEL)  #* tf.minimum(1.0,RHO_NOW)
        r = tf.reduce_prod(ratio, axis=-1)
        # Defining Loss = - J is equivalent to max J
        pg_losses = -tf.reduce_prod(ratio,
                                    axis=-1) * ADV  #* tf.minimum(1.0,RHO_NOW)

        pg_losses2 = -tf.reduce_prod(tf.clip_by_value(ratio, 1.0 - CLIPRANGE,
                                                      1.0 + CLIPRANGE),
                                     axis=-1) * ADV  #* tf.minimum(1.0,RHO_NOW)

        # Final PG loss
        # pg_loss = tf.reduce_mean(tf.stop_gradient(tf.maximum(pg_losses, pg_losses2))*(-neglogpac)) + .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))

        approxkl = .5 * tf.reduce_mean(
            tf.square(neglogpac - OLDNEGLOGPAC) * KL_REST)
        approxoldkl = .5 * tf.reduce_mean(
            tf.square(neglogpac - OLDNEGLOGPAC - tf.log(RHO_NOW)))
        kloldnew = tf.reduce_mean(
            tf.reduce_sum(
                logstd - LOGSTDNOW + 0.5 *
                (tf.square(tf.exp(LOGSTDNOW)) + tf.square(mean - MEANNOW)) /
                tf.square(tf.exp(logstd)) - 0.5,
                axis=1))
        clipfrac = tf.reduce_mean(
            tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))
        pg_loss = tf.reduce_mean(tf.maximum(
            pg_losses,
            pg_losses2)) + KLCONST * approxkl  # * tf.minimum(1.0,RHO_NOW))
        # Total loss
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        # UPDATE THE PARAMETERS USING LOSS
        # 1. Get the model parameters
        params = tf.trainable_variables('ppo2_model')
        # 2. Build our trainer
        if MPI is not None:
            trainer = MpiAdamOptimizer(MPI.COMM_WORLD,
                                       learning_rate=LR,
                                       epsilon=1e-5)
        else:
            trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)
        # 3. Calculate the gradients
        grads_and_var = trainer.compute_gradients(loss, params)
        grads, var = zip(*grads_and_var)

        if max_grad_norm is not None:
            # Clip the gradients (normalize)
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads_and_var = list(zip(grads, var))
        # zip pairs each gradient with its associated parameter
        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

        _train = trainer.apply_gradients(grads_and_var)

        def train(lr,
                  cliprange,
                  klconst,
                  dimsel,
                  obs,
                  returns,
                  advs,
                  masks,
                  actions,
                  values,
                  neglogpacs,
                  mean_now,
                  logstd_now,
                  rho_now,
                  kl_rest,
                  states=None):
            # Here we calculate the advantage A(s,a) = R + gamma * V(s') - V(s)
            # Returns = R + gamma * V(s')
            # Normalize the advantages
            advs = (advs - advs.mean()) / (advs.std() + 1e-8)
            td_map = {
                train_model.X: obs,
                A: actions,
                ADV: advs,
                R: returns,
                LR: lr,
                CLIPRANGE: cliprange,
                OLDNEGLOGPAC: neglogpacs,
                OLDVPRED: values,
                MEANNOW: mean_now,
                LOGSTDNOW: logstd_now,
                KLCONST: klconst,
                RHO_NOW: rho_now,
                KL_REST: kl_rest,
                DIMSEL: dimsel
            }
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            return sess.run([
                pg_loss, vf_loss, entropy, approxkl, clipfrac, kloldnew,
                approxoldkl, _train
            ], td_map)[:-1]

        self.loss_names = [
            'policy_loss', 'value_loss', 'policy_entropy', 'approxkl',
            'clipfrac', 'kloldnew', 'approxoldkl'
        ]

        self.train = train
        self.train_model = train_model
        self.act_model = act_model
        self.step = act_model.step
        self.meanlogstd = act_model.meanlogstd
        self.value = act_model.value
        self.values = train_model.value
        self.meanlogstds = train_model.meanlogstd
        self.initial_state = act_model.initial_state

        self.save = functools.partial(save_variables, sess=sess)
        self.load = functools.partial(load_variables, sess=sess)

        if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
            initialize()
        global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                             scope="")

        if MPI is not None:
            sync_from_root(sess, global_variables)  #pylint: disable=E1101
Example #10
    def __init__(self, ob_space, ac_space, max_grad_norm, beta, icm_lr_scale):

        sess = get_session()

        #TODO find a better way
        input_shape = [ob_space.shape[0], ob_space.shape[1], ob_space.shape[2]]
        self.action_shape = 36

        # Placeholders
        self.state_ = phi_state = tf.placeholder(tf.float32,
                                                 [None, *input_shape],
                                                 name="icm_state")
        self.next_state_ = phi_next_state = tf.placeholder(
            tf.float32, [None, *input_shape], name="icm_next_state")
        self.action_ = action = tf.placeholder(tf.float32, [None],
                                               name="icm_action")

        with tf.variable_scope('icm_model'):
            # Feature encoding
            # Aka pass state and next_state to create phi(state), phi(next_state)
            # state --> phi(state)
            phi_state = self.feature_encoding(self.state_)

            with tf.variable_scope(tf.get_variable_scope(),
                                   reuse=tf.AUTO_REUSE):
                # next_state to phi(next_state)
                phi_next_state = self.feature_encoding(self.next_state_)

            # INVERSE MODEL
            pred_actions_logits, pred_actions_prob = self.inverse_model(
                phi_state, phi_next_state)

            # FORWARD MODEL
            pred_phi_next_state = self.forward_model(action, phi_state)

        # CALCULATE THE ICM LOSS
        # Inverse Loss LI
        # We calculate the cross entropy between the predicted action ât and the true action at
        # Cast the actions to integer labels (required by the cross-entropy op)
        labels = tf.cast(action, tf.int32)

        self.inv_loss = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=pred_actions_logits, labels=labels),
            name="inverse_loss")

        # Forward Loss
        # LF = 1/2 || pred_phi_next_state - phi_next_state ||^2
        # TODO 0.5 * ?
        self.forw_loss = tf.reduce_mean(tf.square(
            tf.subtract(pred_phi_next_state, phi_next_state)),
                                        name="forward_loss")

        # Todo predictor lr scale ?
        # ICM_LOSS = [(1 - beta) * LI + beta * LF ] * Predictor_Lr_scale
        self.icm_loss = (
            (1 - beta) * self.inv_loss + beta * self.forw_loss) * icm_lr_scale

        # UPDATE THE PARAMETERS USING LOSS
        # 1. Get the model parameters
        icm_params = tf.trainable_variables('icm_model')
        # 2. Build our trainer
        icm_trainer = MpiAdamOptimizer(MPI.COMM_WORLD,
                                       learning_rate=1e-4,
                                       epsilon=1e-5)
        # 3. Calculate the gradients
        icm_grads_and_var = icm_trainer.compute_gradients(
            self.icm_loss, icm_params)
        icm_grads, icm_var = zip(*icm_grads_and_var)

        if max_grad_norm is not None:
            # Clip the gradients (normalize)
            icm_grads, icm__grad_norm = tf.clip_by_global_norm(
                icm_grads, max_grad_norm)
        icm_grads_and_var = list(zip(icm_grads, icm_var))
        # zip pairs each gradient with its associated parameter
        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

        _icm_train = icm_trainer.apply_gradients(icm_grads_and_var)

        if MPI.COMM_WORLD.Get_rank() == 0:
            print("Initialize")
            initialize()
        global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                             scope="")
        print("GLOBAL VARIABLES", global_variables)
        sync_from_root(sess, global_variables)  #pylint: disable=E1101
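The inverse-model loss above is a standard sparse softmax cross entropy between the predicted action logits and the taken action. A tiny standalone sketch with toy shapes and values (illustrative only):

import tensorflow as tf  # TF 1.x API

pred_actions_logits = tf.constant([[2.0, 0.5, -1.0]])  # [batch, n_actions]
action = tf.constant([0.0])                            # taken action, stored as float
labels = tf.cast(action, tf.int32)                     # integer class ids
inv_loss = tf.reduce_mean(
    tf.nn.sparse_softmax_cross_entropy_with_logits(logits=pred_actions_logits,
                                                   labels=labels))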
Example #11
    def __init__(self, actor, critic, memory, observation_shape, action_shape, param_noise=None, action_noise=None,
        gamma=0.99, tau=0.001, normalize_returns=False, enable_popart=False, normalize_observations=True,
        batch_size=128, observation_range=(-5., 5.), action_range=(-1., 1.), return_range=(-np.inf, np.inf),
        critic_l2_reg=0., actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1.):

        # Parameters.
        self.gamma = gamma
        self.tau = tau
        self.memory = memory
        self.normalize_observations = normalize_observations
        self.normalize_returns = normalize_returns
        self.action_noise = action_noise
        self.param_noise = param_noise
        self.action_range = action_range
        self.return_range = return_range
        self.observation_range = observation_range
        self.observation_shape = observation_shape
        self.critic = critic
        self.actor = actor
        self.clip_norm = clip_norm
        self.enable_popart = enable_popart
        self.reward_scale = reward_scale
        self.batch_size = batch_size
        self.stats_sample = None
        self.critic_l2_reg = critic_l2_reg
        self.actor_lr = tf.constant(actor_lr)
        self.critic_lr = tf.constant(critic_lr)

        # Observation normalization.
        if self.normalize_observations:
            with tf.name_scope('obs_rms'):
                self.obs_rms = RunningMeanStd(shape=observation_shape)
        else:
            self.obs_rms = None

        # Return normalization.
        if self.normalize_returns:
            with tf.name_scope('ret_rms'):
                self.ret_rms = RunningMeanStd()
        else:
            self.ret_rms = None

        # Create target networks.
        self.target_critic = Critic(actor.nb_actions, observation_shape, name='target_critic', network=critic.network, **critic.network_kwargs)
        self.target_actor = Actor(actor.nb_actions, observation_shape, name='target_actor', network=actor.network, **actor.network_kwargs)

        # Set up parts.
        if self.param_noise is not None:
            self.setup_param_noise()

        if MPI is not None:
            comm = MPI.COMM_WORLD
            self.actor_optimizer = MpiAdamOptimizer(comm, self.actor.trainable_variables)
            self.critic_optimizer = MpiAdamOptimizer(comm, self.critic.trainable_variables)
        else:
            self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate=actor_lr)
            self.critic_optimizer = tf.keras.optimizers.Adam(learning_rate=critic_lr)

        logger.info('setting up actor optimizer')
        actor_shapes = [var.get_shape().as_list() for var in self.actor.trainable_variables]
        actor_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
        logger.info('  actor shapes: {}'.format(actor_shapes))
        logger.info('  actor params: {}'.format(actor_nb_params))
        logger.info('setting up critic optimizer')
        critic_shapes = [var.get_shape().as_list() for var in self.critic.trainable_variables]
        critic_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in critic_shapes])
        logger.info('  critic shapes: {}'.format(critic_shapes))
        logger.info('  critic params: {}'.format(critic_nb_params))
        if self.critic_l2_reg > 0.:
            critic_reg_vars = []
            for layer in self.critic.network_builder.layers[1:]:
                critic_reg_vars.append(layer.kernel)
            for var in critic_reg_vars:
                logger.info('  regularizing: {}'.format(var.name))
            logger.info('  applying l2 regularization with {}'.format(self.critic_l2_reg))

        logger.info('setting up critic target updates ...')
        for var, target_var in zip(self.critic.variables, self.target_critic.variables):
            logger.info('  {} <- {}'.format(target_var.name, var.name))
        logger.info('setting up actor target updates ...')
        for var, target_var in zip(self.actor.variables, self.target_actor.variables):
            logger.info('  {} <- {}'.format(target_var.name, var.name))

        if self.param_noise:
            logger.info('setting up param noise')
            for var, perturbed_var in zip(self.actor.variables, self.perturbed_actor.variables):
                if var in actor.perturbable_vars:
                    logger.info('  {} <- {} + noise'.format(perturbed_var.name, var.name))
                else:
                    logger.info('  {} <- {}'.format(perturbed_var.name, var.name))
            for var, perturbed_var in zip(self.actor.variables, self.perturbed_adaptive_actor.variables):
                if var in actor.perturbable_vars:
                    logger.info('  {} <- {} + noise'.format(perturbed_var.name, var.name))
                else:
                    logger.info('  {} <- {}'.format(perturbed_var.name, var.name))

        if self.normalize_returns and self.enable_popart:
            self.setup_popart()

        self.initial_state = None # recurrent architectures not supported yet
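The critic/actor target updates logged above are, in DDPG, normally soft (Polyak) updates controlled by tau. This is a generic TF2 sketch of that rule, not the exact implementation used by this class:

import tensorflow as tf

def soft_update(variables, target_variables, tau=0.001):
    # target <- tau * source + (1 - tau) * target
    for var, target_var in zip(variables, target_variables):
        target_var.assign(tau * var + (1.0 - tau) * target_var)

# e.g. soft_update(self.critic.variables, self.target_critic.variables, self.tau)  # hypothetical call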
Example #12
    def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train,
                nsteps, ent_coef, vf_coef, max_grad_norm, microbatch_size=None):
        self.sess = sess = get_session()

        with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE):
            # CREATE OUR TWO MODELS
            # act_model that is used for sampling
            act_model = policy(nbatch_act, 1, sess)

            # Train model for training
            if microbatch_size is None:
                train_model = policy(nbatch_train, nsteps, sess)
            else:
                train_model = policy(microbatch_size, nsteps, sess)

        # CREATE THE PLACEHOLDERS
        self.A = A = train_model.pdtype.sample_placeholder([None])
        self.ADV = ADV = tf.placeholder(tf.float32, [None])
        self.R = R = tf.placeholder(tf.float32, [None])
        # Keep track of old actor
        self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
        # Keep track of old critic
        self.OLDVPRED = OLDVPRED = tf.placeholder(tf.float32, [None])
        self.LR = LR = tf.placeholder(tf.float32, [])
        # Cliprange
        self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, [])

        neglogpac = train_model.pd.neglogp(A)

        # Calculate the entropy
        # Entropy is used to improve exploration by limiting premature convergence to a suboptimal policy.
        entropy = tf.reduce_mean(train_model.pd.entropy())

        # CALCULATE THE LOSS
        # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss

        # Clip the value to reduce variability during Critic training
        # Get the predicted value
        vpred = train_model.vf
        vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, - CLIPRANGE, CLIPRANGE)
        # Unclipped value
        vf_losses1 = tf.square(vpred - R)
        # Clipped value
        vf_losses2 = tf.square(vpredclipped - R)

        vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))

        # Calculate ratio (pi current policy / pi old policy)
        ratio = tf.exp(OLDNEGLOGPAC - neglogpac)

        # Defining Loss = - J is equivalent to max J
        pg_losses = -ADV * ratio

        pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE)

        # Final PG loss
        pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
        approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
        clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))

        # Total loss
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        # UPDATE THE PARAMETERS USING LOSS
        # 1. Get the model parameters
        params = tf.trainable_variables('ppo2_model')
        # 2. Build our trainer
        if MPI is not None:
            self.trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, epsilon=1e-5)
        else:
            self.trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)
        # 3. Calculate the gradients
        grads_and_var = self.trainer.compute_gradients(loss, params)
        grads, var = zip(*grads_and_var)

        if max_grad_norm is not None:
            # Clip the gradients (normalize)
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads_and_var = list(zip(grads, var))
        # zip pairs each gradient with its associated parameter
        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

        self.grads = grads
        self.var = var
        self._train_op = self.trainer.apply_gradients(grads_and_var)
        self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac']
        self.stats_list = [pg_loss, vf_loss, entropy, approxkl, clipfrac]


        self.train_model = train_model
        self.act_model = act_model
        self.step = act_model.step
        self.value = act_model.value
        self.initial_state = act_model.initial_state

        self.save = functools.partial(save_variables, sess=sess)
        self.load = functools.partial(load_variables, sess=sess)

        initialize()
        global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="")
        if MPI is not None:
            sync_from_root(sess, global_variables) #pylint: disable=E1101
Example #13
class Model(tf.Module):
    """
    We use this object to:
    __init__:
    - Creates the step_model
    - Creates the train_model

    train():
    - Runs the training part (feedforward and backpropagation of gradients)

    save/load():
    - Saves/loads the model
    """
    def __init__(self,
                 *,
                 ac_space,
                 policy_network,
                 value_network=None,
                 ent_coef,
                 vf_coef,
                 max_grad_norm):
        super(Model, self).__init__(name='PPO2Model')
        self.train_model = PolicyWithValue(ac_space,
                                           policy_network,
                                           value_network,
                                           estimate_q=False)
        if MPI is not None:
            self.optimizer = MpiAdamOptimizer(
                MPI.COMM_WORLD, self.train_model.trainable_variables)
        else:
            self.optimizer = tf.keras.optimizers.Adam()
        self.ent_coef = ent_coef
        self.vf_coef = vf_coef
        self.max_grad_norm = max_grad_norm
        self.step = self.train_model.step
        self.mode = self.train_model.mode
        self.value = self.train_model.value
        self.initial_state = self.train_model.initial_state
        self.loss_names = [
            'policy_loss', 'value_loss', 'policy_entropy', 'approxkl',
            'clipfrac'
        ]
        if MPI is not None:
            sync_from_root(self.variables)

    def train(self,
              lr,
              cliprange,
              obs,
              returns,
              masks,
              actions,
              values,
              neglogpac_old,
              states=None):
        grads, pg_loss, vf_loss, entropy, approxkl, clipfrac = self.get_grad(
            cliprange, obs, returns, masks, actions, values, neglogpac_old)
        if MPI is not None:
            self.optimizer.apply_gradients(grads, lr)
        else:
            self.optimizer.learning_rate = lr
            grads_and_vars = zip(grads, self.train_model.trainable_variables)
            self.optimizer.apply_gradients(grads_and_vars)

        return pg_loss, vf_loss, entropy, approxkl, clipfrac

    @tf.function
    def get_grad(self, cliprange, obs, returns, masks, actions, values,
                 neglogpac_old):
        # Here we calculate the advantage A(s,a) = R + gamma * V(s') - V(s)
        # Returns = R + gamma * V(s')
        advs = returns - values

        # Normalize the advantages
        advs = (advs - tf.reduce_mean(advs)) / (tf.keras.backend.std(advs) +
                                                1e-8)

        with tf.GradientTape() as tape:
            policy_latent = self.train_model.policy_network(obs)
            pd, _ = self.train_model.pdtype.pdfromlatent(policy_latent)
            neglogpac = pd.neglogp(actions)
            entropy = tf.reduce_mean(pd.entropy())
            vpred = self.train_model.value(obs)
            vpredclipped = values + tf.clip_by_value(vpred - values,
                                                     -cliprange, cliprange)
            vf_losses1 = tf.square(vpred - returns)
            vf_losses2 = tf.square(vpredclipped - returns)
            vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))

            ratio = tf.exp(neglogpac_old - neglogpac)
            pg_losses1 = -advs * ratio
            pg_losses2 = -advs * tf.clip_by_value(ratio, 1 - cliprange,
                                                  1 + cliprange)
            pg_loss = tf.reduce_mean(tf.maximum(pg_losses1, pg_losses2))

            approxkl = .5 * tf.reduce_mean(
                tf.square(neglogpac - neglogpac_old))
            clipfrac = tf.reduce_mean(
                tf.cast(tf.greater(tf.abs(ratio - 1.0), cliprange),
                        tf.float32))

            loss = pg_loss - entropy * self.ent_coef + vf_loss * self.vf_coef

        var_list = self.train_model.trainable_variables
        grads = tape.gradient(loss, var_list)
        if self.max_grad_norm is not None:
            grads, _ = tf.clip_by_global_norm(grads, self.max_grad_norm)
        if MPI is not None:
            grads = tf.concat([tf.reshape(g, (-1, )) for g in grads], axis=0)
        return grads, pg_loss, vf_loss, entropy, approxkl, clipfrac
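get_grad above follows the usual TF2 recipe: record the forward pass under tf.GradientTape, differentiate the scalar loss with respect to the trainable variables, then clip by global norm. A minimal standalone version with a toy variable and loss:

import tensorflow as tf  # TF 2.x, eager mode

w = tf.Variable([1.0, -2.0])
optimizer = tf.keras.optimizers.Adam(1e-3)
with tf.GradientTape() as tape:
    toy_loss = tf.reduce_sum(tf.square(w))  # stands in for the PPO loss
grads = tape.gradient(toy_loss, [w])
grads, _ = tf.clip_by_global_norm(grads, 0.5)
optimizer.apply_gradients(zip(grads, [w]))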
Example #14
    def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train,
                 nsteps, ent_coef, vf_coef, max_grad_norm):
        sess = get_session()

        with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE):
            act_model = policy(nbatch_act, 1, sess)
            train_model = policy(nbatch_train, nsteps, sess)

        A = train_model.pdtype.sample_placeholder([None])
        ADV = tf.placeholder(tf.float32, [None])
        R = tf.placeholder(tf.float32, [None])
        OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
        OLDVPRED = tf.placeholder(tf.float32, [None])
        LR = tf.placeholder(tf.float32, [])
        CLIPRANGE = tf.placeholder(tf.float32, [])

        neglogpac = train_model.pd.neglogp(A)
        entropy = tf.reduce_mean(train_model.pd.entropy())

        vpred = train_model.vf
        vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED,
                                                   -CLIPRANGE, CLIPRANGE)
        vf_losses1 = tf.square(vpred - R)
        vf_losses2 = tf.square(vpredclipped - R)
        vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))
        ratio = tf.exp(OLDNEGLOGPAC - neglogpac)
        pg_losses = -ADV * ratio
        pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE,
                                             1.0 + CLIPRANGE)
        pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
        approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
        clipfrac = tf.reduce_mean(
            tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef
        params = tf.trainable_variables('ppo2_model')
        trainer = MpiAdamOptimizer(MPI.COMM_WORLD,
                                   learning_rate=LR,
                                   epsilon=1e-5)
        grads_and_var = trainer.compute_gradients(loss, params)
        grads, var = zip(*grads_and_var)

        if max_grad_norm is not None:
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads_and_var = list(zip(grads, var))

        _train = trainer.apply_gradients(grads_and_var)

        def train(lr,
                  cliprange,
                  obs,
                  returns,
                  masks,
                  actions,
                  values,
                  neglogpacs,
                  states=None):
            advs = returns - values
            advs = (advs - advs.mean()) / (advs.std() + 1e-8)
            td_map = {
                train_model.X: obs,
                A: actions,
                ADV: advs,
                R: returns,
                LR: lr,
                CLIPRANGE: cliprange,
                OLDNEGLOGPAC: neglogpacs,
                OLDVPRED: values
            }
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            return sess.run(
                [pg_loss, vf_loss, entropy, approxkl, clipfrac, _train],
                td_map)[:-1]

        self.loss_names = [
            'policy_loss', 'value_loss', 'policy_entropy', 'approxkl',
            'clipfrac'
        ]

        self.train = train
        self.train_model = train_model
        self.act_model = act_model
        self.step = act_model.step
        self.value = act_model.value
        self.initial_state = act_model.initial_state

        self.save = functools.partial(save_variables, sess=sess)
        self.load = functools.partial(load_variables, sess=sess)

        if MPI.COMM_WORLD.Get_rank() == 0:
            initialize()
        global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                             scope="")
        sync_from_root(sess, global_variables)  #pylint: disable=E1101
Example #15
class Model(object):
    def __init__(self,
                 *,
                 network,
                 env,
                 lr=3e-4,
                 cliprange=0.2,
                 nsteps=128,
                 nminibatches=4,
                 noptepochs=4,
                 ent_coef=0.0,
                 vf_coef=0.5,
                 max_grad_norm=0.5,
                 gamma=0.99,
                 lam=0.95,
                 mpi_rank_weight=1,
                 comm=None,
                 microbatch_size=None,
                 load_path=None,
                 **network_kwargs):
        """
        Parameters:
        ----------

        network:                          policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
                                          specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
                                          tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
                                          neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
                                          See common/models.py/lstm for more details on using recurrent nets in the policies

        env: baselines.common.vec_env.VecEnv     environment. Needs to be vectorized for parallel environment simulation.
                                          The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class.


        lr: float or function             learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the
                                          training and 0 is the end of the training.

        cliprange: float or function      clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training
                                          and 0 is the end of the training

        nsteps: int                       number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
                                          nenv is number of environment copies simulated in parallel)


        nminibatches: int                 number of training minibatches per update. For recurrent policies,
                                          should be less than or equal to the number of environments run in parallel.

        noptepochs: int                   number of training epochs per update

        ent_coef: float                   policy entropy coefficient in the optimization objective

        vf_coef: float                    value function loss coefficient in the optimization objective

        gamma: float                      discounting factor

        lam: float                        advantage estimation discounting factor (lambda in the paper)

        log_interval: int                 number of timesteps between logging events

        load_path: str                    path to load the model from

        **network_kwargs:                 keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
                                          For instance, 'mlp' network architecture has arguments num_hidden and num_layers.

        """

        self.sess = sess = get_session()

        if MPI is not None and comm is None:
            comm = MPI.COMM_WORLD

        policy = build_policy(env, network, **network_kwargs)

        self.env = env

        if isinstance(lr, float):
            self.lr = constfn(lr)
        else:
            assert callable(lr)
        if isinstance(cliprange, float):
            self.cliprange = constfn(cliprange)
        else:
            assert callable(cliprange)
        self.nminibatches = nminibatches

        # if eval_env is not None:
        #     eval_runner = Runner(env=eval_env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)

        # Calculate the batch_size
        self.nenvs = self.env.num_envs
        self.nsteps = nsteps
        self.nbatch = self.nenvs * self.nsteps
        self.nbatch_train = self.nbatch // nminibatches
        self.noptepochs = noptepochs

        with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE):
            # CREATE OUR TWO MODELS
            # act_model that is used for sampling
            act_model = policy(self.nenvs, 1, sess)

            # Train model for training
            if microbatch_size is None:
                train_model = policy(self.nbatch_train, nsteps, sess)
            else:
                train_model = policy(microbatch_size, nsteps, sess)

        # CREATE THE PLACEHOLDERS
        self.A = A = train_model.pdtype.sample_placeholder(
            [None])  # action placeholder
        self.ADV = ADV = tf.placeholder(tf.float32, [None])
        self.R = R = tf.placeholder(tf.float32, [None])
        # Keep track of old actor
        self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
        # Keep track of old critic
        self.OLDVPRED = OLDVPRED = tf.placeholder(tf.float32, [None])
        self.LR = LR = tf.placeholder(tf.float32, [])
        # Cliprange
        self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, [])

        neglogpac = train_model.pd.neglogp(A)

        # Calculate the entropy
        # Entropy is used to improve exploration by limiting premature convergence to a suboptimal policy.
        entropy = tf.reduce_mean(train_model.pd.entropy())

        # CALCULATE THE LOSS
        # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss

        # Clip the value to reduce variability during Critic training
        # Get the predicted value
        vpred = train_model.vf
        vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED,
                                                   -CLIPRANGE, CLIPRANGE)
        # Unclipped value
        vf_losses1 = tf.square(vpred - R)
        # Clipped value
        vf_losses2 = tf.square(vpredclipped - R)

        vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))

        # Calculate ratio (pi current policy / pi old policy)
        ratio = tf.exp(OLDNEGLOGPAC - neglogpac)

        # Defining Loss = - J is equivalent to max J
        pg_losses = -ADV * ratio

        pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE,
                                             1.0 + CLIPRANGE)

        # Final PG loss
        pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
        approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
        clipfrac = tf.reduce_mean(
            tf.to_float(tf.greater(tf.abs(ratio - 1.0),
                                   CLIPRANGE)))  # fraction of clipped ratios

        # Total loss
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        # UPDATE THE PARAMETERS USING LOSS

        # 1. Get the model parameters
        params = tf.trainable_variables('ppo2_model')
        # 2. Build our trainer
        if comm is not None and comm.Get_size() > 1:
            self.trainer = MpiAdamOptimizer(comm,
                                            learning_rate=LR,
                                            mpi_rank_weight=mpi_rank_weight,
                                            epsilon=1e-5)
        else:
            self.trainer = tf.train.AdamOptimizer(learning_rate=LR,
                                                  epsilon=1e-5)
        # 3. Calculate the gradients
        grads_and_var = self.trainer.compute_gradients(loss, params)
        grads, var = zip(*grads_and_var)

        if max_grad_norm is not None:
            # Clip the gradients (normalize)
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads_and_var = list(zip(grads, var))
        # zip pairs each gradient with its associated parameter
        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

        self.grads = grads
        self.var = var
        self._train_op = self.trainer.apply_gradients(grads_and_var)
        self.loss_names = [
            'policy_loss', 'value_loss', 'policy_entropy', 'approxkl',
            'clipfrac'
        ]
        self.stats_list = [pg_loss, vf_loss, entropy, approxkl, clipfrac]

        self.train_model = train_model
        self.act_model = act_model
        self.step = act_model.step
        self.value = act_model.value
        self.initial_state = act_model.initial_state
        self.def_path_pre = os.path.dirname(
            os.path.abspath(__file__)) + '/tmp/'

        initialize()
        global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                             scope="")
        if MPI is not None:
            sync_from_root(sess, global_variables, comm=comm)  # pylint: disable=E1101

        if load_path is not None:
            self.load_newest(load_path)

        # Instantiate the runner object
        self.runner = Runner(env=self.env,
                             model=self,
                             nsteps=nsteps,
                             gamma=gamma,
                             lam=lam)

    def train(self,
              lr,
              cliprange,
              obs,
              returns,
              masks,
              actions,
              values,
              neglogpacs,
              states=None):
        # Here we calculate the advantage A(s,a) = R + gamma * V(s') - V(s)
        # Returns = R + gamma * V(s')
        advs = returns - values

        # Normalize the advantages
        advs = (advs - advs.mean()) / (advs.std() + 1e-8)

        td_map = {
            self.train_model.X: obs,
            self.A: actions,
            self.ADV: advs,
            self.R: returns,
            self.LR: lr,
            self.CLIPRANGE: cliprange,
            self.OLDNEGLOGPAC: neglogpacs,
            self.OLDVPRED: values
        }
        if states is not None:
            td_map[self.train_model.S] = states
            td_map[self.train_model.M] = masks

        return self.sess.run(self.stats_list + [self._train_op], td_map)[:-1]

    def learn(self,
              total_timesteps,
              seed=None,
              log_interval=10,
              save_interval=10):

        set_global_seeds(seed)
        total_timesteps = int(total_timesteps)

        # Calculate the batch_size
        is_mpi_root = (MPI is None or MPI.COMM_WORLD.Get_rank() == 0)

        epinfobuf = deque(maxlen=100)
        # if eval_env is not None:
        #     eval_epinfobuf = deque(maxlen=100)

        # Start total timer
        tfirststart = time.perf_counter()

        for update in range(1, total_timesteps):
            assert self.nbatch % self.nminibatches == 0
            # Start timer
            tstart = time.perf_counter()
            frac = 1.0 - (update - 1.0) / total_timesteps
            # Calculate the learning rate
            lrnow = self.lr(frac)
            # Calculate the cliprange
            cliprangenow = self.cliprange(frac)
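            # Note: self.lr and self.cliprange are callables mapping the remaining-progress
            # fraction to a value; a linear decay such as lr = lambda frac: 2.5e-4 * frac is
            # one possible choice (illustrative value, not taken from this code).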

            if update % log_interval == 0 and is_mpi_root:
                logger.info('Stepping environment...')

            # Get minibatch
            obs, returns, masks, actions, values, neglogpacs, states, epinfos = self.runner.run(
            )  # pylint: disable=E0632
            # if eval_env is not None:
            #     eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run()  # pylint: disable=E0632

            if update % log_interval == 0 and is_mpi_root:
                logger.info('Done.')

            epinfobuf.extend(epinfos)
            # if eval_env is not None:
            #     eval_epinfobuf.extend(eval_epinfos)

            # For each minibatch, compute the losses and append them.
            mblossvals = []
            if states is None:  # nonrecurrent version
                # Create the array of batch indices
                inds = np.arange(self.nbatch)
                for _ in range(self.noptepochs):
                    # Randomize the indexes
                    np.random.shuffle(inds)
                    # Iterate from 0 to nbatch in steps of nbatch_train
                    for start in range(0, self.nbatch, self.nbatch_train):
                        end = start + self.nbatch_train
                        mbinds = inds[start:end]
                        slices = (arr[mbinds]
                                  for arr in (obs, returns, masks, actions,
                                              values, neglogpacs))
                        mblossvals.append(
                            self.train(lrnow, cliprangenow, *slices))
            else:  # recurrent version
                assert self.nenvs % self.nminibatches == 0
                envsperbatch = self.nenvs // self.nminibatches
                envinds = np.arange(self.nenvs)
                flatinds = np.arange(self.nenvs * self.nsteps).reshape(
                    self.nenvs, self.nsteps)
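                # Illustrative note (values made up, not from the original code): with
                # nenvs = 4 and nsteps = 3, flatinds is [[0, 1, 2], [3, 4, 5], [6, 7, 8],
                # [9, 10, 11]], so picking mbenvinds = [1, 3] below yields
                # mbflatinds = [3, 4, 5, 9, 10, 11], keeping each env's trajectory contiguous.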
                for _ in range(self.noptepochs):
                    np.random.shuffle(envinds)
                    for start in range(0, self.nenvs, envsperbatch):
                        end = start + envsperbatch
                        mbenvinds = envinds[start:end]
                        mbflatinds = flatinds[mbenvinds].ravel()
                        slices = (arr[mbflatinds]
                                  for arr in (obs, returns, masks, actions,
                                              values, neglogpacs))
                        mbstates = states[mbenvinds]
                        mblossvals.append(
                            self.train(lrnow, cliprangenow, *slices, mbstates))

            # Feedforward --> get losses --> update
            lossvals = np.mean(mblossvals, axis=0)
            # End timer
            tnow = time.perf_counter()
            # Calculate the fps (frames per second)
            fps = int(self.nbatch / (tnow - tstart))

            if update % log_interval == 0 or update == 1:
                # Check whether the value function is a good predictor of the returns (ev close to 1)
                # or worse than predicting nothing (ev <= 0)
                ev = explained_variance(values, returns)
                logger.record_tabular("misc/serial_timesteps",
                                      update * self.nsteps)
                logger.record_tabular("misc/nupdates", update)
                logger.record_tabular("misc/total_timesteps",
                                      update * self.nbatch)
                logger.record_tabular("fps", fps)
                logger.record_tabular("misc/explained_variance", float(ev))
                logger.record_tabular(
                    'eprewmean',
                    safe_mean([epinfo['r'] for epinfo in epinfobuf]))
                logger.record_tabular(
                    'eplenmean',
                    safe_mean([epinfo['l'] for epinfo in epinfobuf]))
                # if eval_env is not None:
                #   logger.record_tabular('eval_eprewmean', safe_mean([epinfo['r'] for epinfo in eval_epinfobuf]))
                #   logger.record_tabular('eval_eplenmean', safe_mean([epinfo['l'] for epinfo in eval_epinfobuf]))
                logger.record_tabular('misc/time_elapsed', tnow - tfirststart)
                for (lossval, lossname) in zip(lossvals, self.loss_names):
                    logger.record_tabular('loss/' + lossname, lossval)

                if is_mpi_root:
                    logger.dump_tabular()

            if save_interval and (update % save_interval == 0
                                  or update == 1) and is_mpi_root:
                file_name = time.strftime('Y%YM%mD%d_h%Hm%Ms%S',
                                          time.localtime(time.time()))
                model_save_path = self.def_path_pre + file_name
                self.save(model_save_path)

        return self

    def save(self, save_path=None):
        save_variables(save_path=save_path, sess=self.sess)
        print('save model variables to', save_path)

    def load_newest(self, load_path=None):
        file_list = os.listdir(self.def_path_pre)
        file_list.sort(
            key=lambda x: os.path.getmtime(os.path.join(self.def_path_pre, x)))
        if load_path is None:
            load_path = os.path.join(self.def_path_pre, file_list[-1])
        load_variables(load_path=load_path, sess=self.sess)
        print('load_path: ', load_path)

    def load_index(self, index, load_path=None):
        file_list = os.listdir(self.def_path_pre)
        file_list.sort(
            key=lambda x: os.path.getmtime(os.path.join(self.def_path_pre, x)),
            reverse=True)
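        # With the file list sorted newest-first (reverse=True), index 0 loads the most
        # recent checkpoint, index 1 the second most recent, and so on.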
        if load_path is None:
            load_path = os.path.join(self.def_path_pre, file_list[index])
        load_variables(load_path=load_path, sess=self.sess)
        print('load_path: ', load_path)
Exemple #16
0
    def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train,
                nsteps, ent_coef, vf_coef, max_grad_norm):
        sess = get_session()

        with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE):
            # CREATE OUR TWO MODELS
            # act_model that is used for sampling
            act_model = policy(nbatch_act, 1, sess)

            # Train model for training
            train_model = policy(nbatch_train, nsteps, sess)

        # CREATE THE PLACEHOLDERS
        A = train_model.pdtype.sample_placeholder([None])
        ADV = tf.placeholder(tf.float32, [None])
        R = tf.placeholder(tf.float32, [None])
        # Keep track of old actor
        OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
        # Keep track of old critic
        OLDVPRED = tf.placeholder(tf.float32, [None])
        LR = tf.placeholder(tf.float32, [])
        # Cliprange
        CLIPRANGE = tf.placeholder(tf.float32, [])

        neglogpac = train_model.pd.neglogp(A)

        # Calculate the entropy
        # Entropy is used to improve exploration by limiting premature convergence to a suboptimal policy.
        entropy = tf.reduce_mean(train_model.pd.entropy())

        # CALCULATE THE LOSS
        # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss

        # Clip the value
        # Get the value predicted
        vpred = train_model.vf
        # Clip the value = Oldvalue + clip(value - oldvalue, min = -cliprange, max = cliprange)
        vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, - CLIPRANGE, CLIPRANGE)
        # Unclipped value
        vf_losses1 = tf.square(vpred - R)
        # Clipped value
        vf_losses2 = tf.square(vpredclipped - R)
        # Value loss = 0.5 * mean(max(unclipped, clipped))
        vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))

        # We want the ratio (pi current policy / pi old policy),
        # but neglogpac gives us -log(policy), so we recover the ratio via
        # e^(-log old - (-log new)) == e^(log new - log old) == e^(log(new / old)) == new / old
        # (the exponential cancels the log)
        ratio = tf.exp(OLDNEGLOGPAC - neglogpac)
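        # Illustrative example (not from the original code): if OLDNEGLOGPAC = 2.0 and
        # neglogpac = 1.5 for a sample, ratio = exp(0.5) ~= 1.65, i.e. the new policy is
        # about 65% more likely to take that action than the old one.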

        # We are doing gradient ascent: we want to MAXIMIZE the objective J,
        # which is equivalent to minimizing Loss = -J,
        # hence the negation in -(pi new / pi old) * Advantages
        pg_losses = -ADV * ratio

        # Clipped version: the ratio is constrained to [1 - CLIPRANGE, 1 + CLIPRANGE]
        pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE)

        # Final PG loss
        # Why maximum? pg_losses and pg_losses2 are the negated objectives, so taking their
        # maximum corresponds to taking the minimum (the pessimistic bound) of the two objectives
        pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
        approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
        clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))

        # Total loss (remember that minimizing L = -J is the same as maximizing J)
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        # UPDATE THE PARAMETERS USING LOSS
        # 1. Get the model parameters
        params = tf.trainable_variables('ppo2_model')
        # 2. Build our trainer
        trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, epsilon=1e-5)
        # 3. Calculate the gradients
        grads_and_var = trainer.compute_gradients(loss, params)
        grads, var = zip(*grads_and_var)

        if max_grad_norm is not None:
            # Clip the gradients (normalize)
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads_and_var = list(zip(grads, var))
        # zip pairs each gradient with its associated parameter
        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da
        # 4. Backpropagation
        _train = trainer.apply_gradients(grads_and_var)

        def train(lr, cliprange, obs, returns, masks, actions, values, neglogpacs, states=None):
            # Here we calculate the advantage A(s,a) = R + gamma * V(s') - V(s)
            # where Returns = R + gamma * V(s')
            advs = returns - values

            # Normalize the advantages
            advs = (advs - advs.mean()) / (advs.std() + 1e-8)
            td_map = {train_model.X:obs, A:actions, ADV:advs, R:returns, LR:lr,
                    CLIPRANGE:cliprange, OLDNEGLOGPAC:neglogpacs, OLDVPRED:values}
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            return sess.run(
                [pg_loss, vf_loss, entropy, approxkl, clipfrac, _train],
                td_map
            )[:-1]
        self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac']


        self.train = train
        self.train_model = train_model
        self.act_model = act_model
        self.step = act_model.step
        self.value = act_model.value
        self.initial_state = act_model.initial_state

        self.save = functools.partial(save_variables, sess=sess)
        self.load = functools.partial(load_variables, sess=sess)

        if MPI.COMM_WORLD.Get_rank() == 0:
            initialize()
        global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="")
        sync_from_root(sess, global_variables) #pylint: disable=E1101
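
For reference, here is a minimal, self-contained NumPy sketch of the clipped surrogate objective computed in the example above. It is illustrative only and not part of the original listing; the helper name and the toy values are made up.

import numpy as np

def clipped_surrogate(adv, old_neglogp, new_neglogp, cliprange=0.2):
    # ratio = pi_new / pi_old, recovered from the negative log-probabilities
    ratio = np.exp(old_neglogp - new_neglogp)
    pg_losses = -adv * ratio
    pg_losses2 = -adv * np.clip(ratio, 1.0 - cliprange, 1.0 + cliprange)
    # elementwise maximum of the negated objectives = pessimistic (clipped) bound
    return np.mean(np.maximum(pg_losses, pg_losses2))

adv = np.array([1.0, -0.5, 2.0])
old_neglogp = np.array([1.2, 0.7, 2.0])
new_neglogp = np.array([1.0, 0.9, 1.5])
print(clipped_surrogate(adv, old_neglogp, new_neglogp))  # scalar policy-gradient loss
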
class AgentModel(tf.Module):
    def __init__(self, agent, network, nsteps, rho, ent_coef, vf_coef,
                 max_grad_norm, seed, load_path, **network_kwargs):
        super(AgentModel, self).__init__(name='MAPPO2Model')
        set_global_seeds(seed)
        # Get state_space and action_space
        ob_space = agent.observation_space
        ac_space = agent.action_space

        if isinstance(network, str):
            network_type = network
            policy_network_fn = get_network_builder(network_type)(
                **network_kwargs)
            network = policy_network_fn(ob_space.shape)

        self.train_model = PolicyWithValue(ac_space, network)
        if MPI is not None:
            self.optimizer = MpiAdamOptimizer(
                MPI.COMM_WORLD, self.train_model.trainable_variables)
        else:
            self.optimizer = tf.keras.optimizers.Adam()

        # if isinstance(network, str):
        #     network = get_network_builder(network)(**network_kwargs)
        # policy_network = network(ob_space.shape)
        # value_network = network(ob_space.shape)
        # self.train_model = pi = PolicyWithValue(ac_space, policy_network, value_network)
        # self.pi_var_list = policy_network.trainable_variables + list(pi.pdtype.trainable_variables)
        # self.vf_var_list = value_network.trainable_variables + pi.value_fc.trainable_variables

        # if MPI is not None:
        #     self.pi_optimizer = MpiAdamOptimizer(MPI.COMM_WORLD, self.pi_var_list)
        #     self.vf_optimizer = MpiAdamOptimizer(MPI.COMM_WORLD, self.vf_var_list)
        # else:
        #     self.pi_optimizer = tf.keras.optimizers.Adam()
        #     self.vf_optimizer = tf.keras.optimizers.Adam()
        self.agent = agent
        self.nsteps = nsteps
        self.rho = rho
        self.ent_coef = ent_coef
        self.vf_coef = vf_coef
        self.max_grad_norm = max_grad_norm
        self.step = self.train_model.step
        self.value = self.train_model.value
        self.initial_state = self.train_model.initial_state
        self.loss_names = [
            'Lagrange_loss', 'sync_loss', 'policy_loss', 'value_loss',
            'policy_entropy', 'approxkl', 'clipfrac'
        ]
        if MPI is not None:
            sync_from_root(self.variables)

        self.comm_matrix = agent.comm_matrix.copy()
        self.estimates = np.ones([agent.nmates, nsteps], dtype=np.float32)
        self.multipliers = np.zeros([agent.nmates, nsteps], dtype=np.float32)
        for i, comm_i in enumerate(self.comm_matrix):
            self.estimates[i] = comm_i[self.agent.id] * self.estimates[i]

        if load_path is not None:
            load_path = osp.expanduser(load_path)
            ckpt = tf.train.Checkpoint(model=self.train_model)
            manager = tf.train.CheckpointManager(ckpt,
                                                 load_path,
                                                 max_to_keep=None)
            ckpt.restore(manager.latest_checkpoint)

    def reinitial_estimates(self):
        self.estimates = np.random.normal(
            0, 0.1, [self.agent.nmates, self.nsteps]).astype(np.float32)
        self.multipliers = np.random.uniform(
            0, 1, [self.agent.nmates, self.nsteps]).astype(np.float32)
        for i, comm_i in enumerate(self.comm_matrix):
            self.estimates[i] = comm_i[self.agent.id] * self.estimates[i]

    def store_oldpi_var(self):
        pi_var_list = self.train_model.policy_network.trainable_variables + \
                      list(self.train_model.pdtype.trainable_variables)
        self.oldpi_var_list = [var.numpy() for var in pi_var_list]

    def assign_new_eq_old(self):
        pi_var_list = self.train_model.policy_network.trainable_variables + \
                      list(self.train_model.pdtype.trainable_variables)
        for pi_var, old_pi_var in zip(pi_var_list, self.oldpi_var_list):
            pi_var.assign(old_pi_var)

    # @tf.function
    # def get_vf_grad(self, cliprange, obs, returns, actions, values, advs, neglogpac_old):
    #     with tf.GradientTape() as tape:
    #         vpred = self.train_model.value(obs)
    #         vpredclipped = values + tf.clip_by_value(vpred - values, -cliprange, cliprange)
    #         vf_losses1 = tf.square(vpred - returns)
    #         vf_losses2 = tf.square(vpredclipped - returns)
    #         vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))

    #     vf_grads = tape.gradient(vf_loss, self.vf_var_list)
    #     if self.max_grad_norm is not None:
    #         vf_grads, _ = tf.clip_by_global_norm(vf_grads, self.max_grad_norm)
    #     if MPI is not None:
    #         vf_grads = tf.concat([tf.reshape(g, (-1,)) for g in vf_grads], axis=0)

    #     return vf_grads, vf_loss

    @tf.function
    def get_pi_grad(self, cliprange, nb, estimates, multipliers, obs, returns,
                    actions, values, advs, neglogpac_old):
        with tf.GradientTape() as tape:
            policy_latent = self.train_model.policy_network(obs)
            pd, logits = self.train_model.pdtype.pdfromlatent(policy_latent)
            neglogpac = pd.neglogp(actions)
            entropy = tf.reduce_mean(pd.entropy())

            vpred = self.train_model.value(obs)
            vpredclipped = values + tf.clip_by_value(vpred - values,
                                                     -cliprange, cliprange)
            vf_losses1 = tf.square(vpred - returns)
            vf_losses2 = tf.square(vpredclipped - returns)
            vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))

            ratio = tf.exp(neglogpac_old - neglogpac)
            clipped_ratio = tf.clip_by_value(ratio, 1 - cliprange,
                                             1 + cliprange)
            pg_losses1 = -advs * ratio
            pg_losses2 = -advs * clipped_ratio
            pg_loss = tf.reduce_mean(tf.maximum(pg_losses1, pg_losses2))

            comm = self.comm_matrix[
                self.comm_matrix[:, nb] != 0][0, self.agent.id]
            syncerr = comm * ratio - estimates
            sync_loss = tf.reduce_mean(multipliers * syncerr) + \
                        0.5 * self.rho * (tf.reduce_mean(tf.square(syncerr)))

            approxkl = .5 * tf.reduce_mean(
                tf.square(neglogpac - neglogpac_old))
            clipfrac = tf.reduce_mean(
                tf.cast(tf.greater(tf.abs(ratio - 1.0), cliprange),
                        tf.float32))

            loss = pg_loss + sync_loss - entropy * self.ent_coef + vf_loss * self.vf_coef

        var_list = self.train_model.trainable_variables
        grads = tape.gradient(loss, var_list)
        if self.max_grad_norm is not None:
            grads, _ = tf.clip_by_global_norm(grads, self.max_grad_norm)
        if MPI is not None:
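            # Flatten the per-variable gradients into a single vector; pi_update below
            # passes this flat vector directly to MpiAdamOptimizer.apply_gradients together
            # with the learning rate.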
            grads = tf.concat([tf.reshape(g, (-1, )) for g in grads], axis=0)
        return grads, loss, pg_loss, sync_loss, vf_loss, entropy, approxkl, clipfrac

        # pi_grads = tape.gradient(pi_loss, self.pi_var_list)
        # if self.max_grad_norm is not None:
        #     pi_grads, _ = tf.clip_by_global_norm(pi_grads, self.max_grad_norm)
        # if MPI is not None:
        #     pi_grads = tf.concat([tf.reshape(g, (-1,)) for g in pi_grads], axis=0)
        # return pi_grads, pi_loss, pg_loss, sync_loss, entropy, approxkl, clipfrac

    def pi_update(self, lr, cliprange, nb, obs, returns, actions, values, advs,
                  neglogpacs_old):
        estimates = self.estimates[nb]
        multipliers = self.multipliers[nb]
        pi_grads, pi_loss, pg_loss, sync_loss, vf_loss, entropy, approxkl, clipfrac = self.get_pi_grad(
            cliprange, nb, estimates, multipliers, obs, returns, actions,
            values, advs, neglogpacs_old)

        if MPI is not None:
            self.optimizer.apply_gradients(pi_grads, lr)
        else:
            self.optimizer.learning_rate = lr
            grads_and_vars = zip(pi_grads,
                                 self.train_model.trainable_variables)
            self.optimizer.apply_gradients(grads_and_vars)

        return pi_loss, pg_loss, sync_loss, vf_loss, entropy, approxkl, clipfrac

        # if MPI is not None:
        #     self.pi_optimizer.apply_gradients(pi_grads, lr)
        # else:
        #     self.pi_optimizer.learning_rate = lr
        #     grads_and_vars = zip(pi_grads, self.pi_var_list)
        #     self.pi_optimizer.apply_gradients(grads_and_vars)

        # return pi_loss, pg_loss, sync_loss, entropy, approxkl, clipfrac

    # def vf_update(self, lr, cliprange, obs, returns, actions, values, advs, neglogpacs_old):
    #     vf_grads, vf_loss = self.get_vf_grad(
    #         cliprange, obs, returns, actions, values, advs, neglogpacs_old)
    #     if MPI is not None:
    #         self.vf_optimizer.apply_gradients(vf_grads, lr)
    #     else:
    #         self.vf_optimizer.learning_rate = lr
    #         grads_and_vars = zip(vf_grads, self.train_model.trainable_variables)
    #         self.vf_optimizer.apply_gradients(grads_and_vars)

    #     return vf_loss

    def info_to_exchange(self, cliprange, ob, ac, neglogpac_old, nb):
        policy_latent = self.train_model.policy_network(ob)
        pd, logits = self.train_model.pdtype.pdfromlatent(policy_latent)
        neglogpac = pd.neglogp(ac)
        ratio = tf.exp(neglogpac_old - neglogpac)
        clipped_ratio = tf.clip_by_value(tf.exp(-neglogpac), 1 - cliprange,
                                         1 + cliprange)

        return ratio, self.multipliers[nb]

    def exchange(self, cliprange, ob, ac, neglogpac_old, nb_ratio,
                 nb_multipliers, nb):
        policy_latent = self.train_model.policy_network(ob)
        pd, logits = self.train_model.pdtype.pdfromlatent(policy_latent)
        neglogpac = pd.neglogp(ac)
        ratio = tf.exp(neglogpac_old - neglogpac)
        clipped_ratio = tf.clip_by_value(ratio, 1 - cliprange, 1 + cliprange)
        comm = self.comm_matrix[self.comm_matrix[:, nb] != 0][0, self.agent.id]

        v = 0.5 * (self.multipliers[nb] + nb_multipliers) + \
            0.5 * self.rho * (comm * ratio + (-comm) * nb_ratio)
        estimate = np.array((1.0 / self.rho) * (self.multipliers[nb] - v) +
                            comm * ratio)

        self.estimates = tf.tensor_scatter_nd_update(self.estimates, [[nb]],
                                                     estimate[None, :])
        self.multipliers = tf.tensor_scatter_nd_update(self.multipliers,
                                                       [[nb]], v[None, :])
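        # The update above looks like an ADMM-style consensus step: v averages the two
        # agents' multipliers plus a rho-weighted penalty on the signed disagreement between
        # the local and neighbouring ratios, and the local estimate for neighbour nb is then
        # refreshed from the updated multiplier (interpretation, not stated in the original).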
Exemple #19
0
    def __init__(self,
                 *,
                 policy,
                 ob_space,
                 ac_space,
                 nbatch_act,
                 nbatch_train,
                 nsteps,
                 ent_coef,
                 vf_coef,
                 max_grad_norm,
                 microbatch_size=None,
                 l1regpi,
                 l2regpi,
                 l1regvf,
                 l2regvf,
                 wclippi,
                 wclipvf,
                 todropoutpi,
                 dropoutpi_keep_prob,
                 dropoutpi_keep_prob_value,
                 todropoutvf,
                 dropoutvf_keep_prob,
                 dropoutvf_keep_prob_value,
                 isbnpitrainmode,
                 isbnvftrainmode):
        self.sess = sess = get_session()
        #REGULARIZATION
        self.toregularizepi = l1regpi > 0 or l2regpi > 0
        self.toregularizevf = l1regvf > 0 or l2regvf > 0
        self.todropoutpi = todropoutpi
        self.todropoutvf = todropoutvf
        self.dropoutpi_keep_prob = dropoutpi_keep_prob  #TENSOR
        self.dropoutpi_keep_prob_value = dropoutpi_keep_prob_value
        self.dropoutvf_keep_prob = dropoutvf_keep_prob
        self.dropoutvf_keep_prob_value = dropoutvf_keep_prob_value
        self.isbnpitrainmode = isbnpitrainmode
        self.isbnvftrainmode = isbnvftrainmode
        self.toweightclippi = wclippi > 0
        self.toweightclipvf = wclipvf > 0

        with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE):
            # CREATE OUR TWO MODELS
            # act_model that is used for sampling
            act_model = policy(nbatch_act, 1, sess)
            # Train model for training
            if microbatch_size is None:
                train_model = policy(nbatch_train, nsteps, sess)
            else:
                train_model = policy(microbatch_size, nsteps, sess)

        # CREATE THE PLACEHOLDERS
        self.A = A = train_model.pdtype.sample_placeholder([None])
        self.ADV = ADV = tf.placeholder(tf.float32, [None])
        self.R = R = tf.placeholder(tf.float32, [None])
        # Keep track of old actor
        self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
        # Keep track of old critic
        self.OLDVPRED = OLDVPRED = tf.placeholder(tf.float32, [None])
        self.LR = LR = tf.placeholder(tf.float32, [])
        # Cliprange
        self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, [])

        neglogpac = train_model.pd.neglogp(A)

        # Calculate the entropy
        # Entropy is used to improve exploration by limiting premature convergence to a suboptimal policy.
        entropy = tf.reduce_mean(train_model.pd.entropy())

        # CALCULATE THE LOSS
        # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss

        # Clip the value to reduce variability during Critic training
        # Get the predicted value
        vpred = train_model.vf
        vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED,
                                                   -CLIPRANGE, CLIPRANGE)
        # Unclipped value
        vf_losses1 = tf.square(vpred - R)
        # Clipped value
        vf_losses2 = tf.square(vpredclipped - R)

        vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))

        # Calculate ratio (pi current policy / pi old policy)
        ratio = tf.exp(OLDNEGLOGPAC - neglogpac)

        # Defining Loss = - J is equivalent to max J
        pg_losses = -ADV * ratio

        pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE,
                                             1.0 + CLIPRANGE)

        # Final PG loss
        pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
        approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
        clipfrac = tf.reduce_mean(
            tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))

        # Total loss
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef
        if self.toregularizepi:
            print("regularizing policy network: L1 = {}, L2 = {}".format(
                l1regpi, l2regpi))
            regularizerpi = tf.contrib.layers.l1_l2_regularizer(
                scale_l1=l1regpi, scale_l2=l2regpi, scope='ppo2_model/pi')
            all_trainable_weights_pi = tf.trainable_variables('ppo2_model/pi')
            regularization_penalty_pi = tf.contrib.layers.apply_regularization(
                regularizerpi, all_trainable_weights_pi)
            loss = loss + regularization_penalty_pi
        if self.toregularizevf:
            print("regularizing value network: L1 = {}, L2 = {}".format(
                l1regvf, l2regvf))
            regularizervf = tf.contrib.layers.l1_l2_regularizer(
                scale_l1=l1regvf, scale_l2=l2regvf, scope='ppo2_model/vf')
            all_trainable_weights_vf = tf.trainable_variables('ppo2_model/vf')
            regularization_penalty_vf = tf.contrib.layers.apply_regularization(
                regularizervf, all_trainable_weights_vf)
            loss = loss + regularization_penalty_vf

        # UPDATE THE PARAMETERS USING LOSS
        # 1. Get the model parameters
        params = tf.trainable_variables('ppo2_model')
        # 2. Build our trainer
        if MPI is not None:
            self.trainer = MpiAdamOptimizer(MPI.COMM_WORLD,
                                            learning_rate=LR,
                                            epsilon=1e-5)
        else:
            self.trainer = tf.train.AdamOptimizer(learning_rate=LR,
                                                  epsilon=1e-5)
        # 3. Calculate the gradients
        #self._update_op = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        #with tf.control_dependencies(self._update_op):
        grads_and_var = self.trainer.compute_gradients(loss, params)

        grads, var = zip(*grads_and_var)

        if max_grad_norm is not None:
            # Clip the gradients (normalize)
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads_and_var = list(zip(grads, var))
        # zip pairs each gradient with its associated parameter
        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

        self.grads = grads
        self.var = var
        self._train_op = self.trainer.apply_gradients(grads_and_var)

        if self.toweightclippi:
            print("clipping policy network = {}".format(wclippi))
            policyparams = tf.trainable_variables('ppo2_model/pi')
            self._wclip_ops_pi = []
            for toclipvar in policyparams:
                if 'logstd' in toclipvar.name:
                    continue
                self._wclip_ops_pi.append(
                    tf.assign(toclipvar,
                              tf.clip_by_value(toclipvar, -wclippi, wclippi)))
            self._wclip_op_pi = tf.group(*self._wclip_ops_pi)
        if self.toweightclipvf:
            print("clipping value network = {}".format(wclipvf))
            valueparams = tf.trainable_variables('ppo2_model/vf')
            self._wclip_ops_vf = []
            for toclipvar in valueparams:
                self._wclip_ops_vf.append(
                    tf.assign(toclipvar,
                              tf.clip_by_value(toclipvar, -wclipvf, wclipvf)))
            self._wclip_op_vf = tf.group(*self._wclip_ops_vf)
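        # The grouped assign ops project the weights back into [-wclippi, wclippi] and
        # [-wclipvf, wclipvf] respectively; logstd parameters are deliberately skipped.
        # They are presumably run after each optimizer step (assumption; the call site is
        # not shown here).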

        self.loss_names = [
            'policy_loss', 'value_loss', 'policy_entropy', 'approxkl',
            'clipfrac'
        ]
        self.stats_list = [pg_loss, vf_loss, entropy, approxkl, clipfrac]

        if self.toregularizepi:
            self.loss_names.append('regularization_pi')
            self.stats_list.append(regularization_penalty_pi)
        if self.toregularizevf:
            self.loss_names.append('regularization_vf')
            self.stats_list.append(regularization_penalty_vf)

        self.train_model = train_model
        self.act_model = act_model
        self.step = act_model.step
        self.value = act_model.value
        self.initial_state = act_model.initial_state

        self.save = functools.partial(save_variables, sess=sess)
        self.load = functools.partial(load_variables, sess=sess)

        initialize()
        global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                             scope="")
        if MPI is not None:
            sync_from_root(sess, global_variables)  #pylint: disable=E1101
Exemple #20
0
    def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train,
                 nsteps, ent_coef, vf_coef, max_grad_norm):
        sess = get_session()

        with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE):
            act_model = policy(nbatch_act, 1, sess)
            train_model = policy(nbatch_train, nsteps, sess)

        A = train_model.pdtype.sample_placeholder([None])
        ADV = tf.placeholder(tf.float32, [None])
        R = tf.placeholder(tf.float32, [None])
        OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
        OLDVPRED = tf.placeholder(tf.float32, [None])
        LR = tf.placeholder(tf.float32, [])
        CLIPRANGE = tf.placeholder(tf.float32, [])

        neglogpac = train_model.pd.neglogp(A)
        entropy = tf.reduce_mean(train_model.pd.entropy())

        vpred = train_model.vf
        vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED,
                                                   -CLIPRANGE, CLIPRANGE)
        vf_losses1 = tf.square(vpred - R)
        vf_losses2 = tf.square(vpredclipped - R)
        vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))
        ratio = tf.exp(OLDNEGLOGPAC - neglogpac)
        pg_losses = -ADV * ratio
        pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE,
                                             1.0 + CLIPRANGE)
        pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
        approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
        clipfrac = tf.reduce_mean(
            tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef
        params = tf.trainable_variables('ppo2_model')
        trainer = MpiAdamOptimizer(MPI.COMM_WORLD,
                                   learning_rate=LR,
                                   epsilon=1e-5)
        grads_and_var = trainer.compute_gradients(loss, params)
        grads, var = zip(*grads_and_var)

        if max_grad_norm is not None:
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads_and_var = list(zip(grads, var))

        _train = trainer.apply_gradients(grads_and_var)

        def train(lr,
                  cliprange,
                  obs,
                  returns,
                  masks,
                  actions,
                  values,
                  neglogpacs,
                  states=None):
            advs = returns - values
            advs = (advs - advs.mean()) / (advs.std() + 1e-8)
            td_map = {
                train_model.X: obs,
                A: actions,
                ADV: advs,
                R: returns,
                LR: lr,
                CLIPRANGE: cliprange,
                OLDNEGLOGPAC: neglogpacs,
                OLDVPRED: values
            }
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            return sess.run(
                [pg_loss, vf_loss, entropy, approxkl, clipfrac, _train],
                td_map)[:-1]

        self.loss_names = [
            'policy_loss', 'value_loss', 'policy_entropy', 'approxkl',
            'clipfrac'
        ]

        self.train = train
        self.train_model = train_model
        self.act_model = act_model
        self.step = act_model.step
        self.value = act_model.value
        self.initial_state = act_model.initial_state

        # def save(file_name):
        #     save_path = "/media/rustam/88E4BD3EE4BD2EF6/thesis/modeling/python/training/ppo_model_backups/"
        #     ps = sess.run(params)
        #     joblib.dump(ps, save_path+file_name)
        #     print("\n------------\nModel with name '{}' saved successfully!\n------------\n".format(file_name))
        #
        # def load(path_to_file):
        #     load_path = "/media/rustam/88E4BD3EE4BD2EF6/thesis/modeling/python/training/ppo_model_backups/promising_ones/"
        #     file_name = LOAD_FILENAME
        #     if path_to_file is None:
        #         path_to_file = load_path + file_name
        #     loaded_params = joblib.load(path_to_file)
        #     restores = []
        #     for p, loaded_p in zip(params, loaded_params):
        #         restores.append(p.assign(loaded_p))
        #     sess.run(restores)
        #     print("Model with name '{}' was successfully loaded!".format(file_name))

        # was uncommented
        self.save = functools.partial(save_variables, sess=sess)
        self.load = functools.partial(load_variables, sess=sess)

        # uncommented
        # self.save = save # functools.partial(save_variables, sess=sess)
        # self.load = load # functools.partial(load_variables, sess=sess)

        if MPI.COMM_WORLD.Get_rank() == 0:
            initialize()
        global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                             scope="")
        sync_from_root(sess, global_variables)  #pylint: disable=E1101
    def __init__(self, ob_space, ac_space, ent_coef, vf_coef,
                max_grad_norm, mpi_rank_weight=1, comm=None,
                normalize_observations=True, normalize_returns=True,
                use_tensorboard=False, tb_log_dir=None):
        self.sess = sess = get_session()
        self.use_tensorboard = use_tensorboard

        if MPI is not None and comm is None:
            comm = MPI.COMM_WORLD

        # CREATE OUR TWO MODELS
        network_spec = [
            {
                'layer_type': 'dense',
                'units': 256,
                'activation': 'relu',
                'nodes_in': ['observation_self'],
                'nodes_out': ['main']
            },
            {
                'layer_type': 'dense',
                'units': 128,
                'activation': 'relu',
                'nodes_in': ['main'],
                'nodes_out': ['main']
            },
            {
                'layer_type': 'dense',
                'units': 128,
                'activation': 'relu',
                'nodes_in': ['main'],
                'nodes_out': ['main']
            },
            {
                'layer_type': 'dense',
                'units': 128,
                'activation': 'relu',
                'nodes_in': ['main'],
                'nodes_out': ['main']
            }
        ]
        vnetwork_spec = [
            {
                'layer_type': 'dense',
                'units': 256,
                'activation': 'relu',
                'nodes_in': ['observation_self'],
                'nodes_out': ['main']
            },
            {
                'layer_type': 'dense',
                'units': 128,
                'activation': 'relu',
                'nodes_in': ['main'],
                'nodes_out': ['main']
            },
            {
                'layer_type': 'dense',
                'units': 128,
                'activation': 'relu',
                'nodes_in': ['main'],
                'nodes_out': ['main']
            },
            {
                'layer_type': 'dense',
                'units': 128,
                'activation': 'relu',
                'nodes_in': ['main'],
                'nodes_out': ['main']
            }
        ]
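        # Both specs describe the same 4-layer fully connected network (256-128-128-128 ReLU
        # units), reading from the 'observation_self' input node and writing to the 'main'
        # node; one instance is used for the policy, the other for the value function.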

        # act_model that is used for sampling
        act_model = PpoPolicy(scope='ppo', ob_space=ob_space, ac_space=ac_space, network_spec=network_spec, v_network_spec=vnetwork_spec,
                stochastic=True, reuse=False, build_act=True,
                trainable_vars=None, not_trainable_vars=None,
                gaussian_fixed_var=True, weight_decay=0.0, ema_beta=0.99999,
                normalize_observations=normalize_observations, normalize_returns=normalize_returns)

        # Train model for training
        train_model = PpoPolicy(scope='ppo', ob_space=ob_space, ac_space=ac_space, network_spec=network_spec, v_network_spec=vnetwork_spec,
                    stochastic=True, reuse=True, build_act=True,
                    trainable_vars=None, not_trainable_vars=None,
                    gaussian_fixed_var=True, weight_decay=0.0, ema_beta=0.99999,
                    normalize_observations=normalize_observations, normalize_returns=normalize_returns)
        
        # CREATE THE PLACEHOLDERS
        self.A = A = {k: v.sample_placeholder([None]) for k, v in train_model.pdtypes.items()}
        self.ADV = ADV = tf.placeholder(tf.float32, [None])
        self.R = R = tf.placeholder(tf.float32, [None])
        # Keep track of old actor
        self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
        # Keep track of old critic
        self.OLDVPRED = OLDVPRED = tf.placeholder(tf.float32, [None])
        self.LR = LR = tf.placeholder(tf.float32, [])
        # Cliprange
        self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, [])

        neglogpac = sum([train_model.pds[k].neglogp(A[k]) for k in train_model.pdtypes.keys()])

        # Calculate the entropy
        # Entropy is used to improve exploration by limiting premature convergence to a suboptimal policy.
        #entropy = tf.reduce_mean(train_model.entropy)
        entropy = tf.reduce_mean(sum([train_model.pds[k].entropy() for k in train_model.pdtypes.keys()]))

        # CALCULATE THE LOSS
        # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss

        # Clip the value to reduce variability during Critic training
        # Get the predicted value
        vpred = train_model.scaled_value_tensor
        vpredclipped = OLDVPRED + tf.clip_by_value(vpred - OLDVPRED, - CLIPRANGE, CLIPRANGE)
        # Unclipped value
        vf_losses1 = tf.square(vpred - R)
        # Clipped value
        vf_losses2 = tf.square(vpredclipped - R)

        vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))

        # Calculate ratio (pi current policy / pi old policy)
        ratio = tf.exp(OLDNEGLOGPAC - neglogpac)

        # Defining Loss = - J is equivalent to max J
        pg_losses = -ADV * ratio

        pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE)

        # Final PG loss
        pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
        approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
        clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))

        # Total loss
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        # UPDATE THE PARAMETERS USING LOSS
        # 1. Get the model parameters
        params = tf.trainable_variables(scope="ppo")
        # 2. Build our trainer
        if comm is not None and comm.Get_size() > 1:
            self.trainer = MpiAdamOptimizer(comm, learning_rate=LR, mpi_rank_weight=mpi_rank_weight, epsilon=1e-5)
        else:
            self.trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)
        # 3. Calculate the gradients
        grads_and_var = self.trainer.compute_gradients(loss, params)
        grads, var = zip(*grads_and_var)

        if max_grad_norm is not None:
            # Clip the gradients (normalize)
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads_and_var = list(zip(grads, var))
        # zip pairs each gradient with its associated parameter
        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

        self.grads = grads
        self.var = var
        self._train_op = self.trainer.apply_gradients(grads_and_var)
        self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac']
        self.stats_list = [pg_loss, vf_loss, entropy, approxkl, clipfrac]

        self.train_model = train_model
        self.act_model = act_model

        self.step = act_model.act
        self.value = act_model.value
        self.initial_state = act_model.zero_state

        self.save = functools.partial(save_variables, sess=sess)
        self.load = functools.partial(load_variables, sess=sess)

        initialize()
        global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
        if MPI is not None:
            sync_from_root(sess, global_variables, comm=comm) #pylint: disable=E1101

        if self.use_tensorboard:
            self.attach_tensorboard(tb_log_dir)
            self.tb_step = 0
    def __init__(self,
                 *,
                 policy,
                 ob_space,
                 ac_space,
                 nbatch_act,
                 nbatch_train,
                 nsteps,
                 ent_coef,
                 vf_coef,
                 lf_coef,
                 max_grad_norm,
                 init_labda=1.,
                 microbatch_size=None,
                 threshold=1.):
        self.sess = sess = get_session()

        with tf.variable_scope('ppo2_lyapunov_model', reuse=tf.AUTO_REUSE):
            # CREATE OUR TWO MODELS
            # act_model that is used for sampling
            act_model = policy(nbatch_act, 1, sess)

            # Train model for training
            if microbatch_size is None:
                train_model = policy(nbatch_train, nsteps, sess)
            else:
                train_model = policy(microbatch_size, nsteps, sess)

        # CREATE THE PLACEHOLDERS
        self.A = A = train_model.pdtype.sample_placeholder([None])
        self.ADV = ADV = tf.placeholder(tf.float32, [None])
        self.l_ADV = l_ADV = tf.placeholder(tf.float32, [None])
        # Both of these R values are discounted returns
        self.R = R = tf.placeholder(tf.float32, [None])

        self.v_l = v_l = tf.placeholder(tf.float32, [None])
        log_labda = tf.get_variable('ppo2_lyapunov_model/Labda',
                                    None,
                                    tf.float32,
                                    initializer=tf.log(init_labda))
        self.labda = tf.exp(log_labda)
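        # Storing the multiplier as exp(log_labda) keeps labda strictly positive while
        # log_labda itself can be optimized without constraints.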

        self.safety_threshold = tf.placeholder(tf.float32, None, 'threshold')

        self.threshold = threshold
        # self.log_labda = tf.placeholder(tf.float32, None, 'Labda')
        # self.labda = tf.constant(10.)
        # self.Lam=10.

        # Keep track of old actor
        self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
        # Keep track of old critic
        self.OLDVPRED = OLDVPRED = tf.placeholder(tf.float32, [None])
        self.OLDLPRED = OLDLPRED = tf.placeholder(tf.float32, [None])
        self.LR = LR = tf.placeholder(tf.float32, [])
        # Cliprange
        self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, [])

        neglogpac = train_model.pd.neglogp(A)

        # Calculate the entropy
        # Entropy is used to improve exploration by limiting premature convergence to a suboptimal policy.
        entropy = tf.reduce_mean(train_model.pd.entropy())

        # CALCULATE THE LOSS
        # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss

        # Clip the value to reduce variability during Critic training
        # Get the predicted value
        vpred = train_model.vf
        vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED,
                                                   -CLIPRANGE, CLIPRANGE)
        # Unclipped value
        vf_losses1 = tf.square(vpred - R)
        # Clipped value
        vf_losses2 = tf.square(vpredclipped - R)

        vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))

        # Get the predicted value
        lpred = train_model.lf
        lpredclipped = OLDLPRED + tf.clip_by_value(train_model.lf - OLDLPRED,
                                                   -CLIPRANGE, CLIPRANGE)
        # Unclipped value
        lf_losses1 = tf.square(lpred - v_l)
        # Clipped value
        lf_losses2 = tf.square(lpredclipped - v_l)

        lf_loss = .5 * tf.reduce_mean(tf.maximum(lf_losses1, lf_losses2))

        # Calculate ratio (pi current policy / pi old policy)
        ratio = tf.exp(OLDNEGLOGPAC - neglogpac)

        # Defining safety loss

        lpred = train_model.lf
        lpred_ = train_model.lf_
        # self.l_lambda = tf.reduce_mean(ratio *  tf.stop_gradient(lpred_) - tf.stop_gradient(lpred))
        l_lambda1 = tf.reduce_mean(ratio * l_ADV + v_l - self.safety_threshold)
        l_lambda2 = tf.reduce_mean(
            tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE) * l_ADV +
            v_l - self.safety_threshold)

        l_lambda = tf.maximum(l_lambda1, l_lambda2)

        # Defining Loss = - J is equivalent to max J
        pg_losses = -ADV * ratio

        pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE,
                                             1.0 + CLIPRANGE)

        # Final PG loss
        pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))+ l_lambda*tf.stop_gradient(self.labda) - \
                  tf.stop_gradient(l_lambda) * log_labda
        # pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2)+ self.l_lambda * self.labda)
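        # The stop_gradient calls split the update: the policy parameters see the penalty
        # l_lambda * labda with labda held constant, while log_labda receives -l_lambda as
        # its gradient, so the multiplier grows when the safety term l_lambda is positive
        # (constraint violated) and shrinks otherwise.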
        approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
        clipfrac = tf.reduce_mean(
            tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))

        # Total loss
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef + lf_loss * lf_coef

        # UPDATE THE PARAMETERS USING LOSS
        # 1. Get the model parameters
        params = tf.trainable_variables('ppo2_lyapunov_model')
        # 2. Build our trainer
        if MPI is not None:
            self.trainer = MpiAdamOptimizer(MPI.COMM_WORLD,
                                            learning_rate=LR,
                                            epsilon=1e-5)
        else:
            self.trainer = tf.train.AdamOptimizer(learning_rate=LR,
                                                  epsilon=1e-5)
        # 3. Calculate the gradients
        grads_and_var = self.trainer.compute_gradients(loss, params)
        grads, var = zip(*grads_and_var)

        if max_grad_norm is not None:
            # Clip the gradients (normalize)
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads_and_var = list(zip(grads, var))
        # zip pairs each gradient with its associated parameter
        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

        self.grads = grads
        self.var = var
        self._train_op = self.trainer.apply_gradients(grads_and_var)
        self.loss_names = [
            'policy_loss', 'value_loss', 'safety_value_loss', 'policy_entropy',
            'approxkl', 'clipfrac', 'lagrangian'
        ]
        self.stats_list = [
            pg_loss, vf_loss, lf_loss, entropy, approxkl, clipfrac, self.labda
        ]

        self.train_model = train_model
        self.act_model = act_model
        self.step = act_model.step
        self.eval_step = act_model.eval_step
        self.value = act_model.value
        self.l_value = act_model.l_value
        self.l_value_ = act_model.l_value_
        self.initial_state = act_model.initial_state

        self.save = functools.partial(save_variables, sess=sess)
        self.load = functools.partial(load_variables, sess=sess)

        initialize()
        global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                             scope="")
        if MPI is not None:
            sync_from_root(sess, global_variables)  #pylint: disable=E1101
Exemple #23
0
    def __init__(self,
                 *,
                 policy,
                 ob_space,
                 ac_space,
                 nbatch_act,
                 nbatch_train,
                 nsteps,
                 ent_coef,
                 vf_coef,
                 max_grad_norm,
                 proportion_of_exp_used_for_predictor_update,
                 microbatch_size=None):
        self.sess = sess = get_session()

        with tf.variable_scope('rnd_ppo_model', reuse=tf.AUTO_REUSE):
            # CREATE OUR TWO MODELS
            # act_model that is used for sampling
            act_model = policy(nbatch_act, 1, sess)

            # Train model for training
            if microbatch_size is None:
                train_model = policy(nbatch_train, nsteps, sess)
            else:
                train_model = policy(microbatch_size, nsteps, sess)

        # Create our RND model that will generate our intrinsic rewards
        rnd_model = RND(ob_space, proportion_of_exp_used_for_predictor_update)

        # CREATE THE PLACEHOLDERS
        self.A = A = train_model.pdtype.sample_placeholder([None])
        self.ADV = ADV = tf.placeholder(tf.float32, [None])
        self.INT_R = INT_R = tf.placeholder(tf.float32, [None])
        self.EXT_R = EXT_R = tf.placeholder(tf.float32, [None])
        self.R = R = tf.placeholder(tf.float32, [None])
        # Keep track of old actor
        self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])

        self.LR = LR = tf.placeholder(tf.float32, [])
        # Cliprange
        self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, [])

        neglogpac = train_model.pd.neglogp(A)

        # Calculate the entropy
        # Entropy is used to improve exploration by limiting premature convergence to a suboptimal policy.
        entropy = tf.reduce_mean(train_model.pd.entropy())

        # CALCULATE THE LOSS
        # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss

        vf_loss_int = (0.5 * vf_coef) * tf.reduce_mean(
            tf.square(train_model.vf_int - self.INT_R))
        vf_loss_ext = (0.5 * vf_coef) * tf.reduce_mean(
            tf.square(train_model.vf_ext - self.EXT_R))
        vf_loss = vf_loss_int + vf_loss_ext
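        # RND keeps two value heads: vf_int regresses the intrinsic (exploration) returns and
        # vf_ext the extrinsic (task) returns; both use a plain squared error here instead of
        # the clipped value loss used in the other examples.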

        # Calculate ratio (pi current policy / pi old policy)
        ratio = tf.exp(OLDNEGLOGPAC - neglogpac)

        # Defining Loss = - J is equivalent to max J
        pg_losses = -ADV * ratio

        pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE,
                                             1.0 + CLIPRANGE)

        # Final PG loss
        pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
        approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
        clipfrac = tf.reduce_mean(
            tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))

        # Total loss
        loss = pg_loss - entropy * ent_coef + vf_loss + rnd_model.rnd_loss

        # UPDATE THE PARAMETERS USING LOSS
        # 1. Get the model parameters
        params = tf.trainable_variables('rnd_ppo_model')

        # 2. Build our trainer
        if MPI is not None:
            self.trainer = MpiAdamOptimizer(MPI.COMM_WORLD,
                                            learning_rate=LR,
                                            epsilon=1e-5)
        else:
            self.trainer = tf.train.AdamOptimizer(learning_rate=LR,
                                                  epsilon=1e-5)
        # 3. Calculate the gradients
        grads_and_var = self.trainer.compute_gradients(loss, params)
        grads, var = zip(*grads_and_var)

        if max_grad_norm is not None:
            # Clip the gradients (normalize)
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads_and_var = list(zip(grads, var))
        # zip pairs each gradient with its associated parameter
        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

        self.grads = grads
        self.var = var
        self._train_op = self.trainer.apply_gradients(grads_and_var)
        self.loss_names = [
            'policy_loss', 'value_loss', 'rnd_loss', 'policy_entropy',
            'approxkl', 'clipfrac'
        ]
        self.stats_list = [
            pg_loss, vf_loss, rnd_model.rnd_loss, entropy, approxkl, clipfrac
        ]

        self.train_model = train_model
        self.act_model = act_model
        self.rnd_model = rnd_model
        self.step = act_model.step
        self.values = act_model.values
        self.initial_state = act_model.initial_state

        self.save = functools.partial(save_variables, sess=sess)
        self.load = functools.partial(load_variables, sess=sess)

        initialize()
        global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                             scope="")
        if MPI is not None:
            sync_from_root(sess, global_variables)  #pylint: disable=E1101
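The TensorFlow graph above builds the standard PPO clipped surrogate objective. As a sanity check, here is a minimal NumPy sketch of the same computation; the function name and inputs are illustrative only and are not part of the snippet.

import numpy as np

def ppo_clipped_pg_loss(adv, neglogpac, old_neglogpac, cliprange):
    """Return the PPO policy-gradient loss for one batch of samples."""
    ratio = np.exp(old_neglogpac - neglogpac)          # pi_new / pi_old
    unclipped = -adv * ratio
    clipped = -adv * np.clip(ratio, 1.0 - cliprange, 1.0 + cliprange)
    # Take the pessimistic (larger) loss per sample, then average over the batch.
    return np.mean(np.maximum(unclipped, clipped))

# Example: identical old/new log-probs give ratio = 1, so the loss is just -mean(adv).
adv = np.array([1.0, -0.5, 0.25])
nlp = np.array([0.7, 1.2, 0.3])
print(ppo_clipped_pg_loss(adv, nlp, nlp, cliprange=0.2))  # -> -0.25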
Exemple #24
class Model(object):
    """
    We use this object to:
    __init__:
    - Creates the step_model
    - Creates the train_model

    train():
    - Runs the training step (feedforward and backpropagation of gradients)

    save/load():
    - Saves / loads the model
    """
    def __init__(self,
                 *,
                 policy,
                 ob_space,
                 ac_space,
                 nbatch_act,
                 nbatch_train,
                 nsteps,
                 ent_coef,
                 vf_coef,
                 max_grad_norm,
                 microbatch_size=None):
        self.sess = sess = get_session()

        with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE):
            # CREATE OUR TWO MODELS
            # act_model that is used for sampling
            act_model = policy(nbatch_act, 1, sess)

            # Train model for training
            # The batch size given here determines the placeholder shapes built for the policy and value networks
            if microbatch_size is None:
                train_model = policy(nbatch_train, nsteps, sess)
            else:
                train_model = policy(microbatch_size, nsteps, sess)

        # CREATE THE PLACEHOLDERS
        self.A = A = train_model.pdtype.sample_placeholder([None])
        self.ADV = ADV = tf.placeholder(tf.float32, [None])
        self.R = R = tf.placeholder(tf.float32, [None])
        # Keep track of old actor
        self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
        # Keep track of old critic
        self.OLDVPRED = OLDVPRED = tf.placeholder(tf.float32, [None])
        self.LR = LR = tf.placeholder(tf.float32, [])
        # Cliprange
        self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, [])

        neglogpac = train_model.pd.neglogp(A)

        # Calculate the entropy
        # Entropy is used to improve exploration by limiting premature convergence to a suboptimal policy.
        entropy = tf.reduce_mean(train_model.pd.entropy())

        # CALCULATE THE LOSS
        # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss

        # Clip the value to reduce variability during Critic training
        # Get the predicted value
        vpred = train_model.vf
        vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED,
                                                   -CLIPRANGE, CLIPRANGE)
        # Unclipped value
        vf_losses1 = tf.square(vpred - R)
        # Clipped value
        vf_losses2 = tf.square(vpredclipped - R)

        vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))

        # Calculate ratio (pi current policy / pi old policy)
        ratio = tf.exp(OLDNEGLOGPAC - neglogpac)

        # Defining Loss = - J is equivalent to max J
        pg_losses = -ADV * ratio

        pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE,
                                             1.0 + CLIPRANGE)

        # Final PG loss
        pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
        approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
        clipfrac = tf.reduce_mean(
            tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))

        # Total loss
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        # UPDATE THE PARAMETERS USING LOSS
        # 1. Get the model parameters
        params = tf.trainable_variables('ppo2_model')
        # 2. Build our trainer
        if MPI is not None:
            self.trainer = MpiAdamOptimizer(MPI.COMM_WORLD,
                                            learning_rate=LR,
                                            epsilon=1e-5)
        else:
            self.trainer = tf.train.AdamOptimizer(learning_rate=LR,
                                                  epsilon=1e-5)
        # 3. Calculate the gradients
        grads_and_var = self.trainer.compute_gradients(loss, params)
        grads, var = zip(*grads_and_var)

        if max_grad_norm is not None:
            # Clip the gradients (normalize)
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads_and_var = list(zip(grads, var))
        # zip pairs each gradient with its associated parameter
        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

        self.grads = grads
        self.var = var
        self._train_op = self.trainer.apply_gradients(grads_and_var)
        self.loss_names = [
            'policy_loss', 'value_loss', 'policy_entropy', 'approxkl',
            'clipfrac'
        ]
        self.stats_list = [pg_loss, vf_loss, entropy, approxkl, clipfrac]

        self.train_model = train_model
        self.act_model = act_model
        self.step = act_model.step
        self.value = act_model.value
        self.initial_state = act_model.initial_state

        self.save = functools.partial(save_variables, sess=sess)
        self.load = functools.partial(load_variables, sess=sess)

        initialize()
        global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                             scope="")
        if MPI is not None:
            sync_from_root(sess, global_variables)  #pylint: disable=E1101

    def train(self,
              lr,
              cliprange,
              obs,
              returns,
              masks,
              actions,
              values,
              neglogpacs,
              states=None):
        # Here we calculate the advantage A(s,a) = R + gamma * V(s') - V(s)
        # where Returns = R + gamma * V(s')
        advs = returns - values

        # Normalize the advantages
        advs = (advs - advs.mean()) / (advs.std() + 1e-8)

        td_map = {
            self.train_model.X: obs,
            self.A: actions,
            self.ADV: advs,
            self.R: returns,
            self.LR: lr,
            self.CLIPRANGE: cliprange,
            self.OLDNEGLOGPAC: neglogpacs,
            self.OLDVPRED: values
        }
        if states is not None:
            td_map[self.train_model.S] = states
            td_map[self.train_model.M] = masks

        return self.sess.run(self.stats_list + [self._train_op], td_map)[:-1]
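The train() method above forms advantages as returns minus the value baseline and standardizes them before they reach the clipped objective. A small NumPy sketch of that step, with made-up inputs:

import numpy as np

returns = np.array([1.0, 2.0, 0.5, 1.5])
values = np.array([0.8, 1.5, 0.7, 1.2])

advs = returns - values
advs = (advs - advs.mean()) / (advs.std() + 1e-8)
print(advs)  # zero-mean, unit-variance advantages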
Exemple #25
class DDPG(tf.Module):
    def __init__(self, actor, critic, memory, observation_shape, action_shape, param_noise=None, action_noise=None,
        gamma=0.99, tau=0.001, normalize_returns=False, enable_popart=False, normalize_observations=True,
        batch_size=128, observation_range=(-5., 5.), action_range=(-1., 1.), return_range=(-np.inf, np.inf),
        critic_l2_reg=0., actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1.):

        # Parameters.
        self.gamma = gamma
        self.tau = tau
        self.memory = memory
        self.normalize_observations = normalize_observations
        self.normalize_returns = normalize_returns
        self.action_noise = action_noise
        self.param_noise = param_noise
        self.action_range = action_range
        self.return_range = return_range
        self.observation_range = observation_range
        self.observation_shape = observation_shape
        self.critic = critic
        self.actor = actor
        self.clip_norm = clip_norm
        self.enable_popart = enable_popart
        self.reward_scale = reward_scale
        self.batch_size = batch_size
        self.stats_sample = None
        self.critic_l2_reg = critic_l2_reg
        self.actor_lr = tf.constant(actor_lr)
        self.critic_lr = tf.constant(critic_lr)

        # Observation normalization.
        if self.normalize_observations:
            with tf.name_scope('obs_rms'):
                self.obs_rms = RunningMeanStd(shape=observation_shape)
        else:
            self.obs_rms = None

        # Return normalization.
        if self.normalize_returns:
            with tf.name_scope('ret_rms'):
                self.ret_rms = RunningMeanStd()
        else:
            self.ret_rms = None

        # Create target networks.
        self.target_critic = Critic(actor.nb_actions, observation_shape, name='target_critic', network=critic.network, **critic.network_kwargs)
        self.target_actor = Actor(actor.nb_actions, observation_shape, name='target_actor', network=actor.network, **actor.network_kwargs)

        # Set up parts.
        if self.param_noise is not None:
            self.setup_param_noise()

        if MPI is not None:
            comm = MPI.COMM_WORLD
            self.actor_optimizer = MpiAdamOptimizer(comm, self.actor.trainable_variables)
            self.critic_optimizer = MpiAdamOptimizer(comm, self.critic.trainable_variables)
        else:
            self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate=actor_lr)
            self.critic_optimizer = tf.keras.optimizers.Adam(learning_rate=critic_lr)

        logger.info('setting up actor optimizer')
        actor_shapes = [var.get_shape().as_list() for var in self.actor.trainable_variables]
        actor_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
        logger.info('  actor shapes: {}'.format(actor_shapes))
        logger.info('  actor params: {}'.format(actor_nb_params))
        logger.info('setting up critic optimizer')
        critic_shapes = [var.get_shape().as_list() for var in self.critic.trainable_variables]
        critic_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in critic_shapes])
        logger.info('  critic shapes: {}'.format(critic_shapes))
        logger.info('  critic params: {}'.format(critic_nb_params))
        if self.critic_l2_reg > 0.:
            critic_reg_vars = []
            for layer in self.critic.network_builder.layers[1:]:
                critic_reg_vars.append(layer.kernel)
            for var in critic_reg_vars:
                logger.info('  regularizing: {}'.format(var.name))
            logger.info('  applying l2 regularization with {}'.format(self.critic_l2_reg))

        logger.info('setting up critic target updates ...')
        for var, target_var in zip(self.critic.variables, self.target_critic.variables):
            logger.info('  {} <- {}'.format(target_var.name, var.name))
        logger.info('setting up actor target updates ...')
        for var, target_var in zip(self.actor.variables, self.target_actor.variables):
            logger.info('  {} <- {}'.format(target_var.name, var.name))

        if self.param_noise:
            logger.info('setting up param noise')
            for var, perturbed_var in zip(self.actor.variables, self.perturbed_actor.variables):
                if var in actor.perturbable_vars:
                    logger.info('  {} <- {} + noise'.format(perturbed_var.name, var.name))
                else:
                    logger.info('  {} <- {}'.format(perturbed_var.name, var.name))
            for var, perturbed_var in zip(self.actor.variables, self.perturbed_adaptive_actor.variables):
                if var in actor.perturbable_vars:
                    logger.info('  {} <- {} + noise'.format(perturbed_var.name, var.name))
                else:
                    logger.info('  {} <- {}'.format(perturbed_var.name, var.name))

        if self.normalize_returns and self.enable_popart:
            self.setup_popart()

        self.initial_state = None # recurrent architectures not supported yet


    def setup_param_noise(self):
        assert self.param_noise is not None

        # Configure perturbed actor.
        self.perturbed_actor = Actor(self.actor.nb_actions, self.observation_shape, name='param_noise_actor', network=self.actor.network, **self.actor.network_kwargs)

        # Configure a separate copy used for stddev adaptation.
        self.perturbed_adaptive_actor = Actor(self.actor.nb_actions, self.observation_shape, name='adaptive_param_noise_actor', network=self.actor.network, **self.actor.network_kwargs)

    def setup_popart(self):
        # See https://arxiv.org/pdf/1602.07714.pdf for details.
        for vs in [self.critic.output_vars, self.target_critic.output_vars]:
            assert len(vs) == 2
            M, b = vs
            assert 'kernel' in M.name
            assert 'bias' in b.name
            assert M.get_shape()[-1] == 1
            assert b.get_shape()[-1] == 1

    @tf.function
    def step(self, obs, apply_noise=True, compute_Q=True):
        normalized_obs = tf.clip_by_value(normalize(obs, self.obs_rms), self.observation_range[0], self.observation_range[1])
        actor_tf = self.actor(normalized_obs)
        if self.param_noise is not None and apply_noise:
            action = self.perturbed_actor(normalized_obs)
        else:
            action = actor_tf

        if compute_Q:
            normalized_critic_with_actor_tf = self.critic(normalized_obs, actor_tf)
            q = denormalize(tf.clip_by_value(normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms)
        else:
            q = None

        if self.action_noise is not None and apply_noise:
            noise = self.action_noise()
            action += noise
        action = tf.clip_by_value(action, self.action_range[0], self.action_range[1])

        return action, q, None, None

    def store_transition(self, obs0, action, reward, obs1, terminal1):
        reward *= self.reward_scale

        B = obs0.shape[0]
        for b in range(B):
            self.memory.append(obs0[b], action[b], reward[b], obs1[b], terminal1[b])
            if self.normalize_observations:
                self.obs_rms.update(np.array([obs0[b]]))

    def train(self):
        batch = self.memory.sample(batch_size=self.batch_size)
        obs0, obs1 = tf.constant(batch['obs0']), tf.constant(batch['obs1'])
        actions, rewards, terminals1 = tf.constant(batch['actions']), tf.constant(batch['rewards']), tf.constant(batch['terminals1'], dtype=tf.float32)
        normalized_obs0, target_Q = self.compute_normalized_obs0_and_target_Q(obs0, obs1, rewards, terminals1)

        if self.normalize_returns and self.enable_popart:
            old_mean = self.ret_rms.mean
            old_std = self.ret_rms.std
            self.ret_rms.update(target_Q.flatten())
            # renormalize Q outputs
            new_mean = self.ret_rms.mean
            new_std = self.ret_rms.std
            for vs in [self.critic.output_vars, self.target_critic.output_vars]:
                kernel, bias = vs
                kernel.assign(kernel * old_std / new_std)
                bias.assign((bias * old_std + old_mean - new_mean) / new_std)


        actor_grads, actor_loss = self.get_actor_grads(normalized_obs0)
        critic_grads, critic_loss = self.get_critic_grads(normalized_obs0, actions, target_Q)

        if MPI is not None:
            self.actor_optimizer.apply_gradients(actor_grads, self.actor_lr)
            self.critic_optimizer.apply_gradients(critic_grads, self.critic_lr)
        else:
            self.actor_optimizer.apply_gradients(zip(actor_grads, self.actor.trainable_variables))
            self.critic_optimizer.apply_gradients(zip(critic_grads, self.critic.trainable_variables))

        return critic_loss, actor_loss

    @tf.function
    def compute_normalized_obs0_and_target_Q(self, obs0, obs1, rewards, terminals1):
        normalized_obs0 = tf.clip_by_value(normalize(obs0, self.obs_rms), self.observation_range[0], self.observation_range[1])
        normalized_obs1 = tf.clip_by_value(normalize(obs1, self.obs_rms), self.observation_range[0], self.observation_range[1])
        Q_obs1 = denormalize(self.target_critic(normalized_obs1, self.target_actor(normalized_obs1)), self.ret_rms)
        target_Q = rewards + (1. - terminals1) * self.gamma * Q_obs1
        return normalized_obs0, target_Q

    @tf.function
    def get_actor_grads(self, normalized_obs0):
        with tf.GradientTape() as tape:
            actor_tf = self.actor(normalized_obs0)
            normalized_critic_with_actor_tf = self.critic(normalized_obs0, actor_tf)
            critic_with_actor_tf = denormalize(tf.clip_by_value(normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms)
            actor_loss = -tf.reduce_mean(critic_with_actor_tf)
        actor_grads = tape.gradient(actor_loss, self.actor.trainable_variables)
        if self.clip_norm:
            actor_grads = [tf.clip_by_norm(grad, clip_norm=self.clip_norm) for grad in actor_grads]
        if MPI is not None:
            actor_grads = tf.concat([tf.reshape(g, (-1,)) for g in actor_grads], axis=0)
        return actor_grads, actor_loss

    @tf.function
    def get_critic_grads(self, normalized_obs0, actions, target_Q):
        with tf.GradientTape() as tape:
            normalized_critic_tf = self.critic(normalized_obs0, actions)
            normalized_critic_target_tf = tf.clip_by_value(normalize(target_Q, self.ret_rms), self.return_range[0], self.return_range[1])
            critic_loss = tf.reduce_mean(tf.square(normalized_critic_tf - normalized_critic_target_tf))
            if self.critic_l2_reg > 0.:
                # Skip the first (input) layer and regularize only the kernel weights.
                for layer in self.critic.network_builder.layers[1:]:
                    # The original l2_regularizer takes half of the sum of squares.
                    critic_loss += (self.critic_l2_reg / 2.) * tf.reduce_sum(tf.square(layer.kernel))
        critic_grads = tape.gradient(critic_loss, self.critic.trainable_variables)
        if self.clip_norm:
            critic_grads = [tf.clip_by_norm(grad, clip_norm=self.clip_norm) for grad in critic_grads]
        if MPI is not None:
            critic_grads = tf.concat([tf.reshape(g, (-1,)) for g in critic_grads], axis=0)
        return critic_grads, critic_loss


    def initialize(self):
        if MPI is not None:
            sync_from_root(self.actor.trainable_variables + self.critic.trainable_variables)
        self.target_actor.set_weights(self.actor.get_weights())
        self.target_critic.set_weights(self.critic.get_weights())

    @tf.function
    def update_target_net(self):
        for var, target_var in zip(self.actor.variables, self.target_actor.variables):
            target_var.assign((1. - self.tau) * target_var + self.tau * var)
        for var, target_var in zip(self.critic.variables, self.target_critic.variables):
            target_var.assign((1. - self.tau) * target_var + self.tau * var)

    def get_stats(self):

        if self.stats_sample is None:
            # Get a sample and keep that fixed for all further computations.
            # This allows us to estimate the change in value for the same set of inputs.
            self.stats_sample = self.memory.sample(batch_size=self.batch_size)
        obs0 = self.stats_sample['obs0']
        actions = self.stats_sample['actions']
        normalized_obs0 = tf.clip_by_value(normalize(obs0, self.obs_rms), self.observation_range[0], self.observation_range[1])
        normalized_critic_tf = self.critic(normalized_obs0, actions)
        critic_tf = denormalize(tf.clip_by_value(normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms)
        actor_tf = self.actor(normalized_obs0)
        normalized_critic_with_actor_tf = self.critic(normalized_obs0, actor_tf)
        critic_with_actor_tf = denormalize(tf.clip_by_value(normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms)

        stats = {}
        if self.normalize_returns:
            stats['ret_rms_mean'] = self.ret_rms.mean
            stats['ret_rms_std'] = self.ret_rms.std
        if self.normalize_observations:
            stats['obs_rms_mean'] = tf.reduce_mean(self.obs_rms.mean)
            stats['obs_rms_std'] = tf.reduce_mean(self.obs_rms.std)
        stats['reference_Q_mean'] = tf.reduce_mean(critic_tf)
        stats['reference_Q_std'] = reduce_std(critic_tf)
        stats['reference_actor_Q_mean'] = tf.reduce_mean(critic_with_actor_tf)
        stats['reference_actor_Q_std'] = reduce_std(critic_with_actor_tf)
        stats['reference_action_mean'] = tf.reduce_mean(actor_tf)
        stats['reference_action_std'] = reduce_std(actor_tf)

        if self.param_noise:
            perturbed_actor_tf = self.perturbed_actor(normalized_obs0)
            stats['reference_perturbed_action_mean'] = tf.reduce_mean(perturbed_actor_tf)
            stats['reference_perturbed_action_std'] = reduce_std(perturbed_actor_tf)
            stats.update(self.param_noise.get_stats())
        return stats


    
    def adapt_param_noise(self, obs0):
        try:
            from mpi4py import MPI
        except ImportError:
            MPI = None

        if self.param_noise is None:
            return 0.

        mean_distance = self.get_mean_distance(obs0).numpy()

        if MPI is not None:
            mean_distance = MPI.COMM_WORLD.allreduce(mean_distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size()

        self.param_noise.adapt(mean_distance)
        return mean_distance

    @tf.function
    def get_mean_distance(self, obs0):
        # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation.
        update_perturbed_actor(self.actor, self.perturbed_adaptive_actor, self.param_noise.current_stddev)

        normalized_obs0 = tf.clip_by_value(normalize(obs0, self.obs_rms), self.observation_range[0], self.observation_range[1])
        actor_tf = self.actor(normalized_obs0)
        adaptive_actor_tf = self.perturbed_adaptive_actor(normalized_obs0)
        mean_distance = tf.sqrt(tf.reduce_mean(tf.square(actor_tf - adaptive_actor_tf)))
        return mean_distance

    def reset(self):
        # Reset internal state after an episode is complete.
        if self.action_noise is not None:
            self.action_noise.reset()
        if self.param_noise is not None:
            update_perturbed_actor(self.actor, self.perturbed_actor, self.param_noise.current_stddev)
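update_target_net() above performs the usual DDPG soft ("Polyak") target update. A minimal NumPy sketch of the same rule, with illustrative weights and tau:

import numpy as np

def soft_update(target_weights, online_weights, tau=0.001):
    """Move each target weight a small step tau toward its online counterpart."""
    return [(1.0 - tau) * t + tau * w for t, w in zip(target_weights, online_weights)]

target = [np.zeros(3)]
online = [np.ones(3)]
target = soft_update(target, online, tau=0.1)
print(target[0])  # [0.1 0.1 0.1]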
Exemple #26
    def __init__(self,
                 *,
                 policy,
                 ob_space,
                 ac_space,
                 nbatch_act,
                 nbatch_train,
                 nsteps,
                 ent_coef,
                 vf_coef,
                 max_grad_norm,
                 mpi_rank_weight=1,
                 comm=None,
                 microbatch_size=None,
                 mix_mode='nomix',
                 mix_alpha=0.2,
                 mix_beta=0.2,
                 fix_representation=False,
                 use_l2reg=False,
                 l2reg_coeff=1e-4):
        self.sess = sess = get_session()

        if MPI is not None and comm is None:
            comm = MPI.COMM_WORLD

        with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE):
            # CREATE OUR TWO MODELS
            # act_model that is used for sampling
            act_model = policy(nbatch_act, 1, sess)

            # Train model for training
            if microbatch_size is None:
                train_model = policy(nbatch_train,
                                     nsteps,
                                     sess,
                                     mix_mode=mix_mode)
            else:
                train_model = policy(microbatch_size,
                                     nsteps,
                                     sess,
                                     mix_mode=mix_mode)

        # CREATE THE PLACEHOLDERS
        self.A = A = train_model.pdtype.sample_placeholder([None])
        self.ADV = ADV = tf.placeholder(tf.float32, [None])
        self.R = R = tf.placeholder(tf.float32, [None])
        # Keep track of old actor
        self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
        # Keep track of old critic
        self.OLDVPRED = OLDVPRED = tf.placeholder(tf.float32, [None])
        self.LR = LR = tf.placeholder(tf.float32, [])
        # Cliprange
        self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, [])

        # Interpolating the supervision
        if mix_mode == 'mixreg':
            # get coeff and indices
            coeff = train_model.coeff
            indices = train_model.indices
            other_indices = train_model.other_indices
            # mixup
            OLDNEGLOGPAC = coeff * tf.gather(OLDNEGLOGPAC, indices, axis=0) \
                    + (1 - coeff) * tf.gather(
                            OLDNEGLOGPAC, other_indices, axis=0)
            OLDVPRED = coeff * tf.gather(OLDVPRED, indices, axis=0) \
                    + (1 - coeff) * tf.gather(OLDVPRED, other_indices, axis=0)
            R = coeff * tf.gather(R, indices, axis=0) \
                    + (1 - coeff) * tf.gather(R, other_indices, axis=0)
            ADV = coeff * tf.gather(ADV, indices, axis=0) \
                    + (1 - coeff) * tf.gather(ADV, other_indices, axis=0)
            A = tf.gather(A, indices, axis=0)
        elif mix_mode == 'mixobs':
            # get indices
            indices = train_model.indices
            # gather
            OLDNEGLOGPAC = tf.gather(OLDNEGLOGPAC, train_model.indices, axis=0)
            OLDVPRED = tf.gather(OLDVPRED, train_model.indices, axis=0)
            R = tf.gather(R, train_model.indices, axis=0)
            ADV = tf.gather(ADV, train_model.indices, axis=0)
            A = tf.gather(A, train_model.indices, axis=0)
        elif mix_mode == 'nomix':
            pass
        else:
            raise ValueError(f"Unknown mixing mode: {mix_mode} !")

        # Store the nodes to be recorded
        self.loss_names = []
        self.stats_list = []

        ############ CALCULATE LOSS ############
        # Total loss = Policy gradient loss - entropy * entropy coefficient
        #   + Value coefficient * value loss

        # Normalizing advantage
        ADV = (ADV - tf.reduce_mean(ADV)) / (reduce_std(ADV) + 1e-8)

        # Calculate the entropy
        entropy = tf.reduce_mean(train_model.pd.entropy())

        # Calculate value loss
        vpred = train_model.vf
        vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED,
                                                   -CLIPRANGE, CLIPRANGE)
        vf_losses1 = tf.square(vpred - R)
        vf_losses2 = tf.square(vpredclipped - R)
        vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))

        # Calculate policy gradient loss
        neglogpac = train_model.pd.neglogp(A)
        ratio = tf.exp(OLDNEGLOGPAC - neglogpac)
        pg_losses = -ADV * ratio
        pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE,
                                             1.0 + CLIPRANGE)
        pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))

        # Total loss
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        # Record some information
        approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
        clipfrac = tf.reduce_mean(
            tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))
        self.loss_names.extend([
            'total_loss',
            'policy_loss',
            'value_loss',
            'policy_entropy',
            'approxkl',
            'clipfrac',
        ])
        self.stats_list.extend([
            loss,
            pg_loss,
            vf_loss,
            entropy,
            approxkl,
            clipfrac,
        ])
        ############################################

        ############ UPDATE THE PARAMETERS ############
        # 1. Get the model parameters
        params = tf.trainable_variables('ppo2_model')
        if use_l2reg:
            weight_params = [v for v in params if '/b' not in v.name]
            l2_loss = tf.reduce_sum([tf.nn.l2_loss(v) for v in weight_params])
            self.loss_names.append('l2_loss')
            self.stats_list.append(l2_loss)
            loss = loss + l2_loss * l2reg_coeff
        if fix_representation:
            params = params[-4:]
        # 2. Build our trainer
        if comm is not None and comm.Get_size() > 1:
            self.trainer = MpiAdamOptimizer(comm,
                                            learning_rate=LR,
                                            mpi_rank_weight=mpi_rank_weight,
                                            epsilon=1e-5)
        else:
            self.trainer = tf.train.AdamOptimizer(learning_rate=LR,
                                                  epsilon=1e-5)
        # 3. Calculate the gradients
        grads_and_var = self.trainer.compute_gradients(loss, params)
        grads, var = zip(*grads_and_var)
        # 4. Clip the gradient if required
        if max_grad_norm is not None:
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads_and_var = list(zip(grads, var))
        ###############################################

        self.grads = grads
        self.var = var
        self._train_op = self.trainer.apply_gradients(grads_and_var)
        self._init_op = tf.variables_initializer(params)
        self._sync_param = lambda: sync_from_root(sess, params, comm=comm)

        self.mix_mode = mix_mode
        self.mix_alpha = mix_alpha
        # JAG: Add beta parameter
        self.mix_beta = mix_beta
        self.fix_representation = fix_representation
        self.train_model = train_model
        self.act_model = act_model
        self.step = act_model.step
        self.value = act_model.value
        self.adv_gradient = act_model.adv_gradient
        self.initial_state = act_model.initial_state

        self.save = functools.partial(save_variables, sess=sess)
        self.load = functools.partial(load_variables, sess=sess)

        initialize()
        global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                             scope="")
        # Exclude the random convolution layer from syncing
        global_variables = [
            v for v in global_variables if 'randcnn' not in v.name
        ]
        if MPI is not None:
            sync_from_root(sess, global_variables, comm=comm)
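The 'mixreg' branch above interpolates the supervision targets (OLDNEGLOGPAC, OLDVPRED, R, ADV) with a Beta-distributed coefficient and an index pairing provided by train_model. The sketch below reproduces that interpolation in NumPy; sampling the coefficient and the pairing here is an assumption made purely for illustration.

import numpy as np

rng = np.random.default_rng(0)

def mixreg_targets(targets, alpha=0.2):
    """Interpolate each target with a randomly paired sample from the same batch."""
    n = len(targets)
    coeff = rng.beta(alpha, alpha)          # mixing coefficient in (0, 1)
    other = rng.permutation(n)              # randomly paired indices
    return coeff * targets + (1 - coeff) * targets[other]

returns = np.array([1.0, 0.0, 2.0, -1.0])
print(mixreg_targets(returns))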
Exemple #27
    def __init__(self,
                 *,
                 policy,
                 ob_space,
                 ac_space,
                 nbatch_act,
                 nbatch_train,
                 nsteps,
                 ent_coef,
                 vf_coef,
                 max_grad_norm,
                 microbatch_size=None,
                 unsupType='action'):
        self.sess = sess = get_session()

        # icm parameters
        self.unsup = unsupType is not None
        predictor = None
        self.numaction = ac_space.n
        designHead = 'universe'

        with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE):
            # CREATE OUR TWO MODELS
            # act_model that is used for sampling
            act_model = policy(nbatch_act, 1, sess)

            # Train model for training
            if microbatch_size is None:
                train_model = policy(nbatch_train, nsteps, sess)
            else:
                train_model = policy(microbatch_size, nsteps, sess)

        if self.unsup:
            with tf.variable_scope("predictor", reuse=tf.AUTO_REUSE):
                if 'state' in unsupType:
                    self.local_ap_network = predictor = StatePredictor(
                        ob_space, ac_space, designHead, unsupType)
                else:
                    self.local_ap_network = predictor = StateActionPredictor(
                        ob_space, ac_space, designHead)

        # CREATE THE PLACEHOLDERS
        self.A = A = train_model.pdtype.sample_placeholder([None])
        self.ADV = ADV = tf.placeholder(tf.float32, [None])
        self.R = R = tf.placeholder(tf.float32, [None])
        # Keep track of old actor
        self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
        # Keep track of old critic
        self.OLDVPRED = OLDVPRED = tf.placeholder(tf.float32, [None])
        self.LR = LR = tf.placeholder(tf.float32, [])
        # Cliprange
        self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, [])

        neglogpac = train_model.pd.neglogp(A)

        # Calculate the entropy
        # Entropy is used to improve exploration by limiting premature convergence to a suboptimal policy.
        entropy = tf.reduce_mean(train_model.pd.entropy())

        # CALCULATE THE LOSS
        # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss

        # Clip the value to reduce variability during Critic training
        # Get the predicted value
        vpred = train_model.vf
        vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED,
                                                   -CLIPRANGE, CLIPRANGE)
        # Unclipped value
        vf_losses1 = tf.square(vpred - R)
        # Clipped value
        vf_losses2 = tf.square(vpredclipped - R)

        vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))

        # Calculate ratio (pi current policy / pi old policy)
        ratio = tf.exp(OLDNEGLOGPAC - neglogpac)

        # Defining Loss = - J is equivalent to max J
        pg_losses = -ADV * ratio

        pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE,
                                             1.0 + CLIPRANGE)

        # Final PG loss
        pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
        approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
        clipfrac = tf.reduce_mean(
            tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))

        # Total loss
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        # computing predictor loss
        predloss = None
        if self.unsup:
            if 'state' in unsupType:
                predloss = constants[
                    'PREDICTION_LR_SCALE'] * predictor.forwardloss
            else:
                predloss = constants['PREDICTION_LR_SCALE'] * (
                    predictor.invloss * (1 - constants['FORWARD_LOSS_WT']) +
                    predictor.forwardloss * constants['FORWARD_LOSS_WT'])

        # UPDATE THE PARAMETERS USING LOSS
        # 1. Get the model parameters
        params = tf.trainable_variables('ppo2_model')
        # 2. Build our trainer
        if MPI is not None:
            self.trainer = MpiAdamOptimizer(MPI.COMM_WORLD,
                                            learning_rate=LR,
                                            epsilon=1e-5)
        else:
            self.trainer = tf.train.AdamOptimizer(learning_rate=LR,
                                                  epsilon=1e-5)
        # 3. Calculate the gradients
        grads_and_var = self.trainer.compute_gradients(loss, params)

        if self.unsup:
            predgrads_and_var = self.trainer.compute_gradients(
                predloss * 20.0, predictor.var_list)

        grads, var = zip(*grads_and_var)
        if max_grad_norm is not None:
            # Clip the gradients (normalize)
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads_and_var = list(zip(grads, var))
        # zip pairs each gradient with its associated parameter
        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

        # clip predictor gradients
        if self.unsup:
            predgrads, _ = zip(*predgrads_and_var)
            predgrads, _ = tf.clip_by_global_norm(predgrads,
                                                  constants['GRAD_NORM_CLIP'])
            predgrads_and_var = list(zip(predgrads, predictor.var_list))

            # combine the policy and predictor grads and vars
            grads_and_var = grads_and_var + predgrads_and_var
            # unzip the grads and var after adding predictor grads/vars
            grads, var = zip(*grads_and_var)

            # normalize gradients for logging
            predgrad_global_norm = tf.global_norm(predgrads)

        # normalize gradients for logging
        grad_global_norm = tf.global_norm(grads)

        self.grads = grads
        self.var = var
        self._train_op = self.trainer.apply_gradients(grads_and_var)
        self.loss_names = [
            'policy_loss', 'value_loss', 'policy_entropy', 'approxkl',
            'clipfrac', 'grad_global_norm'
        ]
        self.stats_list = [
            pg_loss, vf_loss, entropy, approxkl, clipfrac, grad_global_norm
        ]

        if self.unsup:
            self.loss_names += [
                'predloss', 'pred_forwardloss', 'pred_invloss',
                'predgrad_global_norm'
            ]
            self.stats_list += [
                predloss, predictor.forwardloss, predictor.invloss,
                predgrad_global_norm
            ]

        self.train_model = train_model
        self.act_model = act_model
        self.step = act_model.step
        self.value = act_model.value
        self.initial_state = act_model.initial_state
        # prediction bonus function for icm
        self.pred_bonus = predictor.pred_bonus
        self.pred_bonuses = predictor.pred_bonuses

        self.save = functools.partial(save_variables, sess=sess)
        self.load = functools.partial(load_variables, sess=sess)

        initialize()
        global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                             scope="")
        if MPI is not None:
            sync_from_root(sess, global_variables)  #pylint: disable=E1101
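Several of these snippets clip gradients with tf.clip_by_global_norm before applying them. A hedged NumPy sketch of what global-norm clipping does (all gradients are rescaled by one common factor so their joint L2 norm stays at or below the threshold); names and values are illustrative.

import numpy as np

def clip_by_global_norm(grads, max_grad_norm=0.5):
    """Rescale all gradients jointly so their global L2 norm does not exceed max_grad_norm."""
    global_norm = np.sqrt(sum(np.sum(np.square(g)) for g in grads))
    scale = min(1.0, max_grad_norm / (global_norm + 1e-8))
    return [g * scale for g in grads], global_norm

grads = [np.array([3.0, 4.0])]                       # global norm = 5
clipped, norm = clip_by_global_norm(grads, max_grad_norm=0.5)
print(norm, clipped[0])                              # 5.0 [0.3 0.4]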
Exemple #28
    def __init__(self,
                 *,
                 network,
                 env,
                 lr=3e-4,
                 cliprange=0.2,
                 nsteps=128,
                 nminibatches=4,
                 noptepochs=4,
                 ent_coef=0.0,
                 vf_coef=0.5,
                 max_grad_norm=0.5,
                 gamma=0.99,
                 lam=0.95,
                 mpi_rank_weight=1,
                 comm=None,
                 microbatch_size=None,
                 load_path=None,
                 **network_kwargs):
        """
        Parameters:
        ----------

        network:                          policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
                                          specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
                                          tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
                                          neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
                                          See common/models.py/lstm for more details on using recurrent nets in policies.

        env: baselines.common.vec_env.VecEnv     environment. Needs to be vectorized for parallel environment simulation.
                                          The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class.


        lr: float or function             learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the
                                          training and 0 is the end of the training.

        cliprange: float or function      clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training
                                          and 0 is the end of the training

        nsteps: int                       number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
                                          nenv is number of environment copies simulated in parallel)


        nminibatches: int                 number of training minibatches per update. For recurrent policies,
                                          should be smaller than or equal to the number of environments run in parallel.

        noptepochs: int                   number of training epochs per update

        ent_coef: float                   policy entropy coefficient in the optimization objective

        vf_coef: float                    value function loss coefficient in the optimization objective

        gamma: float                      discounting factor

        lam: float                        advantage estimation discounting factor (lambda in the paper)

        log_interval: int                 number of timesteps between logging events

        load_path: str                    path to load the model from

        **network_kwargs:                 keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and the arguments to a particular type of network.
                                          For instance, 'mlp' network architecture has arguments num_hidden and num_layers.

        """

        self.sess = sess = get_session()

        if MPI is not None and comm is None:
            comm = MPI.COMM_WORLD

        policy = build_policy(env, network, **network_kwargs)

        self.env = env

        if isinstance(lr, float):
            self.lr = constfn(lr)
        else:
            assert callable(lr)
        if isinstance(cliprange, float):
            self.cliprange = constfn(cliprange)
        else:
            assert callable(cliprange)
        self.nminibatches = nminibatches

        # if eval_env is not None:
        #     eval_runner = Runner(env=eval_env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)

        # Calculate the batch_size
        self.nenvs = self.env.num_envs
        self.nsteps = nsteps
        self.nbatch = self.nenvs * self.nsteps
        self.nbatch_train = self.nbatch // nminibatches
        self.noptepochs = noptepochs

        with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE):
            # CREATE OUR TWO MODELS
            # act_model that is used for sampling
            act_model = policy(self.nenvs, 1, sess)

            # Train model for training
            if microbatch_size is None:
                train_model = policy(self.nbatch_train, nsteps, sess)
            else:
                train_model = policy(microbatch_size, nsteps, sess)

        # CREATE THE PLACEHOLDERS
        self.A = A = train_model.pdtype.sample_placeholder(
            [None])  # action placeholder
        self.ADV = ADV = tf.placeholder(tf.float32, [None])
        self.R = R = tf.placeholder(tf.float32, [None])
        # Keep track of old actor
        self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
        # Keep track of old critic
        self.OLDVPRED = OLDVPRED = tf.placeholder(tf.float32, [None])
        self.LR = LR = tf.placeholder(tf.float32, [])
        # Cliprange
        self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, [])

        neglogpac = train_model.pd.neglogp(A)

        # Calculate the entropy
        # Entropy is used to improve exploration by limiting premature convergence to a suboptimal policy.
        entropy = tf.reduce_mean(train_model.pd.entropy())

        # CALCULATE THE LOSS
        # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss

        # Clip the value to reduce variability during Critic training
        # Get the predicted value
        vpred = train_model.vf
        vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED,
                                                   -CLIPRANGE, CLIPRANGE)
        # Unclipped value
        vf_losses1 = tf.square(vpred - R)
        # Clipped value
        vf_losses2 = tf.square(vpredclipped - R)

        vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))

        # Calculate ratio (pi current policy / pi old policy)
        ratio = tf.exp(OLDNEGLOGPAC - neglogpac)

        # Defining Loss = - J is equivalent to max J
        pg_losses = -ADV * ratio

        pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE,
                                             1.0 + CLIPRANGE)

        # Final PG loss
        pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
        approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
        clipfrac = tf.reduce_mean(
            tf.to_float(tf.greater(tf.abs(ratio - 1.0),
                                   CLIPRANGE)))  # fraction of clipped ratios

        # Total loss
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        # UPDATE THE PARAMETERS USING LOSS

        # 1. Get the model parameters
        params = tf.trainable_variables('ppo2_model')
        # 2. Build our trainer
        if comm is not None and comm.Get_size() > 1:
            self.trainer = MpiAdamOptimizer(comm,
                                            learning_rate=LR,
                                            mpi_rank_weight=mpi_rank_weight,
                                            epsilon=1e-5)
        else:
            self.trainer = tf.train.AdamOptimizer(learning_rate=LR,
                                                  epsilon=1e-5)
        # 3. Calculate the gradients
        grads_and_var = self.trainer.compute_gradients(loss, params)
        grads, var = zip(*grads_and_var)

        if max_grad_norm is not None:
            # Clip the gradients (normalize)
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads_and_var = list(zip(grads, var))
        # zip pairs each gradient with its associated parameter
        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

        self.grads = grads
        self.var = var
        self._train_op = self.trainer.apply_gradients(grads_and_var)
        self.loss_names = [
            'policy_loss', 'value_loss', 'policy_entropy', 'approxkl',
            'clipfrac'
        ]
        self.stats_list = [pg_loss, vf_loss, entropy, approxkl, clipfrac]

        self.train_model = train_model
        self.act_model = act_model
        self.step = act_model.step
        self.value = act_model.value
        self.initial_state = act_model.initial_state
        self.def_path_pre = os.path.dirname(
            os.path.abspath(__file__)) + '/tmp/'

        initialize()
        global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                             scope="")
        if MPI is not None:
            sync_from_root(sess, global_variables, comm=comm)  # pylint: disable=E1101

        if load_path is not None:
            self.load_newest(load_path)

        # Instantiate the runner object
        self.runner = Runner(env=self.env,
                             model=self,
                             nsteps=nsteps,
                             gamma=gamma,
                             lam=lam)
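The batch bookkeeping above (nbatch, nbatch_train) is plain integer arithmetic; a quick illustration with assumed values:

# With 8 parallel envs, 128 steps per rollout and 4 minibatches,
# each update consumes 1024 transitions in minibatches of 256.
nenvs, nsteps, nminibatches = 8, 128, 4
nbatch = nenvs * nsteps                # 1024 transitions per rollout
nbatch_train = nbatch // nminibatches  # 256 transitions per minibatch
print(nbatch, nbatch_train)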
Exemple #29
class ICM(object):
    def __init__(self, ob_space, ac_space, max_grad_norm, beta, icm_lr_scale,
                 idf):

        sess = get_session()

        #TODO find a better way
        input_shape = [ob_space.shape[0], ob_space.shape[1], ob_space.shape[2]]

        # input_shape = ob_space
        print("ICM state Input shape ", np.shape(input_shape), "  ",
              input_shape)
        self.action_shape = 36
        self.idf = idf

        # Placeholders

        self.state_ = phi_state = tf.placeholder(tf.float32,
                                                 [None, *input_shape],
                                                 name="icm_state")
        self.next_state_ = phi_next_state = tf.placeholder(
            tf.float32, [None, *input_shape], name="icm_next_state")
        self.action_ = action = tf.placeholder(tf.float32, [None],
                                               name="icm_action")
        # self.R = rewards = tf.placeholder(tf.float32, shape=[None], name="maxR")

        with tf.variable_scope('icm_model'):
            # Feature encoding
            # Aka pass state and next_state to create phi(state), phi(next_state)
            # state --> phi(state)
            print("Feature Encodding of phi state with shape :: ", self.state_)
            phi_state = self.feature_encoding(self.state_)

            with tf.variable_scope(tf.get_variable_scope(),
                                   reuse=tf.AUTO_REUSE):
                # next_state to phi(next_state)
                phi_next_state = self.feature_encoding(self.next_state_)

            # INVERSE MODEL
            if self.idf:
                pred_actions_logits, pred_actions_prob = self.inverse_model(
                    phi_state, phi_next_state)

            # FORWARD MODEL
            pred_phi_next_state = self.forward_model(action, phi_state)

        # CALCULATE THE ICM LOSS
        # Inverse Loss LI
        # We calculate the cross entropy between our ât and at
        # Squeeze the labels (required)
        labels = tf.cast(action, tf.int32)

        print("prediction pred_actions_logits")
        if self.idf:
            self.inv_loss = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=pred_actions_logits, labels=labels),
                name="inverse_loss")

        # Forward loss
        # LF = 1/2 * || pred_phi_next_state - phi_next_state ||^2
        # TODO 0.5 * ?
        self.forw_loss_axis = tf.reduce_mean(tf.square(
            tf.subtract(pred_phi_next_state, phi_next_state)),
                                             axis=-1,
                                             name="forward_loss_axis")

        self.forw_loss = tf.reduce_mean(tf.square(
            tf.subtract(pred_phi_next_state, phi_next_state)),
                                        name="forward_loss")

        # Todo predictor lr scale ?
        # ICM_LOSS = [(1 - beta) * LI + beta * LF ] * Predictor_Lr_scale
        if self.idf:
            self.icm_loss = ((1 - beta) * self.inv_loss + beta * self.forw_loss
                             )  #* icm_lr_scale
        else:
            self.icm_loss = self.forw_loss

        ####
        # self.icm_var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name)
        # print("ICM var list ::: " , self.icm_var_list)
        ####

        #
        # if max_grad_norm is not None :
        # t_icm_grads , _ = tf.clip_by_global_norm(self.icm_loss, constants['GRAD_NORM_CLIP'] )
        # t_icm_grads_and_vars = list(zip(self.icm_loss , self.icm_var_list))
        # print("\n\n\nit works \n\n\n")
        #

        # UPDATE THE PARAMETERS USING LOSS
        # 1. Get the model parameters
        self.icm_params = tf.trainable_variables(
            'icm_model')  ## var_list same as

        ## testing phase
        self.predgrads = tf.gradients(self.icm_loss, self.icm_params)
        self.predgrads, _ = tf.clip_by_global_norm(self.predgrads,
                                                   max_grad_norm)
        self.pred_grads_and_vars = list(zip(self.predgrads, self.icm_params))

        ## testing phase

        # print("\n\nTrainable variables \n ",icm_params)
        # # 2. Build our trainer
        self.icm_trainer = MpiAdamOptimizer(MPI.COMM_WORLD,
                                            learning_rate=1e-3,
                                            epsilon=1e-5)
        # # 3. Calculate the gradients
        icm_grads_and_var = self.icm_trainer.compute_gradients(
            self.icm_loss, self.icm_params)
        # # t_grads_and_var = tf.gradients()
        icm_grads, icm_var = zip(*icm_grads_and_var)

        if max_grad_norm is not None:
            #     # Clip the gradients (normalize)
            icm_grads, icm__grad_norm = tf.clip_by_global_norm(
                icm_grads, max_grad_norm)
        icm_grads_and_var = list(zip(icm_grads, icm_var))
        # zip pairs each gradient with its associated parameter
        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

        self._icm_train = self.icm_trainer.apply_gradients(icm_grads_and_var)

        if MPI.COMM_WORLD.Get_rank() == 0:
            print("Initialize")
            initialize()
        global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                             scope="")
        # print("GLOBAL VARIABLES", global_variables)
        sync_from_root(sess, global_variables)  #pylint: disable=E1101

    # We use batch normalization for feature normalization, as explained in the paper,
    # using the "universe" head.
    def feature_encoding(self, x):
        print("feature function called !!")
        x = tf.nn.elu(
            tf.layers.batch_normalization(conv2d(x, 8, 5, 4, "valid")))
        print(x)
        x = tf.nn.elu(
            tf.layers.batch_normalization(conv2d(x, 16, 3, 2, "valid")))
        print(x)
        x = tf.nn.elu(
            tf.layers.batch_normalization(conv2d(x, 32, 3, 2, "valid")))
        print(x)
        x = tf.nn.elu(
            tf.layers.batch_normalization(conv2d(x, 64, 3, 2, "valid")))
        print(x)
        x = tf.layers.flatten(x)
        x = tf.nn.elu(tf.contrib.layers.fully_connected(x, 256))

        return x

    # Inverse Model
    # Given phi(state) and phi(next_state) returns the predicted action ât
    """
    Parameters
    __________
    
    action:   The real action taken by our agent
    phi_state: The feature representation of our state generated by our feature_encoding function.
    phi_next_state: The feature representation of our next_state generated by our feature_encoding function.
    
    returns pred_actions_logits: the logits and pred_actions_prob: the probability distribution of our actions
    """

    def inverse_model(self, phi_state, phi_next_state):
        # Concatenate phi(st) and phi(st+1)
        icm_inv_concatenate = tf.concat([phi_state, phi_next_state], 1)
        icm_inv_fc1 = tf.nn.relu(tf.layers.dense(icm_inv_concatenate, 256))
        pred_actions_logits = tf.layers.dense(icm_inv_fc1, self.action_shape)
        pred_actions_prob = tf.nn.softmax(pred_actions_logits, dim=-1)

        return pred_actions_logits, pred_actions_prob

    # Forward Model
    # Given action and phi(st), predicts pred_phi(st+1)
    """
    Parameters
    __________

    action:   The action taken by our agent
    phi_state: The feature representation of our state generated by our feature_encoding function.

    returns pred_phi_next_state: The predicted feature representation of our next_state.
    """

    def forward_model(self, action, phi_state):
        # Concatenate phi_state and action
        action = tf.expand_dims(
            action, axis=1)  # Expand dimension to be able to concatenate

        icm_forw_concatenate = tf.concat(axis=1, values=[phi_state, action])

        # FC
        icm_forw_fc1 = tf.layers.dense(icm_forw_concatenate, 256)

        # FC (size of phi_state [1] aka the width) # size of 288
        icm_forw_pred_next_state = tf.layers.dense(
            icm_forw_fc1,
            phi_state.get_shape()[1].value)

        return icm_forw_pred_next_state

    # Calculate intrinsic reward
    """
    Parameters
    __________

    state: The current observation batch.
    next_state: The next observation batch.
    action: The action taken by our agent.

    returns intrinsic_reward: The per-sample forward-model prediction error, used as the intrinsic reward.
    """

    def calculate_intrinsic_reward(self, state, next_state, action):
        # state/next_state have shape (nenvs, 84, 84, 4) and action has shape (nenvs,).
        # forw_loss_axis reduces over the feature axis only, so the result is one
        # forward-model prediction error per environment.
        sess = tf.get_default_session()
        error = sess.run(self.forw_loss_axis, {
            self.state_: state,
            self.next_state_: next_state,
            self.action_: action
        })

        # Return intrinsic reward
        return error

    def train_curiosity_model(self, states, next_states, actions):
        # Run one optimization step on the ICM losses and return their current values
        sess = tf.get_default_session()
        feed = {
            self.state_: states,
            self.next_state_: next_states,
            self.action_: actions
        }
        if self.idf:
            return sess.run((self.forw_loss, self.inv_loss, self.icm_loss,
                             self._icm_train),
                            feed_dict=feed)
        else:
            return sess.run((self.forw_loss, self.icm_loss, self._icm_train),
                            feed_dict=feed)
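

# --- Usage sketch (not from the original source) ---
# A minimal illustration of how a runner loop might combine the two methods
# above, assuming `icm` is an instance of this class and the other arguments
# are numpy arrays batched over environments. `curiosity_step` and
# `intrinsic_coef` are names made up for this sketch.
def curiosity_step(icm, states, next_states, actions, extrinsic_rewards,
                   intrinsic_coef=0.01):
    # Per-environment intrinsic reward = forward-model prediction error
    intrinsic = icm.calculate_intrinsic_reward(states, next_states, actions)
    # Mix intrinsic and extrinsic rewards before computing returns/advantages
    total_rewards = extrinsic_rewards + intrinsic_coef * intrinsic
    # One gradient step on the ICM (forward + optional inverse) losses
    icm_losses = icm.train_curiosity_model(states, next_states, actions)
    return total_rewards, icm_losses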
Exemple #30
0
    def __init__(self,
                 *,
                 policy,
                 ob_space,
                 ac_space,
                 nbatch_act,
                 nbatch_train,
                 nsteps,
                 ent_coef,
                 vf_coef,
                 max_grad_norm,
                 mpi_rank_weight=1,
                 comm=None,
                 microbatch_size=None,
                 model_index=0):
        self.sess = sess = get_session()
        self.model_index = model_index

        if MPI is not None and comm is None:
            comm = MPI.COMM_WORLD

        with tf.variable_scope('ppo2_model%s' % model_index,
                               reuse=tf.AUTO_REUSE):
            # CREATE OUR TWO MODELS
            # act_model that is used for sampling
            act_model = policy(nbatch_act, 1, sess)

            # Train model for training
            if microbatch_size is None:
                train_model = policy(nbatch_train, nsteps, sess)
            else:
                train_model = policy(microbatch_size, nsteps, sess)

        # CREATE THE PLACEHOLDERS
        self.A = A = train_model.pdtype.sample_placeholder([None])
        self.ADV = ADV = tf.placeholder(tf.float32, [None])
        self.R = R = tf.placeholder(tf.float32, [None])
        # Keep track of the old actor (negative log-probabilities of the actions taken)
        self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
        # Keep track of the old critic (value predictions)
        self.OLDVPRED = OLDVPRED = tf.placeholder(tf.float32, [None])
        self.LR = LR = tf.placeholder(tf.float32, [])
        # Cliprange
        self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, [])

        neglogpac = train_model.pd.neglogp(A)

        # Calculate the entropy
        # Entropy is used to improve exploration by discouraging premature convergence to a suboptimal policy.
        entropy = tf.reduce_mean(train_model.pd.entropy())

        # CALCULATE THE LOSS
        # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss

        # Clip the value to reduce variability during Critic training
        # Get the predicted value
        vpred = train_model.vf
        vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED,
                                                   -CLIPRANGE, CLIPRANGE)
        # Unclipped value
        vf_losses1 = tf.square(vpred - R)
        # Clipped value
        vf_losses2 = tf.square(vpredclipped - R)

        vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))

        # Calculate the ratio (pi current policy / pi old policy):
        # exp(log pi_new - log pi_old) = exp(OLDNEGLOGPAC - neglogpac)
        ratio = tf.exp(OLDNEGLOGPAC - neglogpac)

        # Minimizing Loss = -J is equivalent to maximizing J
        pg_losses = -ADV * ratio

        pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE,
                                             1.0 + CLIPRANGE)

        # Final PG loss
        pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
        approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
        clipfrac = tf.reduce_mean(
            tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))
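        # approxkl is a cheap quadratic approximation of the KL divergence between
        # the old and new policies; clipfrac is the fraction of samples whose ratio
        # was clipped. Both are logged as training diagnostics.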

        # Total loss
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        # UPDATE THE PARAMETERS USING LOSS
        # 1. Get the model parameters
        params = tf.trainable_variables('ppo2_model%s' % model_index)
        # print("para",model_index,params)
        # 2. Build our trainer
        if comm is not None and comm.Get_size() > 1:
            self.trainer = MpiAdamOptimizer(comm,
                                            learning_rate=LR,
                                            mpi_rank_weight=mpi_rank_weight,
                                            epsilon=1e-5)
        else:
            self.trainer = tf.train.AdamOptimizer(learning_rate=LR,
                                                  epsilon=1e-5)
        # 3. Calculate the gradients
        grads_and_var = self.trainer.compute_gradients(loss, params)
        grads, var = zip(*grads_and_var)

        if max_grad_norm is not None:
            # Clip the gradients (normalize)
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads_and_var = list(zip(grads, var))
        # zip pairs each gradient with its associated parameter
        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

        self.grads = grads
        self.var = var
        self._train_op = self.trainer.apply_gradients(grads_and_var)
        self.loss_names = [
            'policy_loss', 'value_loss', 'policy_entropy', 'approxkl',
            'clipfrac'
        ]
        self.stats_list = [pg_loss, vf_loss, entropy, approxkl, clipfrac]

        self.train_model = train_model
        self.act_model = act_model
        self.step = act_model.step
        self.value = act_model.value
        self.initial_state = act_model.initial_state

        self.save = functools.partial(save_trainable_variables,
                                      scope="ppo2_model%s" % model_index,
                                      sess=sess)
        self.load = functools.partial(load_trainable_variables,
                                      scope="ppo2_model%s" % model_index,
                                      sess=sess)

        initialize()
        global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                             scope="")
        # print("global_variables",model_index,global_variables)
        if MPI is not None:
            sync_from_root(sess, global_variables, comm=comm)  #pylint: disable=E1101
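

# --- Reference sketch (not from the original source) ---
# The clipped PPO losses built in the graph above, restated with plain numpy so
# the math is easy to check. The function and its argument names are assumptions
# for illustration; they mirror the placeholders but are not part of the snippet.
import numpy as np

def ppo_losses_np(adv, returns, neglogpac, old_neglogpac, vpred, old_vpred, cliprange):
    # Probability ratio pi_new / pi_old recovered from negative log-probabilities
    ratio = np.exp(old_neglogpac - neglogpac)
    # Clipped surrogate policy loss (element-wise max of unclipped and clipped terms)
    pg_loss = np.mean(np.maximum(
        -adv * ratio,
        -adv * np.clip(ratio, 1.0 - cliprange, 1.0 + cliprange)))
    # Clipped value loss
    vpred_clipped = old_vpred + np.clip(vpred - old_vpred, -cliprange, cliprange)
    vf_loss = 0.5 * np.mean(np.maximum((vpred - returns) ** 2,
                                       (vpred_clipped - returns) ** 2))
    # The total loss in the graph above is: pg_loss - entropy * ent_coef + vf_loss * vf_coef
    return pg_loss, vf_loss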
Exemple #31
0
    def __init__(self, ob_space, ac_space, max_grad_norm, beta, icm_lr_scale,
                 idf):

        sess = get_session()

        #TODO find a better way
        input_shape = [ob_space.shape[0], ob_space.shape[1], ob_space.shape[2]]

        # input_shape = ob_space
        print("ICM state Input shape ", np.shape(input_shape), "  ",
              input_shape)
        self.action_shape = 36
        self.idf = idf

        # Placeholders

        self.state_ = phi_state = tf.placeholder(tf.float32,
                                                 [None, *input_shape],
                                                 name="icm_state")
        self.next_state_ = phi_next_state = tf.placeholder(
            tf.float32, [None, *input_shape], name="icm_next_state")
        self.action_ = action = tf.placeholder(tf.float32, [None],
                                               name="icm_action")
        # self.R = rewards = tf.placeholder(tf.float32, shape=[None], name="maxR")

        with tf.variable_scope('icm_model'):
            # Feature encoding
            # Aka pass state and next_state to create phi(state), phi(next_state)
            # state --> phi(state)
            print("Feature Encodding of phi state with shape :: ", self.state_)
            phi_state = self.feature_encoding(self.state_)

            with tf.variable_scope(tf.get_variable_scope(),
                                   reuse=tf.AUTO_REUSE):
                # next_state to phi(next_state)
                phi_next_state = self.feature_encoding(self.next_state_)

            # INVERSE MODEL
            if self.idf:
                pred_actions_logits, pred_actions_prob = self.inverse_model(
                    phi_state, phi_next_state)

            # FORWARD MODEL
            pred_phi_next_state = self.forward_model(action, phi_state)

        # CALCULATE THE ICM LOSS
        # Inverse Loss LI
        # We calculate the cross entropy between our predicted action ât and the true action at
        # Cast the actions to int so they can be used as sparse labels
        labels = tf.cast(action, tf.int32)

        if self.idf:
            self.inv_loss = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=pred_actions_logits, labels=labels),
                name="inverse_loss")

        # Forward Loss
        # LF = 1/2 * || pred_phi_next_state - phi_next_state ||^2
        # (the code below uses the mean squared error and omits the 1/2 factor)
        self.forw_loss_axis = tf.reduce_mean(tf.square(
            tf.subtract(pred_phi_next_state, phi_next_state)),
                                             axis=-1,
                                             name="forward_loss_axis")

        self.forw_loss = tf.reduce_mean(tf.square(
            tf.subtract(pred_phi_next_state, phi_next_state)),
                                        name="forward_loss")

        # TODO: predictor lr scale?
        # ICM_LOSS = [(1 - beta) * LI + beta * LF] * icm_lr_scale
        if self.idf:
            self.icm_loss = (1 - beta) * self.inv_loss + beta * self.forw_loss  # * icm_lr_scale
        else:
            self.icm_loss = self.forw_loss

        # UPDATE THE PARAMETERS USING LOSS
        # 1. Get the model parameters
        self.icm_params = tf.trainable_variables('icm_model')

        # Gradients computed directly with tf.gradients (kept for inspection/testing;
        # the training op below is built from the optimizer's compute_gradients)
        self.predgrads = tf.gradients(self.icm_loss, self.icm_params)
        self.predgrads, _ = tf.clip_by_global_norm(self.predgrads,
                                                   max_grad_norm)
        self.pred_grads_and_vars = list(zip(self.predgrads, self.icm_params))

        # 2. Build our trainer
        self.icm_trainer = MpiAdamOptimizer(MPI.COMM_WORLD,
                                            learning_rate=1e-3,
                                            epsilon=1e-5)
        # 3. Calculate the gradients
        icm_grads_and_var = self.icm_trainer.compute_gradients(
            self.icm_loss, self.icm_params)
        icm_grads, icm_var = zip(*icm_grads_and_var)

        if max_grad_norm is not None:
            # Clip the gradients (normalize)
            icm_grads, icm_grad_norm = tf.clip_by_global_norm(
                icm_grads, max_grad_norm)
        icm_grads_and_var = list(zip(icm_grads, icm_var))
        # zip pairs each gradient with its associated parameter
        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

        self._icm_train = self.icm_trainer.apply_gradients(icm_grads_and_var)

        if MPI.COMM_WORLD.Get_rank() == 0:
            print("Initialize")
            initialize()
        global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                             scope="")
        # print("GLOBAL VARIABLES", global_variables)
        sync_from_root(sess, global_variables)  #pylint: disable=E1101
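

# --- Reference sketch (not from the original source) ---
# The ICM objective built above, restated with plain numpy for clarity. The
# function name and the optional eta scaling of the intrinsic reward are
# assumptions for illustration; only beta and the two loss terms come from the
# snippet itself.
import numpy as np

def icm_objective_np(phi_next, pred_phi_next, inv_cross_entropy, beta, eta=1.0):
    # Forward loss: mean squared error between predicted and true next features
    forward_loss = np.mean((pred_phi_next - phi_next) ** 2)
    # Per-sample intrinsic reward: the same error reduced over the feature axis only
    intrinsic_reward = eta * np.mean((pred_phi_next - phi_next) ** 2, axis=-1)
    # Combined ICM loss: (1 - beta) * inverse loss + beta * forward loss
    icm_loss = (1.0 - beta) * inv_cross_entropy + beta * forward_loss
    return icm_loss, intrinsic_reward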