Example no. 1
0
 def __init__(self, checkpoint_path):
   player_base.PlayerBase.__init__(self)
   self._action_set = 'default'
   self._player_prefix = 'player_0'
   config = tf.ConfigProto()
   config.gpu_options.allow_growth = True
   self._sess = tf.Session(config=config)
   stacking = 4
   self._stacker = ObservationStacker(stacking)
   with tf.variable_scope(self._player_prefix):
       with tf.variable_scope('ppo2_model'):
           env = DummyEnv(self._action_set, stacking)
           ob_space = env.observation_space
           X = observation_placeholder(ob_space, batch_size=1)
           extra_tensors = {}
           encoded_x = X
           encoded_x = encode_observation(ob_space, encoded_x)
           with tf.variable_scope('pi', reuse=tf.AUTO_REUSE):
               policy_latent = gfootball_impala_cnn_network_fn(encoded_x)
           self._policy = PolicyWithValue(
               env=env,
               observations=X,
               latent=policy_latent,
               vf_latent=policy_latent,
               sess=self._sess,
               estimate_q=False,
               **extra_tensors
           )
   _load_variables(checkpoint_path, self._sess, prefix=self._player_prefix + '/')
   saver = tf.train.Saver()
   saver.save(self._sess, "/home/alex/Dropbox/projects/python/kaggle/football/saved_models/11_vs_11_easy_stochastic_v2/11_vs_11_easy_stochastic_v2")
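
The player above stacks the last four SMM frames before feeding them to the policy. The `ObservationStacker` class itself is not shown here; the sketch below is a guess at what such a stacker could look like (the class name, the deque buffer, and the channel-axis concatenation are assumptions, not the original implementation).

import collections
import numpy as np

class SimpleObservationStacker:
    """Hypothetical frame stacker: keeps the last `stacking` observations
    and concatenates them along the channel axis."""

    def __init__(self, stacking):
        self._stacking = stacking
        self._frames = collections.deque(maxlen=stacking)

    def get(self, observation):
        # observation: array of shape [1, H, W, C]
        if not self._frames:
            # On the first call, fill the buffer with copies of the frame.
            self._frames.extend([observation] * self._stacking)
        else:
            self._frames.append(observation)
        return np.concatenate(list(self._frames), axis=-1)

    def reset(self):
        self._frames.clear()
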
Example no. 2
0
 def __init__(self,
              *,
              ac_space,
              policy_network,
              value_network=None,
              ent_coef,
              vf_coef,
              max_grad_norm):
     super(Model, self).__init__(name='PPO2Model')
     self.train_model = PolicyWithValue(ac_space,
                                        policy_network,
                                        value_network,
                                        estimate_q=False)
     if MPI is not None:
         self.optimizer = MpiAdamOptimizer(
             MPI.COMM_WORLD, self.train_model.trainable_variables)
     else:
         self.optimizer = tf.keras.optimizers.Adam()
     self.ent_coef = ent_coef
     self.vf_coef = vf_coef
     self.max_grad_norm = max_grad_norm
     self.step = self.train_model.step
     self.mode = self.train_model.mode
     self.value = self.train_model.value
     self.initial_state = self.train_model.initial_state
     self.loss_names = [
         'policy_loss', 'value_loss', 'policy_entropy', 'approxkl',
         'clipfrac'
     ]
     if MPI is not None:
         sync_from_root(self.variables)
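
In the non-MPI branch the Keras Adam optimizer is created without a fixed learning rate; the rate is instead set right before each update, as the `train()` method of a fuller example later in this list does. A minimal, self-contained sketch of that pattern (the toy variable and loss below are illustrative only):

import tensorflow as tf

w = tf.Variable([1.0, 2.0])
optimizer = tf.keras.optimizers.Adam()

with tf.GradientTape() as tape:
    loss = tf.reduce_sum(tf.square(w))   # dummy loss
grads = tape.gradient(loss, [w])

optimizer.learning_rate = 3e-4            # per-update learning rate override
optimizer.apply_gradients(zip(grads, [w]))
print(w.numpy())
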
Example no. 3
0
    def __init__(self,
                 *,
                 ac_space,
                 policy_network,
                 nupdates,
                 ent_coef=0.01,
                 vf_coef=0.5,
                 max_grad_norm=0.5,
                 lr=7e-4,
                 alpha=0.99,
                 epsilon=1e-5,
                 total_timesteps=int(80e6)):

        super(Model, self).__init__(name='A2CModel')
        self.train_model = PolicyWithValue(ac_space,
                                           policy_network,
                                           value_network=None,
                                           estimate_q=False)
        lr_schedule = InverseLinearTimeDecay(initial_learning_rate=lr,
                                             nupdates=nupdates)
        self.optimizer = tf.keras.optimizers.RMSprop(learning_rate=lr_schedule,
                                                     rho=alpha,
                                                     epsilon=epsilon)

        self.ent_coef = ent_coef
        self.vf_coef = vf_coef
        self.max_grad_norm = max_grad_norm
        self.step = self.train_model.step
        self.value = self.train_model.value
        self.initial_state = self.train_model.initial_state
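
The `InverseLinearTimeDecay` schedule referenced above is not shown. As an illustration only, a schedule that decays the learning rate over `nupdates` steps can be written as a Keras `LearningRateSchedule` subclass; the class below is a hypothetical stand-in and the actual decay formula may differ.

import tensorflow as tf

class LinearDecaySchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Hypothetical stand-in: decays the learning rate linearly from
    `initial_learning_rate` towards 0 over `nupdates` optimizer steps."""

    def __init__(self, initial_learning_rate, nupdates):
        self.initial_learning_rate = initial_learning_rate
        self.nupdates = nupdates

    def __call__(self, step):
        frac = 1.0 - tf.cast(step, tf.float32) / float(self.nupdates)
        return self.initial_learning_rate * tf.maximum(frac, 0.0)

optimizer = tf.keras.optimizers.RMSprop(
    learning_rate=LinearDecaySchedule(7e-4, nupdates=1000),
    rho=0.99, epsilon=1e-5)
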
Example no. 4
0
 def __init__(self, *, ac_space, policy_network, value_network=None, ent_coef, vf_coef, max_grad_norm, lr):
     super(Model, self).__init__(name='PPO2Model')
     self.train_model = PolicyWithValue(ac_space, policy_network, value_network, estimate_q=False)
     self.optimizer = tf.keras.optimizers.Adam(learning_rate=lr, epsilon=1e-5)
     self.ent_coef = ent_coef
     self.vf_coef = vf_coef
     self.max_grad_norm = max_grad_norm
     self.step = self.train_model.step
     self.value = self.train_model.value
     self.initial_state = self.train_model.initial_state
     self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac']
Example no. 5
0
class Model(tf.keras.Model):

    """
    We use this class to :
        __init__:
        - Creates the step_model
        - Creates the train_model

        train():
        - Make the training part (feedforward and retropropagation of gradients)

        save/load():
        - Save load the model
    """
    def __init__(self, *, ac_space, policy_network, nupdates,
            ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4,
            alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6)):

        super(Model, self).__init__(name='A2CModel')
        self.train_model = PolicyWithValue(ac_space, policy_network, value_network=None, estimate_q=False)
        lr_schedule = InverseLinearTimeDecay(initial_learning_rate=lr, nupdates=nupdates)
        self.optimizer = tf.keras.optimizers.RMSprop(learning_rate=lr_schedule, rho=alpha, epsilon=epsilon)

        self.ent_coef = ent_coef
        self.vf_coef = vf_coef
        self.max_grad_norm = max_grad_norm
        self.step = self.train_model.step
        self.value = self.train_model.value
        self.initial_state = self.train_model.initial_state

    @tf.function
    def train(self, obs, states, rewards, masks, actions, values):
        advs = rewards - values
        with tf.GradientTape() as tape:
            policy_latent = self.train_model.policy_network(obs)
            pd, _ = self.train_model.pdtype.pdfromlatent(policy_latent)
            neglogpac = pd.neglogp(actions)
            entropy = tf.reduce_mean(pd.entropy())
            vpred = self.train_model.value(obs)
            vf_loss = tf.reduce_mean(tf.square(vpred - rewards))
            pg_loss = tf.reduce_mean(advs * neglogpac)
            loss = pg_loss - entropy * self.ent_coef + vf_loss * self.vf_coef

        var_list = tape.watched_variables()
        grads = tape.gradient(loss, var_list)
        grads, _ = tf.clip_by_global_norm(grads, self.max_grad_norm)
        grads_and_vars = list(zip(grads, var_list))
        self.optimizer.apply_gradients(grads_and_vars)

        return pg_loss, vf_loss, entropy
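
For reference, the A2C loss computed inside `train()` above, reproduced with small hand-made numbers instead of network outputs (pure numpy; the values are illustrative only, and the critic re-evaluation is taken equal to the rollout values for simplicity):

import numpy as np

rewards = np.array([1.0, 0.5, 2.0], dtype=np.float32)    # discounted returns R
values = np.array([0.8, 0.7, 1.5], dtype=np.float32)     # V(s) from the rollout
neglogpac = np.array([0.9, 1.2, 0.4], dtype=np.float32)  # -log pi(a|s) of taken actions
entropy = 1.1                                             # mean policy entropy
vpred = values                                            # critic re-evaluation (same as values here)

advs = rewards - values                                   # advantage estimates
pg_loss = np.mean(advs * neglogpac)                       # policy-gradient loss
vf_loss = np.mean(np.square(vpred - rewards))             # value-function loss
ent_coef, vf_coef = 0.01, 0.5
loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef
print(round(float(loss), 4))
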
Example no. 6
0
    def __init__(self,
                 policy,
                 ob_space,
                 ac_space,
                 nenvs,
                 total_timesteps,
                 nprocs=32,
                 nsteps=20,
                 ent_coef=0.01,
                 vf_coef=0.5,
                 vf_fisher_coef=1.0,
                 lr=0.25,
                 max_grad_norm=0.5,
                 kfac_clip=0.001,
                 lrschedule='linear',
                 is_async=True):
        super(Model, self).__init__(name='ACKTRModel')

        nbatch = nenvs * nsteps

        # TODO: does PolicyWithValue handle this correctly? The original implementation uses 'nbatch'
        #self.model = step_model = policy(nenvs, 1)
        #self.model2 = train_model = policy(nbatch, nsteps)
        train_model = PolicyWithValue(ac_space,
                                      policy,
                                      value_network=None,
                                      estimate_q=False)

        self.ent_coef = ent_coef
        self.vf_coef = vf_coef
        self.vf_fisher_coef = vf_fisher_coef
        self.kfac_clip = kfac_clip

        self.is_async = is_async
        self.max_grad_norm = max_grad_norm
        self.total_timesteps = total_timesteps

        # TODO: Learning rate schedule and definition of optimizer
        #self.lrschedule = lrschedule
        lrschedule = LinearTimeDecay(initial_learning_rate=lr)  # TODO
        self.optim = kfac.KfacOptimizer(learning_rate=lrschedule, clip_kl=self.kfac_clip, \
                                        momentum=0.9, kfac_update=1, epsilon=0.01, \
                                        stats_decay=0.99, is_async=self.is_async, cold_iter=10,
                                        max_grad_norm=self.max_grad_norm)

        self.train_model = train_model
        #self.step_model = step_model
        self.step = self.train_model.step
        self.value = self.train_model.value
        self.initial_state = self.train_model.initial_state
Example no. 7
0
class Player(player_base.PlayerBase):
  """An agent loaded from PPO2 cnn model checkpoint."""

  def __init__(self, checkpoint_path):
    player_base.PlayerBase.__init__(self)
    self._action_set = 'default'
    self._player_prefix = 'player_0'
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    self._sess = tf.Session(config=config)
    stacking = 4
    self._stacker = ObservationStacker(stacking)
    with tf.variable_scope(self._player_prefix):
        with tf.variable_scope('ppo2_model'):
            env = DummyEnv(self._action_set, stacking)
            ob_space = env.observation_space
            X = observation_placeholder(ob_space, batch_size=1)
            extra_tensors = {}
            encoded_x = X
            encoded_x = encode_observation(ob_space, encoded_x)
            with tf.variable_scope('pi', reuse=tf.AUTO_REUSE):
                policy_latent = gfootball_impala_cnn_network_fn(encoded_x)
            self._policy = PolicyWithValue(
                env=env,
                observations=X,
                latent=policy_latent,
                vf_latent=policy_latent,
                sess=self._sess,
                estimate_q=False,
                **extra_tensors
            )
    _load_variables(checkpoint_path, self._sess, prefix=self._player_prefix + '/')
    saver = tf.train.Saver()
    saver.save(self._sess, "/home/alex/Dropbox/projects/python/kaggle/football/saved_models/11_vs_11_easy_stochastic_v2/11_vs_11_easy_stochastic_v2")

  def __del__(self):
    self._sess.close()

  def take_action(self, observation):
    assert len(observation) == 1, 'Multiple players control is not supported'

    observation = observation_preprocessing.generate_smm(observation)
    observation = self._stacker.get(observation)
    action = self._policy.step(observation)[0][0]
    actions = [action] #[football_action_set.action_set_dict[self._action_set][action]]
    return actions

  def reset(self):
    self._stacker.reset()
Example no. 8
0
class Model(tf.Module):
    """
    We use this object to :
    __init__:
    - Creates the step_model
    - Creates the train_model

    train():
    - Make the training part (feedforward and retropropagation of gradients)

    save/load():
    - Save load the model
    """
    def __init__(self,
                 *,
                 ac_space,
                 policy_network,
                 value_network=None,
                 ent_coef,
                 vf_coef,
                 max_grad_norm):
        super(Model, self).__init__(name='PPO2Model')
        self.train_model = PolicyWithValue(ac_space,
                                           policy_network,
                                           value_network,
                                           estimate_q=False)
        if MPI is not None:
            self.optimizer = MpiAdamOptimizer(
                MPI.COMM_WORLD, self.train_model.trainable_variables)
        else:
            self.optimizer = tf.keras.optimizers.Adam()
        self.ent_coef = ent_coef
        self.vf_coef = vf_coef
        self.max_grad_norm = max_grad_norm
        self.step = self.train_model.step
        self.mode = self.train_model.mode
        self.value = self.train_model.value
        self.initial_state = self.train_model.initial_state
        self.loss_names = [
            'policy_loss', 'value_loss', 'policy_entropy', 'approxkl',
            'clipfrac'
        ]
        if MPI is not None:
            sync_from_root(self.variables)

    def train(self,
              lr,
              cliprange,
              obs,
              returns,
              masks,
              actions,
              values,
              neglogpac_old,
              states=None):
        grads, pg_loss, vf_loss, entropy, approxkl, clipfrac = self.get_grad(
            cliprange, obs, returns, masks, actions, values, neglogpac_old)
        if MPI is not None:
            self.optimizer.apply_gradients(grads, lr)
        else:
            self.optimizer.learning_rate = lr
            grads_and_vars = zip(grads, self.train_model.trainable_variables)
            self.optimizer.apply_gradients(grads_and_vars)

        return pg_loss, vf_loss, entropy, approxkl, clipfrac

    @tf.function
    def get_grad(self, cliprange, obs, returns, masks, actions, values,
                 neglogpac_old):
        # Here we calculate advantage A(s,a) = R + yV(s') - V(s)
        # Returns = R + yV(s')
        advs = returns - values

        # Normalize the advantages
        advs = (advs - tf.reduce_mean(advs)) / (tf.keras.backend.std(advs) +
                                                1e-8)

        with tf.GradientTape() as tape:
            policy_latent = self.train_model.policy_network(obs)
            pd, _ = self.train_model.pdtype.pdfromlatent(policy_latent)
            neglogpac = pd.neglogp(actions)
            entropy = tf.reduce_mean(pd.entropy())
            vpred = self.train_model.value(obs)
            vpredclipped = values + tf.clip_by_value(vpred - values,
                                                     -cliprange, cliprange)
            vf_losses1 = tf.square(vpred - returns)
            vf_losses2 = tf.square(vpredclipped - returns)
            vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))

            ratio = tf.exp(neglogpac_old - neglogpac)
            pg_losses1 = -advs * ratio
            pg_losses2 = -advs * tf.clip_by_value(ratio, 1 - cliprange,
                                                  1 + cliprange)
            pg_loss = tf.reduce_mean(tf.maximum(pg_losses1, pg_losses2))

            approxkl = .5 * tf.reduce_mean(
                tf.square(neglogpac - neglogpac_old))
            clipfrac = tf.reduce_mean(
                tf.cast(tf.greater(tf.abs(ratio - 1.0), cliprange),
                        tf.float32))

            loss = pg_loss - entropy * self.ent_coef + vf_loss * self.vf_coef

        var_list = self.train_model.trainable_variables
        grads = tape.gradient(loss, var_list)
        if self.max_grad_norm is not None:
            grads, _ = tf.clip_by_global_norm(grads, self.max_grad_norm)
        if MPI is not None:
            grads = tf.concat([tf.reshape(g, (-1, )) for g in grads], axis=0)
        return grads, pg_loss, vf_loss, entropy, approxkl, clipfrac
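
The quantities returned by `get_grad()` are the standard PPO clipped-surrogate terms. A small numpy walk-through with hand-made numbers (the values are illustrative only) shows how ratio, policy loss, approxkl and clipfrac relate:

import numpy as np

cliprange = 0.2
advs = np.array([1.0, -0.5, 0.3], dtype=np.float32)           # normalized advantages
neglogpac_old = np.array([1.0, 0.8, 1.2], dtype=np.float32)   # -log pi_old(a|s)
neglogpac = np.array([0.7, 0.9, 1.2], dtype=np.float32)       # -log pi_new(a|s)

ratio = np.exp(neglogpac_old - neglogpac)                     # pi_new / pi_old
pg_losses1 = -advs * ratio
pg_losses2 = -advs * np.clip(ratio, 1 - cliprange, 1 + cliprange)
policy_loss = np.mean(np.maximum(pg_losses1, pg_losses2))     # clipped surrogate
approxkl = 0.5 * np.mean(np.square(neglogpac - neglogpac_old))
clipfrac = np.mean((np.abs(ratio - 1.0) > cliprange).astype(np.float32))
print(policy_loss, approxkl, clipfrac)
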
Example no. 9
0
    def __init__(self, agent, network, nsteps, rho, max_kl, ent_coef,
                 vf_stepsize, vf_iters, cg_damping, cg_iters, seed, load_path,
                 **network_kwargs):
        super(AgentModel, self).__init__(name='MATRPOModel')
        self.agent = agent
        self.nsteps = nsteps
        self.rho = rho
        self.max_kl = max_kl
        self.ent_coef = ent_coef
        self.cg_damping = cg_damping
        self.cg_iters = cg_iters
        self.vf_stepsize = vf_stepsize
        self.vf_iters = vf_iters

        set_global_seeds(seed)

        np.set_printoptions(precision=3)

        if MPI is not None:
            self.nworkers = MPI.COMM_WORLD.Get_size()
            self.rank = MPI.COMM_WORLD.Get_rank()
        else:
            self.nworkers = 1
            self.rank = 0

        # Setup losses and stuff
        # ----------------------------------------
        ob_space = agent.observation_space
        ac_space = agent.action_space

        if isinstance(network, str):
            network = get_network_builder(network)(**network_kwargs)

        with tf.name_scope(agent.name):
            with tf.name_scope("pi"):
                pi_policy_network = network(ob_space.shape)
                pi_value_network = network(ob_space.shape)
                self.pi = pi = PolicyWithValue(ac_space, pi_policy_network,
                                               pi_value_network)
            with tf.name_scope("oldpi"):
                old_pi_policy_network = network(ob_space.shape)
                old_pi_value_network = network(ob_space.shape)
                self.oldpi = oldpi = PolicyWithValue(ac_space,
                                                     old_pi_policy_network,
                                                     old_pi_value_network)

        self.comm_matrix = agent.comm_matrix.copy()
        self.estimates = np.ones([agent.nmates, nsteps], dtype=np.float32)
        self.multipliers = np.zeros([self.agent.nmates,
                                     self.nsteps]).astype(np.float32)
        for i, comm_i in enumerate(self.comm_matrix):
            self.estimates[i] = comm_i[self.agent.id] * self.estimates[i]

        pi_var_list = pi_policy_network.trainable_variables + list(
            pi.pdtype.trainable_variables)
        old_pi_var_list = old_pi_policy_network.trainable_variables + list(
            oldpi.pdtype.trainable_variables)
        vf_var_list = pi_value_network.trainable_variables + pi.value_fc.trainable_variables
        old_vf_var_list = old_pi_value_network.trainable_variables + oldpi.value_fc.trainable_variables

        self.pi_var_list = pi_var_list
        self.old_pi_var_list = old_pi_var_list
        self.vf_var_list = vf_var_list
        self.old_vf_var_list = old_vf_var_list

        if load_path is not None:
            load_path = osp.expanduser(load_path)
            ckpt = tf.train.Checkpoint(model=pi)
            manager = tf.train.CheckpointManager(ckpt,
                                                 load_path,
                                                 max_to_keep=None)
            ckpt.restore(manager.latest_checkpoint)

        self.vfadam = MpiAdam(vf_var_list)

        self.get_flat = U.GetFlat(pi_var_list)
        self.set_from_flat = U.SetFromFlat(pi_var_list)
        self.loss_names = [
            "Lagrange", "surrgain", "sync", "meankl", "entloss", "entropy"
        ]
        self.shapes = [var.get_shape().as_list() for var in pi_var_list]
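
The `U.GetFlat`/`U.SetFromFlat` helpers used above flatten a list of variables into one vector and write such a vector back. They are not shown in this snippet; the following is a minimal TF2 sketch of equivalent behaviour (an assumption about their interface, not the baselines implementation):

import numpy as np
import tensorflow as tf

def get_flat(var_list):
    # Concatenate every variable into one flat numpy vector.
    return np.concatenate([v.numpy().reshape(-1) for v in var_list])

def set_from_flat(var_list, theta):
    # Write a flat vector back into the variables, slice by slice.
    start = 0
    for v in var_list:
        shape = v.shape.as_list()
        size = int(np.prod(shape))
        v.assign(theta[start:start + size].reshape(shape))
        start += size

# Usage sketch: theta = get_flat(pi_var_list); ...; set_from_flat(pi_var_list, theta)
variables = [tf.Variable(tf.ones([2, 3])), tf.Variable(tf.zeros([4]))]
theta = get_flat(variables)
set_from_flat(variables, theta * 2.0)
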
Example no. 10
0
    def policy_fn(nbatch=None,
                  nsteps=None,
                  sess=None,
                  observ_placeholder=None,
                  randomization=True):
        ob_space = env.observation_space

        extra_tensors = {}

        X = observ_placeholder if observ_placeholder is not None else observation_placeholder(
            ob_space, batch_size=None)

        encoded_x = encode_observation(ob_space, X)

        # Randomization
        if randomization:
            encoded_x = tf.layers.conv2d(
                encoded_x / 255.,
                3,
                3,
                padding='same',
                kernel_initializer=tf.initializers.glorot_normal(),
                trainable=False,
                name='randcnn') * 255.
            randcnn_param = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                              scope="ppo2_model/randcnn")
            extra_tensors['randcnn_param'] = randcnn_param

        with tf.variable_scope('pi', reuse=tf.AUTO_REUSE):
            policy_latent = policy_network(encoded_x)
            extra_tensors['latent_fts'] = policy_latent
            if isinstance(policy_latent, tuple):
                policy_latent, recurrent_tensors = policy_latent

                if recurrent_tensors is not None:
                    # recurrent architecture, need a few more steps
                    nenv = nbatch // nsteps
                    assert nenv > 0, 'Bad input for recurrent policy: batch size {} smaller than nsteps {}'.format(
                        nbatch, nsteps)
                    policy_latent, recurrent_tensors = policy_network(
                        encoded_x, nenv)
                    extra_tensors.update(recurrent_tensors)

        _v_net = value_network

        if _v_net is None or _v_net == 'shared':
            vf_latent = policy_latent
        else:
            if _v_net == 'copy':
                _v_net = policy_network
            else:
                assert callable(_v_net)

            with tf.variable_scope('vf', reuse=tf.AUTO_REUSE):
                # TODO recurrent architectures are not supported with value_network=copy yet
                vf_latent = _v_net(encoded_x)

        policy = PolicyWithValue(env=env,
                                 observations=X,
                                 latent=policy_latent,
                                 vf_latent=vf_latent,
                                 sess=sess,
                                 estimate_q=estimate_q,
                                 **extra_tensors)
        return policy
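
The `randcnn` layer above applies a frozen, randomly initialized 3x3 convolution to the rescaled observations as a data-randomization step. Below is a TF2-style sketch of the same idea; the kernel scale and the per-call resampling are assumptions, whereas the original builds a non-trainable `tf.layers.conv2d` inside the TF1 graph.

import tensorflow as tf

def random_conv_augment(images):
    # images: float32 tensor [N, H, W, C] with values in [0, 255].
    # Draw a fresh random kernel that preserves the channel count; it is never trained.
    c = images.shape[-1]
    kernel = tf.random.normal([3, 3, c, c], stddev=0.1)
    out = tf.nn.conv2d(images / 255.0, kernel, strides=1, padding='SAME')
    return out * 255.0

augmented = random_conv_augment(tf.random.uniform([2, 64, 64, 3], 0.0, 255.0))
print(augmented.shape)  # (2, 64, 64, 3)
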
Example no. 11
0
def learn(
        *,
        network,
        env,
        eval_env,
        total_timesteps,
        timesteps_per_batch=1024,  # what to train on
        max_kl=0.001,
        cg_iters=10,
        gamma=0.99,
        lam=1.0,  # advantage estimation
        seed=None,
        ent_coef=0.0,
        cg_damping=1e-2,
        vf_stepsize=3e-4,
        vf_iters=3,
        log_path=None,
        max_episodes=0,
        max_iters=0,  # time constraint
        callback=None,
        load_path=None,
        **network_kwargs):
    '''
    Learn a policy function with the TRPO algorithm

    Parameters:
    ----------

    network                 neural network to learn. Can be either string ('mlp', 'cnn', 'lstm', 'lnlstm' for basic types)
                            or function that takes input placeholder and returns tuple (output, None) for feedforward nets
                            or (output, (state_placeholder, state_output, mask_placeholder)) for recurrent nets

    env                     environment (one of the gym environments or wrapped via baselines.common.vec_env.VecEnv-type class)

    timesteps_per_batch     timesteps per gradient estimation batch

    max_kl                  max KL divergence between old policy and new policy ( KL(pi_old || pi) )

    ent_coef                coefficient of policy entropy term in the optimization objective

    cg_iters                number of iterations of conjugate gradient algorithm

    cg_damping              conjugate gradient damping

    vf_stepsize             learning rate for the Adam optimizer used to optimize the value function loss

    vf_iters                number of value function optimization iterations per policy optimization step

    total_timesteps           max number of timesteps

    max_episodes            max number of episodes

    max_iters               maximum number of policy optimization iterations

    callback                function to be called with (locals(), globals()) each policy optimization step

    load_path               str, path to load the model from (default: None, i.e. no model is loaded)

    **network_kwargs        keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network

    Returns:
    -------

    learnt model

    '''

    if MPI is not None:
        nworkers = MPI.COMM_WORLD.Get_size()
        rank = MPI.COMM_WORLD.Get_rank()
    else:
        nworkers = 1
        rank = 0

    set_global_seeds(seed)

    np.set_printoptions(precision=3)
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space

    if isinstance(network, str):
        network = get_network_builder(network)(**network_kwargs)

    with tf.name_scope("pi"):
        pi_policy_network = network(ob_space.shape)
        pi_value_network = network(ob_space.shape)
        pi = PolicyWithValue(ac_space, pi_policy_network, pi_value_network)
    with tf.name_scope("oldpi"):
        old_pi_policy_network = network(ob_space.shape)
        old_pi_value_network = network(ob_space.shape)
        oldpi = PolicyWithValue(ac_space, old_pi_policy_network,
                                old_pi_value_network)

    pi_var_list = pi_policy_network.trainable_variables + list(
        pi.pdtype.trainable_variables)
    old_pi_var_list = old_pi_policy_network.trainable_variables + list(
        oldpi.pdtype.trainable_variables)
    vf_var_list = pi_value_network.trainable_variables + pi.value_fc.trainable_variables
    old_vf_var_list = old_pi_value_network.trainable_variables + oldpi.value_fc.trainable_variables

    if load_path is not None:
        load_path = osp.expanduser(load_path)
        ckpt = tf.train.Checkpoint(model=pi)
        manager = tf.train.CheckpointManager(ckpt, load_path, max_to_keep=None)
        ckpt.restore(manager.latest_checkpoint)

    vfadam = MpiAdam(vf_var_list)

    get_flat = U.GetFlat(pi_var_list)
    set_from_flat = U.SetFromFlat(pi_var_list)
    loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]
    shapes = [var.get_shape().as_list() for var in pi_var_list]

    def assign_old_eq_new():
        for pi_var, old_pi_var in zip(pi_var_list, old_pi_var_list):
            old_pi_var.assign(pi_var)
        for vf_var, old_vf_var in zip(vf_var_list, old_vf_var_list):
            old_vf_var.assign(vf_var)

    @tf.function
    def compute_lossandgrad(ob, ac, atarg):
        with tf.GradientTape() as tape:
            old_policy_latent = oldpi.policy_network(ob)
            old_pd, _ = oldpi.pdtype.pdfromlatent(old_policy_latent)
            policy_latent = pi.policy_network(ob)
            pd, _ = pi.pdtype.pdfromlatent(policy_latent)
            kloldnew = old_pd.kl(pd)
            ent = pd.entropy()
            meankl = tf.reduce_mean(kloldnew)
            meanent = tf.reduce_mean(ent)
            entbonus = ent_coef * meanent
            ratio = tf.exp(pd.logp(ac) - old_pd.logp(ac))
            surrgain = tf.reduce_mean(ratio * atarg)
            optimgain = surrgain + entbonus
            losses = [optimgain, meankl, entbonus, surrgain, meanent]
        gradients = tape.gradient(optimgain, pi_var_list)
        return losses + [U.flatgrad(gradients, pi_var_list)]

    @tf.function
    def compute_losses(ob, ac, atarg):
        old_policy_latent = oldpi.policy_network(ob)
        old_pd, _ = oldpi.pdtype.pdfromlatent(old_policy_latent)
        policy_latent = pi.policy_network(ob)
        pd, _ = pi.pdtype.pdfromlatent(policy_latent)
        kloldnew = old_pd.kl(pd)
        ent = pd.entropy()
        meankl = tf.reduce_mean(kloldnew)
        meanent = tf.reduce_mean(ent)
        entbonus = ent_coef * meanent
        ratio = tf.exp(pd.logp(ac) - old_pd.logp(ac))
        surrgain = tf.reduce_mean(ratio * atarg)
        optimgain = surrgain + entbonus
        losses = [optimgain, meankl, entbonus, surrgain, meanent]
        return losses

    #ob shape should be [batch_size, ob_dim], merged nenv
    #ret shape should be [batch_size]
    @tf.function
    def compute_vflossandgrad(ob, ret):
        with tf.GradientTape() as tape:
            pi_vf = pi.value(ob)
            vferr = tf.reduce_mean(tf.square(pi_vf - ret))
        return U.flatgrad(tape.gradient(vferr, vf_var_list), vf_var_list)

    @tf.function
    def compute_fvp(flat_tangent, ob, ac, atarg):
        with tf.GradientTape() as outer_tape:
            with tf.GradientTape() as inner_tape:
                old_policy_latent = oldpi.policy_network(ob)
                old_pd, _ = oldpi.pdtype.pdfromlatent(old_policy_latent)
                policy_latent = pi.policy_network(ob)
                pd, _ = pi.pdtype.pdfromlatent(policy_latent)
                kloldnew = old_pd.kl(pd)
                meankl = tf.reduce_mean(kloldnew)
            klgrads = inner_tape.gradient(meankl, pi_var_list)
            start = 0
            tangents = []
            for shape in shapes:
                sz = U.intprod(shape)
                tangents.append(
                    tf.reshape(flat_tangent[start:start + sz], shape))
                start += sz
            gvp = tf.add_n([
                tf.reduce_sum(g * tangent)
                for (g, tangent) in zipsame(klgrads, tangents)
            ])
        hessians_products = outer_tape.gradient(gvp, pi_var_list)
        fvp = U.flatgrad(hessians_products, pi_var_list)
        return fvp

    @contextmanager
    def timed(msg):
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(
                colorize("done in %.3f seconds" % (time.time() - tstart),
                         color='magenta'))
        else:
            yield

    def allmean(x):
        assert isinstance(x, np.ndarray)
        if MPI is not None:
            out = np.empty_like(x)
            MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
            out /= nworkers
        else:
            out = np.copy(x)

        return out

    th_init = get_flat()
    if MPI is not None:
        MPI.COMM_WORLD.Bcast(th_init, root=0)

    set_from_flat(th_init)
    vfadam.sync()
    print("Init param sum", th_init.sum(), flush=True)

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi, env, timesteps_per_batch)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards

    logdir = log_path + '/evaluator'
    modeldir = log_path + '/models'
    if not os.path.exists(logdir):
        os.makedirs(logdir)
    if not os.path.exists(modeldir):
        os.makedirs(modeldir)
    evaluator = Evaluator(env=eval_env, model=pi, logdir=logdir)
    max_inner_iter = 500000 if env.spec.id == 'InvertedDoublePendulum-v2' else 3000000
    epoch = vf_iters
    batch_size = timesteps_per_batch
    mb_size = 256
    inner_iter_per_iter = epoch * int(batch_size / mb_size)
    max_iter = int(max_inner_iter / inner_iter_per_iter)
    eval_num = 150
    eval_interval = save_interval = int(
        int(max_inner_iter / eval_num) / inner_iter_per_iter)

    if sum([max_iters > 0, total_timesteps > 0, max_episodes > 0]) == 0:
        # nothing to be done
        return pi

    assert sum([max_iters>0, total_timesteps>0, max_episodes>0]) < 2, \
        'out of max_iters, total_timesteps, and max_episodes only one should be specified'

    for update in range(1, max_iter + 1):
        if callback: callback(locals(), globals())
        # if total_timesteps and timesteps_so_far >= total_timesteps:
        #     break
        # elif max_episodes and episodes_so_far >= max_episodes:
        #     break
        # elif max_iters and iters_so_far >= max_iters:
        #     break
        logger.log("********** Iteration %i ************" % iters_so_far)
        if (update - 1) % eval_interval == 0:
            evaluator.run_evaluation(update - 1)
        if (update - 1) % save_interval == 0:
            ckpt = tf.train.Checkpoint(model=pi)
            ckpt.save(modeldir + '/ckpt_ite' + str((update - 1)))

        with timed("sampling"):
            seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
            "tdlamret"]
        ob = sf01(ob)
        vpredbefore = seg["vpred"]  # predicted value function before udpate
        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate

        if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret)
        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        args = ob, ac, atarg
        fvpargs = [arr[::5] for arr in args]

        def fisher_vector_product(p):
            return allmean(compute_fvp(p, *fvpargs).numpy()) + cg_damping * p

        assign_old_eq_new()  # set old parameter values to new parameter values
        with timed("computegrad"):
            *lossbefore, g = compute_lossandgrad(*args)
        lossbefore = allmean(np.array(lossbefore))
        g = g.numpy()
        g = allmean(g)
        if np.allclose(g, 0):
            logger.log("Got zero gradient. not updating")
        else:
            with timed("cg"):
                stepdir = cg(fisher_vector_product,
                             g,
                             cg_iters=cg_iters,
                             verbose=rank == 0)
            assert np.isfinite(stepdir).all()
            shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
            lm = np.sqrt(shs / max_kl)
            # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
            fullstep = stepdir / lm
            expectedimprove = g.dot(fullstep)
            surrbefore = lossbefore[0]
            stepsize = 1.0
            thbefore = get_flat()
            for _ in range(10):
                thnew = thbefore + fullstep * stepsize
                set_from_flat(thnew)
                meanlosses = surr, kl, *_ = allmean(
                    np.array(compute_losses(*args)))
                improve = surr - surrbefore
                logger.log("Expected: %.3f Actual: %.3f" %
                           (expectedimprove, improve))
                if not np.isfinite(meanlosses).all():
                    logger.log("Got non-finite value of losses -- bad!")
                elif kl > max_kl * 1.5:
                    logger.log("violated KL constraint. shrinking step.")
                elif improve < 0:
                    logger.log("surrogate didn't improve. shrinking step.")
                else:
                    logger.log("Stepsize OK!")
                    break
                stepsize *= .5
            else:
                logger.log("couldn't compute a good step")
                set_from_flat(thbefore)
            if nworkers > 1 and iters_so_far % 20 == 0:
                paramsums = MPI.COMM_WORLD.allgather(
                    (thnew.sum(), vfadam.getflat().sum()))  # list of tuples
                assert all(
                    np.allclose(ps, paramsums[0]) for ps in paramsums[1:])

        for (lossname, lossval) in zip(loss_names, meanlosses):
            logger.record_tabular(lossname, lossval)

        with timed("vf"):

            for _ in range(vf_iters):
                for (mbob, mbret) in dataset.iterbatches(
                    (seg["ob"], seg["tdlamret"]),
                        include_final_partial_batch=False,
                        batch_size=mb_size):
                    mbob = sf01(mbob)
                    g = allmean(compute_vflossandgrad(mbob, mbret).numpy())
                    vfadam.update(g, vf_stepsize)

        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        if MPI is not None:
            listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        else:
            listoflrpairs = [lrlocal]

        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)

        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1

        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)

        if rank == 0:
            logger.dump_tabular()

    return pi
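
In the loop above, `stepdir = cg(fisher_vector_product, g, ...)` solves the linear system F x = g with conjugate gradient, using only Fisher-vector products and never materializing F. A standalone sketch of the algorithm (not baselines' `cg` implementation), with a tiny correctness check on a dense matrix:

import numpy as np

def conjugate_gradient(fvp, b, iters=10, residual_tol=1e-10):
    # Solve A x = b for x, where fvp(v) returns A @ v (e.g. a Fisher-vector product).
    x = np.zeros_like(b)
    r = b.copy()          # residual b - A x (x starts at 0)
    p = r.copy()          # search direction
    rdotr = r.dot(r)
    for _ in range(iters):
        Ap = fvp(p)
        alpha = rdotr / p.dot(Ap)
        x += alpha * p
        r -= alpha * Ap
        new_rdotr = r.dot(r)
        if new_rdotr < residual_tol:
            break
        p = r + (new_rdotr / rdotr) * p
        rdotr = new_rdotr
    return x

# Tiny check against a random symmetric positive-definite system.
A = np.array([[4.0, 1.0], [1.0, 3.0]])
b = np.array([1.0, 2.0])
x = conjugate_gradient(lambda v: A @ v, b, iters=10)
assert np.allclose(A @ x, b, atol=1e-6)
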
Example no. 12
0
    def __init__(self, agent, network, nsteps, rho, ent_coef, vf_coef,
                 max_grad_norm, seed, load_path, **network_kwargs):
        super(AgentModel, self).__init__(name='MAPPO2Model')
        set_global_seeds(seed)
        # Get state_space and action_space
        ob_space = agent.observation_space
        ac_space = agent.action_space

        if isinstance(network, str):
            network_type = network
            policy_network_fn = get_network_builder(network_type)(
                **network_kwargs)
            network = policy_network_fn(ob_space.shape)

        self.train_model = PolicyWithValue(ac_space, network)
        if MPI is not None:
            self.optimizer = MpiAdamOptimizer(
                MPI.COMM_WORLD, self.train_model.trainable_variables)
        else:
            self.optimizer = tf.keras.optimizers.Adam()

        # if isinstance(network, str):
        #     network = get_network_builder(network)(**network_kwargs)
        # policy_network = network(ob_space.shape)
        # value_network = network(ob_space.shape)
        # self.train_model = pi = PolicyWithValue(ac_space, policy_network, value_network)
        # self.pi_var_list = policy_network.trainable_variables + list(pi.pdtype.trainable_variables)
        # self.vf_var_list = value_network.trainable_variables + pi.value_fc.trainable_variables

        # if MPI is not None:
        #     self.pi_optimizer = MpiAdamOptimizer(MPI.COMM_WORLD, self.pi_var_list)
        #     self.vf_optimizer = MpiAdamOptimizer(MPI.COMM_WORLD, self.vf_var_list)
        # else:
        #     self.pi_optimizer = tf.keras.optimizers.Adam()
        #     self.vf_optimizer = tf.keras.optimizers.Adam()
        self.agent = agent
        self.nsteps = nsteps
        self.rho = rho
        self.ent_coef = ent_coef
        self.vf_coef = vf_coef
        self.max_grad_norm = max_grad_norm
        self.step = self.train_model.step
        self.value = self.train_model.value
        self.initial_state = self.train_model.initial_state
        self.loss_names = [
            'Lagrange_loss', 'sync_loss', 'policy_loss', 'value_loss',
            'policy_entropy', 'approxkl', 'clipfrac'
        ]
        if MPI is not None:
            sync_from_root(self.variables)

        self.comm_matrix = agent.comm_matrix.copy()
        self.estimates = np.ones([agent.nmates, nsteps], dtype=np.float32)
        self.multipliers = np.zeros([agent.nmates, nsteps], dtype=np.float32)
        for i, comm_i in enumerate(self.comm_matrix):
            self.estimates[i] = comm_i[self.agent.id] * self.estimates[i]

        if load_path is not None:
            load_path = osp.expanduser(load_path)
            ckpt = tf.train.Checkpoint(model=self.train_model)
            manager = tf.train.CheckpointManager(ckpt,
                                                 load_path,
                                                 max_to_keep=None)
            ckpt.restore(manager.latest_checkpoint)
Example no. 13
0
class AgentModel(tf.Module):
    def __init__(self, agent, network, nsteps, rho, ent_coef, vf_coef,
                 max_grad_norm, seed, load_path, **network_kwargs):
        super(AgentModel, self).__init__(name='MAPPO2Model')
        set_global_seeds(seed)
        # Get state_space and action_space
        ob_space = agent.observation_space
        ac_space = agent.action_space

        if isinstance(network, str):
            network_type = network
            policy_network_fn = get_network_builder(network_type)(
                **network_kwargs)
            network = policy_network_fn(ob_space.shape)

        self.train_model = PolicyWithValue(ac_space, network)
        if MPI is not None:
            self.optimizer = MpiAdamOptimizer(
                MPI.COMM_WORLD, self.train_model.trainable_variables)
        else:
            self.optimizer = tf.keras.optimizers.Adam()

        # if isinstance(network, str):
        #     network = get_network_builder(network)(**network_kwargs)
        # policy_network = network(ob_space.shape)
        # value_network = network(ob_space.shape)
        # self.train_model = pi = PolicyWithValue(ac_space, policy_network, value_network)
        # self.pi_var_list = policy_network.trainable_variables + list(pi.pdtype.trainable_variables)
        # self.vf_var_list = value_network.trainable_variables + pi.value_fc.trainable_variables

        # if MPI is not None:
        #     self.pi_optimizer = MpiAdamOptimizer(MPI.COMM_WORLD, self.pi_var_list)
        #     self.vf_optimizer = MpiAdamOptimizer(MPI.COMM_WORLD, self.vf_var_list)
        # else:
        #     self.pi_optimizer = tf.keras.optimizers.Adam()
        #     self.vf_optimizer = tf.keras.optimizers.Adam()
        self.agent = agent
        self.nsteps = nsteps
        self.rho = rho
        self.ent_coef = ent_coef
        self.vf_coef = vf_coef
        self.max_grad_norm = max_grad_norm
        self.step = self.train_model.step
        self.value = self.train_model.value
        self.initial_state = self.train_model.initial_state
        self.loss_names = [
            'Lagrange_loss', 'sync_loss', 'policy_loss', 'value_loss',
            'policy_entropy', 'approxkl', 'clipfrac'
        ]
        if MPI is not None:
            sync_from_root(self.variables)

        self.comm_matrix = agent.comm_matrix.copy()
        self.estimates = np.ones([agent.nmates, nsteps], dtype=np.float32)
        self.multipliers = np.zeros([agent.nmates, nsteps], dtype=np.float32)
        for i, comm_i in enumerate(self.comm_matrix):
            self.estimates[i] = comm_i[self.agent.id] * self.estimates[i]

        if load_path is not None:
            load_path = osp.expanduser(load_path)
            ckpt = tf.train.Checkpoint(model=self.train_model)
            manager = tf.train.CheckpointManager(ckpt,
                                                 load_path,
                                                 max_to_keep=None)
            ckpt.restore(manager.latest_checkpoint)

    def reinitial_estimates(self):
        self.estimates = np.random.normal(
            0, 0.1, [self.agent.nmates, self.nsteps]).astype(np.float32)
        self.multipliers = np.random.uniform(
            0, 1, [self.agent.nmates, self.nsteps]).astype(np.float32)
        for i, comm_i in enumerate(self.comm_matrix):
            self.estimates[i] = comm_i[self.agent.id] * self.estimates[i]

    def store_oldpi_var(self):
        pi_var_list = self.train_model.policy_network.trainable_variables + \
                      list(self.train_model.pdtype.trainable_variables)
        self.oldpi_var_list = [var.numpy() for var in pi_var_list]

    def assign_new_eq_old(self):
        pi_var_list = self.train_model.policy_network.trainable_variables + \
                      list(self.train_model.pdtype.trainable_variables)
        for pi_var, old_pi_var in zip(pi_var_list, self.oldpi_var_list):
            pi_var.assign(old_pi_var)

    # @tf.function
    # def get_vf_grad(self, cliprange, obs, returns, actions, values, advs, neglogpac_old):
    #     with tf.GradientTape() as tape:
    #         vpred = self.train_model.value(obs)
    #         vpredclipped = values + tf.clip_by_value(vpred - values, -cliprange, cliprange)
    #         vf_losses1 = tf.square(vpred - returns)
    #         vf_losses2 = tf.square(vpredclipped - returns)
    #         vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))

    #     vf_grads = tape.gradient(vf_loss, self.vf_var_list)
    #     if self.max_grad_norm is not None:
    #         vf_grads, _ = tf.clip_by_global_norm(vf_grads, self.max_grad_norm)
    #     if MPI is not None:
    #         vf_grads = tf.concat([tf.reshape(g, (-1,)) for g in vf_grads], axis=0)

    #     return vf_grads, vf_loss

    @tf.function
    def get_pi_grad(self, cliprange, nb, estimates, multipliers, obs, returns,
                    actions, values, advs, neglogpac_old):
        with tf.GradientTape() as tape:
            policy_latent = self.train_model.policy_network(obs)
            pd, logits = self.train_model.pdtype.pdfromlatent(policy_latent)
            neglogpac = pd.neglogp(actions)
            entropy = tf.reduce_mean(pd.entropy())

            vpred = self.train_model.value(obs)
            vpredclipped = values + tf.clip_by_value(vpred - values,
                                                     -cliprange, cliprange)
            vf_losses1 = tf.square(vpred - returns)
            vf_losses2 = tf.square(vpredclipped - returns)
            vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))

            ratio = tf.exp(neglogpac_old - neglogpac)
            clipped_ratio = tf.clip_by_value(ratio, 1 - cliprange,
                                             1 + cliprange)
            pg_losses1 = -advs * ratio
            pg_losses2 = -advs * clipped_ratio
            pg_loss = tf.reduce_mean(tf.maximum(pg_losses1, pg_losses2))

            comm = self.comm_matrix[self.comm_matrix[:, nb] != 0][0, self.agent.id]
            syncerr = comm * ratio - estimates
            sync_loss = tf.reduce_mean(multipliers * syncerr) + \
                        0.5 * self.rho * (tf.reduce_mean(tf.square(syncerr)))

            approxkl = .5 * tf.reduce_mean(
                tf.square(neglogpac - neglogpac_old))
            clipfrac = tf.reduce_mean(
                tf.cast(tf.greater(tf.abs(ratio - 1.0), cliprange),
                        tf.float32))

            loss = pg_loss + sync_loss - entropy * self.ent_coef + vf_loss * self.vf_coef

        var_list = self.train_model.trainable_variables
        grads = tape.gradient(loss, var_list)
        if self.max_grad_norm is not None:
            grads, _ = tf.clip_by_global_norm(grads, self.max_grad_norm)
        if MPI is not None:
            grads = tf.concat([tf.reshape(g, (-1, )) for g in grads], axis=0)
        return grads, loss, pg_loss, sync_loss, vf_loss, entropy, approxkl, clipfrac

        # pi_grads = tape.gradient(pi_loss, self.pi_var_list)
        # if self.max_grad_norm is not None:
        #     pi_grads, _ = tf.clip_by_global_norm(pi_grads, self.max_grad_norm)
        # if MPI is not None:
        #     pi_grads = tf.concat([tf.reshape(g, (-1,)) for g in pi_grads], axis=0)
        # return pi_grads, pi_loss, pg_loss, sync_loss, entropy, approxkl, clipfrac

    def pi_update(self, lr, cliprange, nb, obs, returns, actions, values, advs,
                  neglogpacs_old):
        estimates = self.estimates[nb]
        multipliers = self.multipliers[nb]
        pi_grads, pi_loss, pg_loss, sync_loss, vf_loss, entropy, approxkl, clipfrac = self.get_pi_grad(
            cliprange, nb, estimates, multipliers, obs, returns, actions,
            values, advs, neglogpacs_old)

        if MPI is not None:
            self.optimizer.apply_gradients(pi_grads, lr)
        else:
            self.optimizer.learning_rate = lr
            grads_and_vars = zip(pi_grads,
                                 self.train_model.trainable_variables)
            self.optimizer.apply_gradients(grads_and_vars)

        return pi_loss, pg_loss, sync_loss, vf_loss, entropy, approxkl, clipfrac

        # if MPI is not None:
        #     self.pi_optimizer.apply_gradients(pi_grads, lr)
        # else:
        #     self.pi_optimizer.learning_rate = lr
        #     grads_and_vars = zip(pi_grads, self.pi_var_list)
        #     self.pi_optimizer.apply_gradients(grads_and_vars)

        # return pi_loss, pg_loss, sync_loss, entropy, approxkl, clipfrac

    # def vf_update(self, lr, cliprange, obs, returns, actions, values, advs, neglogpacs_old):
    #     vf_grads, vf_loss = self.get_vf_grad(
    #         cliprange, obs, returns, actions, values, advs, neglogpacs_old)
    #     if MPI is not None:
    #         self.vf_optimizer.apply_gradients(vf_grads, lr)
    #     else:
    #         self.vf_optimizer.learning_rate = lr
    #         grads_and_vars = zip(vf_grads, self.train_model.trainable_variables)
    #         self.vf_optimizer.apply_gradients(grads_and_vars)

    #     return vf_loss

    def info_to_exchange(self, cliprange, ob, ac, neglogpac_old, nb):
        policy_latent = self.train_model.policy_network(ob)
        pd, logits = self.train_model.pdtype.pdfromlatent(policy_latent)
        neglogpac = pd.neglogp(ac)
        ratio = tf.exp(neglogpac_old - neglogpac)
        clipped_ratio = tf.clip_by_value(tf.exp(-neglogpac), 1 - cliprange,
                                         1 + cliprange)

        return ratio, self.multipliers[nb]

    def exchange(self, cliprange, ob, ac, neglogpac_old, nb_ratio,
                 nb_multipliers, nb):
        policy_latent = self.train_model.policy_network(ob)
        pd, logits = self.train_model.pdtype.pdfromlatent(policy_latent)
        neglogpac = pd.neglogp(ac)
        ratio = tf.exp(neglogpac_old - neglogpac)
        clipped_ratio = tf.clip_by_value(ratio, 1 - cliprange, 1 + cliprange)
        comm = self.comm_matrix[self.comm_matrix[:, nb] != 0][0, self.agent.id]

        v = 0.5 * (self.multipliers[nb] + nb_multipliers) + \
            0.5 * self.rho * (comm * ratio + (-comm) * nb_ratio)
        estimate = np.array((1.0 / self.rho) * (self.multipliers[nb] - v) +
                            comm * ratio)

        self.estimates = tf.tensor_scatter_nd_update(self.estimates, [[nb]],
                                                     estimate[None, :])
        self.multipliers = tf.tensor_scatter_nd_update(self.multipliers,
                                                       [[nb]], v[None, :])
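
The `exchange()` method above rewrites one row of `self.estimates` and `self.multipliers` with `tf.tensor_scatter_nd_update`. A minimal illustration of that call on dummy data (shapes chosen only for the example):

import numpy as np
import tensorflow as tf

estimates = tf.zeros([3, 4])                    # [nmates, nsteps]
row = np.arange(4.0, dtype=np.float32)          # new values for one row
updated = tf.tensor_scatter_nd_update(estimates, [[1]], row[None, :])
print(updated.numpy())
# Row 1 is now [0., 1., 2., 3.]; the other rows remain zero.
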
Example no. 14
0
    def policy_fn(nbatch=None, nsteps=None, sess=None, observ_placeholder=None,
            mix_mode='nomix'):
        ob_space = env.observation_space

        extra_tensors = {}

        X = observ_placeholder if observ_placeholder is not None \
                else observation_placeholder(ob_space, batch_size=None)

        if mix_mode in ['mixreg', 'mixobs']:
            COEFF = tf.placeholder(tf.float32, [None])
            INDICES = tf.placeholder(tf.int32, [None])
            OTHER_INDICES = tf.placeholder(tf.int32, [None])
            coeff = tf.reshape(COEFF, (-1, 1, 1, 1))
            encoded_x = tf.cast(X, tf.float32)
            encoded_x = coeff * tf.gather(encoded_x, INDICES, axis=0) \
                    + (1 - coeff) * tf.gather(encoded_x, OTHER_INDICES, axis=0)
            encoded_x = tf.cast(encoded_x, tf.uint8)
            extra_tensors['coeff'] = COEFF
            extra_tensors['indices'] = INDICES
            extra_tensors['other_indices'] = OTHER_INDICES
        elif mix_mode == 'nomix':
            encoded_x = X
        else:
            raise ValueError(f"Unknown mixing mode: {mix_mode} !")

        encoded_x = encode_observation(ob_space, encoded_x)

        with tf.variable_scope('pi', reuse=tf.AUTO_REUSE):
            policy_latent = policy_network(encoded_x)
            if isinstance(policy_latent, tuple):
                policy_latent, recurrent_tensors = policy_latent

                if recurrent_tensors is not None:
                    # recurrent architecture, need a few more steps
                    nenv = nbatch // nsteps
                    assert nenv > 0, 'Bad input for recurrent policy: batch ' \
                            +'size {} smaller than nsteps {}'.format(
                                    nbatch, nsteps)
                    policy_latent, recurrent_tensors = policy_network(
                            encoded_x, nenv)
                    extra_tensors.update(recurrent_tensors)


        _v_net = value_network

        if _v_net is None or _v_net == 'shared':
            vf_latent = policy_latent
        else:
            if _v_net == 'copy':
                _v_net = policy_network
            else:
                assert callable(_v_net)

            with tf.variable_scope('vf', reuse=tf.AUTO_REUSE):
                # TODO recurrent architectures are not supported with
                # value_network=copy yet
                vf_latent = _v_net(encoded_x)

        policy = PolicyWithValue(
            env=env,
            observations=X,
            latent=policy_latent,
            vf_latent=vf_latent,
            sess=sess,
            estimate_q=estimate_q,
            # JAG: Pass adv_gamma to policy
            adv_gamma=adv_gamma,
            **extra_tensors
        )
        return policy
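
The `mixreg`/`mixobs` branch above mixes pairs of observations with per-sample coefficients before encoding. In numpy, the same interpolation looks like the snippet below; the Beta-distributed coefficients and the random pairing are assumptions for illustration, since in the original these are fed in externally through the `coeff`, `indices`, and `other_indices` placeholders.

import numpy as np

obs = np.random.randint(0, 256, size=(4, 8, 8, 3)).astype(np.float32)
coeff = np.random.beta(0.2, 0.2, size=(4,)).astype(np.float32)   # assumed mixing weights
indices = np.arange(4)
other_indices = np.random.permutation(4)

c = coeff.reshape(-1, 1, 1, 1)                                   # broadcast over H, W, C
mixed = c * obs[indices] + (1.0 - c) * obs[other_indices]
print(mixed.shape)  # (4, 8, 8, 3)
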
Example no. 15
0
def learn(
        *,
        network,
        env,
        save,
        total_timesteps,
        timesteps_per_batch=1024,  # what to train on
        max_kl=0.001,
        cg_iters=10,
        gamma=0.99,
        lam=1.0,  # advantage estimation
        seed=None,
        ent_coef=0.0,
        cg_damping=1e-2,
        vf_stepsize=3e-4,
        vf_iters=3,
        max_episodes=0,
        max_iters=0,  # time constraint
        callback=None,
        load_path=None,
        **network_kwargs):
    '''
    Learn a policy function with the TRPO algorithm

    Parameters:
    ----------

    network                 neural network to learn. Can be either string ('mlp', 'cnn', 'lstm', 'lnlstm' for basic types)
                            or function that takes input placeholder and returns tuple (output, None) for feedforward nets
                            or (output, (state_placeholder, state_output, mask_placeholder)) for recurrent nets

    env                     environment (one of the gym environments or wrapped via baselines.common.vec_env.VecEnv-type class)

    timesteps_per_batch     timesteps per gradient estimation batch

    max_kl                  max KL divergence between old policy and new policy ( KL(pi_old || pi) )

    ent_coef                coefficient of policy entropy term in the optimization objective

    cg_iters                number of iterations of conjugate gradient algorithm

    cg_damping              conjugate gradient damping

    vf_stepsize             learning rate for the Adam optimizer used to optimize the value function loss

    vf_iters                number of value function optimization iterations per policy optimization step

    total_timesteps           max number of timesteps

    max_episodes            max number of episodes

    max_iters               maximum number of policy optimization iterations

    callback                function to be called with (locals(), globals()) each policy optimization step

    load_path               str, path to load the model from (default: None, i.e. no model is loaded)

    **network_kwargs        keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network

    Returns:
    -------

    learnt model

    '''

    if MPI is not None:
        nworkers = MPI.COMM_WORLD.Get_size()
        rank = MPI.COMM_WORLD.Get_rank()
    else:
        nworkers = 1
        rank = 0

    set_global_seeds(seed)

    np.set_printoptions(precision=3)
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space

    if isinstance(network, str):
        network, network_model = get_network_builder(network)(**network_kwargs)

    with tf.name_scope("pi"):
        pi_policy_network = network(ob_space.shape)
        pi_value_network = network(ob_space.shape)
        pi = PolicyWithValue(ac_space, pi_policy_network, pi_value_network)
    with tf.name_scope("oldpi"):
        old_pi_policy_network = network(ob_space.shape)
        old_pi_value_network = network(ob_space.shape)
        oldpi = PolicyWithValue(ac_space, old_pi_policy_network,
                                old_pi_value_network)

    pi_var_list = pi_policy_network.trainable_variables + list(
        pi.pdtype.trainable_variables)
    old_pi_var_list = old_pi_policy_network.trainable_variables + list(
        oldpi.pdtype.trainable_variables)
    vf_var_list = pi_value_network.trainable_variables + pi.value_fc.trainable_variables
    old_vf_var_list = old_pi_value_network.trainable_variables + oldpi.value_fc.trainable_variables

    if load_path is not None:
        load_path = osp.expanduser(load_path)
        ckpt = tf.train.Checkpoint(model=pi)
        manager = tf.train.CheckpointManager(ckpt, load_path, max_to_keep=None)
        ckpt.restore(manager.latest_checkpoint)

    vfadam = MpiAdam(vf_var_list)

    get_flat = U.GetFlat(pi_var_list)
    set_from_flat = U.SetFromFlat(pi_var_list)
    loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]
    shapes = [var.get_shape().as_list() for var in pi_var_list]

    def assign_old_eq_new():
        for pi_var, old_pi_var in zip(pi_var_list, old_pi_var_list):
            old_pi_var.assign(pi_var)
        for vf_var, old_vf_var in zip(vf_var_list, old_vf_var_list):
            old_vf_var.assign(vf_var)

    @tf.function
    def compute_lossandgrad(ob, ac, atarg):
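        # TRPO surrogate objective:
        #   optimgain = E[ exp(logp(a|s) - logp_old(a|s)) * A(s,a) ] + ent_coef * H(pi)
        # Returns [optimgain, meankl, entbonus, surrgain, meanent] plus the
        # flattened gradient of optimgain w.r.t. the policy parameters.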
        with tf.GradientTape() as tape:
            old_policy_latent = oldpi.policy_network(ob)
            old_pd, _ = oldpi.pdtype.pdfromlatent(old_policy_latent)
            policy_latent = pi.policy_network(ob)
            pd, _ = pi.pdtype.pdfromlatent(policy_latent)
            kloldnew = old_pd.kl(pd)
            ent = pd.entropy()
            meankl = tf.reduce_mean(kloldnew)
            meanent = tf.reduce_mean(ent)
            entbonus = ent_coef * meanent
            ratio = tf.exp(pd.logp(ac) - old_pd.logp(ac))
            surrgain = tf.reduce_mean(ratio * atarg)
            optimgain = surrgain + entbonus
            losses = [optimgain, meankl, entbonus, surrgain, meanent]
        gradients = tape.gradient(optimgain, pi_var_list)
        return losses + [U.flatgrad(gradients, pi_var_list)]

    @tf.function
    def compute_losses(ob, ac, atarg):
        old_policy_latent = oldpi.policy_network(ob)
        old_pd, _ = oldpi.pdtype.pdfromlatent(old_policy_latent)
        policy_latent = pi.policy_network(ob)
        pd, _ = pi.pdtype.pdfromlatent(policy_latent)
        kloldnew = old_pd.kl(pd)
        ent = pd.entropy()
        meankl = tf.reduce_mean(kloldnew)
        meanent = tf.reduce_mean(ent)
        entbonus = ent_coef * meanent
        ratio = tf.exp(pd.logp(ac) - old_pd.logp(ac))
        surrgain = tf.reduce_mean(ratio * atarg)
        optimgain = surrgain + entbonus
        losses = [optimgain, meankl, entbonus, surrgain, meanent]
        return losses

    # ob shape should be [batch_size, ob_dim] (nenv dimension already merged)
    # ret shape should be [batch_size]
    @tf.function
    def compute_vflossandgrad(ob, ret):
        with tf.GradientTape() as tape:
            pi_vf = pi.value(ob)
            vferr = tf.reduce_mean(tf.square(pi_vf - ret))
        return U.flatgrad(tape.gradient(vferr, vf_var_list), vf_var_list)

    @tf.function
    def compute_fvp(flat_tangent, ob, ac, atarg):
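        # Fisher-vector product F v without materializing F: the Fisher matrix
        # equals the Hessian of the mean KL(pi_old || pi) at the current
        # parameters, so F v = d/dtheta [ (dKL/dtheta) . v ] -- one backward
        # pass for the KL gradient, a dot product with the tangent vector, then
        # a second backward pass (the two nested GradientTapes).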
        with tf.GradientTape() as outer_tape:
            with tf.GradientTape() as inner_tape:
                old_policy_latent = oldpi.policy_network(ob)
                old_pd, _ = oldpi.pdtype.pdfromlatent(old_policy_latent)
                policy_latent = pi.policy_network(ob)
                pd, _ = pi.pdtype.pdfromlatent(policy_latent)
                kloldnew = old_pd.kl(pd)
                meankl = tf.reduce_mean(kloldnew)
            klgrads = inner_tape.gradient(meankl, pi_var_list)
            start = 0
            tangents = []
            for shape in shapes:
                sz = U.intprod(shape)
                tangents.append(
                    tf.reshape(flat_tangent[start:start + sz], shape))
                start += sz
            gvp = tf.add_n([
                tf.reduce_sum(g * tangent)
                for (g, tangent) in zipsame(klgrads, tangents)
            ])
        hessians_products = outer_tape.gradient(gvp, pi_var_list)
        fvp = U.flatgrad(hessians_products, pi_var_list)
        return fvp

    @contextmanager
    def timed(msg):
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(
                colorize("done in %.3f seconds" % (time.time() - tstart),
                         color='magenta'))
        else:
            yield

    def allmean(x):
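        # Average a numpy array across all MPI workers (Allreduce / nworkers) so
        # that gradients, losses and Fisher-vector products agree across ranks;
        # with a single worker this is just a copy.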
        assert isinstance(x, np.ndarray)
        if MPI is not None:
            out = np.empty_like(x)
            MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
            out /= nworkers
        else:
            out = np.copy(x)

        return out

    th_init = get_flat()
    if MPI is not None:
        MPI.COMM_WORLD.Bcast(th_init, root=0)

    set_from_flat(th_init)
    vfadam.sync()
    print("Init param sum", th_init.sum(), flush=True)

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi, env, timesteps_per_batch)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards

    # ---------------------- New ----------------------
    rewforbuffer = deque(maxlen=40)
    rewctrlbuffer = deque(maxlen=40)
    rewconbuffer = deque(maxlen=40)
    rewsurbuffer = deque(maxlen=40)

    rewformeanbuf = np.array([])
    rewctrlmeanbuf = np.array([])
    rewconmeanbuf = np.array([])
    rewsurmeanbuf = np.array([])
    # -------------------------------------------------

    if sum([max_iters > 0, total_timesteps > 0, max_episodes > 0]) == 0:
        # nothing to be done
        return pi

    assert sum([max_iters>0, total_timesteps>0, max_episodes>0]) < 2, \
        'out of max_iters, total_timesteps, and max_episodes only one should be specified'

    x_axis = 0
    x_holder = np.array([])
    rew_holder = np.array([])
    while True:
        if timesteps_so_far > total_timesteps - 1500:
            # Start recording video roughly 1500 timesteps before training ends
            # (note: this re-wraps env on every remaining iteration)
            env = VecVideoRecorder(env,
                                   osp.join(logger.get_dir(), "videos"),
                                   record_video_trigger=lambda x: True,
                                   video_length=200)
            seg_gen = traj_segment_generator(pi, env, timesteps_per_batch)

        if callback: callback(locals(), globals())
        if total_timesteps and timesteps_so_far >= total_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        logger.log("********** Iteration %i ************" % iters_so_far)

        with timed("sampling"):
            seg = seg_gen.__next__()

        add_vtarg_and_adv(seg, gamma, lam)
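        # seg now also holds GAE(gamma, lam) advantage estimates (seg["adv"])
        # and the corresponding lambda-returns (seg["tdlamret"]) used below as
        # value-function targets.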

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
        ob = sf01(ob)
        vpredbefore = seg["vpred"]  # predicted value function before update
        # standardized advantage function estimate
        atarg = (atarg - atarg.mean()) / atarg.std()

        if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret)
        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        args = ob, ac, atarg
        fvpargs = [arr[::5] for arr in args]

        def fisher_vector_product(p):
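            # FVP estimated on every 5th transition (fvpargs) to cut cost,
            # averaged across workers, plus Tikhonov damping cg_damping * p so
            # the conjugate-gradient solve of F x = g stays well-conditioned.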
            return allmean(compute_fvp(p, *fvpargs).numpy()) + cg_damping * p

        assign_old_eq_new()  # set old parameter values to new parameter values
        with timed("computegrad"):
            *lossbefore, g = compute_lossandgrad(*args)
        lossbefore = allmean(np.array(lossbefore))
        g = g.numpy()
        g = allmean(g)
        if np.allclose(g, 0):
            logger.log("Got zero gradient. not updating")
        else:
            with timed("cg"):
                stepdir = cg(fisher_vector_product,
                             g,
                             cg_iters=cg_iters,
                             verbose=rank == 0)
            assert np.isfinite(stepdir).all()
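            # Scale the CG direction s so the quadratic KL model reaches the
            # trust-region boundary: with shs = 0.5 * s^T F s, dividing by
            # lm = sqrt(shs / max_kl) yields a full step whose model KL is
            # approximately max_kl.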
            shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
            lm = np.sqrt(shs / max_kl)
            # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
            fullstep = stepdir / lm
            expectedimprove = g.dot(fullstep)
            surrbefore = lossbefore[0]
            stepsize = 1.0
            thbefore = get_flat()
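            # Backtracking line search: try the full natural-gradient step and
            # halve it (at most 10 times) until the surrogate improves while the
            # mean KL stays below 1.5 * max_kl; if no step is accepted, revert
            # to thbefore.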
            for _ in range(10):
                thnew = thbefore + fullstep * stepsize
                set_from_flat(thnew)
                meanlosses = surr, kl, *_ = allmean(
                    np.array(compute_losses(*args)))
                improve = surr - surrbefore
                logger.log("Expected: %.3f Actual: %.3f" %
                           (expectedimprove, improve))
                if not np.isfinite(meanlosses).all():
                    logger.log("Got non-finite value of losses -- bad!")
                elif kl > max_kl * 1.5:
                    logger.log("violated KL constraint. shrinking step.")
                elif improve < 0:
                    logger.log("surrogate didn't improve. shrinking step.")
                else:
                    logger.log("Stepsize OK!")
                    break
                stepsize *= .5
            else:
                logger.log("couldn't compute a good step")
                set_from_flat(thbefore)
            if nworkers > 1 and iters_so_far % 20 == 0:
                paramsums = MPI.COMM_WORLD.allgather(
                    (thnew.sum(), vfadam.getflat().sum()))  # list of tuples
                assert all(
                    np.allclose(ps, paramsums[0]) for ps in paramsums[1:])

        for (lossname, lossval) in zip(loss_names, meanlosses):
            logger.record_tabular(lossname, lossval)

        with timed("vf"):
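            # vf_iters passes of MpiAdam updates on 64-sample minibatches of
            # (observation, lambda-return) pairs at learning rate vf_stepsize.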

            for _ in range(vf_iters):
                for (mbob, mbret) in dataset.iterbatches(
                    (seg["ob"], seg["tdlamret"]),
                        include_final_partial_batch=False,
                        batch_size=64):
                    mbob = sf01(mbob)
                    g = allmean(compute_vflossandgrad(mbob, mbret).numpy())
                    vfadam.update(g, vf_stepsize)

        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))

        lrlocal = (seg["ep_lens"], seg["ep_rets"], seg["ep_rets_for"],
                   seg["ep_rets_ctrl"], seg["ep_rets_con"], seg["ep_rets_sur"]
                   )  # local values
        if MPI is not None:
            listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        else:
            listoflrpairs = [lrlocal]

        lens, rews, rews_for, rews_ctrl, rews_con, rews_sur = map(
            flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)

        # ---------------------- New ----------------------
        rewforbuffer.extend(rews_for)
        rewctrlbuffer.extend(rews_ctrl)
        rewconbuffer.extend(rews_con)
        rewsurbuffer.extend(rews_sur)

        rewformeanbuf = np.append([rewformeanbuf], [np.mean(rewforbuffer)])
        rewctrlmeanbuf = np.append([rewctrlmeanbuf], [np.mean(rewctrlbuffer)])
        rewconmeanbuf = np.append([rewconmeanbuf], [np.mean(rewconbuffer)])
        rewsurmeanbuf = np.append([rewsurmeanbuf], [np.mean(rewsurbuffer)])
        # -------------------------------------------------

        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1

        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)

        if rank == 0:
            logger.dump_tabular()

        x_axis += 1
        x_holder = np.append([x_holder], [x_axis])
        rew_holder = np.append([rew_holder], [np.mean(rewbuffer)])

    # --------------------------------------- NEW -----------------------------------------------------
    with open("img_rec.txt", "r") as rec:
        cur_gen = rec.read()
        cur_gen = cur_gen.strip()  # remove \n

    dir_of_gens = [
        '1_1', '2_1', '3_1', '1_2', '2_2', '3_2', '1_3', '2_3', '3_3', '1_4',
        '2_4', '3_4', '1_5', '2_5', '3_5', '1_6', '2_6', '3_6', '1_7', '2_7',
        '3_7', '1_8', '2_8', '3_8', '1_9', '2_9', '3_9', '1_10', '2_10',
        '3_10', '1_11', '2_11', '3_11', '1_12', '2_12', '3_12'
    ]
    # -------------------------------------------------------------------------------------------------

    from matplotlib import pyplot as plt
    f = plt.figure(1)
    plt.plot(x_holder, rew_holder)
    plt.title("Rewards for Ant v2")
    plt.grid(True)
    plt.savefig('rewards_for_antv2_{}'.format(cur_gen))

    g = plt.figure(2)
    plt.plot(x_holder, rewformeanbuf, label='Forward Reward')
    plt.plot(x_holder, rewctrlmeanbuf, label='CTRL Cost')
    plt.plot(x_holder, rewconmeanbuf, label='Contact Cost')
    plt.plot(x_holder, rewsurmeanbuf, label='Survive Reward')
    plt.title("Reward Breakdown")
    plt.legend()
    plt.grid(True)
    plt.savefig('rewards_breakdown{}'.format(cur_gen))

    # plt.show()

    # --------------------------------------- NEW -----------------------------------------------------
    elem = int(dir_of_gens.index(cur_gen))
    with open("img_rec.txt", "w") as rec:
        if elem == 35:
            new_elem = 0
        else:
            new_elem = elem + 1
        new_gen = dir_of_gens[new_elem]
        rec.write(new_gen)
    # -------------------------------------------------------------------------------------------------

    # ------------------------------ SAVE WEIGHTS (disabled) ------------------------------
    # np.save('val_weights_bias_2_c', val_weights_bias_2_c)
    # save = save.replace(save[0],'..',2)
    # os.chdir(save)
    # name = 'max_reward'
    # completeName = os.path.join(name+".txt")
    # file1 = open(completeName,"w")
    # toFile = str(np.mean(rewbuffer))
    # file1.write(toFile)
    # file1.close()
    # os.chdir('../../../baselines-tf2')

    return pi