Example #1
def demonstrate(network,
                env,
                nsteps,
                mvs,
                load_path,
                ent_coef=0.0,
                vf_coef=0.5,
                max_grad_norm=0.5,
                mpi_rank_weight=1,
                comm=None,
                gamma=0.99,
                lam=0.95):

    policy = build_policy(env, network)

    model = Model(policy=policy,
                  nbatch_act=1,
                  nbatch_train=None,
                  nsteps=nsteps,
                  ent_coef=ent_coef,
                  vf_coef=vf_coef,
                  max_grad_norm=max_grad_norm,
                  comm=comm,
                  mpi_rank_weight=mpi_rank_weight)

    if load_path is not None:
        model.load(load_path)
        print('Model has been successfully loaded from {0}'.format(load_path))
    else:
        print(
            'No model has been loaded. Neural network with random weights is used.'
        )

    # Instantiate the runner object and episode buffer

    runner = Runner(env=env,
                    model=model,
                    nsteps=nsteps,
                    gamma=gamma,
                    lam=lam,
                    mvs=mvs)
    obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run(
        render=True)

    print('Demo completed! Reward: {0}'.format(epinfos[0]['r']))
    print('\nPress Ctrl+C to stop the demo...')
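
A minimal usage sketch for this demo helper, assuming a gym environment wrapped with baselines' DummyVecEnv (the wrapping suggested by the docstrings further down); the environment id, the checkpoint path, and the value of the project-specific `mvs` argument are placeholders, not taken from the original project.

import gym
from baselines.common.vec_env import DummyVecEnv

env = DummyVecEnv([lambda: gym.make('CartPole-v1')])   # illustrative environment
demonstrate(network='mlp',
            env=env,
            nsteps=128,
            mvs=None,              # project-specific argument; placeholder value
            load_path=None)        # or a path to a saved checkpoint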
Example #2
    def train_value(self, env, env_type, nupdates, minibatch_size=64):
        from baselines.ppo2.runner import Runner
        import baselines.ppo2.defaults as defaults

        if env_type == 'mujoco':
            params = defaults.mujoco()
        elif env_type == 'atari':
            params = defaults.atari()
        else:
            assert False

        runner = Runner(env=env,
                        model=self.model,
                        nsteps=params['nsteps'],
                        gamma=params['gamma'],
                        lam=params['lam'])

        for update in tqdm(range(1, nupdates + 1), dynamic_ncols=True):
            frac = 1.0 - (update - 1.0) / nupdates
            cliprangenow = params['cliprange'](frac)
            # Get minibatch
            obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run(
            )  #pylint: disable=E0632

            length = len(obs)
            losses = []
            for _ in range(params['noptepochs']):
                inds = np.random.permutation(length)

                for s in range(0, length, minibatch_size):
                    mbinds = inds[s:s + minibatch_size]

                    with self.graph.as_default():
                        loss, _ = self.sess.run(
                            [self.vf_loss, self.value_update_op],
                            feed_dict={
                                self.inp: obs[mbinds],
                                self.R: returns[mbinds],
                                self.OLDVPRED: values[mbinds],
                                self.CLIPRANGE: cliprangenow
                            })
                        losses.append(loss)
            tqdm.write(('loss: %f') % (np.mean(losses)))
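
For context, the `self.vf_loss` and `self.value_update_op` ops fed above are presumably a PPO-style clipped value loss and its optimizer step, the same form built explicitly in Example #5 below. A minimal TF1 sketch under that assumption; `obs_dim` and the dense value head are stand-ins, not the project's actual graph.

import tensorflow as tf

obs_dim = 11                                        # placeholder observation size
inp = tf.placeholder(tf.float32, [None, obs_dim])   # plays the role of self.inp
R = tf.placeholder(tf.float32, [None])              # returns (self.R)
OLDVPRED = tf.placeholder(tf.float32, [None])       # old value predictions
CLIPRANGE = tf.placeholder(tf.float32, [])

vpred = tf.layers.dense(inp, 1)[:, 0]               # stand-in value head
vpredclipped = OLDVPRED + tf.clip_by_value(vpred - OLDVPRED, -CLIPRANGE, CLIPRANGE)
vf_loss = .5 * tf.reduce_mean(tf.maximum(tf.square(vpred - R),
                                         tf.square(vpredclipped - R)))
value_update_op = tf.train.AdamOptimizer(learning_rate=3e-4).minimize(vf_loss)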
Example #3
def make_eval_runner(difficulty):
    vec_env = SubprocVecEnv(
        [(lambda _i=i: create_single_football_env(_i, difficulty))
         for i in range(FLAGS.num_env, 2 * FLAGS.num_env)],
        context=None)
    print('vec env obs space', vec_env.observation_space)
    return vec_env, Runner(env=vec_env,
                           model=model,
                           nsteps=nsteps,
                           gamma=gamma,
                           lam=lam)
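
Note the `lambda _i=i: ...` pattern when building the SubprocVecEnv: the default argument freezes the loop variable at definition time, so each subprocess constructor gets its own index. Without it, every closure would see the final value of `i`. A small self-contained illustration:

# Late binding: every lambda sees the last value of i.
fns_wrong = [lambda: i for i in range(3)]
print([f() for f in fns_wrong])    # [2, 2, 2]

# Default-argument capture, as used for the SubprocVecEnv constructors above.
fns_right = [lambda _i=i: _i for i in range(3)]
print([f() for f in fns_right])    # [0, 1, 2]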
Example #4
    def make_runner(difficulty):
        def create_single_football_env(iprocess):
            """Creates gfootball environment."""
            env = football_env.create_environment(
                env_name=builder_with_difficulty(difficulty), stacked=('stacked' in FLAGS.state),
                rewards=FLAGS.reward_experiment,
                logdir=logger.get_dir(),
                write_goal_dumps=FLAGS.dump_scores and (iprocess == 0),
                write_full_episode_dumps=FLAGS.dump_full_episodes and (iprocess == 0),
                render=FLAGS.render and (iprocess == 0),
                dump_frequency=50 if FLAGS.render and iprocess == 0 else 0)
            env = monitor.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(),
                                                                        str(iprocess)))
            return env

        env = football_env.create_environment(
            env_name=builder_with_difficulty(difficulty),  # assumed: the original call omitted env_name
            stacked=False, logdir='/tmp/football', write_goal_dumps=True,
            write_full_episode_dumps=False, render=False)

        vec_env = SubprocVecEnv([
            (lambda _i=i: create_single_football_env(_i))
            for i in range(FLAGS.num_envs)
        ], context=None)
        return Runner(env=vec_env, model=model, nsteps=nsteps, gamma=gamma, lam=lam) 
Example #5
class Model(object):
    def __init__(self,
                 *,
                 network,
                 env,
                 lr=3e-4,
                 cliprange=0.2,
                 nsteps=128,
                 nminibatches=4,
                 noptepochs=4,
                 ent_coef=0.0,
                 vf_coef=0.5,
                 max_grad_norm=0.5,
                 gamma=0.99,
                 lam=0.95,
                 mpi_rank_weight=1,
                 comm=None,
                 microbatch_size=None,
                 load_path=None,
                 **network_kwargs):
        """
        Parameters:
        ----------

        network:                          policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
                                          specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
                                          tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
                                          neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
                                          See common/models.py/lstm for more details on using recurrent nets in policies

        env: baselines.common.vec_env.VecEnv     environment. Needs to be vectorized for parallel environment simulation.
                                          The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class.


        lr: float or function             learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the
                                          training and 0 is the end of the training.

        cliprange: float or function      clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training
                                          and 0 is the end of the training

        nsteps: int                       number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
                                          nenv is number of environment copies simulated in parallel)


        nminibatches: int                 number of training minibatches per update. For recurrent policies,
                                          should be smaller than or equal to the number of environments run in parallel.

        noptepochs: int                   number of training epochs per update

        ent_coef: float                   policy entropy coefficient in the optimization objective

        vf_coef: float                    value function loss coefficient in the optimization objective

        gamma: float                      discounting factor

        lam: float                        advantage estimation discounting factor (lambda in the paper)

        log_interval: int                 number of timesteps between logging events

        load_path: str                    path to load the model from

        **network_kwargs:                 keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
                                          For instance, 'mlp' network architecture has arguments num_hidden and num_layers.

        """

        self.sess = sess = get_session()

        if MPI is not None and comm is None:
            comm = MPI.COMM_WORLD

        policy = build_policy(env, network, **network_kwargs)

        self.env = env

        if isinstance(lr, float):
            self.lr = constfn(lr)
        else:
            assert callable(lr)
        if isinstance(cliprange, float):
            self.cliprange = constfn(cliprange)
        else:
            assert callable(cliprange)
        self.nminibatches = nminibatches

        # if eval_env is not None:
        #     eval_runner = Runner(env=eval_env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)

        # Calculate the batch_size
        self.nenvs = self.env.num_envs
        self.nsteps = nsteps
        self.nbatch = self.nenvs * self.nsteps
        self.nbatch_train = self.nbatch // nminibatches
        self.noptepochs = noptepochs

        with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE):
            # CREATE OUR TWO MODELS
            # act_model that is used for sampling
            act_model = policy(self.nenvs, 1, sess)

            # Train model for training
            if microbatch_size is None:
                train_model = policy(self.nbatch_train, nsteps, sess)
            else:
                train_model = policy(microbatch_size, nsteps, sess)

        # CREATE THE PLACEHOLDERS
        self.A = A = train_model.pdtype.sample_placeholder(
            [None])  # action placeholder
        self.ADV = ADV = tf.placeholder(tf.float32, [None])
        self.R = R = tf.placeholder(tf.float32, [None])
        # Keep track of old actor
        self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
        # Keep track of old critic
        self.OLDVPRED = OLDVPRED = tf.placeholder(tf.float32, [None])
        self.LR = LR = tf.placeholder(tf.float32, [])
        # Cliprange
        self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, [])

        neglogpac = train_model.pd.neglogp(A)

        # Calculate the entropy
        # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy.
        entropy = tf.reduce_mean(train_model.pd.entropy())

        # CALCULATE THE LOSS
        # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss

        # Clip the value to reduce variability during Critic training
        # Get the predicted value
        vpred = train_model.vf
        vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED,
                                                   -CLIPRANGE, CLIPRANGE)
        # Unclipped value
        vf_losses1 = tf.square(vpred - R)
        # Clipped value
        vf_losses2 = tf.square(vpredclipped - R)

        vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))

        # Calculate ratio (pi current policy / pi old policy)
        ratio = tf.exp(OLDNEGLOGPAC - neglogpac)

        # Defining Loss = - J is equivalent to max J
        pg_losses = -ADV * ratio

        pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE,
                                             1.0 + CLIPRANGE)

        # Final PG loss
        pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
        approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
        clipfrac = tf.reduce_mean(
            tf.to_float(tf.greater(tf.abs(ratio - 1.0),
                                   CLIPRANGE)))  # fraction of ratios clipped by CLIPRANGE

        # Total loss
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        # UPDATE THE PARAMETERS USING LOSS

        # 1. Get the model parameters
        params = tf.trainable_variables('ppo2_model')
        # 2. Build our trainer
        if comm is not None and comm.Get_size() > 1:
            self.trainer = MpiAdamOptimizer(comm,
                                            learning_rate=LR,
                                            mpi_rank_weight=mpi_rank_weight,
                                            epsilon=1e-5)
        else:
            self.trainer = tf.train.AdamOptimizer(learning_rate=LR,
                                                  epsilon=1e-5)
        # 3. Calculate the gradients
        grads_and_var = self.trainer.compute_gradients(loss, params)
        grads, var = zip(*grads_and_var)

        if max_grad_norm is not None:
            # Clip the gradients (normalize)
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads_and_var = list(zip(grads, var))
        # zip aggregate each gradient with parameters associated
        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

        self.grads = grads
        self.var = var
        self._train_op = self.trainer.apply_gradients(grads_and_var)
        self.loss_names = [
            'policy_loss', 'value_loss', 'policy_entropy', 'approxkl',
            'clipfrac'
        ]
        self.stats_list = [pg_loss, vf_loss, entropy, approxkl, clipfrac]

        self.train_model = train_model
        self.act_model = act_model
        self.step = act_model.step
        self.value = act_model.value
        self.initial_state = act_model.initial_state
        self.def_path_pre = os.path.dirname(
            os.path.abspath(__file__)) + '/tmp/'

        initialize()
        global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                             scope="")
        if MPI is not None:
            sync_from_root(sess, global_variables, comm=comm)  # pylint: disable=E1101

        if load_path is not None:
            self.load_newest(load_path)

        # Instantiate the runner object
        self.runner = Runner(env=self.env,
                             model=self,
                             nsteps=nsteps,
                             gamma=gamma,
                             lam=lam)

    def train(self,
              lr,
              cliprange,
              obs,
              returns,
              masks,
              actions,
              values,
              neglogpacs,
              states=None):
        # Here we calculate advantage A(s,a) = R + yV(s') - V(s)
        # Returns = R + yV(s')
        advs = returns - values

        # Normalize the advantages
        advs = (advs - advs.mean()) / (advs.std() + 1e-8)

        td_map = {
            self.train_model.X: obs,
            self.A: actions,
            self.ADV: advs,
            self.R: returns,
            self.LR: lr,
            self.CLIPRANGE: cliprange,
            self.OLDNEGLOGPAC: neglogpacs,
            self.OLDVPRED: values
        }
        if states is not None:
            td_map[self.train_model.S] = states
            td_map[self.train_model.M] = masks

        return self.sess.run(self.stats_list + [self._train_op], td_map)[:-1]

    def learn(self,
              total_timesteps,
              seed=None,
              log_interval=10,
              save_interval=10):

        set_global_seeds(seed)
        total_timesteps = int(total_timesteps)

        # Determine whether this process is the MPI root
        is_mpi_root = (MPI is None or MPI.COMM_WORLD.Get_rank() == 0)

        epinfobuf = deque(maxlen=100)
        # if eval_env is not None:
        #     eval_epinfobuf = deque(maxlen=100)

        # Start total timer
        tfirststart = time.perf_counter()

        for update in range(1, total_timesteps):
            assert self.nbatch % self.nminibatches == 0
            # Start timer
            tstart = time.perf_counter()
            frac = 1.0 - (update - 1.0) / total_timesteps
            # Calculate the learning rate
            lrnow = self.lr(frac)
            # Calculate the cliprange
            cliprangenow = self.cliprange(frac)

            if update % log_interval == 0 and is_mpi_root:
                logger.info('Stepping environment...')

            # Get minibatch
            obs, returns, masks, actions, values, neglogpacs, states, epinfos = self.runner.run(
            )  # pylint: disable=E0632
            # if eval_env is not None:
            #     eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run()  # pylint: disable=E0632

            if update % log_interval == 0 and is_mpi_root:
                logger.info('Done.')

            epinfobuf.extend(epinfos)
            # if eval_env is not None:
            #     eval_epinfobuf.extend(eval_epinfos)

            # Here what we're going to do is for each minibatch calculate the loss and append it.
            mblossvals = []
            if states is None:  # nonrecurrent version
                # Index of each element of batch_size
                # Create the indices array
                inds = np.arange(self.nbatch)
                for _ in range(self.noptepochs):
                    # Randomize the indexes
                    np.random.shuffle(inds)
                    # 0 to batch_size with batch_train_size step
                    for start in range(0, self.nbatch, self.nbatch_train):
                        end = start + self.nbatch_train
                        mbinds = inds[start:end]
                        slices = (arr[mbinds]
                                  for arr in (obs, returns, masks, actions,
                                              values, neglogpacs))
                        mblossvals.append(
                            self.train(lrnow, cliprangenow, *slices))
            else:  # recurrent version
                assert self.nenvs % self.nminibatches == 0
                envsperbatch = self.nenvs // self.nminibatches
                envinds = np.arange(self.nenvs)
                flatinds = np.arange(self.nenvs * self.nsteps).reshape(
                    self.nenvs, self.nsteps)
                for _ in range(self.noptepochs):
                    np.random.shuffle(envinds)
                    for start in range(0, self.nenvs, envsperbatch):
                        end = start + envsperbatch
                        mbenvinds = envinds[start:end]
                        mbflatinds = flatinds[mbenvinds].ravel()
                        slices = (arr[mbflatinds]
                                  for arr in (obs, returns, masks, actions,
                                              values, neglogpacs))
                        mbstates = states[mbenvinds]
                        mblossvals.append(
                            self.train(lrnow, cliprangenow, *slices, mbstates))

            # Feedforward --> get losses --> update
            lossvals = np.mean(mblossvals, axis=0)
            # End timer
            tnow = time.perf_counter()
            # Calculate the fps (frame per second)
            fps = int(self.nbatch / (tnow - tstart))

            if update % log_interval == 0 or update == 1:
                # Calculates whether the value function is a good predictor of the returns (ev close to 1)
                # or if it's just worse than predicting nothing (ev <= 0)
                ev = explained_variance(values, returns)
                logger.record_tabular("misc/serial_timesteps",
                                      update * self.nsteps)
                logger.record_tabular("misc/nupdates", update)
                logger.record_tabular("misc/total_timesteps",
                                      update * self.nbatch)
                logger.record_tabular("fps", fps)
                logger.record_tabular("misc/explained_variance", float(ev))
                logger.record_tabular(
                    'eprewmean',
                    safe_mean([epinfo['r'] for epinfo in epinfobuf]))
                logger.record_tabular(
                    'eplenmean',
                    safe_mean([epinfo['l'] for epinfo in epinfobuf]))
                # if eval_env is not None:
                #   logger.record_tabular('eval_eprewmean', safe_mean([epinfo['r'] for epinfo in eval_epinfobuf]))
                #   logger.record_tabular('eval_eplenmean', safe_mean([epinfo['l'] for epinfo in eval_epinfobuf]))
                logger.record_tabular('misc/time_elapsed', tnow - tfirststart)
                for (lossval, lossname) in zip(lossvals, self.loss_names):
                    logger.record_tabular('loss/' + lossname, lossval)

                if is_mpi_root:
                    logger.dump_tabular()

            if save_interval and (update % save_interval == 0
                                  or update == 1) and is_mpi_root:
                file_name = time.strftime('Y%YM%mD%d_h%Hm%Ms%S',
                                          time.localtime(time.time()))
                model_save_path = self.def_path_pre + file_name
                self.save(model_save_path)

        return self

    def save(self, save_path=None):
        save_variables(save_path=save_path, sess=self.sess)
        print('save model variables to', save_path)

    def load_newest(self, load_path=None):
        file_list = os.listdir(self.def_path_pre)
        file_list.sort(
            key=lambda x: os.path.getmtime(os.path.join(self.def_path_pre, x)))
        if load_path is None:
            load_path = os.path.join(self.def_path_pre, file_list[-1])
        load_variables(load_path=load_path, sess=self.sess)
        print('load_path: ', load_path)

    def load_index(self, index, load_path=None):
        file_list = os.listdir(self.def_path_pre)
        file_list.sort(
            key=lambda x: os.path.getmtime(os.path.join(self.def_path_pre, x)),
            reverse=True)
        if load_path is None:
            load_path = os.path.join(self.def_path_pre, file_list[index])
        load_variables(load_path=load_path, sess=self.sess)
        print('load_path: ', load_path)
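
For reference, the graph assembled in `__init__` above corresponds to the standard PPO objective. With ratio $r_t(\theta) = \exp(\mathrm{OLDNEGLOGPAC} - \mathrm{neglogpac}) = \pi_\theta(a_t \mid s_t) / \pi_{\theta_\mathrm{old}}(a_t \mid s_t)$ and $\epsilon = \mathrm{CLIPRANGE}$, the minimized loss is

$$
L(\theta) = \mathbb{E}_t\Big[\max\big(-\hat{A}_t\, r_t(\theta),\; -\hat{A}_t\, \mathrm{clip}(r_t(\theta),\, 1-\epsilon,\, 1+\epsilon)\big)\Big]
\;-\; c_\mathrm{ent}\, \mathbb{E}_t\big[\mathcal{H}[\pi_\theta](s_t)\big]
\;+\; \tfrac{c_\mathrm{vf}}{2}\, \mathbb{E}_t\Big[\max\big((V_\theta(s_t)-R_t)^2,\; (V^\mathrm{clip}_t-R_t)^2\big)\Big]
$$

where $V^\mathrm{clip}_t = \mathrm{OLDVPRED} + \mathrm{clip}(V_\theta(s_t) - \mathrm{OLDVPRED},\, -\epsilon,\, \epsilon)$, $\hat{A}_t$ is the normalized advantage fed through `ADV`, $c_\mathrm{ent}$ is `ent_coef`, and $c_\mathrm{vf}$ is `vf_coef` (these are `pg_loss`, `entropy`, and `vf_loss` in the code, respectively).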
Example #6
File: ppo2.py Project: Theling/baselines
def learn(*,
          network,
          env,
          total_timesteps,
          eval_env=None,
          seed=None,
          nsteps=2048,
          ent_coef=0.0,
          lr=3e-4,
          vf_coef=0.5,
          max_grad_norm=0.5,
          gamma=0.99,
          lam=0.95,
          log_interval=10,
          nminibatches=4,
          noptepochs=4,
          cliprange=0.2,
          save_interval=0,
          load_path=None,
          model_fn=None,
          update_fn=None,
          init_fn=None,
          mpi_rank_weight=1,
          comm=None,
          **network_kwargs):
    '''
    Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347)

    Parameters:
    ----------

    network:                          policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
                                      specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
                                      tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
                                      neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
                                      See common/models.py/lstm for more details on using recurrent nets in policies

    env: baselines.common.vec_env.VecEnv     environment. Needs to be vectorized for parallel environment simulation.
                                      The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class.


    nsteps: int                       number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
                                      nenv is number of environment copies simulated in parallel)

    total_timesteps: int              number of timesteps (i.e. number of actions taken in the environment)

    ent_coef: float                   policy entropy coefficient in the optimization objective

    lr: float or function             learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the
                                      training and 0 is the end of the training.

    vf_coef: float                    value function loss coefficient in the optimization objective

    max_grad_norm: float or None      gradient norm clipping coefficient

    gamma: float                      discounting factor

    lam: float                        advantage estimation discounting factor (lambda in the paper)

    log_interval: int                 number of timesteps between logging events

    nminibatches: int                 number of training minibatches per update. For recurrent policies,
                                      should be smaller than or equal to the number of environments run in parallel.

    noptepochs: int                   number of training epochs per update

    cliprange: float or function      clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training
                                      and 0 is the end of the training

    save_interval: int                number of timesteps between saving events

    load_path: str                    path to load the model from

    **network_kwargs:                 keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
                                      For instance, 'mlp' network architecture has arguments num_hidden and num_layers.



    '''

    set_global_seeds(seed)

    if isinstance(lr, float): lr = constfn(lr)
    else: assert callable(lr)
    if isinstance(cliprange, float): cliprange = constfn(cliprange)
    else: assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    policy = build_policy(env, network, **network_kwargs)

    # Get the nb of env
    nenvs = env.num_envs

    # Get state_space and action_space
    ob_space = env.observation_space
    ac_space = env.action_space

    # Calculate the batch_size
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches
    is_mpi_root = (MPI is None or MPI.COMM_WORLD.Get_rank() == 0)

    # Instantiate the model object (that creates act_model and train_model)
    if model_fn is None:
        from baselines.ppo2.model import Model
        model_fn = Model

    model = model_fn(policy=policy,
                     ob_space=ob_space,
                     ac_space=ac_space,
                     nbatch_act=nenvs,
                     nbatch_train=nbatch_train,
                     nsteps=nsteps,
                     ent_coef=ent_coef,
                     vf_coef=vf_coef,
                     max_grad_norm=max_grad_norm,
                     comm=comm,
                     mpi_rank_weight=mpi_rank_weight)

    if load_path is not None:
        model.load(load_path)
    # Instantiate the runner object
    runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)
    if eval_env is not None:
        eval_runner = Runner(env=eval_env,
                             model=model,
                             nsteps=nsteps,
                             gamma=gamma,
                             lam=lam)

    epinfobuf = deque(maxlen=100)
    if eval_env is not None:
        eval_epinfobuf = deque(maxlen=100)

    if init_fn is not None:
        init_fn()

    # Start total timer
    tfirststart = time.perf_counter()

    nupdates = total_timesteps // nbatch
    for update in range(1, nupdates + 1):
        assert nbatch % nminibatches == 0
        # Start timer
        tstart = time.perf_counter()
        frac = 1.0 - (update - 1.0) / nupdates
        # Calculate the learning rate
        lrnow = lr(frac)
        # Calculate the cliprange
        cliprangenow = cliprange(frac)

        if update % log_interval == 0 and is_mpi_root:
            logger.info('Stepping environment...')

        # Get minibatch
        obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run(
        )  #pylint: disable=E0632
        if eval_env is not None:
            eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run(
            )  #pylint: disable=E0632

        if update % log_interval == 0 and is_mpi_root: logger.info('Done.')

        epinfobuf.extend(epinfos)
        if eval_env is not None:
            eval_epinfobuf.extend(eval_epinfos)

        # Here what we're going to do is for each minibatch calculate the loss and append it.
        mblossvals = []
        if states is None:  # nonrecurrent version
            # Index of each element of batch_size
            # Create the indices array
            inds = np.arange(nbatch)
            for _ in range(noptepochs):
                # Randomize the indexes
                np.random.shuffle(inds)
                # 0 to batch_size with batch_train_size step
                for start in range(0, nbatch, nbatch_train):
                    end = start + nbatch_train
                    mbinds = inds[start:end]
                    slices = (arr[mbinds]
                              for arr in (obs, returns, masks, actions, values,
                                          neglogpacs))
                    mblossvals.append(model.train(lrnow, cliprangenow,
                                                  *slices))
        else:  # recurrent version
            assert nenvs % nminibatches == 0
            envsperbatch = nenvs // nminibatches
            envinds = np.arange(nenvs)
            flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
            for _ in range(noptepochs):
                np.random.shuffle(envinds)
                for start in range(0, nenvs, envsperbatch):
                    end = start + envsperbatch
                    mbenvinds = envinds[start:end]
                    mbflatinds = flatinds[mbenvinds].ravel()
                    slices = (arr[mbflatinds]
                              for arr in (obs, returns, masks, actions, values,
                                          neglogpacs))
                    mbstates = states[mbenvinds]
                    mblossvals.append(
                        model.train(lrnow, cliprangenow, *slices, mbstates))

        # Feedforward --> get losses --> update
        lossvals = np.mean(mblossvals, axis=0)
        # End timer
        tnow = time.perf_counter()
        # Calculate the fps (frame per second)
        fps = int(nbatch / (tnow - tstart))

        if update_fn is not None:
            update_fn(update)

        if update % log_interval == 0 or update == 1:
            # Calculates whether the value function is a good predictor of the returns (ev close to 1)
            # or if it's just worse than predicting nothing (ev <= 0)
            ev = explained_variance(values, returns)
            logger.logkv("misc/serial_timesteps", update * nsteps)
            logger.logkv("misc/nupdates", update)
            logger.logkv("misc/total_timesteps", update * nbatch)
            logger.logkv("fps", fps)
            logger.logkv("misc/explained_variance", float(ev))
            logger.logkv('eprewmean',
                         safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('eplenmean',
                         safemean([epinfo['l'] for epinfo in epinfobuf]))
            if eval_env is not None:
                logger.logkv(
                    'eval_eprewmean',
                    safemean([epinfo['r'] for epinfo in eval_epinfobuf]))
                logger.logkv(
                    'eval_eplenmean',
                    safemean([epinfo['l'] for epinfo in eval_epinfobuf]))
            logger.logkv('misc/time_elapsed', tnow - tfirststart)
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                logger.logkv('loss/' + lossname, lossval)

            logger.dumpkvs()
        if save_interval and (update % save_interval == 0 or update
                              == 1) and logger.get_dir() and is_mpi_root:
            checkdir = osp.join(logger.get_dir(), 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = osp.join(checkdir, '%.5i' % update)
            print('Saving to', savepath)
            model.save(savepath)

    return model
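
A minimal, hedged usage sketch for this `learn` entry point, following the docstring's suggestion of wrapping a `gym.make` environment in `baselines.common.vec_env.DummyVecEnv`; the environment id and the hyperparameter values are illustrative, not taken from the project.

import gym
from baselines.common.vec_env import DummyVecEnv

env = DummyVecEnv([lambda: gym.make('CartPole-v1')])
model = learn(network='mlp',
              env=env,
              total_timesteps=100000,
              nsteps=128,
              nminibatches=4,
              log_interval=1)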
Example #7
def learn(*, network, env, total_timesteps, eval_env = None,
            seed=None, nsteps=2048, ent_coef=0.0, lr=3e-4,
            vf_coef=0.5,  max_grad_norm=0.5, gamma=0.99, lam=0.95,
            log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2,
            save_interval=0, load_path=None, model_fn=None, **network_kwargs):
    '''Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347)

    Parameters:
    ----------

    network:                          policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
                                      specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
                                      tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
                                      neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
                                      See common/models.py/lstm for more details on using recurrent nets in policies

    env: baselines.common.vec_env.VecEnv     environment. Needs to be vectorized for parallel environment simulation.
                                      The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class.

    nsteps: int                       number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
                                      nenv is number of environment copies simulated in parallel)
                                      Daniel: should be `T` in the paper. Atari defaults are 128 as in the paper.

    total_timesteps: int              number of timesteps (i.e. number of actions taken in the environment)

    ent_coef: float                   policy entropy coefficient in the optimization objective

    lr: float or function             learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the
                                      training and 0 is the end of the training.

    vf_coef: float                    value function loss coefficient in the optimization objective
                                      Daniel: 0.5 by default but the PPO paper uses 1.0 for Atari games.

    max_grad_norm: float or None      gradient norm clipping coefficient

    gamma: float                      discounting factor

    lam: float                        advantage estimation discounting factor (lambda in the paper)

    log_interval: int                 number of timesteps between logging events

    nminibatches: int                 number of training minibatches per update. For recurrent policies,
                                      should be smaller than or equal to the number of environments run in parallel.

    noptepochs: int                   number of training epochs per update
                                      Daniel: 4 by default but the PPO paper uses 3 (for Atari games), etc.

    cliprange: float or function      clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training
                                      and 0 is the end of the training. Daniel: for `tf.clip_by_value(ratio, 1-cliprange, 1+cliprange)`

    save_interval: int                number of timesteps between saving events

    load_path: str                    path to load the model from

    **network_kwargs:                 keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
                                      For instance, 'mlp' network architecture has arguments num_hidden and num_layers.
    '''
    set_global_seeds(seed)

    if isinstance(lr, float): lr = constfn(lr)
    else: assert callable(lr)
    if isinstance(cliprange, float): cliprange = constfn(cliprange)
    else: assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    # Daniel: hacky within PPO2 solution for limiting action ranges.
    if 'limit_act_range' in network_kwargs:
        limit_act_range = network_kwargs['limit_act_range']
        network_kwargs.pop('limit_act_range')
    else:
        limit_act_range = False

    policy = build_policy(env, network, limit_act_range=limit_act_range, **network_kwargs)

    # Get the nb of env
    nenvs = env.num_envs

    # Get state_space and action_space
    ob_space = env.observation_space
    ac_space = env.action_space

    # Calculate the batch_size
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches

    # Instantiate the model object (that creates act_model and train_model)
    if model_fn is None:
        from baselines.ppo2.model import Model
        model_fn = Model

    model = model_fn(policy=policy, ob_space=ob_space, ac_space=ac_space,
                     nbatch_act=nenvs, nbatch_train=nbatch_train,
                     nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
                     max_grad_norm=max_grad_norm)

    if load_path is not None:
        logger.info("\nInside ppo2, loading model from: {}".format(load_path))
        model.load(load_path)

    # Daniel: debugging and sanity checks
    _variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
    tf_util.display_var_info(_variables)

    # Instantiate the runner object (Daniel: calls `env.reset()` so can take a while for cloth)
    # Also, I'm going to assume that if total_timesteps=0 then we don't waste time creating this.
    if total_timesteps > 0:
        runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)
    if eval_env is not None:
        eval_runner = Runner(env = eval_env, model = model, nsteps = nsteps, gamma = gamma, lam= lam)

    epinfobuf = deque(maxlen=100)
    if eval_env is not None:
        eval_epinfobuf = deque(maxlen=100)

    # Start total timer
    tfirststart = time.perf_counter()
    nupdates = total_timesteps//nbatch

    # Daniel: debugging and sanity checks
    logger.info("\nInside ppo2, before updates (`env.reset()` called before this)")
    logger.info("  nsteps: {}, each env in VecEnv does this many to get minibatch".format(nsteps))
    logger.info("  nbatch: {}, i.e., nsteps * nenv, size of data from (get_minibatch)".format(nbatch))
    logger.info("  nbatch_train: {}, batch size for actual gradient update within epoch".format(nbatch_train))
    logger.info("  noptepochs: {}, number of epochs over collected minibatch for PPO updates".format(noptepochs))
    logger.info("  nupdates: {}, number of (get_minibatch, update_net) cycles".format(nupdates))
    logger.info("  our model_fn class: {}".format(model_fn))
    logger.info("(end of debugging messages)\n")

    for update in range(1, nupdates+1):
        assert nbatch % nminibatches == 0
        # Start timer
        tstart = time.perf_counter()
        frac = 1.0 - (update - 1.0) / nupdates
        # Calculate the learning rate
        lrnow = lr(frac)
        # Calculate the cliprange
        cliprangenow = cliprange(frac)
        # Get minibatch
        obs, returns, masks, actions, values, neglogpacs, states, epinfos, ep_all_infos = runner.run() #pylint: disable=E0632
        if eval_env is not None:
            eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos, eval_ep_all_infos = eval_runner.run() #pylint: disable=E0632

        epinfobuf.extend(epinfos)
        if eval_env is not None:
            eval_epinfobuf.extend(eval_epinfos)

        # Here what we're going to do is for each minibatch calculate the loss and append it.
        mblossvals = []
        if states is None: # nonrecurrent version
            # Index of each element of batch_size
            # Create the indices array
            inds = np.arange(nbatch)
            for _ in range(noptepochs):
                # Randomize the indexes
                np.random.shuffle(inds)
                # 0 to batch_size with batch_train_size step
                for start in range(0, nbatch, nbatch_train):
                    end = start + nbatch_train
                    mbinds = inds[start:end]
                    slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                    mblossvals.append(model.train(lrnow, cliprangenow, *slices))
        else: # recurrent version
            assert nenvs % nminibatches == 0
            envsperbatch = nenvs // nminibatches
            envinds = np.arange(nenvs)
            flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
            for _ in range(noptepochs):
                np.random.shuffle(envinds)
                for start in range(0, nenvs, envsperbatch):
                    end = start + envsperbatch
                    mbenvinds = envinds[start:end]
                    mbflatinds = flatinds[mbenvinds].ravel()
                    slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                    mbstates = states[mbenvinds]
                    mblossvals.append(model.train(lrnow, cliprangenow, *slices, mbstates))

        # Feedforward --> get losses --> update
        lossvals = np.mean(mblossvals, axis=0)
        # End timer
        tnow = time.perf_counter()
        # Calculate the fps (frame per second)
        fps = int(nbatch / (tnow - tstart))
        if update % log_interval == 0 or update == 1:
            # Calculates whether the value function is a good predictor of the returns (ev close to 1)
            # or if it's just worse than predicting nothing (ev <= 0)
            ev = explained_variance(values, returns)
            logger.logkv("serial_timesteps", update*nsteps)
            logger.logkv("nupdates", update)
            logger.logkv("total_timesteps", update*nbatch)
            logger.logkv("fps", fps)
            logger.logkv("explained_variance", float(ev))
            logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf]))
            if eval_env is not None:
                logger.logkv('eval_eprewmean', safemean([epinfo['r'] for epinfo in eval_epinfobuf]) )
                logger.logkv('eval_eplenmean', safemean([epinfo['l'] for epinfo in eval_epinfobuf]) )
            logger.logkv('time_elapsed', tnow - tfirststart)
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                logger.logkv(lossname, lossval)
            if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
                logger.dumpkvs()
        if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir() and (MPI is None or MPI.COMM_WORLD.Get_rank() == 0):
            checkdir = osp.join(logger.get_dir(), 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = osp.join(checkdir, '%.5i'%update)
            logger.info('Saving model checkpoint to: ', savepath)
            model.save(savepath)
            # ------------------------------------------------------------------
            # Daniel: extra stuff for debugging PPO on cloth, actions and infos for each episode.
            logstd_vals = model.act_model.get_logstd_values()
            action_dir  = osp.join(logger.get_dir(), 'actions')
            episode_dir = osp.join(logger.get_dir(), 'ep_all_infos')
            logstd_dir  = osp.join(logger.get_dir(), 'logstd')
            os.makedirs(action_dir, exist_ok=True)
            os.makedirs(episode_dir, exist_ok=True)
            os.makedirs(logstd_dir, exist_ok=True)
            act_savepath = osp.join(action_dir, 'actions_%.5i.pkl'%update)
            epi_savepath = osp.join(episode_dir, 'infos_%.5i.pkl'%update)
            std_savepath = osp.join(logstd_dir, 'logstd_%.5i.pkl'%update)
            with open(act_savepath, 'wb') as fh:
                pickle.dump(actions, fh)
            with open(epi_savepath, 'wb') as fh:
                pickle.dump(ep_all_infos, fh)
            with open(std_savepath, 'wb') as fh:
                pickle.dump(logstd_vals, fh)
            # ------------------------------------------------------------------
    return model
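
The checkpoint branch above also pickles the per-update actions, episode infos, and log-std values for offline inspection. A small sketch of reading one such dump back; the path is illustrative (the files are written under the logger directory's actions/, ep_all_infos/, and logstd/ subfolders).

import os
import pickle

# Illustrative path; substitute the actual logger directory and update index.
with open(os.path.join('actions', 'actions_00001.pkl'), 'rb') as fh:
    actions = pickle.load(fh)
print(type(actions), getattr(actions, 'shape', None))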
Example #8
def learn(*,
          network,
          env,
          total_timesteps,
          eval_env=None,
          seed=None,
          nsteps=2048,
          ent_coef=0.0,
          lr=3e-4,
          vf_coef=0.5,
          max_grad_norm=0.5,
          gamma=0.99,
          lam=0.95,
          log_interval=10,
          nminibatches=4,
          noptepochs=4,
          cliprange=0.2,
          save_interval=0,
          load_path=None,
          model_fn=None,
          **network_kwargs):
    '''
    Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347)

    Parameters:
    ----------

    network:                          policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
                                      specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
                                      tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
                                      neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
                                      See common/models.py/lstm for more details on using recurrent nets in policies

    env: baselines.common.vec_env.VecEnv     environment. Needs to be vectorized for parallel environment simulation.
                                      The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class.


    nsteps: int                       number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
                                      nenv is number of environment copies simulated in parallel)

    total_timesteps: int              number of timesteps (i.e. number of actions taken in the environment)

    ent_coef: float                   policy entropy coefficient in the optimization objective

    lr: float or function             learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the
                                      training and 0 is the end of the training.

    vf_coef: float                    value function loss coefficient in the optimization objective

    max_grad_norm: float or None      gradient norm clipping coefficient

    gamma: float                      discounting factor

    lam: float                        advantage estimation discounting factor (lambda in the paper)

    log_interval: int                 number of timesteps between logging events

    nminibatches: int                 number of training minibatches per update. For recurrent policies,
                                      should be smaller than or equal to the number of environments run in parallel.

    noptepochs: int                   number of training epochs per update

    cliprange: float or function      clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training
                                      and 0 is the end of the training

    save_interval: int                number of timesteps between saving events

    load_path: str                    path to load the model from

    **network_kwargs:                 keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
                                      For instance, 'mlp' network architecture has arguments num_hidden and num_layers.



    '''

    set_global_seeds(seed)

    total_timesteps = int(total_timesteps)

    # Get the nb of env
    nenvs = env.num_envs

    # Get state_space and action_space
    ob_space = env.observation_space
    ac_space = env.action_space

    if isinstance(network, str):
        network_type = network
        policy_network_fn = get_network_builder(network_type)(**network_kwargs)
        policy_network = policy_network_fn(ob_space.shape)

    # Calculate the batch_size
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches

    # Instantiate the model object (that creates act_model and train_model)
    if model_fn is None:
        from baselines.ppo2.model import Model
        model_fn = Model

    model = model_fn(ac_space=ac_space,
                     policy_network=policy_network,
                     ent_coef=ent_coef,
                     vf_coef=vf_coef,
                     max_grad_norm=max_grad_norm,
                     lr=lr)
    if load_path is not None:
        load_path = osp.expanduser(load_path)
        ckpt = tf.train.Checkpoint(model=model)
        manager = tf.train.CheckpointManager(ckpt, load_path, max_to_keep=None)
        ckpt.restore(manager.latest_checkpoint)
        print("Restoring from {}".format(manager.latest_checkpoint))
        print('after restore, all trainable weights {}'.format(
            model.train_model.policy_network.trainable_weights))
        #model.load_weights(load_path)

    # Instantiate the runner object
    runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)
    if eval_env is not None:
        eval_runner = Runner(env=eval_env,
                             model=model,
                             nsteps=nsteps,
                             gamma=gamma,
                             lam=lam)

    epinfobuf = deque(maxlen=100)
    if eval_env is not None:
        eval_epinfobuf = deque(maxlen=100)

    # Start total timer
    tfirststart = time.perf_counter()

    nupdates = total_timesteps // nbatch
    for update in range(1, nupdates + 1):
        assert nbatch % nminibatches == 0
        # Start timer
        tstart = time.perf_counter()
        frac = 1.0 - (update - 1.0) / nupdates
        # Calculate the learning rate
        #lrnow = lr(frac)
        lrnow = lr
        # Get minibatch
        obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run(
        )  #pylint: disable=E0632
        if eval_env is not None:
            eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run(
            )  #pylint: disable=E0632

        epinfobuf.extend(epinfos)
        if eval_env is not None:
            eval_epinfobuf.extend(eval_epinfos)

        # Here what we're going to do is for each minibatch calculate the loss and append it.
        mblossvals = []
        if states is None:  # nonrecurrent version
            # Index of each element of batch_size
            # Create the indices array
            inds = np.arange(nbatch)
            for _ in range(noptepochs):
                # Randomize the indexes
                np.random.shuffle(inds)
                # 0 to batch_size with batch_train_size step
                for start in range(0, nbatch, nbatch_train):
                    end = start + nbatch_train
                    mbinds = inds[start:end]
                    slices = (tf.constant(arr[mbinds])
                              for arr in (obs, returns, masks, actions, values,
                                          neglogpacs))
                    # slice_obs, slice_returns, slice_masks, slice_actions, slice_values, slice_neglogpacs = (arr[mbinds] for arr in  (obs, returns, masks, actions, values, neglogpacs))
                    # slice_advs = slice_returns - slice_values
                    # slice_advs = (slice_advs - slice_advs.mean()) / (slice_advs.std() + 1e-8)
                    # slices = (tf.constant(slice_obs), tf.constant(slice_returns), tf.constant(slice_masks),
                    #     tf.constant(slice_advs), tf.constant(slice_actions), tf.constant(slice_values), tf.constant(slice_neglogpacs))
                    # print('slice actions {}'.format(slice_actions.dtype))
                    # print('-------------------------------------------')
                    # print('inds {}'.format(inds))
                    # print('slice obs {}'.format(slice_obs))
                    # print('slice returns {}'.format(slice_returns))
                    # print('slice masks {}'.format(slice_masks))
                    # print('slice actions {}'.format(slice_actions))
                    # print('slice values {}'.format(slice_values))
                    # print('slice neglogpacs {}'.format(slice_neglogpacs))
                    # print('slice advs {}'.format(slice_advs))
                    pg_loss, vf_loss, entropy, approxkl, clipfrac, vpred, vpredclipped = model.train(
                        lrnow, cliprange, *slices)
                    # pg_loss, vf_loss, entropy, approxkl, clipfrac, vpred, vpredclipped = model.train(
                    #     cliprange, obs=slice_obs, returns=slice_returns, masks=slice_masks, advs=slice_advs,
                    #     actions=slice_actions, values=slice_values, neglogpac_old=slice_neglogpacs)
                    # print('pg_loss {}'.format(pg_loss))
                    # print('vf_loss {}'.format(vf_loss))
                    # print('entropy {}'.format(entropy))
                    # print('approxkl {}'.format(approxkl))
                    # print('clipfrac {}'.format(clipfrac))
                    # print('vpred {}'.format(vpred))
                    # print('vpredclipped {}'.format(vpredclipped))
                    # print('pg_loss1 {}'.format(pg_loss1))
                    # print('pg_loss2 {}'.format(pg_loss2))
                    # train_model = model.train_model
                    # params = train_model.policy_network.trainable_weights + train_model.value_fc.trainable_weights + train_model.pdtype.matching_fc.trainable_weights
                    # for param in params:
                    #     print('param {} is {}'.format(param.name, param.numpy()))
                    # print('-------------------------------------------')
                    mblossvals.append([
                        pg_loss.numpy(),
                        vf_loss.numpy(),
                        entropy.numpy(),
                        approxkl.numpy(),
                        clipfrac.numpy()
                    ])
                    # mblossvals.append([output for output.numpy() in model.train(cliprange, *slices)])
        else:  # recurrent version
            raise ValueError('Recurrent policies are not supported yet')

        # Feedforward --> get losses --> update
        lossvals = np.mean(mblossvals, axis=0)
        # End timer
        tnow = time.perf_counter()
        # Calculate the fps (frame per second)
        fps = int(nbatch / (tnow - tstart))
        if update % log_interval == 0 or update == 1:
            # Calculates whether the value function is a good predictor of the returns (ev close to 1)
            # or if it's just worse than predicting nothing (ev <= 0)
            ev = explained_variance(values, returns)
            logger.logkv("serial_timesteps", update * nsteps)
            logger.logkv("nupdates", update)
            logger.logkv("total_timesteps", update * nbatch)
            logger.logkv("fps", fps)
            logger.logkv("explained_variance", float(ev))
            logger.logkv('eprewmean',
                         safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('eplenmean',
                         safemean([epinfo['l'] for epinfo in epinfobuf]))
            if eval_env is not None:
                logger.logkv(
                    'eval_eprewmean',
                    safemean([epinfo['r'] for epinfo in eval_epinfobuf]))
                logger.logkv(
                    'eval_eplenmean',
                    safemean([epinfo['l'] for epinfo in eval_epinfobuf]))
            logger.logkv('time_elapsed', tnow - tfirststart)
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                logger.logkv(lossname, lossval)
            if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
                logger.dumpkvs()
    return model
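The explained_variance value logged in the loop above measures how well the predicted values track the empirical returns. A minimal NumPy sketch of that metric, following the usual baselines definition (the function name here is illustrative, not part of this example):

import numpy as np

def explained_variance_sketch(ypred, y):
    # 1 - Var(y - ypred) / Var(y): close to 1 means the value function tracks the
    # returns well, <= 0 means it is no better than predicting the mean return.
    assert y.ndim == 1 and ypred.ndim == 1
    vary = np.var(y)
    return np.nan if vary == 0 else 1.0 - np.var(y - ypred) / vary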
예제 #9
0
def learn(*,
          network,
          env,
          total_timesteps,
          eval_env=None,
          seed=None,
          nsteps=2048,
          ent_coef=0.0,
          lr=3e-4,
          vf_coef=0.5,
          max_grad_norm=0.5,
          gamma=0.99,
          lam=0.95,
          log_interval=10,
          nminibatches=1,
          noptepochs=4,
          cliprange=0.2,
          save_interval=1000,
          load_path=None,
          model_fn=None,
          **network_kwargs):
    '''
	Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347)

	Parameters:
	----------

	network:                          policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
									  specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
									  tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
									  neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
									  See common/models.py/lstm for more details on using recurrent nets in policies

	env: baselines.common.vec_env.VecEnv     environment. Needs to be vectorized for parallel environment simulation.
									  The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class.


	nsteps: int                       number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
									  nenv is number of environment copies simulated in parallel)

	total_timesteps: int              number of timesteps (i.e. number of actions taken in the environment)

	ent_coef: float                   policy entropy coefficient in the optimization objective

	lr: float or function             learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the
									  training and 0 is the end of the training.

	vf_coef: float                    value function loss coefficient in the optimization objective

	max_grad_norm: float or None      gradient norm clipping coefficient

	gamma: float                      discounting factor

	lam: float                        advantage estimation discounting factor (lambda in the paper)

	log_interval: int                 number of updates between logging events

	nminibatches: int                 number of training minibatches per update. For recurrent policies,
									  should be smaller or equal than number of environments run in parallel.

	noptepochs: int                   number of training epochs per update

	cliprange: float or function      clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training
									  and 0 is the end of the training

	save_interval: int                number of updates between saving events

	load_path: str                    path to load the model from

	**network_kwargs:                 keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
									  For instance, 'mlp' network architecture has arguments num_hidden and num_layers.



	'''

    nsteps = env.args.nsteps

    set_global_seeds(seed)

    if isinstance(lr, float):
        lr = constfn(lr)
    else:
        assert callable(lr)
    if isinstance(cliprange, float):
        cliprange = constfn(cliprange)
    else:
        assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    # Get state_space and action_space
    ob_space = env.observation_space
    ac_space = env.action_space

    # Get the nb of env
    nenvs = env.num_envs

    # Calculate the batch_size
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches

    models = []
    policy = []

    for agent_i in range(env.spec):
        policy.append(build_policy(env, network, **network_kwargs))

        # Instantiate the model object (that creates act_model and train_model)
        if model_fn is None:
            from baselines.ppo2.model import Model
            model_fn = Model

        model = model_fn(policy=policy[agent_i],
                         ob_space=ob_space,
                         ac_space=ac_space,
                         nbatch_act=nenvs,
                         nbatch_train=nbatch_train,
                         nsteps=nsteps,
                         ent_coef=ent_coef,
                         vf_coef=vf_coef,
                         max_grad_norm=max_grad_norm,
                         agent_index=agent_i)

        if load_path is not None:
            model.load(load_path + ('checkpoints-%i/' % agent_i) +
                       env.args.s_load_num)
            print('successfully load agent-%d' % agent_i)
        # Instantiate the runner object

        models.append(model)

    # ###
    runner = Runner(env=env,
                    model_n=models,
                    nsteps=nsteps,
                    gamma=gamma,
                    lam=lam)
    if eval_env is not None:
        eval_runner = Runner(env=eval_env,
                             model_n=models,
                             nsteps=nsteps,
                             gamma=gamma,
                             lam=lam)

    epinfobuf = deque(maxlen=100)
    all_rewards_buf_0 = deque(maxlen=100)
    all_rewards_buf_1 = deque(maxlen=100)
    d_rewards_buf_0 = deque(maxlen=100)
    ce_rewards_buf_0 = deque(maxlen=100)
    c_rewards_buf_show_0 = deque(maxlen=100)
    c_rewards_buf_t_0 = deque(maxlen=100)
    c_rewards_buf_r_0 = deque(maxlen=100)
    c_rewards_buf_v_0 = deque(maxlen=100)
    ext_rewards_buf_v_0 = deque(maxlen=100)
    int_rewards_buf_v_0 = deque(maxlen=100)
    c_rewards_buf_tv_0 = deque(maxlen=100)
    ext_rewards_buf_tv_0 = deque(maxlen=100)
    int_rewards_buf_tv_0 = deque(maxlen=100)
    c_rewards_buf_all_0 = deque(maxlen=100)
    p_rewards_buf = deque(maxlen=100)
    d_rewards_buf_1 = deque(maxlen=100)
    ce_rewards_buf_1 = deque(maxlen=100)
    c_rewards_buf_show_1 = deque(maxlen=100)
    c_rewards_buf_t_1 = deque(maxlen=100)
    c_rewards_buf_r_1 = deque(maxlen=100)
    c_rewards_buf_v_1 = deque(maxlen=100)
    ext_rewards_buf_v_1 = deque(maxlen=100)
    int_rewards_buf_v_1 = deque(maxlen=100)
    c_rewards_buf_tv_1 = deque(maxlen=100)
    ext_rewards_buf_tv_1 = deque(maxlen=100)
    int_rewards_buf_tv_1 = deque(maxlen=100)
    c_rewards_buf_all_1 = deque(maxlen=100)
    if eval_env is not None:
        eval_epinfobuf = deque(maxlen=100)

    # Start total timer
    tfirststart = time.perf_counter()

    nupdates = total_timesteps // nbatch
    print(total_timesteps)
    print(nupdates)
    for update in range(1, nupdates + 1):
        assert nbatch % nminibatches == 0
        # Start timer
        tstart = time.perf_counter()
        frac = 1.0 - (update - 1.0) / nupdates
        # Calculate the learning rate
        lrnow = lr(frac)
        # Calculate the cliprange
        cliprangenow = cliprange(frac)

        # Get minibatch
        obs_n, returns_n, masks_n, actions_n, values_n, neglogpacs_n, e_returns_n, e_values_n, e_neglogpacs_n, \
        c_returns_n, c_values_n, c_neglogpacs_n, \
        states_n, e_states_n, c_states_n, epinfos, all_rewards, d_rewards, c_rewards_show, c_rewards_t, c_rewards_r, \
        c_rewards_v, c_rewards_tv, c_rewards_all, p_rewards, ce_rewards, \
        ext_rewards_tv_n, int_rewards_tv_n, ext_rewards_v_n, int_rewards_v_n = runner.run()  # pylint: disable=E0632

        if eval_env is not None:
            eval_obs_n, eval_returns_n, eval_masks_n, eval_actions_n, eval_values_n, eval_neglogpacs_n, eval_states_n, \
            eval_epinfos = eval_runner.run()  # pylint: disable=E0632

        num_env = p_rewards.shape[1]
        epinfobuf.append(1. * np.sum(epinfos) / (np.sum(masks_n[0]) + num_env))
        all_rewards_buf_0.append(1. * np.sum(all_rewards[:, 0]) /
                                 (np.sum(masks_n[0]) + num_env))
        d_rewards_buf_0.append(1. * np.sum(d_rewards[:, 0]) /
                               (np.sum(masks_n[0]) + num_env))
        ce_rewards_buf_0.append(1. * np.sum(ce_rewards[:, 0]) /
                                (np.sum(masks_n[0]) + num_env))
        c_rewards_buf_show_0.append(1. * np.sum(c_rewards_show[:, 0]) /
                                    (np.sum(masks_n[0]) + num_env))
        c_rewards_buf_r_0.append(1. * np.sum(c_rewards_r[:, 0]) /
                                 (np.sum(masks_n[0]) + num_env))
        c_rewards_buf_v_0.append(1. * np.sum(c_rewards_v[:, 0]) /
                                 (np.sum(masks_n[0]) + num_env))
        ext_rewards_buf_v_0.append(1. * np.sum(ext_rewards_v_n[:, 0]) /
                                   (np.sum(masks_n[0]) + num_env))
        int_rewards_buf_v_0.append(1. * np.sum(int_rewards_v_n[:, 0]) /
                                   (np.sum(masks_n[0]) + num_env))
        c_rewards_buf_t_0.append(1. * np.sum(c_rewards_t[:, 0]) /
                                 (np.sum(masks_n[0]) + num_env))
        c_rewards_buf_tv_0.append(1. * np.sum(c_rewards_tv[:, 0]) /
                                  (np.sum(masks_n[0]) + num_env))
        ext_rewards_buf_tv_0.append(1. * np.sum(ext_rewards_tv_n[:, 0]) /
                                    (np.sum(masks_n[0]) + num_env))
        int_rewards_buf_tv_0.append(1. * np.sum(int_rewards_tv_n[:, 0]) /
                                    (np.sum(masks_n[0]) + num_env))
        c_rewards_buf_all_0.append(1. * np.sum(c_rewards_all[:, 0]) /
                                   (np.sum(masks_n[0]) + num_env))
        all_rewards_buf_1.append(1. * np.sum(all_rewards[:, 1]) /
                                 (np.sum(masks_n[0]) + num_env))
        d_rewards_buf_1.append(1. * np.sum(d_rewards[:, 1]) /
                               (np.sum(masks_n[0]) + num_env))
        ce_rewards_buf_1.append(1. * np.sum(ce_rewards[:, 1]) /
                                (np.sum(masks_n[0]) + num_env))
        c_rewards_buf_show_1.append(1. * np.sum(c_rewards_show[:, 1]) /
                                    (np.sum(masks_n[0]) + num_env))
        c_rewards_buf_r_1.append(1. * np.sum(c_rewards_r[:, 1]) /
                                 (np.sum(masks_n[0]) + num_env))
        c_rewards_buf_v_1.append(1. * np.sum(c_rewards_v[:, 1]) /
                                 (np.sum(masks_n[0]) + num_env))
        ext_rewards_buf_v_1.append(1. * np.sum(ext_rewards_v_n[:, 1]) /
                                   (np.sum(masks_n[0]) + num_env))
        int_rewards_buf_v_1.append(1. * np.sum(int_rewards_v_n[:, 1]) /
                                   (np.sum(masks_n[0]) + num_env))
        c_rewards_buf_t_1.append(1. * np.sum(c_rewards_t[:, 1]) /
                                 (np.sum(masks_n[0]) + num_env))
        c_rewards_buf_tv_1.append(1. * np.sum(c_rewards_tv[:, 1]) /
                                  (np.sum(masks_n[0]) + num_env))
        ext_rewards_buf_tv_1.append(1. * np.sum(ext_rewards_tv_n[:, 1]) /
                                    (np.sum(masks_n[0]) + num_env))
        int_rewards_buf_tv_1.append(1. * np.sum(int_rewards_tv_n[:, 1]) /
                                    (np.sum(masks_n[0]) + num_env))
        c_rewards_buf_all_1.append(1. * np.sum(c_rewards_all[:, 1]) /
                                   (np.sum(masks_n[0]) + num_env))

        p_rewards_buf.append(-1. * np.sum(p_rewards) /
                             (np.sum(masks_n[0]) + num_env))

        # Here what we're going to do is for each minibatch calculate the loss and append it.
        mblossvals_n = [[] for _ in range(env.spec)]
        mb_e_lossvals_n = [[] for _ in range(env.spec)]
        mb_c_lossvals_n = [[] for _ in range(env.spec)]

        if states_n[0] is None:  # nonrecurrent version
            # Index of each element of batch_size
            # Create the indices array
            for agent_i in range(env.spec):
                inds = np.arange(nbatch)
                for _ in range(noptepochs):
                    # Randomize the indexes
                    np.random.shuffle(inds)
                    # 0 to batch_size with batch_train_size step
                    for start in range(0, nbatch, nbatch_train):
                        end = start + nbatch_train
                        mbinds = inds[start:end]

                        # ########### TRAIN MODEL
                        slices = (arr[mbinds]
                                  for arr in (obs_n[agent_i],
                                              returns_n[agent_i],
                                              masks_n[agent_i],
                                              actions_n[agent_i],
                                              values_n[agent_i],
                                              neglogpacs_n[agent_i]))
                        mblossvals_n[agent_i].append(models[agent_i].train(
                            lrnow, cliprangenow, *slices))

                        # ########## TRAIN E_MODEL
                        e_slices = (arr[mbinds]
                                    for arr in (obs_n[agent_i],
                                                e_returns_n[agent_i],
                                                masks_n[agent_i],
                                                actions_n[agent_i],
                                                e_values_n[agent_i],
                                                e_neglogpacs_n[agent_i]))
                        if env.args.s_alg_name in ('noisy', 'cen', 'dec'):
                            mb_e_lossvals_n[agent_i].append(0)
                        else:
                            mb_e_lossvals_n[agent_i].append(
                                models[agent_i].e_train(
                                    lrnow, cliprangenow, *e_slices))

                        # ########## TRAIN C_MODEL
                        c_slices = (arr[mbinds]
                                    for arr in (obs_n[agent_i],
                                                c_returns_n[agent_i],
                                                masks_n[agent_i],
                                                actions_n[agent_i],
                                                c_values_n[agent_i],
                                                c_neglogpacs_n[agent_i]))
                        if env.args.s_alg_name in ('noisy', 'cen', 'dec'):
                            mb_c_lossvals_n[agent_i].append(0)
                        else:
                            mb_c_lossvals_n[agent_i].append(
                                models[agent_i].c_train(
                                    lrnow, cliprangenow, *c_slices))

        else:  # recurrent version
            assert nenvs % nminibatches == 0
            envsperbatch = nenvs // nminibatches

            for agent_i in range(env.spec):
                envinds = np.arange(nenvs)
                flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
                for _ in range(noptepochs):
                    np.random.shuffle(envinds)
                    for start in range(0, nenvs, envsperbatch):
                        end = start + envsperbatch
                        mbenvinds = envinds[start:end]
                        mbflatinds = flatinds[mbenvinds].ravel()
                        slices = (arr[mbflatinds]
                                  for arr in (obs_n[agent_i],
                                              returns_n[agent_i],
                                              masks_n[agent_i],
                                              actions_n[agent_i],
                                              values_n[agent_i],
                                              neglogpacs_n[agent_i]))
                        mbstates = states_n[agent_i][mbenvinds]
                        mblossvals_n[agent_i].append(models[agent_i].train(
                            lrnow, cliprangenow, *slices, mbstates))

                        e_slices = (arr[mbflatinds]
                                    for arr in (obs_n[agent_i],
                                                e_returns_n[agent_i],
                                                masks_n[agent_i],
                                                actions_n[agent_i],
                                                e_values_n[agent_i],
                                                e_neglogpacs_n[agent_i]))
                        e_mbstates = e_states_n[agent_i][mbenvinds]
                        if env.args.s_alg_name in ('noisy', 'cen', 'dec'):
                            mb_e_lossvals_n[agent_i].append(0)
                        else:
                            mb_e_lossvals_n[agent_i].append(
                                models[agent_i].e_train(
                                    lrnow, cliprangenow, *e_slices,
                                    e_mbstates))

                        c_slices = (arr[mbflatinds]
                                    for arr in (obs_n[agent_i],
                                                c_returns_n[agent_i],
                                                masks_n[agent_i],
                                                actions_n[agent_i],
                                                c_values_n[agent_i],
                                                c_neglogpacs_n[agent_i]))
                        c_mbstates = c_states_n[agent_i][mbenvinds]
                        if env.args.s_alg_name in ('noisy', 'cen', 'dec'):
                            mb_c_lossvals_n[agent_i].append(0)
                        else:
                            mb_c_lossvals_n[agent_i].append(
                                models[agent_i].c_train(
                                    lrnow, cliprangenow, *c_slices,
                                    c_mbstates))

        # Feedforward --> get losses --> update
        lossvals_n = [
            np.mean(mblossvals_n[agent_i], axis=0)
            for agent_i in range(env.spec)
        ]
        e_lossvals_n = [
            np.mean(mb_e_lossvals_n[agent_i], axis=0)
            for agent_i in range(env.spec)
        ]
        c_lossvals_n = [
            np.mean(mb_c_lossvals_n[agent_i], axis=0)
            for agent_i in range(env.spec)
        ]

        # End timer
        tnow = time.perf_counter()
        # Calculate the fps (frame per second)
        fps = int(nbatch / (tnow - tstart))

        if update % log_interval == 0 or update == 1:
            # Check whether the value function is a good predictor of the returns (ev close to 1)
            # or no better than predicting nothing (ev <= 0)
            logger.logkv("serial_timesteps", update * nsteps)
            logger.logkv("nupdates", update)
            logger.logkv("total_timesteps", update * nbatch)
            logger.logkv("fps", fps)
            logger.logkv('time_elapsed', tnow - tfirststart)
            logger.logkv('eprewmean',
                         safemean([epinfo for epinfo in epinfobuf]))
            logger.logkv(
                'ep_all_rewmean_0',
                safemean([all_rewards for all_rewards in all_rewards_buf_0]))
            logger.logkv(
                'ep_all_rewmean_1',
                safemean([all_rewards for all_rewards in all_rewards_buf_1]))
            logger.logkv(
                'ep_dec_rewmean_0',
                safemean([all_rewards for all_rewards in d_rewards_buf_0]))
            logger.logkv(
                'ep_cen_rewmean_0',
                safemean([all_rewards for all_rewards in ce_rewards_buf_0]))
            logger.logkv(
                'ep_coor_rewmean_show_0',
                safemean([all_rewards
                          for all_rewards in c_rewards_buf_show_0]))
            logger.logkv(
                'ep_coor_rewmean_r_0',
                safemean([all_rewards for all_rewards in c_rewards_buf_r_0]))
            logger.logkv(
                'ep_coor_rewmean_v_0',
                safemean([all_rewards for all_rewards in c_rewards_buf_v_0]))
            logger.logkv(
                'ep_coor_rewmean_v_ext_0',
                safemean([all_rewards for all_rewards in ext_rewards_buf_v_0]))
            logger.logkv(
                'ep_coor_rewmean_v_int_0',
                safemean([all_rewards for all_rewards in int_rewards_buf_v_0]))
            logger.logkv(
                'ep_coor_rewmean_t_0',
                safemean([all_rewards for all_rewards in c_rewards_buf_t_0]))
            logger.logkv(
                'ep_coor_rewmean_tv_0',
                safemean([all_rewards for all_rewards in c_rewards_buf_tv_0]))
            logger.logkv(
                'ep_coor_rewmean_tv_ext_0',
                safemean([all_rewards
                          for all_rewards in ext_rewards_buf_tv_0]))
            logger.logkv(
                'ep_coor_rewmean_tv_int_0',
                safemean([all_rewards
                          for all_rewards in int_rewards_buf_tv_0]))
            logger.logkv(
                'ep_coor_rewmean_all_0',
                safemean([all_rewards for all_rewards in c_rewards_buf_all_0]))
            logger.logkv(
                'ep_dec_rewmean_1',
                safemean([all_rewards for all_rewards in d_rewards_buf_1]))
            logger.logkv(
                'ep_cen_rewmean_1',
                safemean([all_rewards for all_rewards in ce_rewards_buf_1]))
            logger.logkv(
                'ep_coor_rewmean_show_1',
                safemean([all_rewards
                          for all_rewards in c_rewards_buf_show_1]))
            logger.logkv(
                'ep_coor_rewmean_r_1',
                safemean([all_rewards for all_rewards in c_rewards_buf_r_1]))
            logger.logkv(
                'ep_coor_rewmean_v_1',
                safemean([all_rewards for all_rewards in c_rewards_buf_v_1]))
            logger.logkv(
                'ep_coor_rewmean_v_ext_1',
                safemean([all_rewards for all_rewards in ext_rewards_buf_v_1]))
            logger.logkv(
                'ep_coor_rewmean_v_int_1',
                safemean([all_rewards for all_rewards in int_rewards_buf_v_1]))
            logger.logkv(
                'ep_coor_rewmean_t_1',
                safemean([all_rewards for all_rewards in c_rewards_buf_t_1]))
            logger.logkv(
                'ep_coor_rewmean_tv_1',
                safemean([all_rewards for all_rewards in c_rewards_buf_tv_1]))
            logger.logkv(
                'ep_coor_rewmean_tv_ext_1',
                safemean([all_rewards
                          for all_rewards in ext_rewards_buf_tv_1]))
            logger.logkv(
                'ep_coor_rewmean_tv_int_1',
                safemean([all_rewards
                          for all_rewards in int_rewards_buf_tv_1]))
            logger.logkv(
                'ep_coor_rewmean_all_1',
                safemean([all_rewards for all_rewards in c_rewards_buf_all_1]))
            logger.logkv(
                'ep_penalty_rewmean',
                safemean([all_rewards for all_rewards in p_rewards_buf]))

            # logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf]))

            for agent_i in range(env.spec):
                ev = explained_variance(values_n[agent_i], returns_n[agent_i])
                logger.logkv("explained_variance-%i" % agent_i, float(ev))

                if eval_env is not None:
                    logger.logkv(
                        'eval_eprewmean',
                        safemean([epinfo['r'] for epinfo in eval_epinfobuf]))
                    logger.logkv(
                        'eval_eplenmean',
                        safemean([epinfo['l'] for epinfo in eval_epinfobuf]))
                for (lossval, lossname) in zip(lossvals_n[agent_i],
                                               models[agent_i].loss_names):
                    logger.logkv(lossname + ('-%i' % agent_i), lossval)

                if env.args.s_alg_name in ('noisy', 'cen', 'dec'):
                    pass
                else:
                    for (lossval, lossname) in zip(e_lossvals_n[agent_i],
                                                   models[agent_i].loss_names):
                        logger.logkv(lossname + ('-e-%i' % agent_i), lossval)
                    for (lossval, lossname) in zip(c_lossvals_n[agent_i],
                                                   models[agent_i].loss_names):
                        logger.logkv(lossname + ('-c-%i' % agent_i), lossval)

                if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
                    logger.dumpkvs()
        if save_interval and (update % save_interval == 0
                              or update == 1) and logger.get_dir() and (
                                  MPI is None
                                  or MPI.COMM_WORLD.Get_rank() == 0):
            for i_m, m in enumerate(models):
                checkdir = osp.join(logger.get_dir(), 'checkpoints-%i' % i_m)
                os.makedirs(checkdir, exist_ok=True)
                savepath = osp.join(checkdir, '%.5i' % update)
                print('Saving to', savepath)
                m.save(savepath)
    return models
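The reward bookkeeping in this example repeats the same append-and-normalize pattern for more than twenty deques. A hedged refactoring sketch (illustrative only; the key names and array layout are assumptions, not taken from the original code) that keeps one rolling buffer per reward stream and agent:

from collections import deque
import numpy as np

REWARD_KEYS = ['all', 'd', 'ce', 'c_show', 'c_t', 'c_r', 'c_v',
               'ext_v', 'int_v', 'c_tv', 'ext_tv', 'int_tv', 'c_all']

# one rolling buffer per (reward stream, agent index)
reward_bufs = {(key, i): deque(maxlen=100) for key in REWARD_KEYS for i in range(2)}

def append_rewards(reward_bufs, rewards_by_key, masks0, num_env):
    # rewards_by_key maps a key from REWARD_KEYS to an array of shape
    # (nsteps * nenvs, n_agents); the normalization mirrors the example above.
    denom = np.sum(masks0) + num_env
    for key, arr in rewards_by_key.items():
        for agent_i in range(arr.shape[1]):
            reward_bufs[(key, agent_i)].append(np.sum(arr[:, agent_i]) / denom)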
예제 #10
0
def test(network,
         test_env: VecEnv,
         n_steps: int = 2048,
         ent_coef: float = 0.,
         vf_coef: float = .5,
         max_grad_norm: float = .5,
         gamma: float = .99,
         lmbda: float = .95,
         n_minibatches: int = 1,
         load_path: str = None,
         model_fn=None,
         mpi_rank_weight: int = 1,
         comm=None,
         **network_kwargs):

    # Load models
    policy = build_policy(test_env, network, **network_kwargs)

    # Get the nb of env
    nenvs = test_env.num_envs

    # Get state_space and action_space
    ob_space = test_env.observation_space
    ac_space = test_env.action_space

    # Calculate the batch_size
    nbatch = nenvs * n_steps
    nbatch_train = nbatch // n_minibatches

    # Instantiate the model object (that creates act_model and train_model)
    if model_fn is None:
        from baselines.ppo2.model import Model
        #model_fn = Model
        model_fn = ADRModel

    model = model_fn(policy=policy,
                     ob_space=ob_space,
                     ac_space=ac_space,
                     nbatch_act=nenvs,
                     nbatch_train=nbatch_train,
                     nsteps=n_steps,
                     ent_coef=ent_coef,
                     vf_coef=vf_coef,
                     max_grad_norm=max_grad_norm,
                     comm=comm,
                     mpi_rank_weight=mpi_rank_weight)

    model.load(load_path)
    runner = Runner(env=test_env,
                    model=model,
                    nsteps=n_steps,
                    gamma=gamma,
                    lam=lmbda)
    epinfobuf = deque(maxlen=100)

    obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run(
    )
    epinfobuf.extend(epinfos)

    #get reward stats
    eprewmean = safemean([epinfo['r'] for epinfo in epinfobuf])
    eprewstd = safestd([epinfo['r'] for epinfo in epinfobuf])
    logger.logkv('eprewmean', eprewmean)
    logger.logkv('eprewstd', eprewstd)
    return eprewmean, eprewstd
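safemean and safestd above guard against an empty episode buffer; a minimal sketch of what such helpers typically look like (safestd is an assumption by analogy with safemean):

import numpy as np

def safemean(xs):
    # return nan instead of raising or warning on an empty buffer
    return np.nan if len(xs) == 0 else np.mean(xs)

def safestd(xs):
    return np.nan if len(xs) == 0 else np.std(xs)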
예제 #11
0
def main():
    num_envs = 64
    learning_rate = 5e-4
    ent_coef = .01
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    total_timesteps = 1_000_000  ## now this counts steps in testing runs
    use_vf_clipping = True

    ## From random_ppo.py
    max_grad_norm = 0.5
    vf_coef = 0.5
    L2_WEIGHT = 10e-4
    FM_COEFF = 0.002
    REAL_THRES = 0.1

    parser = argparse.ArgumentParser(
        description='Process procgen testing arguments.')
    parser.add_argument('--env_name', type=str, default='fruitbot')
    parser.add_argument(
        '--distribution_mode',
        type=str,
        default='easy',
        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=1000)
    ## default start_level set to 1000 to test on unseen levels!
    parser.add_argument('--start_level', type=int, default=1000)
    parser.add_argument('--run_id', '-id', type=int, default=0)
    parser.add_argument('--load_id', type=int, default=0)
    parser.add_argument('--nrollouts', '-nroll', type=int, default=0)

    args = parser.parse_args()
    args.total_timesteps = total_timesteps
    if args.nrollouts:
        total_timesteps = int(args.nrollouts * num_envs * nsteps)
    run_ID = 'run_' + str(args.run_id).zfill(2)
    run_ID += '_load{}'.format(args.load_id)

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    mpi_rank_weight = 0
    num_levels = args.num_levels

    log_comm = comm.Split(0, 0)
    format_strs = ['csv', 'stdout', 'log'] if log_comm.Get_rank() == 0 else []

    logpath = join(LOG_DIR, run_ID)
    if not os.path.exists(logpath):
        os.system("mkdir -p %s" % logpath)

    fpath = join(logpath, 'args_{}.json'.format(run_ID))
    with open(fpath, 'w') as fh:
        json.dump(vars(args), fh, indent=4, sort_keys=True)
    print("\nSaved args at:\n\t{}\n".format(fpath))

    logger.configure(dir=logpath, format_strs=format_strs)

    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=num_envs,
                      env_name=args.env_name,
                      num_levels=num_levels,
                      start_level=args.start_level,
                      distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")

    venv = VecMonitor(
        venv=venv,
        filename=None,
        keep_buf=100,
    )
    venv = VecNormalize(venv=venv, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.compat.v1.ConfigProto()
    config.gpu_options.allow_growth = True  #pylint: disable=E1101
    sess = tf.compat.v1.Session(config=config)
    sess.__enter__()

    logger.info("Testing")
    ## Modified based on random_ppo.learn
    env = venv
    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches
    nrollouts = total_timesteps // nbatch

    network = lambda x: build_impala_cnn(x, depths=[16, 32, 32], emb_size=256)
    policy = build_policy(env, network)
    model = Model(policy=policy,
                  ob_space=ob_space,
                  ac_space=ac_space,
                  nbatch_act=nenvs,
                  nbatch_train=nbatch_train,
                  nsteps=nsteps,
                  ent_coef=ent_coef,
                  vf_coef=vf_coef,
                  max_grad_norm=max_grad_norm)

    LOAD_PATH = "log/vanilla/saved_vanilla_v{}.tar".format(args.load_id)
    model.load(LOAD_PATH)
    logger.info("Model pramas loaded from save")
    runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)

    epinfobuf10 = deque(maxlen=10)
    epinfobuf100 = deque(maxlen=100)
    # tfirststart = time.time() ## Not doing timing yet
    # active_ep_buf = epinfobuf100

    mean_rewards = []
    datapoints = []
    for rollout in range(1, nrollouts + 1):
        logger.info('collecting rollouts {}...'.format(rollout))
        obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run(
        )  ## different from random_ppo!
        epinfobuf10.extend(epinfos)
        epinfobuf100.extend(epinfos)

        rew_mean_10 = safemean([epinfo['r'] for epinfo in epinfobuf10])
        rew_mean_100 = safemean([epinfo['r'] for epinfo in epinfobuf100])
        ep_len_mean_10 = np.nanmean([epinfo['l'] for epinfo in epinfobuf10])
        ep_len_mean_100 = np.nanmean([epinfo['l'] for epinfo in epinfobuf100])

        logger.info('\n----', rollout)
        mean_rewards.append(rew_mean_10)
        logger.logkv('eprew10', rew_mean_10)
        logger.logkv('eprew100', rew_mean_100)
        logger.logkv('eplenmean10', ep_len_mean_10)
        logger.logkv('eplenmean100', ep_len_mean_100)
        logger.logkv("misc/total_timesteps", rollout * nbatch)

        logger.info('----\n')
        logger.dumpkvs()
    env.close()

    print("Rewards history: ", mean_rewards)
    return mean_rewards
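The rollout loop above keeps two rolling windows (last 10 and last 100 episodes) and logs their mean reward and length. A compact sketch of that rolling-statistics pattern (class name and layout are illustrative, not the helpers used here):

from collections import deque
import numpy as np

class RollingEpisodeStats:
    # keep the last N episode info dicts and report their mean statistics
    def __init__(self, maxlen=100):
        self.buf = deque(maxlen=maxlen)

    def extend(self, epinfos):
        self.buf.extend(epinfos)

    def mean(self, key):
        vals = [info[key] for info in self.buf]
        return np.nan if len(vals) == 0 else np.nanmean(vals)

# usage mirroring the loop above:
# stats10, stats100 = RollingEpisodeStats(10), RollingEpisodeStats(100)
# stats10.extend(epinfos); logger.logkv('eprew10', stats10.mean('r'))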
예제 #12
0
def learn(network,
          env,
          nsteps,
          total_timesteps,
          mvs,
          ckpt,
          seed=None,
          ent_coef=0.0,
          lr=1e-3,
          vf_coef=0.5,
          max_grad_norm=0.5,
          noptepochs=4,
          gamma=0.99,
          lam=0.95,
          log_interval=10,
          cliprange=0.2,
          save_interval=1,
          model_fn=None,
          update_fn=None,
          init_fn=None,
          mpi_rank_weight=1,
          comm=None,
          load_path=None,
          **network_kwargs):

    set_global_seeds(seed)
    if isinstance(lr, float): lr = constfn(lr)
    else: assert callable(lr)
    if isinstance(cliprange, float): cliprange = constfn(cliprange)
    else: assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    # build policy

    policy = build_policy(env, network, **network_kwargs)

    # Calculate the batch_size

    nenvs = env.num_envs
    nminibatches = 1
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches
    is_mpi_root = (MPI is None or MPI.COMM_WORLD.Get_rank() == 0)

    # Instantiate the model object (that creates act_model and train_model)

    if model_fn is None:
        from baselines.ppo2.model import Model
        model_fn = Model

    model = model_fn(
        policy=policy,
        nbatch_act=nenvs,
        nbatch_train=None,  #nbatch_train,
        nsteps=nsteps,
        ent_coef=ent_coef,
        vf_coef=vf_coef,
        max_grad_norm=max_grad_norm,
        comm=comm,
        mpi_rank_weight=mpi_rank_weight)

    if load_path is not None:
        model.load(load_path)
        print('Model has been successfully loaded from {0}'.format(load_path))
    else:
        try:
            lp = osp.join(logger.get_dir(), 'checkpoints/{0}'.format(ckpt))
            model.load(lp)
            print('Model has been successfully loaded from {0}'.format(lp))
        except Exception as e:
            print(e)

    # Instantiate the runner object and episode buffer

    runner = Runner(env=env,
                    model=model,
                    nsteps=nsteps,
                    gamma=gamma,
                    lam=lam,
                    mvs=mvs)
    epinfobuf = deque(maxlen=log_interval * nenvs)
    best_reward = -np.inf

    if init_fn is not None:
        init_fn()

    # Start total timer

    tfirststart = time.perf_counter()

    nupdates = total_timesteps // nbatch
    for update in range(1, nupdates + 1):
        assert nbatch % nminibatches == 0
        tstart = time.perf_counter()
        frac = 1.0 - (update - 1.0) / nupdates  # decreases from 1 to 0
        lrnow = lr(frac)
        cliprangenow = cliprange(frac)

        if update % log_interval == 0 and is_mpi_root:
            logger.info('Stepping environment...')

        # Get minibatch

        obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run(
        )  #pylint: disable=E0632

        if update % log_interval == 0 and is_mpi_root: logger.info('Done.')

        epinfobuf.extend(epinfos)

        # Here what we're going to do is for each minibatch calculate the loss and append it.

        mblossvals = []
        if states is None:  # nonrecurrent version

            inds = np.arange(nbatch)
            for _ in range(noptepochs):
                np.random.shuffle(inds)
                for start in range(0, nbatch, nbatch_train):
                    end = start + nbatch_train
                    mbinds = inds[start:end]
                    slices = (arr[mbinds]
                              for arr in (obs, returns, masks, actions, values,
                                          neglogpacs))
                    mblossvals.append(model.train(lrnow, cliprangenow,
                                                  *slices))

        else:  # recurrent version

            assert nenvs % nminibatches == 0
            envsperbatch = nenvs // nminibatches
            envinds = np.arange(nenvs)
            flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
            for _ in range(noptepochs):
                np.random.shuffle(envinds)
                for start in range(0, nenvs, envsperbatch):
                    end = start + envsperbatch
                    mbenvinds = envinds[start:end]
                    mbflatinds = flatinds[mbenvinds].ravel()
                    slices = (arr[mbflatinds]
                              for arr in (obs, returns, masks, actions, values,
                                          neglogpacs))
                    mbstates = states[mbenvinds]
                    # print(states.shape, mbstates.shape)
                    mblossvals.append(
                        model.train(lrnow, cliprangenow, *slices, mbstates))

        lossvals = np.mean(mblossvals, axis=0)
        tnow = time.perf_counter()
        fps = int(nbatch / (tnow - tstart))

        if update_fn is not None:
            update_fn(update)

        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, returns)
            logger.logkv("misc/serial_timesteps", update * nsteps)
            logger.logkv("misc/nupdates", update)
            logger.logkv("misc/total_timesteps", update * nbatch)
            logger.logkv("misc/fps", fps)

            logger.logkv("misc/explained_variance", float(ev))
            logger.logkv('stats/eprewmean',
                         safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('stats/eprewmin',
                         np.min([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('stats/eprewmax',
                         np.max([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('stats/eplenmean',
                         safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.logkv('misc/time_elapsed', tnow - tfirststart)
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                logger.logkv('misc/' + lossname, lossval)
            logger.dumpkvs()

        if save_interval and (update % save_interval == 0 or update
                              == 1) and logger.get_dir() and is_mpi_root:
            checkdir = osp.join(logger.get_dir(), 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = osp.join(checkdir, 'last')
            print('Saving to', savepath)
            model.save(savepath)
            if len(epinfobuf) == log_interval * nenvs and safemean(
                [epinfo['r'] for epinfo in epinfobuf]) > best_reward:
                savepath = osp.join(checkdir, 'best')
                print('Saving to', savepath)
                model.save(savepath)
                best_reward = safemean([epinfo['r'] for epinfo in epinfobuf])

    model.sess.close()
    return model
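Besides the periodic 'last' checkpoint, this variant keeps a 'best' checkpoint whenever the rolling mean reward improves. A hedged sketch of that pattern in isolation (model.save and the epinfo layout come from the example; the helper itself is illustrative):

import os.path as osp
import numpy as np

def maybe_save_best(model, epinfobuf, best_reward, checkdir, min_episodes):
    # only compare once the buffer is full, as the example above does
    if len(epinfobuf) < min_episodes:
        return best_reward
    mean_rew = np.mean([epinfo['r'] for epinfo in epinfobuf])
    if mean_rew > best_reward:
        model.save(osp.join(checkdir, 'best'))
        best_reward = mean_rew
    return best_reward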
예제 #13
0
    def __init__(self,
                 *,
                 network,
                 env,
                 lr=3e-4,
                 cliprange=0.2,
                 nsteps=128,
                 nminibatches=4,
                 noptepochs=4,
                 ent_coef=0.0,
                 vf_coef=0.5,
                 max_grad_norm=0.5,
                 gamma=0.99,
                 lam=0.95,
                 mpi_rank_weight=1,
                 comm=None,
                 microbatch_size=None,
                 load_path=None,
                 **network_kwargs):
        """
        Parameters:
        ----------

        network:                          policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
                                          specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
                                          tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
                                          neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
                                          See common/models.py/lstm for more details on using recurrent nets in policies

        env: baselines.common.vec_env.VecEnv     environment. Needs to be vectorized for parallel environment simulation.
                                          The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class.


        lr: float or function             learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the
                                          training and 0 is the end of the training.

        cliprange: float or function      clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training
                                          and 0 is the end of the training

        nsteps: int                       number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
                                          nenv is number of environment copies simulated in parallel)


        nminibatches: int                 number of training minibatches per update. For recurrent policies,
                                          should be smaller or equal than number of environments run in parallel.

        noptepochs: int                   number of training epochs per update

        ent_coef: float                   policy entropy coefficient in the optimization objective

        vf_coef: float                    value function loss coefficient in the optimization objective

        gamma: float                      discounting factor

        lam: float                        advantage estimation discounting factor (lambda in the paper)

        log_interval: int                 number of updates between logging events

        load_path: str                    path to load the model from

        **network_kwargs:                 keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
                                          For instance, 'mlp' network architecture has arguments num_hidden and num_layers.

        """

        self.sess = sess = get_session()

        if MPI is not None and comm is None:
            comm = MPI.COMM_WORLD

        policy = build_policy(env, network, **network_kwargs)

        self.env = env

        if isinstance(lr, float):
            self.lr = constfn(lr)
        else:
            assert callable(lr)
        if isinstance(cliprange, float):
            self.cliprange = constfn(cliprange)
        else:
            assert callable(cliprange)
        self.nminibatches = nminibatches

        # if eval_env is not None:
        #     eval_runner = Runner(env=eval_env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)

        # Calculate the batch_size
        self.nenvs = self.env.num_envs
        self.nsteps = nsteps
        self.nbatch = self.nenvs * self.nsteps
        self.nbatch_train = self.nbatch // nminibatches
        self.noptepochs = noptepochs

        with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE):
            # CREATE OUR TWO MODELS
            # act_model that is used for sampling
            act_model = policy(self.nenvs, 1, sess)

            # Train model for training
            if microbatch_size is None:
                train_model = policy(self.nbatch_train, nsteps, sess)
            else:
                train_model = policy(microbatch_size, nsteps, sess)

        # CREATE THE PLACEHOLDERS
        self.A = A = train_model.pdtype.sample_placeholder(
            [None])  # action placeholder
        self.ADV = ADV = tf.placeholder(tf.float32, [None])
        self.R = R = tf.placeholder(tf.float32, [None])
        # Keep track of old actor
        self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
        # Keep track of old critic
        self.OLDVPRED = OLDVPRED = tf.placeholder(tf.float32, [None])
        self.LR = LR = tf.placeholder(tf.float32, [])
        # Cliprange
        self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, [])

        neglogpac = train_model.pd.neglogp(A)

        # Calculate the entropy
        # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy.
        entropy = tf.reduce_mean(train_model.pd.entropy())

        # CALCULATE THE LOSS
        # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss

        # Clip the value to reduce variability during Critic training
        # Get the predicted value
        vpred = train_model.vf
        vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED,
                                                   -CLIPRANGE, CLIPRANGE)
        # Unclipped value
        vf_losses1 = tf.square(vpred - R)
        # Clipped value
        vf_losses2 = tf.square(vpredclipped - R)

        vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))

        # Calculate ratio (pi current policy / pi old policy)
        ratio = tf.exp(OLDNEGLOGPAC - neglogpac)

        # Defining Loss = - J is equivalent to max J
        pg_losses = -ADV * ratio

        pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE,
                                             1.0 + CLIPRANGE)

        # Final PG loss
        pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
        approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
        clipfrac = tf.reduce_mean(
            tf.to_float(tf.greater(tf.abs(ratio - 1.0),
                                   CLIPRANGE)))  # fraction of ratios that were clipped

        # Total loss
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        # UPDATE THE PARAMETERS USING LOSS

        # 1. Get the model parameters
        params = tf.trainable_variables('ppo2_model')
        # 2. Build our trainer
        if comm is not None and comm.Get_size() > 1:
            self.trainer = MpiAdamOptimizer(comm,
                                            learning_rate=LR,
                                            mpi_rank_weight=mpi_rank_weight,
                                            epsilon=1e-5)
        else:
            self.trainer = tf.train.AdamOptimizer(learning_rate=LR,
                                                  epsilon=1e-5)
        # 3. Calculate the gradients
        grads_and_var = self.trainer.compute_gradients(loss, params)
        grads, var = zip(*grads_and_var)

        if max_grad_norm is not None:
            # Clip the gradients (normalize)
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads_and_var = list(zip(grads, var))
        # zip aggregate each gradient with parameters associated
        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

        self.grads = grads
        self.var = var
        self._train_op = self.trainer.apply_gradients(grads_and_var)
        self.loss_names = [
            'policy_loss', 'value_loss', 'policy_entropy', 'approxkl',
            'clipfrac'
        ]
        self.stats_list = [pg_loss, vf_loss, entropy, approxkl, clipfrac]

        self.train_model = train_model
        self.act_model = act_model
        self.step = act_model.step
        self.value = act_model.value
        self.initial_state = act_model.initial_state
        self.def_path_pre = os.path.dirname(
            os.path.abspath(__file__)) + '/tmp/'

        initialize()
        global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                             scope="")
        if MPI is not None:
            sync_from_root(sess, global_variables, comm=comm)  # pylint: disable=E1101

        if load_path is not None:
            self.load_newest(load_path)

        # Instantiate the runner object
        self.runner = Runner(env=self.env,
                             model=self,
                             nsteps=nsteps,
                             gamma=gamma,
                             lam=lam)
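The loss construction above is the standard PPO clipped surrogate plus a clipped value loss. The same quantities written as a small NumPy sketch, independent of this class (array arguments assumed to be flat batches):

import numpy as np

def ppo_losses(neglogpac, old_neglogpac, adv, vpred, old_vpred, returns, cliprange):
    # probability ratio pi_new / pi_old recovered from negative log-probabilities
    ratio = np.exp(old_neglogpac - neglogpac)
    pg_loss = np.mean(np.maximum(-adv * ratio,
                                 -adv * np.clip(ratio, 1.0 - cliprange, 1.0 + cliprange)))
    # clipped value loss, mirroring vf_losses1 / vf_losses2 above
    vpred_clipped = old_vpred + np.clip(vpred - old_vpred, -cliprange, cliprange)
    vf_loss = 0.5 * np.mean(np.maximum(np.square(vpred - returns),
                                       np.square(vpred_clipped - returns)))
    approxkl = 0.5 * np.mean(np.square(neglogpac - old_neglogpac))
    clipfrac = np.mean((np.abs(ratio - 1.0) > cliprange).astype(np.float32))
    return pg_loss, vf_loss, approxkl, clipfrac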
예제 #14
0
def main():
    num_envs = 64
    learning_rate = 5e-4
    ent_coef = .01
    ## newly defined
    vf_coef = 0.5
    max_grad_norm = 0.5
    ###########
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    # timesteps_per_proc = 50_000_000
    use_vf_clipping = True

    parser = argparse.ArgumentParser(description='Process procgen training arguments.')
    parser.add_argument('--env_name', type=str, default='coinrun')
    parser.add_argument('--distribution_mode', type=str, default='hard', choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=0)
    parser.add_argument('--start_level', type=int, default=0)
    parser.add_argument('--test_worker_interval', type=int, default=0)
    parser.add_argument('--total_timesteps', type=int, default=0)

    args = parser.parse_args()

    test_worker_interval = args.test_worker_interval

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    is_test_worker = False

    if test_worker_interval > 0:
        is_test_worker = comm.Get_rank() % test_worker_interval == (test_worker_interval - 1)

    mpi_rank_weight = 0 if is_test_worker else 1
    num_levels = 0 if is_test_worker else args.num_levels

    log_comm = comm.Split(1 if is_test_worker else 0, 0)
    format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else []
    logger.configure(dir=LOG_DIR, 
                     format_strs=format_strs,
                     log_suffix="_total_timesteps_{}_num_levels_{}".format(args.total_timesteps,
                                                                           num_levels))

    '''logger.info("creating environment")
    venv = ProcgenEnv(num_envs=num_envs, env_name=args.env_name, num_levels=num_levels, start_level=args.start_level, distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")

    venv = VecMonitor(
        venv=venv, filename=None, keep_buf=100,
    )

    venv = VecNormalize(venv=venv, ob=False)'''

    logger.info("Creating dropout evaluation environment")
    eval_venv = ProcgenEnv(num_envs=num_envs, env_name=args.env_name, num_levels=100, start_level=2000, distribution_mode=args.distribution_mode)
    eval_venv = VecExtractDictObs(eval_venv, "rgb")

    eval_venv = VecMonitor(
        venv=eval_venv, filename=None, keep_buf=100,
    )

    eval_venv = VecNormalize(venv=eval_venv, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True #pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    conv_fn = lambda x: build_impala_cnn(x, is_train=False, depths=[16,32,32], emb_size=256)

    logger.info("testing dropout")
    

    
    policy = build_policy(eval_venv,conv_fn)

    nenvs = eval_venv.num_envs
    ob_space = eval_venv.observation_space
    ac_space = eval_venv.action_space
    nbatch = nenvs * nsteps
    nbatch_train = nbatch//nminibatches
    
    # Instantiate the model object (that creates act_model and train_model)
    
    from baselines.ppo2.model import Model
    model_fn = Model    #modified from baseline ppo2 learn

    model = model_fn(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train,
                    nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
                    max_grad_norm=max_grad_norm, comm=comm, mpi_rank_weight=mpi_rank_weight)
    model.load(MODEL_PATH)
    eval_runner = Runner(env=eval_venv, model=model, nsteps=nsteps, gamma=.999, lam=.95)

    eval_epinfobuf = deque(maxlen=100)
    nupdates = args.total_timesteps//nbatch

    log_interval = 1
    for update in range(1, nupdates+1):
        # single update to test
        eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run()
        eval_epinfobuf.extend(eval_epinfos)
        if update % log_interval == 0 or update == 1:
            logger.logkv('eval_eprewmean', safemean([epinfo['r'] for epinfo in eval_epinfobuf]))
            logger.logkv('eval_eplenmean', safemean([epinfo['l'] for epinfo in eval_epinfobuf]))
            logger.logkv('misc/total_timesteps', update * nbatch)
            logger.dumpkvs()
    eval_venv.close()
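The evaluation environment above is built by stacking the same three wrappers around ProcgenEnv. A small helper capturing that pattern (imports assumed to match the ones this example already uses; default arguments mirror its held-out level range):

from procgen import ProcgenEnv
from baselines.common.vec_env import VecExtractDictObs, VecMonitor, VecNormalize

def make_procgen_eval_env(env_name, num_envs=64, num_levels=100,
                          start_level=2000, distribution_mode='hard'):
    # raw vec env -> rgb observations -> episode monitoring -> reward normalization
    venv = ProcgenEnv(num_envs=num_envs, env_name=env_name, num_levels=num_levels,
                      start_level=start_level, distribution_mode=distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")
    venv = VecMonitor(venv=venv, filename=None, keep_buf=100)
    return VecNormalize(venv=venv, ob=False)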
예제 #15
0
def learn_setup(*,
                network=None,
                env=None,
                total_timesteps=None,
                eval_env=None,
                seed=None,
                nsteps=None,
                ent_coef=0.0,
                lr=3e-4,
                reward_scale=None,
                exp_name=None,
                load_file=None,
                vf_coef=0.5,
                max_grad_norm=0.5,
                gamma=0.99,
                lam=0.95,
                log_interval=10,
                nminibatches=4,
                noptepochs=4,
                cliprange=0.2,
                n_steps_per_episode=None,
                n_episodes=1,
                nupdates=1,
                batch_size=None,
                save_interval=0,
                load_path=None,
                model_fn=None,
                **network_kwargs):

    lr = 10**(-1 * lr)
    vf_coef = 10**(-1 * vf_coef)
    seed = int(seed)
    ent_coef = 10**(-1 * ent_coef)

    if network == "lstm":
        nminibatches = 1
    if nsteps is None:
        nsteps = n_steps_per_episode * n_episodes

    #set_global_seeds(seed)
    #np.random.seed(None)
    #np.random.seed(seed)
    if nsteps is None:
        nsteps = n_steps_per_episode
    if isinstance(lr, float): lr = constfn(lr)
    else: assert callable(lr)
    if isinstance(cliprange, float): cliprange = constfn(cliprange)
    else: assert callable(cliprange)
    policy = build_policy(env, network, **network_kwargs)
    # Get the nb of env
    nenvs = env.num_envs

    # Get state_space and action_space
    ob_space = env.observation_space
    ac_space = env.action_space

    # Calculate the batch_size
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches
    nupdates = total_timesteps // nbatch

    # Instantiate the model object (that creates act_model and train_model)
    if model_fn is None:
        from baselines.ppo2.model import Model
        model_fn = Model
    model = model_fn(policy=policy,
                     ob_space=ob_space,
                     ac_space=ac_space,
                     nbatch_act=nenvs,
                     nbatch_train=nbatch_train,
                     nsteps=nsteps,
                     ent_coef=ent_coef,
                     vf_coef=vf_coef,
                     max_grad_norm=max_grad_norm)
    if load_file is not None:
        model.load("models/" + load_file)
    if load_path is not None:
        model.load(load_path)
    # Instantiate the runner object
    runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)
    eval_runner = None

    epinfobuf = deque(maxlen=100)
    if eval_env is not None:
        eval_epinfobuf = deque(maxlen=100)

    # Start total timer
    tfirststart = time.time()

    local_variables = {
        'nbatch': nbatch,
        'nminibatches': nminibatches,
        'nbatch_train': nbatch_train,
        'save_interval': save_interval,
        'model': model,
        'runner': runner,
        'lr': lr,
        'exp_name': exp_name,
        'nsteps': nsteps,
        'nenvs': nenvs,
        'log_interval': log_interval,
        'cliprange': cliprange,
        'eval_runner': eval_runner,
        'n_episodes': n_episodes,
        'eval_env': eval_env,
        'epinfobuf': epinfobuf,
        'noptepochs': noptepochs,
        'tfirststart': tfirststart,
        'nupdates': nupdates
    }
    return local_variables
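learn_setup above receives lr, vf_coef and ent_coef as exponents and converts them with 10 ** (-x), which maps a small search value onto a log-spaced coefficient. A tiny sketch of that mapping (the values shown are examples only):

def from_exponent(x):
    # e.g. lr=4 -> 1e-4, ent_coef=2 -> 1e-2, vf_coef=0.5 -> about 0.316
    return 10 ** (-1 * x)

print(from_exponent(4), from_exponent(2), from_exponent(0.5))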
Example #16
0
File: ppo2.py Project: MrGoogol/baselines
def learn(*, network, env, total_timesteps, eval_env = None, seed=None, nsteps=2048, ent_coef=0.0, lr=3e-4,
            vf_coef=0.5,  max_grad_norm=0.5, gamma=0.99, lam=0.95,
            log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2,
            save_interval=0, load_path=None, model_fn=None, **network_kwargs):
    '''
    Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347)

    Parameters:
    ----------

    network:                          policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
                                      specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
                                      tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
                                      neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
                                      See common/models.py/lstm for more details on using recurrent nets in policies

    env: baselines.common.vec_env.VecEnv     environment. Needs to be vectorized for parallel environment simulation.
                                      The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class.


    nsteps: int                       number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
                                      nenv is number of environment copies simulated in parallel)

    total_timesteps: int              number of timesteps (i.e. number of actions taken in the environment)

    ent_coef: float                   policy entropy coefficient in the optimization objective

    lr: float or function             learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the
                                      training and 0 is the end of the training.

    vf_coef: float                    value function loss coefficient in the optimization objective

    max_grad_norm: float or None      gradient norm clipping coefficient

    gamma: float                      discounting factor

    lam: float                        advantage estimation discounting factor (lambda in the paper)

    log_interval: int                 number of updates between logging events

    nminibatches: int                 number of training minibatches per update. For recurrent policies,
                                      should be smaller or equal than number of environments run in parallel.

    noptepochs: int                   number of training epochs per update

    cliprange: float or function      clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training
                                      and 0 is the end of the training

    save_interval: int                number of updates between saving events

    load_path: str                    path to load the model from

    **network_kwargs:                 keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
                                      For instance, 'mlp' network architecture has arguments num_hidden and num_layers.



    '''

    set_global_seeds(seed)

    if isinstance(lr, float): lr = constfn(lr)
    else: assert callable(lr)
    if isinstance(cliprange, float): cliprange = constfn(cliprange)
    else: assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    policy = build_policy(env, network, **network_kwargs)

    # Get the nb of env
    nenvs = env.num_envs

    # Get state_space and action_space
    ob_space = env.observation_space
    ac_space = env.action_space

    # Calculate the batch_size
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches

    # Instantiate the model object (that creates act_model and train_model)
    if model_fn is None:
        from baselines.ppo2.model import Model
        model_fn = Model

    model = model_fn(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train,
                    nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
                    max_grad_norm=max_grad_norm)

    if load_path is not None:
        model.load(load_path)
    # Instantiate the runner object
    runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)
    if eval_env is not None:
        eval_runner = Runner(env = eval_env, model = model, nsteps = nsteps, gamma = gamma, lam= lam)

    epinfobuf = deque(maxlen=100)
    if eval_env is not None:
        eval_epinfobuf = deque(maxlen=100)

    # Start total timer
    tfirststart = time.perf_counter()

    nupdates = total_timesteps//nbatch
    for update in range(1, nupdates+1):
        assert nbatch % nminibatches == 0
        # Start timer
        tstart = time.perf_counter()
        frac = 1.0 - (update - 1.0) / nupdates
        # Calculate the learning rate
        lrnow = lr(frac)
        # Calculate the cliprange
        cliprangenow = cliprange(frac)
        # Get minibatch
        obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run() #pylint: disable=E0632
        if eval_env is not None:
            eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run() #pylint: disable=E0632

        epinfobuf.extend(epinfos)
        if eval_env is not None:
            eval_epinfobuf.extend(eval_epinfos)

        # Here what we're going to do is for each minibatch calculate the loss and append it.
        mblossvals = []
        if states is None: # nonrecurrent version
            # Index of each element of batch_size
            # Create the indices array
            inds = np.arange(nbatch)
            for _ in range(noptepochs):
                # Randomize the indexes
                np.random.shuffle(inds)
                # 0 to batch_size with batch_train_size step
                for start in range(0, nbatch, nbatch_train):
                    end = start + nbatch_train
                    mbinds = inds[start:end]
                    slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                    mblossvals.append(model.train(lrnow, cliprangenow, *slices))
        else: # recurrent version
            assert nenvs % nminibatches == 0
            envsperbatch = nenvs // nminibatches
            envinds = np.arange(nenvs)
            flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
            for _ in range(noptepochs):
                np.random.shuffle(envinds)
                for start in range(0, nenvs, envsperbatch):
                    end = start + envsperbatch
                    mbenvinds = envinds[start:end]
                    mbflatinds = flatinds[mbenvinds].ravel()
                    slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                    mbstates = states[mbenvinds]
                    mblossvals.append(model.train(lrnow, cliprangenow, *slices, mbstates))

        # Feedforward --> get losses --> update
        lossvals = np.mean(mblossvals, axis=0)
        # End timer
        tnow = time.perf_counter()
        # Calculate the fps (frame per second)
        fps = int(nbatch / (tnow - tstart))
        if update % log_interval == 0 or update == 1:
            # Calculates if the value function is a good predictor of the returns (ev close to 1)
            # or if it's just worse than predicting nothing (ev <= 0)
            ev = explained_variance(values, returns)
            logger.logkv("serial_timesteps", update*nsteps)
            logger.logkv("nupdates", update)
            logger.logkv("total_timesteps", update*nbatch)
            logger.logkv("fps", fps)
            logger.logkv("explained_variance", float(ev))
            logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf]))
            if eval_env is not None:
                logger.logkv('eval_eprewmean', safemean([epinfo['r'] for epinfo in eval_epinfobuf]) )
                logger.logkv('eval_eplenmean', safemean([epinfo['l'] for epinfo in eval_epinfobuf]) )
            logger.logkv('time_elapsed', tnow - tfirststart)
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                logger.logkv(lossname, lossval)
            if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
                logger.dumpkvs()
        if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir() and (MPI is None or MPI.COMM_WORLD.Get_rank() == 0):
            checkdir = osp.join(logger.get_dir(), 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = osp.join(checkdir, '%.5i'%update)
            print('Saving to', savepath)
            model.save(savepath)
    return model
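
As the docstring notes, lr and cliprange may be either constants or schedule functions of the remaining-training fraction frac = 1.0 - (update - 1.0)/nupdates. A self-contained sketch of a linear-decay schedule under that convention; linear_decay and the sample numbers are assumptions for illustration:

# Hypothetical linear-decay schedule: frac starts at 1.0 and shrinks toward 0.0,
# so multiplying by frac anneals the value over the course of training.
def linear_decay(initial):
    return lambda frac: initial * frac

lr = linear_decay(3e-4)
cliprange = linear_decay(0.2)

nupdates = 4
for update in range(1, nupdates + 1):
    frac = 1.0 - (update - 1.0) / nupdates
    print(update, lr(frac), cliprange(frac))  # update 1 -> lr=3e-4, cliprange=0.2
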
Example #17
0
def learn(*, policy, FLAGS, env, total_timesteps, eval_env=None, seed=None, nsteps=2048, ent_coef=0.0, lr=3e-4,
            vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95,
            log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2,
            save_interval=0, load_path=None, model_fn=None, update_fn=None, init_fn=None, mpi_rank_weight=1, comm=None,
            average_window_size=int(1e6), stop=True,
            scenario='gfootball.scenarios.1_vs_1_easy',
            curriculum=np.linspace(0, 0.95, 20),
            **network_kwargs):
    '''
    Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347)
    Parameters:
    ----------
    policy:                             policy network (as returned by build_policy())
    #<REMOVED >network:                          policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
    #                                  specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
    #                                  tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
    #                                  neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
    #                                  See common/models.py/lstm for more details on using recurrent nets in policies
    env: baselines.common.vec_env.VecEnv     environment. Needs to be vectorized for parallel environment simulation.
                                      The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class.
    nsteps: int                       number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
                                      nenv is number of environment copies simulated in parallel)
    ent_coef: float                   policy entropy coefficient in the optimization objective
    lr: float or function             learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the
                                      training and 0 is the end of the training.
    vf_coef: float                    value function loss coefficient in the optimization objective
    max_grad_norm: float or None      gradient norm clipping coefficient
    gamma: float                      discounting factor
    lam: float                        advantage estimation discounting factor (lambda in the paper)
    log_interval: int                 number of updates between logging events
    nminibatches: int                 number of training minibatches per update. For recurrent policies,
                                      should be smaller or equal than number of environments run in parallel.
    noptepochs: int                   number of training epochs per update
    cliprange: float or function      clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training
                                      and 0 is the end of the training
    save_interval: int                number of updates between saving events
    load_path: str                    path to load the model from
    **network_kwargs:                 keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
                                      For instance, 'mlp' network architecture has arguments num_hidden and num_layers.
    '''

    set_global_seeds(seed)

    if isinstance(lr, float): lr = constfn(lr)
    else: assert callable(lr)
    if isinstance(cliprange, float): cliprange = constfn(cliprange)
    else: assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    basic_builder = importlib.import_module(scenario, package=None)
    def build_builder_with_difficulty(difficulty):
        def builder_with_difficulty(builder):
            basic_builder.build_scenario(builder)
            builder.config().right_team_difficulty = difficulty
            builder.config().left_team_difficulty = difficulty
        return builder_with_difficulty

    # Get the nb of env
    nenvs = env.num_envs

    # Get state_space and action_space
    ob_space = env.observation_space
    ac_space = env.action_space

    # Calculate the batch_size
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches
    is_mpi_root = (MPI is None or MPI.COMM_WORLD.Get_rank() == 0)

    # Instantiate the model object (that creates act_model and train_model)
    if model_fn is None:
        from baselines.ppo2.model import Model
        model_fn = Model

    model = model_fn(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train,
                    nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
                    max_grad_norm=max_grad_norm, comm=comm, mpi_rank_weight=mpi_rank_weight)

    if load_path is not None:
        model.load(load_path)

    def make_runner(difficulty):
        def create_single_football_env(iprocess):
            """Creates a gfootball environment at the given difficulty."""
            env = football_env.create_environment(
                env_name=build_builder_with_difficulty(difficulty), stacked=('stacked' in FLAGS.state),
                rewards=FLAGS.reward_experiment,
                logdir=logger.get_dir(),
                write_goal_dumps=FLAGS.dump_scores and (iprocess == 0),
                write_full_episode_dumps=FLAGS.dump_full_episodes and (iprocess == 0),
                render=FLAGS.render and (iprocess == 0),
                dump_frequency=50 if FLAGS.render and iprocess == 0 else 0)
            env = monitor.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(),
                                                                         str(iprocess)))
            return env

        vec_env = SubprocVecEnv([
            (lambda _i=i: create_single_football_env(_i))
            for i in range(FLAGS.num_envs)
        ], context=None)
        return Runner(env=vec_env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)
    # Instantiate the runner object, starting from the first curriculum difficulty
    difficulties = list(curriculum)
    runner = make_runner(difficulties[0])
    difficulty_idx = 0
    if eval_env is not None:
        eval_runner = Runner(env=eval_env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)
 
    eprews = []
    epinfobuf = deque(maxlen=100)
    if eval_env is not None:
        eval_epinfobuf = deque(maxlen=100)

    if init_fn is not None:
        init_fn()

    # Start total timer
    tfirststart = time.perf_counter()

    nupdates = total_timesteps//nbatch
    update = 0
    while True:
        update += 1
        assert nbatch % nminibatches == 0
        # Start timer
        tstart = time.perf_counter()
        frac = 1.0 - (update - 1.0) / nupdates
        # Calculate the learning rate
        lrnow = lr(frac)
        # Calculate the cliprange
        cliprangenow = cliprange(frac)

        if update % log_interval == 0 and is_mpi_root: logger.info('Stepping environment...')

        # Get minibatch
        obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run() #pylint: disable=E0632
        if eval_env is not None:
            eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run() #pylint: disable=E0632

        for r1, r2 in zip(returns, [i['r'] for i in epinfos]):
            assert r1 == r2  # assuming returns[i] and epinfos[i]['r'] are the same

        if update % log_interval == 0 and is_mpi_root: logger.info('Done.')

        eprews.extend(returns)
        epinfobuf.extend(epinfos)
        if eval_env is not None:
            eval_epinfobuf.extend(eval_epinfos)

        # for each minibatch calculate the loss and append it.
        mblossvals = []
        if states is None: # nonrecurrent version
            # Index of each element of batch_size
            # Create the indices array
            inds = np.arange(nbatch)
            for _ in range(noptepochs):
                # Randomize the indexes
                np.random.shuffle(inds)
                # 0 to batch_size with batch_train_size step
                for start in range(0, nbatch, nbatch_train):
                    end = start + nbatch_train
                    mbinds = inds[start:end]
                    slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                    mblossvals.append(model.train(lrnow, cliprangenow, *slices))
        else: # recurrent version
            assert nenvs % nminibatches == 0
            envsperbatch = nenvs // nminibatches
            envinds = np.arange(nenvs)
            flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
            for _ in range(noptepochs):
                np.random.shuffle(envinds)
                for start in range(0, nenvs, envsperbatch):
                    end = start + envsperbatch
                    mbenvinds = envinds[start:end]
                    mbflatinds = flatinds[mbenvinds].ravel()
                    slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                    mbstates = states[mbenvinds]
                    mblossvals.append(model.train(lrnow, cliprangenow, *slices, mbstates))

        # Feedforward --> get losses --> update
        lossvals = np.mean(mblossvals, axis=0)
        # End timer
        tnow = time.perf_counter()
        # Calculate the fps (frame per second)
        fps = int(nbatch / (tnow - tstart))

        if update_fn is not None:
            update_fn(update)

        if update % log_interval == 0 or update == 1:
            # Calculates if the value function is a good predictor of the returns (ev close to 1)
            # or if it's just worse than predicting nothing (ev <= 0)
            ev = explained_variance(values, returns)
            logger.logkv("misc/serial_timesteps", update*nsteps)
            logger.logkv("misc/nupdates", update)
            logger.logkv("misc/total_timesteps", update*nbatch)
            logger.logkv("fps", fps)
            logger.logkv("misc/explained_variance", float(ev))
            logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf]))
            if eval_env is not None:
                logger.logkv('eval_eprewmean', safemean([epinfo['r'] for epinfo in eval_epinfobuf]) )
                logger.logkv('eval_eplenmean', safemean([epinfo['l'] for epinfo in eval_epinfobuf]) )
            logger.logkv('misc/time_elapsed', tnow - tfirststart)
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                logger.logkv('loss/' + lossname, lossval)

            logger.dumpkvs()
        if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir() and is_mpi_root:
            checkdir = osp.join(logger.get_dir(), 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = osp.join(checkdir, '%.5i'%update)
            print('Saving to', savepath)
            model.save(savepath)
        # Advance the curriculum once the recent-return window is non-negative
        if difficulty_idx < len(difficulties) - 1 and len(eprews) >= average_window_size and sum(eprews[-average_window_size:]) >= 0.:
            difficulty_idx += 1
            runner = make_runner(difficulties[difficulty_idx])
            if eval_env is not None:
                eval_runner = Runner(env=eval_env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)
        # Stop after the planned number of updates so the function can return
        if stop and update >= nupdates:
            break
    return model
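
The curriculum step above advances to the next difficulty only when enough recent episode returns have been collected and their sum over the window is non-negative. A self-contained sketch of that gate; should_advance and the toy numbers are assumptions, not part of the example:

# Hypothetical gate mirroring the difficulty-advance condition in the training loop above.
def should_advance(eprews, difficulty_idx, difficulties, window):
    """Advance once the window is full and the recent returns sum to >= 0."""
    return (difficulty_idx < len(difficulties) - 1
            and len(eprews) >= window
            and sum(eprews[-window:]) >= 0.0)

difficulties = [0.0, 0.5, 0.95]
eprews = [-1.0, 0.5, 0.8, 0.2]
print(should_advance(eprews, difficulty_idx=0, difficulties=difficulties, window=3))  # True
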
Example #18
0
def main7():
    retro.data.add_custom_integration("custom")

    def wrap_deepmind_n64(env, reward_scale=1 / 100.0, frame_stack=1, grayscale=False):
        env = MaxAndSkipEnv(env, skip=4)
        env = WarpFrame(env, width=150, height=100, grayscale=grayscale)
        env = FrameStack(env, frame_stack)
        env = ScaledFloatFrame(env)
        env = RewardScaler(env, scale=reward_scale)
        return env

    def make_env():
        retro.data.add_custom_integration("custom")
        env = retro.n64_env.N64Env(game="SuperSmashBros-N64",
                                   use_restricted_actions=retro.Actions.MULTI_DISCRETE,
                                   inttype=retro.data.Integrations.CUSTOM,
                                   obs_type=retro.Observations.IMAGE)
        env = wrap_deepmind_n64(env)
        return env

    gpu_options = tf.GPUOptions(allow_growth=True)
    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

    nenvs = 2
    # env = DummyVecEnv([make_env] * nenvs)
    env = SubprocVecEnv([make_env] * nenvs)
    network_name = "impala_cnn_lstm"
    policy = build_policy(env, network_name)
    recurrent = "lstm" in network_name
    ob_space = env.observation_space
    ac_space = env.action_space
    nsteps = 10
    nminibatches = 2
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches
    model = Model(policy=policy,
                  ob_space=ob_space,
                  ac_space=ac_space,
                  nbatch_act=nenvs,
                  nbatch_train=nbatch_train,
                  nsteps=nsteps,
                  ent_coef=0.01,
                  vf_coef=0.5,
                  max_grad_norm=0.5,
                  comm=None,
                  mpi_rank_weight=1)
    runner = Runner(env=env, model=model, nsteps=10, gamma=.99, lam=.95)

    env.reset()
    num_steps = 20000
    start = time.time()
    for i in range(num_steps):
        sys.stdout.write(f"\r{i+1} / {num_steps}")
        action = [env.action_space.sample() for _ in range(nenvs)]
        obs, reward, dones, info = env.step(action)
        # env.reset(dones)
        # env.render()

        if i % 50 == 0:
            if recurrent:
                fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(20, 12))
            else:
                fig, axs = plt.subplots(nrows=4, ncols=2, figsize=(20, 12))
            for env_index in range(nenvs):
                if recurrent:
                    axs[env_index].imshow(obs[env_index, :, :, :])
                else:
                    for j in range(4):
                        row = env_index * 2 + j // 2
                        col = j % 2
                        print(row)
                        print(col)
                        axs[row, col].imshow(obs[env_index, :, :, j])
            plt.show()
            plt.close()
    end = time.time()
    print(end - start)

    return env
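
The non-recurrent visualization branch in main7 spreads a 4-frame stack from each of the two environments across a 4x2 subplot grid via row = env_index * 2 + j // 2 and col = j % 2. A self-contained numpy sketch of just that index mapping; the dummy observation shape is an assumption:

import numpy as np

# Dummy batch: 2 envs, 100x150 frames, 4 stacked channels.
obs = np.zeros((2, 100, 150, 4))
nenvs, _, _, nstack = obs.shape

for env_index in range(nenvs):
    for j in range(nstack):
        row = env_index * 2 + j // 2  # each env occupies two consecutive rows
        col = j % 2                   # two frames per row
        print(f"env {env_index}, frame {j} -> subplot ({row}, {col})")
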
Example #19
0
def learn(*,
          network,
          env,
          total_timesteps,
          early_stopping=False,
          eval_env=None,
          seed=None,
          nsteps=2048,
          ent_coef=0.0,
          lr=3e-4,
          vf_coef=0.5,
          max_grad_norm=0.5,
          gamma=0.99,
          lam=0.95,
          log_interval=10,
          nminibatches=4,
          noptepochs=4,
          cliprange=0.2,
          save_interval=0,
          load_path=None,
          model_fn=None,
          scope='',
          **network_kwargs):
    '''
    Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347)

    Parameters:
    ----------

    network:                          policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
                                      specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
                                      tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
                                      neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
                                      See common/models.py/lstm for more details on using recurrent nets in policies

    env: baselines.common.vec_env.VecEnv     environment. Needs to be vectorized for parallel environment simulation.
                                      The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class.


    nsteps: int                       number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
                                      nenv is number of environment copies simulated in parallel)

    total_timesteps: int              number of timesteps (i.e. number of actions taken in the environment)

    ent_coef: float                   policy entropy coefficient in the optimization objective

    lr: float or function             learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the
                                      training and 0 is the end of the training.

    vf_coef: float                    value function loss coefficient in the optimization objective

    max_grad_norm: float or None      gradient norm clipping coefficient

    gamma: float                      discounting factor

    lam: float                        advantage estimation discounting factor (lambda in the paper)

    log_interval: int                 number of updates between logging events

    nminibatches: int                 number of training minibatches per update. For recurrent policies,
                                      should be smaller or equal than number of environments run in parallel.

    noptepochs: int                   number of training epochs per update

    cliprange: float or function      clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training
                                      and 0 is the end of the training

    save_interval: int                number of updates between saving events

    load_path: str                    path to load the model from

    **network_kwargs:                 keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
                                      For instance, 'mlp' network architecture has arguments num_hidden and num_layers.



    '''
    additional_params = network_kwargs["network_kwargs"]
    from baselines import logger

    # set_global_seeds(seed) We deal with seeds upstream

    if "LR_ANNEALING" in additional_params.keys():
        lr_reduction_factor = additional_params["LR_ANNEALING"]
        start_lr = lr
        lr = lambda prop: (start_lr / lr_reduction_factor) + (
            start_lr - (start_lr / lr_reduction_factor
                        )) * prop  # Anneals linearly from lr to lr/red factor

    if isinstance(lr, float): lr = constfn(lr)
    else: assert callable(lr)
    if isinstance(cliprange, float): cliprange = constfn(cliprange)
    else: assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    policy = build_policy(env, network, **network_kwargs)

    bestrew = 0
    # Get the nb of env
    nenvs = env.num_envs

    # Get state_space and action_space
    ob_space = env.observation_space
    ac_space = env.action_space

    # Calculate the batch_size
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches

    # Instantiate the model object (that creates act_model and train_model)
    if model_fn is None:
        from baselines.ppo2.model import Model
        model_fn = Model

    model = model_fn(policy=policy,
                     ob_space=ob_space,
                     ac_space=ac_space,
                     nbatch_act=nenvs,
                     nbatch_train=nbatch_train,
                     nsteps=nsteps,
                     ent_coef=ent_coef,
                     vf_coef=vf_coef,
                     max_grad_norm=max_grad_norm,
                     scope=scope)

    if load_path is not None:
        model.load(load_path)
    # Instantiate the runner object
    runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)
    if eval_env is not None:
        eval_runner = Runner(env=eval_env,
                             model=model,
                             nsteps=nsteps,
                             gamma=gamma,
                             lam=lam)

    epinfobuf = deque(maxlen=100)
    if eval_env is not None:
        eval_epinfobuf = deque(maxlen=100)

    # Start total timer
    tfirststart = time.perf_counter()

    best_rew_per_step = 0

    run_info = defaultdict(list)
    nupdates = total_timesteps // nbatch
    print("TOT NUM UPDATES", nupdates)
    for update in range(1, nupdates + 1):
        assert nbatch % nminibatches == 0, "Have {} total batch size and want {} minibatches, can't split evenly".format(
            nbatch, nminibatches)
        # Start timer
        tstart = time.perf_counter()
        frac = 1.0 - (update - 1.0) / nupdates
        # Calculate the learning rate
        lrnow = lr(frac)
        # Calculate the cliprange
        cliprangenow = cliprange(frac)
        # Get minibatch
        obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run(
        )  #pylint: disable=E0632

        if eval_env is not None:
            eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run(
            )  #pylint: disable=E0632

        eplenmean = safemean([epinfo['ep_length'] for epinfo in epinfos])
        eprewmean = safemean([epinfo['r'] for epinfo in epinfos])
        rew_per_step = eprewmean / eplenmean

        print("Curr learning rate {} \t Curr reward per step {}".format(
            lrnow, rew_per_step))

        if rew_per_step > best_rew_per_step and early_stopping:
            # Avoid updating the best model at the first iteration, since the means can be
            # slightly off due to how the multithreaded batch simulation works
            best_rew_per_step = eprewmean / eplenmean
            checkdir = osp.join(logger.get_dir(), 'checkpoints')
            model.save(checkdir + ".temp_best_model")
            print("Saved model as best", best_rew_per_step, "avg rew/step")

        epinfobuf.extend(epinfos)
        if eval_env is not None:
            eval_epinfobuf.extend(eval_epinfos)

        # Here what we're going to do is for each minibatch calculate the loss and append it.
        mblossvals = []
        if states is None:  # nonrecurrent version
            # Index of each element of batch_size
            # Create the indices array
            inds = np.arange(nbatch)
            for _ in range(noptepochs):
                # Randomize the indexes
                np.random.shuffle(inds)
                # 0 to batch_size with batch_train_size step
                for start in tqdm.trange(0,
                                         nbatch,
                                         nbatch_train,
                                         desc="{}/{}".format(_, noptepochs)):
                    end = start + nbatch_train
                    mbinds = inds[start:end]
                    slices = (arr[mbinds]
                              for arr in (obs, returns, masks, actions, values,
                                          neglogpacs))
                    mblossvals.append(model.train(lrnow, cliprangenow,
                                                  *slices))

        else:  # recurrent version
            assert nenvs % nminibatches == 0
            envsperbatch = nenvs // nminibatches
            envinds = np.arange(nenvs)
            flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
            for _ in range(noptepochs):
                np.random.shuffle(envinds)
                for start in range(0, nenvs, envsperbatch):
                    end = start + envsperbatch
                    mbenvinds = envinds[start:end]
                    mbflatinds = flatinds[mbenvinds].ravel()
                    slices = (arr[mbflatinds]
                              for arr in (obs, returns, masks, actions, values,
                                          neglogpacs))
                    mbstates = states[mbenvinds]
                    mblossvals.append(
                        model.train(lrnow, cliprangenow, *slices, mbstates))

        # Feedforward --> get losses --> update
        lossvals = np.mean(mblossvals, axis=0)
        # End timer
        tnow = time.perf_counter()
        # Calculate the fps (frame per second)
        fps = int(nbatch / (tnow - tstart))

        if update % log_interval == 0 or update == 1:
            # Calculates if the value function is a good predictor of the returns (ev close to 1)
            # or if it's just worse than predicting nothing (ev <= 0)
            ev = explained_variance(values, returns)
            logger.logkv("serial_timesteps", update * nsteps)
            logger.logkv("nupdates", update)
            logger.logkv("total_timesteps", update * nbatch)
            logger.logkv("fps", fps)
            logger.logkv("explained_variance", float(ev))

            eprewmean = safemean([epinfo['r'] for epinfo in epinfobuf])
            ep_dense_rew_mean = safemean(
                [epinfo['ep_shaped_r'] for epinfo in epinfobuf])
            ep_sparse_rew_mean = safemean(
                [epinfo['ep_sparse_r'] for epinfo in epinfobuf])
            eplenmean = safemean([epinfo['ep_length'] for epinfo in epinfobuf])
            run_info['eprewmean'].append(eprewmean)
            run_info['ep_dense_rew_mean'].append(ep_dense_rew_mean)
            run_info['ep_sparse_rew_mean'].append(ep_sparse_rew_mean)
            run_info['eplenmean'].append(eplenmean)
            run_info['explained_variance'].append(float(ev))

            logger.logkv(
                'true_eprew',
                safemean([epinfo['ep_sparse_r'] for epinfo in epinfobuf]))
            logger.logkv('eprewmean', eprewmean)
            logger.logkv('eplenmean', eplenmean)
            if eval_env is not None:
                logger.logkv(
                    'eval_eprewmean',
                    safemean([epinfo['r'] for epinfo in eval_epinfobuf]))
                logger.logkv(
                    'eval_eplenmean',
                    safemean([epinfo['l'] for epinfo in eval_epinfobuf]))

            time_elapsed = tnow - tfirststart
            logger.logkv('time_elapsed', time_elapsed)

            time_per_update = time_elapsed / update
            time_remaining = (nupdates - update) * time_per_update
            logger.logkv('time_remaining', time_remaining / 60)

            for (lossval, lossname) in zip(lossvals, model.loss_names):
                run_info[lossname].append(lossval)

                logger.logkv(lossname, lossval)

            if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
                logger.dumpkvs()

            # Update current logs
            if additional_params["RUN_TYPE"] in ["ppo", "joint_ppo"]:
                from overcooked_ai_py.utils import save_dict_to_file
                save_dict_to_file(run_info,
                                  additional_params["SAVE_DIR"] + "logs")

                # Linear annealing of reward shaping
                if additional_params["REW_SHAPING_HORIZON"] != 0:
                    # Piecewise linear annealing schedule
                    # annealing_thresh: until when we should stop doing 100% reward shaping
                    # annealing_horizon: when we should reach doing 0% reward shaping
                    annealing_horizon = additional_params[
                        "REW_SHAPING_HORIZON"]
                    annealing_thresh = 0

                    def fn(x):
                        if annealing_thresh != 0 and annealing_thresh - (
                                annealing_horizon / annealing_thresh) * x > 1:
                            return 1
                        else:
                            fn = lambda x: -1 * (x - annealing_thresh) * 1 / (
                                annealing_horizon - annealing_thresh) + 1
                            return max(fn(x), 0)

                    curr_timestep = update * nbatch
                    curr_reward_shaping = fn(curr_timestep)
                    env.update_reward_shaping_param(curr_reward_shaping)
                    print("Current reward shaping", curr_reward_shaping)

                sp_horizon = additional_params["SELF_PLAY_HORIZON"]

                # Save/overwrite best model if past a certain threshold
                if ep_sparse_rew_mean > bestrew and ep_sparse_rew_mean > additional_params[
                        "SAVE_BEST_THRESH"]:
                    # Don't save best model if still doing some self play and it's supposed to be a BC model
                    if additional_params[
                            "OTHER_AGENT_TYPE"][:
                                                2] == "bc" and sp_horizon != 0 and env.self_play_randomization > 0:
                        pass
                    else:
                        from human_aware_rl.ppo.ppo import save_ppo_model
                        print("BEST REW", ep_sparse_rew_mean,
                              "overwriting previous model with", bestrew)
                        save_ppo_model(
                            model, "{}seed{}/best".format(
                                additional_params["SAVE_DIR"],
                                additional_params["CURR_SEED"]))
                        bestrew = max(ep_sparse_rew_mean, bestrew)

                # If not sp run, and horizon is not None,
                # vary amount of self play over time, either with a sigmoidal feedback loop
                # or with a fixed piecewise linear schedule.
                if additional_params[
                        "OTHER_AGENT_TYPE"] != "sp" and sp_horizon is not None:
                    if type(sp_horizon) is not list:
                        # Sigmoid self-play schedule based on current performance (not recommended)
                        curr_reward = ep_sparse_rew_mean

                        rew_target = sp_horizon
                        shift = rew_target / 2
                        t = (1 / rew_target) * 10
                        fn = lambda x: -1 * (np.exp(t * (x - shift)) /
                                             (1 + np.exp(t * (x - shift)))) + 1

                        env.self_play_randomization = fn(curr_reward)
                        print("Current self-play randomization",
                              env.self_play_randomization)
                    else:
                        assert len(sp_horizon) == 2
                        # Piecewise linear self-play schedule

                        # self_play_thresh: when we should stop doing 100% self-play
                        # self_play_timeline: when we should reach doing 0% self-play
                        self_play_thresh, self_play_timeline = sp_horizon

                        def fn(x):
                            if self_play_thresh != 0 and self_play_timeline - (
                                    self_play_timeline /
                                    self_play_thresh) * x > 1:
                                return 1
                            else:
                                fn = lambda x: -1 * (
                                    x - self_play_thresh) * 1 / (
                                        self_play_timeline - self_play_thresh
                                    ) + 1
                                return max(fn(x), 0)

                        curr_timestep = update * nbatch
                        env.self_play_randomization = fn(curr_timestep)
                        print("Current self-play randomization",
                              env.self_play_randomization)

        if save_interval and (update % save_interval == 0
                              or update == 1) and logger.get_dir() and (
                                  MPI is None
                                  or MPI.COMM_WORLD.Get_rank() == 0):
            checkdir = osp.join(logger.get_dir(), 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = osp.join(checkdir, '%.5i' % update)
            print('Saving to', savepath)
            model.save(savepath)

        # Visualization of rollouts with actual other agent
        run_type = additional_params["RUN_TYPE"]
        if run_type in ["ppo", "joint_ppo"
                        ] and update % additional_params["VIZ_FREQUENCY"] == 0:
            from overcooked_ai_py.mdp.overcooked_env import OvercookedEnv
            from overcooked_ai_py.mdp.overcooked_mdp import OvercookedGridworld
            from overcooked_ai_py.agents.agent import AgentPair
            from overcooked_ai_py.agents.benchmarking import AgentEvaluator
            from human_aware_rl.baselines_utils import get_agent_from_model
            print(additional_params["SAVE_DIR"])

            mdp = OvercookedGridworld.from_layout_name(
                **additional_params["mdp_params"])
            overcooked_env = OvercookedEnv(mdp,
                                           **additional_params["env_params"])
            agent = get_agent_from_model(
                model,
                additional_params["sim_threads"],
                is_joint_action=(run_type == "joint_ppo"))
            agent.set_mdp(mdp)

            if run_type == "ppo":
                if additional_params["OTHER_AGENT_TYPE"] == 'sp':
                    agent_pair = AgentPair(agent,
                                           agent,
                                           allow_duplicate_agents=True)
                else:
                    print("PPO agent on index 0:")
                    env.other_agent.set_mdp(mdp)
                    agent_pair = AgentPair(agent, env.other_agent)
                    trajectory, time_taken, tot_rewards, tot_shaped_rewards = overcooked_env.run_agents(
                        agent_pair, display=True, display_until=100)
                    overcooked_env.reset()
                    agent_pair.reset()
                    print("tot rew", tot_rewards, "tot rew shaped",
                          tot_shaped_rewards)

                    print("PPO agent on index 1:")
                    agent_pair = AgentPair(env.other_agent, agent)

            else:
                agent_pair = AgentPair(agent)

            trajectory, time_taken, tot_rewards, tot_shaped_rewards = overcooked_env.run_agents(
                agent_pair, display=True, display_until=100)
            overcooked_env.reset()
            agent_pair.reset()
            print("tot rew", tot_rewards, "tot rew shaped", tot_shaped_rewards)
            print(additional_params["SAVE_DIR"])

    if nupdates > 0 and early_stopping:
        checkdir = osp.join(logger.get_dir(), 'checkpoints')
        print("Loaded best model", best_rew_per_step)
        model.load(checkdir + ".temp_best_model")
    return model, run_info
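
Both the reward-shaping annealing and the piecewise-linear self-play schedule above share the same shape: hold at 1.0 until a threshold timestep, then decay linearly to 0.0 at a horizon. A simplified self-contained sketch of that schedule; piecewise_linear_anneal and the sample timesteps are assumptions:

# Hypothetical schedule: 1.0 before `thresh`, linear decay to 0.0 at `horizon`, then 0.0.
def piecewise_linear_anneal(t, thresh, horizon):
    if t <= thresh:
        return 1.0
    return max(1.0 - (t - thresh) / (horizon - thresh), 0.0)

for t in [0, 1_000_000, 3_000_000, 5_000_000, 6_000_000]:
    print(t, piecewise_linear_anneal(t, thresh=1_000_000, horizon=5_000_000))
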