Example #1
    def __init__(self, net_file):
        """Trying to mirror the IMIT agent's loading method.

        When creating the actor, the TF graph is not actually built; for that
        we need placeholders and then to 'call' the actor.
        """
        self.actor = Actor(nb_actions=4,
                           name='actor',
                           network='cloth_cnn',
                           use_keras=False)
        self.net_file = net_file

        # Exactly the same as in the imit/imit_learner code: create the actor network.
        self.observation_shape = (100, 100, 3)
        shape = (None, ) + self.observation_shape
        self.obs0 = tf.placeholder(tf.int32, shape=shape, name='obs0_imgs')
        self.obs0_f_imgs = tf.cast(self.obs0, tf.float32) / 255.0
        self.actor_tf = self.actor(self.obs0_f_imgs)

        # Handle miscellaneous TF stuff.
        self.sess = U.get_session()
        self.sess.run(tf.global_variables_initializer())
        _vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
        print('\nThe TF variables after init:\n')
        for vv in _vars:
            print('  {}'.format(vv))
        print('\nBaselines debugging:')
        U.display_var_info(_vars)
        print('\nNow let\'s call U.load_variables() ...')

        # Our imit code loads after initializing variables.
        U.load_variables(load_path=net_file, sess=self.sess)
        self.sess.graph.finalize()
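    # Illustrative usage sketch (an assumption, not from the original snippet):
    # query the loaded actor by feeding uint8 images through the placeholder
    # built in __init__. The method name and the (N, 100, 100, 3) batch shape
    # are assumptions; the cast to float32 and the /255 scaling happen inside
    # the graph.
    def forward(self, imgs):
        assert imgs.shape[1:] == self.observation_shape, imgs.shape
        return self.sess.run(self.actor_tf, feed_dict={self.obs0: imgs})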
Example #2
    def print_model_details(self):
        U.display_var_info(self.get_trainable_variables())
Example #3
def print_trainable_variables():
    display_var_info(tf.trainable_variables())
Example #4
def learn(network, env,
          seed=None,
          total_timesteps=None,
          nb_epochs=None, # with default settings, perform 1M steps total
          nb_epoch_cycles=20,
          nb_rollout_steps=100,
          reward_scale=1.0,
          render=False,
          render_eval=False,
          noise_type='adaptive-param_0.2',
          normalize_returns=False,
          normalize_observations=True,
          actor_l2_reg=0.0,
          critic_l2_reg=1e-2,
          actor_lr=1e-4,
          critic_lr=1e-3,
          popart=False,
          gamma=0.99,
          clip_norm=None,
          nb_train_steps=50, # per epoch cycle and MPI worker
          nb_eval_steps=1000,
          batch_size=64, # per MPI worker
          tau=0.01,
          eval_env=None,
          param_noise_adaption_interval=50,
          rb_size=1e6,
          save_interval=5,  # reduce memory footprint
          bc_epochs=0,
          load_path=None,
          demos_path=None,
          bc_teacher_lambda=0.0,
          use_qfilter=False,
          **network_kwargs):
    """Learns policy using imitation (maybe DAgger) w/vectorized environments.

    If we pass other arguments that aren't specified here, they are considered
    as network_kwargs.

    Parameters
    ----------
    noise_type: noise to be added to the behavior policy. The default here is
        NOT the noise type from the paper but 'AdaptiveParamNoiseSpec'. I
        _think_ that if one uses the OU process, we get action noise, but not
        parameter noise. Also, be sure to follow the `<name>_<stddev>`
        convention, as the code will split the argument at the underscores.

    actor_lr: 1e-4  (matches paper)

    critic_lr: 1e-3  (matches paper)

    critic_l2: 1e-2  (matches paper)

    gamma: 0.99  (matches paper)

    batch_size: 64  (matches paper for lower-dim env obs/states)

    tau: 0.01 for soft target updates of actor and critic nets. Paper used 0.001.

    nb_epoch_cycles: number of times we go through this cycle of: (1) get
        rollouts with noise added to the policy and add them to the replay
        buffer, (2) gradient updates for actor/critic, (3) evaluation rollouts
        (if any). AFTER all of these cycles happen, THEN we log statistics.

    nb_rollout_steps: number of steps we take in each parallel env with the
        exploration policy, without training, so this is just to populate the
        replay buffer. More parallel envs *should* mean that we get more
        samples in the buffer between gradient updates of the network, so
        this might need to be environment *and* machine (# of CPUs) specific.

    nb_train_steps: after doing `nb_rollout_steps` in each parallel env, we do
        this many updates; each involves sampling from the replay buffer and
        updating the actor and critic (via lagged target updates).

    nb_eval_steps: 1000; I changed this from the default of 100. Using 1000
        ensures that fixed-length envs like Ant-v2 can get one full episode
        (assuming one parallel env) during evaluation stages.

    eval_env: A separate environment for evaluation only, where no noise is
        applied, similar to how rlkit does it.

    save_interval: Frequency between saving.
    """
    set_global_seeds(seed)

    # Daniel: NOTE/TODO testing, if I can.
    USE_KERAS = False
    # Daniel: should be False unless I'm doing some testing.
    do_valid_tests = False

    # Daniel: this helps to maintain compatibility with PPO2 code. For now
    # we're ignoring it, but we should check that we're always clipping. I
    # changed the nb_epochs to match with PPO2 in that we divide by nenvs.
    if 'limit_act_range' in network_kwargs:
        network_kwargs.pop('limit_act_range')
    nenvs = env.num_envs

    nbatchsize = nenvs * nb_epoch_cycles * nb_rollout_steps
    if total_timesteps is not None:
        assert nb_epochs is None
        nb_epochs = int(total_timesteps) // nbatchsize
    else:
        nb_epochs = 500
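    # Worked example of the arithmetic above (illustrative numbers): with
    # nenvs=1, nb_epoch_cycles=20, nb_rollout_steps=100, and
    # total_timesteps=1e6, we get nbatchsize = 1 * 20 * 100 = 2000 and
    # nb_epochs = 1000000 // 2000 = 500.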

    if MPI is not None:
        rank = MPI.COMM_WORLD.Get_rank()
    else:
        rank = 0

    # we assume symmetric actions.
    nb_actions = env.action_space.shape[-1]
    assert (np.abs(env.action_space.low) == env.action_space.high).all()

    # Form XP (1M steps, same as in paper), and ONLY ACTOR net, no critic.
    # Daniel: force dtype here so we can use uint8 type images.
    assert env.observation_space.low.dtype == env.observation_space.high.dtype
    # Also changing to (100,100) unless we do keras/pretraining, to force smaller images.
    if USE_KERAS:
        obs_shape = env.observation_space.shape
    else:
        obs_shape = (100, 100, 4)

    memory = Memory(limit=int(rb_size),
                    action_shape=env.action_space.shape,
                    observation_shape=obs_shape,
                    dtype=env.observation_space.low.dtype,
                    do_valid=do_valid_tests)
    actor = Actor(nb_actions, network=network, use_keras=USE_KERAS, **network_kwargs)

    max_action = env.action_space.high
    logger.info('scaling actions by {} before executing in env'.format(max_action))

    # The `learn` defaults above have priority over defaults in IMIT class.
    agent = IMIT(actor=actor,
                 memory=memory,
                 observation_shape=obs_shape,
                 action_shape=env.action_space.shape,
                 batch_size=batch_size,
                 actor_l2_reg=actor_l2_reg,
                 actor_lr=actor_lr,
                 use_keras=USE_KERAS)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Prepare everything.
    sess = U.get_session()
    agent.initialize(sess)
    # --------------------------------------------------------------------------
    # Daniel: similar as PPO2 code as `agent` is similar to `model` but has to
    # be initialized explicitly above. Must call after `agent.load` gets
    # created.  Not sure if this works with parameter space noise or with
    # normalization, but I don't plan to resume training (for now). It also has
    # to be *before* the `graph.finalize()` because otherwise we get an error.
    # --------------------------------------------------------------------------
    if load_path is not None:
        logger.info("\nInside IMIT, loading model from: {}".format(load_path))
        agent.load(load_path)
    # --------------------------------------------------------------------------
    sess.graph.finalize()

    # --------------------------------------------------------------------------
    # Daniel: populate replay buffer, followed by behavior cloning stage.
    # But if load_path is not None, then this doesn't make sense -- we want to load.
    # We also don't need to do this if timesteps is 0 (e.g., for playing policy).
    # --------------------------------------------------------------------------
    # OK now we're assuming we do DAgger by default. If we don't want to do
    # DAgger (i.e., pure BC) then let's set time_steps=1 for training.
    # --------------------------------------------------------------------------
    if total_timesteps == 0:
        return agent
    assert seed == 1500, 'We normally want seed 1500, yet: {}'.format(seed)

    if (demos_path is not None and load_path is None):
        _ddpg_demos(demos_path, agent, memory, algo='IMIT')
        assert memory.nb_entries == memory.nb_teach_entries, memory.nb_entries
        if do_valid_tests:
            memory.set_valid_idx()
        checkdir = osp.join(logger.get_dir(), 'checkpoints')
        statsdir = osp.join(logger.get_dir(), 'pretrain_stats')
        os.makedirs(checkdir, exist_ok=True)
        os.makedirs(statsdir, exist_ok=True)

        # Pretrain, based on their training code for some # of minibatches.
        pt_actor_losses = []
        pt_actor_losses_l2 = []
        batches_per_ep = int(memory.nb_entries / batch_size)
        logger.info('Running BC for {} epochs'.format(bc_epochs))
        logger.info('  data size in memory: {}'.format(memory.nb_entries))
        logger.info('  each batch: {}, epoch mbs: {}'.format(batch_size, batches_per_ep))
        if do_valid_tests:
            logger.info('  memory valid idx: {}'.format(memory.nb_valid_items))
        pt_start = time.time()

        for epoch in range(1,bc_epochs+1):
            losses = [] # includes L2 fyi
            losses_l2 = []
            for _ in range(batches_per_ep):
                al, al_l2 = agent.train(return_l2=True)
                losses.append(al)
                losses_l2.append(al_l2)
            pt_actor_losses.append( np.mean(losses) )
            pt_actor_losses_l2.append( np.mean(losses_l2) )

            # Check and save model occasionally.
            if epoch == 1 or epoch % 10 == 0:
                pt_time = (time.time() - pt_start) / 60.
                logger.info('  epoch done: {}, loss over past epoch: {:.4f} (l2: {:.4f})'.format(
                        str(epoch).zfill(4), pt_actor_losses[-1], pt_actor_losses_l2[-1]))
                logger.info('  elapsed time: {:.1f}m'.format(pt_time))
                savepath = osp.join(checkdir, 'bc_epoch_{}'.format(str(epoch).zfill(4)))
                logger.info('Saving model checkpoint to: ', savepath)
                agent.save(savepath)

            # Do validation here.
            if do_valid_tests:
                num_mbs = int(memory.nb_valid_items / batch_size)
                l2_errors = []
                for mb in range(num_mbs):
                    res = memory.get_valid_obs(mb*batch_size, (mb+1)*batch_size)
                    valid_obs = res['obs0']
                    valid_act = res['actions']
                    assert valid_obs.shape == (batch_size,100,100,3), valid_obs.shape
                    acts, _, _, _ = agent.step(obs=valid_obs, apply_noise=False)
                    l2_err_vector = np.mean((valid_act - acts)**2, axis=1)
                    l2_errors.extend(l2_err_vector)

                # Last (possibly partial) minibatch of leftover validation items.
                res = memory.get_valid_obs((mb+1)*batch_size, memory.nb_valid_items)
                valid_obs = res['obs0']
                valid_act = res['actions']
                acts, _, _, _ = agent.step(obs=valid_obs, apply_noise=False)
                l2_err_vector = np.mean((valid_act - acts)**2, axis=1)
                l2_errors.extend(l2_err_vector)

                l2_err_valid = np.mean(l2_errors)
                logger.log('  VALIDATION L2 error: {:.4f}'.format(l2_err_valid))

        pt_time = (time.time() - pt_start) / 60.
        logger.info('losses a: {}'.format(np.array(pt_actor_losses)))
        logger.info('losses a (l2 norm of weights): {}'.format(np.array(pt_actor_losses_l2)))
        losses_pth = osp.join(statsdir, 'bc_losses.pkl')
        losses_l2_pth = osp.join(statsdir, 'bc_losses_l2_only.pkl')
        with open(losses_pth, 'wb') as fh:
            pickle.dump(pt_actor_losses, fh)
        with open(losses_l2_pth, 'wb') as fh:
            pickle.dump(pt_actor_losses_l2, fh)
        logger.info('Finished BC (no DAgger) in {:.1f}m.\n'.format(pt_time))
    # --------------------------------------------------------------------------

    # Back to their code. For cloth, `env.reset()` takes a while so we put it here.
    obs = env.reset()
    if eval_env is not None:
        eval_obs = eval_env.reset()
    nenvs = obs.shape[0]

    # Daniel: this is how to get the cloth state for the teacher; depends on num_env.
    logger.info('nenvs={}, getting cloth points + teacher policy...'.format(nenvs))
    if nenvs == 1:
        # DummyVecEnv has an `envs` "list". Then an extra `.env` to get ClothEnv.
        cloth_env = (env.envs[0]).env
        pts = cloth_env.cloth.pts
        logger.info('singleton env type: {}'.format(cloth_env))
        logger.info('len(points): {}'.format(len(pts)))
        teacher = OracleCornerPolicy()
        teacher.set_env_data_dummy(cloth_env)
        logger.info('teacher attributes: {}'.format(teacher.get_info()))
        teacher_list = [ teacher ]
    else:
        # SubprocVecEnv, not sure if we can obtain envs or if it's safe, so I did this.
        env_attr = env.get_cloth_attributes()
        logger.info('env attributes: {}'.format(env_attr))
        teacher_list = []
        assert len(env_attr) == nenvs, len(env_attr)
        for env_a in env_attr:
            teacher = OracleCornerPolicy()
            teacher.set_env_data_subproc(env_a[0], env_a[1], env_a[2])
            teacher_list.append(teacher)

    # Daniel: Debugging/sanity checks.
    _variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
    U.display_var_info(_variables)
    logger.info("\nInside IMIT, about to start epochs")
    logger.info("nbatchsize: {}, get this in buffer before IMIT updates".format(nbatchsize))
    logger.info("  i.e.: (nenv {}) * (cycles {}) * (nsteps {})".format(
            nenvs, nb_epoch_cycles, nb_rollout_steps))
    logger.info("nb_epochs: {}, number of cycles to use".format(nb_epochs))
    logger.info("eval_env None? {}".format(eval_env is None))
    logger.info("(end of debugging messages)\n")

    # File paths.
    checkdir    = osp.join(logger.get_dir(), 'checkpoints')
    action_dir  = osp.join(logger.get_dir(), 'actions')
    episode_dir = osp.join(logger.get_dir(), 'ep_all_infos')
    os.makedirs(checkdir, exist_ok=True)
    os.makedirs(action_dir, exist_ok=True)
    os.makedirs(episode_dir, exist_ok=True)

    # Daniel: use these two to store past 100 episode history. Report these stats!
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    all_eval_episode_rewards = []

    # reward/step: cumulative quantities for each episode in vecenv.
    # epoch_{actions,qs} will grow without bound, fyi.
    episode_reward = np.zeros(nenvs, dtype=np.float32)  # vector
    episode_step = np.zeros(nenvs, dtype=int)  # vector
    episodes = 0  # scalar
    t = 0  # scalar
    epoch = 0
    start_time = time.time()
    epoch_episode_rewards = []
    epoch_episode_steps = []
    epoch_actions = []
    epoch_episodes = 0

    for epoch in range(nb_epochs):
        mb_actions = []
        mb_epinfos = []
        for cycle in range(nb_epoch_cycles):
            # Daniel: pure data collection (NO noise added) to populate replay buffer.
            # No training until after this, and note the parallel stepping (VecEnv).

            for t_rollout in range(nb_rollout_steps):
                # Predict next action: (#_parallel_envs, ac_dim).
                action, _, _, _ = agent.step(obs)
                if rank == 0 and render:
                    env.render()

                # Before environment stepping happens, we need to run the teacher
                # policy here, because it needs the SAME EXACT env state. The policy
                # does not operate on the obs but on the INTERNAL env.cloth.pts.
                t_actions = []
                if nenvs > 1:
                    cloth_objs = env.get_cloth_objs()
                    for teacher, cloth in zip(teacher_list, cloth_objs):
                        t_act = teacher.get_action(cloth=cloth)
                        t_actions.append(t_act)
                else:
                    for teacher in teacher_list:
                        t_act = teacher.get_action()
                        t_actions.append(t_act)
                t_actions = np.array(t_actions)
                t_actions = np.clip(t_actions, -1.0, 1.0)
                logger.info('agent actions:\n{}'.format(action))
                logger.info('teacher actions:\n{}'.format(t_actions))
                logger.info('L2 diff: \n{:.4f}'.format(
                        np.mean( np.mean((action-t_actions)**2, axis=1) )
                ))

                # max_action is of dimension A, whereas action is dimension
                # (nenvs, A) - the multiplication gets broadcasted to the batch
                # scale for execution in env (as far as DDPG is concerned,
                # every action is in [-1, 1])
                new_obs, r, done, info = env.step(max_action * action)
                r = r.astype(np.float32)
                t += 1
                if rank == 0 and render:
                    env.render()
                episode_reward += r
                episode_step += 1
                epoch_actions.append(action)

                # Daniel: Same as PPO2/DDPG, just checking for end of episodes.
                mb_actions.append(action)
                for inf in info:
                    maybeepinfo = inf.get('episode')
                    if maybeepinfo:
                        mb_epinfos.append(inf)

                # The batched data will be unrolled in memory.py's append. Daniel:
                # unlike DDPG, we only need obs/act, but act is FROM THE EXPERT.
                # Unfortunately there will be duplicate obs/act pairs if the student
                # actions don't touch the cloth, but I'm not sure how to avoid that.
                agent.store_transition(obs, t_actions)
                obs = new_obs

                for d in range(len(done)):
                    if done[d]:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward[d])    # Entire history
                        episode_rewards_history.append(episode_reward[d])  # Last 100 only
                        epoch_episode_steps.append(episode_step[d])
                        episode_reward[d] = 0.
                        episode_step[d] = 0
                        epoch_episodes += 1
                        episodes += 1

            # Train.
            epoch_actor_losses = []
            epoch_actor_losses_l2 = []
            for t_train in range(nb_train_steps):
                al, al_l2 = agent.train(return_l2=True)
                epoch_actor_losses.append(al)
                epoch_actor_losses_l2.append(al_l2)

        if MPI is not None:
            mpi_size = MPI.COMM_WORLD.Get_size()
        else:
            mpi_size = 1

        # Log stats.
        # XXX shouldn't call np.mean on variable length lists
        duration = time.time() - start_time
        combined_stats = {}
        combined_stats['memory/nb_entries'] = memory.nb_entries
        combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
        combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
        combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
        combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
        combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
        combined_stats['train/loss_actor_l2'] = np.mean(epoch_actor_losses_l2)
        combined_stats['total/duration'] = duration
        combined_stats['total/steps_per_second'] = float(t) / float(duration)
        combined_stats['total/episodes'] = episodes
        combined_stats['rollout/episodes'] = epoch_episodes
        combined_stats['rollout/actions_std'] = np.std(epoch_actions)

        def as_scalar(x):
            if isinstance(x, np.ndarray):
                assert x.size == 1
                return x[0]
            elif np.isscalar(x):
                return x
            else:
                raise ValueError('expected scalar, got %s'%x)

        # Total statistics.
        combined_stats_sums = np.array([ np.array(x).flatten()[0] for x in combined_stats.values()])
        if MPI is not None:
            combined_stats_sums = MPI.COMM_WORLD.allreduce(combined_stats_sums)
        combined_stats = {k : v / mpi_size for (k,v) in
                zip(combined_stats.keys(), combined_stats_sums)}
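        # The allreduce above sums each stat (its first element, per the
        # flatten()[0] call) across MPI workers with the default SUM op, so
        # dividing by mpi_size yields a per-worker average.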
        combined_stats['total/epochs'] = epoch + 1
        combined_stats['total/steps_per_env'] = t

        for key in sorted(combined_stats.keys()):
            logger.record_tabular(key, combined_stats[key])

        if rank == 0:
            logger.dump_tabular()
        logger.info('')
        logdir = logger.get_dir()
        if rank == 0 and logdir:
            if hasattr(env, 'get_state'):
                with open(osp.join(logdir, 'env_state.pkl'), 'wb') as f:
                    pickle.dump(env.get_state(), f)

        # Daniel: we can use cycle or epoch for this if condition ... kind of annoying but w/e.
        if cycle % save_interval == 0:
            logger.info('We are now saving stuff!!')
            savepath = osp.join(checkdir, '%.5i'%epoch)
            logger.info('Saving model checkpoint to: ', savepath)
            agent.save(savepath)
            # ------------------------------------------------------------------
            # Daniel: extra stuff for debugging.
            mb_actions = _sf01(np.asarray(mb_actions))
            act_savepath = osp.join(action_dir, 'actions_%.5i.pkl'%epoch)
            epi_savepath = osp.join(episode_dir, 'infos_%.5i.pkl'%epoch)
            with open(act_savepath, 'wb') as fh:
                pickle.dump(mb_actions, fh)
            with open(epi_savepath, 'wb') as fh:
                pickle.dump(mb_epinfos, fh)

        # Daniel: we were not resetting earlier. Actually there are other
        # epoch_stats which we might consider resetting here?
        epoch_episodes = 0

    return agent
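A hedged invocation sketch for the `learn` function above; everything here is illustrative (the `make_cloth_env` factory, the network name, and the paths are assumptions), it just shows how the vectorized env and the seed/timestep conventions in the code fit together.

from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

venv = DummyVecEnv([make_cloth_env])      # make_cloth_env: hypothetical env factory
agent = learn(network='cloth_cnn',        # assumed network name, as in Example #1
              env=venv,
              seed=1500,                  # the code asserts seed == 1500 when training
              total_timesteps=0,          # 0 -> build (and optionally load) the agent, then return
              demos_path=None,
              load_path='/path/to/checkpoint')  # placeholder path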
Example #5
def learn(*, network, env, total_timesteps, eval_env = None,
            seed=None, nsteps=2048, ent_coef=0.0, lr=3e-4,
            vf_coef=0.5,  max_grad_norm=0.5, gamma=0.99, lam=0.95,
            log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2,
            save_interval=0, load_path=None, model_fn=None, **network_kwargs):
    '''Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347)

    Parameters:
    ----------

    network:                          policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
                                      specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
                                      tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
                                      neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
                                      See common/models.py/lstm for more details on using recurrent nets in policies

    env: baselines.common.vec_env.VecEnv     environment. Needs to be vectorized for parallel environment simulation.
                                      The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class.

    nsteps: int                       number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
                                      nenv is number of environment copies simulated in parallel)
                                      Daniel: should be `T` in the paper. Atari defaults are 128 as in the paper.

    total_timesteps: int              number of timesteps (i.e. number of actions taken in the environment)

    ent_coef: float                   policy entropy coefficient in the optimization objective

    lr: float or function             learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the
                                      training and 0 is the end of the training.

    vf_coef: float                    value function loss coefficient in the optimization objective
                                      Daniel: 0.5 by default but the PPO paper uses 1.0 for Atari games.

    max_grad_norm: float or None      gradient norm clipping coefficient

    gamma: float                      discounting factor

    lam: float                        advantage estimation discounting factor (lambda in the paper)

    log_interval: int                 number of updates between logging events

    nminibatches: int                 number of training minibatches per update. For recurrent policies,
                                      should be smaller or equal than number of environments run in parallel.

    noptepochs: int                   number of training epochs per update
                                      Daniel: 4 by default but the PPO paper uses 3 (for Atari games), etc.

    cliprange: float or function      clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training
                                      and 0 is the end of the training. Daniel: for `tf.clip_by_value(ratio, 1-cliprange, 1+cliprange)`

    save_interval: int                number of updates between saving events

    load_path: str                    path to load the model from

    **network_kwargs:                 keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
                                      For instance, 'mlp' network architecture has arguments num_hidden and num_layers.
    '''
    set_global_seeds(seed)

    if isinstance(lr, float): lr = constfn(lr)
    else: assert callable(lr)
    if isinstance(cliprange, float): cliprange = constfn(cliprange)
    else: assert callable(cliprange)
    total_timesteps = int(total_timesteps)
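    # Note on schedules (illustrative): the `frac` passed later runs from 1.0
    # (start of training) toward 0.0 (end), so a callable like
    # lr=lambda f: 3e-4 * f decays the learning rate linearly, while a float
    # such as lr=3e-4 is wrapped by constfn into a constant schedule.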

    # Daniel: hacky within PPO2 solution for limiting action ranges.
    if 'limit_act_range' in network_kwargs:
        limit_act_range = network_kwargs['limit_act_range']
        network_kwargs.pop('limit_act_range')
    else:
        limit_act_range = False

    policy = build_policy(env, network, limit_act_range=limit_act_range, **network_kwargs)

    # Get the nb of env
    nenvs = env.num_envs

    # Get state_space and action_space
    ob_space = env.observation_space
    ac_space = env.action_space

    # Calculate the batch_size
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches
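    # Worked example (illustrative numbers): with nenvs=8, nsteps=2048, and
    # nminibatches=4, nbatch = 8 * 2048 = 16384 samples per update and
    # nbatch_train = 16384 // 4 = 4096 samples per gradient step.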

    # Instantiate the model object (that creates act_model and train_model)
    if model_fn is None:
        from baselines.ppo2.model import Model
        model_fn = Model

    model = model_fn(policy=policy, ob_space=ob_space, ac_space=ac_space,
                     nbatch_act=nenvs, nbatch_train=nbatch_train,
                     nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
                     max_grad_norm=max_grad_norm)

    if load_path is not None:
        logger.info("\nInside ppo2, loading model from: {}".format(load_path))
        model.load(load_path)

    # Daniel: debugging and sanity checks
    _variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
    tf_util.display_var_info(_variables)

    # Instantiate the runner object (Daniel: calls `env.reset()` so can take a while for cloth)
    # Also, I'm going to assume that if total_timesteps=0 then we don't waste time creating this.
    if total_timesteps > 0:
        runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)
    if eval_env is not None:
        eval_runner = Runner(env=eval_env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)

    epinfobuf = deque(maxlen=100)
    if eval_env is not None:
        eval_epinfobuf = deque(maxlen=100)

    # Start total timer
    tfirststart = time.perf_counter()
    nupdates = total_timesteps//nbatch

    # Daniel: debugging and sanity checks
    logger.info("\nInside ppo2, before updates (`env.reset()` called before this)")
    logger.info("  nsteps: {}, each env in VecEnv does this many to get minibatch".format(nsteps))
    logger.info("  nbatch: {}, i.e., nsteps * nenv, size of data from (get_minibatch)".format(nbatch))
    logger.info("  nbatch_train: {}, batch size for actual gradient update within epoch".format(nbatch_train))
    logger.info("  noptepochs: {}, number of epochs over collected minibatch for PPO updates".format(noptepochs))
    logger.info("  nupdates: {}, number of (get_minibatch, update_net) cycles".format(nupdates))
    logger.info("  our model_fn class: {}".format(model_fn))
    logger.info("(end of debugging messages)\n")

    for update in range(1, nupdates+1):
        assert nbatch % nminibatches == 0
        # Start timer
        tstart = time.perf_counter()
        frac = 1.0 - (update - 1.0) / nupdates
        # Calculate the learning rate
        lrnow = lr(frac)
        # Calculate the cliprange
        cliprangenow = cliprange(frac)
        # Get minibatch
        obs, returns, masks, actions, values, neglogpacs, states, epinfos, ep_all_infos = runner.run() #pylint: disable=E0632
        if eval_env is not None:
            eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos, eval_ep_all_infos = eval_runner.run() #pylint: disable=E0632

        epinfobuf.extend(epinfos)
        if eval_env is not None:
            eval_epinfobuf.extend(eval_epinfos)

        # Here, for each minibatch, we calculate the loss and append it.
        mblossvals = []
        if states is None: # nonrecurrent version
            # Index of each element of batch_size
            # Create the indices array
            inds = np.arange(nbatch)
            for _ in range(noptepochs):
                # Randomize the indexes
                np.random.shuffle(inds)
                # 0 to batch_size with batch_train_size step
                for start in range(0, nbatch, nbatch_train):
                    end = start + nbatch_train
                    mbinds = inds[start:end]
                    slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                    mblossvals.append(model.train(lrnow, cliprangenow, *slices))
        else: # recurrent version
            assert nenvs % nminibatches == 0
            envsperbatch = nenvs // nminibatches
            envinds = np.arange(nenvs)
            flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
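            # flatinds[i] holds the flat indices of env i's nsteps transitions;
            # assuming the runner flattens batches in env-major order (as its
            # _sf01-style reshape suggests), slicing rows of flatinds keeps
            # whole per-env trajectories together, as the recurrent state requires.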
            for _ in range(noptepochs):
                np.random.shuffle(envinds)
                for start in range(0, nenvs, envsperbatch):
                    end = start + envsperbatch
                    mbenvinds = envinds[start:end]
                    mbflatinds = flatinds[mbenvinds].ravel()
                    slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                    mbstates = states[mbenvinds]
                    mblossvals.append(model.train(lrnow, cliprangenow, *slices, mbstates))

        # Feedforward --> get losses --> update
        lossvals = np.mean(mblossvals, axis=0)
        # End timer
        tnow = time.perf_counter()
        # Calculate the fps (frame per second)
        fps = int(nbatch / (tnow - tstart))
        if update % log_interval == 0 or update == 1:
            # Calculates if the value function is a good predictor of the returns (ev close to 1)
            # or if it's just worse than predicting nothing (ev <= 0)
            ev = explained_variance(values, returns)
            logger.logkv("serial_timesteps", update*nsteps)
            logger.logkv("nupdates", update)
            logger.logkv("total_timesteps", update*nbatch)
            logger.logkv("fps", fps)
            logger.logkv("explained_variance", float(ev))
            logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf]))
            if eval_env is not None:
                logger.logkv('eval_eprewmean', safemean([epinfo['r'] for epinfo in eval_epinfobuf]) )
                logger.logkv('eval_eplenmean', safemean([epinfo['l'] for epinfo in eval_epinfobuf]) )
            logger.logkv('time_elapsed', tnow - tfirststart)
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                logger.logkv(lossname, lossval)
            if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
                logger.dumpkvs()
        if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir() and (MPI is None or MPI.COMM_WORLD.Get_rank() == 0):
            checkdir = osp.join(logger.get_dir(), 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = osp.join(checkdir, '%.5i'%update)
            logger.info('Saving model checkpoint to: ', savepath)
            model.save(savepath)
            # ------------------------------------------------------------------
            # Daniel: extra stuff for debugging PPO on cloth, actions and infos for each episode.
            logstd_vals = model.act_model.get_logstd_values()
            action_dir  = osp.join(logger.get_dir(), 'actions')
            episode_dir = osp.join(logger.get_dir(), 'ep_all_infos')
            logstd_dir  = osp.join(logger.get_dir(), 'logstd')
            os.makedirs(action_dir, exist_ok=True)
            os.makedirs(episode_dir, exist_ok=True)
            os.makedirs(logstd_dir, exist_ok=True)
            act_savepath = osp.join(action_dir, 'actions_%.5i.pkl'%update)
            epi_savepath = osp.join(episode_dir, 'infos_%.5i.pkl'%update)
            std_savepath = osp.join(logstd_dir, 'logstd_%.5i.pkl'%update)
            with open(act_savepath, 'wb') as fh:
                pickle.dump(actions, fh)
            with open(epi_savepath, 'wb') as fh:
                pickle.dump(ep_all_infos, fh)
            with open(std_savepath, 'wb') as fh:
                pickle.dump(logstd_vals, fh)
            # ------------------------------------------------------------------
    return model
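The PPO example above assumes a small `constfn` helper for wrapping constant hyperparameters; below is a minimal sketch of an equivalent wrapper, written here for illustration rather than copied from baselines.

def constfn(val):
    """Wrap a constant so it can be called like a schedule f(frac) -> value."""
    def f(_):
        return val
    return f

# Either form is then accepted by `learn` for lr / cliprange:
lr_constant = constfn(3e-4)                  # constant learning rate
clip_schedule = lambda frac: 0.2 * frac      # linearly decaying clip range (illustrative)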
Example #6
def learn(
        network,
        env,
        seed=None,
        total_timesteps=None,
        nb_epochs=None,  # with default settings, perform 1M steps total
        nb_epoch_cycles=20,
        nb_rollout_steps=100,
        reward_scale=1.0,
        render=False,
        render_eval=False,
        noise_type='adaptive-param_0.2',
        normalize_returns=False,
        normalize_observations=True,
        actor_l2_reg=0.0,
        critic_l2_reg=1e-2,
        actor_lr=1e-4,
        critic_lr=1e-3,
        popart=False,
        gamma=0.99,
        clip_norm=None,
        nb_train_steps=50,  # per epoch cycle and MPI worker
        nb_eval_steps=1000,
        batch_size=64,  # per MPI worker
        tau=0.01,
        eval_env=None,
        param_noise_adaption_interval=50,
        rb_size=1e6,
        save_interval=1,
        pretrain_epochs=0,
        load_path=None,
        demos_path=None,
        bc_teacher_lambda=0.0,
        use_qfilter=False,
        **network_kwargs):
    """Learns policy using DDPG, with vectorized environments.

    If we pass other arguments that aren't specified here, they are considered
    as network_kwargs.

    Parameters
    ----------
    noise_type: noise to be added to the behavior policy. The default here is
        NOT the noise type from the paper but 'AdaptiveParamNoiseSpec'. I
        _think_ that if one uses the OU process, we get action noise, but not
        parameter noise. Also, be sure to follow the `<name>_<stddev>`
        convention, as the code will split the argument at the underscores.

    actor_lr: 1e-4  (matches paper)

    critic_lr: 1e-3  (matches paper)

    critic_l2: 1e-2  (matches paper)

    gamma: 0.99  (matches paper)

    batch_size: 64  (matches paper for lower-dim env obs/states)

    tau: 0.01 for soft target updates of actor and critic nets. Paper used 0.001.

    nb_epoch_cycles: number of times we go through this cycle of: (1) get
        rollouts with noise added to the policy and add them to the replay
        buffer, (2) gradient updates for actor/critic, (3) evaluation rollouts
        (if any). AFTER all of these cycles happen, THEN we log statistics.

    nb_rollout_steps: number of steps we take in each parallel env with the
        exploration policy, without training, so this is just to populate the
        replay buffer. More parallel envs *should* mean that we get more
        samples in the buffer between gradient updates of the network, so
        this might need to be environment *and* machine (# of CPUs) specific.

    nb_train_steps: after doing `nb_rollout_steps` in each parallel env, we do
        this many updates; each involves sampling from the replay buffer and
        updating the actor and critic (via lagged target updates).

    nb_eval_steps: 1000; I changed this from the default of 100. Using 1000
        ensures that fixed-length envs like Ant-v2 can get one full episode
        (assuming one parallel env) during evaluation stages.

    eval_env: A separate environment for evaluation only, where no noise is
        applied, similar to how rlkit does it.

    save_interval: Frequency between saving.
    """
    set_global_seeds(seed)

    # Daniel: this helps to maintain compatibility with PPO2 code. For now
    # we're ignoring it, but we should check that we're always clipping. I
    # changed the nb_epochs to match with PPO2 in that we divide by nenvs.
    if 'limit_act_range' in network_kwargs:
        network_kwargs.pop('limit_act_range')
    nenvs = env.num_envs

    nbatchsize = nenvs * nb_epoch_cycles * nb_rollout_steps
    if total_timesteps is not None:
        assert nb_epochs is None
        nb_epochs = int(total_timesteps) // nbatchsize
    else:
        nb_epochs = 500

    if MPI is not None:
        rank = MPI.COMM_WORLD.Get_rank()
    else:
        rank = 0

    # we assume symmetric actions.
    nb_actions = env.action_space.shape[-1]
    assert (np.abs(env.action_space.low) == env.action_space.high).all()

    # Form XP (1M steps, same as in paper), and critic/actor networks.
    # Daniel: force dtype here so we can use uint8 type images.
    assert env.observation_space.low.dtype == env.observation_space.high.dtype
    memory = Memory(limit=int(rb_size),
                    action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape,
                    dtype=env.observation_space.low.dtype)
    critic = Critic(network=network, **network_kwargs)
    actor = Actor(nb_actions, network=network, **network_kwargs)

    action_noise = None
    param_noise = None
    if noise_type is not None:
        for current_noise_type in noise_type.split(','):
            current_noise_type = current_noise_type.strip()
            if current_noise_type == 'none':
                pass
            elif 'adaptive-param' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                param_noise = AdaptiveParamNoiseSpec(
                    initial_stddev=float(stddev),
                    desired_action_stddev=float(stddev))
            elif 'normal' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                #action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                #                                 sigma=float(stddev)*np.ones(nb_actions))
                #if nenvs > 1:
                # Daniel: adding this to replace the former.
                action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                                 sigma=float(stddev) *
                                                 np.ones(nb_actions),
                                                 shape=(nenvs, nb_actions))
            elif 'ou' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = OrnsteinUhlenbeckActionNoise(
                    mu=np.zeros(nb_actions),
                    sigma=float(stddev) * np.ones(nb_actions))
            else:
                raise RuntimeError(
                    'unknown noise type "{}"'.format(current_noise_type))

    max_action = env.action_space.high
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))

    # The `learn` defaults above have priority over defaults in DDPG class.
    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space.shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 actor_l2_reg=actor_l2_reg,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale,
                 bc_teacher_lambda=bc_teacher_lambda,
                 use_qfilter=use_qfilter)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Prepare everything.
    sess = U.get_session()
    agent.initialize(sess)
    # --------------------------------------------------------------------------
    # Daniel: similar as PPO2 code as `agent` is similar to `model` but has to
    # be initialized explicitly above. Must call after `agent.load` gets
    # created.  Not sure if this works with parameter space noise or with
    # normalization, but I don't plan to resume training (for now). It also has
    # to be *before* the `graph.finalize()` because otherwise we get an error.
    # --------------------------------------------------------------------------
    if load_path is not None:
        logger.info("\nInside ddpg, loading model from: {}".format(load_path))
        agent.load(load_path)
    # --------------------------------------------------------------------------
    sess.graph.finalize()
    agent.reset()

    # --------------------------------------------------------------------------
    # Daniel: populate replay buffer, followed by pretraining stage.
    # But if load_path is not None, then this doesn't make sense -- we want to load.
    # We also don't need to do this if timesteps is 0 (e.g., for playing policy).
    # --------------------------------------------------------------------------
    if total_timesteps == 0:
        return agent
    assert seed == 1500, 'We normally want seed 1500, yet: {}'.format(seed)

    if (demos_path is not None and load_path is None):
        _ddpg_demos(demos_path, agent, memory)
        assert memory.nb_entries == memory.nb_teach_entries, memory.nb_entries
        checkdir = osp.join(logger.get_dir(), 'checkpoints')
        os.makedirs(checkdir, exist_ok=True)

        # Pretrain, based on their training code for some # of minibatches.
        pt_actor_losses = []
        pt_critic_losses = []
        batches_per_ep = int(memory.nb_entries / batch_size)
        logger.info(
            'Running pre-training for {} epochs'.format(pretrain_epochs))
        logger.info('  data size in memory: {}'.format(memory.nb_entries))
        logger.info('  each batch: {}, epoch mbs: {}'.format(
            batch_size, batches_per_ep))
        pt_start = time.time()

        for epoch in range(1, pretrain_epochs + 1):
            c_losses = []
            a_losses = []
            for _ in range(batches_per_ep):
                cl, al = agent.train(during_pretrain=True)
                agent.update_target_net()
                c_losses.append(cl)
                a_losses.append(al)
            pt_critic_losses.append(np.mean(c_losses))
            pt_actor_losses.append(np.mean(a_losses))

            # Check and save model occasionally.
            if epoch == 1 or epoch % 5 == 0:
                pt_time = (time.time() - pt_start) / 60.
                logger.info(
                    '  epoch done: {}, loss over past epoch: {:.4f}'.format(
                        str(epoch).zfill(4), pt_actor_losses[-1]))
                logger.info('  critic loss over past epoch: {:.4f}'.format(
                    pt_critic_losses[-1]))
                logger.info('  elapsed time: {:.1f}m'.format(pt_time))
                savepath = osp.join(
                    checkdir, 'pretrain_epoch_{}'.format(str(epoch).zfill(4)))
                logger.info('Saving model checkpoint to: ', savepath)
                agent.save(savepath)

        pt_time = (time.time() - pt_start) / 60.
        logger.info('losses a: {}'.format(np.array(pt_actor_losses)))
        logger.info('losses c: {}'.format(np.array(pt_critic_losses)))
        logger.info('Finished loading teacher samples + pre-training.')
        logger.info('Pre-training took {:.1f}m.\n'.format(pt_time))
    # --------------------------------------------------------------------------

    # Back to their code. For cloth, `env.reset()` takes a while so we put it here.
    obs = env.reset()
    if eval_env is not None:
        eval_obs = eval_env.reset()
    nenvs = obs.shape[0]

    # Daniel: Debugging/sanity checks.
    _variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
    U.display_var_info(_variables)
    logger.info("\nInside DDPG, about to start epochs")
    logger.info(
        "nbatchsize: {}, get this in buffer before DDPG updates".format(
            nbatchsize))
    logger.info("  i.e.: (nenv {}) * (cycles {}) * (nsteps {})".format(
        nenvs, nb_epoch_cycles, nb_rollout_steps))
    logger.info("nb_epochs: {}, number of cycles to use".format(nb_epochs))
    logger.info("eval_env None? {}".format(eval_env is None))
    logger.info("(end of debugging messages)\n")

    # File paths.
    checkdir = osp.join(logger.get_dir(), 'checkpoints')
    action_dir = osp.join(logger.get_dir(), 'actions')
    episode_dir = osp.join(logger.get_dir(), 'ep_all_infos')
    os.makedirs(checkdir, exist_ok=True)
    os.makedirs(action_dir, exist_ok=True)
    os.makedirs(episode_dir, exist_ok=True)

    # Daniel: use these two to store past 100 episode history. Report these stats!
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    all_eval_episode_rewards = []

    # reward/step: cumulative quantities for each episode in vecenv.
    # epoch_{actions,qs} will grow without bound, fyi.
    episode_reward = np.zeros(nenvs, dtype=np.float32)  #vector
    episode_step = np.zeros(nenvs, dtype=int)  # vector
    episodes = 0  #scalar
    t = 0  # scalar
    epoch = 0
    start_time = time.time()
    epoch_episode_rewards = []
    epoch_episode_steps = []
    epoch_actions = []
    epoch_qs = []
    epoch_episodes = 0

    for epoch in range(nb_epochs):
        mb_actions = []
        mb_epinfos = []
        for cycle in range(nb_epoch_cycles):
            # Perform rollouts.
            if nenvs > 1:
                # if simulating multiple envs in parallel, impossible to reset
                # agent at the end of the episode in each of the environments,
                # so resetting here instead
                agent.reset()

            # Daniel: pure data collection (noise added) to populate replay buffer.
            # No training until after this, and note the parallel stepping (VecEnv).
            for t_rollout in range(nb_rollout_steps):
                # Predict next action.
                # action: (#_parallel_envs, ac_dim), q: (#_parallel_envs, 1)
                action, q, _, _ = agent.step(obs,
                                             apply_noise=True,
                                             compute_Q=True)
                # Execute next action.
                if rank == 0 and render:
                    env.render()

                # max_action is of dimension A, whereas action is dimension
                # (nenvs, A) - the multiplication gets broadcasted to the batch
                # scale for execution in env (as far as DDPG is concerned,
                # every action is in [-1, 1])
                new_obs, r, done, info = env.step(max_action * action)
                r = r.astype(np.float32)
                # note these outputs are batched from vecenv (first dim = batch).
                t += 1
                if rank == 0 and render:
                    env.render()
                episode_reward += r
                episode_step += 1

                # Book-keeping. (Daniel: agent.train() doesn't require these two lists)
                epoch_actions.append(action)
                epoch_qs.append(q)
                # Daniel: Same as PPO2 code.
                mb_actions.append(action)
                for inf in info:
                    maybeepinfo = inf.get('episode')
                    if maybeepinfo:
                        mb_epinfos.append(inf)
                #the batched data will be unrolled in memory.py's append.
                agent.store_transition(obs, action, r, new_obs, done)
                obs = new_obs

                for d in range(len(done)):
                    if done[d]:
                        # Episode done.
                        epoch_episode_rewards.append(
                            episode_reward[d])  # Entire history
                        episode_rewards_history.append(
                            episode_reward[d])  # Last 100 only
                        epoch_episode_steps.append(episode_step[d])
                        episode_reward[d] = 0.
                        episode_step[d] = 0
                        epoch_episodes += 1
                        episodes += 1
                        if nenvs == 1:
                            agent.reset()

            # Train.
            epoch_actor_losses = []
            epoch_critic_losses = []
            epoch_adaptive_distances = []
            for t_train in range(nb_train_steps):
                # Adapt param noise, if necessary.
                if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                    distance = agent.adapt_param_noise()
                    epoch_adaptive_distances.append(distance)

                cl, al = agent.train()
                epoch_critic_losses.append(cl)
                epoch_actor_losses.append(al)
                agent.update_target_net()

            # Evaluate. (Daniel: note that no noise is applied here.)
            # Also it seems like episodes do not naturally reset before this starts?
            # Also, unlike epoch_episode_reward, we create eval_episode_reward here ...
            eval_episode_rewards = []
            eval_qs = []
            if eval_env is not None:
                logger.info('Now on the eval_env for {} steps...'.format(
                    nb_eval_steps))
                nenvs_eval = eval_obs.shape[0]
                eval_episode_reward = np.zeros(nenvs_eval, dtype=np.float32)
                for t_rollout in range(nb_eval_steps):
                    eval_action, eval_q, _, _ = agent.step(eval_obs,
                                                           apply_noise=False,
                                                           compute_Q=True)
                    # scale for execution in env (for DDPG, every action is in [-1, 1])
                    eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                        max_action * eval_action)
                    if render_eval:
                        eval_env.render()
                    eval_episode_reward += eval_r

                    eval_qs.append(eval_q)
                    for d in range(len(eval_done)):
                        if eval_done[d]:
                            eval_episode_rewards.append(eval_episode_reward[d])
                            eval_episode_rewards_history.append(
                                eval_episode_reward[d])
                            all_eval_episode_rewards.append(
                                eval_episode_reward[d])
                            eval_episode_reward[d] = 0.0  # Daniel: reset for next episode.

        if MPI is not None:
            mpi_size = MPI.COMM_WORLD.Get_size()
        else:
            mpi_size = 1

        # Log stats.
        # XXX shouldn't call np.mean on variable length lists
        duration = time.time() - start_time
        stats = agent.get_stats()
        combined_stats = stats.copy()
        combined_stats['memory/nb_entries'] = memory.nb_entries
        combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
        combined_stats['rollout/return_history'] = np.mean(
            episode_rewards_history)
        combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
        combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
        combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
        combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
        combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
        combined_stats['train/param_noise_distance'] = np.mean(
            epoch_adaptive_distances)
        combined_stats['total/duration'] = duration
        combined_stats['total/steps_per_second'] = float(t) / float(duration)
        combined_stats['total/episodes'] = episodes
        combined_stats['rollout/episodes'] = epoch_episodes
        combined_stats['rollout/actions_std'] = np.std(epoch_actions)
        # Evaluation statistics. (Daniel: use eval/return_history for plots)
        if eval_env is not None:
            combined_stats['eval/return'] = np.mean(eval_episode_rewards)
            combined_stats['eval/return_history'] = np.mean(
                eval_episode_rewards_history)
            combined_stats['eval/Q'] = eval_qs
            combined_stats['eval/episodes'] = len(eval_episode_rewards)

        def as_scalar(x):
            if isinstance(x, np.ndarray):
                assert x.size == 1
                return x[0]
            elif np.isscalar(x):
                return x
            else:
                raise ValueError('expected scalar, got %s' % x)

        combined_stats_sums = np.array(
            [np.array(x).flatten()[0] for x in combined_stats.values()])
        if MPI is not None:
            combined_stats_sums = MPI.COMM_WORLD.allreduce(combined_stats_sums)

        combined_stats = {
            k: v / mpi_size
            for (k, v) in zip(combined_stats.keys(), combined_stats_sums)
        }

        # Total statistics.
        combined_stats['total/epochs'] = epoch + 1
        combined_stats['total/steps_per_env'] = t

        for key in sorted(combined_stats.keys()):
            logger.record_tabular(key, combined_stats[key])

        if rank == 0:
            logger.dump_tabular()
        logger.info('')
        logdir = logger.get_dir()
        if rank == 0 and logdir:
            if hasattr(env, 'get_state'):
                with open(osp.join(logdir, 'env_state.pkl'), 'wb') as f:
                    pickle.dump(env.get_state(), f)
            if eval_env and hasattr(eval_env, 'get_state'):
                with open(osp.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
                    pickle.dump(eval_env.get_state(), f)
            # Daniel: arguable, we can save all episodes but hard if we don't know the steps.
            #if eval_env:
            #    with open(os.path.join(logdir, 'all_eval_episode_rewards.pkl'), 'wb') as f:
            #        pickle.dump(all_eval_episode_rewards, f)

        # Daniel: we can use cycle or epoch for this if condition ... kind of annoying but w/e.
        if cycle % save_interval == 0:
            logger.info('We are now saving stuff!!')
            savepath = osp.join(checkdir, '%.5i' % epoch)
            logger.info('Saving model checkpoint to: ', savepath)
            agent.save(savepath)
            # ------------------------------------------------------------------
            # Daniel: extra stuff for debugging PPO on cloth, actions and infos for each episode.
            mb_actions = _sf01(np.asarray(mb_actions))
            act_savepath = osp.join(action_dir, 'actions_%.5i.pkl' % epoch)
            epi_savepath = osp.join(episode_dir, 'infos_%.5i.pkl' % epoch)
            with open(act_savepath, 'wb') as fh:
                pickle.dump(mb_actions, fh)
            with open(epi_savepath, 'wb') as fh:
                pickle.dump(mb_epinfos, fh)

        # Daniel: we were not resetting earlier. Actually there are other
        # epoch_stats which we might consider resetting here?
        epoch_episodes = 0

    return agent
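For reference, the `noise_type` strings parsed near the top of this example map to noise objects roughly as follows (a summary of the loop above; the stddev values are illustrative).

# 'adaptive-param_0.2' -> AdaptiveParamNoiseSpec(initial_stddev=0.2, desired_action_stddev=0.2)
# 'normal_0.1'         -> NormalActionNoise(mu=zeros, sigma=0.1 * ones, shape=(nenvs, nb_actions))
# 'ou_0.2'             -> OrnsteinUhlenbeckActionNoise(mu=zeros, sigma=0.2 * ones)
# 'none'               -> no noise added
# Comma-separated entries (e.g. 'adaptive-param_0.2,ou_0.2') are split and handled
# in order, so a later action-noise entry overwrites an earlier one.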