Example #1
    def setup_replay_buffer(self):
        """Setup experiental memory unit"""
        logger.info("setting up replay buffer")
        # In the discrete actions case, we store the acs indices
        if isinstance(self.ac_space, spaces.Box):
            ac_shape_ = self.ac_shape
        elif isinstance(self.ac_space, spaces.Discrete):
            ac_shape_ = ()
        else:
            raise RuntimeError("ac space is neither Box nor Discrete")
        xp_params = {
            'limit': self.hps.mem_size,
            'ob_shape': self.ob_shape,
            'ac_shape': ac_shape_
        }
        extra_xp_params = {
            'alpha': self.hps.alpha,
            'beta': self.hps.beta,
            'ranked': self.hps.ranked
        }

        if self.hps.prioritized_replay:
            if self.hps.unreal:  # Unreal prioritized experience replay
                self.replay_buffer = XP.UnrealRB(**xp_params)
            else:  # Vanilla prioritized experience replay
                self.replay_buffer = XP.PrioritizedRB(**xp_params,
                                                      **extra_xp_params)
        else:  # Vanilla experience replay
            self.replay_buffer = XP.RB(**xp_params)
        # Summarize replay buffer creation (relies on `__repr__` method)
        logger.info("  {} configured".format(self.replay_buffer))
Example #2
    def setup_param_noise(self):
        """Setup two separate perturbed actors, one which be used only for interacting
        with the environment, while the other will be used exclusively for std adaption.
        We use two instead of one for clarity-related purposes.
        """
        # Define parameter corresponding to the current parameter noise stddev
        self.pn_cur_std = self.param_noise.cur_std  # real value, not the placeholder

        logger.info("setting up param noise")
        # Configure parameter-noise-perturbed ('pnp') actor
        # Use: interact with the environment
        self.pnp_actor_pred = self.clip_acs(self.pnp_actor(self.obz0))
        self.p_actor_updates = get_p_actor_updates(self.actor, self.pnp_actor,
                                                   self.pn_std)

        logger.info("setting up adaptive param noise")
        # Configure adaptive-parameter-noise-perturbed ('apnp') actor
        # Use: adapt the standard deviation
        self.apnp_actor_pred = self.clip_acs(self.apnp_actor(self.obz0))
        self.a_p_actor_updates = get_p_actor_updates(self.actor,
                                                     self.apnp_actor,
                                                     self.pn_std)
        self.a_dist = tf.sqrt(
            tf.reduce_mean(tf.square(self.actor_pred - self.apnp_actor_pred)))

        # Create callable objects
        # Act (and compute Q) according to the parameter-noise-perturbed actor

        if isinstance(self.ac_space, spaces.Box):
            self.p_act = TheanoFunction(inputs=[self.obs0],
                                        outputs=[self.pnp_actor_pred])
            self.p_act_q = TheanoFunction(
                inputs=[self.obs0],
                outputs=[self.pnp_actor_pred, self.critic_pred_w_actor])
        elif isinstance(self.ac_space, spaces.Discrete):
            # Note: actor network outputs softmax -> take argmax to pick one action
            self.pnp_actor_pred_ = tf.argmax(self.pnp_actor_pred, axis=-1)
            self.p_act = TheanoFunction(inputs=[self.obs0],
                                        outputs=[self.pnp_actor_pred_])
            self.p_act_q = TheanoFunction(
                inputs=[self.obs0],
                outputs=[self.pnp_actor_pred_, self.critic_pred_w_actor])

        # Create distance between actor and adaptive-parameter-noise-perturbed actor predictions
        self.get_a_p_dist = TheanoFunction(inputs=[self.obs0, self.pn_std],
                                           outputs=self.a_dist)
        # Retrieve parameter-noise-perturbation updates
        self.apply_p_actor_updates = TheanoFunction(
            inputs=[self.pn_std], outputs=[self.p_actor_updates])
        # Retrieve adaptive-parameter-noise-perturbation updates
        self.apply_a_p_actor_updates = TheanoFunction(
            inputs=[self.pn_std], outputs=[self.a_p_actor_updates])
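Note: the perturbation std tracked by `pn_cur_std` is typically adjusted after comparing the measured `a_dist` against the target `delta` of `AdaptiveParamNoise`. Below is a minimal sketch of that rule in the spirit of the parameter-space-noise paper; the function name and the 1.01 coefficient are assumptions, not the actual `AdaptiveParamNoise` API.

def adapt_param_noise_std(cur_std, measured_dist, delta, coeff=1.01):
    """Sketch: shrink the perturbation std if the policy moved too much, grow it otherwise."""
    if measured_dist > delta:
        return cur_std / coeff  # perturbation too aggressive -> reduce
    return cur_std * coeff      # perturbation too timid -> increase

# e.g. self.pn_cur_std = adapt_param_noise_std(self.pn_cur_std, a_p_dist, self.param_noise.delta)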
Example #3
 def close_video_recorder(self):
     """Close video recorder"""
     if self.recording:
         logger.info("saving video to:\n  {}".format(self.video_recorder.path))
         #  If recording, close the recorder
         self.video_recorder.close()
     # Reset running statistics
     self.recording = False
     self.num_recorded_frames = 1
Example #4
def get_benchmark(env_id):
    """Verify that the specified env is amongst the admissible ones"""
    with open("admissible_envs.yml") as f:
        envs = yaml.safe_load(f)['environments']
    benchmark = None
    for k, v in envs.items():
        if env_id in list(v.keys()):
            benchmark = k
    assert benchmark is not None, "env not found in 'project_root/admissible_envs.yml'"
    logger.info("env_id = {} <- admissibility check passed!".format(env_id))
    return benchmark
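Note: `get_benchmark` assumes the YAML file parses into a mapping from benchmark names to mappings keyed by env ids. Purely as an illustration (the real file contents are not shown here, and the env names below are hypothetical), the parsed structure would look like:

# Hypothetical parsed structure of 'admissible_envs.yml' (illustration only)
envs = {
    'mujoco': {'Hopper-v2': {}, 'Walker2d-v2': {}},
    'atari': {'PongNoFrameskip-v4': {}},
}
# With this layout, `env_id in list(v.keys())` resolves the benchmark name.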
Example #5
    def setup_actor(self):
        logger.info("setting up actor optimizer")

        losses = OrderedDict()

        # Create the Q loss as the negative of the cumulated Q values
        q_loss = -tf.reduce_mean(self.critic_pred_w_actor)
        q_loss *= self.hps.q_actor_loss_scale

        # Create the actor loss w/ the scaled Q loss
        loss = q_loss

        losses.update({'actor_q_loss': q_loss})

        # Create the D loss as the negative of the cumulated D values
        d_loss = -tf.reduce_mean(self.d_pred_w_actor)
        d_loss *= self.hps.d_actor_loss_scale

        # Add the D loss to the actor loss
        loss += d_loss

        losses.update({'actor_d_loss': d_loss})

        # Add assembled actor loss
        losses.update({'actor_total_loss': loss})

        # Create gradients
        grads = flatgrad(loss, self.actor.trainable_vars, self.hps.clip_norm)

        # Create mpi adam optimizer
        optimizer = MpiAdamOptimizer(comm=self.comm,
                                     clip_norm=self.hps.clip_norm,
                                     learning_rate=self.hps.actor_lr,
                                     name='actor_adam')
        optimize_ = optimizer.minimize(loss=loss,
                                       var_list=self.actor.trainable_vars)

        # Create callable objects
        get_losses = TheanoFunction(inputs=[self.obs0],
                                    outputs=list(losses.values()))
        get_grads = TheanoFunction(inputs=[self.obs0], outputs=grads)
        optimize = TheanoFunction(inputs=[self.obs0], outputs=optimize_)

        # Log statistics
        log_module_info(logger, self.name, self.actor)

        # Return the actor ops
        return {
            'names': list(losses.keys()),
            'losses': get_losses,
            'grads': get_grads,
            'optimizer': optimizer,
            'optimize': optimize
        }
Example #6
 def add_demo_transitions_to_mem(self, dset):
     """Add transitions from expert demonstration trajectories to memory"""
     # Ensure the replay buffer is empty as demos need to be first
     assert self.num_entries == 0 and self.num_demos == 0
     logger.info("adding demonstrations to memory")
     # Zip transition atoms
     transitions = zipsame(dset.obs0, dset.acs, dset.env_rews, dset.obs1, dset.dones1)
     # Note: careful w/ the order, it should correspond to the order in `append` signature
     for transition in transitions:
         self.append(*transition, is_demo=True)
         self.num_demos += 1
     assert self.num_demos == self.num_entries
     logger.info("  num entries in memory after addition: {}".format(self.num_entries))
Example #7
    def __init__(self, expert_arxiv, size, train_fraction=None, randomize=True, full=False):
        """Create a dataset given the `expert_path` expert demonstration trajectories archive.
        Data structure of the archive in .npz format:
        the transitions are saved in python dictionary format with keys:
        'obs0', 'acs', 'rews', 'dones1', 'obs1', 'ep_rets',
        the values of each item is a list storing the expert trajectory sequentially.
        Note that 'ep_rets' is stored solely for monitoring purposes, and w/o 'ep_rets',
        a transition corrsponds exactly to the format of transitions stored in memory.
        """
        # Load the .npz archive file
        logger.info("loading expert demonstration trajectories from archive")
        traj_data = np.load(expert_arxiv)
        self.size = size
        assert 0 <= self.size <= len(traj_data['obs0']), "wrong demo dataset size"  # 'obs0' chosen arbitrarily

        # Unpack
        #   1. Slice the desired quantity of trajectories
        #   2. Flatten the list of trajectories into a list of transitions
        #   Unpacking is done separately for each atom
        self.obs0 = np.array(flatten(traj_data['obs0'][:self.size]))
        self.acs = np.array(flatten(traj_data['acs'][:self.size]))
        if full:
            self.env_rews = np.array(flatten(traj_data['env_rews'][:self.size]))
            self.dones1 = np.array(flatten(traj_data['dones1'][:self.size]))
            self.obs1 = np.array(flatten(traj_data['obs1'][:self.size]))

        self.ep_rets = traj_data['ep_env_rets'][:self.size]
        self.ep_lens = traj_data['ep_lens'][:self.size]

        # Compute dataset statistics
        self.ret_mean = np.mean(np.array(self.ep_rets))
        self.ret_std = np.std(np.array(self.ep_rets))
        self.len_mean = np.mean(np.array(self.ep_lens))
        self.len_std = np.std(np.array(self.ep_lens))

        # Create (obs0,acs) dataset
        self.randomize = randomize
        self.pair_dset = PairDataset(self.obs0, self.acs, self.randomize)

        if train_fraction is not None:
            # Split dataset into train and test datasets (used in BC)
            t_t_frontier = int(self.size * train_fraction)
            self.pair_train_set = PairDataset(self.obs0[:t_t_frontier, :],
                                              self.acs[:t_t_frontier, :],
                                              self.randomize)
            self.pair_val_set = PairDataset(self.obs0[t_t_frontier:, :],
                                            self.acs[t_t_frontier:, :],
                                            self.randomize)

        # Log message upon successful trajectory dataset initialization
        self.log_info()
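Note: `flatten` and `PairDataset` are used above but not defined in this snippet. The sketch below shows what they are assumed to do (concatenate per-trajectory sequences into a flat list of transitions, and serve shuffled (obs, acs) minibatches); the `iterate_once` method name is hypothetical.

import numpy as np

def flatten(list_of_trajs):
    """Sketch: turn a list of per-trajectory sequences into a flat list of transitions."""
    return [item for traj in list_of_trajs for item in traj]

class PairDataset(object):
    """Sketch of a shuffled (obs, acs) minibatch server."""
    def __init__(self, obs, acs, randomize=True):
        self.obs, self.acs, self.randomize = obs, acs, randomize

    def iterate_once(self, batch_size):
        idxs = np.arange(len(self.obs))
        if self.randomize:
            np.random.shuffle(idxs)
        for start in range(0, len(idxs), batch_size):
            batch = idxs[start:start + batch_size]
            yield self.obs[batch], self.acs[batch]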
Example #8
 def parse_noise_type(self, noise_type):
     """Parse the `noise_type` hyperparameter"""
     ac_noise = None
     param_noise = None
     if isinstance(self.ac_space, spaces.Box):
         ac_dim = self.ac_space.shape[-1]  # num dims
     elif isinstance(self.ac_space, spaces.Discrete):
         ac_dim = self.ac_space.n  # num ac choices
     else:
         raise RuntimeError("ac space is neither Box nor Discrete")
     logger.info("parsing noise type")
     # Parse the comma-separated (possibly whitespace-padded) list of noise params
     for cur_noise_type in noise_type.split(','):
         cur_noise_type = cur_noise_type.strip(
         )  # remove all whitespaces (start and end)
         # If the specified noise type is literally 'none'
         if cur_noise_type == 'none':
             pass
         # If 'adaptive-param' is in the specified string for noise type
         elif 'adaptive-param' in cur_noise_type:
             # Set parameter noise
             from imitation.imitation_algorithms.param_noise import AdaptiveParamNoise
             if isinstance(self.ac_space, spaces.Box):
                 _, std = cur_noise_type.split('_')
                 std = float(std)
                 param_noise = AdaptiveParamNoise(initial_std=std,
                                                  delta=std)
             elif isinstance(self.ac_space, spaces.Discrete):
                 _, init_eps = cur_noise_type.split('_')
                 init_eps = float(init_eps)
                 # Compute the param noise threshold from eps, as explained in Appendix C.1
                 # of the paper 'Parameter Space Noise for Exploration', Plappert et al., ICLR 2018
                 init_delta = -np.log(1. - init_eps +
                                      (init_eps / float(ac_dim)))
                 param_noise = AdaptiveParamNoise(delta=init_delta)
                 self.setup_eps_greedy(init_eps)
             logger.info("  {} configured".format(param_noise))
         elif 'normal' in cur_noise_type:
             assert isinstance(self.ac_space,
                               spaces.Box), "must be continuous ac space"
             _, std = cur_noise_type.split('_')
             # Spherical (isotropic) gaussian action noise
             from imitation.imitation_algorithms.ac_noise import NormalAcNoise
             ac_noise = NormalAcNoise(mu=np.zeros(ac_dim),
                                      sigma=float(std) * np.ones(ac_dim))
             logger.info("  {} configured".format(ac_noise))
         elif 'ou' in cur_noise_type:
             assert isinstance(self.ac_space,
                               spaces.Box), "must be continuous ac space"
             _, std = cur_noise_type.split('_')
             # Ornstein-Uhlenbeck action noise
             from imitation.imitation_algorithms.ac_noise import OUAcNoise
             ac_noise = OUAcNoise(mu=np.zeros(ac_dim),
                                  sigma=float(std) * np.ones(ac_dim))
             logger.info("  {} configured".format(ac_noise))
         else:
             raise RuntimeError("unknown specified noise type: '{}'".format(
                 cur_noise_type))
     return param_noise, ac_noise
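Note: the `init_delta` expression is the closed form from Appendix C.1 of the parameter-space-noise paper: for a discrete action space with |A| actions, an epsilon-greedy rate eps maps to the distance threshold delta = -log(1 - eps + eps/|A|). A quick numeric check of that mapping:

import numpy as np

def eps_to_delta(eps, ac_dim):
    # delta = -log(1 - eps + eps / |A|), cf. Appendix C.1 of the param-noise paper
    return -np.log(1. - eps + (eps / float(ac_dim)))

for eps in (0.1, 0.2, 0.5):
    print("eps={:.1f} -> delta={:.4f}".format(eps, eps_to_delta(eps, ac_dim=6)))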
Example #9
    def setup_popart(self):
        """Play w/ the magnitude of the return @ the critic output
        by renormalizing the critic output vars (w + b) w/ old running statistics
        Reference paper: https://arxiv.org/pdf/1602.07714.pdf
        """
        logger.info("setting up popart")

        # Setting old and new stds and means
        self.old_std = tf.placeholder(name='old_std',
                                      dtype=tf.float32,
                                      shape=[1])
        new_std = self.ret_rms.std
        self.old_mean = tf.placeholder(name='old_mean',
                                       dtype=tf.float32,
                                       shape=[1])
        new_mean = self.ret_rms.mean

        self.popart_op = []
        # Pass once in the critic and once in the target critic -> 2 loop steps
        for output_vars in [
                self.critic.output_vars, self.targ_critic.output_vars
        ]:
            # Ensure the network only has 2 vars w/ 'final' in their names (w + b of output layer)
            assert len(output_vars) == 2, \
                "only w + b of the critic output layer should be caught -> 2 vars"

            out_names = [var.name for var in output_vars]
            for out_name in out_names:
                # Log the output variables involved in popart
                logger.info("  {}".format(out_name))
            # Unpack weight and bias of output layer
            w, b = output_vars
            # Ensure that w is indeed a weight, and that b is indeed a bias
            assert 'kernel' in w.name, "'kernel' not in w.name"
            assert 'bias' in b.name, "'bias' not in b.name"
            # Ensure that both w and b are compatible w/ the critic spitting out a scalar
            assert w.get_shape()[-1] == 1
            assert b.get_shape()[-1] == 1
            self.popart_op += [w.assign(w * self.old_std / new_std)]
            self.popart_op += [
                b.assign(
                    (b * self.old_std + self.old_mean - new_mean) / new_std)
            ]

        # Create callable object (once, after both the critic and target critic ops are added)
        self.popart = TheanoFunction(inputs=[self.old_mean, self.old_std],
                                     outputs=[self.popart_op])
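Note: the rescaling above is meant to leave the denormalized critic output unchanged when the return statistics are refreshed, i.e. old_std * (w.h + b) + old_mean == new_std * (w'.h + b') + new_mean. A small numpy check of this invariant (independent of the TF graph):

import numpy as np

rng = np.random.default_rng(0)
h = rng.normal(size=64)            # last hidden features of the critic
w = rng.normal(size=64)            # output layer weight (flattened)
b = 0.3                            # output layer bias
old_mean, old_std = 1.5, 2.0       # return stats before the update
new_mean, new_std = 2.5, 4.0       # return stats after the update

# Same rescaling as the assign ops above
w_new = w * old_std / new_std
b_new = (b * old_std + old_mean - new_mean) / new_std

denorm_before = old_std * (h @ w + b) + old_mean
denorm_after = new_std * (h @ w_new + b_new) + new_mean
assert np.isclose(denorm_before, denorm_after)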
Example #10
    def setup_target_network_updates(self):
        logger.info("setting up target network updates")
        actor_args = [self.actor.vars, self.targ_actor.vars, self.hps.polyak]
        critic_args = [
            self.critic.vars, self.targ_critic.vars, self.hps.polyak
        ]
        actor_hard_updates, actor_soft_updates = get_target_updates(
            *actor_args)
        critic_hard_updates, critic_soft_updates = get_target_updates(
            *critic_args)
        self.targ_hard_updates = [actor_hard_updates, critic_hard_updates]
        self.targ_soft_updates = [actor_soft_updates, critic_soft_updates]

        # Create callable objects
        self.perform_targ_hard_updates = TheanoFunction(
            inputs=[], outputs=[self.targ_hard_updates])
        self.perform_targ_soft_updates = TheanoFunction(
            inputs=[], outputs=[self.targ_soft_updates])
Example #11
def evaluate(env,
             trpo_agent_wrapper,
             discriminator_wrapper,
             num_trajs,
             sample_or_mode,
             render,
             exact_model_path=None,
             model_ckpt_dir=None):
    """Evaluate a trained GAIL agent"""

    # Only one of the two arguments can be provided
    assert sum([exact_model_path is None, model_ckpt_dir is None]) == 1

    # Rebuild the computational graph to gain evaluation access to a learned and saved policy
    pi = trpo_agent_wrapper('pi')
    d = discriminator_wrapper('d')
    traj_gen = traj_ep_generator(env=env,
                                 pi=pi,
                                 d=d,
                                 sample_or_mode=sample_or_mode,
                                 render=render)
    # Initialize and load the previously learned weights into the freshly re-built graph
    initialize()
    if exact_model_path is not None:
        load_model(exact_model_path)
        logger.info(
            "model loaded from exact path:\n  {}".format(exact_model_path))
    else:  # `exact_model_path` is None -> `model_ckpt_dir` is not None
        load_latest_checkpoint(model_ckpt_dir)
        logger.info("model loaded from ckpt dir:\n  {}".format(model_ckpt_dir))
    # Initialize the history data structures
    ep_lens = []
    ep_syn_rets = []
    ep_env_rets = []
    # Collect trajectories
    for i in range(num_trajs):
        logger.info("evaluating [{}/{}]".format(i + 1, num_trajs))
        traj = traj_gen.__next__()
        ep_len, ep_syn_ret, ep_env_ret = traj['ep_len'], traj[
            'ep_syn_ret'], traj['ep_env_ret']
        # Aggregate to the history data structures
        ep_lens.append(ep_len)
        ep_syn_rets.append(ep_syn_ret)
        ep_env_rets.append(ep_env_ret)
    # Log some statistics of the collected trajectories
    sample_or_mode = 'sample' if sample_or_mode else 'mode'
    logger.info("action picking: {}".format(sample_or_mode))
    ep_len_mean = np.mean(ep_lens)
    ep_syn_ret_mean = np.mean(ep_syn_rets)
    ep_env_ret_mean = np.mean(ep_env_rets)
    logger.record_tabular("ep_len_mean", ep_len_mean)
    logger.record_tabular("ep_syn_ret_mean", ep_syn_ret_mean)
    logger.record_tabular("ep_env_ret_mean", ep_env_ret_mean)
    logger.dump_tabular()
Example #12
def get_target_updates(vars_, targ_vars, polyak):
    """Return assignment ops for target network updates.
    Hard updates are used for initialization only, while soft updates are
    used throughout the training process, at every iteration.
    Note that DQN uses hard updates while training, but those updates
    are not performed every iteration (only once every XX iterations).
    """
    logger.info("setting up target updates")
    hard_updates = []
    soft_updates = []
    assert len(vars_) == len(targ_vars)
    for var_, targ_var in zipsame(vars_, targ_vars):
        logger.info('  {} <- {}'.format(targ_var.name, var_.name))
        hard_updates.append(tf.assign(targ_var, var_))
        soft_updates.append(
            tf.assign(targ_var, (1. - polyak) * targ_var + polyak * var_))
    assert len(hard_updates) == len(vars_)
    assert len(soft_updates) == len(vars_)
    # Return ops that group the update ops
    return tf.group(*hard_updates), tf.group(*soft_updates)
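Note: in plain numpy terms, the soft update keeps the target parameters as an exponential moving average of the online parameters, which is what makes the TD targets slow-moving. A minimal sketch (the polyak value below is an arbitrary example):

import numpy as np

def soft_update(targ, online, polyak=0.005):
    """Sketch of the soft update: targ <- (1 - polyak) * targ + polyak * online."""
    return (1. - polyak) * targ + polyak * online

targ, online = np.zeros(3), np.ones(3)
for _ in range(1000):
    targ = soft_update(targ, online)
# After many iterations, targ has drifted most of the way towards `online`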
Example #13
def get_p_actor_updates(actor, perturbed_actor, pn_std):
    """Return assignment ops for actor parameters noise perturbations.
    The perturbations consist of applying additive Gaussian noise to the perturbable
    actor variables, while simply leaving the non-perturbable ones untouched.
    """
    assert len(actor.vars) == len(perturbed_actor.vars)
    assert len(actor.perturbable_vars) == len(perturbed_actor.perturbable_vars)

    updates = []
    for var_, perturbed_var in zipsame(actor.vars, perturbed_actor.vars):
        if var_ in actor.perturbable_vars:
            logger.info("  {} <- {} + noise".format(perturbed_var.name,
                                                    var_.name))
            noised_up_var = var_ + tf.random_normal(
                tf.shape(var_), mean=0., stddev=pn_std)
            updates.append(tf.assign(perturbed_var, noised_up_var))
        else:
            logger.info("  {} <- {}".format(perturbed_var.name, var_.name))
            updates.append(tf.assign(perturbed_var, var_))
    assert len(updates) == len(actor.vars)
    return tf.group(*updates)
Example #14
def test_logger():
    info("hi")
    debug("shouldn't appear")
    set_level(DEBUG)
    debug("should appear")
    dir = "/tmp/testlogging"
    if osp.exists(dir):
        shutil.rmtree(dir)
    configure(dir_=dir)
    logkv("a", 3)
    logkv("b", 2.5)
    dumpkvs()
    logkv("b", -2.5)
    logkv("a", 5.5)
    dumpkvs()
    info("^^^ should see a = 5.5")

    logkv("b", -2.5)
    dumpkvs()

    logkv("a", "longasslongasslongasslongasslongasslongassvalue")
    dumpkvs()
Example #15
 def configure_logging(self):
     """Configure the experiment"""
     if self.comm is None or self.rank == 0:
         log_path = self.get_log_path()
         formats_strs = ['stdout', 'log', 'csv']
         fmtstr = "configuring logger"
         if self.comm is not None and self.rank == 0:
             fmtstr += " [master]"
         logger.info(fmtstr)
         logger.configure(dir_=log_path, format_strs=formats_strs)
         fmtstr = "logger configured"
         if self.comm is not None and self.rank == 0:
             fmtstr += " [master]"
         logger.info(fmtstr)
         logger.info("  directory: {}".format(log_path))
         logger.info("  output formats: {}".format(formats_strs))
         # In the same log folder, log the args in a yaml file
         file_logger = FileLogger(uuid=self.uuid,
                                  path=self.get_log_path(),
                                  file_prefix=self.name_prefix)
         file_logger.set_info('note', self.args.note)
         file_logger.set_info('uuid', self.uuid)
         file_logger.set_info('task', self.args.task)
         file_logger.set_info('args', str(self.args))
         fmtstr = "experiment configured"
         if self.comm is not None:
             fmtstr += " [{} MPI workers]".format(self.comm.Get_size())
         logger.info(fmtstr)
     else:
         logger.info("configuring logger [worker #{}]".format(self.rank))
         logger.configure(dir_=None, format_strs=None)
         logger.set_level(logger.DISABLED)
Example #16
 def log_info(self):
     logger.info("successfully initialized (obs0,acs) dataset, w/ statitics:")
     logger.info("  extracted num trajectories: {}".format(self.size))
     logger.info("  extracted num transitions: {}".format(len(self.obs0)))  # arbitrarily
     logger.info("  trajectory return mean: {}".format(self.ret_mean))
     logger.info("  trajectory return std: {}".format(self.ret_std))
     logger.info("  trajectory length mean: {}".format(self.len_mean))
     logger.info("  trajectory length std: {}".format(self.len_std))
Example #17
    def setup_critic(self):
        logger.info("setting up critic optimizer")

        losses = OrderedDict()

        phs = [self.obs0, self.acs]

        if self.hps.prioritized_replay:
            phs.append(self.iws)

        # Create the 1-step look-ahead TD error loss
        td_errors_1 = self.critic_pred - self.tc1z
        hubered_td_errors_1 = huber_loss(td_errors_1)
        if self.hps.prioritized_replay:
            # Adjust with importance weights
            hubered_td_errors_1 *= self.iws
        td_loss_1 = tf.reduce_mean(hubered_td_errors_1)
        td_loss_1 *= self.hps.td_loss_1_scale

        # Create the critic loss w/ the scaled 1-step TD loss
        loss = td_loss_1

        losses.update({'critic_td_loss_1': td_loss_1})

        phs.append(self.tc1s)

        if self.hps.n_step_returns:
            # Create the n-step look-ahead TD error loss
            td_errors_n = self.critic_pred - self.tcnz
            hubered_td_errors_n = huber_loss(td_errors_n)
            if self.hps.prioritized_replay:
                # Adjust with importance weights
                hubered_td_errors_n *= self.iws
            td_loss_n = tf.reduce_mean(hubered_td_errors_n)
            td_loss_n *= self.hps.td_loss_n_scale

            # Add the scaled n-step TD loss to the critic loss
            loss += td_loss_n

            losses.update({'critic_td_loss_n': td_loss_n})

            phs.append(self.tcns)

        # Fetch critic's regularization losses (@property of the network)
        wd_loss = tf.reduce_sum(self.critic.regularization_losses)
        # Note: no need to multiply by a scale as it has already been scaled
        logger.info("setting up weight decay")
        if self.hps.wd_scale > 0:
            for var in self.critic.trainable_vars:
                if var in self.critic.decayable_vars:
                    logger.info("  {} <- wd w/ scale {}".format(
                        var.name, self.hps.wd_scale))
                else:
                    logger.info("  {}".format(var.name))

        # Add critic weight decay regularization to the critic loss
        loss += wd_loss

        losses.update({'critic_wd': wd_loss})

        # Add assembled critic loss
        losses.update({'critic_total_loss': loss})

        # Create gradients
        grads = flatgrad(loss, self.critic.trainable_vars, self.hps.clip_norm)

        # Create mpi adam optimizer
        optimizer = MpiAdamOptimizer(comm=self.comm,
                                     clip_norm=self.hps.clip_norm,
                                     learning_rate=self.hps.critic_lr,
                                     name='critic_adam')
        optimize_ = optimizer.minimize(loss=loss,
                                       var_list=self.critic.trainable_vars)

        # Create callable objects
        get_losses = TheanoFunction(inputs=phs, outputs=list(losses.values()))
        get_grads = TheanoFunction(inputs=phs, outputs=grads)
        optimize = TheanoFunction(inputs=phs, outputs=optimize_)

        if self.hps.prioritized_replay:
            td_errors_ops = [td_errors_1] + ([td_errors_n] if
                                             self.hps.n_step_returns else [])
            get_td_errors = TheanoFunction(inputs=phs, outputs=td_errors_ops)

        # Log statistics
        log_module_info(logger, self.name, self.critic)

        # Return the critic ops
        out = {
            'names': list(losses.keys()),
            'losses': get_losses,
            'grads': get_grads,
            'optimizer': optimizer,
            'optimize': optimize
        }
        if self.hps.prioritized_replay:
            out.update({'td_errors': get_td_errors})

        return out
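Note: `huber_loss` is called above but not defined in this snippet. The standard definition is quadratic near zero and linear in the tails, which keeps large TD errors from dominating the gradient; a minimal sketch follows (the delta=1.0 default is an assumption):

import tensorflow as tf

def huber_loss(x, delta=1.0):
    """Sketch of the standard Huber loss: 0.5*x^2 if |x| <= delta, else delta*(|x| - 0.5*delta)."""
    return tf.where(tf.abs(x) <= delta,
                    0.5 * tf.square(x),
                    delta * (tf.abs(x) - 0.5 * delta))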
Example #18
def learn(comm,
          env,
          xpo_agent_wrapper,
          sample_or_mode,
          gamma,
          save_frequency,
          ckpt_dir,
          summary_dir,
          timesteps_per_batch,
          batch_size,
          optim_epochs_per_iter,
          lr,
          experiment_name,
          ent_reg_scale,
          clipping_eps,
          gae_lambda,
          schedule,
          max_iters):

    rank = comm.Get_rank()

    # Create policies
    pi = xpo_agent_wrapper('pi')
    old_pi = xpo_agent_wrapper('old_pi')

    # Create and retrieve already-existing placeholders
    ob = get_placeholder_cached(name='ob')
    ac = pi.pd_type.sample_placeholder([None])
    adv = tf.placeholder(name='adv', dtype=tf.float32, shape=[None])
    ret = tf.placeholder(name='ret', dtype=tf.float32, shape=[None])
    # Adaptive learning rate multiplier, updated with schedule
    lr_mult = tf.placeholder(name='lr_mult', dtype=tf.float32, shape=[])

    # Build graphs
    kl_mean = tf.reduce_mean(old_pi.pd_pred.kl(pi.pd_pred))
    ent_mean = tf.reduce_mean(pi.pd_pred.entropy())
    ent_pen = (-ent_reg_scale) * ent_mean
    vf_err = tf.reduce_mean(tf.square(pi.v_pred - ret))  # MC error
    # The surrogate objective is defined as: advantage * pnew / pold
    ratio = tf.exp(pi.pd_pred.logp(ac) - old_pi.pd_pred.logp(ac))  # IS
    surr_gain = ratio * adv  # surrogate objective (CPI)
    # Annealed clipping parameter epsilon
    clipping_eps = clipping_eps * lr_mult
    surr_gain_w_clipping = tf.clip_by_value(ratio,
                                            1.0 - clipping_eps,
                                            1.0 + clipping_eps) * adv
    # PPO's pessimistic surrogate (L^CLIP in paper)
    surr_loss = -tf.reduce_mean(tf.minimum(surr_gain, surr_gain_w_clipping))
    # Assemble losses (including the value function loss)
    loss = surr_loss + ent_pen + vf_err

    losses = OrderedDict()

    # Add losses
    losses.update({'pol_kl_mean': kl_mean,
                   'pol_ent_mean': ent_mean,
                   'pol_ent_pen': ent_pen,
                   'pol_surr_loss': surr_loss,
                   'pol_vf_err': vf_err,
                   'pol_total_loss': loss})

    # Make the current `pi` become the next `old_pi`
    zipped = zipsame(old_pi.vars, pi.vars)
    updates_op = []
    for k, v in zipped:
        # Populate list of assignment operations
        logger.info("assignment: {} <- {}".format(k, v))
        assign_op = tf.assign(k, v)
        updates_op.append(assign_op)
    assert len(updates_op) == len(pi.vars)

    # Create mpi adam optimizer
    optimizer = MpiAdamOptimizer(comm=comm,
                                 clip_norm=5.0,
                                 learning_rate=lr * lr_mult,
                                 name='adam')
    optimize = optimizer.minimize(loss=loss, var_list=pi.trainable_vars)

    # Create callable objects
    assign_old_eq_new = TheanoFunction(inputs=[], outputs=updates_op)
    compute_losses = TheanoFunction(inputs=[ob, ac, adv, ret, lr_mult],
                                    outputs=list(losses.values()))
    optimize = TheanoFunction(inputs=[ob, ac, adv, ret, lr_mult],
                              outputs=optimize)

    # Initialise variables
    initialize()

    # Sync params of all processes with the params of the root process
    optimizer.sync_from_root(pi.trainable_vars)

    # Create context manager that records the time taken by encapsulated ops
    timed = timed_cm_wrapper(comm, logger)

    if rank == 0:
        # Create summary writer
        summary_writer = tf.summary.FileWriterCache.get(summary_dir)

    seg_gen = traj_segment_generator(env, pi, timesteps_per_batch, sample_or_mode)

    eps_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()

    # Define rolling buffers for recent stats aggregation
    maxlen = 100
    len_buffer = deque(maxlen=maxlen)
    env_ret_buffer = deque(maxlen=maxlen)
    pol_losses_buffer = deque(maxlen=maxlen)

    while iters_so_far <= max_iters:

        pretty_iter(logger, iters_so_far)
        pretty_elapsed(logger, tstart)

        # Verify that the processes are still in sync
        if iters_so_far > 0 and iters_so_far % 10 == 0:
            optimizer.check_synced(pi.trainable_vars)
            logger.info("params still in sync across processes")

        # Manage lr multiplier schedule
        if schedule == 'constant':
            curr_lr_mult = 1.0
        elif schedule == 'linear':
            curr_lr_mult = max(1.0 - float(iters_so_far * timesteps_per_batch) /
                               (max_iters * timesteps_per_batch), 0)
        else:
            raise NotImplementedError

        # Save the model
        if rank == 0 and iters_so_far % save_frequency == 0 and ckpt_dir is not None:
            model_path = osp.join(ckpt_dir, experiment_name)
            save_state(model_path, iters_so_far=iters_so_far)
            logger.info("saving model")
            logger.info("  @: {}".format(model_path))

        with timed("sampling mini-batch"):
            seg = seg_gen.__next__()

        augment_segment_gae_stats(seg, gamma, gae_lambda, rew_key="env_rews")

        # Standardize advantage function estimate
        seg['advs'] = (seg['advs'] - seg['advs'].mean()) / (seg['advs'].std() + 1e-8)

        # Update running mean and std
        if hasattr(pi, 'obs_rms'):
            with timed("normalizing obs via rms"):
                pi.obs_rms.update(seg['obs'], comm)

        assign_old_eq_new({})

        # Create Feeder object to iterate over (ob, ac, adv, td_lam_ret) tuples
        data_map = {'obs': seg['obs'],
                    'acs': seg['acs'],
                    'advs': seg['advs'],
                    'td_lam_rets': seg['td_lam_rets']}
        feeder = Feeder(data_map=data_map, enable_shuffle=True)

        # Update policy and state-value function
        with timed("updating policy and value function"):
            for _ in range(optim_epochs_per_iter):
                for minibatch in feeder.get_feed(batch_size=batch_size):

                    feeds = {ob: minibatch['obs'],
                             ac: minibatch['acs'],
                             adv: minibatch['advs'],
                             ret: minibatch['td_lam_rets'],
                             lr_mult: curr_lr_mult}

                    # Compute losses
                    pol_losses = compute_losses(feeds)

                    # Update the policy and value function
                    optimize(feeds)

                    # Store the losses
                    pol_losses_buffer.append(pol_losses)

        # Log policy update statistics
        logger.info("logging training losses (log)")
        pol_losses_np_mean = np.mean(pol_losses_buffer, axis=0)
        pol_losses_mpi_mean = mpi_mean_reduce(pol_losses_buffer, comm, axis=0)
        zipped_pol_losses = zipsame(list(losses.keys()), pol_losses_np_mean, pol_losses_mpi_mean)
        logger.info(columnize(names=['name', 'local', 'global'],
                              tuples=zipped_pol_losses,
                              widths=[20, 16, 16]))

        # Log statistics

        logger.info("logging misc training stats (log + csv)")
        # Gather statistics across workers
        local_lens_rets = (seg['ep_lens'], seg['ep_env_rets'])
        gathered_lens_rets = comm.allgather(local_lens_rets)
        lens, env_rets = map(flatten_lists, zip(*gathered_lens_rets))
        # Extend the deques of recorded statistics
        len_buffer.extend(lens)
        env_ret_buffer.extend(env_rets)
        ep_len_mpi_mean = np.mean(len_buffer)
        ep_env_ret_mpi_mean = np.mean(env_ret_buffer)
        logger.record_tabular('ep_len_mpi_mean', ep_len_mpi_mean)
        logger.record_tabular('ep_env_ret_mpi_mean', ep_env_ret_mpi_mean)
        eps_this_iter = len(lens)
        timesteps_this_iter = sum(lens)
        eps_so_far += eps_this_iter
        timesteps_so_far += timesteps_this_iter
        eps_this_iter_mpi_mean = mpi_mean_like(eps_this_iter, comm)
        timesteps_this_iter_mpi_mean = mpi_mean_like(timesteps_this_iter, comm)
        eps_so_far_mpi_mean = mpi_mean_like(eps_so_far, comm)
        timesteps_so_far_mpi_mean = mpi_mean_like(timesteps_so_far, comm)
        logger.record_tabular('eps_this_iter_mpi_mean', eps_this_iter_mpi_mean)
        logger.record_tabular('timesteps_this_iter_mpi_mean', timesteps_this_iter_mpi_mean)
        logger.record_tabular('eps_so_far_mpi_mean', eps_so_far_mpi_mean)
        logger.record_tabular('timesteps_so_far_mpi_mean', timesteps_so_far_mpi_mean)
        logger.record_tabular('elapsed time', prettify_time(time.time() - tstart))  # no mpi mean
        logger.record_tabular('ev_td_lam_before', explained_variance(seg['vs'],
                                                                     seg['td_lam_rets']))
        iters_so_far += 1

        if rank == 0:
            logger.dump_tabular()

        if rank == 0:
            # Add summaries
            summary = tf.summary.Summary()
            tab = 'ppo'
            # Episode stats
            summary.value.add(tag="{}/{}".format(tab, 'mean_ep_len'),
                              simple_value=ep_len_mpi_mean)
            summary.value.add(tag="{}/{}".format(tab, 'mean_ep_env_ret'),
                              simple_value=ep_env_ret_mpi_mean)
            # Losses
            for name, loss in zipsame(list(losses.keys()), pol_losses_mpi_mean):
                summary.value.add(tag="{}/{}".format(tab, name), simple_value=loss)

            summary_writer.add_summary(summary, iters_so_far)
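Note: `augment_segment_gae_stats` is not shown here. The GAE(lambda) recursion it is assumed to implement computes one-step TD residuals and accumulates them backwards in time; a minimal numpy sketch under that assumption (bootstrapping past the segment boundary is omitted for brevity):

import numpy as np

def gae_advantages(rews, vs, dones, gamma, lam):
    """Sketch of GAE(lambda). `dones[t]` flags whether the episode ended at step t."""
    rews, vs, dones = map(np.asarray, (rews, vs, dones))
    T = len(rews)
    advs = np.zeros(T)
    last_adv = 0.0
    for t in reversed(range(T)):
        nonterminal = 1.0 - dones[t]
        next_v = vs[t + 1] if t + 1 < T else 0.0
        # One-step TD residual
        delta = rews[t] + gamma * next_v * nonterminal - vs[t]
        # Discounted, lambda-weighted accumulation of residuals
        last_adv = delta + gamma * lam * nonterminal * last_adv
        advs[t] = last_adv
    td_lam_rets = advs + vs[:T]
    return advs, td_lam_rets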
Example #19
def learn(comm, env, xpo_agent_wrapper, sample_or_mode, gamma, max_kl,
          save_frequency, ckpt_dir, summary_dir, timesteps_per_batch,
          batch_size, experiment_name, ent_reg_scale, gae_lambda, cg_iters,
          cg_damping, vf_iters, vf_lr, max_iters):

    rank = comm.Get_rank()

    # Create policies
    pi = xpo_agent_wrapper('pi')
    old_pi = xpo_agent_wrapper('old_pi')

    # Create and retrieve already-existing placeholders
    ob = get_placeholder_cached(name='ob')
    ac = pi.pd_type.sample_placeholder([None])
    adv = tf.placeholder(name='adv', dtype=tf.float32, shape=[None])
    ret = tf.placeholder(name='ret', dtype=tf.float32, shape=[None])
    flat_tangent = tf.placeholder(name='flat_tan',
                                  dtype=tf.float32,
                                  shape=[None])

    # Build graphs
    kl_mean = tf.reduce_mean(old_pi.pd_pred.kl(pi.pd_pred))
    ent_mean = tf.reduce_mean(pi.pd_pred.entropy())
    ent_bonus = ent_reg_scale * ent_mean
    vf_err = tf.reduce_mean(tf.square(pi.v_pred - ret))  # MC error
    # The surrogate objective is defined as: advantage * pnew / pold
    ratio = tf.exp(pi.pd_pred.logp(ac) - old_pi.pd_pred.logp(ac))  # IS
    surr_gain = tf.reduce_mean(ratio * adv)  # surrogate objective (CPI)
    # Add entropy bonus
    optim_gain = surr_gain + ent_bonus

    losses = OrderedDict()

    # Add losses
    losses.update({
        'pol_kl_mean': kl_mean,
        'pol_ent_mean': ent_mean,
        'pol_ent_bonus': ent_bonus,
        'pol_surr_gain': surr_gain,
        'pol_optim_gain': optim_gain,
        'pol_vf_err': vf_err
    })

    # Build natural gradient material
    get_flat = GetFlat(pi.pol_trainable_vars)
    set_from_flat = SetFromFlat(pi.pol_trainable_vars)
    kl_grads = tf.gradients(kl_mean, pi.pol_trainable_vars)
    shapes = [var.get_shape().as_list() for var in pi.pol_trainable_vars]
    start = 0
    tangents = []
    for shape in shapes:
        sz = intprod(shape)
        tangents.append(tf.reshape(flat_tangent[start:start + sz], shape))
        start += sz
    # Create the gradient vector product
    gvp = tf.add_n([
        tf.reduce_sum(g * tangent)
        for (g, tangent) in zipsame(kl_grads, tangents)
    ])
    # Create the Fisher vector product
    fvp = flatgrad(gvp, pi.pol_trainable_vars)

    # Make the current `pi` become the next `old_pi`
    zipped = zipsame(old_pi.vars, pi.vars)
    updates_op = []
    for k, v in zipped:
        # Populate list of assignment operations
        logger.info("assignment: {} <- {}".format(k, v))
        assign_op = tf.assign(k, v)
        updates_op.append(assign_op)
    assert len(updates_op) == len(pi.vars)

    # Create mpi adam optimizer for the value function
    vf_optimizer = MpiAdamOptimizer(comm=comm,
                                    clip_norm=5.0,
                                    learning_rate=vf_lr,
                                    name='vf_adam')
    optimize_vf = vf_optimizer.minimize(loss=vf_err,
                                        var_list=pi.vf_trainable_vars)

    # Create gradients
    grads = flatgrad(optim_gain, pi.pol_trainable_vars)

    # Create callable objects
    assign_old_eq_new = TheanoFunction(inputs=[], outputs=updates_op)
    compute_losses = TheanoFunction(inputs=[ob, ac, adv, ret],
                                    outputs=list(losses.values()))
    compute_losses_grads = TheanoFunction(inputs=[ob, ac, adv, ret],
                                          outputs=list(losses.values()) +
                                          [grads])
    compute_fvp = TheanoFunction(inputs=[flat_tangent, ob, ac, adv],
                                 outputs=fvp)
    optimize_vf = TheanoFunction(inputs=[ob, ret], outputs=optimize_vf)

    # Initialise variables
    initialize()

    # Sync params of all processes with the params of the root process
    theta_init = get_flat()
    comm.Bcast(theta_init, root=0)
    set_from_flat(theta_init)

    vf_optimizer.sync_from_root(pi.vf_trainable_vars)

    # Create context manager that records the time taken by encapsulated ops
    timed = timed_cm_wrapper(comm, logger)

    if rank == 0:
        # Create summary writer
        summary_writer = tf.summary.FileWriterCache.get(summary_dir)

    # Create segment generator
    seg_gen = traj_segment_generator(env, pi, timesteps_per_batch,
                                     sample_or_mode)

    eps_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()

    # Define rolling buffers for recent stats aggregation
    maxlen = 100
    len_buffer = deque(maxlen=maxlen)
    env_ret_buffer = deque(maxlen=maxlen)
    pol_losses_buffer = deque(maxlen=maxlen)

    while iters_so_far <= max_iters:

        pretty_iter(logger, iters_so_far)
        pretty_elapsed(logger, tstart)

        # Verify that the processes are still in sync
        if iters_so_far > 0 and iters_so_far % 10 == 0:
            vf_optimizer.check_synced(pi.vf_trainable_vars)
            logger.info("vf params still in sync across processes")

        # Save the model
        if rank == 0 and iters_so_far % save_frequency == 0 and ckpt_dir is not None:
            model_path = osp.join(ckpt_dir, experiment_name)
            save_state(model_path, iters_so_far=iters_so_far)
            logger.info("saving model")
            logger.info("  @: {}".format(model_path))

        with timed("sampling mini-batch"):
            seg = seg_gen.__next__()

        augment_segment_gae_stats(seg, gamma, gae_lambda, rew_key="env_rews")

        # Standardize advantage function estimate
        seg['advs'] = (seg['advs'] - seg['advs'].mean()) / (seg['advs'].std() +
                                                            1e-8)

        # Update running mean and std
        if hasattr(pi, 'obs_rms'):
            with timed("normalizing obs via rms"):
                pi.obs_rms.update(seg['obs'], comm)

        def fisher_vector_product(p):
            computed_fvp = compute_fvp({
                flat_tangent: p,
                ob: seg['obs'],
                ac: seg['acs'],
                adv: seg['advs']
            })
            return mpi_mean_like(computed_fvp, comm) + cg_damping * p

        assign_old_eq_new({})

        # Compute gradients
        with timed("computing gradients"):
            *loss_before, g = compute_losses_grads({
                ob: seg['obs'],
                ac: seg['acs'],
                adv: seg['advs'],
                ret: seg['td_lam_rets']
            })

        loss_before = mpi_mean_like(loss_before, comm)

        g = mpi_mean_like(g, comm)

        if np.allclose(g, 0):
            logger.info("got zero gradient -> not updating")
        else:
            with timed("performing conjugate gradient procedure"):
                step_direction = conjugate_gradient(f_Ax=fisher_vector_product,
                                                    b=g,
                                                    cg_iters=cg_iters,
                                                    verbose=(rank == 0))
            assert np.isfinite(step_direction).all()
            shs = 0.5 * step_direction.dot(
                fisher_vector_product(step_direction))
            # shs is (1/2)*s^T*A*s in the paper
            lm = np.sqrt(shs / max_kl)
            # lm is 1/beta in the paper (max_kl is user-specified delta)
            full_step = step_direction / lm  # beta*s
            expected_improve = g.dot(full_step)  # project s on g
            surr_before = loss_before[4]  # 5-th in loss list
            step_size = 1.0
            theta_before = get_flat()

            with timed("updating policy"):
                for _ in range(10):  # try (10 times max) until the step size is OK
                    # Update the policy parameters
                    theta_new = theta_before + full_step * step_size
                    set_from_flat(theta_new)
                    pol_losses = compute_losses({
                        ob: seg['obs'],
                        ac: seg['acs'],
                        adv: seg['advs'],
                        ret: seg['td_lam_rets']
                    })

                    pol_losses_buffer.append(pol_losses)

                    pol_losses_mpi_mean = mpi_mean_like(pol_losses, comm)
                    surr = pol_losses_mpi_mean[4]
                    kl = pol_losses_mpi_mean[0]
                    actual_improve = surr - surr_before
                    logger.info("  expected: {:.3f} | actual: {:.3f}".format(
                        expected_improve, actual_improve))
                    if not np.isfinite(pol_losses_mpi_mean).all():
                        logger.info("  got non-finite value of losses :(")
                    elif kl > max_kl * 1.5:
                        logger.info(
                            "  violated KL constraint -> shrinking step.")
                    elif actual_improve < 0:
                        logger.info(
                            "  surrogate didn't improve -> shrinking step.")
                    else:
                        logger.info("  stepsize fine :)")
                        break
                    step_size *= 0.5  # backtracking when the step size is deemed inappropriate
                else:
                    logger.info("  couldn't compute a good step")
                    set_from_flat(theta_before)

        # Create Feeder object to iterate over (ob, ret) pairs
        feeder = Feeder(data_map={
            'obs': seg['obs'],
            'td_lam_rets': seg['td_lam_rets']
        },
                        enable_shuffle=True)

        # Update state-value function
        with timed("updating value function"):
            for _ in range(vf_iters):
                for minibatch in feeder.get_feed(batch_size=batch_size):
                    optimize_vf({
                        ob: minibatch['obs'],
                        ret: minibatch['td_lam_rets']
                    })

        # Log policy update statistics
        logger.info("logging pol training losses (log)")
        pol_losses_np_mean = np.mean(pol_losses_buffer, axis=0)
        pol_losses_mpi_mean = mpi_mean_reduce(pol_losses_buffer, comm, axis=0)
        zipped_pol_losses = zipsame(list(losses.keys()), pol_losses_np_mean,
                                    pol_losses_mpi_mean)
        logger.info(
            columnize(names=['name', 'local', 'global'],
                      tuples=zipped_pol_losses,
                      widths=[20, 16, 16]))

        # Log statistics

        logger.info("logging misc training stats (log + csv)")
        # Gather statistics across workers
        local_lens_rets = (seg['ep_lens'], seg['ep_env_rets'])
        gathered_lens_rets = comm.allgather(local_lens_rets)
        lens, env_rets = map(flatten_lists, zip(*gathered_lens_rets))
        # Extend the deques of recorded statistics
        len_buffer.extend(lens)
        env_ret_buffer.extend(env_rets)
        ep_len_mpi_mean = np.mean(len_buffer)
        ep_env_ret_mpi_mean = np.mean(env_ret_buffer)
        logger.record_tabular('ep_len_mpi_mean', ep_len_mpi_mean)
        logger.record_tabular('ep_env_ret_mpi_mean', ep_env_ret_mpi_mean)
        eps_this_iter = len(lens)
        timesteps_this_iter = sum(lens)
        eps_so_far += eps_this_iter
        timesteps_so_far += timesteps_this_iter
        eps_this_iter_mpi_mean = mpi_mean_like(eps_this_iter, comm)
        timesteps_this_iter_mpi_mean = mpi_mean_like(timesteps_this_iter, comm)
        eps_so_far_mpi_mean = mpi_mean_like(eps_so_far, comm)
        timesteps_so_far_mpi_mean = mpi_mean_like(timesteps_so_far, comm)
        logger.record_tabular('eps_this_iter_mpi_mean', eps_this_iter_mpi_mean)
        logger.record_tabular('timesteps_this_iter_mpi_mean',
                              timesteps_this_iter_mpi_mean)
        logger.record_tabular('eps_so_far_mpi_mean', eps_so_far_mpi_mean)
        logger.record_tabular('timesteps_so_far_mpi_mean',
                              timesteps_so_far_mpi_mean)
        logger.record_tabular('elapsed time',
                              prettify_time(time.time() -
                                            tstart))  # no mpi mean
        logger.record_tabular(
            'ev_td_lam_before',
            explained_variance(seg['vs'], seg['td_lam_rets']))
        iters_so_far += 1

        if rank == 0:
            logger.dump_tabular()

        if rank == 0:
            # Add summaries
            summary = tf.summary.Summary()
            tab = 'trpo'
            # Episode stats
            summary.value.add(tag="{}/{}".format(tab, 'mean_ep_len'),
                              simple_value=ep_len_mpi_mean)
            summary.value.add(tag="{}/{}".format(tab, 'mean_ep_env_ret'),
                              simple_value=ep_env_ret_mpi_mean)
            # Losses
            for name, loss in zipsame(list(losses.keys()),
                                      pol_losses_mpi_mean):
                summary.value.add(tag="{}/{}".format(tab, name),
                                  simple_value=loss)

            summary_writer.add_summary(summary, iters_so_far)
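Note: `conjugate_gradient` solves A x = g for the natural-gradient step using only Fisher-vector products (the `fisher_vector_product` closure above), never forming the Fisher matrix explicitly. A standard, self-contained sketch of such a solver:

import numpy as np

def conjugate_gradient(f_Ax, b, cg_iters=10, residual_tol=1e-10, verbose=False):
    """Sketch of the standard CG solver for A x = b, given only the matvec f_Ax."""
    x = np.zeros_like(b)
    r = b.copy()          # residual b - A x (x = 0 initially)
    p = b.copy()          # search direction
    rdotr = r.dot(r)
    for i in range(cg_iters):
        z = f_Ax(p)
        alpha = rdotr / p.dot(z)
        x += alpha * p
        r -= alpha * z
        new_rdotr = r.dot(r)
        p = r + (new_rdotr / rdotr) * p
        rdotr = new_rdotr
        if verbose:
            print("cg iter {}: residual norm {:.3e}".format(i, np.sqrt(rdotr)))
        if rdotr < residual_tol:
            break
    return x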
Example #20
def gather_trajectories(env,
                        xpo_agent_wrapper,
                        demos_dir,
                        num_trajs,
                        sample_or_mode,
                        render,
                        expert_arxiv_name,
                        exact_model_path=None,
                        model_ckpt_dir=None):
    """Gather trajectories from a trained `mlp_policy` agent"""
    # Only one of the two arguments can be provided
    assert sum([exact_model_path is None, model_ckpt_dir is None]) == 1

    # Rebuild the computational graph to gain evaluation access to a learned and saved policy
    pi = xpo_agent_wrapper('pi')
    # Create episode generator
    traj_gen = traj_ep_generator(env=env,
                                 pi=pi,
                                 sample_or_mode=sample_or_mode,
                                 render=render)
    # Initialize and load the previously learned weights into the freshly re-built graph
    initialize()
    if exact_model_path is not None:
        load_model(exact_model_path)
        logger.info(
            "model loaded from exact path:\n  {}".format(exact_model_path))
    else:  # `exact_model_path` is None -> `model_ckpt_dir` is not None
        load_latest_checkpoint(model_ckpt_dir)
        logger.info("model loaded from ckpt dir:\n  {}".format(model_ckpt_dir))
    # Initialize the history data structures
    obs0 = []
    acs = []
    env_rews = []
    dones1 = []
    obs1 = []
    ep_env_rets = []
    ep_lens = []
    # Collect trajectories
    for i in range(num_trajs):
        logger.info("gathering [{}/{}]".format(i + 1, num_trajs))
        traj = traj_gen.__next__()
        # Next two steps are separated to shrink line length
        ep_obs0, ep_acs, ep_env_rews = traj['obs0'], traj['acs'], traj[
            'env_rews']
        ep_dones1, ep_obs1 = traj['dones1'], traj['obs1']
        ep_len, ep_env_ret = traj['ep_len'], traj['ep_env_ret']
        # Aggregate to the history data structures
        obs0.append(ep_obs0)
        acs.append(ep_acs)
        env_rews.append(ep_env_rews)
        dones1.append(ep_dones1)
        obs1.append(ep_obs1)
        ep_lens.append(ep_len)
        ep_env_rets.append(ep_env_ret)
    # Log some statistics of the collected trajectories
    sample_or_mode = 'sample' if sample_or_mode else 'mode'
    logger.info("action picking: {}".format(sample_or_mode))
    ep_len_mean = np.mean(ep_lens)
    ep_len_std = np.std(ep_lens)
    ep_env_ret_mean = np.mean(ep_env_rets)
    ep_env_ret_std = np.std(ep_env_rets)
    ep_env_ret_min = np.amin(ep_env_rets)
    ep_env_ret_max = np.amax(ep_env_rets)
    logger.record_tabular("ep_len_mean", ep_len_mean)
    logger.record_tabular("ep_len_std", ep_len_std)
    logger.record_tabular("ep_env_ret_mean", ep_env_ret_mean)
    logger.record_tabular("ep_env_ret_std", ep_env_ret_std)
    logger.record_tabular("ep_env_ret_min", ep_env_ret_min)
    logger.record_tabular("ep_env_ret_max", ep_env_ret_max)
    logger.dump_tabular()
    # Assemble the file name
    path = osp.join(demos_dir, "{}.{}".format(expert_arxiv_name,
                                              sample_or_mode))
    # Save the gathered data collections to the filesystem
    np.savez(path,
             obs0=np.array(obs0),
             acs=np.array(acs),
             env_rews=np.array(env_rews),
             dones1=np.array(dones1),
             obs1=np.array(obs1),
             ep_lens=np.array(ep_lens),
             ep_env_rets=np.array(ep_env_rets))
    logger.info("saving demonstrations")
    logger.info("  @: {}.npz".format(path))