Beispiel #1
0
    def dump_tabular(self):
        """
        Emit the diagnostics collected for the current iteration.

        On the process whose ``proc_id()`` is 0, a two-column key/value table
        is printed to stdout and, if an output file is open, one
        tab-separated row is appended to it (preceded by the header row the
        first time through). On every process the current row is then
        cleared and the first-row flag is reset.
        """
        if proc_id() == 0:
            # Key column is at least 15 characters wide, wider if any header
            # is longer than that.
            width = max(15, max(len(header) for header in self.log_headers))
            row_fmt = "| %" + str(width) + "s | %15s |"
            rule = "-" * (22 + width)

            row_vals = []
            print(rule)
            for header in self.log_headers:
                value = self.log_current_row.get(header, "")
                # Anything float-convertible is shown compactly; other
                # values (e.g. the "" placeholder) are printed as they are.
                if hasattr(value, "__float__"):
                    shown = "%8.3g" % value
                else:
                    shown = value
                print(row_fmt % (header, shown))
                row_vals.append(value)
            print(rule)

            sink = self.output_file
            if sink is not None:
                if self.first_row:
                    sink.write("\t".join(self.log_headers) + "\n")
                sink.write("\t".join(str(item) for item in row_vals) + "\n")
                sink.flush()
        self.log_current_row.clear()
        self.first_row = False
Beispiel #2
0
    def save_state(self, state_dict, model, itr=None):
        """
        Saves the state of an experiment.

        To be clear: this is about saving *state*, not logging diagnostics.
        All diagnostic logging is separate from this function. This function
        will save whatever is in ``state_dict``---usually just a copy of the
        environment---and the most recent model parameters.

        Call with any frequency you prefer. If you only want to maintain a
        single state and overwrite it at each call with the most recent
        version, leave ``itr=None``. If you want to keep all of the states you
        save, provide unique (increasing) values for 'itr'.

        Only the process whose ``proc_id()`` is 0 writes anything.

        Args:
            state_dict (dict): Dictionary containing essential elements to
                describe the current state of training. Written with
                ``joblib.dump`` to ``vars.pkl`` (or ``vars<itr>.pkl``) inside
                ``self.output_dir``.

            model: Object handed to ``self.torch_simple_save`` together with
                ``itr``.

            itr: An int, or None. Current iteration of training.
        """
        if proc_id() == 0:
            fname = 'vars.pkl' if itr is None else 'vars%d.pkl' % itr
            try:
                joblib.dump(state_dict, osp.join(self.output_dir, fname))
            except Exception:
                # Saving the state dict is best-effort: warn and carry on so
                # the model still gets saved. ``Exception`` (rather than a
                # bare ``except``) lets KeyboardInterrupt / SystemExit
                # propagate instead of being reported as a pickling failure.
                self.log('Warning: could not pickle state_dict.', color='red')
            self.torch_simple_save(model, itr)
Beispiel #3
0
    def save_config(self, config):
        """
        Record the configuration of an experiment.

        Intended to be called once at the start of a run with a dict of all
        relevant configuration variables. The dict is passed through
        ``convert_json``, tagged with ``exp_name`` when the logger has one,
        and then, on the process whose ``proc_id()`` is 0, printed and
        written as JSON to ``config.json`` in the output directory.

        Example use:

        .. code-block:: python

            logger = EpochLogger(**logger_kwargs)
            logger.save_config(locals())
        """
        serializable = convert_json(config)
        if self.exp_name is not None:
            serializable['exp_name'] = self.exp_name

        # Only the root process prints and writes the file.
        if proc_id() != 0:
            return

        rendered = json.dumps(
            serializable,
            separators=(',', ':\t'),
            indent=4,
            sort_keys=True,
        )
        print(colorize('Saving config:\n', color='cyan', bold=True))
        print(rendered)
        config_path = osp.join(self.output_dir, "config.json")
        with open(config_path, 'w') as out:
            out.write(rendered)
Beispiel #4
0
 def torch_simple_save(self, model, itr=None):
     """
     Save ``model`` with ``torch.save`` into the output directory.

     The file is named ``simple_save.pth`` when ``itr`` is None and
     ``simple_save<itr>.pth`` otherwise. Only the process whose
     ``proc_id()`` is 0 writes the file.
     """
     if proc_id() != 0:
         return
     suffix = '.pth' if itr is None else '%d.pth' % itr
     target = osp.join(self.output_dir, 'simple_save' + suffix)
     torch.save(model, target)
Beispiel #5
0
    def __init__(self,
                 output_dir=None,
                 output_fname='progress.txt',
                 exp_name=None):
        """
        Set up a Logger.

        On the process whose ``proc_id()`` is 0 the output directory is
        created if it does not exist yet (a warning is printed if it does),
        the metrics file is opened for writing, and its ``close`` is
        registered with ``atexit``. On every other process ``output_dir``
        and ``output_file`` are set to ``None``.

        Args:
            output_dir (string): A directory for saving results to. If
                falsy, defaults to ``/tmp/experiments/<int(time.time())>``.

            output_fname (string): Name for the tab-separated-value file
                containing metrics logged throughout a training run.
                Defaults to ``progress.txt``.

            exp_name (string): Experiment name. If you run multiple training
                runs and give them all the same ``exp_name``, the plotter
                will know to group them. (Use case: if you run the same
                hyperparameter configuration with multiple random seeds, you
                should give them all the same ``exp_name``.)
        """
        if proc_id() != 0:
            # Non-root processes never touch the filesystem.
            self.output_dir = None
            self.output_file = None
        else:
            if output_dir:
                self.output_dir = output_dir
            else:
                self.output_dir = "/tmp/experiments/%i" % int(time.time())

            if not osp.exists(self.output_dir):
                os.makedirs(self.output_dir)
            else:
                print(
                    "Warning: Log dir %s already exists! Storing info there anyway."
                    % self.output_dir)

            log_path = osp.join(self.output_dir, output_fname)
            self.output_file = open(log_path, 'w')
            # Make sure the metrics file is closed at interpreter exit.
            atexit.register(self.output_file.close)
            banner = "Logging data to %s" % self.output_file.name
            print(colorize(banner, 'green', bold=True))

        self.first_row = True
        self.log_headers = []
        self.log_current_row = {}
        self.exp_name = exp_name
Beispiel #6
0
 def log(self, msg, color='green'):
     """Print ``msg`` to stdout in bold ``color``, on process 0 only."""
     if proc_id() != 0:
         return
     styled = colorize(msg, color, bold=True)
     print(styled)
Beispiel #7
0
def trpo(env_fn,
         actor_critic=core.Actor_Critic,
         ac_kwargs=dict(),
         seed=0,
         steps_per_epoch=4000,
         epochs=50,
         gamma=0.99,
         delta=0.01,
         vf_lr=1e-3,
         train_v_iters=80,
         damping_coeff=0.1,
         cg_iters=10,
         backtrack_iters=10,
         backtrack_coeff=0.8,
         lam=0.97,
         max_ep_len=1000,
         logger_kwargs=dict(),
         save_freq=10,
         algo='trpo'):
    """
    Train an agent with TRPO (or NPG) using a PyTorch actor-critic.

    Each epoch collects ``steps_per_epoch / num_procs()`` environment steps
    on this process, performs one policy update (conjugate gradient step,
    plus a backtracking line search for TRPO), fits the value function for
    ``train_v_iters`` steps, and dumps a row of diagnostics.

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A callable invoked as
            ``actor_critic(obs_dim[0], **ac_kwargs)`` (with
            ``action_space`` added to the kwargs). The returned object is
            used as follows in this function:

            * ``parameters()``, ``train()`` and ``eval()`` are called on it;
            * ``actor`` and ``critic`` attributes each expose
              ``parameters()``;
            * ``apply_actor(x, a, old_logp_or_mu=...)`` returns a 5-tuple
              whose second entry is used as the log-probability of ``a``
              and whose last entry is used as the mean KL divergence;
            * ``apply_actor(x)`` returns a 5-tuple whose first, third and
              fourth entries are used as the sampled action, its
              log-probability and the info stored in the buffer;
            * ``apply_critic(x)`` returns the value estimate.

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic
            function you provided to TRPO. The dict is copied before
            ``action_space`` is added, so the caller's dict is not modified.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs)
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        delta (float): KL-divergence limit for TRPO / NPG update.
            (Should be small for stability. Values like 0.01, 0.05.)

        vf_lr (float): Learning rate for value function optimizer.

        train_v_iters (int): Number of gradient descent steps to take on
            value function per epoch.

        damping_coeff (float): Artifact for numerical stability, should be
            smallish. Adjusts Hessian-vector product calculation:

            .. math:: Hv \\rightarrow (\\alpha I + H)v

            where :math:`\\alpha` is the damping coefficient.
            Probably don't play with this hyperparameter.

        cg_iters (int): Number of iterations of conjugate gradient to perform.
            Increasing this will lead to a more accurate approximation
            to :math:`H^{-1} g`, and possibly slightly-improved performance,
            but at the cost of slowing things down.

            Also probably don't play with this hyperparameter.

        backtrack_iters (int): Maximum number of steps allowed in the
            backtracking line search. Since the line search usually doesn't
            backtrack, and usually only steps back once when it does, this
            hyperparameter doesn't often matter.

        backtrack_coeff (float): How far back to step during backtracking line
            search. (Always between 0 and 1, usually above 0.5.)

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

        algo: Either 'trpo' or 'npg': this code supports both, since they are
            almost the same.

    Raises:
        ValueError: If ``algo`` is neither ``'trpo'`` nor ``'npg'``.

    """
    # Fail fast: with any other value the update step would only blow up
    # after a full epoch of data collection, on an unbound ``kl``.
    if algo not in ('trpo', 'npg'):
        raise ValueError("algo must be 'trpo' or 'npg', got %r" % (algo,))

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # Give every process its own seed.
    seed += 10000 * proc_id()
    torch.manual_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Share information about action space with policy architecture.
    # Work on a copy so that neither the caller's dict nor the shared
    # default argument is mutated.
    ac_kwargs = dict(ac_kwargs)
    ac_kwargs['action_space'] = env.action_space

    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    if isinstance(env.action_space, Discrete):
        info_shapes = {'logp_all': [env.action_space.n]}
    else:
        info_shapes = {'mu': [env.action_space.shape[0]]}
    buf = GAEBuffer(obs_dim, act_dim, local_steps_per_epoch, info_shapes,
                    gamma, lam)

    # make core of policy network
    net = actor_critic(obs_dim[0], **ac_kwargs)
    print(net)

    # loss function
    criterion_mse = nn.MSELoss()

    # optim (only the critic uses an optimizer; the actor is updated by
    # writing parameter vectors directly in ``update``)
    optimizer_critic = optim.Adam(net.critic.parameters(), lr=vf_lr)

    # Sync params across processes
    sync_all_params(net.parameters())

    def cg(Ax, b):
        """
        Conjugate gradient algorithm
        (see https://en.wikipedia.org/wiki/Conjugate_gradient_method)

        ``Ax`` is a callable computing the matrix-vector product and ``b``
        is the right-hand side; runs ``cg_iters`` iterations starting
        from zero and returns the resulting ``x``.
        """
        x = np.zeros_like(b)
        # Note: should be 'b - Ax(x)', but for x=0, Ax(x)=0. Change if doing
        # warm start.
        r = b.copy()
        p = r.copy()
        r_dot_old = np.dot(r, r)
        for _ in range(cg_iters):
            z = Ax(p)
            alpha = r_dot_old / (np.dot(p, z) + EPS)
            x += alpha * p
            r -= alpha * z
            r_dot_new = np.dot(r, r)
            p = r + (r_dot_new / r_dot_old) * p
            r_dot_old = r_dot_new
        return x

    def update():
        """Run one policy update and ``train_v_iters`` value-function steps."""
        net.train()
        inputs = [torch.from_numpy(x) for x in buf.get()]

        # First five buffer outputs are obs, actions, advantages, returns and
        # old log-probs; the last one is passed on as the old policy info.
        x_ph, a_ph, adv_ph, ret_ph, logp_old_ph = inputs[:5]
        _, logp, _, _, d_kl = net.apply_actor(x_ph,
                                              a_ph,
                                              old_logp_or_mu=inputs[-1])
        v = net.apply_critic(x_ph)

        # TRPO losses
        ratio = torch.exp(logp - logp_old_ph)  # pi(a|s) / pi_old(a|s)
        pi_l_old = -torch.mean(ratio * adv_ph)
        v_l_old = criterion_mse(v, ret_ph)

        # Prepare hessian func, gradient eval
        g = core.flat_grad(pi_l_old, net.actor.parameters(), retain_graph=True)
        g = mpi_avg(g.numpy())
        pi_l_old = mpi_avg(pi_l_old.item())

        def Hx(x):
            """Damped Hessian-vector product of the KL, averaged over procs."""
            x = torch.from_numpy(x)
            hvp = core.hessian_vector_product(d_kl, net.actor, x)
            if damping_coeff > 0:
                hvp += damping_coeff * x
            return mpi_avg(hvp.detach().numpy())

        # Core calculations for TRPO or NPG
        x = cg(Hx, g)
        alpha = np.sqrt(2 * delta / (np.dot(x, Hx(x)) + EPS))
        old_params = parameters_to_vector(net.actor.parameters())

        x = torch.from_numpy(x)

        def set_and_eval(step):
            """Write the stepped actor parameters, return (KL, policy loss)."""
            vector_to_parameters(old_params - alpha * x * step,
                                 net.actor.parameters())
            _, logp, _, _, d_kl = net.apply_actor(x_ph,
                                                  a_ph,
                                                  old_logp_or_mu=inputs[-1])
            ratio = torch.exp(logp - logp_old_ph)  # pi(a|s) / pi_old(a|s)
            pi_loss = -torch.mean(ratio * adv_ph)
            return mpi_avg(d_kl.item()), mpi_avg(pi_loss.item())

        if algo == 'npg':
            # npg has no backtracking or hard kl constraint enforcement
            kl, pi_l_new = set_and_eval(step=1.)

        elif algo == 'trpo':
            # trpo augments npg with backtracking line search, hard kl
            for j in range(backtrack_iters):
                kl, pi_l_new = set_and_eval(step=backtrack_coeff**j)
                if kl <= delta and pi_l_new <= pi_l_old:
                    logger.log(
                        'Accepting new params at step %d of line search.' % j)
                    logger.store(BacktrackIters=j)
                    break

                if j == backtrack_iters - 1:
                    logger.log('Line search failed! Keeping old params.')
                    logger.store(BacktrackIters=j)
                    # step=0 restores the old parameters.
                    kl, pi_l_new = set_and_eval(step=0.)

        # Value function learning
        for _ in range(train_v_iters):
            v = net.apply_critic(x_ph)
            v_loss = criterion_mse(v, ret_ph)

            optimizer_critic.zero_grad()
            v_loss.backward()
            average_gradients(optimizer_critic.param_groups)
            optimizer_critic.step()

        # Re-evaluate the critic after training to measure the loss change
        with torch.no_grad():
            net.eval()
            v = net.apply_critic(x_ph)
        v_l_new = criterion_mse(v, ret_ph)

        # Log changes from update
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old.item(),
                     KL=kl,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old).item())

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            x_ph = torch.from_numpy(o[np.newaxis].astype(np.float32))
            with torch.no_grad():
                net.eval()
                a, _, logp_t, info_t, _ = net.apply_actor(x_ph)
                v_t = net.apply_critic(x_ph)

            # save and log
            a = a.numpy()[0]
            v_t = v_t.data.numpy()
            logp_t = logp_t.data.numpy()
            # Note: ``r`` stored here is the reward returned by the previous
            # env.step (0 right after a reset).
            buf.store(o, a, r, v_t, logp_t, info_t)
            logger.store(VVals=v_t)

            o, r, d, _ = env.step(a)
            ep_ret += r
            ep_len += 1

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                if not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.' %
                          ep_len)
                # If the environment reported done, the last reward is used
                # as the final value; otherwise bootstrap from the critic.
                # NOTE(review): v_t was computed for the observation *before*
                # this step, not for the new ``o`` -- confirm whether the
                # critic should be re-evaluated on ``o`` here.
                last_val = r if d else v_t

                buf.finish_path(last_val)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, net, None)

        # Perform TRPO or NPG update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('KL', average_only=True)
        if algo == 'trpo':
            logger.log_tabular('BacktrackIters', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
Beispiel #8
0
def vpg(env_fn,
        actor_critic=core.Actor_Critic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=4000,
        epochs=50,
        gamma=0.99,
        pi_lr=1e-3,
        vf_lr=1e-3,
        train_v_iters=80,
        lam=0.97,
        max_ep_len=1000,
        logger_kwargs=dict(),
        save_freq=10):
    """
    Train an agent with Vanilla Policy Gradient using a PyTorch actor-critic.

    Each epoch collects ``steps_per_epoch / num_procs()`` environment steps
    on this process, takes one gradient step on the policy, fits the value
    function for ``train_v_iters`` steps, and dumps a row of diagnostics.

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A callable invoked as
            ``actor_critic(obs_dim[0], **ac_kwargs)`` (with
            ``action_space`` added to the kwargs). The returned object is
            used as follows in this function:

            * ``parameters()``, ``train()`` and ``eval()`` are called on it;
            * ``actor`` and ``critic`` attributes each expose
              ``parameters()``;
            * ``apply_actor(x, a)`` returns a 3-tuple whose second entry is
              used as the log-probability of ``a``;
            * ``apply_actor(x)`` returns a 3-tuple whose first and third
              entries are used as the sampled action and its
              log-probability;
            * ``apply_critic(x)`` returns the value estimate.

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic
            function you provided to VPG. The dict is copied before
            ``action_space`` is added, so the caller's dict is not modified.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs)
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_v_iters (int): Number of gradient descent steps to take on
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # Give every process its own seed.
    seed += 10000 * proc_id()
    torch.manual_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Share information about action space with policy architecture.
    # Work on a copy so that neither the caller's dict nor the shared
    # default argument is mutated.
    ac_kwargs = dict(ac_kwargs)
    ac_kwargs['action_space'] = env.action_space

    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = VPGBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # make core of policy network
    net = actor_critic(obs_dim[0], **ac_kwargs)
    print(net)

    # loss function
    criterion_mse = nn.MSELoss()

    # optim
    optimizer_actor = optim.Adam(net.actor.parameters(), lr=pi_lr)
    optimizer_critic = optim.Adam(net.critic.parameters(), lr=vf_lr)

    # Sync params across processes
    sync_all_params(net.parameters())

    def update():
        """Run one policy gradient step and ``train_v_iters`` critic steps."""
        obs_buf, act_buf, adv_buf, ret_buf, logp_buf = buf.get()
        net.train()

        # update actor
        x_ph = torch.from_numpy(obs_buf)
        a_ph = torch.from_numpy(act_buf)
        adv_ph = torch.from_numpy(adv_buf)
        _, logp, _ = net.apply_actor(x_ph, a_ph)
        pi_loss = -torch.mean(logp * adv_ph)

        optimizer_actor.zero_grad()
        pi_loss.backward()
        average_gradients(optimizer_actor.param_groups)
        optimizer_actor.step()

        # pi_loss was computed before the optimizer step, so it is the
        # pre-update policy loss.
        pi_l_old = pi_loss
        # a sample estimate for entropy, also easy to compute
        approx_ent = torch.mean(-logp)

        # Value function learning
        ret_ph = torch.from_numpy(ret_buf)
        v_ph = net.apply_critic(x_ph)
        v_l_old = criterion_mse(v_ph, ret_ph)
        for _ in range(train_v_iters):
            v_ph = net.apply_critic(x_ph)
            v_loss = criterion_mse(v_ph, ret_ph)

            optimizer_critic.zero_grad()
            v_loss.backward()
            average_gradients(optimizer_critic.param_groups)
            optimizer_critic.step()

        # Re-evaluate actor and critic after the update to measure changes
        with torch.no_grad():
            net.eval()
            _, logp, _ = net.apply_actor(x_ph, a_ph)
            v_ph = net.apply_critic(x_ph)
        pi_l_new = -torch.mean(logp * adv_ph)
        v_l_new = criterion_mse(v_ph, ret_ph)

        # Info (useful to watch during learning)
        logp_old_ph = torch.from_numpy(logp_buf)
        # a sample estimate for KL-divergence, easy to compute
        approx_kl = torch.mean(logp_old_ph - logp)

        logger.store(LossPi=pi_l_old.item(),
                     LossV=v_l_old.item(),
                     KL=approx_kl.item(),
                     Entropy=approx_ent.item(),
                     DeltaLossPi=(pi_l_new - pi_l_old).item(),
                     DeltaLossV=(v_l_new - v_l_old).item())

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            x_ph = torch.from_numpy(o[np.newaxis].astype(np.float32))
            with torch.no_grad():
                net.eval()
                a, _, logp_t = net.apply_actor(x_ph)
                v_t = net.apply_critic(x_ph)

            # save and log
            a = a.numpy()[0]
            v_t = v_t.data.numpy()
            logp_t = logp_t.data.numpy()
            # Note: ``r`` stored here is the reward returned by the previous
            # env.step (0 right after a reset).
            buf.store(o, a, r, v_t, logp_t)
            logger.store(VVals=v_t)

            o, r, d, _ = env.step(a)
            ep_ret += r
            ep_len += 1

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                if not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.' %
                          ep_len)
                # If the environment reported done, the last reward is used
                # as the final value; otherwise bootstrap from the critic.
                # NOTE(review): v_t was computed for the observation *before*
                # this step, not for the new ``o`` -- confirm whether the
                # critic should be re-evaluated on ``o`` here.
                last_val = r if d else v_t

                buf.finish_path(last_val)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, net, None)

        # Perform VPG update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()