def learn(self,
              total_timesteps,
              callback=None,
              log_interval=100,
              tb_log_name="PPO1",
              reset_num_timesteps=True):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)
        callback = self._init_callback(callback)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn()

            assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the PPO1 model must be " \
                                                               "an instance of common.policies.ActorCriticPolicy."

            with self.sess.as_default():
                self.adam.sync()
                callback.on_training_start(locals(), globals())

                # Prepare for rollouts
                seg_gen = traj_segment_generator(self.policy_pi,
                                                 self.env,
                                                 self.timesteps_per_actorbatch,
                                                 callback=callback)

                episodes_so_far = 0
                timesteps_so_far = 0
                iters_so_far = 0
                t_start = time.time()

                # rolling buffer for episode lengths
                len_buffer = deque(maxlen=100)
                # rolling buffer for episode rewards
                reward_buffer = deque(maxlen=100)

                while True:
                    if timesteps_so_far >= total_timesteps:
                        break

                    if self.schedule == 'constant':
                        cur_lrmult = 1.0
                    elif self.schedule == 'linear':
                        cur_lrmult = max(
                            1.0 - float(timesteps_so_far) / total_timesteps, 0)
                    else:
                        raise NotImplementedError

                    logger.log("********** Iteration %i ************" %
                               iters_so_far)

                    seg = seg_gen.__next__()

                    # Stop training early (triggered by the callback)
                    if not seg.get('continue_training', True):  # pytype: disable=attribute-error
                        break

                    add_vtarg_and_adv(seg, self.gamma, self.lam)

                    # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
                    observations, actions = seg["observations"], seg["actions"]
                    atarg, tdlamret = seg["adv"], seg["tdlamret"]

                    # true_rew is the reward without discount
                    if writer is not None:
                        total_episode_reward_logger(
                            self.episode_reward, seg["true_rewards"].reshape(
                                (self.n_envs, -1)), seg["dones"].reshape(
                                    (self.n_envs, -1)), writer,
                            self.num_timesteps)

                    # predicted value function before udpate
                    vpredbefore = seg["vpred"]

                    # standardized advantage function estimate
                    atarg = (atarg - atarg.mean()) / atarg.std()
                    dataset = Dataset(dict(ob=observations,
                                           ac=actions,
                                           atarg=atarg,
                                           vtarg=tdlamret),
                                      shuffle=not self.policy.recurrent)
                    optim_batchsize = self.optim_batchsize or observations.shape[
                        0]

                    # set old parameter values to new parameter values
                    self.assign_old_eq_new(sess=self.sess)
                    logger.log("Optimizing...")
                    logger.log(fmt_row(13, self.loss_names))

                    # Here we do a bunch of optimization epochs over the data
                    for k in range(self.optim_epochs):
                        # list of tuples, each of which gives the loss for a minibatch
                        losses = []
                        for i, batch in enumerate(
                                dataset.iterate_once(optim_batchsize)):
                            steps = (
                                self.num_timesteps + k * optim_batchsize +
                                int(i *
                                    (optim_batchsize / len(dataset.data_map))))
                            if writer is not None:
                                # run loss backprop with summary, but once every 10 runs save the metadata
                                # (memory, compute time, ...)
                                if self.full_tensorboard_log and (1 +
                                                                  k) % 10 == 0:
                                    run_options = tf.compat.v1.RunOptions(
                                        trace_level=tf.compat.v1.RunOptions.
                                        FULL_TRACE)
                                    run_metadata = tf.compat.v1.RunMetadata()
                                    summary, grad, *newlosses = self.lossandgrad(
                                        batch["ob"],
                                        batch["ob"],
                                        batch["ac"],
                                        batch["atarg"],
                                        batch["vtarg"],
                                        cur_lrmult,
                                        sess=self.sess,
                                        options=run_options,
                                        run_metadata=run_metadata)
                                    writer.add_run_metadata(
                                        run_metadata, 'step%d' % steps)
                                else:
                                    summary, grad, *newlosses = self.lossandgrad(
                                        batch["ob"],
                                        batch["ob"],
                                        batch["ac"],
                                        batch["atarg"],
                                        batch["vtarg"],
                                        cur_lrmult,
                                        sess=self.sess)
                                writer.add_summary(summary, steps)
                            else:
                                _, grad, *newlosses = self.lossandgrad(
                                    batch["ob"],
                                    batch["ob"],
                                    batch["ac"],
                                    batch["atarg"],
                                    batch["vtarg"],
                                    cur_lrmult,
                                    sess=self.sess)

                            self.adam.update(grad,
                                             self.optim_stepsize * cur_lrmult)
                            losses.append(newlosses)
                        logger.log(fmt_row(13, np.mean(losses, axis=0)))

                    logger.log("Evaluating losses...")
                    losses = []
                    for batch in dataset.iterate_once(optim_batchsize):
                        newlosses = self.compute_losses(batch["ob"],
                                                        batch["ob"],
                                                        batch["ac"],
                                                        batch["atarg"],
                                                        batch["vtarg"],
                                                        cur_lrmult,
                                                        sess=self.sess)
                        losses.append(newlosses)
                    mean_losses, _, _ = mpi_moments(losses, axis=0)
                    logger.log(fmt_row(13, mean_losses))
                    for (loss_val, name) in zipsame(mean_losses,
                                                    self.loss_names):
                        logger.record_tabular("loss_" + name, loss_val)
                    logger.record_tabular(
                        "ev_tdlam_before",
                        explained_variance(vpredbefore, tdlamret))

                    # local values
                    lrlocal = (seg["ep_lens"], seg["ep_rets"])

                    # list of tuples
                    listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)
                    lens, rews = map(flatten_lists, zip(*listoflrpairs))
                    len_buffer.extend(lens)
                    reward_buffer.extend(rews)
                    if len(len_buffer) > 0:
                        logger.record_tabular("EpLenMean", np.mean(len_buffer))
                        logger.record_tabular("EpRewMean",
                                              np.mean(reward_buffer))
                    logger.record_tabular("EpThisIter", len(lens))
                    episodes_so_far += len(lens)
                    current_it_timesteps = MPI.COMM_WORLD.allreduce(
                        seg["total_timestep"])
                    timesteps_so_far += current_it_timesteps
                    self.num_timesteps += current_it_timesteps
                    iters_so_far += 1
                    logger.record_tabular("EpisodesSoFar", episodes_so_far)
                    logger.record_tabular("TimestepsSoFar", self.num_timesteps)
                    logger.record_tabular("TimeElapsed", time.time() - t_start)
                    if self.verbose >= 1 and MPI.COMM_WORLD.Get_rank() == 0:
                        logger.dump_tabular()
        callback.on_training_end()
        return self
Example #2
0
def learn(
        env,
        policy_func,
        *,
        timesteps_per_batch,  # timesteps per actor per update
        clip_param,
        entcoeff,  # clipping parameter epsilon, entropy coeff
        optim_epochs,
        optim_stepsize,
        optim_batchsize,  # optimization hypers
        gamma,
        lam,  # advantage estimation
        max_timesteps=0,
        max_episodes=0,
        max_iters=0,
        max_seconds=0,  # time constraint
        callback=None,  # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
        num_options=2,
        app='',
        saves=False,
        wsaves=False,
        epoch=-1,
        seed=1,
        dc=0):

    optim_batchsize_ideal = optim_batchsize
    np.random.seed(seed)
    tf.set_random_seed(seed)

    # env._seed(seed)

    gamename = env.spec.id[:-3].lower()
    gamename += 'seed' + str(seed)
    gamename += app

    dirname = '{}_{}opts_saves/'.format(gamename, num_options)

    if wsaves:
        first = True
        if not os.path.exists(dirname):
            os.makedirs(dirname)
            first = False
        # while os.path.exists(dirname) and first:
        #     dirname += '0'

        files = ['pposgd_simple.py', 'mlp_policy.py', 'run_main.py']
        for i in range(len(files)):
            src = os.path.expanduser('~/baselines/baselines/ppo1/') + files[i]
            dest = os.path.expanduser('~/baselines/baselines/ppo1/') + dirname
            shutil.copy2(src, dest)

    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space,
                     ac_space)  # Construct network for new policy
    oldpi = policy_func("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    # option = tf.placeholder(dtype=tf.int32, shape=[None])

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed cliping parameter epislon

    # pdb.set_trace()
    ob = U.get_placeholder_cached(name="ob")
    option = U.get_placeholder_cached(name="option")
    term_adv = U.get_placeholder(name='term_adv',
                                 dtype=tf.float32,
                                 shape=[None])

    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param,
                             1.0 + clip_param) * atarg  #
    pol_surr = -tf.reduce_mean(tf.minimum(
        surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)

    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    term_loss = pi.tpred * term_adv

    log_pi = tf.log(tf.clip_by_value(pi.op_pi, 1e-20, 1.0))
    entropy = -tf.reduce_sum(pi.op_pi * log_pi, reduction_indices=1)
    op_loss = -tf.reduce_sum(log_pi[0][option[0]] * atarg + entropy * 0.1)

    total_loss += op_loss

    var_list = pi.get_trainable_variables()
    term_list = var_list[6:8]

    lossandgrad = U.function([ob, ac, atarg, ret, lrmult, option, term_adv],
                             losses + [U.flatgrad(total_loss, var_list)])
    termloss = U.function([ob, option, term_adv],
                          [U.flatgrad(term_loss, var_list)
                           ])  # Since we will use a different step size.
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])

    compute_losses = U.function([ob, ac, atarg, ret, lrmult, option], losses)
    U.initialize()
    adam.sync()
    saver = tf.train.Saver(max_to_keep=10000)

    results = []
    if saves:
        results = open(
            gamename + '_' + str(num_options) + 'opts_' + '_results.csv', 'w')

        out = 'epoch,avg_reward'

        for opt in range(num_options):
            out += ',option {} dur'.format(opt)
        for opt in range(num_options):
            out += ',option {} std'.format(opt)
        for opt in range(num_options):
            out += ',option {} term'.format(opt)
        for opt in range(num_options):
            out += ',option {} adv'.format(opt)

        out += '\n'

        results.write(out)

        # results.write('epoch,avg_reward,option 1 dur, option 2 dur, option 1 term, option 2 term\n')
        results.flush()

    if epoch >= 0:
        dirname = '{}_{}opts_saves/'.format(gamename, num_options)
        print("Loading weights from iteration: " + str(epoch))

        filename = dirname + '{}_epoch_{}.ckpt'.format(gamename, epoch)
        saver.restore(U.get_session(), filename)

    episodes_so_far = 0
    timesteps_so_far = 0
    global iters_so_far
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0,
         max_seconds > 0]) == 1, "Only one time constraint permitted"

    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_batch,
                                     stochastic=True,
                                     num_options=num_options,
                                     saves=saves,
                                     results=results,
                                     rewbuffer=rewbuffer,
                                     dc=dc)

    datas = [0 for _ in range(num_options)]

    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)

        seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        opt_d = []
        for i in range(num_options):
            dur = np.mean(
                seg['opt_dur'][i]) if len(seg['opt_dur'][i]) > 0 else 0.
            opt_d.append(dur)

        std = []
        for i in range(num_options):
            logstd = np.mean(
                seg['logstds'][i]) if len(seg['logstds'][i]) > 0 else 0.
            std.append(np.exp(logstd))

        print("mean opt dur:", opt_d)
        print("mean op pol:", np.mean(np.array(seg['optpol_p']), axis=0))
        print("mean term p:", np.mean(np.array(seg['term_p']), axis=0))
        print("mean value val:", np.mean(np.array(seg['value_val']), axis=0))
        ob, ac, opts, atarg, tdlamret = seg["ob"], seg["ac"], seg["opts"], seg[
            "adv"], seg["tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before udpate
        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy
        assign_old_eq_new()  # set old parameter values to new parameter values

        if iters_so_far % 5 == 0 and wsaves:
            print("weights are saved...")
            filename = dirname + '{}_epoch_{}.ckpt'.format(
                gamename, iters_so_far)
            save_path = saver.save(U.get_session(), filename)

        min_batch = 160
        t_advs = [[] for _ in range(num_options)]
        for opt in range(num_options):
            indices = np.where(opts == opt)[0]
            print("batch size:", indices.size)
            opt_d[opt] = indices.size
            if not indices.size:
                t_advs[opt].append(0.)
                continue

            # This part is only necessasry when we use options.
            # We proceed to these verifications in order not to discard any collected trajectories.
            if datas[opt] != 0:
                if (indices.size < min_batch and datas[opt].n > min_batch):
                    datas[opt] = Dataset(dict(ob=ob[indices],
                                              ac=ac[indices],
                                              atarg=atarg[indices],
                                              vtarg=tdlamret[indices]),
                                         shuffle=not pi.recurrent)
                    t_advs[opt].append(0.)
                    continue

                elif indices.size + datas[opt].n < min_batch:
                    # pdb.set_trace()
                    oldmap = datas[opt].data_map

                    cat_ob = np.concatenate((oldmap['ob'], ob[indices]))
                    cat_ac = np.concatenate((oldmap['ac'], ac[indices]))
                    cat_atarg = np.concatenate(
                        (oldmap['atarg'], atarg[indices]))
                    cat_vtarg = np.concatenate(
                        (oldmap['vtarg'], tdlamret[indices]))
                    datas[opt] = Dataset(dict(ob=cat_ob,
                                              ac=cat_ac,
                                              atarg=cat_atarg,
                                              vtarg=cat_vtarg),
                                         shuffle=not pi.recurrent)
                    t_advs[opt].append(0.)
                    continue

                elif (indices.size + datas[opt].n > min_batch and datas[opt].n
                      < min_batch) or (indices.size > min_batch
                                       and datas[opt].n < min_batch):

                    oldmap = datas[opt].data_map
                    cat_ob = np.concatenate((oldmap['ob'], ob[indices]))
                    cat_ac = np.concatenate((oldmap['ac'], ac[indices]))
                    cat_atarg = np.concatenate(
                        (oldmap['atarg'], atarg[indices]))
                    cat_vtarg = np.concatenate(
                        (oldmap['vtarg'], tdlamret[indices]))
                    datas[opt] = d = Dataset(dict(ob=cat_ob,
                                                  ac=cat_ac,
                                                  atarg=cat_atarg,
                                                  vtarg=cat_vtarg),
                                             shuffle=not pi.recurrent)

                if (indices.size > min_batch and datas[opt].n > min_batch):
                    datas[opt] = d = Dataset(dict(ob=ob[indices],
                                                  ac=ac[indices],
                                                  atarg=atarg[indices],
                                                  vtarg=tdlamret[indices]),
                                             shuffle=not pi.recurrent)

            elif datas[opt] == 0:
                datas[opt] = d = Dataset(dict(ob=ob[indices],
                                              ac=ac[indices],
                                              atarg=atarg[indices],
                                              vtarg=tdlamret[indices]),
                                         shuffle=not pi.recurrent)

            optim_batchsize = optim_batchsize or ob.shape[0]
            optim_epochs = np.clip(
                np.int(10 * (indices.size /
                             (timesteps_per_batch / num_options))), 10,
                10) if num_options > 1 else optim_epochs
            print("optim epochs:", optim_epochs)
            logger.log("Optimizing...")

            # Here we do a bunch of optimization epochs over the data
            for _ in range(optim_epochs):
                losses = [
                ]  # list of tuples, each of which gives the loss for a minibatch
                for batch in d.iterate_once(optim_batchsize):

                    tadv, nodc_adv = pi.get_term_adv(batch["ob"], [opt])
                    tadv = tadv if num_options > 1 else np.zeros_like(tadv)
                    t_advs[opt].append(nodc_adv)

                    *newlosses, grads = lossandgrad(batch["ob"], batch["ac"],
                                                    batch["atarg"],
                                                    batch["vtarg"], cur_lrmult,
                                                    [opt], tadv)
                    termg = termloss(batch["ob"], [opt], tadv)
                    adam.update(termg[0], 5e-7 * cur_lrmult)
                    adam.update(grads, optim_stepsize * cur_lrmult)
                    losses.append(newlosses)

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()

        if saves:
            out = "{},{}"
            for _ in range(num_options):
                out += ",{},{},{},{}"
            out += "\n"

            info = [iters_so_far, np.mean(rewbuffer)]
            for i in range(num_options):
                info.append(opt_d[i])
            for i in range(num_options):
                info.append(std[i])
            for i in range(num_options):
                info.append(np.mean(np.array(seg['term_p']), axis=0)[i])
            for i in range(num_options):
                info.append(np.mean(t_advs[i]))

            results.write(out.format(*info))
            results.flush()
    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=100):
        with SetVerbosity(self.verbose):
            self._setup_learn(seed)

            with self.sess.as_default():
                self.adam.sync()

                # Prepare for rollouts
                seg_gen = traj_segment_generator(self.policy_pi, self.env,
                                                 self.timesteps_per_actorbatch)

                episodes_so_far = 0
                timesteps_so_far = 0
                iters_so_far = 0
                t_start = time.time()

                # rolling buffer for episode lengths
                lenbuffer = deque(maxlen=100)
                # rolling buffer for episode rewards
                rewbuffer = deque(maxlen=100)

                while True:
                    if callback:
                        callback(locals(), globals())
                    if total_timesteps and timesteps_so_far >= total_timesteps:
                        break

                    if self.schedule == 'constant':
                        cur_lrmult = 1.0
                    elif self.schedule == 'linear':
                        cur_lrmult = max(
                            1.0 - float(timesteps_so_far) / total_timesteps, 0)
                    else:
                        raise NotImplementedError

                    logger.log("********** Iteration %i ************" %
                               iters_so_far)

                    seg = seg_gen.__next__()
                    add_vtarg_and_adv(seg, self.gamma, self.lam)

                    # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
                    obs_ph, action_ph, atarg, tdlamret = seg["ob"], seg[
                        "ac"], seg["adv"], seg["tdlamret"]

                    # predicted value function before udpate
                    vpredbefore = seg["vpred"]

                    # standardized advantage function estimate
                    atarg = (atarg - atarg.mean()) / atarg.std()
                    dataset = Dataset(
                        dict(ob=obs_ph,
                             ac=action_ph,
                             atarg=atarg,
                             vtarg=tdlamret),
                        shuffle=not issubclass(self.policy, LstmPolicy))
                    optim_batchsize = self.optim_batchsize or obs_ph.shape[0]

                    # set old parameter values to new parameter values
                    self.assign_old_eq_new(sess=self.sess)
                    logger.log("Optimizing...")
                    logger.log(fmt_row(13, self.loss_names))

                    # Here we do a bunch of optimization epochs over the data
                    for _ in range(self.optim_epochs):
                        # list of tuples, each of which gives the loss for a minibatch
                        losses = []
                        for batch in dataset.iterate_once(optim_batchsize):
                            *newlosses, grad = self.lossandgrad(batch["ob"],
                                                                batch["ob"],
                                                                batch["ac"],
                                                                batch["atarg"],
                                                                batch["vtarg"],
                                                                cur_lrmult,
                                                                sess=self.sess)
                            self.adam.update(grad,
                                             self.optim_stepsize * cur_lrmult)
                            losses.append(newlosses)
                        logger.log(fmt_row(13, np.mean(losses, axis=0)))

                    logger.log("Evaluating losses...")
                    losses = []
                    for batch in dataset.iterate_once(optim_batchsize):
                        newlosses = self.compute_losses(batch["ob"],
                                                        batch["ob"],
                                                        batch["ac"],
                                                        batch["atarg"],
                                                        batch["vtarg"],
                                                        cur_lrmult,
                                                        sess=self.sess)
                        losses.append(newlosses)
                    mean_losses, _, _ = mpi_moments(losses, axis=0)
                    logger.log(fmt_row(13, mean_losses))
                    for (loss_val, name) in zipsame(mean_losses,
                                                    self.loss_names):
                        logger.record_tabular("loss_" + name, loss_val)
                    logger.record_tabular(
                        "ev_tdlam_before",
                        explained_variance(vpredbefore, tdlamret))

                    # local values
                    lrlocal = (seg["ep_lens"], seg["ep_rets"])

                    # list of tuples
                    listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)
                    lens, rews = map(flatten_lists, zip(*listoflrpairs))
                    lenbuffer.extend(lens)
                    rewbuffer.extend(rews)
                    logger.record_tabular("EpLenMean", np.mean(lenbuffer))
                    logger.record_tabular("EpRewMean", np.mean(rewbuffer))
                    logger.record_tabular("EpThisIter", len(lens))
                    episodes_so_far += len(lens)
                    timesteps_so_far += seg["total_timestep"]
                    iters_so_far += 1
                    logger.record_tabular("EpisodesSoFar", episodes_so_far)
                    logger.record_tabular("TimestepsSoFar", timesteps_so_far)
                    logger.record_tabular("TimeElapsed", time.time() - t_start)
                    if self.verbose >= 1 and MPI.COMM_WORLD.Get_rank() == 0:
                        logger.dump_tabular()

        return self
Example #4
0
    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=100,
              tb_log_name="PPO1",
              reset_num_timesteps=True):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn(seed)

            assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the PPO1 model must be " \
                                                               "an instance of common.policies.ActorCriticPolicy({}).".format(self.policy)

            with self.sess.as_default():
                self.adam.sync()
                trajectory_dic = None
                # Prepare for rollouts
                seg_gen = traj_segment_generator(self.policy_pi, self.env,
                                                 self.timesteps_per_actorbatch)

                episodes_so_far = 0
                timesteps_so_far = 0
                iters_so_far = 0
                t_start = time.time()

                # rolling buffer for episode lengths
                lenbuffer = deque(maxlen=100)
                # rolling buffer for episode rewards
                rewbuffer = deque(maxlen=100)

                self.episode_reward = np.zeros((self.n_envs, ))
                if self.save_trajectory:
                    hidden_list = []
                    obs_list = []
                    act_list = []
                    rwds_list = []
                    dones_list = []

                while True:
                    if callback is not None:
                        # Only stop training if return value is False, not when it is None. This is for backwards
                        # compatibility with callbacks that have no return statement.
                        if callback(locals(), globals()) is False:
                            break
                    if total_timesteps and timesteps_so_far >= total_timesteps:
                        break

                    if self.schedule == 'constant':
                        cur_lrmult = 1.0
                    elif self.schedule == 'linear':
                        cur_lrmult = max(
                            1.0 - float(timesteps_so_far) / total_timesteps, 0)
                    else:
                        raise NotImplementedError

                    # logger.log("********** Iteration %i ************" % iters_so_far)
                    logger.log("********** Iteration %i %i************" %
                               (iters_so_far, self.n_envs))

                    seg = seg_gen.__next__()

                    add_vtarg_and_adv(seg, self.gamma, self.lam)

                    # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
                    obs_ph, hiddens_ph, action_ph, atarg, tdlamret = seg[
                        "ob"], seg["hiddens"], seg["ac"], seg["adv"], seg[
                            "tdlamret"]
                    # print(">>>hiddens_ph:",len(hiddens_ph))
                    if self.save_trajectory:
                        rwds_ph, dones_ph = seg["rew"], seg["dones"]
                        obs_list.append(obs_ph.copy())
                        hidden_list.append(hiddens_ph.copy())
                        act_list.append(action_ph.copy())
                        rwds_list.append(rwds_ph.copy())
                        dones_list.append(dones_ph.copy())

                    # true_rew is the reward without discount
                    if writer is not None:
                        self.episode_reward = total_episode_reward_logger(
                            self.episode_reward, seg["true_rew"].reshape(
                                (self.n_envs, -1)), seg["dones"].reshape(
                                    (self.n_envs, -1)), writer,
                            self.num_timesteps)

                    # predicted value function before udpate
                    vpredbefore = seg["vpred"]

                    # standardized advantage function estimate
                    atarg = (atarg - atarg.mean()) / atarg.std()
                    dataset = Dataset(dict(ob=obs_ph,
                                           ac=action_ph,
                                           atarg=atarg,
                                           vtarg=tdlamret),
                                      shuffle=not self.policy.recurrent)
                    optim_batchsize = self.optim_batchsize or obs_ph.shape[0]

                    # set old parameter values to new parameter values
                    self.assign_old_eq_new(sess=self.sess)
                    logger.log("Optimizing...")
                    logger.log(fmt_row(13, self.loss_names))

                    # Here we do a bunch of optimization epochs over the data
                    for k in range(self.optim_epochs):
                        # list of tuples, each of which gives the loss for a minibatch
                        losses = []
                        for i, batch in enumerate(
                                dataset.iterate_once(optim_batchsize)):
                            steps = (
                                self.num_timesteps + k * optim_batchsize +
                                int(i *
                                    (optim_batchsize / len(dataset.data_map))))
                            if writer is not None:
                                # run loss backprop with summary, but once every 10 runs save the metadata
                                # (memory, compute time, ...)
                                if self.full_tensorboard_log and (1 +
                                                                  k) % 10 == 0:
                                    run_options = tf.RunOptions(
                                        trace_level=tf.RunOptions.FULL_TRACE)
                                    run_metadata = tf.RunMetadata()

                                    summary, grad, *newlosses = self.lossandgrad(
                                        batch["ob"],
                                        batch["ob"],
                                        batch["ac"],
                                        batch["atarg"],
                                        batch["vtarg"],
                                        cur_lrmult,
                                        sess=self.sess,
                                        options=run_options,
                                        run_metadata=run_metadata)
                                    writer.add_run_metadata(
                                        run_metadata, 'step%d' % steps)
                                else:
                                    summary, grad, *newlosses = self.lossandgrad(
                                        batch["ob"],
                                        batch["ob"],
                                        batch["ac"],
                                        batch["atarg"],
                                        batch["vtarg"],
                                        cur_lrmult,
                                        sess=self.sess)
                                writer.add_summary(summary, steps)
                            else:
                                _, grad, *newlosses = self.lossandgrad(
                                    batch["ob"],
                                    batch["ob"],
                                    batch["ac"],
                                    batch["atarg"],
                                    batch["vtarg"],
                                    cur_lrmult,
                                    sess=self.sess)

                            self.adam.update(grad,
                                             self.optim_stepsize * cur_lrmult)
                            losses.append(newlosses)
                        logger.log(fmt_row(13, np.mean(losses, axis=0)))

                    logger.log("Evaluating losses...")
                    losses = []
                    for batch in dataset.iterate_once(optim_batchsize):
                        newlosses = self.compute_losses(batch["ob"],
                                                        batch["ob"],
                                                        batch["ac"],
                                                        batch["atarg"],
                                                        batch["vtarg"],
                                                        cur_lrmult,
                                                        sess=self.sess)
                        losses.append(newlosses)
                    mean_losses, _, _ = mpi_moments(losses, axis=0)
                    logger.log(fmt_row(13, mean_losses))
                    for (loss_val, name) in zipsame(mean_losses,
                                                    self.loss_names):
                        logger.record_tabular("loss_" + name, loss_val)
                    logger.record_tabular(
                        "ev_tdlam_before",
                        explained_variance(vpredbefore, tdlamret))

                    # local values
                    lrlocal = (seg["ep_lens"], seg["ep_rets"])

                    # list of tuples
                    listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)
                    lens, rews = map(flatten_lists, zip(*listoflrpairs))
                    lenbuffer.extend(lens)
                    rewbuffer.extend(rews)
                    if len(lenbuffer) > 0:
                        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
                        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
                    logger.record_tabular("EpThisIter", len(lens))
                    episodes_so_far += len(lens)
                    current_it_timesteps = MPI.COMM_WORLD.allreduce(
                        seg["total_timestep"])
                    timesteps_so_far += current_it_timesteps
                    self.num_timesteps += current_it_timesteps
                    iters_so_far += 1
                    logger.record_tabular("EpisodesSoFar", episodes_so_far)
                    logger.record_tabular("TimestepsSoFar", self.num_timesteps)
                    logger.record_tabular("TimeElapsed", time.time() - t_start)
                    if self.verbose >= 1 and MPI.COMM_WORLD.Get_rank() == 0:
                        logger.dump_tabular()

                if self.save_trajectory:

                    length = np.vstack(obs_list).shape[0]
                    print("Save trajectory...(length:{})".format(length))
                    trajectory_dic = {
                        "all_obvs": np.vstack(obs_list).reshape(length, -1),
                        "all_hiddens":
                        np.vstack(hidden_list).reshape(length, -1),
                        "all_acts": np.vstack(act_list).reshape(length, -1),
                        "all_rwds": np.vstack(rwds_list).reshape(length, -1),
                        "all_dones": np.vstack(dones_list).reshape(length, -1)
                    }

                    # with open('../saved/{}-trajectory.pkl'.format(str(self.__class__).split("'")[-2].split(".")[-1]), 'wb+') as f:
                    #         pkl.dump(trajectory_dic, f, protocol=2)

        return self, trajectory_dic
Example #5
0
    def run(self, num_steps, data_dir, policy_record=None):
        local_steps = int(num_steps / self.comm.Get_size())

        steps = 0

        # TODO: Add alpha annealing over num_steps

        while True:

            # sync weights
            self.b_agent_attack.sync_weights()
            self.b_agent_evade.sync_weights()
            self.b_agent_transit.sync_weights()
            self.m_agent.sync_weights()

            # create placeholders to store experience that we gather
            training_state = {
                "meta": [],
                "attack": [],
                "evade": [],
                "transit": []
            }
            training_action = {
                "meta": [],
                "attack": [],
                "evade": [],
                "transit": []
            }
            training_reward = {
                "meta": [],
                "attack": [],
                "evade": [],
                "transit": []
            }
            training_next_state = {
                "meta": [],
                "attack": [],
                "evade": [],
                "transit": []
            }
            training_done = {
                "meta": [],
                "attack": [],
                "evade": [],
                "transit": []
            }
            training_reward_sum_combined = 0  # keep track of combined reward over all episodes between training

            state = self.env.reset()
            reward_sum = {}
            done = False
            while not done:
                complete_action, distribution, beh_actions, label = self.m_agent.get_action(
                    state)

                next_state, reward, done, info = self.env.step(
                    complete_action, dst=distribution, label=label)

                # Aggregate reward throughout the episode
                if not reward_sum:
                    reward_sum = reward
                else:
                    reward_sum = {
                        k: reward_sum[k] + reward[k]
                        for (k, v) in reward.items()
                    }
                training_reward_sum_combined += reward["combined"]

                training_state["meta"].append(state)
                training_action["meta"].append(distribution)
                training_reward["meta"].append(reward["combined"])
                training_next_state["meta"].append(next_state)
                training_done["meta"].append(done)

                for idx, label in enumerate(['attack', 'evade', 'transit']):
                    training_state[label].append(state)
                    training_action[label].append(beh_actions[idx])
                    training_reward[label].append(reward[label])
                    training_next_state[label].append(next_state)
                    training_done[label].append(done)

                state = next_state

            # #now we have batches of data: compute the values and advantages
            # training_value = {"meta": None, "attack": None, "evade": None, "transit": None}
            # training_advantages = {"meta": None, "attack": None, "evade": None, "transit": None}

            # log tensorboard
            {logger.logkv(k, v) for (k, v) in reward_sum.items()}
            logger.dumpkvs()

            # vcompute advantages and values
            models = [
                self.b_agent_attack, self.b_agent_evade, self.b_agent_transit,
                self.m_agent
            ]
            for model in models:

                network = model.label

                states = training_state[network]
                actions = training_action[network]
                reward = training_reward[network]
                next_states = training_next_state[network]
                done = training_done[network]

                # Convert done bools to ints and invert
                done_int = np.invert(done).astype(np.int)

                # Generalized advantage estimation (gets advantages to train on and value estimates)
                target, advantages = GAE(states,
                                         actions,
                                         reward,
                                         next_states,
                                         done_int,
                                         model.sample_value,
                                         T=128,
                                         y=0.99,
                                         lam=0.95,
                                         use_Q=False)

                # train this model
                dataset = Dataset(dict(ob=np.asarray(states),
                                       ac=np.asarray(actions),
                                       atarg=np.asarray(advantages),
                                       vtarg=np.asarray(target)),
                                  shuffle=True)

                for k in range(4):
                    for i, batch in enumerate(dataset.iterate_once(
                            len(states))):
                        model.train(batch["ob"], batch["ac"], batch["vtarg"],
                                    batch["atarg"], 1.0)

            print('FINISHED TRAINING EPISODE')