Code Example #1
    def __call__(self, *args, **kwargs):
        data_gen = self._data_generator(self.train_data_size, stochastic=True)

        while True:
            batch = next(data_gen)
            batch = self._add_vtarg_and_adv(batch, self.gamma, self.lambda_)
            obs, acs, advs, td_lam_ret = batch['obs'], batch['acs'], batch[
                'advs'], batch['td_lam_ret']
            advs = (advs - advs.mean()
                    ) / advs.std()  # standardized advantage function estimate
            d = Dataset(dict(obs=obs,
                             acs=acs,
                             advs=advs,
                             td_lam_ret=td_lam_ret),
                        shuffle=not self.ppo.pi.recurrent)

            # update obs normalization
            self.ppo.update_ob_norm(obs)

            # update old by new
            self.ppo.update_old_by_new()

            # set current learning rate
            cur_lrmult = 1.0 if not self.lr_decay else max(
                1.0 - float(self.steps) / self.max_steps, 0)

            # learn
            optimize_size = self.optimize_size or obs.shape[0]
            for _ in range(self.optimize_epochs):
                for batch in d.iterate_once(optimize_size):
                    self.ppo.learn(batch["obs"], batch["acs"], batch["advs"],
                                   batch["td_lam_ret"], cur_lrmult,
                                   self.optimize_step_size)
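Every example in this section draws minibatches through a `Dataset` helper with an `iterate_once` method. For reference, here is a minimal sketch of such a helper, modeled on `baselines.common.dataset.Dataset`; the exact class used by these forks may differ (code example #8, for instance, passes a `deterministic` flag instead of `shuffle`).

import numpy as np


class Dataset(object):
    """Minimal sketch of a shuffling minibatch container with iterate_once()."""

    def __init__(self, data_map, shuffle=True):
        self.data_map = data_map  # dict of equally sized numpy arrays
        self.enable_shuffle = shuffle
        self.n = next(iter(data_map.values())).shape[0]
        self._next_id = 0

    def _shuffle(self):
        perm = np.random.permutation(self.n)
        for key in self.data_map:
            self.data_map[key] = self.data_map[key][perm]
        self._next_id = 0

    def _next_batch(self, batch_size):
        cur = self._next_id
        size = min(batch_size, self.n - cur)
        self._next_id += size
        return {k: v[cur:cur + size] for k, v in self.data_map.items()}

    def iterate_once(self, batch_size):
        # One pass over the data in minibatches; the trailing remainder smaller
        # than batch_size is dropped, as in the Baselines implementation.
        if self.enable_shuffle:
            self._shuffle()
        while self._next_id <= self.n - batch_size:
            yield self._next_batch(batch_size)
        self._next_id = 0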
Code Example #2
    def fit_data(self,
                 training_X,
                 training_Y,
                 iter_num=200,
                 batch_size=64,
                 stepsize=0.001,
                 save_model_callback=None):

        dataset = Dataset(dict(X=np.array(training_X), Y=np.array(training_Y)),
                          shuffle=True)
        losses = []
        for iter in range(iter_num):
            loss_epoch = []
            for batch in dataset.iterate_once(batch_size):
                inputs = [batch["X"], True, batch["Y"]]
                loss, g = self.lossandgrad(*inputs)
                self.updater.update(g, stepsize)
                loss_epoch.append(loss)
            losses.append(np.mean(loss_epoch))
            if iter % 5 == 0:
                print('iter: ', iter, 'loss: ', np.mean(loss_epoch))
            if save_model_callback is not None:
                save_model_callback(self.model, self.model.name, iter)

        return losses
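Code example #2 assumes that `self.lossandgrad` returns a scalar loss plus a flat gradient and that `self.updater` applies that gradient with a given step size. A plausible wiring, sketched in the same `tf_util`/`MpiAdam` style used by the later examples; the placeholders and the tiny model below are illustrative assumptions, not the original class.

import tensorflow as tf
from baselines.common import tf_util as U
from baselines.common.mpi_adam import MpiAdam

# Hypothetical placeholders/model standing in for the original class's graph.
X_ph = tf.placeholder(tf.float32, [None, 4], name="X")
is_training_ph = tf.placeholder(tf.bool, [], name="is_training")  # mirrors the `True` flag in fit_data
Y_ph = tf.placeholder(tf.float32, [None, 1], name="Y")
pred = tf.layers.dense(X_ph, 1)
loss = tf.reduce_mean(tf.square(pred - Y_ph))
var_list = tf.trainable_variables()

# lossandgrad(X, flag, Y) -> [loss, flat_gradient], matching how fit_data unpacks it.
lossandgrad = U.function([X_ph, is_training_ph, Y_ph],
                         [loss, U.flatgrad(loss, var_list)])
updater = MpiAdam(var_list, epsilon=1e-5)

U.make_session(num_cpu=1).__enter__()
U.initialize()
updater.sync()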
Code Example #3
    def train(self, seg, optim_batchsize, optim_epochs):
        cur_lrmult = 1.0
        add_vtarg_and_adv(seg, self.gamma, self.lam)
        ob, unnorm_ac, atarg, tdlamret = seg["ob"], seg["unnorm_ac"], seg[
            "adv"], seg["tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate

        d = Dataset(dict(ob=ob, ac=unnorm_ac, atarg=atarg, vtarg=tdlamret),
                    shuffle=not self.pi.recurrent)

        if hasattr(self.pi, "ob_rms"):
            self.pi.update_obs_rms(ob)  # update running mean/std for policy
        self.assign_old_eq_new(
        )  # set old parameter values to new parameter values
        logger.log2("Optimizing...")
        logger.log2(fmt_row(13, self.loss_names))
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = [
            ]  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                lg = self.lossandgrad(batch["ac"], batch["atarg"],
                                      batch["vtarg"], cur_lrmult,
                                      *self.fix_ob2feed(batch["ob"]))
                new_losses, g = lg[:-1], lg[-1]
                self.adam.update(g, self.optim_stepsize * cur_lrmult)
                losses.append(new_losses)
            logger.log2(fmt_row(13, np.mean(losses, axis=0)))

        logger.log2("Evaluating losses...")
        losses = []
        for batch in d.iterate_once(optim_batchsize):
            newlosses = self.compute_losses(batch["ac"], batch["atarg"],
                                            batch["vtarg"], cur_lrmult,
                                            *self.fix_ob2feed(batch["ob"]))
            losses.append(newlosses)
        meanlosses, _, _ = mpi_moments(losses, axis=0)
        logger.log2(fmt_row(13, meanlosses))

        for (lossval, name) in zipsame(meanlosses, self.loss_names):
            logger.record_tabular("loss_" + name, lossval)
        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))
        return meanlosses
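Code examples #3 through #10 all call `add_vtarg_and_adv` (code example #1 uses a method variant that returns the batch) to attach GAE(λ) advantages and TD(λ) value targets to a rollout segment. For reference, a sketch of that helper as it appears in OpenAI Baselines' pposgd_simple; the forks shown here may differ in detail.

import numpy as np


def add_vtarg_and_adv(seg, gamma, lam):
    """Add seg["adv"] (GAE(lambda) advantages) and seg["tdlamret"] (TD(lambda) returns) in place."""
    new = np.append(seg["new"], 0)  # extra 0 so new[t + 1] can be indexed at t = T - 1
    vpred = np.append(seg["vpred"], seg["nextvpred"])  # bootstrap value for the unfinished episode
    T = len(seg["rew"])
    seg["adv"] = gaelam = np.empty(T, 'float32')
    rew = seg["rew"]
    lastgaelam = 0
    for t in reversed(range(T)):
        nonterminal = 1 - new[t + 1]
        delta = rew[t] + gamma * vpred[t + 1] * nonterminal - vpred[t]
        gaelam[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
    seg["tdlamret"] = seg["adv"] + seg["vpred"]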
Code Example #4
def update_policy(pi, seg, gamma, lam, logger, optim_epochs, optim_batchsize,
                  optim_stepsize, cur_lrmult, loss_names, lossandgrad, adam,
                  assign_old_eq_new, compute_losses, mpi_moments_fn):

    add_vtarg_and_adv(seg, gamma, lam)

    # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
    ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
    vpredbefore = seg["vpred"]  # predicted value function before update
    atarg = (atarg - atarg.mean()
             ) / atarg.std()  # standardized advantage function estimate
    d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                shuffle=not pi.recurrent)
    optim_batchsize = optim_batchsize or ob.shape[0]

    if hasattr(pi, "ob_rms"):
        pi.ob_rms.update(ob)  # update running mean/std for policy

    assign_old_eq_new()  # set old parameter values to new parameter values
    logger.log("Optimizing...")
    logger.log(fmt_row(13, loss_names))
    # Here we do a bunch of optimization epochs over the data
    for _ in range(optim_epochs):
        losses = [
        ]  # list of tuples, each of which gives the loss for a minibatch
        for batch in d.iterate_once(optim_batchsize):
            # *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult)
            newlosses, g = lossandgrad(batch["ob"], batch["ac"],
                                       batch["atarg"], batch["vtarg"],
                                       cur_lrmult)
            adam.update(g, optim_stepsize * cur_lrmult)
            losses.append(newlosses)
        logger.log(fmt_row(13, np.mean(losses, axis=0)))

    logger.log("Evaluating losses...")
    losses = []
    for batch in d.iterate_once(optim_batchsize):
        newlosses = compute_losses(batch["ob"], batch["ac"], batch["atarg"],
                                   batch["vtarg"], cur_lrmult)
        losses.append(newlosses)
    meanlosses, _, _ = mpi_moments_fn(losses)
    logger.log(fmt_row(13, meanlosses))
    for (lossval, name) in zipsame(meanlosses, loss_names):
        logger.record_tabular("loss_" + name, lossval)
    return vpredbefore, tdlamret, optim_batchsize
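Several of the examples log through small utilities such as `explained_variance`, `zipsame` and `flatten_lists`. Minimal sketches of those helpers, following their Baselines counterparts; the versions in these forks may differ slightly.

import numpy as np


def explained_variance(ypred, y):
    """1 - Var[y - ypred] / Var[y]; 1 means a perfect value-function fit, 0 means no better than a constant."""
    vary = np.var(y)
    return np.nan if vary == 0 else 1 - np.var(y - ypred) / vary


def zipsame(*seqs):
    """zip() that insists all sequences have the same length."""
    length = len(seqs[0])
    assert all(len(seq) == length for seq in seqs[1:])
    return zip(*seqs)


def flatten_lists(listoflists):
    """Concatenate a list of lists, e.g. per-worker episode lengths after an MPI allgather."""
    return [el for sub in listoflists for el in sub]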
Code Example #5
def learn(
    env,
    policy_fn,
    *,
    timesteps_per_actorbatch,  # timesteps per actor per update
    clip_param,
    entcoeff,  # clipping parameter epsilon, entropy coeff
    optim_epochs,
    optim_stepsize,
    optim_batchsize,  # optimization hypers
    gamma,
    lam,  # advantage estimation
    max_timesteps=0,
    max_episodes=0,
    max_iters=0,
    max_seconds=0,  # time constraint
    callback=None,  # you can do anything in the callback, since it takes locals(), globals()
    adam_epsilon=1e-5,
    schedule='constant'  # annealing for stepsize parameters (epsilon and adam)
):
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_fn("pi", ob_space,
                   ac_space)  # Construct network for new policy
    oldpi = policy_fn("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    td_v_target = tf.placeholder(dtype=tf.float32,
                                 shape=[1, 1])  # V target for RAC

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    # adv = tf.placeholder(dtype = tf.float32, shape = [1, 1]) # Advantage function for RAC

    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param,
                             1.0 + clip_param) * atarg  #
    pol_surr = -tf.reduce_mean(tf.minimum(
        surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    vf_rac_loss = tf.reduce_mean(tf.square(pi.vpred - td_v_target))
    vf_rac_losses = [vf_rac_loss]
    vf_rac_loss_names = ["vf_rac_loss"]

    pol_rac_loss_surr1 = atarg * pi.pd.neglogp(ac) * ratio
    pol_rac_loss_surr2 = tf.clip_by_value(
        ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg * pi.pd.neglogp(
            ac)  #
    pol_rac_loss = tf.reduce_mean(
        tf.minimum(pol_rac_loss_surr1, pol_rac_loss_surr2))
    pol_rac_losses = [pol_rac_loss]
    pol_rac_loss_names = ["pol_rac_loss"]

    var_list = pi.get_trainable_variables()

    vf_final_var_list = [
        v for v in var_list if v.name.split("/")[1].startswith("vf")
        and v.name.split("/")[2].startswith("final")
    ]
    pol_final_var_list = [
        v for v in var_list if v.name.split("/")[1].startswith("pol")
        and v.name.split("/")[2].startswith("final")
    ]

    # Train V function
    vf_lossandgrad = U.function([ob, td_v_target, lrmult], vf_rac_losses +
                                [U.flatgrad(vf_rac_loss, vf_final_var_list)])
    vf_adam = MpiAdam(vf_final_var_list, epsilon=adam_epsilon)

    # Train Policy
    pol_lossandgrad = U.function(
        [ob, ac, atarg, lrmult],
        pol_rac_losses + [U.flatgrad(pol_rac_loss, pol_final_var_list)])
    pol_adam = MpiAdam(pol_final_var_list, epsilon=adam_epsilon)

    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    compute_v_pred = U.function([ob], [pi.vpred])

    U.initialize()
    adam.sync()
    pol_adam.sync()
    vf_adam.sync()

    global timesteps_so_far, episodes_so_far, iters_so_far, \
        tstart, lenbuffer, rewbuffer, best_fitness
    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0,
         max_seconds > 0]) == 1, "Only one time constraint permitted"

    seg = None
    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break
        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / (max_timesteps),
                             0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)

        t = 0
        ac = env.action_space.sample(
        )  # not used, just so we have the datatype
        new = True  # marks if we're on first timestep of an episode
        ob = env.reset()

        cur_ep_ret = 0  # return in current episode
        cur_ep_len = 0  # len of current episode
        ep_rets = []  # returns of completed episodes in this segment
        ep_lens = []  # lengths of ...
        horizon = timesteps_per_actorbatch

        # Initialize history arrays
        obs = np.array([ob for _ in range(horizon)])
        rews = np.zeros(horizon, 'float32')
        vpreds = np.zeros(horizon, 'float32')
        news = np.zeros(horizon, 'int32')
        acs = np.array([ac for _ in range(horizon)])
        prevacs = acs.copy()

        rac_alpha = optim_stepsize * cur_lrmult * 0.1
        rac_beta = optim_stepsize * cur_lrmult * 0.01
        assign_old_eq_new()  # set old parameter values to new parameter values
        while True:
            if timesteps_so_far % 10000 == 0 and timesteps_so_far > 0:
                result_record()
            prevac = ac
            ac, vpred = pi.act(stochastic=True, ob=ob)
            # Slight weirdness here because we need value function at time T
            # before returning segment [0, T-1] so we get the correct
            # terminal value
            if t > 0 and t % horizon == 0:
                seg = {
                    "ob": obs,
                    "rew": rews,
                    "vpred": vpreds,
                    "new": news,
                    "ac": acs,
                    "prevac": prevacs,
                    "nextvpred": vpred * (1 - new),
                    "ep_rets": ep_rets,
                    "ep_lens": ep_lens
                }
                ep_rets = []
                ep_lens = []
                break
            i = t % horizon
            obs[i] = ob
            vpreds[i] = vpred
            news[i] = new
            acs[i] = ac
            prevacs[i] = prevac
            if env.spec._env_name == "LunarLanderContinuous":
                ac = np.clip(ac, -1.0, 1.0)
            next_ob, rew, new, _ = env.step(ac)

            # Compute v target and TD
            v_target = rew + gamma * np.array(
                compute_v_pred(next_ob.reshape((1, ob.shape[0]))))
            adv = v_target - np.array(
                compute_v_pred(ob.reshape((1, ob.shape[0]))))

            # Update V and Update Policy
            vf_loss, vf_g = vf_lossandgrad(ob.reshape((1, ob.shape[0])),
                                           v_target, rac_alpha)
            vf_adam.update(vf_g, rac_alpha)
            pol_loss, pol_g = pol_lossandgrad(ob.reshape((1, ob.shape[0])),
                                              ac.reshape((1, ac.shape[0])),
                                              adv.reshape(adv.shape[0], ),
                                              rac_beta)
            pol_adam.update(pol_g, rac_beta)

            rews[i] = rew

            cur_ep_ret += rew
            cur_ep_len += 1
            timesteps_so_far += 1
            ob = next_ob
            if new:
                # print(
                #     "Episode {} - Total reward = {}, Total Steps = {}".format(episodes_so_far, cur_ep_ret, cur_ep_len))
                ep_rets.append(cur_ep_ret)
                ep_lens.append(cur_ep_len)
                rewbuffer.extend(ep_rets)
                lenbuffer.extend(ep_lens)
                cur_ep_ret = 0
                cur_ep_len = 0
                ob = env.reset()
                episodes_so_far += 1
            t += 1

        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
            "tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate
        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                    shuffle=not pi.recurrent)
        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        assign_old_eq_new()  # set old parameter values to new parameter values
        # logger.log("Optimizing...")
        # logger.log(fmt_row(13, loss_names))
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = [
            ]  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                *newlosses, g = lossandgrad(batch["ob"], batch["ac"],
                                            batch["atarg"], batch["vtarg"],
                                            cur_lrmult)
                adam.update(g, optim_stepsize * cur_lrmult)
                losses.append(newlosses)
            # logger.log(fmt_row(13, np.mean(losses, axis=0)))
        # logger.log("Current Iteration Training Performance:" + str(np.mean(seg["ep_rets"])))
        if iters_so_far == 0:
            result_record()
        iters_so_far += 1
Code Example #6
    def learn(self):
        # Prepare for rollouts
        # ----------------------------------------
        seg_gen = traj_segment_generator(self.pi,
                                         self.env,
                                         self.timesteps_per_actorbatch,
                                         stochastic=True)
        episodes_so_far = 0
        timesteps_so_far = 0
        iters_so_far = 0
        tstart = time.time()
        lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
        rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards
        assert sum([
            self.max_iters > 0, self.max_timesteps > 0, self.max_episodes > 0,
            self.max_seconds > 0
        ]) == 1, "Only one time constraint permitted"
        while True:
            if (timesteps_so_far >= self.max_timesteps) and self.max_timesteps:
                break
            elif (episodes_so_far >= self.max_episodes) and self.max_episodes:
                break
            elif (iters_so_far >= self.max_iters) and self.max_iters:
                break
            elif self.max_seconds and (time.time() - tstart >=
                                       self.max_seconds):
                break

            if self.schedule == 'constant':
                cur_lrmult = 1.0
            elif self.schedule == 'linear':
                cur_lrmult = max(
                    1.0 - float(timesteps_so_far) / self.max_timesteps, 0)
            else:
                raise NotImplementedError

            logger.log("********** Iteration %i ************" % iters_so_far)

            seg = seg_gen.__next__()
            add_vtarg_and_adv(seg, self.gamma, self.lam)

            # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
            self.ob, self.ac, self.atarg, tdlamret = seg["ob"], seg["ac"], seg[
                "adv"], seg["tdlamret"]

            vpredbefore = seg[
                "vpred"]  # predicted value function before udpate
            self.atarg = (self.atarg - self.atarg.mean()) / self.atarg.std(
            )  # standardized advantage function estimate
            d = Dataset(dict(ob=self.ob,
                             ac=self.ac,
                             atarg=self.atarg,
                             vtarg=tdlamret),
                        shuffle=not self.pi.recurrent)
            self.optim_batchsize = self.optim_batchsize or self.ob.shape[0]

            if hasattr(self.pi, "ob_rms"):
                self.pi.ob_rms.update(
                    self.ob)  # update running mean/std for policy

            self.assign_old_eq_new(
            )  # set old parameter values to new parameter values
            logger.log("Optimizing...")
            logger.log(fmt_row(13, self.loss_names))
            # Here we do a bunch of optimization epochs over the data
            for _ in range(self.optim_epochs):
                losses = [
                ]  # list of tuples, each of which gives the loss for a minibatch
                for batch in d.iterate_once(self.optim_batchsize):
                    *newlosses, g = self.lossandgrad(batch["ob"], batch["ac"],
                                                     batch["atarg"],
                                                     batch["vtarg"],
                                                     cur_lrmult)
                    self.adam.update(g, self.optim_stepsize * cur_lrmult)
                    losses.append(newlosses)
                logger.log(fmt_row(13, np.mean(losses, axis=0)))

            logger.log("Evaluating losses...")
            losses = []
            for batch in d.iterate_once(self.optim_batchsize):
                newlosses = self.compute_losses(batch["ob"], batch["ac"],
                                                batch["atarg"], batch["vtarg"],
                                                cur_lrmult)
                losses.append(newlosses)
            meanlosses, _, _ = mpi_moments(losses, axis=0)
            logger.log(fmt_row(13, meanlosses))
            for (lossval, name) in zipsame(meanlosses, self.loss_names):
                logger.record_tabular("loss_" + name, lossval)
            logger.record_tabular("ev_tdlam_before",
                                  explained_variance(vpredbefore, tdlamret))
            lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
            listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
            lens, rews = map(flatten_lists, zip(*listoflrpairs))
            lenbuffer.extend(lens)
            rewbuffer.extend(rews)
            logger.record_tabular("EpLenMean", np.mean(lenbuffer))
            logger.record_tabular("EpRewMean", np.mean(rewbuffer))
            logger.record_tabular("EpThisIter", len(lens))
            episodes_so_far += len(lens)
            timesteps_so_far += sum(lens)
            iters_so_far += 1
            logger.record_tabular("EpisodesSoFar", episodes_so_far)
            logger.record_tabular("TimestepsSoFar", timesteps_so_far)
            logger.record_tabular("TimeElapsed", time.time() - tstart)
            if MPI.COMM_WORLD.Get_rank() == 0:
                logger.dump_tabular()
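Code examples #6, #7, #8 and #10 pull rollouts from `traj_segment_generator`, and example #5 inlines essentially the same logic. For reference, a sketch of the generator in its Baselines form (without the extra `next_ob` field or the per-step RAC updates that some of the forks add).

import numpy as np


def traj_segment_generator(pi, env, horizon, stochastic):
    t = 0
    ac = env.action_space.sample()  # not used, just to get the dtype/shape
    new = True                      # marks the first timestep of an episode
    ob = env.reset()

    cur_ep_ret, cur_ep_len = 0, 0   # return / length of the current episode
    ep_rets, ep_lens = [], []       # stats of episodes completed in this segment

    # History arrays, overwritten in place every `horizon` steps
    obs = np.array([ob for _ in range(horizon)])
    rews = np.zeros(horizon, 'float32')
    vpreds = np.zeros(horizon, 'float32')
    news = np.zeros(horizon, 'int32')
    acs = np.array([ac for _ in range(horizon)])
    prevacs = acs.copy()

    while True:
        prevac = ac
        ac, vpred = pi.act(stochastic, ob)
        # Yield after collecting `horizon` steps; the value of the *next* state is
        # kept as "nextvpred" so GAE can bootstrap an unfinished episode.
        if t > 0 and t % horizon == 0:
            yield {"ob": obs, "rew": rews, "vpred": vpreds, "new": news,
                   "ac": acs, "prevac": prevacs, "nextvpred": vpred * (1 - new),
                   "ep_rets": ep_rets, "ep_lens": ep_lens}
            ep_rets, ep_lens = [], []
        i = t % horizon
        obs[i], vpreds[i], news[i], acs[i], prevacs[i] = ob, vpred, new, ac, prevac

        ob, rew, new, _ = env.step(ac)
        rews[i] = rew
        cur_ep_ret += rew
        cur_ep_len += 1
        if new:
            ep_rets.append(cur_ep_ret)
            ep_lens.append(cur_ep_len)
            cur_ep_ret, cur_ep_len = 0, 0
            ob = env.reset()
        t += 1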
Code Example #7
def learn(
        env,
        policy_fn,
        *,
        timesteps_per_actorbatch,  # timesteps per actor per update
        clip_param,
        entcoeff,  # clipping parameter epsilon, entropy coeff
        optim_epochs,
        optim_stepsize,
        optim_batchsize,  # optimization hypers
        gamma,
        lam,  # advantage estimation
        # CMAES
        max_fitness,  # has to be negative, as CMA-ES considers minimization
        popsize,
        gensize,
        bounds,
        sigma,
        eval_iters,
        max_timesteps=0,
        max_episodes=0,
        max_iters=0,
        max_seconds=0,
        # time constraint
        callback=None,
        # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant',
        # annealing for stepsize parameters (epsilon and adam)
        seed,
        env_id):
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_fn("pi", ob_space,
                   ac_space)  # Construct network for new policy
    oldpi = policy_fn("oldpi", ob_space, ac_space)  # Network for old policy
    backup_pi = policy_fn("backup_pi", ob_space,
                          ac_space)  # Network for cmaes individual to train

    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    reward = tf.placeholder(dtype=tf.float32, shape=[None])  # step rewards
    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule

    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    next_ob = U.get_placeholder_cached(
        name="next_ob")  # next step observation for updating q function
    ac = U.get_placeholder_cached(
        name="act")  # action placeholder for computing q function

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param,
                             1.0 + clip_param) * atarg  #
    pol_surr = -tf.reduce_mean(tf.minimum(
        surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)

    y = reward + gamma * tf.squeeze(pi.vpred)
    qf_loss = tf.reduce_mean(tf.square(y - pi.qpred))
    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen  # v function is independently trained
    qf_losses = [qf_loss]
    vf_losses = [vf_loss]
    losses = [pol_surr, pol_entpen, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    # print(var_list)
    if isinstance(pi, CnnPolicy):
        lin_var_list = [
            v for v in var_list if v.name.split("/")[1].startswith("lin")
        ]
        vf_var_list = [
            v for v in var_list if v.name.split("/")[1].startswith("logits")
        ]
        pol_var_list = [
            v for v in var_list if v.name.split("/")[1].startswith("value")
        ]
        # Policy + Value function, the final layer, all trainable variables
        # Remove vf variables
        var_list = lin_var_list + pol_var_list
    else:
        fc2_var_list = [
            v for v in var_list if v.name.split("/")[2].startswith("fc2")
        ]
        final_var_list = [
            v for v in var_list if v.name.split("/")[2].startswith("final")
        ]
        # var_list = vf_var_list + pol_var_list
        var_list = fc2_var_list + final_var_list
        print(var_list)
    # print(var_list)
    qf_var_list = [
        v for v in var_list if v.name.split("/")[1].startswith("qf")
    ]
    vf_var_list = [
        v for v in var_list if v.name.split("/")[1].startswith("vf")
    ]
    qf_lossandgrad = U.function([ob, ac, next_ob, lrmult, reward],
                                qf_losses + [U.flatgrad(qf_loss, qf_var_list)])
    vf_lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                                vf_losses + [U.flatgrad(vf_loss, vf_var_list)])
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])

    qf_adam = MpiAdam(qf_var_list, epsilon=adam_epsilon)

    vf_adam = MpiAdam(vf_var_list, epsilon=adam_epsilon)

    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])

    # Assign pi to backup (only backup trainable variables)
    assign_backup_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(backup_v, newv)
            for (backup_v,
                 newv) in zipsame(backup_pi.get_trainable_variables(),
                                  pi.get_trainable_variables())
        ])

    # Assign backup back to pi
    assign_new_eq_backup = U.function(
        [], [],
        updates=[
            tf.assign(newv, backup_v)
            for (newv,
                 backup_v) in zipsame(pi.get_trainable_variables(),
                                      backup_pi.get_trainable_variables())
        ])

    # Compute all losses
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    # compute the Advantage estimations: A = Q - V for pi
    get_A_estimation = U.function([ob, next_ob, ac], [pi.qpred - pi.vpred])
    # compute the Advantage estimations: A = Q - V for evalpi

    # compute the mean action for given states under pi
    mean_actions = U.function([ob], [pi.pd.mode()])
    # compute the mean action for given states under evalpi

    U.initialize()
    adam.sync()

    global timesteps_so_far, episodes_so_far, iters_so_far, \
        tstart, lenbuffer, rewbuffer, ppo_timesteps_so_far, best_fitness
    episodes_so_far = 0
    timesteps_so_far = 0
    ppo_timesteps_so_far = 0
    # cmaes_timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards
    # Prepare for rollouts
    # ----------------------------------------
    # assign pi to eval_pi
    actors = []
    best_fitness = 0

    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_actorbatch,
                                     stochastic=True)

    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0,
         max_seconds > 0]) == 1, "Only one time constraint permitted"

    i = 0
    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)

        # PPO Train V and Q
        seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, next_ob, ac, atarg, tdlamret, reward = seg["ob"], seg[
            "next_ob"], seg["ac"], seg["adv"], seg["tdlamret"], seg["rew"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate
        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                    shuffle=not pi.recurrent)
        optim_batchsize = optim_batchsize or ob.shape[0]

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        # Re-train V function
        for _ in range(optim_epochs):
            losses = [
            ]  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                *vf_losses, g = vf_lossandgrad(batch["ob"], batch["ac"],
                                               batch["atarg"], batch["vtarg"],
                                               cur_lrmult)
                vf_adam.update(g, optim_stepsize * cur_lrmult)

        # Randomly select transitions to train Q
        random_idx = []
        len_repo = len(seg["ob"])
        optim_epochs_q = int(len_repo / optim_batchsize)
        for _ in range(optim_epochs_q):
            random_idx.append(
                np.random.choice(range(len_repo), optim_batchsize))

        # Re-train q function
        for _ in range(optim_epochs_q):
            losses = [
            ]  # list of tuples, each of which gives the loss for a minibatch
            for idx in random_idx:
                *qf_losses, g = qf_lossandgrad(seg["next_ob"][idx],
                                               seg["ac"][idx], seg["ob"][idx],
                                               cur_lrmult, seg["rew"][idx])
                qf_adam.update(g, optim_stepsize * cur_lrmult)

        # CMAES
        weights = pi.get_trainable_variables()
        if i >= len(weights):
            i = 0
        while i < len(weights):
            # Consider both q-function and v-function
            if weights[i].name.split("/")[1] == "vf" or weights[i].name.split(
                    "/")[1] == "qf":
                i += 1
                continue
            print("Layer: ", i, '+', i + 1)
            print("Layer-Name", weights[i].name)
            if i + 1 < len(weights):
                layer_params = [weights[i], weights[i + 1]]
            else:
                layer_params = [weights[i]]
                if len(layer_params) <= 1:
                    layer_params = [weights[i - 1], weights[i]]
            layer_params_flat = pi.get_Layer_Flat_variables(layer_params)()
            index, init_uniform_layer_weights = uniform_select(
                layer_params_flat, 500)
            opt = cma.CMAOptions()
            opt['tolfun'] = max_fitness
            opt['popsize'] = popsize
            opt['maxiter'] = gensize
            opt['verb_disp'] = 0
            opt['verb_log'] = 0
            opt['seed'] = seed
            opt['AdaptSigma'] = True
            # opt['bounds'] = bounds
            sigma1 = sigma - 0.001 * iters_so_far
            if sigma1 < 0.0001:
                sigma1 = 0.0001
            print("Sigma=", sigma1)
            es = cma.CMAEvolutionStrategy(init_uniform_layer_weights, sigma1,
                                          opt)
            best_solution = np.copy(
                init_uniform_layer_weights.astype(np.float64))
            costs = None
            while True:
                if es.countiter >= opt['maxiter']:
                    break
                solutions = es.ask()
                segs = []
                ob_segs = None
                costs = []
                lens = []
                # Evaluation
                assign_backup_eq_new(
                )  # backup current policy, after Q and V have been trained

                a_func = get_A_estimation(
                    ob, ob,
                    np.array(mean_actions(ob)).transpose().reshape(
                        (len(ob), 1)))
                # a_func = (a_func - np.mean(a_func)) / np.std(a_func)
                print("A-pi0:", np.mean(a_func))
                print()
                for id, solution in enumerate(solutions):
                    new_variable = set_uniform_weights(layer_params_flat,
                                                       solution, index)
                    pi.set_Layer_Flat_variables(layer_params, new_variable)
                    new_a_func = get_A_estimation(
                        ob, ob,
                        np.array(mean_actions(ob)).transpose().reshape(
                            (len(ob), 1)))
                    # new_a_func = (new_a_func - np.mean(new_a_func)) / np.std(new_a_func)
                    print("A-pi" + str(id + 1), ":", np.mean(new_a_func))
                    costs.append(-np.mean(new_a_func))
                    assign_new_eq_backup()  # Restore the backup
                # l2_decay = compute_weight_decay(0.999, solutions).reshape((np.array(costs).shape))
                # costs += l2_decay
                # costs, real_costs = fitness_normalization(costs)
                print(costs)
                costs, real_costs = fitness_rank(costs)
                # es.tell(solutions=solutions, function_values = costs)
                es.tell_real_seg(solutions=solutions,
                                 function_values=costs,
                                 real_f=costs,
                                 segs=None)
                # if -es.result[1] >= best_fitness:
                print("Update Policy by CMAES")
                best_solution = np.copy(es.result[0])
                best_fitness = -es.result[1]
                best_layer_params_flat = set_uniform_weights(
                    layer_params_flat, best_solution, index)
                pi.set_Layer_Flat_variables(layer_params,
                                            best_layer_params_flat)
                print("Generation:", es.countiter)
                print("Best Solution Fitness:", best_fitness)
                # set old parameter values to new parameter values
            i += 2
            # break
        assign_old_eq_new()
        # Reestimate Advantage function based on the newly updated Pi
        # seg = seg_gen.__next__()
        # add_vtarg_and_adv(seg, gamma, lam)
        #
        # # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        # ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
        #     "tdlamret"]
        # vpredbefore = seg["vpred"]  # predicted value function before udpate
        # atarg = (
        #                 atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate
        # d = Dataset(dict(ob = ob, ac = ac, atarg = atarg, vtarg = tdlamret),
        #                     shuffle = not pi.recurrent)
        # optim_batchsize = optim_batchsize or ob.shape[0]

        # PPO training
        # assign_old_eq_new()  # set old parameter values to new parameter values
        # logger.log("Optimizing...")
        # logger.log(fmt_row(13, loss_names))
        # # Optimize the value function to keep it up.
        # for _ in range(optim_epochs):
        #     losses = []  # list of tuples, each of which gives the loss for a minibatch
        #     for batch in d.iterate_once(optim_batchsize):
        #         *vf_losses, g = vf_lossandgrad(batch["ob"], batch["ac"],
        #                                        batch["atarg"], batch["vtarg"],
        #                                        cur_lrmult)
        #         vf_adam.update(g, optim_stepsize * cur_lrmult)
        # logger.log(fmt_row(13, np.mean(vf_losses, axis=0)))

        # Here we do a bunch of optimization epochs over the data
        # for _ in range(optim_epochs):
        #     losses = []  # list of tuples, each of which gives the loss for a minibatch
        #     for batch in d.iterate_once(optim_batchsize):
        #         *newlosses, g = lossandgrad(batch["ob"], batch["ac"],
        #                                     batch["atarg"], batch["vtarg"],
        #                                     cur_lrmult)
        #         adam.update(g, optim_stepsize * cur_lrmult)
        #         losses.append(newlosses)
        #     logger.log(fmt_row(13, np.mean(losses, axis = 0)))

        # logger.log("Evaluating losses...")
        # losses = []
        # for batch in d.iterate_once(optim_batchsize):
        #     newlosses = compute_losses(batch["ob"], batch["ac"], batch["atarg"],
        #                                batch["vtarg"], cur_lrmult)
        #     losses.append(newlosses)
        # meanlosses, _, _ = mpi_moments(losses, axis = 0)
        iters_so_far += 1
        episodes_so_far += sum(lens)
Code Example #8
File: pposgd_simple.py Project: kkonen/baselines
def learn(
    env,
    policy_fn,
    *,
    timesteps_per_actorbatch,  # timesteps per actor per update
    clip_param,
    entcoeff,  # clipping parameter epsilon, entropy coeff
    optim_epochs,
    optim_stepsize,
    optim_batchsize,  # optimization hypers
    gamma,
    lam,  # advantage estimation
    max_timesteps=0,
    max_episodes=0,
    max_iters=0,
    max_seconds=0,  # time constraint
    callback=None,  # you can do anything in the callback, since it takes locals(), globals()
    adam_epsilon=1e-5,
    schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
    restore_model_from_file=None,
    save_model_with_prefix,  # naming prefix of the saved model file. Usually it indicates the target goal,
    # for example 3dof_ppo1_H.
    # That way we can select which networks to execute on the real robot without sending every file or folder.
    # The naming of the model file should be self-explanatory.
    job_id=None,  # identifies the Spearmint iteration number; usually set by the Spearmint iterator
    outdir="/tmp/rosrl/experiments/continuous/ppo1/"):
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_fn("pi", ob_space,
                   ac_space)  # Construct network for new policy
    oldpi = policy_fn("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param,
                             1.0 + clip_param) * atarg  #
    pol_surr = -tf.reduce_mean(tf.minimum(
        surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()
    adam.sync()
    """
    Here we add a possibility to resume from a previously saved model if a model file is provided
    """
    if restore_model_from_file:
        # saver = tf.train.Saver(tf.all_variables())
        saver = tf.train.import_meta_graph(restore_model_from_file)
        saver.restore(
            tf.get_default_session(),
            tf.train.latest_checkpoint('./'))  #restore_model_from_file)
        logger.log("Loaded model from {}".format(restore_model_from_file))

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_actorbatch,
                                     stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0,
         max_seconds > 0]) == 1, "Only one time constraint permitted"

    if save_model_with_prefix:
        if job_id is not None:
            basePath = '/tmp/rosrl/' + str(
                env.__class__.__name__) + '/ppo1/' + job_id
        else:
            basePath = '/tmp/rosrl/' + str(env.__class__.__name__) + '/ppo1/'

    # Create the writer for TensorBoard logs
    summary_writer = tf.summary.FileWriter(outdir,
                                           graph=tf.get_default_graph())

    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)

        seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
            "tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate
        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                    deterministic=pi.recurrent)
        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        assign_old_eq_new()  # set old parameter values to new parameter values
        logger.log("Optimizing...")
        logger.log(fmt_row(13, loss_names))
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = [
            ]  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                *newlosses, g = lossandgrad(batch["ob"], batch["ac"],
                                            batch["atarg"], batch["vtarg"],
                                            cur_lrmult)
                adam.update(g, optim_stepsize * cur_lrmult)
                losses.append(newlosses)
            logger.log(fmt_row(13, np.mean(losses, axis=0)))

        logger.log("Evaluating losses...")
        losses = []
        for batch in d.iterate_once(optim_batchsize):
            newlosses = compute_losses(batch["ob"], batch["ac"],
                                       batch["atarg"], batch["vtarg"],
                                       cur_lrmult)
            losses.append(newlosses)
        meanlosses, _, _ = mpi_moments(losses, axis=0)
        logger.log(fmt_row(13, meanlosses))
        for (lossval, name) in zipsame(meanlosses, loss_names):
            logger.record_tabular("loss_" + name, lossval)
        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))
        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpRewSEM", np.std(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        """
        Save the model at every iteration
        """

        if save_model_with_prefix:
            # if np.mean(rewbuffer) > 10.0:
            if iters_so_far % 10 == 0 or np.mean(rewbuffer) > 10.0:
                basePath = outdir + "/models/"

                if not os.path.exists(basePath):
                    os.makedirs(basePath)
                modelF = basePath + save_model_with_prefix + "_afterIter_" + str(
                    iters_so_far) + ".model"
                U.save_state(modelF)
                logger.log("Saved model to file :{}".format(modelF))

        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()

        summary = tf.Summary(value=[
            tf.Summary.Value(tag="EpRewMean", simple_value=np.mean(rewbuffer))
        ])
        summary_writer.add_summary(summary, timesteps_so_far)
    return pi
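For context, a sketch of how a `learn()` entry point like the one in code example #8 is typically driven, modeled on the Baselines run scripts; the environment id, MlpPolicy sizes and hyperparameters below are illustrative assumptions, not values taken from the original project.

import gym
from baselines.common import tf_util as U
from baselines.ppo1 import mlp_policy


def policy_fn(name, ob_space, ac_space):
    # Two hidden layers of 64 units; purely illustrative settings.
    return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                hid_size=64, num_hid_layers=2)


def main():
    U.make_session(num_cpu=1).__enter__()
    env = gym.make("Pendulum-v0")
    pi = learn(env, policy_fn,
               timesteps_per_actorbatch=2048,
               clip_param=0.2, entcoeff=0.0,
               optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
               gamma=0.99, lam=0.95,
               max_timesteps=int(1e6),
               schedule='linear',
               save_model_with_prefix="pendulum_ppo1")
    env.close()
    return pi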
Code Example #9
    def train(self, seg):
        # if callback: callback(locals(), globals())
        if self.schedule == 'constant':
            cur_lrmult = 1.0
        elif self.schedule == 'linear':
            cur_lrmult = max(
                1.0 - float(self.iters_so_far) / self.max_iters_ppo, 1e-3)
        else:
            raise NotImplementedError

        self.iters_so_far += 1
        logger.log("********** Iteration %i ************" % self.iters_so_far)

        add_vtarg_and_adv(seg, self.gamma, self.lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
            "tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate
        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                    shuffle=True)
        optim_batchsize = min(self.optim_batchsize, ob.shape[0])

        if hasattr(self.pi, "ob_rms"):
            self.pi.ob_rms.update(ob)  # update running mean/std for policy

        self.assign_old_eq_new(
        )  # set old parameter values to new parameter values
        # logger.log("Optimizing...")
        # logger.log(fmt_row(13, self.loss_names))
        # Here we do a bunch of optimization epochs over the data
        for _ in range(self.optim_epochs):
            losses = [
            ]  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                *newlosses, g = self.lossandgrad(batch["ob"], batch["ac"],
                                                 batch["atarg"],
                                                 batch["vtarg"], cur_lrmult)
                self.adam.update(g, self.optim_stepsize * cur_lrmult)
                losses.append(newlosses)
            # logger.log(fmt_row(13, np.mean(losses, axis=0)))

        # logger.log("Evaluating losses...")
        losses = []
        for batch in d.iterate_once(optim_batchsize):
            newlosses = self.compute_losses(batch["ob"], batch["ac"],
                                            batch["atarg"], batch["vtarg"],
                                            cur_lrmult)
            losses.append(newlosses)
        meanlosses, _, _ = mpi_moments(losses, axis=0)

        n_data = seg["ob"].shape[0]
        self.timesteps_so_far += n_data
        for (lossval, name) in zipsame(meanlosses, self.loss_names):
            logger.log("loss_" + name + ": %f" % lossval)
        logger.log("ev_tdlam_before: %f" %
                   explained_variance(vpredbefore, tdlamret))
        logger.log("EpisodesThisIter: %d" % len(seg["rewAccumulated"]))
        logger.log("TimestepsThisIter: %d" % n_data)
        logger.log("TimestepsSoFar: %d" % self.timesteps_so_far)
        logger.log("LearningRate: %f" % (self.optim_stepsize * cur_lrmult))
Code Example #10
def learn(
    env,
    policy_fn,
    *,
    timesteps_per_actorbatch,  # timesteps per actor per update
    clip_param,
    entcoeff,  # clipping parameter epsilon, entropy coeff
    optim_epochs,
    optim_stepsize,
    optim_batchsize,  # optimization hypers
    gamma,
    lam,  # advantage estimation
    max_timesteps=0,
    max_episodes=0,
    max_iters=0,
    max_seconds=0,  # time constraint
    callback=None,  # you can do anything in the callback, since it takes locals(), globals()
    adam_epsilon=1e-5,
    schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
    **kwargs,
):
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space

    pi = policy_fn("pi", ob_space,
                   ac_space)  # Construct network for new policy

    oldpi = policy_fn("oldpi", ob_space, ac_space)  # Network for old policy

    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    atarg_novel = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function for the novelty reward term
    ret_novel = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Empirical return for the novelty reward term

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule

    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()

    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold

    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param,
                             1.0 + clip_param) * atarg  #

    surr1_novel = ratio * atarg_novel  # surrogate loss of the novelty term
    surr2_novel = tf.clip_by_value(
        ratio, 1.0 - clip_param,
        1.0 + clip_param) * atarg_novel  # surrogate loss of the novelty term

    pol_surr = -tf.reduce_mean(tf.minimum(
        surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    pol_surr_novel = -tf.reduce_mean(tf.minimum(
        surr1_novel, surr2_novel))  # PPO's surrogate for the novelty part

    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    vf_loss_novel = tf.reduce_mean(tf.square(pi.vpred_novel - ret_novel))

    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]

    total_loss_novel = pol_surr_novel + pol_entpen + vf_loss_novel
    losses_novel = [pol_surr_novel, pol_entpen, vf_loss_novel, meankl, meanent]

    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    policy_var_list = pi.get_trainable_variables(scope='pi/pol')

    policy_var_count = 0
    for vars in policy_var_list:
        count_in_var = 1
        for dim in vars.shape._dims:
            count_in_var *= dim
        policy_var_count += count_in_var

    noise_count = pi.get_trainable_variables(
        scope='pi/pol/logstd')[0].shape._dims[1]

    var_list = pi.get_trainable_variables(
        scope='pi/pol') + pi.get_trainable_variables(scope='pi/vf/')
    var_list_novel = pi.get_trainable_variables(
        scope='pi/pol') + pi.get_trainable_variables(scope='pi/vf_novel/')
    var_list_pi = pi.get_trainable_variables(
        scope='pi/pol') + pi.get_trainable_variables(
            scope='pi/vf/') + pi.get_trainable_variables(scope='pi/vf_novel/')

    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])

    lossandgrad_novel = U.function(
        [ob, ac, atarg_novel, ret_novel, lrmult],
        losses_novel + [U.flatgrad(total_loss_novel, var_list_novel)])

    # adam = MpiAdam(var_list, epsilon=adam_epsilon)
    # adam_novel = MpiAdam(var_list_novel, epsilon=adam_epsilon)
    adam_all = MpiAdam(var_list_pi, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])

    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)
    compute_losses_novel = U.function([ob, ac, atarg_novel, ret_novel, lrmult],
                                      losses_novel)

    U.initialize()

    adam_all.sync()

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_actorbatch,
                                     stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0

    novelty_update_iter_cycle = 10
    novelty_start_iter = 50
    novelty_update = True

    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards
    rewnovelbuffer = deque(
        maxlen=100)  # rolling buffer for episode novelty rewards

    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0,
         max_seconds > 0]) == 1, "Only one time constraint permitted"

    # This for debug purpose
    # from collections import defaultdict
    # sum_batch = {}
    # sum_batch = defaultdict(lambda: 0, sum_batch)
    total_task_gradients = []
    total_novelty_gradients = []
    while True:

        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)

        seg = seg_gen.__next__()

        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, atarg_novel, tdlamret, tdlamret_novel = seg["ob"], seg[
            "ac"], seg["adv"], seg["adv_novel"], seg["tdlamret"], seg[
                "tdlamret_novel"]

        vpredbefore = seg["vpred"]  # predicted value function before update
        vprednovelbefore = seg[
            'vpred_novel']  # predicted novelty value function before update

        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate

        atarg_novel = (atarg_novel - atarg_novel.mean()) / atarg_novel.std(
        )  # standardized novelty advantage function estimate

        d = Dataset(dict(ob=ob,
                         ac=ac,
                         atarg=atarg,
                         vtarg=tdlamret,
                         atarg_novel=atarg_novel,
                         vtarg_novel=tdlamret_novel),
                    shuffle=not pi.recurrent)

        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        assign_old_eq_new()  # set old parameter values to new parameter values
        logger.log("Optimizing...")
        logger.log(fmt_row(13, loss_names))
        same_update_direction = []  # True
        task_gradient_mag = []
        novel_gradient_mag = []
        task_gradients = []
        novel_gradients = []
        same_dir_cnt = 0
        oppo_dir_cnt = 0
        # Here we do a bunch of optimization epochs over the data

        for _ in range(optim_epochs):
            losses = [
            ]  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                *newlosses, g = lossandgrad(batch["ob"], batch["ac"],
                                            batch["atarg"], batch["vtarg"],
                                            cur_lrmult)

                *newlosses_novel, g_novel = lossandgrad_novel(
                    batch["ob"], batch["ac"], batch["atarg_novel"],
                    batch["vtarg_novel"], cur_lrmult)

                pol_g = g[0:policy_var_count]
                pol_g_novel = g_novel[0:policy_var_count]

                comm = MPI.COMM_WORLD

                pol_g_reduced = np.zeros_like(pol_g)
                pol_g_novel_reduced = np.zeros_like(pol_g_novel)

                comm.Allreduce(pol_g, pol_g_reduced, op=MPI.SUM)

                pol_g_reduced /= comm.Get_size()

                comm.Allreduce(pol_g_novel, pol_g_novel_reduced, op=MPI.SUM)
                pol_g_novel_reduced /= comm.Get_size()

                final_gradient = np.zeros(
                    len(g) + len(g_novel) - policy_var_count)
                final_gradient[policy_var_count::] = np.concatenate(
                    (g[policy_var_count::], g_novel[policy_var_count::]))

                # pol_g_normalized = pol_g / np.linalg.norm(pol_g)
                # pol_g_novel_normalized = pol_g_novel / np.linalg.norm(pol_g_novel)

                pol_g_reduced_no_noise = pol_g_reduced[:(len(pol_g_reduced) -
                                                         noise_count)]

                pol_g_novel_reduced_no_noise = pol_g_novel_reduced[:(
                    len(pol_g_novel_reduced) - noise_count)]

                pol_g_reduced_no_noise_normalized = pol_g_reduced_no_noise / np.linalg.norm(
                    pol_g_reduced_no_noise)
                pol_g_novel_reduced_no_noise_normalized = pol_g_novel_reduced_no_noise / np.linalg.norm(
                    pol_g_novel_reduced_no_noise)

                dot = np.dot(pol_g_reduced_no_noise_normalized,
                             pol_g_novel_reduced_no_noise_normalized)

                task_gradients.append(pol_g_reduced_no_noise)
                novel_gradients.append(pol_g_novel_reduced_no_noise)

                task_gradient_mag.append(
                    np.linalg.norm(pol_g_reduced_no_noise))
                novel_gradient_mag.append(
                    np.linalg.norm(pol_g_novel_reduced_no_noise))

                same_update_direction.append(dot)

                # pol_g_normalized = pol_g_reduced_normalized
                # pol_g_novel_normalized = pol_g_novel_reduced_normalized

                pol_g_reduced_normalized = pol_g_reduced / np.linalg.norm(
                    pol_g_reduced)
                pol_g_novel_reduced_normalized = pol_g_novel_reduced / np.linalg.norm(
                    pol_g_novel_reduced)

                if dot > 0:
                    same_dir_cnt += 1
                    bisector_no_noise = (pol_g_reduced_normalized +
                                         pol_g_novel_reduced_normalized)
                    bisector_no_noise_normalized = bisector_no_noise / np.linalg.norm(
                        bisector_no_noise)
                    # quarterSector_no_noise = (pol_g_reduced_normalized + bisector_no_noise_normalized)
                    # quarterSector_no_noise_normalized = quarterSector_no_noise / np.linalg.norm(quarterSector_no_noise)
                    #
                    # octSector_no_noise = (pol_g_reduced_normalized + quarterSector_no_noise_normalized)
                    # octSector_no_noise_normalized = octSector_no_noise / np.linalg.norm(octSector_no_noise)
                    target_dir = bisector_no_noise_normalized

                    final_gradient[0:policy_var_count] = 0.5 * (
                        np.dot(pol_g_reduced, target_dir) +
                        np.dot(pol_g_novel_reduced, target_dir)) * target_dir

                    adam_all.update(final_gradient,
                                    optim_stepsize * cur_lrmult)
                else:
                    oppo_dir_cnt += 1
                    task_projection_no_noise = np.dot(
                        pol_g_reduced, pol_g_novel_reduced_normalized
                    ) * pol_g_novel_reduced_normalized

                    final_pol_gradient_no_noise = pol_g_reduced - task_projection_no_noise

                    final_gradient[
                        0:policy_var_count] = final_pol_gradient_no_noise

                    adam_all.update(final_gradient,
                                    optim_stepsize * cur_lrmult)

                losses.append(newlosses)
            logger.log(fmt_row(13, np.mean(losses, axis=0)))

        logger.log("Evaluating losses...")
        losses = []
        for batch in d.iterate_once(optim_batchsize):
            newlosses = compute_losses(batch["ob"], batch["ac"],
                                       batch["atarg"], batch["vtarg"],
                                       cur_lrmult)
            # newlosses_novel = compute_losses_novel(batch["ob"], batch["ac"], batch["atarg_novel"], batch["vtarg_novel"],
            #                                        cur_lrmult)
            losses.append(newlosses)
        meanlosses, _, _ = mpi_moments(losses, axis=0)
        logger.log(fmt_row(13, meanlosses))
        for (lossval, name) in zipsame(meanlosses, loss_names):
            logger.record_tabular("loss_" + name, lossval)
        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))
        lrlocal = (seg["ep_lens"], seg["ep_rets"], seg['ep_rets_novel']
                   )  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews, rews_novel = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        rewnovelbuffer.extend(rews_novel)
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpRNoveltyRewMean", np.mean(rewnovelbuffer))

        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        if iters_so_far >= novelty_start_iter and iters_so_far % novelty_update_iter_cycle == 0:
            novelty_update = not novelty_update

        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        logger.record_tabular("RelativeDirection",
                              np.array(same_update_direction).mean())
        logger.record_tabular("SameDirectionCount", same_dir_cnt)
        logger.record_tabular("OppoDirectionCount", oppo_dir_cnt)
        logger.record_tabular("TaskGradMag",
                              np.array(task_gradient_mag).mean())
        logger.record_tabular("NoveltyGradMag",
                              np.array(novel_gradient_mag).mean())

        task_gradients = np.array(task_gradients).mean(axis=0)
        total_task_gradients.append(task_gradients)

        novel_gradients = np.array(novel_gradients).mean(axis=0)
        total_novelty_gradients.append(novel_gradients)

        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()

    if MPI.COMM_WORLD.Get_rank() == 0:
        gradient_info = {}

        gradient_info['task_gradients'] = np.array(total_task_gradients)
        gradient_info['novelty_gradients'] = np.array(total_novelty_gradients)
        print(np.array(total_task_gradients).shape)
        print(np.array(total_novelty_gradients).shape)

        joblib.dump(gradient_info,
                    logger.get_dir() + '/gradientinfo.pkl',
                    compress=True)

    return pi
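
The example above combines a task-reward gradient with a novelty-reward gradient before every Adam step: when the two normalized policy gradients point into the same half-space (positive dot product) it steps along their bisector, otherwise it removes the component of the task gradient that lies along the novelty direction. Below is a minimal NumPy sketch of that combination rule, assuming two flat gradient vectors of equal length and ignoring the example's exclusion of the logstd ("noise") entries; the name combine_gradients is illustrative and not part of the code above.

import numpy as np

def combine_gradients(g_task, g_novel, eps=1e-8):
    # Illustrative sketch of the gradient combination rule above,
    # not the library's API.
    g_task = np.asarray(g_task, dtype=np.float64)
    g_novel = np.asarray(g_novel, dtype=np.float64)
    t_hat = g_task / (np.linalg.norm(g_task) + eps)
    n_hat = g_novel / (np.linalg.norm(g_novel) + eps)
    if np.dot(t_hat, n_hat) > 0:
        # Agreement: step along the normalized bisector, scaled by the
        # average projection of both gradients onto that direction.
        bisector = t_hat + n_hat
        b_hat = bisector / (np.linalg.norm(bisector) + eps)
        scale = 0.5 * (np.dot(g_task, b_hat) + np.dot(g_novel, b_hat))
        return scale * b_hat
    # Conflict: drop the component of the task gradient that lies along
    # the novelty direction and keep the rest.
    return g_task - np.dot(g_task, n_hat) * n_hat

# Usage sketch: combined = combine_gradients(pol_g_reduced, pol_g_novel_reduced)
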
Code Example #11
0
def learn(
        env,
        policy_func,
        *,
        timesteps_per_batch,  # timesteps per actor per update
        clip_param,
        entcoeff,  # clipping parameter epsilon, entropy coeff
        optim_epochs,
        optim_stepsize,
        optim_batchsize,  # optimization hypers
        gamma,
        lam,  # advantage estimation
        max_timesteps=0,
        max_episodes=0,
        max_iters=0,
        max_seconds=0,  # time constraint
        callback=None,  # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
        sym_loss_weight=0.0,
        return_threshold=None,  # terminate learning if the return reaches return_threshold
        op_after_init=None,
        init_policy_params=None,
        policy_scope=None,
        max_threshold=None,
        positive_rew_enforce=False,
        reward_drop_bound=None,
        min_iters=0,
        ref_policy_params=None,
        rollout_length_thershold=None):

    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    if policy_scope is None:
        pi = policy_func("pi", ob_space,
                         ac_space)  # Construct network for new policy
        oldpi = policy_func("oldpi", ob_space,
                            ac_space)  # Network for old policy
    else:
        pi = policy_func(policy_scope, ob_space,
                         ac_space)  # Construct network for new policy
        oldpi = policy_func("old" + policy_scope, ob_space,
                            ac_space)  # Network for old policy

    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    pol_entpen = (-entcoeff) * meanent

    sym_loss = sym_loss_weight * U.mean(
        tf.square(pi.mean - pi.mirrored_mean))  # mirror symmetric loss
    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg  # clipped surrogate
    pol_surr = -U.mean(tf.minimum(
        surr1, surr2)) + sym_loss  # PPO's pessimistic surrogate (L^CLIP)

    vf_loss = U.mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent, sym_loss]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent", "sym_loss"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()

    if init_policy_params is not None:
        cur_scope = pi.get_variables()[0].name[0:pi.get_variables()[0].name.
                                               find('/')]
        orig_scope = list(init_policy_params.keys()
                          )[0][0:list(init_policy_params.keys())[0].find('/')]
        for i in range(len(pi.get_variables())):
            assign_op = pi.get_variables()[i].assign(
                init_policy_params[pi.get_variables()[i].name.replace(
                    cur_scope, orig_scope, 1)])
            U.get_session().run(assign_op)
            assign_op = oldpi.get_variables()[i].assign(
                init_policy_params[pi.get_variables()[i].name.replace(
                    cur_scope, orig_scope, 1)])
            U.get_session().run(assign_op)

    if ref_policy_params is not None:
        ref_pi = policy_func("ref_pi", ob_space, ac_space)
        cur_scope = ref_pi.get_variables()[0].name[0:ref_pi.get_variables()[0].
                                                   name.find('/')]
        orig_scope = list(ref_policy_params.keys()
                          )[0][0:list(ref_policy_params.keys())[0].find('/')]
        for i in range(len(ref_pi.get_variables())):
            assign_op = ref_pi.get_variables()[i].assign(
                ref_policy_params[ref_pi.get_variables()[i].name.replace(
                    cur_scope, orig_scope, 1)])
            U.get_session().run(assign_op)
        env.env.env.ref_policy = ref_pi

    adam.sync()

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_batch,
                                     stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0,
         max_seconds > 0]) == 1, "Only one time constraint permitted"

    max_thres_satisfied = max_threshold is None
    adjust_ratio = 0.0
    prev_avg_rew = -1000000
    revert_parameters = {}
    variables = pi.get_variables()
    for i in range(len(variables)):
        cur_val = variables[i].eval()
        revert_parameters[variables[i].name] = cur_val
    revert_data = [0, 0, 0]
    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)

        seg = seg_gen.__next__()

        if reward_drop_bound is not None:
            lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
            listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
            lens, rews = map(flatten_lists, zip(*listoflrpairs))
            lenbuffer.extend(lens)
            rewbuffer.extend(rews)
            revert_iteration = False
            if np.mean(
                    rewbuffer
            ) < prev_avg_rew - reward_drop_bound:  # detect significant drop in performance, revert to previous iteration
                print("Revert Iteration!!!!!")
                revert_iteration = True
            else:
                prev_avg_rew = np.mean(rewbuffer)
            logger.record_tabular("Revert Rew", prev_avg_rew)
            if revert_iteration:  # revert iteration
                for i in range(len(pi.get_variables())):
                    assign_op = pi.get_variables()[i].assign(
                        revert_parameters[pi.get_variables()[i].name])
                    U.get_session().run(assign_op)
                episodes_so_far = revert_data[0]
                timesteps_so_far = revert_data[1]
                iters_so_far = revert_data[2]
                continue
            else:
                variables = pi.get_variables()
                for i in range(len(variables)):
                    cur_val = variables[i].eval()
                    revert_parameters[variables[i].name] = np.copy(cur_val)
                revert_data[0] = episodes_so_far
                revert_data[1] = timesteps_so_far
                revert_data[2] = iters_so_far

        if positive_rew_enforce:
            rewlocal = (seg["pos_rews"], seg["neg_pens"], seg["rew"]
                        )  # local values
            listofrews = MPI.COMM_WORLD.allgather(rewlocal)  # list of tuples
            pos_rews, neg_pens, rews = map(flatten_lists, zip(*listofrews))
            if np.mean(rews) < 0.0:
                #min_id = np.argmin(rews)
                #adjust_ratio = pos_rews[min_id]/np.abs(neg_pens[min_id])
                adjust_ratio = np.max([
                    adjust_ratio,
                    np.mean(pos_rews) / np.abs(np.mean(neg_pens))
                ])
                for i in range(len(seg["rew"])):
                    if np.abs(seg["rew"][i] - seg["pos_rews"][i] -
                              seg["neg_pens"][i]) > 1e-5:
                        print(seg["rew"][i], seg["pos_rews"][i],
                              seg["neg_pens"][i])
                        print('Reward wrong!')
                        raise ValueError('Reward decomposition mismatch!')
                    seg["rew"][i] = seg["pos_rews"][
                        i] + seg["neg_pens"][i] * adjust_ratio
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
            "tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate
        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                    shuffle=not pi.recurrent)
        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        assign_old_eq_new()  # set old parameter values to new parameter values
        logger.log("Optimizing...")
        logger.log(fmt_row(13, loss_names))
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = [
            ]  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                *newlosses, g = lossandgrad(batch["ob"], batch["ac"],
                                            batch["atarg"], batch["vtarg"],
                                            cur_lrmult)
                adam.update(g, optim_stepsize * cur_lrmult)
                losses.append(newlosses)
            logger.log(fmt_row(13, np.mean(losses, axis=0)))
        logger.log("Evaluating losses...")
        losses = []
        for batch in d.iterate_once(optim_batchsize):
            newlosses = compute_losses(batch["ob"], batch["ac"],
                                       batch["atarg"], batch["vtarg"],
                                       cur_lrmult)
            losses.append(newlosses)
        meanlosses, _, _ = mpi_moments(losses, axis=0)
        logger.log(fmt_row(13, meanlosses))
        for (lossval, name) in zipsame(meanlosses, loss_names):
            logger.record_tabular("loss_" + name, lossval)
        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))
        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        if reward_drop_bound is None:
            lenbuffer.extend(lens)
            rewbuffer.extend(rews)
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        logger.record_tabular("Iter", iters_so_far)
        if positive_rew_enforce:
            if adjust_ratio is not None:
                logger.record_tabular("RewardAdjustRatio", adjust_ratio)
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()

        if max_threshold is not None:
            print('Current max return: ', np.max(rewbuffer))
            if np.max(rewbuffer) > max_threshold:
                max_thres_satisfied = True
            else:
                max_thres_satisfied = False

        return_threshold_satisfied = True
        if return_threshold is not None:
            if not (np.mean(rewbuffer) > return_threshold
                    and iters_so_far > min_iters):
                return_threshold_satisfied = False
        rollout_length_thershold_satisfied = True
        if rollout_length_thershold is not None:
            rewlocal = (seg["avg_vels"], seg["rew"])  # local values
            listofrews = MPI.COMM_WORLD.allgather(rewlocal)  # list of tuples
            avg_vels, rews = map(flatten_lists, zip(*listofrews))
            if not (np.mean(lenbuffer) > rollout_length_thershold
                    and np.mean(avg_vels) > 0.5 * env.env.env.final_tv):
                rollout_length_thershold_satisfied = False
        if rollout_length_thershold is not None or return_threshold is not None:
            if rollout_length_thershold_satisfied and return_threshold_satisfied:
                break

    return pi, np.mean(rewbuffer)
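
One distinctive piece of the example above is the reward_drop_bound guard: every accepted iteration snapshots the policy variables, and if the rolling average reward later drops by more than the bound, the snapshot is restored and the iteration counters are rolled back. Below is a minimal sketch of that guard using plain NumPy parameter dicts in place of TF variables; the names snapshot and maybe_revert and the dict layout are illustrative assumptions.

import numpy as np

def snapshot(params):
    # Copy the current parameters so a later iteration can roll back.
    return {name: np.copy(value) for name, value in params.items()}

def maybe_revert(params, saved, prev_avg_rew, cur_avg_rew, drop_bound):
    # Return True (and restore `saved` into `params` in place) if the
    # running average reward fell by more than drop_bound.
    if cur_avg_rew < prev_avg_rew - drop_bound:
        for name, value in saved.items():
            params[name][...] = value
        return True
    return False

# Usage sketch inside a training loop:
# saved, prev_avg_rew = snapshot(params), -1e6
# if maybe_revert(params, saved, prev_avg_rew, np.mean(rewbuffer), drop_bound=5.0):
#     continue                      # skip this update, keep the old counters
# saved, prev_avg_rew = snapshot(params), np.mean(rewbuffer)
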
Code Example #12
0
                o, r, d, _ = policy.act(random=True)
                length += 1
                collected_data_size += 1

                if d:
                    lengths.append(length)
                    break
        print('Average rollout length: ', np.mean(lengths))

        dataset = Dataset(dict(X=np.array(input_data),
                               Y=np.array(output_data)),
                          shuffle=True)
        losses = []
        for iter in range(300):
            loss_epoch = []
            for batch in dataset.iterate_once(64):
                batch["X"] = torch.tensor(batch["X"],
                                          dtype=torch.float32,
                                          device=device)
                batch["Y"] = torch.tensor(batch["Y"],
                                          dtype=torch.float32,
                                          device=device)
                optimizer.zero_grad()
                outputs = osi(batch["X"])
                loss = criterion(outputs, batch["Y"])
                loss.backward()
                optimizer.step()
                loss_epoch.append(float(loss))
            losses.append(np.mean(loss_epoch))
            if iter % 5 == 0:
                print('iter: ', iter, 'loss: ', np.mean(loss_epoch))
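
This fragment fits an online system identification (OSI) model with PyTorch, using the baselines Dataset helper only to shuffle and minibatch the collected (input, output) pairs. Below is a self-contained sketch of the same shuffle-and-minibatch regression loop without the Dataset helper; the network architecture, array shapes, and hyperparameters are illustrative assumptions, not taken from the fragment.

import numpy as np
import torch
import torch.nn as nn

device = torch.device("cpu")
X = np.random.randn(1024, 16).astype(np.float32)   # stacked (obs, action) histories (assumed shape)
Y = np.random.randn(1024, 3).astype(np.float32)    # parameters to identify (assumed shape)

osi = nn.Sequential(nn.Linear(16, 64), nn.ReLU(), nn.Linear(64, 3)).to(device)
optimizer = torch.optim.Adam(osi.parameters(), lr=1e-3)
criterion = nn.MSELoss()

for epoch in range(20):
    perm = np.random.permutation(len(X))            # reshuffle each epoch
    epoch_losses = []
    for start in range(0, len(X), 64):              # minibatches of 64
        idx = perm[start:start + 64]
        xb = torch.tensor(X[idx], device=device)
        yb = torch.tensor(Y[idx], device=device)
        optimizer.zero_grad()
        loss = criterion(osi(xb), yb)
        loss.backward()
        optimizer.step()
        epoch_losses.append(float(loss))
    if epoch % 5 == 0:
        print("epoch:", epoch, "loss:", np.mean(epoch_losses))
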
Code Example #13
0
File: pposgd_simple.py Project: shawnlewis/baselines
def learn(
        env,
        policy_func,
        timesteps_per_batch,  # timesteps per actor per update
        clip_param,
        entcoeff,  # clipping parameter epsilon, entropy coeff
        optim_epochs,
        optim_stepsize,
        optim_batchsize,  # optimization hypers
        gamma,
        lam,  # advantage estimation
        max_timesteps=0,
        max_episodes=0,
        max_iters=0,
        max_seconds=0,  # time constraint
        callback=None,  # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
        load_model=None,
        action_bias=0.4,
        action_repeat=0,
        action_repeat_rand=False,
        warmup_frames=0,
        target_kl=0.01,
        vf_loss_mult=1,
        vfloss_optim_stepsize=0.003,
        vfloss_optim_batchsize=8,
        vfloss_optim_epochs=10):
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space,
                     ac_space)  # Construct network for new policy
    oldpi = policy_func("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    # Not sure why they anneal clip and learning rate with the same parameter
    #clip_param = clip_param * lrmult # Annealed cliping parameter epislon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg  # clipped surrogate
    pol_surr = -U.mean(tf.minimum(
        surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = U.mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen
    losses = [pol_surr, pol_entpen, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    lossandgrad_vfloss = U.function([ob, ac, atarg, ret], [vf_loss] +
                                    [U.flatgrad(vf_loss, var_list)])
    adam_vfloss = MpiAdam(var_list, epsilon=adam_epsilon)
    compute_vfloss = U.function([ob, ac, atarg, ret], [vf_loss])

    U.initialize()
    adam.sync()
    adam_vfloss.sync()

    if load_model:
        logger.log('Loading model: %s' % load_model)
        pi.load(load_model)

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_batch,
                                     stochastic=True,
                                     action_bias=action_bias,
                                     action_repeat=action_repeat,
                                     action_repeat_rand=action_repeat_rand,
                                     warmup_frames=warmup_frames)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0,
         max_seconds > 0]) == 1, "Only one time constraint permitted"

    ep_rew_file = None
    if MPI.COMM_WORLD.Get_rank() == 0:
        import wandb
        ep_rew_file = open(
            os.path.join(wandb.run.dir, 'episode_rewards.jsonl'), 'w')
        checkpoint_dir = 'checkpoints-%s' % wandb.run.id
        os.mkdir(checkpoint_dir)

    cur_lrmult = 1.0
    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        elif schedule == 'target_kl':
            pass
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)

        seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
            "tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate
        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                    shuffle=not pi.recurrent)
        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        assign_old_eq_new()  # set old parameter values to new parameter values
        logger.log("Optimizing...")
        logger.log(fmt_row(13, loss_names))
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = [
            ]  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                result = lossandgrad(batch["ob"], batch["ac"], batch["atarg"],
                                     batch["vtarg"], cur_lrmult)
                newlosses = result[:-1]
                g = result[-1]
                adam.update(g, optim_stepsize * cur_lrmult)
                losses.append(newlosses)
            logger.log(fmt_row(13, np.mean(losses, axis=0)))

        # vfloss optimize
        logger.log("Optimizing value function...")
        logger.log(fmt_row(13, ['vf']))
        for _ in range(vfloss_optim_epochs):
            losses = [
            ]  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(vfloss_optim_batchsize):
                result = lossandgrad_vfloss(batch["ob"], batch["ac"],
                                            batch["atarg"], batch["vtarg"])
                newlosses = result[:-1]
                g = result[-1]
                adam_vfloss.update(g, vfloss_optim_stepsize)
                losses.append(newlosses)
            logger.log(fmt_row(13, np.mean(losses, axis=0)))

        logger.log("Evaluating losses...")
        losses = []
        for batch in d.iterate_once(optim_batchsize):
            newlosses = compute_losses(batch["ob"], batch["ac"],
                                       batch["atarg"], batch["vtarg"],
                                       cur_lrmult)
            newlosses += compute_vfloss(batch["ob"], batch["ac"],
                                        batch["atarg"], batch["vtarg"])
            losses.append(newlosses)
        meanlosses, _, _ = mpi_moments(losses, axis=0)
        logger.log(fmt_row(13, meanlosses))
        for (lossval, name) in zipsame(meanlosses, loss_names + ['vf']):
            logger.record_tabular("loss_" + name, lossval)
        # check kl
        if schedule == 'target_kl':
            if meanlosses[2] > target_kl * 1.1:
                cur_lrmult /= 1.5
            elif meanlosses[2] < target_kl / 1.1:
                cur_lrmult *= 1.5
        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))
        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        if rewbuffer:
            logger.record_tabular('CurLrMult', cur_lrmult)
            logger.record_tabular('StepSize', optim_stepsize * cur_lrmult)
            logger.record_tabular("EpLenMean", np.mean(lenbuffer))
            logger.record_tabular("EpRewMax", np.max(rewbuffer))
            logger.record_tabular("EpRewMean", np.mean(rewbuffer))
            logger.record_tabular("EpRewMin", np.min(rewbuffer))
            logger.record_tabular("EpThisIter", len(lens))
            episodes_so_far += len(lens)
            timesteps_so_far += sum(lens)
            iters_so_far += 1
            logger.record_tabular("EpisodesSoFar", episodes_so_far)
            logger.record_tabular("TimestepsSoFar", timesteps_so_far)
            time_elapsed = time.time() - tstart
            logger.record_tabular("TimeElapsed", time_elapsed)
            if MPI.COMM_WORLD.Get_rank() == 0:
                import wandb
                ep_rew_file.write('%s\n' % json.dumps({
                    'TimeElapsed': time_elapsed,
                    'Rewards': rews
                }))
                ep_rew_file.flush()
                data = logger.Logger.CURRENT.name2val
                wandb.run.history.add(data)
                summary_data = {}
                for k, v in data.items():
                    if 'Rew' in k:
                        summary_data[k] = v
                wandb.run.summary.update(summary_data)
                pi.save(
                    os.path.join(checkpoint_dir,
                                 'model-%s.ckpt' % (iters_so_far - 1)))

                logger.dump_tabular()
        else:
            logger.log('No episodes complete yet')
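
Besides optimizing the value function with its own Adam instance, the example above supports a 'target_kl' schedule: after each iteration the measured KL divergence is compared against target_kl and the learning-rate multiplier is shrunk or grown by a factor of 1.5. A minimal sketch of that adjustment follows; the function name and keyword defaults are illustrative, not part of the example.

def adapt_lrmult(cur_lrmult, measured_kl, target_kl=0.01, tol=1.1, factor=1.5):
    # Sketch of the 'target_kl' schedule above: nudge the learning-rate
    # multiplier so the measured policy KL tracks the target.
    if measured_kl > target_kl * tol:
        cur_lrmult /= factor      # policy moved too far: slow down
    elif measured_kl < target_kl / tol:
        cur_lrmult *= factor      # policy barely moved: speed up
    return cur_lrmult

# Usage sketch: cur_lrmult = adapt_lrmult(cur_lrmult, meanlosses[2])  # index 2 is "kl"
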
Code Example #14
0
def learn(
    env,
    policy_func,
    *,
    timesteps_per_batch,  # timesteps per actor per update
    clip_param,
    entcoeff,  # clipping parameter epsilon, entropy coeff
    optim_epochs,
    optim_stepsize,
    optim_batchsize,  # optimization hypers
    gamma,
    lam,  # advantage estimation
    max_timesteps=0,
    max_episodes=0,
    max_iters=0,
    max_seconds=0,  # time constraint
    callback=None,  # you can do anything in the callback, since it takes locals(), globals()
    adam_epsilon=1e-5,
    schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
    load_model_path,
    test_only,
    stochastic,
    symmetric_training=False,
    obs_names=None,
    single_episode=False,
    horizon_hack=False,
    running_avg_len=100,
    init_three=False,
    actions=None,
    symmetric_training_trick=False,
    seeds_fn=None,
    bootstrap_seeds=False,
):
    global seeds
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space)  # Network for new policy
    old_pi = policy_func("old_pi", ob_space,
                         ac_space)  # Network for old policy
    adv_targ = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return
    mask = tf.placeholder(dtype=tf.bool, shape=[None])  # Mask for the trick

    lr_mult = tf.placeholder(
        name='lr_mult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lr_mult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    st = U.get_placeholder_cached(name="st")
    ac = pi.pdtype.sample_placeholder([None])

    kl = old_pi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    mean_kl = U.mean(tf.boolean_mask(kl, mask))  # Mean over the batch
    mean_ent = U.mean(tf.boolean_mask(ent, mask))
    entropy_penalty = -entcoeff * mean_ent

    ratio = tf.exp(pi.pd.logp(ac) - old_pi.pd.logp(ac))  # pi_new / pi_old
    surr_1 = ratio * adv_targ  # surrogate from conservative policy iteration
    surr_2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * adv_targ  # clipped surrogate
    surr_loss = -U.mean(tf.boolean_mask(
        tf.minimum(surr_1, surr_2),
        mask))  # PPO's pessimistic surrogate (L^CLIP), mean over the batch
    vf_loss = U.mean(tf.boolean_mask(tf.square(pi.vpred - ret), mask))
    total_loss = surr_loss + entropy_penalty + vf_loss
    losses = [surr_loss, entropy_penalty, vf_loss, mean_kl, mean_ent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    comp_loss_and_grad = U.function([ob, st, ac, adv_targ, ret, lr_mult, mask],
                                    losses +
                                    [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(old_v, new_v)
            for (old_v,
                 new_v) in zipsame(old_pi.get_variables(), pi.get_variables())
        ])
    comp_loss = U.function([ob, st, ac, adv_targ, ret, lr_mult, mask], losses)

    if init_three:
        assign_init_three_1 = U.function(
            [], [],
            updates=[
                tf.assign(new_v, old_v) for (old_v, new_v) in zipsame(
                    pi.get_orig_variables(), pi.get_part_variables(1))
            ])
        assign_init_three_2 = U.function(
            [], [],
            updates=[
                tf.assign(new_v, old_v) for (old_v, new_v) in zipsame(
                    pi.get_orig_variables(), pi.get_part_variables(2))
            ])

    U.initialize()
    if load_model_path is not None:
        U.load_state(load_model_path)
        if init_three:
            assign_init_three_1()
            assign_init_three_2()
    adam.sync()

    if seeds_fn is not None:
        with open(seeds_fn) as f:
            seeds = [int(seed) for seed in f.readlines()]
    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_batch,
                                     stochastic=stochastic,
                                     single_episode=test_only
                                     or single_episode,
                                     actions=actions,
                                     bootstrap_seeds=bootstrap_seeds)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    len_buffer = deque(
        maxlen=running_avg_len)  # rolling buffer for episode lengths
    rew_buffer = deque(
        maxlen=running_avg_len)  # rolling buffer for episode rewards
    origrew_buffer = deque(
        maxlen=running_avg_len)  # rolling buffer for original episode rewards

    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0,
         max_seconds > 0]) == 1, "Only one time constraint permitted"

    while True:
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)

        seg = seg_gen.__next__()

        add_vtarg_and_adv(seg, gamma, lam, horizon_hack=horizon_hack)

        # ob, ac, adv_targ, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, st, ac, adv_targ, tdlamret = seg["ob"], seg["step"], seg[
            "ac"], seg["adv"], seg["tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update

        if symmetric_training_trick:
            first_75 = st < 75
            mask = ~np.concatenate((np.zeros_like(first_75), first_75))
        else:
            mask = np.concatenate(
                (np.ones_like(st, dtype=bool), np.ones_like(st, dtype=bool)))
        if symmetric_training:
            sym_obss = []
            sym_acc = []
            for i in range(timesteps_per_batch):
                obs = OrderedDict(zip(obs_names, ob[i]))
                sym_obs = obs.copy()
                swap_legs(sym_obs)

                sym_ac = ac[i].copy()
                sym_ac = np.concatenate((sym_ac[9:], sym_ac[:9]))
                sym_obss.append(np.asarray(list(sym_obs.values())))
                sym_acc.append(sym_ac)
            sym_obss = np.asarray(sym_obss)
            sym_acc = np.asarray(sym_acc)

            ob = np.concatenate((ob, sym_obss))
            ac = np.concatenate((ac, sym_acc))
            adv_targ = np.concatenate((adv_targ, adv_targ))
            tdlamret = np.concatenate((tdlamret, tdlamret))
            vpredbefore = np.concatenate((vpredbefore, vpredbefore))
            st = np.concatenate((st, st))

        # Compute stats before updating
        if bootstrap_seeds:
            lrlocal = (seg["ep_lens"], seg["ep_rets"], seg["ep_orig_rets"],
                       seg["easy_seeds"], seg["hard_seeds"])  # local values
            listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
            lens, rews, orig_rews, easy_seeds, hard_seeds = map(
                flatten_lists, zip(*listoflrpairs))
            easy_seeds = [x for x in easy_seeds if x != 0]
            hard_seeds = [x for x in hard_seeds if x != 0]
            print('seeds set sizes:', len(seeds), len(easy_seeds),
                  len(hard_seeds))
            seeds = list((set(seeds) - set(easy_seeds)) | set(hard_seeds))
        else:
            lrlocal = (seg["ep_lens"], seg["ep_rets"], seg["ep_orig_rets"]
                       )  # local values
            listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
            lens, rews, orig_rews = map(flatten_lists, zip(*listoflrpairs))

        len_buffer.extend(lens)
        rew_buffer.extend(rews)
        origrew_buffer.extend(orig_rews)
        logger.record_tabular("Iter", iters_so_far)
        logger.record_tabular("EpLenMean", np.mean(len_buffer))
        logger.record_tabular("EpRewMean", np.mean(rew_buffer))
        logger.record_tabular("EpOrigRewMean", np.mean(origrew_buffer))
        logger.record_tabular("EpOrigRewStd", np.std(origrew_buffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)

        n_completed = 0
        sum_completed = 0
        for ep_len, orig_rew in zip(lens, orig_rews):
            if ep_len == 1000:
                n_completed += 1
                sum_completed += orig_rew
        avg_completed = sum_completed / n_completed if n_completed > 0 else 0
        logger.record_tabular("AvgCompleted", avg_completed)
        perc_completed = 100 * n_completed / len(lens) if len(lens) > 0 else 0
        logger.record_tabular("PercCompleted", perc_completed)

        if callback: callback(locals(), globals())

        adv_targ = (adv_targ - adv_targ.mean()) / adv_targ.std(
        )  # standardized advantage function estimate
        d = Dataset(dict(ob=ob,
                         st=st,
                         ac=ac,
                         atarg=adv_targ,
                         vtarg=tdlamret,
                         mask=mask),
                    shuffle=not pi.recurrent)
        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        assign_old_eq_new()  # set old parameter values to new parameter values
        logger.log("Optimizing...")
        if not test_only:
            logger.log(fmt_row(13, loss_names))
        # Here we do a bunch of optimization epochs over the data. I log results only for the first worker (rank=0)
        for _ in range(optim_epochs):
            losses = [
            ]  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                *batch_losses, grads = comp_loss_and_grad(
                    batch["ob"], batch["st"], batch["ac"], batch["atarg"],
                    batch["vtarg"], cur_lrmult, batch["mask"])
                if not test_only:
                    adam.update(grads, optim_stepsize * cur_lrmult)
                losses.append(batch_losses)
            logger.log(fmt_row(13, np.mean(losses, axis=0)))

        logger.log("Evaluating losses...")
        losses = []
        for batch in d.iterate_once(optim_batchsize):
            batch_losses = comp_loss(batch["ob"], batch["st"], batch["ac"],
                                     batch["atarg"], batch["vtarg"],
                                     cur_lrmult, batch["mask"])
            losses.append(batch_losses)
        meanlosses, _, _ = mpi_moments(losses, axis=0)
        logger.log(fmt_row(13, meanlosses))
        for (lossval, name) in zipsame(meanlosses, loss_names):
            logger.record_tabular("loss_" + name, lossval)
        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))

        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()

        iters_so_far += 1
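
The symmetric_training branch above augments each batch by mirroring observations (swap_legs) and swapping the two 9-dimensional halves of every action, then duplicating the advantages and returns so the policy is trained on both chiralities. Below is a minimal sketch of that augmentation; swap_obs_fn stands in for the example's swap_legs and is an assumed callable that mirrors a single observation vector, while the act_half default of 9 mirrors the action split used above.

import numpy as np

def mirror_batch(ob, ac, swap_obs_fn, act_half=9):
    # Sketch of the symmetry augmentation: append a mirrored copy of
    # every (observation, action) pair to the batch.
    sym_ob = np.stack([swap_obs_fn(o) for o in ob])
    sym_ac = np.concatenate([ac[:, act_half:], ac[:, :act_half]], axis=1)
    return np.concatenate([ob, sym_ob]), np.concatenate([ac, sym_ac])

# Usage sketch (advantages and returns are simply duplicated alongside):
# ob, ac = mirror_batch(ob, ac, swap_obs_fn=mirror_observation)
# adv_targ = np.concatenate((adv_targ, adv_targ))
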
Code Example #15
0
def ppo_learn(env, policy,
        timesteps_per_actorbatch,                       # timesteps per actor per update
        clip_param, entcoeff,                           # clipping parameter epsilon, entropy coeff
        optim_epochs, optim_stepsize, optim_batchsize,  # optimization hypers
        gamma, lam,                                     # advantage estimation
        args,
        max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0,  # time constraint
        callback=None,  # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant', # annealing for stepsize parameters (epsilon and adam)
        save_obs=False):

    # Setup losses and stuff
    # ----------------------------------------
    pi = policy
    oldpi = create_policy("oldpi", env)

    atarg = tf.placeholder(dtype=tf.float32, shape=[None])   # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])     # Empirical return

    lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult                                    # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))    # pnew / pold
    surr1 = ratio * atarg                                 # surrogate from conservative policy iteration
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # clipped surrogate
    pol_surr = - tf.reduce_mean(tf.minimum(surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))

    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]


    var_list = pi.get_trainable_variables()
    logger.log("trainable variables:", var_list)

    lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)
    assign_old_eq_new = U.function([], [], updates=[tf.assign(oldv, newv) for (oldv, newv) in
                                                    zipsame(oldpi.get_variables(), pi.get_variables())])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()
    adam.sync()

    # Initializing oldpi = pi.
    assign_old_eq_new()

    # Prepare for rollouts
    seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0

    ep_suc_so_far = 0              # number of successful episodes so far during training
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

    assert sum([max_iters>0, max_timesteps>0, max_episodes>0, max_seconds>0])==1, "Only one time constraint permitted"

    ep_mean_rews = list()
    ep_mean_lens = list()

    eval_success_rates = list()   # stores success-rate results across multiple evaluation runs.
    eval_suc_buffer = deque(maxlen=2)

    while True:
        if callback:
            callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        """ Learning rate scheduler """
        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            # cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
            # Note: despite the schedule name, this decays the multiplier exponentially with a 0.7 floor.
            cur_lrmult = 1.0
            cur_lrmult = max(cur_lrmult * np.power(0.95, float(iters_so_far) / max_iters), 0.7)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % (iters_so_far+1))  # Current iteration index

        seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
        rews = seg['rew']
        ep_rets = seg['ep_rets']
        train_sucs = seg['suc']
        mc_rets = seg['mcreturn']
        vpredbefore = seg['vpred']
        tdtarget = seg['tdtarget']

        """ In case of collecting real-time sim data and its vpred for further debugging """
        sim_data_name = 'sim_data'
        with open(args['RUN_DIR'] + '/' + sim_data_name + '.csv', 'a') as f:
            vpred_shaped = vpredbefore.reshape(-1, 1)
            atarg_shaped = atarg.reshape(-1,1)
            tdlamret_shaped = tdlamret.reshape(-1,1)
            tdtarget_shaped = tdtarget.reshape(-1,1)
            rews_shaped = rews.reshape(-1,1)

            log_data = np.concatenate((ob, vpred_shaped, atarg_shaped, tdlamret_shaped, tdtarget_shaped, rews_shaped), axis=1)

            if args['gym_env'] == 'QuadTakeOffHoverEnv-v0':
                log_df = pd.DataFrame(log_data, columns=['z', 'vx', 'vy', 'vz', 'roll', 'pitch', 'yaw', 'roll_w', 'pitch_w', 'yaw_w', 'vpred', 'atarg', 'tdlamret', 'tdtarget','rews'])
            else:
                raise ValueError("invalid env !!!")
            log_df.to_csv(f, header=True)

        """ Optimization """
        atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate
        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent)
        optim_batchsize = optim_batchsize or ob.shape[0]

        # update pi.ob_rms based on the most recent ob
        if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob)  # update running mean/std for policy

        assign_old_eq_new()  # set old parameter values to new parameter values

        logger.log("Optimizing...")
        logger.log(fmt_row(13, loss_names))

        # Here we do a bunch of optimization epochs over the data
        kl_threshold = 0.05
        for _ in range(optim_epochs):
            losses = []  # list of sublists, each of which gives the loss based on a set of samples with size "optim_batchsize"
            grads = []   # list of sublists, each of which gives the gradients w.r.t all variables based on a set of samples with size "optim_batchsize"
            for batch in d.iterate_once(optim_batchsize):
                *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult)
                if np.isnan(g).any():
                    logger.log("NaN detected in gradients, skipping further updates!")
                    break
                if newlosses[3] < kl_threshold:
                    adam.update(g, optim_stepsize * cur_lrmult)
                else:
                    logger.log("KL divergence %f exceeds kl_threshold %f, stopping updates early!" % (newlosses[3], kl_threshold))
                    break  # only exits the inner minibatch loop
                grads.append(g)
                losses.append(newlosses)

        logger.log("Evaluating losses...")
        losses = []
        for batch in d.iterate_once(optim_batchsize):
            newlosses = compute_losses(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult)
            losses.append(newlosses)
        meanlosses, _, _ = mpi_moments(losses, axis=0)
        logger.log(fmt_row(13, meanlosses))
        for (lossval, name) in zipsame(meanlosses, loss_names):
            logger.record_tabular("loss_"+name, lossval)

        logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret))
        lrlocal = (seg["ep_lens"], seg["ep_rets"])         # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)

        ep_mean_lens.append(np.mean(lenbuffer))
        ep_mean_rews.append(np.mean(rewbuffer))

        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        logger.record_tabular("EpRewMeanThisIter", np.mean(seg["ep_rets"]))
        logger.record_tabular("EpSuccessThisIter", Counter(train_sucs)[True])
        logger.record_tabular("SucRateThisIter", Counter(train_sucs)[True] / len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        ep_suc_so_far += Counter(train_sucs)[True]
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("EpSuccessSoFar", ep_suc_so_far)
        logger.record_tabular("SucRateSoFar", ep_suc_so_far/episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)

        if MPI.COMM_WORLD.Get_rank()==0:
            logger.dump_tabular()

        """ Evaluation """
        EVALUATION_FREQUENCY = 10  # evaluate every 10 training iterations
        if iters_so_far % EVALUATION_FREQUENCY == 0:

            eval_max_iters = 5
            eval_iters_so_far = 0
            eval_timesteps_per_actorbatch = timesteps_per_actorbatch

            eval_lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
            eval_rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

            eval_episodes_so_far = 0
            eval_timesteps_so_far = 0
            eval_success_episodes_so_far = 0

            # prepare eval episode generator
            eval_seg_gen = traj_segment_generator(pi, env, eval_timesteps_per_actorbatch, stochastic=False)

            logger.log("********** Start evaluating ... ************")
            while True:
                if eval_max_iters and eval_iters_so_far >= eval_max_iters:
                    break

                logger.log("********** Eval Iteration %i ************" %(eval_iters_so_far+1))

                eval_seg = eval_seg_gen.__next__()

                eval_lrlocal = (eval_seg["ep_lens"], eval_seg["ep_rets"])    # local values
                eval_listoflrpairs = MPI.COMM_WORLD.allgather(eval_lrlocal)  # list of tuples
                eval_lens, eval_rews = map(flatten_lists, zip(*eval_listoflrpairs))
                eval_lenbuffer.extend(eval_lens)
                eval_rewbuffer.extend(eval_rews)
                logger.record_tabular("EpLenMean", np.mean(eval_lenbuffer))
                logger.record_tabular("EpRewMean", np.mean(eval_rewbuffer))
                logger.record_tabular("EpThisIter", len(eval_lens))
                eval_sucs = eval_seg["suc"]
                logger.record_tabular("EpSuccessThisIter", Counter(eval_sucs)[True])


                eval_episodes_so_far += len(eval_lens)
                eval_timesteps_so_far += sum(eval_lens)
                eval_success_episodes_so_far += Counter(eval_sucs)[True]
                logger.record_tabular("EpisodesSoFar", eval_episodes_so_far)
                logger.record_tabular("TimestepsSoFar", eval_timesteps_so_far)
                logger.record_tabular("EpisodesSuccessSoFar", eval_success_episodes_so_far)
                logger.record_tabular("SuccessRateSoFar", eval_success_episodes_so_far * 1.0 / eval_episodes_so_far)

                eval_iters_so_far += 1
                if MPI.COMM_WORLD.Get_rank() == 0:
                    logger.dump_tabular()
            # save success rate from each evaluation into global list
            eval_success_rates.append(eval_success_episodes_so_far * 1.0 / eval_episodes_so_far)
            eval_suc_buffer.append(eval_success_episodes_so_far * 1.0 / eval_episodes_so_far)

        """ Saving model and statistics """
        MODEL_SAVING_FREQ = 30  # save the model and statistics every 30 iterations
        if iters_so_far % MODEL_SAVING_FREQ == 0:
            pi.save_model(args['MODEL_DIR'], iteration=iters_so_far)

            # save necessary training statistics
            with open(args['RESULT_DIR'] + '/train_reward_' + 'iter_' + str(iters_so_far) + '.pkl', 'wb') as f_train:
                pickle.dump(ep_mean_rews, f_train)

            # save necessary evaluation statistics
            with open(args['RESULT_DIR'] + '/eval_success_rate_' + 'iter_' + str(iters_so_far) + '.pkl', 'wb') as f_eval:
                pickle.dump(eval_success_rates, f_eval)

        """ Plotting and saving statistics """
        PLOT_FREQUENCY = 10  # plot every 10 iterations
        if iters_so_far % PLOT_FREQUENCY == 0:
            # plot training reward performance
            train_plot_x = np.arange(len(ep_mean_rews)) + 1
            train_plot_x = np.insert(train_plot_x, 0, 0)
            train_plot_y = np.insert(ep_mean_rews, 0, ep_mean_rews[0])
            plot_performance(x=train_plot_x, y=train_plot_y, ylabel=r'episode mean reward at each iteration',
                             xlabel='ppo iterations', figfile=os.path.join(args['FIGURE_DIR'], 'train_reward'), title='TRAIN')

            # plot evaluation success rate
            eval_plot_x = (np.arange(len(eval_success_rates)) + 1) * EVALUATION_FREQUENCY
            eval_plot_x = np.insert(eval_plot_x, 0, 0)
            eval_plot_y = np.insert(eval_success_rates, 0, 0)
            plot_performance(x=eval_plot_x, y=eval_plot_y,
                             ylabel=r'eval success rate',
                             xlabel='ppo iterations', figfile=os.path.join(args['FIGURE_DIR'], 'eval_success_rate'),
                             title="EVAL")
    return pi
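The training loop above calls add_vtarg_and_adv to attach advantage estimates and TD(λ) return targets to each rollout segment before building the Dataset, but the function itself is not shown in this listing. Below is a minimal NumPy sketch of the standard GAE(λ) recursion in the baselines style, assuming a segment dict with keys "new", "rew", "vpred", and "nextvpred"; it is illustrative only and may differ from the project's actual implementation.

import numpy as np

def add_vtarg_and_adv(seg, gamma, lam):
    # Sketch of Generalized Advantage Estimation (GAE-lambda), baselines style.
    # "new" flags the first step of each episode; "nextvpred" is the value
    # prediction for the state following the last step of the segment.
    new = np.append(seg["new"], 0)
    vpred = np.append(seg["vpred"], seg["nextvpred"])
    T = len(seg["rew"])
    seg["adv"] = gaelam = np.empty(T, dtype=np.float32)
    lastgaelam = 0.0
    for t in reversed(range(T)):
        nonterminal = 1 - new[t + 1]
        delta = seg["rew"][t] + gamma * vpred[t + 1] * nonterminal - vpred[t]
        gaelam[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
    seg["tdlamret"] = seg["adv"] + seg["vpred"]  # TD(lambda) return, the value-function target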
Code Example #16
def learn(
        env,
        policy_func,
        *,
        timesteps_per_batch,  # timesteps per actor per update
        log_every=None,
        log_dir=None,
        episodes_so_far=0,
        timesteps_so_far=0,
        iters_so_far=0,
        clip_param,
        entcoeff,  # clipping parameter epsilon, entropy coeff
        optim_epochs,
        optim_stepsize,
        optim_batchsize,  # optimization hypers
        gamma,
        lam,  # advantage estimation
        max_timesteps=0,
        max_episodes=0,
        max_iters=0,
        max_seconds=0,  # time constraint
        callback=None,  # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
        **kwargs):
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space,
                     ac_space)  # Construct network for new policy
    oldpi = policy_func("oldpi", ob_space, ac_space)  # Network for old policy
    # Target advantage function (if applicable)
    atarg = tf.placeholder(dtype=tf.float32, shape=[None])
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    # learning rate multiplier, updated with schedule
    lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[])
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg
    pol_surr = -U.mean(tf.minimum(
        surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = U.mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()
    adam.sync()

    # Prepare for rollouts
    # ----------------------------------------
    # GRASPING
    saver = tf.train.Saver(var_list=U.ALREADY_INITIALIZED, max_to_keep=1)
    checkpoint = tf.train.latest_checkpoint(log_dir)
    if checkpoint:
        print("Restoring checkpoint: {}".format(checkpoint))
        saver.restore(U.get_session(), checkpoint)
    if hasattr(env, "set_actor"):

        def actor(obs):
            return pi.act(False, obs)[0]

        env.set_actor(actor)
    if not checkpoint and hasattr(env, "warm_init_eps"):
        pretrain(pi, env)
        saver.save(U.get_session(), osp.join(log_dir, "model"))
    # /GRASPING
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_batch,
                                     stochastic=True)

    tstart = time.time()

    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0,
         max_seconds > 0]) == 1, "Only one time constraint permitted"

    while True:
        if callback:
            callback(locals(), globals())
        should_break = False
        if max_timesteps and timesteps_so_far >= max_timesteps:
            should_break = True
        elif max_episodes and episodes_so_far >= max_episodes:
            should_break = True
        elif max_iters and iters_so_far >= max_iters:
            should_break = True
        elif max_seconds and time.time() - tstart >= max_seconds:
            should_break = True

        if log_every and log_dir:
            if (iters_so_far + 1) % log_every == 0 or should_break:
                # To reduce space, don't specify global step.
                saver.save(U.get_session(), osp.join(log_dir, "model"))

            job_info = {
                'episodes_so_far': episodes_so_far,
                'iters_so_far': iters_so_far,
                'timesteps_so_far': timesteps_so_far
            }
            with open(osp.join(log_dir, "job_info_new.yaml"), 'w') as file:
                yaml.dump(job_info, file, default_flow_style=False)
                # Make sure write is instantaneous.
                file.flush()
                os.fsync(file)
            os.rename(osp.join(log_dir, "job_info_new.yaml"),
                      osp.join(log_dir, "job_info.yaml"))

        if should_break:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)

        seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
            "tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before udpate
        atarg = (atarg - atarg.mean()) / (
            atarg.std() + 1e-10)  # standardized advantage function estimate
        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                    shuffle=not pi.recurrent)
        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        assign_old_eq_new()  # set old parameter values to new parameter values
        # logger.log("Optimizing...")
        # logger.log(fmt_row(13, loss_names))
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = [
            ]  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                *newlosses, g = lossandgrad(batch["ob"], batch["ac"],
                                            batch["atarg"], batch["vtarg"],
                                            cur_lrmult)
                adam.update(g, optim_stepsize * cur_lrmult)
                losses.append(newlosses)
            # logger.log(fmt_row(13, np.mean(losses, axis=0)))

        logger.log("Evaluating losses...")
        losses = []
        for batch in d.iterate_once(optim_batchsize):
            newlosses = compute_losses(batch["ob"], batch["ac"],
                                       batch["atarg"], batch["vtarg"],
                                       cur_lrmult)
            losses.append(newlosses)
        meanlosses, _, _ = mpi_moments(losses, axis=0)
        logger.log(fmt_row(13, meanlosses))
        for (lossval, name) in zipsame(meanlosses, loss_names):
            logger.record_tabular("loss_" + name, lossval)
        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))
        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        logger.record_tabular("EpLenMean", np.mean(lens))
        logger.record_tabular("EpRewMean", np.mean(rews))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()
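These loops aggregate per-worker episode statistics with MPI.COMM_WORLD.allgather followed by flatten_lists, which is also not defined in this listing. A small sketch of that helper, assuming the baselines-style definition, is shown below together with the aggregation pattern it supports.

def flatten_lists(listoflists):
    # Concatenate a list of lists into one flat list.
    return [el for list_ in listoflists for el in list_]

# Typical use, mirroring the loop above: every MPI rank contributes its local
# (episode lengths, episode returns) pair, allgather yields one pair per rank,
# zip(*...) regroups them by field, and flatten_lists merges each field:
#   lrlocal = (seg["ep_lens"], seg["ep_rets"])
#   listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)
#   lens, rews = map(flatten_lists, zip(*listoflrpairs))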
Code Example #17
File: algorithm.py  Project: marctom/POTAIS
def learn(env,
          policy_fn,
          *,
          timesteps_per_actorbatch,
          clip_param,
          entcoeff,
          optim_epochs,
          optim_stepsize,
          optim_batchsize,
          gamma,
          lam,
          max_timesteps,
          alphas,
          schedule,
          return_mv_avg,
          adam_epsilon=1e-5):
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_fn("pi", ob_space, ac_space)
    oldpi = policy_fn("oldpi", ob_space, ac_space)
    atarg = tf.placeholder(dtype=tf.float32, shape=[None])
    new = tf.placeholder(dtype=tf.float32, shape=[None])
    ret = tf.placeholder(dtype=tf.float32, shape=[None])

    lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[])

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-entcoeff) * meanent

    log_ratio = pi.pd.logp(ac) - oldpi.pd.logp(ac)

    bool_new = tf.cast(new, tf.bool)

    def get_shift(i):
        shift = tf.concat([tf.zeros((i, )), log_ratio[:-i]], 0)
        shift = tf.where(bool_new, tf.zeros_like(shift), shift)
        for _ in range(1, i):
            shift = tf.where(
                tf.concat([tf.ones((i, ), dtype=tf.bool), bool_new[:-i]], 0),
                tf.zeros_like(shift), shift)
        return shift

    shifts = [log_ratio] + [get_shift(i) for i in range(1, len(alphas))]
    is_log_ratio = sum(b * s for b, s in zip(alphas, shifts))

    ratio = tf.exp(is_log_ratio)
    surr1 = ratio * atarg
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg
    pol_surr = -tf.reduce_mean(tf.minimum(surr1, surr2))
    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult, new],
                             losses + [U.flatgrad(total_loss, var_list)])
    adam = AdamOptimizer(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult, new], losses)

    U.initialize()

    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_actorbatch,
                                     stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()

    lenbuffer = deque(maxlen=200)
    rewbuffer = deque(maxlen=return_mv_avg)
    results = []

    while True:
        if timesteps_so_far > max_timesteps:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError()

        logger.log("********** Iteration %i ************" % iters_so_far)

        seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
            "tdlamret"]
        vpredbefore = seg["vpred"]
        atarg = (atarg - atarg.mean()) / atarg.std()
        d = Dataset(
            dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret, new=seg["new"]))
        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob)

        assign_old_eq_new()
        logger.log("Optimizing...")
        logger.log(fmt_row(13, loss_names))
        for _ in range(optim_epochs):
            losses = []
            for batch in d.iterate_once(optim_batchsize):
                *newlosses, g = lossandgrad(batch["ob"], batch["ac"],
                                            batch["atarg"], batch["vtarg"],
                                            cur_lrmult, batch["new"])
                adam.update(g, optim_stepsize * cur_lrmult)
                losses.append(newlosses)
            logger.log(fmt_row(13, np.mean(losses, axis=0)))

        logger.log("Evaluating losses...")
        losses = []
        for batch in d.iterate_once(optim_batchsize):
            newlosses = compute_losses(batch["ob"], batch["ac"],
                                       batch["atarg"], batch["vtarg"],
                                       cur_lrmult, batch["new"])
            losses.append(newlosses)

        meanlosses = np.vstack(losses).mean(0)
        logger.log(fmt_row(13, meanlosses))
        for (lossval, name) in zipsame(meanlosses, loss_names):
            logger.record_tabular("loss_" + name, lossval)
        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))
        listoflrpairs = [(seg["ep_lens"], seg["ep_rets"])]
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        results.append(np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("AvgEpisodeLen", np.mean(lenbuffer))
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed [h]", (time.time() - tstart) / 3600)
        logger.record_tabular("Timesteps/sec",
                              timesteps_so_far / (time.time() - tstart))
        logger.dump_tabular()
    return results
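The get_shift helper above folds log-ratios from earlier time steps into the importance weight and zeroes any contribution that would cross an episode boundary (new == 1). The NumPy sketch below reproduces the same shifting logic outside the TensorFlow graph purely for inspection; it is not part of the original project. With alphas = [1.0] the weighted sum reduces to the ordinary per-step log-ratio of standard PPO.

import numpy as np

def shifted_log_ratios(log_ratio, new, alphas):
    # NumPy analogue (sketch) of the graph code above.
    log_ratio = np.asarray(log_ratio, dtype=np.float64)
    new = np.asarray(new, dtype=bool)
    shifts = [log_ratio]
    for i in range(1, len(alphas)):
        # shift the log-ratios back by i steps ...
        shift = np.concatenate([np.zeros(i), log_ratio[:-i]])
        # ... and zero entries whose shifted value comes from a previous episode
        shift[new] = 0.0
        for _ in range(1, i):
            mask = np.concatenate([np.ones(i, dtype=bool), new[:-i]])
            shift[mask] = 0.0
        shifts.append(shift)
    return sum(a * s for a, s in zip(alphas, shifts))

# With alphas = [1.0], shifted_log_ratios(lr, new, [1.0]) equals lr elementwise.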
Code Example #18
def learn(
        args,
        env,
        policy_fn,
        *,
        timesteps_per_actorbatch,  # timesteps per actor per update
        clip_param,
        entcoeff,  # clipping parameter epsilon, entropy coeff
        optim_epochs,
        optim_stepsize,
        optim_batchsize,  # optimization hypers
        gamma,
        lam,  # advantage estimation
        max_timesteps=0,
        max_episodes=0,
        max_iters=0,
        max_seconds=0,  # time constraint
        callback=None,  # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
        writer=None):
    print("\nBeginning learning...\n")

    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_fn("pi", ob_space,
                   ac_space)  # Construct network for new policy
    oldpi = policy_fn("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.compat.v1.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.compat.v1.placeholder(dtype=tf.float32,
                                   shape=[None])  # Empirical return

    lrmult = tf.compat.v1.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = {}
    ob['adj'] = U.get_placeholder_cached(name="adj")
    ob['node'] = U.get_placeholder_cached(name="node")

    ob_gen = {}
    ob_gen['adj'] = U.get_placeholder(
        shape=[None, ob_space['adj'].shape[0], None, None],
        dtype=tf.float32,
        name='adj_gen')
    ob_gen['node'] = U.get_placeholder(
        shape=[None, 1, None, ob_space['node'].shape[2]],
        dtype=tf.float32,
        name='node_gen')

    ob_real = {}
    ob_real['adj'] = U.get_placeholder(
        shape=[None, ob_space['adj'].shape[0], None, None],
        dtype=tf.float32,
        name='adj_real')
    ob_real['node'] = U.get_placeholder(
        shape=[None, 1, None, ob_space['node'].shape[2]],
        dtype=tf.float32,
        name='node_real')

    ac = tf.compat.v1.placeholder(dtype=tf.int64,
                                  shape=[None, 4],
                                  name='ac_real')

    ## PPO loss
    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-entcoeff) * meanent

    pi_logp = pi.pd.logp(ac)
    oldpi_logp = oldpi.pd.logp(ac)
    ratio_log = pi.pd.logp(ac) - oldpi.pd.logp(ac)

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param,
                             1.0 + clip_param) * atarg  #
    pol_surr = -tf.reduce_mean(tf.minimum(
        surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    ## Expert loss
    loss_expert = -tf.reduce_mean(pi_logp)

    ## Discriminator loss
    step_pred_real, step_logit_real = discriminator_net(ob_real,
                                                        args,
                                                        name='d_step')
    step_pred_gen, step_logit_gen = discriminator_net(ob_gen,
                                                      args,
                                                      name='d_step')
    loss_d_step_real = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(
            logits=step_logit_real,
            labels=tf.ones_like(step_logit_real) * 0.9))
    loss_d_step_gen = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(
            logits=step_logit_gen, labels=tf.zeros_like(step_logit_gen)))
    loss_d_step = loss_d_step_real + loss_d_step_gen
    if args.gan_type == 'normal':
        loss_g_step_gen = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(
                logits=step_logit_gen, labels=tf.zeros_like(step_logit_gen)))
    elif args.gan_type == 'recommend':
        loss_g_step_gen = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(
                logits=step_logit_gen,
                labels=tf.ones_like(step_logit_gen) * 0.9))
    elif args.gan_type == 'wgan':
        loss_d_step, _, _ = discriminator(ob_real, ob_gen, args, name='d_step')
        loss_d_step = loss_d_step * -1
        loss_g_step_gen, _ = discriminator_net(ob_gen, args, name='d_step')

    final_pred_real, final_logit_real = discriminator_net(ob_real,
                                                          args,
                                                          name='d_final')
    final_pred_gen, final_logit_gen = discriminator_net(ob_gen,
                                                        args,
                                                        name='d_final')
    loss_d_final_real = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(
            logits=final_logit_real,
            labels=tf.ones_like(final_logit_real) * 0.9))
    loss_d_final_gen = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(
            logits=final_logit_gen, labels=tf.zeros_like(final_logit_gen)))
    loss_d_final = loss_d_final_real + loss_d_final_gen
    if args.gan_type == 'normal':
        loss_g_final_gen = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(
                logits=final_logit_gen, labels=tf.zeros_like(final_logit_gen)))
    elif args.gan_type == 'recommend':
        loss_g_final_gen = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(
                logits=final_logit_gen,
                labels=tf.ones_like(final_logit_gen) * 0.9))
    elif args.gan_type == 'wgan':
        loss_d_final, _, _ = discriminator(ob_real,
                                           ob_gen,
                                           args,
                                           name='d_final')
        loss_d_final = loss_d_final * -1
        loss_g_final_gen, _ = discriminator_net(ob_gen, args, name='d_final')

    var_list_pi = pi.get_trainable_variables()
    var_list_pi_stop = [
        var for var in var_list_pi
        if ('emb' in var.name) or ('gcn' in var.name) or ('stop' in var.name)
    ]
    var_list_d_step = [
        var for var in tf.compat.v1.global_variables() if 'd_step' in var.name
    ]
    var_list_d_final = [
        var for var in tf.compat.v1.global_variables() if 'd_final' in var.name
    ]

    ## debug
    debug = {}

    ## loss update function
    lossandgrad_ppo = U.function([
        ob['adj'], ob['node'], ac, pi.ac_real, oldpi.ac_real, atarg, ret,
        lrmult
    ], losses + [U.flatgrad(total_loss, var_list_pi)])
    lossandgrad_expert = U.function(
        [ob['adj'], ob['node'], ac, pi.ac_real],
        [loss_expert, U.flatgrad(loss_expert, var_list_pi)])
    lossandgrad_expert_stop = U.function(
        [ob['adj'], ob['node'], ac, pi.ac_real],
        [loss_expert, U.flatgrad(loss_expert, var_list_pi_stop)])
    lossandgrad_d_step = U.function(
        [ob_real['adj'], ob_real['node'], ob_gen['adj'], ob_gen['node']],
        [loss_d_step, U.flatgrad(loss_d_step, var_list_d_step)])
    lossandgrad_d_final = U.function(
        [ob_real['adj'], ob_real['node'], ob_gen['adj'], ob_gen['node']],
        [loss_d_final,
         U.flatgrad(loss_d_final, var_list_d_final)])
    loss_g_gen_step_func = U.function([ob_gen['adj'], ob_gen['node']],
                                      loss_g_step_gen)
    loss_g_gen_final_func = U.function([ob_gen['adj'], ob_gen['node']],
                                       loss_g_final_gen)

    adam_pi = MpiAdam(var_list_pi, epsilon=adam_epsilon)
    adam_pi_stop = MpiAdam(var_list_pi_stop, epsilon=adam_epsilon)
    adam_d_step = MpiAdam(var_list_d_step, epsilon=adam_epsilon)
    adam_d_final = MpiAdam(var_list_d_final, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.compat.v1.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])

    compute_losses = U.function([
        ob['adj'], ob['node'], ac, pi.ac_real, oldpi.ac_real, atarg, ret,
        lrmult
    ], losses)

    # Prepare for rollouts
    # ----------------------------------------
    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    lenbuffer_valid = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards
    rewbuffer_env = deque(maxlen=100)  # rolling buffer for episode rewards
    rewbuffer_d_step = deque(maxlen=100)  # rolling buffer for episode rewards
    rewbuffer_d_final = deque(maxlen=100)  # rolling buffer for episode rewards
    rewbuffer_final = deque(maxlen=100)  # rolling buffer for episode rewards
    rewbuffer_final_stat = deque(
        maxlen=100)  # rolling buffer for episode rewards

    seg_gen = traj_segment_generator(args, pi, env, timesteps_per_actorbatch,
                                     True, loss_g_gen_step_func,
                                     loss_g_gen_final_func)

    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0,
         max_seconds > 0]) == 1, "Only one time constraint permitted"
    if args.load == 1:
        try:
            fname = './ckpt/' + args.name_full_load
            sess = tf.get_default_session()
            # sess.run(tf.compat.v1.global_variables_initializer())
            saver = tf.train.Saver(var_list_pi)
            saver.restore(sess, fname)
            iters_so_far = int(fname.split('_')[-1]) + 1
            print('model restored!', fname, 'iters_so_far:', iters_so_far)
        except Exception:
            print(fname, 'checkpoint not found, starting from iteration 0')

    U.initialize()
    adam_pi.sync()
    adam_pi_stop.sync()
    adam_d_step.sync()
    adam_d_final.sync()

    counter = 0
    level = 0
    ## start training
    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        # logger.log("********** Iteration %i ************"%iters_so_far)

        seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)
        ob_adj, ob_node, ac, atarg, tdlamret = seg["ob_adj"], seg[
            "ob_node"], seg["ac"], seg["adv"], seg["tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before udpate
        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate
        d = Dataset(dict(ob_adj=ob_adj,
                         ob_node=ob_node,
                         ac=ac,
                         atarg=atarg,
                         vtarg=tdlamret),
                    shuffle=not pi.recurrent)
        optim_batchsize = optim_batchsize or ob_adj.shape[0]

        # inner training loop, train policy
        for i_optim in range(optim_epochs):

            loss_expert = 0
            loss_expert_stop = 0
            g_expert = 0
            g_expert_stop = 0

            loss_d_step = 0
            loss_d_final = 0
            g_ppo = 0
            g_d_step = 0
            g_d_final = 0

            pretrain_shift = 5
            ## Expert
            if iters_so_far >= args.expert_start and iters_so_far <= args.expert_end + pretrain_shift:
                ## Expert train
                # # # learn how to stop
                ob_expert, ac_expert = env.get_expert(optim_batchsize)
                loss_expert, g_expert = lossandgrad_expert(
                    ob_expert['adj'], ob_expert['node'], ac_expert, ac_expert)
                loss_expert = np.mean(loss_expert)

            ## PPO
            if iters_so_far >= args.rl_start and iters_so_far <= args.rl_end:
                assign_old_eq_new(
                )  # set old parameter values to new parameter values
                batch = d.next_batch(optim_batchsize)
                # ppo
                if iters_so_far >= args.rl_start + pretrain_shift:  # start the generator only after the discriminator has trained for a while
                    *newlosses, g_ppo = lossandgrad_ppo(
                        batch["ob_adj"], batch["ob_node"], batch["ac"],
                        batch["ac"], batch["ac"], batch["atarg"],
                        batch["vtarg"], cur_lrmult)
                    losses_ppo = newlosses

                if args.has_d_step == 1 and i_optim >= optim_epochs // 2:
                    # update step discriminator
                    ob_expert, _ = env.get_expert(
                        optim_batchsize,
                        curriculum=args.curriculum,
                        level_total=args.curriculum_num,
                        level=level)
                    loss_d_step, g_d_step = lossandgrad_d_step(
                        ob_expert["adj"], ob_expert["node"], batch["ob_adj"],
                        batch["ob_node"])
                    adam_d_step.update(g_d_step, optim_stepsize * cur_lrmult)
                    loss_d_step = np.mean(loss_d_step)

                if args.has_d_final == 1 and i_optim >= optim_epochs // 4 * 3:
                    # update final discriminator
                    ob_expert, _ = env.get_expert(
                        optim_batchsize,
                        is_final=True,
                        curriculum=args.curriculum,
                        level_total=args.curriculum_num,
                        level=level)
                    seg_final_adj, seg_final_node = traj_final_generator(
                        pi, copy.deepcopy(env), optim_batchsize, True)
                    # update final discriminator
                    loss_d_final, g_d_final = lossandgrad_d_final(
                        ob_expert["adj"], ob_expert["node"], seg_final_adj,
                        seg_final_node)
                    adam_d_final.update(g_d_final, optim_stepsize * cur_lrmult)

            # update generator
            adam_pi.update(0.2 * g_ppo + 0.05 * g_expert,
                           optim_stepsize * cur_lrmult)

        # WGAN
        # if args.has_d_step == 1:
        #     clip_D = [p.assign(tf.clip_by_value(p, -0.01, 0.01)) for p in var_list_d_step]
        # if args.has_d_final == 1:
        #     clip_D = [p.assign(tf.clip_by_value(p, -0.01, 0.01)) for p in var_list_d_final]
        #

        ## PPO val
        # if iters_so_far >= args.rl_start and iters_so_far <= args.rl_end:
        # logger.log("Evaluating losses...")
        losses = []
        for batch in d.iterate_once(optim_batchsize):
            newlosses = compute_losses(batch["ob_adj"], batch["ob_node"],
                                       batch["ac"], batch["ac"], batch["ac"],
                                       batch["atarg"], batch["vtarg"],
                                       cur_lrmult)
            losses.append(newlosses)
        meanlosses, _, _ = mpi_moments(losses, axis=0)
        # logger.log(fmt_row(13, meanlosses))

        if writer is not None:
            writer.add_scalar("loss_expert", loss_expert, iters_so_far)
            writer.add_scalar("loss_expert_stop", loss_expert_stop,
                              iters_so_far)
            writer.add_scalar("loss_d_step", loss_d_step, iters_so_far)
            writer.add_scalar("loss_d_final", loss_d_final, iters_so_far)
            writer.add_scalar('grad_expert_min', np.amin(g_expert),
                              iters_so_far)
            writer.add_scalar('grad_expert_max', np.amax(g_expert),
                              iters_so_far)
            writer.add_scalar('grad_expert_norm', np.linalg.norm(g_expert),
                              iters_so_far)
            writer.add_scalar('grad_expert_stop_min', np.amin(g_expert_stop),
                              iters_so_far)
            writer.add_scalar('grad_expert_stop_max', np.amax(g_expert_stop),
                              iters_so_far)
            writer.add_scalar('grad_expert_stop_norm',
                              np.linalg.norm(g_expert_stop), iters_so_far)
            writer.add_scalar('grad_rl_min', np.amin(g_ppo), iters_so_far)
            writer.add_scalar('grad_rl_max', np.amax(g_ppo), iters_so_far)
            writer.add_scalar('grad_rl_norm', np.linalg.norm(g_ppo),
                              iters_so_far)
            writer.add_scalar('g_d_step_min', np.amin(g_d_step), iters_so_far)
            writer.add_scalar('g_d_step_max', np.amax(g_d_step), iters_so_far)
            writer.add_scalar('g_d_step_norm', np.linalg.norm(g_d_step),
                              iters_so_far)
            writer.add_scalar('g_d_final_min', np.amin(g_d_final),
                              iters_so_far)
            writer.add_scalar('g_d_final_max', np.amax(g_d_final),
                              iters_so_far)
            writer.add_scalar('g_d_final_norm', np.linalg.norm(g_d_final),
                              iters_so_far)
            writer.add_scalar('learning_rate', optim_stepsize * cur_lrmult,
                              iters_so_far)

        for (lossval, name) in zipsame(meanlosses, loss_names):
            # logger.record_tabular("loss_"+name, lossval)
            if writer is not None:
                writer.add_scalar("loss_" + name, lossval, iters_so_far)
        # logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret))
        if writer is not None:
            writer.add_scalar("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret),
                              iters_so_far)
        lrlocal = (seg["ep_lens"], seg["ep_lens_valid"], seg["ep_rets"],
                   seg["ep_rets_env"], seg["ep_rets_d_step"],
                   seg["ep_rets_d_final"], seg["ep_final_rew"],
                   seg["ep_final_rew_stat"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, lens_valid, rews, rews_env, rews_d_step, rews_d_final, rews_final, rews_final_stat = map(
            flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        lenbuffer_valid.extend(lens_valid)
        rewbuffer.extend(rews)
        rewbuffer_d_step.extend(rews_d_step)
        rewbuffer_d_final.extend(rews_d_final)
        rewbuffer_env.extend(rews_env)
        rewbuffer_final.extend(rews_final)
        rewbuffer_final_stat.extend(rews_final_stat)
        # logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        # logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        # logger.record_tabular("EpThisIter", len(lens))
        if writer is not None:
            writer.add_scalar("EpLenMean", np.mean(lenbuffer), iters_so_far)
            writer.add_scalar("EpLenValidMean", np.mean(lenbuffer_valid),
                              iters_so_far)
            writer.add_scalar("EpRewMean", np.mean(rewbuffer), iters_so_far)
            writer.add_scalar("EpRewDStepMean", np.mean(rewbuffer_d_step),
                              iters_so_far)
            writer.add_scalar("EpRewDFinalMean", np.mean(rewbuffer_d_final),
                              iters_so_far)
            writer.add_scalar("EpRewEnvMean", np.mean(rewbuffer_env),
                              iters_so_far)
            writer.add_scalar("EpRewFinalMean", np.mean(rewbuffer_final),
                              iters_so_far)
            writer.add_scalar("EpRewFinalStatMean",
                              np.mean(rewbuffer_final_stat), iters_so_far)
            writer.add_scalar("EpThisIter", len(lens), iters_so_far)
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        # logger.record_tabular("EpisodesSoFar", episodes_so_far)
        # logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        # logger.record_tabular("TimeElapsed", time.time() - tstart)
        if writer is not None:
            writer.add_scalar("EpisodesSoFar", episodes_so_far, iters_so_far)
            writer.add_scalar("TimestepsSoFar", timesteps_so_far, iters_so_far)
            writer.add_scalar("TimeElapsed",
                              time.time() - tstart, iters_so_far)

        if MPI.COMM_WORLD.Get_rank() == 0:
            with open('molecule_gen/' + args.name_full + '.csv', 'a') as f:
                f.write('***** Iteration {} *****\n'.format(iters_so_far))
            # save
            if iters_so_far % args.save_every == 0:
                fname = './ckpt/' + args.name_full + '_' + str(iters_so_far)
                saver = tf.compat.v1.train.Saver(var_list_pi)
                saver.save(tf.compat.v1.get_default_session(), fname)
                print('model saved!', fname)
                # fname = os.path.join(ckpt_dir, task_name)
                # os.makedirs(os.path.dirname(fname), exist_ok=True)
                # saver = tf.train.Saver()
                # saver.save(tf.get_default_session(), fname)
            # if iters_so_far==args.load_step:
        iters_so_far += 1
        counter += 1
        # advance the curriculum level every curriculum_step iterations
        if counter % args.curriculum_step == 0 and counter // args.curriculum_step < args.curriculum_num:
            level += 1
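The discriminator losses in this example use one-sided label smoothing: real samples are pushed toward a target of 0.9 rather than 1.0, a common stabilization trick for GAN discriminators. The standalone sketch below isolates that loss with generic logits; discriminator_net and the surrounding graph are assumed, not reproduced.

import tensorflow as tf

def smoothed_discriminator_loss(logit_real, logit_gen, real_target=0.9):
    # One-sided label smoothing: real targets are real_target, generated targets are 0.
    loss_real = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(
            logits=logit_real, labels=tf.ones_like(logit_real) * real_target))
    loss_gen = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(
            logits=logit_gen, labels=tf.zeros_like(logit_gen)))
    return loss_real + loss_gen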
Code Example #19
def learn(
    env,
    pi,
    oldpi,
    *,
    timesteps_per_batch,  # timesteps per actor per update
    clip_param,
    entcoeff,  # clipping parameter epsilon, entropy coeff
    optim_epochs,
    optim_stepsize,
    optim_batchsize,  # optimization hypers
    gamma,
    lam,  # advantage estimation
    max_timesteps=0,
    max_episodes=0,
    max_iters=0,
    max_seconds=0,  # time constraint
    callback=None,  # you can do anything in the callback, since it takes locals(), globals()
    adam_epsilon=1e-5,
    schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
    test_envs=[
    ]  # can add a list of test environment to collect rewards if needed
):
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    #pi = policy_func("pi", ob_space, ac_space) # Construct network for new policy
    #oldpi = policy_func("oldpi", ob_space, ac_space) # Network for old policy
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg  #
    pol_surr = -U.mean(tf.minimum(
        surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = U.mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])

    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()
    adam.sync()

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_batch,
                                     stochastic=True)
    if test_envs:
        test_gens = []
        for test_env in test_envs:
            test_gen = traj_segment_generator(pi,
                                              test_env,
                                              timesteps_per_batch,
                                              stochastic=True)
            test_gens.append(test_gen)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=50)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=50)  # rolling buffer for episode rewards
    test_rewbuffers = [deque(maxlen=50) for test_env in test_envs]

    # Maithra edits: add lists to return logs
    ep_lengths = []
    ep_rewards = []
    ep_labels = []
    ep_actions = []
    ep_correct_actions = []
    ep_obs = []
    # log results for test environment
    ep_rewards_test = [[] for test_env in test_envs]

    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0,
         max_seconds > 0]) == 1, "Only one time constraint permitted"

    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)

        seg = seg_gen.__next__()
        if test_envs:
            segs_test = []
            for test_gen in test_gens:
                segs_test.append(test_gen.__next__())
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret, label = seg["ob"], seg["ac"], seg["adv"], seg[
            "tdlamret"], seg["label"]
        if test_envs:
            for i, seg_test in enumerate(segs_test):
                test_rews = seg_test["ep_rets"]
                test_rewbuffers[i].extend(test_rews)
                ep_rewards_test[i].append(np.mean(test_rewbuffers[i]))

        vpredbefore = seg["vpred"]  # predicted value function before udpate
        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate
        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                    shuffle=not pi.recurrent)
        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        assign_old_eq_new()  # set old parameter values to new parameter values
        logger.log("Optimizing...")
        logger.log(fmt_row(13, loss_names))
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = [
            ]  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                *newlosses, g = lossandgrad(batch["ob"], batch["ac"],
                                            batch["atarg"], batch["vtarg"],
                                            cur_lrmult)
                adam.update(g, optim_stepsize * cur_lrmult)
                losses.append(newlosses)

            logger.log(fmt_row(13, np.mean(losses, axis=0)))

        logger.log("Evaluating losses...")
        losses = []
        for batch in d.iterate_once(optim_batchsize):
            newlosses = compute_losses(batch["ob"], batch["ac"],
                                       batch["atarg"], batch["vtarg"],
                                       cur_lrmult)
            losses.append(newlosses)
        meanlosses, _, _ = mpi_moments(losses, axis=0)
        logger.log(fmt_row(13, meanlosses))
        for (lossval, name) in zipsame(meanlosses, loss_names):
            logger.record_tabular("loss_" + name, lossval)
        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))
        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))

        # Maithra edit: append intermediate results onto returned logs
        ep_lengths.append(np.mean(lenbuffer))
        ep_rewards.append(np.mean(rewbuffer))
        ep_labels.append(deepcopy(label))
        ep_actions.append(deepcopy(ac))
        ep_obs.append(deepcopy(ob))
        # compute mean of correct actions and append, ignoring actions
        # where either choice could be right
        count = 0
        idxs = np.all((label == [1, 1]), axis=1)
        # removing for now: count += np.sum(idxs)
        new_label = label[np.invert(idxs)]
        new_ac = ac[np.invert(idxs)]
        count += np.sum((new_ac == np.argmax(new_label, axis=1)))
        # changing ep_correct_actions.append(count/len(label))
        ep_correct_actions.append(count / (len(label) - np.sum(idxs)))

        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()

    #Maithra edit
    return pi, {
        "lengths": ep_lengths,
        "rewards": ep_rewards,
        "labels": ep_labels,
        "actions": ep_actions,
        "correct_actions": ep_correct_actions,
        "obs": ep_obs,
        "test_rews": ep_rewards_test
    }
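This variant returns both the trained policy and a dictionary of per-iteration logs. A hypothetical usage sketch follows; env, test_env, and make_policy are placeholders standing in for whatever environment and policy constructors the surrounding project provides, so only the call signature and the returned log keys come from the code above.

# Hypothetical caller sketch (placeholder env/test_env/make_policy).
pi, logs = learn(
    env,
    make_policy("pi", env.observation_space, env.action_space),
    make_policy("oldpi", env.observation_space, env.action_space),
    timesteps_per_batch=2048, clip_param=0.2, entcoeff=0.0,
    optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
    gamma=0.99, lam=0.95, max_iters=100,
    schedule='constant', test_envs=[test_env])

train_rewards = logs["rewards"]       # running mean of training episode returns per iteration
test_rewards = logs["test_rews"][0]   # running mean returns on the first test environment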
Code Example #20
def learn(
        env,
        policy_fn,
        *,
        timesteps_per_actorbatch,  # timesteps per actor per update
        clip_param,
        entcoeff,  # clipping parameter epsilon, entropy coeff
        optim_epochs,
        optim_stepsize,
        optim_batchsize,  # optimization hypers
        gamma,
        lam,  # advantage estimation
        max_timesteps=0,
        max_episodes=0,
        max_iters=0,
        max_seconds=0,  # time constraint
        callback=None,  # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
        identifier,
        save_result=True,
        save_interval=100,
        reward_list=[],
        cont=False,
        play=False,
        iter,
        action_repeat=1):
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    mirror = hasattr(env, 'mirror_id')
    mirror_id = env.mirror_id if mirror else None
    pi = policy_fn("pi", ob_space,
                   ac_space)  # Construct network for new policy
    oldpi = policy_fn("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])
    if mirror:
        mirror_ob = U.get_placeholder(name="mirror_ob",
                                      dtype=tf.float32,
                                      shape=[None] + list(ob_space.shape))
        mirror_ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param,
                             1.0 + clip_param) * atarg  #
    pol_surr = -tf.reduce_mean(tf.minimum(
        surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    sym_loss = 4 * tf.reduce_mean(tf.square(ac - mirror_ac)) if mirror else 0  # mirror-symmetry loss (only when env defines mirror_id)
    total_loss = pol_surr + pol_entpen + vf_loss + sym_loss

    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]
    if mirror:
        losses.append(sym_loss)
        loss_names.append("sym_loss")

    var_list = pi.get_trainable_variables()
    inputs = [ob, ac, atarg, ret, lrmult]
    if mirror:
        inputs += [mirror_ob, mirror_ac]
    lossandgrad = U.function(inputs,
                             losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function(inputs, losses)

    if play:
        return pi

    if cont:
        load_state(identifier, iter)
    else:
        U.initialize()
        iter = 0
    adam.sync()

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_actorbatch,
                                     stochastic=True,
                                     mirror_id=mirror_id,
                                     action_repeat=action_repeat)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = int(iter)
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards
    rewbuffer_ori = deque(maxlen=100)

    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0,
         max_seconds > 0]) == 1, "Only one time constraint permitted"

    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.log("********** Iteration %i ************" % iters_so_far)

        seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
            "tdlamret"]
        if mirror:
            mirror_ob, mirror_ac = seg["mirror_ob"], seg["mirror_ac"]

        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate
        d_dict = dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret)
        if mirror:
            d_dict["mirror_ob"] = mirror_ob
            d_dict["mirror_ac"] = mirror_ac
        d = Dataset(d_dict, shuffle=not pi.recurrent)
        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        assign_old_eq_new()  # set old parameter values to new parameter values
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = [
            ]  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                batches = [
                    batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"],
                    cur_lrmult
                ]
                if mirror:
                    batches += [batch["mirror_ob"], batch["mirror_ac"]]
                *newlosses, g = lossandgrad(*batches)
                adam.update(g, optim_stepsize * cur_lrmult)
                losses.append(newlosses)

        losses = []
        for batch in d.iterate_once(optim_batchsize):
            batches = [
                batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"],
                cur_lrmult
            ]
            if mirror:
                batches += [batch["mirror_ob"], batch["mirror_ac"]]
            newlosses = compute_losses(*batches)
            losses.append(newlosses)
        meanlosses, _, _ = mpi_moments(losses, axis=0)

        for (lossval, name) in zipsame(meanlosses, loss_names):
            logger.record_tabular("loss_" + name, lossval)
        # logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret))
        lrlocal = (seg["ep_lens"], seg["ep_rets"], seg["ep_rets_ori"]
                   )  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews, rews_ori = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        rewbuffer_ori.extend(rews_ori)
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpRewOriMean", np.mean(rewbuffer_ori))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)

        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()

            reward_list.append(np.mean(rewbuffer_ori))
            if save_result and iters_so_far % save_interval == 0:
                save_state(identifier, iters_so_far)
                save_rewards(reward_list, identifier, iters_so_far)
                logger.log('Model and reward saved')

    return pi
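
A quick reference: the clipped surrogate assembled above from `surr1`, `surr2`, and `pol_surr` amounts to the minimal NumPy sketch below. The function and argument names are illustrative only and do not appear in the example.

import numpy as np

def clipped_surrogate_loss(logp_new, logp_old, adv, clip_eps):
    """Negated PPO pessimistic surrogate (-L^CLIP), so it can be minimized."""
    ratio = np.exp(logp_new - logp_old)  # pi_new(a|s) / pi_old(a|s)
    unclipped = ratio * adv
    clipped = np.clip(ratio, 1.0 - clip_eps, 1.0 + clip_eps) * adv
    return -np.mean(np.minimum(unclipped, clipped))
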
Code example #21
def learn(
    # =========== modified part begins =========== #
    env_id,
    seed,
    robot,  # robot class with GMM params
    joint_optimization_iters,  # total number of joint optimization iterations
    design_iters,  # number of samples when updating physical design in each joint optimization iteration
    policy_iters,  # number of samples when updating robot policy in each joint optimization iteration
    # ============ modified part ends ============ #
    policy_func,
    *,
    timesteps_per_actorbatch,  # timesteps per actor per update
    clip_param,
    entcoeff,  # clipping parameter epsilon, entropy coeff
    optim_epochs,
    optim_stepsize,
    optim_batchsize,  # optimization hypers
    gamma,
    lam,  # advantage estimation
    max_timesteps=0,
    max_episodes=0,
    max_iters=0,
    max_seconds=0,  # time constraint
    callback=None,  # you can do anything in the callback, since it takes locals(), globals()
    adam_epsilon=1e-5,
    schedule='constant'  # annealing for stepsize parameters (epsilon and adam)
):

    # ================================== modification 1 ================================== #
    """
    input:  replace "env" (env class) with "env_id" (string)
            add "seed" (int)
        reason: to enable gym.make() during training
        modification detail: add following lines into learn()
            env = gym.make(env_id)
            env = bench.Monitor(env, logger.get_dir())
            env.seed(seed)
            env.close() # added at the end of learn()
    """
    import roboschool, gym
    from baselines import bench
    env = gym.make(env_id)
    env = bench.Monitor(env, logger.get_dir())
    env.seed(seed)
    # ================================== modification 1 ================================== #

    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space

    # policy_func is the initialization of NN
    # NN structure:
    #   state -> (num_hid_layers) fully-connected layers with (hid_size) units -> (action, predicted value)
    #       num_hid_layers, hid_size: set in the file that calls "learn"
    pi = policy_func("pi", ob_space,
                     ac_space)  # Construct network for new policy
    oldpi = policy_func("oldpi", ob_space, ac_space)  # Network for old policy

    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    # placeholder for "ob"
    # created in mlppolicy.py
    ob = U.get_placeholder_cached(name="ob")
    # placeholder for "ac"
    # in common/distribution.py
    ac = pi.pdtype.sample_placeholder([None])

    # KL divergence and Entropy, defined in common/distribution.py
    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)

    # pol_entpen: Entropy Bonus encourages exploration
    # entcoeff: entropy coefficient, defined in PPO page 5, Equ. (9)
    pol_entpen = (-entcoeff) * meanent

    # probability ratio, defined in PPO page 3
    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold

    # Surrogate Goal
    # defined in PPO page 3, Equ (7)
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg  #
    pol_surr = -U.mean(tf.minimum(
        surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)

    # Value Function Loss: square error loss for ||v_pred - v_target||
    vf_loss = U.mean(tf.square(pi.vpred - ret))

    # total_loss is the negation of PPO's full objective (page 5, Eq. (9)):
    # maximize L^CLIP - c1 * L^VF + c2 * S  <=>  minimize pol_surr + vf_loss + pol_entpen
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])
    # MPI-synchronized Adam optimizer applied to the flattened gradient
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    # oldpi = pi
    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])

    # compute_losses evaluates the same losses without gradients; used later for logging
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()
    adam.sync()

    # ================================== modification 2 ================================== #
    for joint_optimization_iter in range(joint_optimization_iters):
        U.save_state('/home/yetong/Desktop/Project/models/model{}.ckpt'.format(
            joint_optimization_iter))
        logger.log("joint optimization progree: {}/{}".format(
            joint_optimization_iter, joint_optimization_iters))
        # ================================== update physical design ================================== #
        if joint_optimization_iter > 20:
            Rewards_plus = np.zeros(design_iters)
            Rewards_minus = np.zeros(design_iters)
            params = robot.sample(design_iters, to_update=True)
            for i, param in enumerate(params):
                robot.modify_file(param)
                env = gym.make(env_id)
                # myenv = env.env

                # pdb.set_trace()
                env = bench.Monitor(env, logger.get_dir())
                R = episode_generator(pi, env, gamma, stochastic=True)
                logger.log("\t update physical design: %d/%d, rew: %f" %
                           (i, 2 * design_iters, R))
                if i % 2 == 0:
                    Rewards_plus[int(i / 2)] = R
                else:
                    Rewards_minus[int(i / 2)] = R
            logger.log("prev_mu: ", robot.params_mu)
            logger.log("prev_sig: ", robot.params_sig)
            robot.update(Rewards_plus, Rewards_minus)
            logger.log("mu: ", robot.params_mu)
            logger.log("sig: ", robot.params_sig)
        # ================================== update policy ================================== #
        # params = robot.sample(design_iters)
        params = [robot.params_mu]
        for param in params:
            # reinitialize env
            robot.modify_file(param)
            env = gym.make(env_id)
            env = bench.Monitor(env, logger.get_dir())
            # ================================== modification 2 ================================== #

            # Prepare for rollouts
            # ----------------------------------------
            seg_gen = traj_segment_generator(pi,
                                             env,
                                             timesteps_per_actorbatch,
                                             stochastic=True)

            episodes_so_far = 0
            timesteps_so_far = 0
            iters_so_far = 0
            tstart = time.time()
            lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
            rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

            assert sum([
                max_iters > 0, max_timesteps > 0, max_episodes > 0,
                max_seconds > 0
            ]) == 1, "Only one time constraint permitted"

            while True:
                if callback: callback(locals(), globals())
                if max_timesteps and timesteps_so_far >= max_timesteps:
                    break
                elif max_episodes and episodes_so_far >= max_episodes:
                    break
                elif max_iters and iters_so_far >= max_iters:
                    break
                elif max_seconds and time.time() - tstart >= max_seconds:
                    break

                # annealing for stepsize parameters (epsilon and adam)
                if schedule == 'constant':
                    cur_lrmult = 1.0
                elif schedule == 'linear':
                    cur_lrmult = max(
                        1.0 - float(timesteps_so_far) / max_timesteps, 0)
                else:
                    raise NotImplementedError

                logger.log("********** Iteration %i ************" %
                           iters_so_far)

                seg = seg_gen.__next__()
                add_vtarg_and_adv(seg, gamma, lam)

                # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
                ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg[
                    "adv"], seg["tdlamret"]
                vpredbefore = seg[
                    "vpred"]  # predicted value function before udpate
                atarg = (atarg - atarg.mean()) / atarg.std(
                )  # standardized advantage function estimate
                d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                            shuffle=not pi.recurrent)
                optim_batchsize = optim_batchsize or ob.shape[0]

                if hasattr(pi, "ob_rms"):
                    pi.ob_rms.update(ob)  # update running mean/std for policy

                # oldpi = pi
                # set old parameter values to new parameter values
                assign_old_eq_new()
                logger.log("Optimizing...")
                logger.log(fmt_row(13, loss_names))
                # Here we do a bunch of optimization epochs over the data
                for _ in range(optim_epochs):
                    losses = [
                    ]  # list of tuples, each of which gives the loss for a minibatch
                    for batch in d.iterate_once(optim_batchsize):
                        *newlosses, g = lossandgrad(batch["ob"], batch["ac"],
                                                    batch["atarg"],
                                                    batch["vtarg"], cur_lrmult)
                        adam.update(g, optim_stepsize * cur_lrmult)
                        losses.append(newlosses)
                    logger.log(fmt_row(13, np.mean(losses, axis=0)))

                logger.log("Evaluating losses...")
                losses = []
                for batch in d.iterate_once(optim_batchsize):
                    newlosses = compute_losses(batch["ob"], batch["ac"],
                                               batch["atarg"], batch["vtarg"],
                                               cur_lrmult)
                    losses.append(newlosses)
                meanlosses, _, _ = mpi_moments(losses, axis=0)
                logger.log(fmt_row(13, meanlosses))
                for (lossval, name) in zipsame(meanlosses, loss_names):
                    logger.record_tabular("loss_" + name, lossval)
                logger.record_tabular(
                    "ev_tdlam_before",
                    explained_variance(vpredbefore, tdlamret))
                lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
                listoflrpairs = MPI.COMM_WORLD.allgather(
                    lrlocal)  # list of tuples
                lens, rews = map(flatten_lists, zip(*listoflrpairs))
                lenbuffer.extend(lens)
                rewbuffer.extend(rews)
                logger.record_tabular("EpLenMean", np.mean(lenbuffer))
                logger.record_tabular("EpRewMean", np.mean(rewbuffer))
                logger.record_tabular("EpThisIter", len(lens))
                episodes_so_far += len(lens)
                timesteps_so_far += sum(lens)
                iters_so_far += 1
                logger.record_tabular("EpisodesSoFar", episodes_so_far)
                logger.record_tabular("TimestepsSoFar", timesteps_so_far)
                logger.record_tabular("TimeElapsed", time.time() - tstart)
                if MPI.COMM_WORLD.Get_rank() == 0:
                    logger.dump_tabular()

    # ================================== modification 1 ================================== #
    env.close()
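
The `Rewards_plus` / `Rewards_minus` bookkeeping above suggests a symmetric-perturbation, evolution-strategies style update of the design distribution's mean. The sketch below only illustrates that idea; `mu`, `sigma`, `epsilons`, and the learning rate are assumptions, not the actual `robot.update` implementation.

import numpy as np

def symmetric_perturbation_update(mu, sigma, rewards_plus, rewards_minus, epsilons, lr=0.01):
    """One ES-style update of the mean of a Gaussian design distribution.

    epsilons[i] is the unit perturbation used for the i-th (+/-) rollout pair;
    rewards_plus[i] and rewards_minus[i] are the returns of the two perturbed designs.
    """
    diffs = rewards_plus - rewards_minus                # shape (n_pairs,)
    grad_mu = (diffs[:, None] * epsilons).mean(axis=0) / 2.0
    return mu + lr * sigma * grad_mu
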
Code example #22
def learn(
    env,
    genv,
    i_trial,
    policy_fn,
    *,
    timesteps_per_actorbatch,  # timesteps per actor per update
    clip_param,
    entcoeff,  # clipping parameter epsilon, entropy coeff
    optim_epochs,
    optim_stepsize,
    optim_batchsize,  # optimization hypers
    gamma,
    lam,  # advantage estimation
    max_timesteps=0,
    max_episodes=0,
    max_iters=0,
    max_seconds=0,  # time constraint
    callback=None,  # you can do anything in the callback, since it takes locals(), globals()
    adam_epsilon=1e-5,
    schedule='constant'  # annealing for stepsize parameters (epsilon and adam)
):
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_fn("pi", ob_space,
                   ac_space)  # Construct network for new policy
    oldpi = policy_fn("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    gpi = policy_fn("gpi", ob_space,
                    ac_space)  # Construct network for new policy
    goldpi = policy_fn("goldpi", ob_space, ac_space)  # Network for old policy
    gatarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    gret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    # gob = U.get_placeholder_cached(name='ob')
    gac = gpi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-entcoeff) * meanent

    gkloldnew = goldpi.pd.kl(gpi.pd)
    gent = gpi.pd.entropy()
    gmeankl = tf.reduce_mean(gkloldnew)
    gmeanent = tf.reduce_mean(gent)
    gpol_entpen = (-entcoeff) * gmeanent

    ratio = tf.exp(pi.pd.logp(gac) - goldpi.pd.logp(gac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param,
                             1.0 + clip_param) * atarg  #
    pol_surr = -tf.reduce_mean(tf.minimum(
        surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    gratio = tf.exp(gpi.pd.logp(ac) - oldpi.pd.logp(ac))
    gsurr1 = gratio * gatarg
    gsurr2 = tf.clip_by_value(gratio, 1.0 - clip_param,
                              1.0 + clip_param) * gatarg
    gpol_surr = -tf.reduce_mean(tf.minimum(gsurr1, gsurr2))
    gvf_loss = tf.reduce_mean(tf.square(gpi.vpred - gret))
    gtotal_loss = gpol_surr + gpol_entpen + gvf_loss
    glosses = [gpol_surr, gpol_entpen, gvf_loss, gmeankl, gmeanent]
    gloss_names = ["gpol_surr", "gpol_entpen", "gvf_loss", "gkl", "gent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, gac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    gvar_list = gpi.get_trainable_variables()
    glossandgrad = U.function([ob, ac, gatarg, gret, lrmult],
                              glosses + [U.flatgrad(gtotal_loss, gvar_list)])
    gadam = MpiAdam(gvar_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])

    gassign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(goldpi.get_variables(), gpi.get_variables())
        ])

    compute_losses = U.function([ob, gac, atarg, ret, lrmult], losses)
    gcompute_losses = U.function([ob, ac, gatarg, gret, lrmult], glosses)

    U.initialize()
    adam.sync()
    gadam.sync()

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     gpi,
                                     env,
                                     timesteps_per_actorbatch,
                                     stochastic=True)
    gseg_gen = traj_segment_generator(gpi,
                                      pi,
                                      genv,
                                      timesteps_per_actorbatch,
                                      stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()

    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards
    glenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    grewbuffer = deque(maxlen=100)

    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0,
         max_seconds > 0]) == 1, "Only one time constraint permitted"

    def standardize(value):
        return (value - value.mean()) / value.std()

    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        print("********** Iteration %i ************" % iters_so_far)

        print("********** Guided Policy ************")

        gseg = gseg_gen.__next__()
        add_vtarg_and_adv(gseg, gamma, lam)

        seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)



        gob, gac, gatarg, gatarg_, gtdlamret, gtdlamret_ , gvpredbefore, gvpredbefore_ = gseg["ob"], gseg["ac"], \
                                gseg["adv"], gseg["adv_"], gseg["tdlamret"], gseg["tdlamret_"], gseg["vpred"], gseg["vpred_"]

        gatarg_ = standardize(gatarg_)
        gatarg = standardize(gatarg)

        gd = Dataset(dict(gob=gob,
                          gac=gac,
                          gatarg=gatarg,
                          gatarg_=gatarg_,
                          gvtarg=gtdlamret,
                          gvtarg_=gtdlamret_),
                     shuffle=not gpi.recurrent)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, atarg_, tdlamret, tdlamret_, vpredbefore, vpredbefore_ = seg["ob"], seg["ac"],\
                            seg["adv"], seg["adv_"], seg["tdlamret"], seg["tdlamret_"], seg["vpred"], seg["vpred_"]

        atarg = standardize(atarg)
        atarg_ = standardize(atarg_)

        d = Dataset(dict(ob=ob,
                         ac=ac,
                         atarg=atarg,
                         atarg_=atarg_,
                         vtarg=tdlamret,
                         vtarg_=tdlamret_),
                    shuffle=not pi.recurrent)

        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(gpi, "ob_rms"): gpi.ob_rms.update(ob)
        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(gob)  # update running mean/std for policy

        gassign_old_eq_new()
        print("Optimizing...Guided Policy")
        # print(fmt_row(13, gloss_names))

        # Here we do a bunch of optimization epochs over the data

        for _ in range(optim_epochs):
            glosses = [
            ]  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                *newlosses, g = glossandgrad(batch["ob"], batch["ac"],
                                             batch["atarg_"], batch["vtarg_"],
                                             cur_lrmult)
                gadam.update(g, optim_stepsize * cur_lrmult)
                glosses.append(newlosses)
            # print(fmt_row(13, np.mean(glosses, axis=0)))

        # print("Evaluating losses...")
        glosses = []
        for batch in d.iterate_once(optim_batchsize):
            newlosses = gcompute_losses(batch["ob"], batch["ac"],
                                        batch["atarg_"], batch["vtarg_"],
                                        cur_lrmult)
            glosses.append(newlosses)
        gmeanlosses, _, _ = mpi_moments(glosses, axis=0)
        # print(fmt_row(13, gmeanlosses))

        for (lossval, name) in zipsame(gmeanlosses, gloss_names):
            logger.record_tabular("gloss_" + name, lossval)
        # logger.record_tabular("gev_tdlam_before", explained_variance(vpredbefore, tdlamret))

        assign_old_eq_new()  # set old parameter values to new parameter values
        print("Optimizing...Training Policy")
        # print(fmt_row(13, loss_names))
        # Here we do a bunch of optimization epochs over the data

        optim_batchsize = optim_batchsize or gob.shape[0]

        for _ in range(optim_epochs):
            losses = [
            ]  # list of tuples, each of which gives the loss for a minibatch
            for batch in gd.iterate_once(optim_batchsize):
                *newlosses, g = lossandgrad(batch["gob"], batch["gac"],
                                            batch["gatarg_"], batch["gvtarg_"],
                                            cur_lrmult)
                adam.update(g, optim_stepsize * cur_lrmult)
                losses.append(newlosses)
            # print(fmt_row(13, np.mean(losses, axis=0)))

        # print("Evaluating losses...")
        losses = []
        for batch in gd.iterate_once(optim_batchsize):
            newlosses = compute_losses(batch["gob"], batch["gac"],
                                       batch["gatarg_"], batch["gvtarg_"],
                                       cur_lrmult)
            losses.append(newlosses)
        meanlosses, _, _ = mpi_moments(losses, axis=0)
        # print(fmt_row(13, meanlosses))

        for (lossval, name) in zipsame(meanlosses, loss_names):
            logger.record_tabular("loss_" + name, lossval)
        # logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret))

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))

        glrlocal = (gseg["ep_lens"], gseg["ep_rets"])  # local values
        glistoflrpairs = MPI.COMM_WORLD.allgather(glrlocal)  # list of tuples
        glens, grews = map(flatten_lists, zip(*glistoflrpairs))

        # lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        grewbuffer.extend(grews)
        # logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("GEpRewMean", np.mean(grewbuffer))
        # logger.record_tabular("EpThisIter", len(lens))

        # episodes_so_far += len(lens)
        # timesteps_so_far += sum(lens)
        iters_so_far += 1
        # logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)

        logger.logkv('trial', i_trial)
        logger.logkv("Iteration", iters_so_far)

        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()
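
All of these examples feed minibatches through `Dataset(...).iterate_once(optim_batchsize)`. For readers without the baselines helper, the sketch below approximates that iteration pattern (shuffle once per pass, then yield dict-shaped slices); it assumes the dict values are NumPy arrays of equal length and is not the original `Dataset` class.

import numpy as np

def iterate_minibatches(data, batch_size, shuffle=True):
    """Yield dict-shaped minibatches of size batch_size (the last one may be smaller)."""
    n = len(next(iter(data.values())))
    idx = np.arange(n)
    if shuffle:
        np.random.shuffle(idx)
    for start in range(0, n, batch_size):
        sel = idx[start:start + batch_size]
        yield {key: value[sel] for key, value in data.items()}
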
Code example #23
def enjoy(
        env,
        policy_func,
        *,
        timesteps_per_actorbatch,  # timesteps per actor per update
        clip_param,
        entcoeff,  # clipping parameter epsilon, entropy coeff
        optim_epochs,
        optim_stepsize,
        optim_batchsize,  # optimization hypers
        gamma,
        lam,  # advantage estimation
        max_timesteps=0,
        max_episodes=0,
        max_iters=0,
        max_seconds=0,  # time constraint
        callback=None,  # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
        save_name=None,
        save_per_acts=3,
        reload_name=None):
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space,
                     ac_space)  # Construct network for new policy
    oldpi = policy_func("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param,
                             1.0 + clip_param) * atarg  #
    pol_surr = -tf.reduce_mean(tf.minimum(
        surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()
    adam.sync()

    if reload_name:
        saver = tf.train.Saver()
        saver.restore(tf.get_default_session(), reload_name)
        print("Loaded model successfully.")

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_actorbatch,
                                     stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0,
         max_seconds > 0]) == 1, "Only one time constraint permitted"

    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)

        seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
            "tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate
        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                    shuffle=not pi.recurrent)
        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        assign_old_eq_new()  # set old parameter values to new parameter values
        logger.log("Optimizing...")
        logger.log(fmt_row(13, loss_names))
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = [
            ]  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                *newlosses, g = lossandgrad(batch["ob"], batch["ac"],
                                            batch["atarg"], batch["vtarg"],
                                            cur_lrmult)
                adam.update(g, optim_stepsize * cur_lrmult)
                losses.append(newlosses)
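
The example above restores pretrained weights through `tf.train.Saver` before rolling out. A minimal, self-contained round trip in the same TF1-style graph mode is shown below; the variable and checkpoint path are placeholders for illustration.

import tensorflow as tf

tf.reset_default_graph()
w = tf.get_variable("w", shape=[2], initializer=tf.zeros_initializer())
saver = tf.train.Saver()

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    ckpt_path = saver.save(sess, "/tmp/ppo_demo_model.ckpt")  # placeholder path

with tf.Session() as sess:
    saver.restore(sess, ckpt_path)  # same pattern as the reload_name branch above
    print(sess.run(w))
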
Code example #24
def learn(
        env,
        test_env,
        policy_fn,
        *,
        timesteps_per_actorbatch,  # timesteps per actor per update
        clip_param,
        entcoeff,  # clipping parameter epsilon, entropy coeff
        optim_epochs,
        optim_stepsize,
        optim_batchsize,  # optimization hypers
        gamma,
        lam,  # advantage estimation
        # CMA-ES hyperparameters
        max_fitness,  # has to be negative, as CMA-ES performs minimization
        popsize,
        gensize,
        bounds,
        sigma,
        eval_iters,
        max_v_train_iter,
        max_timesteps=0,
        max_episodes=0,
        max_iters=0,
        max_seconds=0,
        # time constraint
        callback=None,
        # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant',
        # annealing for stepsize parameters (epsilon and adam)
        seed,
        env_id):
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_fn("pi", ob_space,
                   ac_space)  # Construct network for new policy
    oldpi = policy_fn("oldpi", ob_space, ac_space)  # Network for old policy
    backup_pi = policy_fn(
        "backup_pi", ob_space, ac_space
    )  # Construct a network for every individual to adapt during the es evolution
    pi_zero = policy_fn(
        "zero_pi", ob_space,
        ac_space)  # pi_0 will only be updated along with iterations

    reward = tf.placeholder(dtype=tf.float32, shape=[None])  # step rewards
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule

    ob = U.get_placeholder_cached(name="ob")
    next_ob = U.get_placeholder_cached(
        name="next_ob")  # next step observation for updating q function
    ac = U.get_placeholder_cached(
        name="act")  # action placeholder for computing q function

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-entcoeff) * meanent

    pi_adv = pi.qpred - pi.vpred
    adv_mean, adv_var = tf.nn.moments(pi_adv, axes=[0])
    normalized_pi_adv = (pi_adv - adv_mean) / tf.sqrt(adv_var)

    qf_loss = tf.reduce_mean(tf.square(reward + gamma * pi.vpred - pi.qpred))
    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    qf_losses = [qf_loss]
    vf_losses = [vf_loss]
    pol_loss = -tf.reduce_mean(normalized_pi_adv)

    # Policy loss: push the policy toward actions with higher normalized advantage
    losses = [pol_loss, pol_entpen, meankl, meanent]
    loss_names = ["pol_surr_2", "pol_entpen", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    qf_var_list = [
        v for v in var_list if v.name.split("/")[1].startswith("qf")
    ]
    vf_var_list = [
        v for v in var_list if v.name.split("/")[1].startswith("vf")
    ]
    pol_var_list = [
        v for v in var_list if v.name.split("/")[1].startswith("pol")
    ]

    vf_lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                                vf_losses + [U.flatgrad(vf_loss, vf_var_list)])

    qf_lossandgrad = U.function([ob, ac, next_ob, lrmult, reward],
                                qf_losses + [U.flatgrad(qf_loss, qf_var_list)])

    qf_adam = MpiAdam(qf_var_list, epsilon=adam_epsilon)

    vf_adam = MpiAdam(vf_var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])

    assign_backup_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(backup_v, newv) for (
                backup_v,
                newv) in zipsame(backup_pi.get_variables(), pi.get_variables())
        ])
    assign_new_eq_backup = U.function(
        [], [],
        updates=[
            tf.assign(newv, backup_v)
            for (newv, backup_v
                 ) in zipsame(pi.get_variables(), backup_pi.get_variables())
        ])
    # Compute all losses

    mean_pi_actions = U.function(
        [ob], [pi.pd.mode()])  # later for computing pol_loss
    compute_pol_losses = U.function([ob, next_ob, ac], [pol_loss])

    U.initialize()

    get_pi_flat_params = U.GetFlat(pol_var_list)
    set_pi_flat_params = U.SetFromFlat(pol_var_list)

    vf_adam.sync()
    qf_adam.sync()

    global timesteps_so_far, episodes_so_far, iters_so_far, \
        tstart, lenbuffer, rewbuffer, ppo_timesteps_so_far, best_fitness

    episodes_so_far = 0
    timesteps_so_far = 0
    ppo_timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

    best_fitness = np.inf

    eval_gen = traj_segment_generator_eval(pi,
                                           test_env,
                                           timesteps_per_actorbatch,
                                           stochastic=True)  # For evaluation
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_actorbatch,
                                     stochastic=True,
                                     eval_gen=eval_gen)  # For train V Func

    # Build generator for all solutions
    actors = []
    best_fitness = 0
    for i in range(popsize):
        newActor = traj_segment_generator(pi,
                                          env,
                                          timesteps_per_actorbatch,
                                          stochastic=True,
                                          eval_gen=eval_gen)
        actors.append(newActor)

    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0,
         max_seconds > 0]) == 1, "Only one time constraint permitted"

    while True:
        if max_timesteps and timesteps_so_far >= max_timesteps:
            print("Max time steps")
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            print("Max episodes")
            break
        elif max_iters and iters_so_far >= max_iters:
            print("Max iterations")
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            print("Max time")
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)

        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)

        # Generate new samples
        # Train V func
        for i in range(max_v_train_iter):
            logger.log("Iteration:" + str(iters_so_far) +
                       " - sub-train iter for V func:" + str(i))
            logger.log("Generate New Samples")
            seg = seg_gen.__next__()
            add_vtarg_and_adv(seg, gamma, lam)

            ob, ac, next_ob, atarg, reward, tdlamret, traj_idx = seg["ob"], seg["ac"], seg["next_ob"], seg["adv"], seg["rew"], seg["tdlamret"], \
                                                        seg["traj_index"]
            atarg = (atarg - atarg.mean()) / atarg.std(
            )  # standardized advantage function estimate
            d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                        shuffle=not pi.recurrent)
            optim_batchsize = optim_batchsize or ob.shape[0]

            if hasattr(pi, "ob_rms"):
                pi.ob_rms.update(
                    ob)  # update running mean/std for normalization

            assign_old_eq_new(
            )  # set old parameter values to new parameter values
            # Train V function
            logger.log("Training V Func and Evaluating V Func Losses")
            for _ in range(optim_epochs):
                losses = [
                ]  # list of tuples, each of which gives the loss for a minibatch
                for batch in d.iterate_once(optim_batchsize):
                    *vf_losses, g = vf_lossandgrad(batch["ob"], batch["ac"],
                                                   batch["atarg"],
                                                   batch["vtarg"], cur_lrmult)
                    vf_adam.update(g, optim_stepsize * cur_lrmult)
                    losses.append(vf_losses)
                logger.log(fmt_row(13, np.mean(losses, axis=0)))

            d_q = Dataset(dict(ob=ob,
                               ac=ac,
                               next_ob=next_ob,
                               reward=reward,
                               atarg=atarg,
                               vtarg=tdlamret),
                          shuffle=not pi.recurrent)

            # Re-train q function
            logger.log("Training Q Func Evaluating Q Func Losses")
            for _ in range(optim_epochs):
                losses = [
                ]  # list of tuples, each of which gives the loss for a minibatch
                for batch in d_q.iterate_once(optim_batchsize):
                    *qf_losses, g = qf_lossandgrad(batch["next_ob"],
                                                   batch["ac"], batch["ob"],
                                                   cur_lrmult, batch["reward"])
                    qf_adam.update(g, optim_stepsize * cur_lrmult)
                    losses.append(qf_losses)
                logger.log(fmt_row(13, np.mean(losses, axis=0)))

        # CMAES Train Policy
        assign_old_eq_new()  # set old parameter values to new parameter values
        assign_backup_eq_new()  # backup current policy
        flatten_weights = get_pi_flat_params()
        opt = cma.CMAOptions()
        opt['tolfun'] = max_fitness
        opt['popsize'] = popsize
        opt['maxiter'] = gensize
        opt['verb_disp'] = 0
        opt['verb_log'] = 0
        opt['seed'] = seed
        opt['AdaptSigma'] = True
        es = cma.CMAEvolutionStrategy(flatten_weights, sigma, opt)
        while True:
            if es.countiter >= gensize:
                logger.log("Max generations for current layer")
                break
            logger.log("Iteration:" + str(iters_so_far) +
                       " - sub-train Generation for Policy:" +
                       str(es.countiter))
            logger.log("Sigma=" + str(es.sigma))
            solutions = es.ask()
            costs = []
            lens = []

            assign_backup_eq_new()  # backup current policy

            for solution in solutions:
                set_pi_flat_params(solution)
                cost = compute_pol_losses(ob, ob, mean_pi_actions(ob)[0])
                costs.append(cost[0])
                assign_new_eq_backup()
            # Weight decay penalty, added element-wise to the costs
            l2_decay = compute_weight_decay(0.99, solutions)
            costs = np.array(costs) + l2_decay
            # costs, real_costs = fitness_normalization(costs)
            costs, real_costs = fitness_rank(costs)
            es.tell_real_seg(solutions=solutions,
                             function_values=costs,
                             real_f=real_costs,
                             segs=None)
            best_solution = es.result[0]
            best_fitness = es.result[1]
            logger.log("Best Solution Fitness:" + str(best_fitness))
            set_pi_flat_params(best_solution)

        iters_so_far += 1
        episodes_so_far += sum(lens)
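
The evolution loop above relies on `tell_real_seg`, which is not part of the stock `cma` package and appears to come from a customized fork. With the standard API, the ask/tell pattern is roughly the following; the dimension, population size, and quadratic cost are placeholders.

import numpy as np
import cma

x0 = np.zeros(8)  # placeholder initial parameter vector
es = cma.CMAEvolutionStrategy(x0, 0.5, {'popsize': 16, 'verb_disp': 0})
while not es.stop():
    solutions = es.ask()
    fitnesses = [float(np.sum(np.square(s))) for s in solutions]  # placeholder cost
    es.tell(solutions, fitnesses)  # stock counterpart of tell_real_seg
best_x, best_f = es.result[0], es.result[1]  # best solution and its fitness
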
Code example #25
def learn(env, policy_fn, *,
        timesteps_per_actorbatch, # timesteps per actor per update
        clip_param, entcoeff, # clipping parameter epsilon, entropy coeff
        optim_epochs, optim_stepsize, optim_batchsize,# optimization hypers
        gamma, lam, # advantage estimation
        max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0,  # time constraint
        callback=None, # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant' # annealing for stepsize parameters (epsilon and adam)
        ):
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_fn("pi", ob_space, ac_space) # Construct network for new policy
    oldpi = policy_fn("oldpi", ob_space, ac_space) # Network for old policy
    atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return

    lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold
    surr1 = ratio * atarg # surrogate from conservative policy iteration
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg #
    pol_surr = - tf.reduce_mean(tf.minimum(surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv)
        for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()
    adam.sync()

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards

    assert sum([max_iters>0, max_timesteps>0, max_episodes>0, max_seconds>0])==1, "Only one time constraint permitted"

    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult =  max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************"%iters_so_far)

        seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
        vpredbefore = seg["vpred"] # predicted value function before update
        atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate
        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent)
        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy

        assign_old_eq_new() # set old parameter values to new parameter values
        logger.log("Optimizing...")
        logger.log(fmt_row(13, loss_names))
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = [] # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult)
                adam.update(g, optim_stepsize * cur_lrmult)
                losses.append(newlosses)
            logger.log(fmt_row(13, np.mean(losses, axis=0)))

        logger.log("Evaluating losses...")
        losses = []
        for batch in d.iterate_once(optim_batchsize):
            newlosses = compute_losses(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult)
            losses.append(newlosses)
        meanlosses,_,_ = mpi_moments(losses, axis=0)
        logger.log(fmt_row(13, meanlosses))
        for (lossval, name) in zipsame(meanlosses, loss_names):
            logger.record_tabular("loss_"+name, lossval)
        logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret))
        lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if MPI.COMM_WORLD.Get_rank()==0:
            logger.dump_tabular()
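
# Note: every example here calls add_vtarg_and_adv(seg, gamma, lam) before building the
# Dataset, but that helper is not shown. The function below is a hedged, standalone sketch
# of GAE(lambda) with illustrative argument names; it is not the original baselines helper.
import numpy as np

def gae_advantages(rewards, values, next_value, dones, gamma, lam):
    """Return (advantages, TD(lambda) returns) for one rollout segment.

    rewards, values, dones are 1-D NumPy arrays of equal length T;
    dones[t] is 1.0 if the episode terminated at step t;
    next_value is the value estimate for the state following the last step.
    """
    T = len(rewards)
    adv = np.zeros(T)
    last_gae = 0.0
    for t in reversed(range(T)):
        nonterminal = 1.0 - dones[t]
        v_next = next_value if t == T - 1 else values[t + 1]
        delta = rewards[t] + gamma * v_next * nonterminal - values[t]
        last_gae = delta + gamma * lam * nonterminal * last_gae
        adv[t] = last_gae
    return adv, adv + values
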
class PPO(object):
    def __init__(self, env, policy, 
                 emb_network, emb_size,
                 clip_param, entcoeff, # clipping parameter epsilon, entropy coeff
                 optim_epochs, optim_stepsize, optim_batchsize,# optimization hypers
                 gamma, lam, # advantage estimation
                 max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0,  # time constraint
                 adam_epsilon=1e-5,
                 schedule='constant',
                 joint_training=False
                 ):
        # Setup variables
        self.optim_epochs = optim_epochs
        self.optim_stepsize = optim_stepsize
        self.optim_batchsize = optim_batchsize
        self.gamma = gamma
        self.lam = lam
        self.max_timesteps = max_timesteps
        self.adam_epsilon = adam_epsilon
        self.schedule = schedule

        # Setup losses and stuff
        # ----------------------------------------
        with tf.name_scope('ppo'):
            ob_space = env.observation_space
            ac_space = env.action_space
            self.pi = policy # Construct network for new policy
            oldpi = Policy("old_policy", env.action_space, joint_training, emb_size, emb_network) # Network for old policy
            atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable)
            ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return

            lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule
            clip_param = clip_param * lrmult # Annealed clipping parameter epsilon

            # ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[None] + list(ob_space.shape))
            if joint_training:
                ob = U.get_placeholder_cached(name="ob_f")
            else:
                ob = U.get_placeholder_cached(name="ob")
            ac = self.pi.pdtype.sample_placeholder([None])

            kloldnew = oldpi.pd.kl(self.pi.pd)
            ent = self.pi.pd.entropy()
            meankl = U.mean(kloldnew)
            meanent = U.mean(ent)
            pol_entpen = (-entcoeff) * meanent

            ratio = tf.exp(self.pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold
            surr1 = ratio * atarg # surrogate from conservative policy iteration
            surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg #
            pol_surr = - U.mean(tf.minimum(surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP)
            vf_loss = U.mean(tf.square(self.pi.vpred - ret))
            self.total_loss = pol_surr + pol_entpen + vf_loss
            losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
            self.loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

            var_list = self.pi.get_trainable_variables()
            self.lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(self.total_loss, var_list)])
            self.adam = MpiAdam(var_list, epsilon=adam_epsilon)

            self.assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv)
                for (oldv, newv) in zipsame(oldpi.get_variables(), self.pi.get_variables())])
            self.compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

            U.initialize()
            self.adam.sync()

        # Prepare for rollouts
        # ----------------------------------------
        self.episodes_so_far = 0
        self.timesteps_so_far = 0
        self.iters_so_far = 0
        self.tstart = time.time()
        self.lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths
        self.rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards

        # self.train_step = tf.train.AdamOptimizer(adam_epsilon).minimize(self.total_loss, var_list=var_list)
        # self.train = U.function([ob, ac, atarg, ret, lrmult], self.train_step)


    def prepare(self, batch):
        if self.timesteps_so_far >= self.max_timesteps:
            return False

        if self.schedule == 'constant':
            self.cur_lrmult = 1.0
        elif self.schedule == 'linear':
            self.cur_lrmult =  max(1.0 - float(self.timesteps_so_far) / (self.max_timesteps * 1.1), 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************"%self.iters_so_far)

        # seg = seg_gen.__next__() # generate next sequence
        self.seg = batch
        self.add_vtarg_and_adv(self.seg, self.gamma, self.lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, self.tdlamret = self.seg["ob"], self.seg["ac"], self.seg["adv"], self.seg["tdlamret"]
        self.vpredbefore = self.seg["vpred"] # predicted value function before update
        atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate
        self.d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=self.tdlamret), shuffle=not self.pi.recurrent)
        self.optim_batchsize = self.optim_batchsize or ob.shape[0]

        self.assign_old_eq_new() # set old parameter values to new parameter values

        logger.log("Optimizing...")
        logger.log(fmt_row(13, self.loss_names))

        return True


    def step(self):
        # Here we do a bunch of optimization epochs over the data
        for _ in range(self.optim_epochs):
            losses = [] # list of tuples, each of which gives the loss for a minibatch
            for b in self.d.iterate_once(self.optim_batchsize):
                # self.train(b["ob"], b["ac"], b["atarg"], b["vtarg"], self.cur_lrmult)
                *newlosses, g = self.lossandgrad(b["ob"], b["ac"], b["atarg"], b["vtarg"], self.cur_lrmult)
                self.adam.update(g, self.optim_stepsize * self.cur_lrmult)
                losses.append(newlosses)
            logger.log(fmt_row(13, np.mean(losses, axis=0)))

    
    def log(self):
        logger.log("Evaluating losses...")
        losses = []
        for b in self.d.iterate_once(self.optim_batchsize):
            newlosses = self.compute_losses(b["ob"], b["ac"], b["atarg"], b["vtarg"], self.cur_lrmult)
            losses.append(newlosses)            
        meanlosses,_,_ = mpi_moments(losses, axis=0)
        logger.log(fmt_row(13, meanlosses))
        for (lossval, name) in zipsame(meanlosses, self.loss_names):
            logger.record_tabular("loss_"+name, lossval)
        logger.record_tabular("ev_tdlam_before", explained_variance(self.vpredbefore, self.tdlamret))
        lrlocal = (self.seg["ep_lens"], self.seg["ep_rets"]) # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples
        lens, rews = map(self.flatten_lists, zip(*listoflrpairs))
        self.lenbuffer.extend(lens)
        self.rewbuffer.extend(rews)
        logger.record_tabular("EpLenMean", np.mean(self.lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(self.rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        self.episodes_so_far += len(lens)
        self.timesteps_so_far += sum(lens)
        self.iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", self.episodes_so_far)
        logger.record_tabular("TimestepsSoFar", self.timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - self.tstart)
        if MPI.COMM_WORLD.Get_rank()==0:
            logger.dump_tabular()


    def add_vtarg_and_adv(self, seg, gamma, lam):
        """
        Compute target value using TD(lambda) estimator, and advantage with GAE(lambda)
        """
        new = np.append(seg["new"], 0) # last element is only used for last vtarg, but we already zeroed it if last new = 1
        vpred = np.append(seg["vpred"], seg["nextvpred"])
        T = len(seg["rew"])
        seg["adv"] = gaelam = np.empty(T, 'float32')
        rew = seg["rew"]
        lastgaelam = 0
        for t in reversed(range(T)):
            nonterminal = 1-new[t+1]
            delta = rew[t] + gamma * vpred[t+1] * nonterminal - vpred[t]
            gaelam[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
        seg["tdlamret"] = seg["adv"] + seg["vpred"]

    def flatten_lists(self, listoflists):
        return [el for list_ in listoflists for el in list_]
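
The add_vtarg_and_adv method above is the standard GAE(lambda) recursion. If the indexing of new and nextvpred is hard to follow, here is a minimal standalone sketch with made-up numbers (no dependence on the class) that traces the same backward pass; the values are purely illustrative.

import numpy as np

# Toy rollout: 4 steps; new[t] == 1 marks that timestep t starts a new episode,
# so the recursion must not bootstrap across that boundary.
seg = {
    "rew": np.array([1.0, 1.0, 1.0, 1.0], dtype=np.float32),
    "vpred": np.array([0.5, 0.6, 0.7, 0.8], dtype=np.float32),
    "new": np.array([0, 0, 1, 0], dtype=np.int32),
    "nextvpred": 0.9,  # value estimate for the state after the last step
}
gamma, lam = 0.99, 0.95

new = np.append(seg["new"], 0)
vpred = np.append(seg["vpred"], seg["nextvpred"])
T = len(seg["rew"])
adv = np.empty(T, dtype=np.float32)
lastgaelam = 0.0
for t in reversed(range(T)):
    nonterminal = 1 - new[t + 1]  # 0 exactly at an episode boundary
    delta = seg["rew"][t] + gamma * vpred[t + 1] * nonterminal - vpred[t]
    adv[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
tdlamret = adv + seg["vpred"]  # TD(lambda) return targets
print("adv:", adv, "tdlamret:", tdlamret)
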
Code example #27
    def train(self, seg, optim_batchsize, optim_epochs):
        #normalize the reward
        rffs_int = np.array(
            [self.rff_int.update(rew) for rew in seg["rew_int"]])
        self.rff_rms_int.update(rffs_int.ravel())
        seg["rew_int"] = seg["rew_int"] / np.sqrt(self.rff_rms_int.var)

        cur_lrmult = 1.0
        add_vtarg_and_adv(seg, self.gamma, self.lam)
        ob, unnorm_ac, atarg_ext, tdlamret_ext, atarg_int, tdlamret_int = seg[
            "ob"], seg["unnorm_ac"], seg["adv_ext"], seg["tdlamret_ext"], seg[
                "adv_int"], seg["tdlamret_int"]
        vpredbefore_ext, vpredbefore_int = seg["vpred_ext"], seg[
            "vpred_int"]  # predicted value function before udpate
        atarg_ext = (atarg_ext - atarg_ext.mean()) / atarg_ext.std(
        )  # standardized advantage function estimate
        atarg_int = (atarg_int - atarg_int.mean()) / atarg_int.std()
        atarg = self.int_coeff * atarg_int + self.ext_coeff * atarg_ext

        d = Dataset(dict(ob=ob,
                         ac=unnorm_ac,
                         atarg=atarg,
                         vtarg_ext=tdlamret_ext,
                         vtarg_int=tdlamret_int),
                    shuffle=not self.pi.recurrent)

        if hasattr(self.pi, "ob_rms"):
            self.pi.update_obs_rms(ob)  # update running mean/std for policy
        if hasattr(self.int_rew, "ob_rms"):
            self.int_rew.update_obs_rms(
                ob)  #update running mean/std for int_rew
        self.assign_old_eq_new(
        )  # set old parameter values to new parameter values
        logger.log2("Optimizing...")
        logger.log2(fmt_row(13, self.loss_names))
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = [
            ]  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                lg = self.lossandgrad(batch["ac"], batch["atarg"],
                                      batch["vtarg_ext"], batch["vtarg_int"],
                                      cur_lrmult, *zip(*batch["ob"].tolist()))
                new_losses, g = lg[:-1], lg[-1]
                self.adam.update(g, self.optim_stepsize * cur_lrmult)
                losses.append(new_losses)
            logger.log2(fmt_row(13, np.mean(losses, axis=0)))

        logger.log2("Evaluating losses...")
        losses = []
        for batch in d.iterate_once(optim_batchsize):
            newlosses = self.compute_losses(batch["ac"], batch["atarg"],
                                            batch["vtarg_ext"],
                                            batch["vtarg_int"], cur_lrmult,
                                            *zip(*batch["ob"].tolist()))
            losses.append(newlosses)
        meanlosses, _, _ = mpi_moments(losses, axis=0)
        logger.log2(fmt_row(13, meanlosses))

        for (lossval, name) in zipsame(meanlosses, self.loss_names):
            logger.record_tabular("loss_" + name, lossval)
        logger.record_tabular(
            "ev_tdlam_ext_before",
            explained_variance(vpredbefore_ext, tdlamret_ext))
        return meanlosses
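
The first lines of train above normalize the intrinsic reward by the running standard deviation of its discounted returns, but self.rff_int and self.rff_rms_int are not defined in this excerpt. Below is a plausible sketch of those two pieces (class names and details are my assumption, following the usual RND-style recipe), just to make the normalization step self-contained.

import numpy as np

class RewardForwardFilter(object):
    """Discounted running sum of intrinsic rewards (one scalar per stream)."""
    def __init__(self, gamma):
        self.rewems = None
        self.gamma = gamma

    def update(self, rews):
        if self.rewems is None:
            self.rewems = rews
        else:
            self.rewems = self.rewems * self.gamma + rews
        return self.rewems

class RunningStd(object):
    """Running variance estimate, so rewards can be divided by their running std."""
    def __init__(self, epsilon=1e-4):
        self.mean, self.var, self.count = 0.0, 1.0, epsilon

    def update(self, x):
        batch_mean, batch_var, batch_count = np.mean(x), np.var(x), len(x)
        delta = batch_mean - self.mean
        tot = self.count + batch_count
        new_mean = self.mean + delta * batch_count / tot
        m2 = self.var * self.count + batch_var * batch_count \
             + delta ** 2 * self.count * batch_count / tot
        self.mean, self.var, self.count = new_mean, m2 / tot, tot

rff, rff_rms = RewardForwardFilter(gamma=0.99), RunningStd()
rew_int = np.random.rand(128)                        # raw intrinsic rewards for one rollout
rffs = np.array([rff.update(r) for r in rew_int])    # discounted return estimates
rff_rms.update(rffs.ravel())
rew_int_normalized = rew_int / np.sqrt(rff_rms.var)  # what seg["rew_int"] becomes above
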
Code example #28
def learn(
        env,
        policy_fn,
        *,
        timesteps_per_actorbatch,  # timesteps per actor per update
        clip_param,
        entcoeff,  # clipping parameter epsilon, entropy coeff
        optim_epochs,
        optim_stepsize,
        optim_batchsize,  # optimization hypers
        gamma,
        lam,  # advantage estimation
        # CMAES
        max_fitness,  # has to be negative, as CMA-ES considers minimization
        popsize,
        gensize,
        bounds,
        sigma,
        eval_iters,
        max_v_train_iter,
        max_timesteps=0,
        max_episodes=0,
        max_iters=0,
        max_seconds=0,
        # time constraint
        callback=None,
        # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant',
        # annealing for stepsize parameters (epsilon and adam)
        seed,
        env_id):
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_fn("pi", ob_space,
                   ac_space)  # Construct network for new policy
    oldpi = policy_fn("oldpi", ob_space, ac_space)  # Network for old policy
    backup_pi = policy_fn(
        "backup_pi", ob_space, ac_space
    )  # Construct a network for every individual to adapt during the es evolution

    pi_params = tf.placeholder(dtype=tf.float32, shape=[None])
    old_pi_params = tf.placeholder(dtype=tf.float32, shape=[None])
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule

    layer_clip = tf.placeholder(
        name='layer_clip', dtype=tf.float32,
        shape=[])  # per-layer clipping coefficient

    bound_coeff = tf.placeholder(
        name='bound_coeff', dtype=tf.float32,
        shape=[])  # bound coefficient placeholder (not referenced in this excerpt)

    clip_param = clip_param * lrmult * layer_clip  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - (oldpi.pd.logp(ac) + 1e-8))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param,
                             1.0 + clip_param) * atarg  #
    pol_surr = -tf.reduce_mean(tf.minimum(
        surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    vf_losses = [vf_loss]
    vf_loss_names = ["vf_loss"]

    pol_loss = pol_surr + pol_entpen
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    vf_var_list = [
        v for v in var_list if v.name.split("/")[1].startswith("vf")
    ]
    pol_var_list = [
        v for v in var_list if v.name.split("/")[1].startswith("pol")
    ]

    layer_var_list = []
    for i in range(pi.num_hid_layers):
        layer_var_list.append([
            v for v in pol_var_list
            if v.name.split("/")[2].startswith('fc%i' % (i + 1))
        ])
    logstd_var_list = [
        v for v in pol_var_list if v.name.split("/")[2].startswith("logstd")
    ]
    if len(logstd_var_list) != 0:
        layer_var_list.append([
            v for v in pol_var_list if v.name.split("/")[2].startswith("final")
        ] + logstd_var_list)

    vf_lossandgrad = U.function([ob, ac, ret, lrmult],
                                vf_losses + [U.flatgrad(vf_loss, vf_var_list)])

    lossandgrad = U.function([ob, ac, atarg, ret, lrmult, layer_clip],
                             losses + [U.flatgrad(total_loss, var_list)])

    vf_adam = MpiAdam(vf_var_list, epsilon=adam_epsilon)
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])
    assign_backup_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(backup_v, newv) for (
                backup_v,
                newv) in zipsame(backup_pi.get_variables(), pi.get_variables())
        ])
    assign_new_eq_backup = U.function(
        [], [],
        updates=[
            tf.assign(newv, backup_v)
            for (newv, backup_v
                 ) in zipsame(pi.get_variables(), backup_pi.get_variables())
        ])
    # Compute all losses

    compute_pol_losses = U.function([ob, ac, atarg, ret, lrmult, layer_clip],
                                    [pol_loss, pol_surr, pol_entpen, meankl])

    compute_v_pred = U.function([ob], [pi.vpred])

    a_prob = tf.exp(pi.pd.logp(ac))
    compute_a_prob = U.function([ob, ac], [a_prob])

    U.initialize()

    layer_set_operate_list = []
    layer_get_operate_list = []
    for var in layer_var_list:
        set_pi_layer_flat_params = U.SetFromFlat(var)
        layer_set_operate_list.append(set_pi_layer_flat_params)
        get_pi_layer_flat_params = U.GetFlat(var)
        layer_get_operate_list.append(get_pi_layer_flat_params)

    # get_pi_layer_flat_params = U.GetFlat(pol_var_list)
    # set_pi_layer_flat_params = U.SetFromFlat(pol_var_list)

    vf_adam.sync()

    adam.sync()

    global timesteps_so_far, episodes_so_far, iters_so_far, \
        tstart, lenbuffer, rewbuffer, ppo_timesteps_so_far, best_fitness

    episodes_so_far = 0
    timesteps_so_far = 0
    ppo_timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

    best_fitness = -np.inf

    eval_seq = traj_segment_generator_eval(pi,
                                           env,
                                           timesteps_per_actorbatch,
                                           stochastic=False)
    # eval_gen = traj_segment_generator_eval(pi, test_env, timesteps_per_actorbatch, stochastic = True)  # For evaluation
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_actorbatch,
                                     stochastic=True,
                                     eval_seq=eval_seq)  # For train V Func

    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0,
         max_seconds > 0]) == 1, "Only one time constraint permitted"

    indices = []  # maintain all selected indices for each iteration

    opt = cma.CMAOptions()
    opt['tolfun'] = max_fitness
    opt['popsize'] = popsize
    opt['maxiter'] = gensize
    opt['verb_disp'] = 0
    opt['verb_log'] = 0
    # opt['seed'] = seed
    opt['AdaptSigma'] = True
    # opt['bounds'] = bounds
    # opt['tolstagnation'] = 20
    ess = []
    seg = None
    segs = None
    sum_vpred = []
    while True:
        if max_timesteps and timesteps_so_far >= max_timesteps:
            print("Max time steps")
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            print("Max episodes")
            break
        elif max_iters and iters_so_far >= max_iters:
            print("Max iterations")
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            print("Max time")
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / (max_timesteps),
                             0)
        else:
            raise NotImplementedError

        # epsilon = max(0.5 - float(timesteps_so_far) / (max_timesteps), 0) * cur_lrmult
        epsilon = max(0.5 * cur_lrmult, 0)
        # epsilon = 0.2
        sigma_adapted = max(sigma * cur_lrmult, 1e-8)
        # sigma_adapted = max(max(sigma - float(timesteps_so_far) / (5000 * max_timesteps), 0) * cur_lrmult, 1e-8)
        # cmean_adapted = max(1.0 - float(timesteps_so_far) / (max_timesteps), 1e-8)
        # cmean_adapted = max(0.8 - float(timesteps_so_far) / (2*max_timesteps), 1e-8)
        # if timesteps_so_far % max_timesteps == 10:
        max_v_train_iter = int(
            max(
                max_v_train_iter * (1 - timesteps_so_far /
                                    (0.5 * max_timesteps)), 1))
        logger.log("********** Iteration %i ************" % iters_so_far)
        if iters_so_far == 0:
            eval_seg = eval_seq.__next__()
            rewbuffer.extend(eval_seg["ep_rets"])
            lenbuffer.extend(eval_seg["ep_lens"])
            result_record()

        # Repository Train
        train_segs = {}
        seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)
        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(
                seg["ob"])  # update running mean/std for normalization

        # rewbuffer.extend(seg["ep_rets"])
        # lenbuffer.extend(seg["ep_lens"])
        #
        # if iters_so_far == 0:
        #     result_record()

        assign_old_eq_new()  # set old parameter values to new parameter values
        replay_keys = ["ob", "next_ob", "ac", "rew", "vpred", "act_props",
                       "new", "adv", "tdlamret", "ep_rets", "ep_lens", "v_target"]
        if segs is None:
            segs = seg
            segs["v_target"] = np.zeros(len(seg["ob"]), 'float32')
        else:
            if len(segs["ob"]) >= 50000:
                # Drop the oldest actor batch before appending the new rollout
                for key in replay_keys:
                    segs[key] = np.take(segs[key],
                                        np.arange(timesteps_per_actorbatch,
                                                  len(segs[key])),
                                        axis=0)
            for key in replay_keys[:-1]:
                segs[key] = np.append(segs[key], seg[key], axis=0)
            segs["v_target"] = np.append(segs["v_target"],
                                         np.zeros(len(seg["ob"]), 'float32'),
                                         axis=0)

        if iters_so_far == 0:
            ob, ac, tdlamret = seg["ob"], seg["ac"], seg["tdlamret"]
            d = Dataset(dict(ob=ob, ac=ac, vtarg=tdlamret),
                        shuffle=not pi.recurrent)
            optim_batchsize = optim_batchsize or ob.shape[0]

            # Train V function
            # logger.log("Catchup Training V Func and Evaluating V Func Losses")
            for _ in range(max_v_train_iter):
                for batch in d.iterate_once(optim_batchsize):
                    *vf_loss, g = vf_lossandgrad(batch["ob"], batch["ac"],
                                                 batch["vtarg"], cur_lrmult)
                    vf_adam.update(g, optim_stepsize * cur_lrmult)
                # logger.log(fmt_row(13, np.mean(vf_losses, axis = 0)))
        else:

            # Update v target
            new = segs["new"]
            rew = segs["rew"]
            act_prob = np.asarray(compute_a_prob(segs["ob"], segs["ac"])).T
            importance_ratio = np.squeeze(act_prob) / (
                segs["act_props"] + np.ones(segs["act_props"].shape) * 1e-8)
            segs["v_target"] = importance_ratio * (1 / np.sum(importance_ratio)) * \
                               np.squeeze(
                                   rew + np.invert(new).astype(np.float32) * gamma * compute_v_pred(segs["next_ob"]))
            # train_segs["v_target"] = rew + np.invert(new).astype(np.float32) * gamma * compute_v_pred(train_segs["next_ob"])
            if len(segs["ob"]) >= 20000:
                train_times = int(max_v_train_iter /
                                  2) if int(max_v_train_iter / 2) > 0 else 1
            else:
                train_times = 2
            for i in range(train_times):
                selected_train_index = np.random.choice(
                    range(len(segs["ob"])),
                    timesteps_per_actorbatch,
                    replace=False)
                train_segs["ob"] = np.take(segs["ob"],
                                           selected_train_index,
                                           axis=0)
                train_segs["next_ob"] = np.take(segs["next_ob"],
                                                selected_train_index,
                                                axis=0)
                train_segs["ac"] = np.take(segs["ac"],
                                           selected_train_index,
                                           axis=0)
                train_segs["rew"] = np.take(segs["rew"],
                                            selected_train_index,
                                            axis=0)
                train_segs["vpred"] = np.take(segs["vpred"],
                                              selected_train_index,
                                              axis=0)
                train_segs["new"] = np.take(segs["new"],
                                            selected_train_index,
                                            axis=0)
                train_segs["adv"] = np.take(segs["adv"],
                                            selected_train_index,
                                            axis=0)
                train_segs["tdlamret"] = np.take(segs["tdlamret"],
                                                 selected_train_index,
                                                 axis=0)
                train_segs["v_target"] = np.take(segs["v_target"],
                                                 selected_train_index,
                                                 axis=0)
                #
                ob, ac, v_target = train_segs["ob"], train_segs[
                    "ac"], train_segs["v_target"]
                d = Dataset(dict(ob=ob, ac=ac, vtarg=v_target),
                            shuffle=not pi.recurrent)
                optim_batchsize = optim_batchsize or ob.shape[0]

                # Train V function
                # logger.log("Training V Func and Evaluating V Func Losses")
                # Train V function
                # logger.log("Catchup Training V Func and Evaluating V Func Losses")
                # logger.log("Train V - "+str(_))
                for _ in range(max_v_train_iter):
                    for batch in d.iterate_once(optim_batchsize):
                        *vf_loss, g = vf_lossandgrad(batch["ob"], batch["ac"],
                                                     batch["vtarg"],
                                                     cur_lrmult)
                        vf_adam.update(g, optim_stepsize * cur_lrmult)
                    # logger.log(fmt_row(13, np.mean(vf_losses, axis = 0)))
                # seg['vpred'] = np.asarray(compute_v_pred(seg["ob"])).reshape(seg['vpred'].shape)
                # seg['nextvpred'] = seg['vpred'][-1] * (1 - seg["new"][-1])
                # add_vtarg_and_adv(seg, gamma, lam)

            ob, ac, atarg, v_target = seg["ob"], seg["ac"], seg["adv"], seg[
                "tdlamret"]
            atarg = (atarg - atarg.mean()) / atarg.std(
            )  # standardized advantage function estimate
            d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=v_target),
                        shuffle=not pi.recurrent)
            optim_batchsize = optim_batchsize or ob.shape[0]
            # Local search
            for _ in range(optim_epochs):
                for batch in d.iterate_once(optim_batchsize):
                    *newlosses, g = lossandgrad(batch["ob"], batch["ac"],
                                                batch["atarg"], batch["vtarg"],
                                                cur_lrmult, 1 / 4)
                    adam.update(g, optim_stepsize * cur_lrmult)

            # seg['vpred'] = np.asarray(compute_v_pred(seg["ob"])).reshape(seg['vpred'].shape)
            # seg['nextvpred'] = seg['vpred'][-1] * (1 - seg["new"][-1])
            # add_vtarg_and_adv(seg, gamma, lam)

        ob_po, ac_po, atarg_po, tdlamret_po = seg["ob"], seg["ac"], seg[
            "adv"], seg["tdlamret"]
        atarg_po = (atarg_po - atarg_po.mean()) / atarg_po.std(
        )  # standardized advantage function estimate

        # opt['CMA_cmean'] = cmean_adapted
        # assign_old_eq_new()  # set old parameter values to new parameter values
        for i in range(len(layer_var_list)):
            # CMAES Train Policy
            assign_backup_eq_new()  # backup current policy
            flatten_weights = layer_get_operate_list[i]()

            if len(indices) < len(layer_var_list):
                selected_index, init_weights = uniform_select(
                    flatten_weights,
                    0.5)  # 0.5 means 50% proportion of params are selected
                indices.append(selected_index)
            else:
                rand = np.random.uniform()
                # print("Random-Number:", rand)
                # print("Epsilon:", epsilon)
                if rand < epsilon:
                    selected_index, init_weights = uniform_select(
                        flatten_weights, 0.5)
                    indices.append(selected_index)
                    # logger.log("Random: select new weights")
                else:
                    selected_index = indices[i]
                    init_weights = np.take(flatten_weights, selected_index)
            es = cma.CMAEvolutionStrategy(init_weights, sigma_adapted, opt)
            while True:
                if es.countiter >= gensize:
                    # logger.log("Max generations for current layer")
                    break
                # logger.log("Iteration:" + str(iters_so_far) + " - sub-train Generation for Policy:" + str(es.countiter))
                # logger.log("Sigma=" + str(es.sigma))
                # solutions = es.ask(sigma_fac = max(cur_lrmult, 1e-8))
                solutions = es.ask()
                # solutions = [np.clip(solution, -5.0, 5.0).tolist() for solution in solutions]
                costs = []
                lens = []

                assign_backup_eq_new()  # backup current policy

                for id, solution in enumerate(solutions):
                    np.put(flatten_weights, selected_index, solution)
                    layer_set_operate_list[i](flatten_weights)
                    cost = compute_pol_losses(ob_po, ac_po, atarg_po,
                                              tdlamret_po, cur_lrmult,
                                              1 / 4 * (i + 1))
                    costs.append(cost[0])
                    assign_new_eq_backup()
                # Weight decay: add an L2 penalty to each candidate's cost
                l2_decay = compute_weight_decay(0.01, solutions)
                costs = np.asarray(costs) + l2_decay  # element-wise, not list concatenation
                costs, real_costs = fitness_rank(costs)
                # logger.log("real_costs:"+str(real_costs))
                # best_solution = np.copy(es.result[0])
                # best_fitness = -es.result[1]
                es.tell_real_seg(solutions=solutions,
                                 function_values=costs,
                                 real_f=real_costs,
                                 segs=None)
                # best_solution = np.copy(solutions[np.argmin(costs)])
                # best_fitness = -real_costs[np.argmin(costs)]
                best_solution = es.result[0]
                best_fitness = es.result[1]
                np.put(flatten_weights, selected_index, best_solution)
                layer_set_operate_list[i](flatten_weights)
                # logger.log("Update the layer")
                # best_solution = es.result[0]
                # best_fitness = es.result[1]
                # logger.log("Best Solution Fitness:" + str(best_fitness))
                # set_pi_flat_params(best_solution)
            import gc
            gc.collect()

        iters_so_far += 1
        episodes_so_far += sum(lens)
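
The CMA-ES block above relies on helpers such as uniform_select and compute_weight_decay that do not appear in this excerpt. The sketch below is one plausible reading of the two, with signatures inferred from the call sites; treat it as an assumption rather than the original implementation.

import numpy as np

def uniform_select(flat_params, proportion):
    """Pick a random subset of parameter indices and their current values
    (illustrative stand-in for the uniform_select helper used above)."""
    n_selected = max(1, int(len(flat_params) * proportion))
    selected_index = np.random.choice(len(flat_params), n_selected, replace=False)
    return selected_index, np.take(flat_params, selected_index)

def compute_weight_decay(weight_decay, solutions):
    """L2 penalty per candidate solution; positive because costs are minimized here."""
    solutions = np.asarray(solutions)
    return weight_decay * np.mean(solutions * solutions, axis=1)
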
Code example #29
def learn(env_list, policy_fn, *,
          timesteps_per_actorbatch,  # timesteps per actor per update
          clip_param, entcoeff,  # clipping parameter epsilon, entropy coeff
          optim_epochs, optim_stepsize, optim_batchsize,  # optimization hypers
          gamma, lam,  # advantage estimation
          max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0,  # time constraint
          callback=None,  # you can do anything in the callback, since it takes locals(), globals()
          adam_epsilon=1e-5,
          schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
          end_timesteps,
          newround
          ):

    env = env_list.popleft()
    # Open a file to record the accumulated rewards
    rewFile = open("reward/%d.txt" % (env.seed), "ab")
    resptimeFile = open("respTime/%d.txt" % (env.seed), "ab")
    pktnumFile = open("pktNum/%d.txt" % (env.seed), "ab")

    # Setup losses and stuff
    # ----------------------------------------
    vf_ob_space = env.vf_observation_space
    # ac_ob_space = env.ac_observation_space
    ac_space = env.action_space
    pi = policy_fn("pi1", vf_ob_space, ac_space)  # Construct network for new policy
    oldpi = policy_fn("oldpi", vf_ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(name="atarg", dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(name="ret", dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    vf_ob = U.get_placeholder_cached(name="vf_ob")
    nn_in = U.get_placeholder_cached(name="nn_in")  # placeholder for nn input
    ac = pi.pdtype.sample_placeholder([None])

    # kloldnew = oldpi.pd.kl(pi.pd)
    # ent = pi.pd.entropy()
    pb_old_holder = tf.placeholder(name="pd_old", dtype=tf.float32, shape=[None, ac_space.n])
    pb_new_holder = tf.placeholder(name="pd_new", dtype=tf.float32, shape=[None, ac_space.n])
    oldpd = CategoricalPd(pb_old_holder)
    pd = CategoricalPd(pb_new_holder)
    kloldnew = oldpd.kl(pd)
    ent = pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-entcoeff) * meanent

    # ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    ratio = tf.placeholder(dtype=tf.float32, shape=[None])
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg  #
    pol_surr = - tf.reduce_mean(tf.minimum(surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    vf_var_list = [v for v in var_list if v.name.split("/")[1].startswith("vf")]
    pol_var_list = [v for v in var_list if v.name.split("/")[1].startswith("pol")]

    vf_grad = U.function([vf_ob, ret], U.flatgrad(vf_loss, vf_var_list))  # gradient of value function
    pol_nn_grad = U.function([nn_in], U.flatgrad(pi.nn_out, pol_var_list))
    vf_adam = MpiAdam(vf_var_list, epsilon=adam_epsilon)
    pol_adam = MpiAdam(pol_var_list, epsilon=adam_epsilon)
    clip_para = U.function([lrmult], [clip_param])

    assign_old_eq_new = U.function([], [], updates=[tf.assign(oldv, newv)
                                                    for (oldv, newv) in
                                                    zipsame(oldpi.get_variables(), pi.get_variables())])
    compute_losses = U.function([vf_ob, atarg, ret, lrmult, ratio, pb_new_holder, pb_old_holder], losses)

    U.initialize()
    vf_adam.sync()
    pol_adam.sync()

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True)

    end_timestep = end_timesteps.popleft()
    new = newround.popleft()
    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=10)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=10)  # rolling buffer for episode rewards
    env_so_far = 1

    assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0,
                max_seconds > 0]) == 1, "Only one time constraint permitted"

    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            rewFile.close()
            resptimeFile.close()
            pktnumFile.close()
            para = {}
            for vf in range(len(vf_var_list)):
                # para[vf_var_list[vf].name] = vf_var_list[vf].eval()
                para[vf] = vf_var_list[vf].eval()
            for pol in range(len(pol_var_list)):
                # para[pol_var_list[pol].name] = pol_var_list[pol].eval()
                para[pol + len(vf_var_list)] = pol_var_list[pol].eval()
            f = open("network/%d-%d.txt" % (env.seed, timesteps_so_far), "wb")
            pickle.dump(para, f)
            f.close()
            print("============================= policy is stored =================================")
            break
        elif end_timestep and timesteps_so_far >= end_timestep:
            env = env_list.popleft()
            seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True)
            end_timestep = end_timesteps.popleft()
            new = newround.popleft()
            env_so_far += 1
            if True:
                para = {}
                for vf in range(len(vf_var_list)):
                    # para[vf_var_list[vf].name] = vf_var_list[vf].eval()
                    para[vf] = vf_var_list[vf].eval()
                for pol in range(len(pol_var_list)):
                    # para[pol_var_list[pol].name] = pol_var_list[pol].eval()
                    para[pol + len(vf_var_list)] = pol_var_list[pol].eval()
                f = open("network/%d-%d.txt" % (env.seed, timesteps_so_far), "wb")
                pickle.dump(para, f)
                f.close()
            print("======================== new environment (%s network settings left) ===========================" % len(env_list))
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break
        elif timesteps_so_far == 0:
            para = {}
            for vf in range(len(vf_var_list)):
                # para[vf_var_list[vf].name] = vf_var_list[vf].eval()
                para[vf] = vf_var_list[vf].eval()
            for pol in range(len(pol_var_list)):
                # para[pol_var_list[pol].name] = pol_var_list[pol].eval()
                para[pol + len(vf_var_list)] = pol_var_list[pol].eval()
            f = open("network/%d-%d.txt" % (env.seed, timesteps_so_far), "wb")
            pickle.dump(para, f)
            f.close()

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i, Environment %i ************" % (iters_so_far, env_so_far))

        seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        # for vf in range(len(vf_var_list)):
        #     print(vf_var_list[vf].name, vf_var_list[vf].eval())
        # for pol in range(len(pol_var_list)):
        #     print(pol_var_list[pol].name, pol_var_list[pol].eval())

        record_reward(rewFile, sum(seg["rew"]))
        record_reward(resptimeFile, sum(seg["resptime"]))
        record_reward(pktnumFile, sum(seg["pktnum"]))
        print("total rewards for Iteration %s: %s" % (iters_so_far, sum(seg["rew"])))
        print("average response time: %s, num of pkts: %s" % (sum(seg["resptime"])/sum(seg["pktnum"]), sum(seg["pktnum"])))
        prob = collections.Counter(seg["ac"])  # a dict where elements are stored as dictionary keys and their counts are stored as dictionary values.
        for key in prob:
            prob[key] = prob[key]/len(seg["ac"])
        print("percentage of choosing each controller: %s" % (prob))

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        vf_ob, ac_ob, ac, atarg, tdlamret = seg["vf_ob"], seg['ac_ob'], seg["ac"], seg["adv"], seg["tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate
        d = Dataset(dict(vf_ob=vf_ob, ac_ob=ac_ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent)
        optim_batchsize = optim_batchsize or vf_ob.shape[0]

        # if hasattr(pi, "vf_ob_rms"): pi.vf_ob_rms.update(vf_ob)  # update running mean/std for policy
        # if hasattr(pi, "nn_in_rms"):
        #     temp = ac_ob.reshape(-1,ac_ob.shape[2])
        #     pi.nn_in_rms.update(temp)

        assign_old_eq_new()  # set old parameter values to new parameter values
        logger.log("Optimizing...")
        logger.log(fmt_row(13, loss_names))
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = []  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                # calculate the value function gradient
                vf_g = vf_grad(batch["vf_ob"], batch["vtarg"])
                vf_adam.update(vf_g, optim_stepsize * cur_lrmult)

                # calculate the policy gradient
                pol_g = []
                ratios = []
                pbs_new_batch = []
                pbs_old_batch = []
                e = clip_para(cur_lrmult)[0]
                for sample_id in range(optim_batchsize):
                    sample_ac_ob = batch["ac_ob"][sample_id]
                    sample_ac = batch["ac"][sample_id]
                    probs_new = pi.calculate_ac_prob(sample_ac_ob)
                    prob_new = probs_new[sample_ac]
                    probs_old = oldpi.calculate_ac_prob(sample_ac_ob)
                    prob_old = probs_old[sample_ac]
                    if prob_old == 0:
                        logger.error("pi_old = 0 in %s th iteration %s th epoch %s th sample..." % (iters_so_far, _, sample_id))
                    r = prob_new / prob_old
                    ratios.append(r)
                    pbs_new_batch.append(probs_new)
                    pbs_old_batch.append(probs_old)
                    if (r > 1.0 + e and batch["atarg"][sample_id] > 0) or (r < 1.0 - e and batch["atarg"][sample_id] < 0) or r == 0:
                        dnn_dtheta = pol_nn_grad(sample_ac_ob[0].reshape(1, -1))
                        pol_g.append(0.*dnn_dtheta)
                    else:
                        nn = pi.calculate_ac_value(sample_ac_ob)
                        denominator = np.power(sum(nn), 2)
                        sorted_ind = np.argsort(nn)  # sort the array in ascending order
                        if len(probs_new) == 2:
                            if sample_ac == 0:
                                numerator1 = nn[1]*pol_nn_grad(sample_ac_ob[0].reshape(1,-1))
                                numerator2 = nn[0] * pol_nn_grad(sample_ac_ob[1].reshape(1, -1))
                                dpi_dtheta = -(numerator1-numerator2)/denominator
                            else:
                                numerator1 = nn[1]*pol_nn_grad(sample_ac_ob[0].reshape(1,-1))
                                numerator2 = nn[0]*pol_nn_grad(sample_ac_ob[1].reshape(1,-1))
                                dpi_dtheta = -(numerator2 - numerator1)/denominator

                            # numerator1 = nn[sorted_ind[0]]*pol_nn_grad(sample_ac_ob[sorted_ind[1]].reshape(1,-1))
                            # numerator2 = nn[sorted_ind[1]]*pol_nn_grad(sample_ac_ob[sorted_ind[0]].reshape(1,-1))
                            # dpi_dtheta = (numerator1-numerator2)/denominator

                        elif len(probs_new) == 3:
                            if sample_ac == sorted_ind[0]:
                                # the controller with the lowest probability can still be chosen because its probability is not zero
                                dnn_dtheta = pol_nn_grad(sample_ac_ob[0].reshape(1, -1))
                                pol_g.append(0. * dnn_dtheta)
                            else:
                                numerator1 = sum(nn) * (pol_nn_grad(sample_ac_ob[sample_ac].reshape(1,-1)) + 0.5 * pol_nn_grad(
                                    sample_ac_ob[sorted_ind[0]].reshape(1, -1)))
                                numerator2 = (nn[sample_ac] + 0.5 * nn[sorted_ind[0]]) * pol_nn_grad(sample_ac_ob)
                                dpi_dtheta = -(numerator1 - numerator2) / denominator
                        else:
                            if sample_ac == sorted_ind[-1] or sample_ac == sorted_ind[-2]:
                                numerator1 = sum(nn) * (pol_nn_grad(sample_ac_ob[sample_ac].reshape(1, -1)) + 0.5 * pol_nn_grad(sample_ac_ob[sorted_ind[0:-2]]))
                                numerator2 = (nn[sample_ac]+0.5*sum(nn[sorted_ind[0:-2]])) * pol_nn_grad(sample_ac_ob)
                                dpi_dtheta = -(numerator1 - numerator2) / denominator
                            else:
                                dnn_dtheta = pol_nn_grad(sample_ac_ob[0].reshape(1, -1))
                                pol_g.append(0. * dnn_dtheta)
                        pol_g.append(batch["atarg"][sample_id] * dpi_dtheta / prob_old)

                pol_g_mean = np.mean(np.array(pol_g), axis=0)
                pol_adam.update(pol_g_mean, optim_stepsize * cur_lrmult)

                newlosses = compute_losses(batch["vf_ob"], batch["atarg"], batch["vtarg"],
                                           cur_lrmult, np.array(ratios), np.array(pbs_new_batch), np.array(pbs_old_batch))

                # adam.update(g, optim_stepsize * cur_lrmult)
                losses.append(newlosses)
            logger.log(fmt_row(13, np.mean(losses, axis=0)))

        logger.log("Evaluating losses...")
        # losses = []
        # for batch in d.iterate_once(optim_batchsize):
        #     newlosses = compute_losses(batch["vf_ob"], batch["ac_ob"], batch["ac"], batch["atarg"], batch["vtarg"],
        #                                cur_lrmult)
        #     losses.append(newlosses)
        meanlosses, _, _ = mpi_moments(losses, axis=0)
        logger.log(fmt_row(13, meanlosses))
        for (lossval, name) in zipsame(meanlosses, loss_names):
            logger.record_tabular("loss_" + name, lossval)
        logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret))
        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        if len(lenbuffer) == 0:
            logger.record_tabular("EpLenMean", 0)
            logger.record_tabular("EpRewMean", 0)
        else:
            logger.record_tabular("EpLenMean", np.mean(lenbuffer))
            logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()
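
The checkpoints written above pickle a dict that maps an integer index to the value of each variable in vf_var_list followed by pol_var_list. A hypothetical restore routine for that format could look like the following sketch (function name and session handling are mine, not part of the original code).

import pickle
import tensorflow as tf

def restore_params(path, vf_var_list, pol_var_list, sess=None):
    # Load "network/<seed>-<step>.txt" written by the save blocks above and
    # assign the stored arrays back to the variables, in the same order.
    sess = sess or tf.get_default_session()
    with open(path, "rb") as f:
        para = pickle.load(f)
    ordered_vars = list(vf_var_list) + list(pol_var_list)
    for idx, var in enumerate(ordered_vars):
        sess.run(tf.assign(var, para[idx]))  # fine for a one-off restore
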
Code example #30
File: pposgd_simple.py Project: zhaokang1228/my_git
def learn(
    env,
    policy_fn,
    *,
    timesteps_per_actorbatch,  # timesteps per actor per update
    clip_param,
    entcoeff,  # clipping parameter epsilon, entropy coeff
    optim_epochs,
    optim_stepsize,
    optim_batchsize,  # optimization hypers
    gamma,
    lam,  # advantage estimation
    max_timesteps=0,
    max_episodes=0,
    max_iters=0,
    max_seconds=0,  # time constraint
    callback=None,  # you can do anything in the callback, since it takes locals(), globals()
    adam_epsilon=1e-5,
    schedule='constant'  # annealing for stepsize parameters (epsilon and adam)
):
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_fn("pi", ob_space,
                   ac_space)  # Construct network for new policy
    oldpi = policy_fn("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param,
                             1.0 + clip_param) * atarg  #
    pol_surr = -tf.reduce_mean(tf.minimum(
        surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()

    ## losses + [U.flatgrad(total_loss, var_list)]: how is this list concatenation put together?
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    test_a = U.function([ob, ac, atarg, ret, lrmult], [
        kloldnew, ent, meankl, meanent, pol_entpen,
        pi.pd.logp(ac),
        oldpi.pd.logp(ac), ratio, surr1, surr2, pi.vpred
    ])

    ####################
    pi_parms = U.function([], var_list)
    old_list = oldpi.get_trainable_variables()
    old_parms = U.function([], old_list)
    ####################

    U.initialize()
    adam.sync()

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_actorbatch,
                                     stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0,
         max_seconds > 0]) == 1, "Only one time constraint permitted"

    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)

        seg = seg_gen.__next__()
        # print("ac",np.shape(seg["ac"]), seg["ac"])
        # print("rew",np.shape(seg["rew"]), seg["rew"])
        # print("vpred",np.shape(seg["vpred"]), seg["vpred"])
        # print("new",np.shape(seg["new"]), seg["new"])
        # print("prevac",np.shape(seg["prevac"]), seg["prevac"])
        # print("nextvpred",np.shape(seg["nextvpred"]), seg["nextvpred"])
        # print("ep_rets",np.shape(seg["ep_rets"]), seg["ep_rets"])
        # print("ep_lens",np.shape(seg["ep_lens"]), seg["ep_lens"])
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
            "tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate
        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                    deterministic=pi.recurrent)
        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        assign_old_eq_new()  # set old parameter values to new parameter values
        logger.log("Optimizing...")

        # ############
        # for p in pi_parms():
        #     print("pi", np.sum(p))
        # for p in old_parms():
        #     print("old", np.sum(p))
        # ############
        logger.log(fmt_row(13, loss_names))
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = []  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                *newlosses, g = lossandgrad(batch["ob"], batch["ac"],
                                            batch["atarg"], batch["vtarg"],
                                            cur_lrmult)
                adam.update(g, optim_stepsize * cur_lrmult)
                losses.append(newlosses)
            logger.log(fmt_row(13, np.mean(losses, axis=0)))

        logger.log("Evaluating losses...")
        losses = []
        for batch in d.iterate_once(optim_batchsize):
            newlosses = compute_losses(batch["ob"], batch["ac"],
                                       batch["atarg"], batch["vtarg"],
                                       cur_lrmult)
            losses.append(newlosses)
        meanlosses, _, _ = mpi_moments(losses, axis=0)
        logger.log(fmt_row(13, meanlosses))
        for (lossval, name) in zipsame(meanlosses, loss_names):
            logger.record_tabular("loss_" + name, lossval)
        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))
        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()

    return pi
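
The loop above relies on add_vtarg_and_adv to fill seg["adv"] and seg["tdlamret"] before the minibatch updates. A minimal sketch of such a helper, assuming the segment keys used in this example ("rew", "new", "vpred", "nextvpred"); it follows the common GAE(lambda) recursion and is an illustration, not necessarily the exact function this snippet imports:

import numpy as np

def add_vtarg_and_adv(seg, gamma, lam):
    # new[t] == 1 when a new episode starts at step t; append 0 so new[T] exists.
    new = np.append(seg["new"], 0)
    # Bootstrap with the value predicted for the state after the last collected step.
    vpred = np.append(seg["vpred"], seg["nextvpred"])
    T = len(seg["rew"])
    seg["adv"] = gaelam = np.empty(T, dtype="float32")
    lastgaelam = 0.0
    for t in reversed(range(T)):
        nonterminal = 1 - new[t + 1]
        delta = seg["rew"][t] + gamma * vpred[t + 1] * nonterminal - vpred[t]
        gaelam[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
    # TD(lambda) return used as the value-function regression target.
    seg["tdlamret"] = seg["adv"] + seg["vpred"]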
Code example #31
0
def learn(env, policy_func, reward_giver, expert_dataset, rank,
          pretrained, pretrained_weight, *, clip_param,
          g_step, d_step, entcoeff, save_per_iter,
          optim_epochs, optim_stepsize, optim_batchsize,  # optimization hypers
          ckpt_dir, log_dir, timesteps_per_batch, task_name,
          gamma, lam, d_stepsize=3e-4, adam_epsilon=1e-5,
          max_timesteps=0, max_episodes=0, max_iters=0,
          mix_reward=False, r_lambda=0.44,
          callback=None,
          schedule='constant',  # annealing schedule for the stepsize parameters (epsilon and adam)
          frame_stack=1
          ):

    nworkers = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()
    np.set_printoptions(precision=3)
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ob_space.shape = (ob_space.shape[0] * frame_stack,)
    print(ob_space)
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space, reuse=(pretrained_weight is not None))
    oldpi = policy_func("oldpi", ob_space, ac_space)
    atarg = tf.placeholder(dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold
    surr1 = ratio * atarg # surrogate from conservative policy iteration
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg  # clipped surrogate
    pol_surr = - tf.reduce_mean(tf.minimum(surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)
    d_adam = MpiAdam(reward_giver.get_trainable_variables())

    assign_old_eq_new = U.function([], [], updates=[tf.assign(oldv, newv)
        for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    if rank == 0:
        generator_loss = tf.placeholder(tf.float32, [], name='generator_loss')
        expert_loss = tf.placeholder(tf.float32, [], name='expert_loss')
        entropy = tf.placeholder(tf.float32, [], name='entropy')
        entropy_loss = tf.placeholder(tf.float32, [], name='entropy_loss')
        generator_acc = tf.placeholder(tf.float32, [], name='generator_acc')
        expert_acc = tf.placeholder(tf.float32, [], name='expert_acc')
        eplenmean = tf.placeholder(tf.int32, [], name='eplenmean')
        eprewmean = tf.placeholder(tf.float32, [], name='eprewmean')
        eptruerewmean = tf.placeholder(tf.float32, [], name='eptruerewmean')
        _ops_to_merge = [generator_loss, expert_loss, entropy, entropy_loss, generator_acc, expert_acc, eplenmean, eprewmean, eptruerewmean]
        ops_to_merge = [tf.summary.scalar(op.name, op) for op in _ops_to_merge]

        merged = tf.summary.merge(ops_to_merge)

    @contextmanager
    def timed(msg):
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(colorize("done in %.3f seconds" % (time.time() - tstart), color='magenta'))
        else:
            yield

    def allmean(x):
        assert isinstance(x, np.ndarray)
        out = np.empty_like(x)
        MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
        out /= nworkers
        return out

    U.initialize()
    adam.sync()
    d_adam.sync()

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi, env, reward_giver, timesteps_per_batch, 
        mix_reward, r_lambda,
        stochastic=True, frame_stack=frame_stack)
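    # Each yielded seg is assumed (from its use below) to be a dict with per-step
    # arrays ("ob", "ac", "rew", "vpred", "new", "nextvpred") and per-episode
    # lists ("ep_lens", "ep_rets", "ep_true_rets"); add_vtarg_and_adv later adds
    # "adv" and "tdlamret".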

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards
    true_rewbuffer = deque(maxlen=100)

    assert sum([max_iters > 0, max_timesteps > 0,
                max_episodes > 0]) == 1, "Only one time constraint permitted"

    g_loss_stats = stats(loss_names)
    d_loss_stats = stats(reward_giver.loss_name)
    ep_stats = stats(["True_rewards", "Rewards", "Episode_length"])

    # load the pretrained weight if one is provided
    if pretrained_weight is not None:
        U.load_state(pretrained_weight, var_list=pi.get_variables())

    if rank == 0:
        filenames = [f for f in os.listdir(log_dir) if 'logs' in f]
        writer = tf.summary.FileWriter('{}/logs-{}'.format(log_dir, len(filenames)))

    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break

        # Save model
        if rank == 0 and iters_so_far % save_per_iter == 0 and ckpt_dir is not None:
            fname = os.path.join(ckpt_dir, task_name)
            os.makedirs(os.path.dirname(fname), exist_ok=True)

            from tensorflow.core.protobuf import saver_pb2
            saver = tf.train.Saver(write_version=saver_pb2.SaverDef.V1)
            saver.save(tf.get_default_session(), fname)
            # U.save_state(fname)

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)

        # ------------------ Update G ------------------
        logger.log("Optimizing Policy...")
        for _ in range(g_step):
            with timed("sampling"):
                seg = next(seg_gen)
            add_vtarg_and_adv(seg, gamma, lam)

            ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
            vpredbefore = seg["vpred"]  # predicted value function before update
            atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate
            d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent)
            optim_batchsize = optim_batchsize or ob.shape[0]

            if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob)  # update running mean/std for policy

            assign_old_eq_new()  # set old parameter values to new parameter values

            with timed("policy optimization"):
                logger.log("Optimizing...")
                logger.log(fmt_row(13, loss_names))
                # Here we do a bunch of optimization epochs over the data
                for _ in range(optim_epochs):
                    losses = [] # list of tuples, each of which gives the loss for a minibatch
                    for batch in d.iterate_once(optim_batchsize):
                        *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult)
                        adam.update(g, optim_stepsize * cur_lrmult)
                        losses.append(newlosses)
                    logger.log(fmt_row(13, np.mean(losses, axis=0)))

        logger.log("Evaluating losses...")
        losses = []
        for batch in d.iterate_once(optim_batchsize):
            newlosses = compute_losses(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult)
            losses.append(newlosses)
        meanlosses,_,_ = mpi_moments(losses, axis=0)
        logger.log(fmt_row(13, meanlosses))
        for (lossval, name) in zipsame(meanlosses, loss_names):
            logger.record_tabular("loss_"+name, lossval)
        logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret))


        # ------------------ Update D ------------------
        logger.log("Optimizing Discriminator...")
        logger.log(fmt_row(13, reward_giver.loss_name))
        ob_expert, ac_expert = expert_dataset.get_next_batch(len(ob))
        batch_size = len(ob) // d_step
        d_losses = []  # list of tuples, each of which gives the loss for a minibatch

        for _ in range(optim_epochs // 10):
            for ob_batch, ac_batch in dataset.iterbatches((ob, ac),
                                                          include_final_partial_batch=False,
                                                          batch_size=batch_size):
                ob_expert, ac_expert = expert_dataset.get_next_batch(len(ob_batch))
                # update running mean/std for reward_giver
                ob_batch = ob_batch[:, -ob_expert.shape[1]:][:-30]
                if hasattr(reward_giver, "obs_rms"): reward_giver.obs_rms.update(np.concatenate((ob_batch, ob_expert), 0)[:, :-30])
                # *newlosses, g = reward_giver.lossandgrad(ob_batch, ac_batch, ob_expert, ac_expert)
                *newlosses, g = reward_giver.lossandgrad(ob_batch[:, :-30], ob_expert[:, :-30])
                d_adam.update(allmean(g), d_stepsize)
                d_losses.append(newlosses)
        logger.log(fmt_row(13, np.mean(d_losses, axis=0)))


        lrlocal = (seg["ep_lens"], seg["ep_rets"], seg["ep_true_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews, true_rets = map(flatten_lists, zip(*listoflrpairs))
        true_rewbuffer.extend(true_rets)
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)

        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpTrueRewMean", np.mean(true_rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1

        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)

        if rank == 0 and iters_so_far % 10 == 0:
            disc_losses = np.mean(d_losses, axis=0)
            res = tf.get_default_session().run(merged, feed_dict={
                generator_loss: disc_losses[0],
                expert_loss: disc_losses[1],
                entropy: disc_losses[2],
                entropy_loss: disc_losses[3],
                generator_acc: disc_losses[4],
                expert_acc: disc_losses[5],
                eplenmean: np.mean(lenbuffer),
                eprewmean: np.mean(rewbuffer),
                eptruerewmean: np.mean(true_rewbuffer),
            })
            writer.add_summary(res, iters_so_far)
            writer.flush()

        if rank == 0:
            logger.dump_tabular()
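
Both examples minibatch through a Dataset object built from a dict of equally sized arrays, with a shuffle flag and an iterate_once(batch_size) generator. The sketch below is a minimal, simplified stand-in consistent with those calls; the real helper class these snippets import may offer more (e.g. persistent iteration state), so treat this as an assumption rather than the actual implementation:

import numpy as np

class Dataset(object):
    """Dict of equally sized arrays served as (optionally shuffled) minibatches."""

    def __init__(self, data_map, shuffle=True):
        self.data_map = data_map
        self.shuffle = shuffle
        # All arrays share the same leading dimension (number of samples).
        self.n = next(iter(data_map.values())).shape[0]

    def iterate_once(self, batch_size):
        # Reshuffle every pass when shuffle=True (feed-forward policies);
        # recurrent policies keep the original time ordering.
        if self.shuffle:
            perm = np.random.permutation(self.n)
            self.data_map = {k: v[perm] for k, v in self.data_map.items()}
        for start in range(0, self.n, batch_size):
            yield {k: v[start:start + batch_size]
                   for k, v in self.data_map.items()}

# Usage mirroring the examples above:
# d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=True)
# for batch in d.iterate_once(64):
#     ...  # batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"]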