Example #1
    def policy_loss_ppo(self, pi, oldpi, ac, atarg, ret):
        kl_oldnew = oldpi.pd.kl(pi.pd)
        ent = pi.pd.entropy()
        mean_kl = U.mean(kl_oldnew)
        mean_ent = U.mean(ent)
        pol_entpen = -self._entcoeff * mean_ent

        action_prob = pi.pd.logp(ac) - oldpi.pd.logp(ac)
        ratio = tf.exp(action_prob)
        action_loss = ratio * atarg

        surr1 = ratio * atarg
        surr2 = U.clip(ratio, 1.0 - self._clip_param,
                       1.0 + self._clip_param) * atarg
        pol_surr = -U.mean(tf.minimum(surr1, surr2))
        vf_loss = U.mean(tf.square(pi.vpred - ret))
        total_loss = pol_surr + pol_entpen + vf_loss

        losses = {
            'total_loss': total_loss,
            'action_loss': action_loss,
            'pol_surr': pol_surr,
            'pol_entpen': pol_entpen,
            'kl': mean_kl,
            'entropy': mean_ent,
            'vf_loss': vf_loss
        }
        return losses
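For reference, a minimal NumPy sketch of the clipped surrogate that the graph above encodes, on toy data (the names ratio, adv, and clip_param here are illustrative, not taken from the example):

import numpy as np

def clipped_surrogate(ratio, adv, clip_param=0.2):
    # mirrors pol_surr above: -mean(min(ratio * adv, clip(ratio) * adv))
    surr1 = ratio * adv
    surr2 = np.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * adv
    return -np.mean(np.minimum(surr1, surr2))

# toy check: ratios near 1 with mixed-sign advantages
ratio = np.exp(np.array([-0.1, 0.0, 0.3]))
adv = np.array([1.0, -0.5, 2.0])
print(clipped_surrogate(ratio, adv))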
Example #2
    def __init__(
        self,
        ob_space,
        ac_space,
        model_func,
        clip_param,
        entcoeff,  # clipping parameter epsilon, entropy coeff
        adam_epsilon=1e-5,
    ):

        with tf.variable_scope('pi'):
            self.pi = pi = model_func(ob_space, ac_space)

        with tf.variable_scope('pi_old'):
            self.pi_old = pi_old = model_func(ob_space, ac_space)

        self.adv = tf.placeholder(
            dtype=tf.float32, shape=[None],
            name='adv')  # Target advantage function (if applicable)
        self.ret = tf.placeholder(dtype=tf.float32, shape=[None],
                                  name='ret')  # Empirical return

        self.lrmult = tf.placeholder(
            name='lrmult', dtype=tf.float32,
            shape=[])  # learning rate multiplier, updated with schedule
        clip_param = clip_param * self.lrmult  # Annealed clipping parameter epsilon

        self.ac = ac = pi.pdtype.sample_placeholder([None])

        kloldnew = pi_old.pd.kl(pi.pd)
        ent = pi.pd.entropy()
        meankl = U.mean(kloldnew)
        meanent = U.mean(ent)
        pol_entpen = (-entcoeff) * meanent

        ratio = tf.exp(pi.pd.logp(ac) - pi_old.pd.logp(ac))  # pnew / pold
        surr1 = ratio * self.adv  # surrogate from conservative policy iteration
        surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * self.adv  #
        pol_surr = -U.mean(tf.minimum(
            surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
        vf_loss = U.mean(tf.square(pi.vpred - self.ret))
        self.total_loss = pol_surr + pol_entpen + vf_loss

        # gradients
        self.grads = tf.gradients(self.total_loss, pi.train_vars)
        self.flat_grads = U.flatgrad(self.total_loss, pi.train_vars)

        # optimizer
        self.optimizer = MpiAdam(pi.train_vars, epsilon=adam_epsilon)

        # assign new pi to old pi
        self.op_assign_old_eq_new = tf.group(*[
            tf.assign(oldv, newv)
            for (oldv, newv) in zipsame(pi_old.global_vars, pi.global_vars)
        ])

        U.initialize()
        self.optimizer.sync()
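A minimal NumPy sketch of the entropy penalty term (pol_entpen) above, assuming a diagonal Gaussian policy distribution with a log-std parameterization, as is typical for continuous control; the values are illustrative, not from the example:

import numpy as np

logstd = np.array([[-0.5, -0.5], [0.0, 0.2]])  # per-sample, per-dimension log std
ent = np.sum(logstd + 0.5 * np.log(2.0 * np.pi * np.e), axis=-1)  # diag-Gaussian entropy
entcoeff = 0.01
pol_entpen = -entcoeff * np.mean(ent)
print(pol_entpen)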
Example #3
def load_policy(env, policy_func, *,
                clip_param, entcoeff,  # clipping parameter epsilon, entropy coeff
                adam_epsilon=1e-5,
                model_path, checkpoint):
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space)  # Construct network for new policy
    oldpi = policy_func("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg  #
    pol_surr = - U.mean(tf.minimum(surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = U.mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function([], [], updates=[tf.assign(oldv, newv)
                                                    for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()
    adam.sync()

    U.load_state(os.path.join(model_path, 'model-{}'.format(checkpoint)))

    return pi
Example #4
    def policy_loss_ppo(self, pi, oldpi, ac, atarg, ret, term=None, entcoeff=None):
        kl_oldnew = oldpi.pd.kl(pi.pd)
        ent = pi.pd.entropy()
        mean_kl = U.mean(kl_oldnew)
        mean_ent = U.mean(ent)
        entcoeff = self._entcoeff if entcoeff is None else entcoeff
        logger.info('Policy {} entropy coeff {}'.format(pi.name, entcoeff))
        pol_entpen = -entcoeff * mean_ent

        action_prob = pi.pd.logp(ac) - oldpi.pd.logp(ac)
        action_prob = tf.check_numerics(action_prob, 'check action_prob')
        atarg = tf.check_numerics(atarg, 'check atarg')
        action_loss = tf.exp(action_prob) * atarg
        action_loss = tf.check_numerics(action_loss, 'check action_loss')

        term_loss = None
        if term is not None:
            # ignore prob of actions if term is True
            action_prob = (1 - tf.to_float(term)) * action_prob
            if pi.term_activation == 'sigmoid':
                term_prob = tf.log(pi.term_pred + 1e-5) - tf.clip_by_value(tf.log(oldpi.term_pred + 1e-5), -20, 20)
            else:
                term_prob = pi.term_pd.logp(term) - tf.clip_by_value(oldpi.term_pd.logp(term), -20, 20)
            action_prob += term_prob
            term_loss = tf.exp(term_prob) * atarg
        ratio = tf.exp(action_prob)

        surr1 = ratio * atarg
        surr2 = U.clip(ratio, 1.0 - self._clip_param, 1.0 + self._clip_param) * atarg
        pol_surr = -U.mean(tf.minimum(surr1, surr2))
        vf_loss = U.mean(tf.square(pi.vpred - ret))
        pol_surr = tf.check_numerics(pol_surr, 'check pol_surr')
        vf_loss = tf.check_numerics(vf_loss, 'check vf_loss')
        total_loss = pol_surr + pol_entpen + vf_loss

        total_loss = tf.check_numerics(total_loss, 'check total_loss')
        losses = {'total_loss': total_loss,
                  'action_loss': action_loss,
                  'pol_surr': pol_surr,
                  'pol_entpen': pol_entpen,
                  'kl': mean_kl,
                  'entropy': mean_ent,
                  'vf_loss': vf_loss}
        if term_loss is not None:
            losses.update({'term_loss': term_loss})
        return losses
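A minimal NumPy sketch of how the termination log-probability is folded into the PPO ratio above, on toy arrays (all names and values here are illustrative assumptions):

import numpy as np

action_logp_diff = np.array([0.1, -0.2, 0.05])  # pi.pd.logp(ac) - oldpi.pd.logp(ac)
term = np.array([0.0, 1.0, 0.0])                # 1.0 where the option terminated
term_logp_diff = np.array([-0.3, 0.2, 0.0])     # new minus old termination log-prob

# drop the action term where the option terminated, then add the termination term
combined = (1.0 - term) * action_logp_diff + term_logp_diff
ratio = np.exp(combined)
print(ratio)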
Example #5
    def policy_loss_ppo_term(self, pi, oldpi, atarg, ret, term):
        if pi.term_type == 'sigmoid':
            term_prob = tf.log(pi.term_pred + 1e-5) - tf.clip_by_value(tf.log(oldpi.term_pred + 1e-5), -20, 20)
        else:
            term_prob = pi.term_pd.logp(term) - tf.clip_by_value(oldpi.term_pd.logp(term), -20, 20)
        term_loss = tf.exp(term_prob) * atarg
        ratio = tf.exp(term_prob)

        surr1 = ratio * atarg
        surr2 = U.clip(ratio, 1.0 - self._clip_param, 1.0 + self._clip_param) * atarg
        pol_surr = -U.mean(tf.minimum(surr1, surr2))
        vf_loss = U.mean(tf.square(pi.vpred - ret))
        pol_surr = tf.check_numerics(pol_surr, 'check pol_surr')
        vf_loss = tf.check_numerics(vf_loss, 'check vf_loss')
        total_loss = pol_surr + vf_loss

        total_loss = tf.check_numerics(total_loss, 'check total_loss')
        losses = {'total_loss': total_loss,
                  'pol_surr': pol_surr,
                  'vf_loss': vf_loss,
                  'term_loss': term_loss}
        return losses
Example #6
def learn(env, policy_func, *,
        timesteps_per_batch, # timesteps per actor per update
        clip_param, entcoeff, # clipping parameter epsilon, entropy coeff
        optim_epochs, optim_stepsize, optim_batchsize, # optimization hypers
        gamma, lam, # advantage estimation
        max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint
        noisy_nets=False,
        callback=None, # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant', # annealing for stepsize parameters (epsilon and adam)
        desired_kl=0.02,
        logdir=".",
        agentName="PPO-Agent",
        resume=0,
        num_parallel=1,
        num_cpu=1
        ):
    # Setup losses and stuff
    # ----------------------------------------
    rank = MPI.COMM_WORLD.Get_rank()
    ob_space = env.observation_space
    ac_space = env.action_space

    ob_size = ob_space.shape[0]
    ac_size = ac_space.shape[0]

    #print("rank = " + str(rank) + " ob_space = "+str(ob_space.shape) + " ac_space = "+str(ac_space.shape))
    #exit(0)
    pi = policy_func("pi", ob_space, ac_space, noisy_nets) # Construct network for new policy
    oldpi = policy_func("oldpi", ob_space, ac_space, noisy_nets) # Network for old policy
    atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return

    lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold
    surr1 = ratio * atarg # surrogate from conservative policy iteration
    surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg #
    pol_surr = - U.mean(tf.minimum(surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP)
    vfloss1 = tf.square(pi.vpred - ret)
    vpredclipped = oldpi.vpred + tf.clip_by_value(pi.vpred - oldpi.vpred, -clip_param, clip_param)
    vfloss2 = tf.square(vpredclipped - ret)
    vf_loss = .5 * U.mean(tf.maximum(vfloss1, vfloss2)) # we do the same clipping-based trust region for the value function
    #vf_loss = U.mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv)
        for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()
    adam.sync()

    # Prepare for rollouts
    # ----------------------------------------
    if noisy_nets:
        stochastic = False
    else:
        stochastic = True
    seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=stochastic, num_parallel=num_parallel, num_cpu=num_cpu, rank=rank, ob_size=ob_size, ac_size=ac_size,com=MPI.COMM_WORLD)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards

    saver = tf.train.Saver()
    if resume > 0:
        saver.restore(tf.get_default_session(), os.path.join(os.path.abspath(logdir), "{}-{}".format(agentName, resume)))
    iters_so_far = resume
    assert sum([max_iters>0, max_timesteps>0, max_episodes>0, max_seconds>0])==1, "Only one time constraint permitted"

    logF = open(os.path.join(logdir, 'log.txt'), 'a')
    logStats = open(os.path.join(logdir, 'log_stats.txt'), 'a')

    dump_training = 0
    learn_from_training = 0
    if dump_training:
        if os.path.exists(logdir + "\\" + 'ob_list_' + str(rank) + '.pkl'):
            with open(logdir + "\\" +'ob_list_' + str(rank) + '.pkl', 'rb') as f:
                ob_list = pickle.load(f)
        else:
            ob_list = []

        # , "mean": pi.ob_rms.mean, "std": pi.ob_rms.std
        saverRMS = tf.train.Saver({"_sum": pi.ob_rms._sum, "_sumsq": pi.ob_rms._sumsq, "_count": pi.ob_rms._count})
        saverRMS.save(tf.get_default_session(), os.path.join(os.path.abspath(logdir), "rms.tf"))

        ob_np_a = np.asarray(ob_list)
        ob_np = np.reshape(ob_np_a, (-1,ob_size))
        [vpred, pdparam] = pi._vpred_pdparam(ob_np)

        print("vpred = " + str(vpred))
        print("pd_param = " + str(pdparam))
        with open('training.pkl', 'wb') as f:
            pickle.dump(ob_np, f)
            pickle.dump(vpred, f)
            pickle.dump(pdparam, f)
        exit(0)

    if learn_from_training:
        # , "mean": pi.ob_rms.mean, "std": pi.ob_rms.std
        with open('training.pkl', 'rb') as f:
            ob_np = pickle.load(f)
            vpred = pickle.load(f)
            pdparam = pickle.load(f)
        num = ob_np.shape[0]
        for i in range(num):
            xp = ob_np[i][1]
            ob_np[i][1] = 0.0
            ob_np[i][18] -= xp
            ob_np[i][22] -= xp
            ob_np[i][24] -= xp
            ob_np[i][26] -= xp
            ob_np[i][28] -= xp
            ob_np[i][30] -= xp
            ob_np[i][32] -= xp
            ob_np[i][34] -= xp
        print("ob_np = " + str(ob_np))
        print("vpred = " + str(vpred))
        print("pdparam = " + str(pdparam))
        batch_size = 128

        y_vpred = tf.placeholder(tf.float32, [batch_size, ])
        y_pdparam = tf.placeholder(tf.float32, [batch_size, pdparam.shape[1]])

        vpred_loss = U.mean(tf.square(pi.vpred - y_vpred))
        vpdparam_loss = U.mean(tf.square(pi.pdparam - y_pdparam))

        total_train_loss = vpred_loss + vpdparam_loss
        #total_train_loss = vpdparam_loss
        #total_train_loss = vpred_loss
        #coef = 0.01
        #dense_all = U.dense_all
        #for a in dense_all:
        #   total_train_loss += coef * tf.nn.l2_loss(a)
        #total_train_loss = vpdparam_loss
        optimizer = tf.train.AdamOptimizer(learning_rate=0.001).minimize(total_train_loss)
        d = Dataset(dict(ob=ob_np, vpred=vpred, pdparam=pdparam), shuffle=not pi.recurrent)
        sess = tf.get_default_session()
        sess.run(tf.global_variables_initializer())
        saverRMS = tf.train.Saver({"_sum": pi.ob_rms._sum, "_sumsq": pi.ob_rms._sumsq, "_count": pi.ob_rms._count})
        saverRMS.restore(tf.get_default_session(), os.path.join(os.path.abspath(logdir), "rms.tf"))
        if resume > 0:
            saver.restore(tf.get_default_session(),
                          os.path.join(os.path.abspath(logdir), "{}-{}".format(agentName, resume)))
        for q in range(100):
            sumLoss = 0
            for batch in d.iterate_once(batch_size):
                tl, _ = sess.run([total_train_loss, optimizer], feed_dict={pi.ob: batch["ob"], y_vpred: batch["vpred"], y_pdparam:batch["pdparam"]})
                sumLoss += tl
            print("Iteration " + str(q)+ " Loss = " + str(sumLoss))
        assign_old_eq_new()  # set old parameter values to new parameter values

        # Save as frame 1
        try:
            saver.save(tf.get_default_session(), os.path.join(logdir, agentName), global_step=1)
        except:
            pass
        #exit(0)

    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule in ('adaptive', 'constant'):
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0.0)
        elif schedule == 'linear_clipped':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0.2)
        elif schedule == 'cyclic':
        #    cur_lrmult =  max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
            raise NotImplementedError
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************"%iters_so_far)

        seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam, timesteps_per_batch, num_parallel, num_cpu)
        #print(" ob= " + str(seg["ob"])+ " rew= " + str(seg["rew"])+ " vpred= " + str(seg["vpred"])+ " new= " + str(seg["new"])+ " ac= " + str(seg["ac"])+ " prevac= " + str(seg["prevac"])+ " nextvpred= " + str(seg["nextvpred"])+ " ep_rets= " + str(seg["ep_rets"])+ " ep_lens= " + str(seg["ep_lens"]))

        #exit(0)
        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]

        if dump_training:
            ob_list.append(ob.tolist())
        vpredbefore = seg["vpred"] # predicted value function before update
        atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate
        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent)
        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy

        assign_old_eq_new() # set old parameter values to new parameter values
        logger.log("Optimizing...")
        logger.log(fmt_row(13, loss_names))
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = [] # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult)
                if desired_kl is not None and schedule == 'adaptive':
                    if newlosses[-2] > desired_kl * 2.0:
                        optim_stepsize = max(1e-8, optim_stepsize / 1.5)
                        print('kl divergence was too large = ', newlosses[-2])
                        print('New optim_stepsize = ', optim_stepsize)
                    elif newlosses[-2] < desired_kl / 2.0:
                        optim_stepsize = min(1e0, optim_stepsize * 1.5)
                        print('kl divergence was too small = ', newlosses[-2])
                        print('New optim_stepsize = ', optim_stepsize)
                adam.update(g, optim_stepsize * cur_lrmult)
                losses.append(newlosses)
            #print(str(losses))
            logger.log(fmt_row(13, np.mean(losses, axis=0)))

        logger.log("Evaluating losses...")
        losses = []
        for batch in d.iterate_once(optim_batchsize):
            newlosses = compute_losses(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult)
            losses.append(newlosses)
        meanlosses,_,_ = mpi_moments(losses, axis=0)
        logger.log(fmt_row(13, meanlosses))

        for (lossval, name) in zipsame(meanlosses, loss_names):
            logger.record_tabular("loss_"+name, lossval)
        logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret))
        lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)

        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        rewmean = np.mean(rewbuffer)
        logger.record_tabular("EpRewMean", rewmean)
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1

        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)

        if dump_training:
            with open(logdir + "\\" + 'ob_list_' + str(rank) + '.pkl', 'wb') as f:
                pickle.dump(ob_list, f)

        if MPI.COMM_WORLD.Get_rank()==0:
            logF.write(str(rewmean) + "\n")
            logStats.write(logger.get_str() + "\n")
            logF.flush()
            logStats.flush()

            logger.dump_tabular()

            try:
                os.remove(logdir + "/checkpoint")
            except OSError:
                pass
            try:
                saver.save(tf.get_default_session(), os.path.join(logdir, agentName), global_step=iters_so_far)
            except:
                pass
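For reference, a minimal NumPy sketch of the clipping-based value loss used in the example above (toy arrays; names are illustrative assumptions):

import numpy as np

def clipped_value_loss(vpred, old_vpred, ret, clip_param=0.2):
    # mirrors vf_loss above: 0.5 * mean(max((v - R)^2, (v_clipped - R)^2))
    vfloss1 = np.square(vpred - ret)
    vpred_clipped = old_vpred + np.clip(vpred - old_vpred, -clip_param, clip_param)
    vfloss2 = np.square(vpred_clipped - ret)
    return 0.5 * np.mean(np.maximum(vfloss1, vfloss2))

print(clipped_value_loss(np.array([1.2, 0.4]), np.array([1.0, 0.5]), np.array([1.5, 0.0])))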
Example #7
def learn(
        env,
        policy_func,
        disc,
        *,
        timesteps_per_batch,  # timesteps per actor per update
        clip_param,
        entcoeff,  # clipping parameter epsilon, entropy coeff
        optim_epochs,
        optim_stepsize,
        optim_batchsize,  # optimization hypers
        gamma,
        lam,  # advantage estimation
        max_timesteps=0,
        max_episodes=0,
        max_iters=0,
        max_seconds=0,  # time constraint
        callback=None,  # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
        logdir=".",
        agentName="PPO-Agent",
        resume=0,
        num_parallel=0,
        num_cpu=1,
        num_extra=0,
        gan_batch_size=128,
        gan_num_epochs=5,
        gan_display_step=40,
        resume_disc=0,
        resume_non_disc=0,
        mocap_path="",
        gan_replay_buffer_size=1000000,
        gan_prob_to_put_in_replay=0.01,
        gan_reward_to_retrain_discriminator=5,
        use_distance=0,
        use_blend=0):
    # Deal with GAN
    if not use_distance:
        replay_buf = MyReplayBuffer(gan_replay_buffer_size)
    data = np.loadtxt(
        mocap_path + ".dat"
    )  #"D:/p4sw/devrel/libdev/flex/dev/rbd/data/bvh/motion_simple.dat");
    label = np.concatenate((np.ones(
        (data.shape[0], 1)), np.zeros((data.shape[0], 1))),
                           axis=1)

    print("Real data label = " + str(label))

    mocap_set = Dataset(dict(data=data, label=label), shuffle=True)

    # Setup losses and stuff
    # ----------------------------------------
    rank = MPI.COMM_WORLD.Get_rank()
    ob_space = env.observation_space
    ac_space = env.action_space

    ob_size = ob_space.shape[0]
    ac_size = ac_space.shape[0]

    #print("rank = " + str(rank) + " ob_space = "+str(ob_space.shape) + " ac_space = "+str(ac_space.shape))
    #exit(0)
    pi = policy_func("pi", ob_space,
                     ac_space)  # Construct network for new policy
    oldpi = policy_func("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg  #
    pol_surr = -U.mean(tf.minimum(
        surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vfloss1 = tf.square(pi.vpred - ret)
    vpredclipped = oldpi.vpred + tf.clip_by_value(pi.vpred - oldpi.vpred,
                                                  -clip_param, clip_param)
    vfloss2 = tf.square(vpredclipped - ret)
    vf_loss = .5 * U.mean(
        tf.maximum(vfloss1, vfloss2)
    )  # we do the same clipping-based trust region for the value function
    #vf_loss = U.mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()
    adam.sync()

    # Prepare for rollouts
    # ----------------------------------------
    sess = tf.get_default_session()

    avars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
    non_disc_vars = [
        a for a in avars
        if not a.name.split("/")[0].startswith("discriminator")
    ]
    disc_vars = [
        a for a in avars if a.name.split("/")[0].startswith("discriminator")
    ]
    #print(str(non_disc_names))
    #print(str(disc_names))
    #exit(0)
    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

    disc_saver = tf.train.Saver(disc_vars, max_to_keep=None)
    non_disc_saver = tf.train.Saver(non_disc_vars, max_to_keep=None)
    saver = tf.train.Saver(max_to_keep=None)
    if resume > 0:
        saver.restore(
            tf.get_default_session(),
            os.path.join(os.path.abspath(logdir),
                         "{}-{}".format(agentName, resume)))
        if not use_distance:
            if os.path.exists(logdir + "\\" + 'replay_buf_' +
                              str(int(resume / 100) * 100) + '.pkl'):
                print("Load replay buf")
                with open(
                        logdir + "\\" + 'replay_buf_' +
                        str(int(resume / 100) * 100) + '.pkl', 'rb') as f:
                    replay_buf = pickle.load(f)
            else:
                print("Can't load replay buf " + logdir + "\\" +
                      'replay_buf_' + str(int(resume / 100) * 100) + '.pkl')
    iters_so_far = resume

    if resume_non_disc > 0:
        non_disc_saver.restore(
            tf.get_default_session(),
            os.path.join(
                os.path.abspath(logdir),
                "{}-{}".format(agentName + "_non_disc", resume_non_disc)))
        iters_so_far = resume_non_disc

    if use_distance:
        print("Use distance")
        nn = NearestNeighbors(n_neighbors=1, algorithm='ball_tree').fit(data)
    else:
        nn = None
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     disc,
                                     timesteps_per_batch,
                                     stochastic=True,
                                     num_parallel=num_parallel,
                                     num_cpu=num_cpu,
                                     rank=rank,
                                     ob_size=ob_size,
                                     ac_size=ac_size,
                                     com=MPI.COMM_WORLD,
                                     num_extra=num_extra,
                                     iters_so_far=iters_so_far,
                                     use_distance=use_distance,
                                     nn=nn)

    if resume_disc > 0:
        disc_saver.restore(
            tf.get_default_session(),
            os.path.join(os.path.abspath(logdir),
                         "{}-{}".format(agentName + "_disc", resume_disc)))

    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0,
         max_seconds > 0]) == 1, "Only one time constraint permitted"
    logF = open(logdir + "\\" + 'log.txt', 'a')
    logR = open(logdir + "\\" + 'log_rew.txt', 'a')
    logStats = open(logdir + "\\" + 'log_stats.txt', 'a')
    if os.path.exists(logdir + "\\" + 'ob_list_' + str(rank) + '.pkl'):
        with open(logdir + "\\" + 'ob_list_' + str(rank) + '.pkl', 'rb') as f:
            ob_list = pickle.load(f)
    else:
        ob_list = []

    dump_training = 0
    learn_from_training = 0
    if dump_training:
        # , "mean": pi.ob_rms.mean, "std": pi.ob_rms.std
        saverRMS = tf.train.Saver({
            "_sum": pi.ob_rms._sum,
            "_sumsq": pi.ob_rms._sumsq,
            "_count": pi.ob_rms._count
        })
        saverRMS.save(tf.get_default_session(),
                      os.path.join(os.path.abspath(logdir), "rms.tf"))

        ob_np_a = np.asarray(ob_list)
        ob_np = np.reshape(ob_np_a, (-1, ob_size))
        [vpred, pdparam] = pi._vpred_pdparam(ob_np)

        print("vpred = " + str(vpred))
        print("pd_param = " + str(pdparam))
        with open('training.pkl', 'wb') as f:
            pickle.dump(ob_np, f)
            pickle.dump(vpred, f)
            pickle.dump(pdparam, f)
        exit(0)
    if learn_from_training:
        # , "mean": pi.ob_rms.mean, "std": pi.ob_rms.std

        with open('training.pkl', 'rb') as f:
            ob_np = pickle.load(f)
            vpred = pickle.load(f)
            pdparam = pickle.load(f)
        num = ob_np.shape[0]
        for i in range(num):
            xp = ob_np[i][1]
            ob_np[i][1] = 0.0
            ob_np[i][18] -= xp
            ob_np[i][22] -= xp
            ob_np[i][24] -= xp
            ob_np[i][26] -= xp
            ob_np[i][28] -= xp
            ob_np[i][30] -= xp
            ob_np[i][32] -= xp
            ob_np[i][34] -= xp
        print("ob_np = " + str(ob_np))
        print("vpred = " + str(vpred))
        print("pdparam = " + str(pdparam))
        batch_size = 128

        y_vpred = tf.placeholder(tf.float32, [
            batch_size,
        ])
        y_pdparam = tf.placeholder(tf.float32, [batch_size, pdparam.shape[1]])

        vpred_loss = U.mean(tf.square(pi.vpred - y_vpred))
        vpdparam_loss = U.mean(tf.square(pi.pdparam - y_pdparam))

        total_train_loss = vpred_loss + vpdparam_loss
        #total_train_loss = vpdparam_loss
        #total_train_loss = vpred_loss
        #coef = 0.01
        #dense_all = U.dense_all
        #for a in dense_all:
        #   total_train_loss += coef * tf.nn.l2_loss(a)
        #total_train_loss = vpdparam_loss
        optimizer = tf.train.AdamOptimizer(
            learning_rate=0.001).minimize(total_train_loss)
        d = Dataset(dict(ob=ob_np, vpred=vpred, pdparam=pdparam),
                    shuffle=not pi.recurrent)
        sess = tf.get_default_session()
        sess.run(tf.global_variables_initializer())
        saverRMS = tf.train.Saver({
            "_sum": pi.ob_rms._sum,
            "_sumsq": pi.ob_rms._sumsq,
            "_count": pi.ob_rms._count
        })
        saverRMS.restore(tf.get_default_session(),
                         os.path.join(os.path.abspath(logdir), "rms.tf"))
        if resume > 0:
            saver.restore(
                tf.get_default_session(),
                os.path.join(os.path.abspath(logdir),
                             "{}-{}".format(agentName, resume)))

        for q in range(100):
            sumLoss = 0
            for batch in d.iterate_once(batch_size):
                tl, _ = sess.run(
                    [total_train_loss, optimizer],
                    feed_dict={
                        pi.ob: batch["ob"],
                        y_vpred: batch["vpred"],
                        y_pdparam: batch["pdparam"]
                    })
                sumLoss += tl
            print("Iteration " + str(q) + " Loss = " + str(sumLoss))
        assign_old_eq_new()  # set old parameter values to new parameter values

        # Save as frame 1
        try:
            saver.save(tf.get_default_session(),
                       os.path.join(logdir, agentName),
                       global_step=1)
        except:
            pass
        #exit(0)
    if resume > 0:
        firstTime = False
    else:
        firstTime = True

    # Check accuracy
    #amocap = sess.run([disc.accuracy],
    #                feed_dict={disc.input: data,
    #                           disc.label: label})
    #print("Mocap accuracy = " + str(amocap))
    #print("Mocap label is " + str(label))

    #adata = np.array(replay_buf._storage)
    #print("adata shape = " + str(adata.shape))
    #alabel = np.concatenate((np.zeros((adata.shape[0], 1)), np.ones((adata.shape[0], 1))), axis=1)

    #areplay = sess.run([disc.accuracy],
    #                feed_dict={disc.input: adata,
    #                           disc.label: alabel})
    #print("Replay accuracy = " + str(areplay))
    #print("Replay label is " + str(alabel))
    #exit(0)
    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)

        seg = seg_gen.__next__()

        add_vtarg_and_adv(seg, gamma, lam, timesteps_per_batch, num_parallel,
                          num_cpu)
        #print(" ob= " + str(seg["ob"])+ " rew= " + str(seg["rew"])+ " vpred= " + str(seg["vpred"])+ " new= " + str(seg["new"])+ " ac= " + str(seg["ac"])+ " prevac= " + str(seg["prevac"])+ " nextvpred= " + str(seg["nextvpred"])+ " ep_rets= " + str(seg["ep_rets"])+ " ep_lens= " + str(seg["ep_lens"]))

        #exit(0)
        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret, extra = seg["ob"], seg["ac"], seg["adv"], seg[
            "tdlamret"], seg["extra"]

        #ob_list.append(ob.tolist())
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate
        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                    shuffle=not pi.recurrent)
        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        assign_old_eq_new()  # set old parameter values to new parameter values
        logger.log("Optimizing...")
        logger.log(fmt_row(13, loss_names))
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = [
            ]  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                *newlosses, g = lossandgrad(batch["ob"], batch["ac"],
                                            batch["atarg"], batch["vtarg"],
                                            cur_lrmult)
                adam.update(g, optim_stepsize * cur_lrmult)
                losses.append(newlosses)
            #print(str(losses))
            logger.log(fmt_row(13, np.mean(losses, axis=0)))

        logger.log("Evaluating losses...")
        losses = []
        for batch in d.iterate_once(optim_batchsize):
            newlosses = compute_losses(batch["ob"], batch["ac"],
                                       batch["atarg"], batch["vtarg"],
                                       cur_lrmult)
            losses.append(newlosses)
        meanlosses, _, _ = mpi_moments(losses, axis=0)
        logger.log(fmt_row(13, meanlosses))
        for (lossval, name) in zipsame(meanlosses, loss_names):
            logger.record_tabular("loss_" + name, lossval)
        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))
        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        rewmean = np.mean(rewbuffer)
        logger.record_tabular("EpRewMean", rewmean)
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)

        # Train discriminator
        if not use_distance:
            print("Put in replay buf " +
                  str((int)(gan_prob_to_put_in_replay * extra.shape[0] + 1)))
            replay_buf.add(extra[np.random.choice(
                extra.shape[0],
                (int)(gan_prob_to_put_in_replay * extra.shape[0] + 1),
                replace=True)])
            #if iters_so_far == 1:
            if not use_blend:
                if firstTime:
                    firstTime = False
                    # Train with everything we got
                    lb = np.concatenate((np.zeros(
                        (extra.shape[0], 1)), np.ones((extra.shape[0], 1))),
                                        axis=1)
                    extra_set = Dataset(dict(data=extra, label=lb),
                                        shuffle=True)
                    for e in range(10):
                        i = 0
                        for mbatch in mocap_set.iterate_once(gan_batch_size):
                            batch = extra_set.next_batch(gan_batch_size)
                            _, l = sess.run(
                                [disc.optimizer_first, disc.loss],
                                feed_dict={
                                    disc.input:
                                    np.concatenate(
                                        (mbatch['data'], batch['data'])),
                                    disc.label:
                                    np.concatenate(
                                        (mbatch['label'], batch['label']))
                                })
                            i = i + 1
                            # Display logs per step
                            if i % gan_display_step == 0 or i == 1:
                                print(
                                    'discriminator epoch %i Step %i: Minibatch Loss: %f'
                                    % (e, i, l))
                        print(
                            'discriminator epoch %i Step %i: Minibatch Loss: %f'
                            % (e, i, l))
                if seg['mean_ext_rew'] > gan_reward_to_retrain_discriminator:
                    for e in range(gan_num_epochs):
                        i = 0
                        for mbatch in mocap_set.iterate_once(gan_batch_size):
                            data = replay_buf.sample(mbatch['data'].shape[0])
                            lb = np.concatenate((np.zeros(
                                (data.shape[0], 1)), np.ones(
                                    (data.shape[0], 1))),
                                                axis=1)
                            _, l = sess.run(
                                [disc.optimizer, disc.loss],
                                feed_dict={
                                    disc.input:
                                    np.concatenate((mbatch['data'], data)),
                                    disc.label:
                                    np.concatenate((mbatch['label'], lb))
                                })
                            i = i + 1
                            # Display logs per step
                            if i % gan_display_step == 0 or i == 1:
                                print(
                                    'discriminator epoch %i Step %i: Minibatch Loss: %f'
                                    % (e, i, l))
                        print(
                            'discriminator epoch %i Step %i: Minibatch Loss: %f'
                            % (e, i, l))
            else:
                if firstTime:
                    firstTime = False
                    # Train with everything we got
                    extra_set = Dataset(dict(data=extra), shuffle=True)
                    for e in range(10):
                        i = 0
                        for mbatch in mocap_set.iterate_once(gan_batch_size):
                            batch = extra_set.next_batch(gan_batch_size)
                            bf = np.random.uniform(0, 1, (gan_batch_size, 1))
                            onembf = 1 - bf
                            my_label = np.concatenate((bf, onembf), axis=1)
                            my_data = np.multiply(mbatch['data'],
                                                  bf) + np.multiply(
                                                      batch['data'], onembf)
                            _, l = sess.run([disc.optimizer_first, disc.loss],
                                            feed_dict={
                                                disc.input: my_data,
                                                disc.label: my_label
                                            })
                            i = i + 1
                            # Display logs per step
                            if i % gan_display_step == 0 or i == 1:
                                print(
                                    'discriminator epoch %i Step %i: Minibatch Loss: %f'
                                    % (e, i, l))
                        print(
                            'discriminator epoch %i Step %i: Minibatch Loss: %f'
                            % (e, i, l))
                if seg['mean_ext_rew'] > gan_reward_to_retrain_discriminator:
                    for e in range(gan_num_epochs):
                        i = 0
                        for mbatch in mocap_set.iterate_once(gan_batch_size):
                            data = replay_buf.sample(mbatch['data'].shape[0])

                            bf = np.random.uniform(0, 1, (gan_batch_size, 1))
                            onembf = 1 - bf
                            my_label = np.concatenate((bf, onembf), axis=1)
                            my_data = np.multiply(mbatch['data'],
                                                  bf) + np.multiply(
                                                      data, onembf)

                            _, l = sess.run([disc.optimizer_first, disc.loss],
                                            feed_dict={
                                                disc.input: my_data,
                                                disc.label: my_label
                                            })
                            i = i + 1
                            # Display logs per step
                            if i % gan_display_step == 0 or i == 1:
                                print(
                                    'discriminator epoch %i Step %i: Minibatch Loss: %f'
                                    % (e, i, l))
                        print(
                            'discriminator epoch %i Step %i: Minibatch Loss: %f'
                            % (e, i, l))

        # if True:
        #     lb = np.concatenate((np.zeros((extra.shape[0],1)),np.ones((extra.shape[0],1))),axis=1)
        #     extra_set = Dataset(dict(data=extra,label=lb), shuffle=True)
        #     num_r = 1
        #     if iters_so_far == 1:
        #         num_r = gan_num_epochs
        #     for e in range(num_r):
        #         i = 0
        #         for batch in extra_set.iterate_once(gan_batch_size):
        #             mbatch = mocap_set.next_batch(gan_batch_size)
        #             _, l = sess.run([disc.optimizer, disc.loss], feed_dict={disc.input: np.concatenate((mbatch['data'],batch['data'])), disc.label: np.concatenate((mbatch['label'],batch['label']))})
        #             i = i + 1
        #             # Display logs per step
        #             if i % gan_display_step == 0 or i == 1:
        #                 print('discriminator epoch %i Step %i: Minibatch Loss: %f' % (e, i, l))
        #         print('discriminator epoch %i Step %i: Minibatch Loss: %f' % (e, i, l))

        if not use_distance:
            if iters_so_far % 100 == 0:
                with open(
                        logdir + "\\" + 'replay_buf_' + str(iters_so_far) +
                        '.pkl', 'wb') as f:
                    pickle.dump(replay_buf, f)

        with open(logdir + "\\" + 'ob_list_' + str(rank) + '.pkl', 'wb') as f:
            pickle.dump(ob_list, f)
        if MPI.COMM_WORLD.Get_rank() == 0:
            logF.write(str(rewmean) + "\n")
            logR.write(str(seg['mean_ext_rew']) + "\n")
            logStats.write(logger.get_str() + "\n")
            logF.flush()
            logStats.flush()
            logR.flush()

            logger.dump_tabular()

            try:
                os.remove(logdir + "/checkpoint")
            except OSError:
                pass
            try:
                saver.save(tf.get_default_session(),
                           os.path.join(logdir, agentName),
                           global_step=iters_so_far)
            except:
                pass
            try:
                non_disc_saver.save(tf.get_default_session(),
                                    os.path.join(logdir,
                                                 agentName + "_non_disc"),
                                    global_step=iters_so_far)
            except:
                pass
            try:
                disc_saver.save(tf.get_default_session(),
                                os.path.join(logdir, agentName + "_disc"),
                                global_step=iters_so_far)
            except:
                pass
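A minimal NumPy sketch of the label-blending trick used in the use_blend branch above, assuming placeholder batches (shapes and names here are illustrative, not from the example):

import numpy as np

mocap_batch = np.random.randn(128, 32)   # stand-in for a batch of reference motion features
policy_batch = np.random.randn(128, 32)  # stand-in for a batch of policy rollout features
bf = np.random.uniform(0, 1, (128, 1))
label = np.concatenate((bf, 1.0 - bf), axis=1)          # soft real/fake targets
blended = bf * mocap_batch + (1.0 - bf) * policy_batch  # convex mix fed to the discriminator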
Example #8
def learn(env, policy_func, *,
        timesteps_per_actorbatch, # timesteps per actor per update
        clip_param, entcoeff, # clipping parameter epsilon, entropy coeff
        optim_epochs, optim_stepsize, optim_batchsize,# optimization hypers
        gamma, lam, # advantage estimation
        max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0,  # time constraint
        callback=None, # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant' # annealing for stepsize parameters (epsilon and adam)
        ):
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space) # Construct network for new policy
    oldpi = policy_func("oldpi", ob_space, ac_space) # Network for old policy
    atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return

    lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold
    surr1 = ratio * atarg # surrogate from conservative policy iteration
    surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg #
    pol_surr = - U.mean(tf.minimum(surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = U.mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv)
        for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()
    adam.sync()

    # Prepare for rollouts
    # ----------------------------------------
    #seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards

    assert sum([max_iters>0, max_timesteps>0, max_episodes>0, max_seconds>0])==1, "Only one time constraint permitted"

    while True:
        data_path = '/Users/wjh720/Desktop/Tmp/para_%i/' % (timesteps_per_actorbatch / 100)
        U.load_state(data_path + 'para')

        test(pi, env, timesteps_per_actorbatch, stochastic=True)
def learn(
        env,
        policy_func,
        *,
        timesteps_per_batch,  # timesteps per actor per update
        clip_param,
        entcoeff,  # clipping parameter epsilon, entropy coeff
        optim_epochs,
        optim_stepsize,
        optim_batchsize,  # optimization hypers
        gamma,
        lam,  # advantage estimation
        max_timesteps=0,
        max_episodes=0,
        max_iters=0,
        max_seconds=0,  # time constraint
        callback=None,  # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
        num_options=1,
        app='',
        saves=False,
        wsaves=False,
        epoch=-1,
        seed=1,
        dc=0):

    optim_batchsize_ideal = optim_batchsize
    np.random.seed(seed)
    tf.set_random_seed(seed)
    env.seed(seed)

    ### Book-keeping
    gamename = env.spec.id[:-3].lower()
    gamename += 'seed' + str(seed)
    gamename += app
    version_name = 'FINAL_NORM-ACT-LOWER-LR-len-400-wNoise-update1-ppo-ESCH-1-2-5-nI'

    dirname = '{}_{}_{}opts_saves/'.format(version_name, gamename, num_options)
    print(dirname)
    #input ("wait here after dirname")

    if wsaves:
        first = True
        if not os.path.exists(dirname):
            os.makedirs(dirname)
            first = False
        # while os.path.exists(dirname) and first:
        #     dirname += '0'

        files = ['pposgd_simple.py', 'mlp_policy.py', 'run_mujoco.py']
        first = True
        for i in range(len(files)):
            src = os.path.join(
                '/home/nfunk/Code_MA/ppoc_off_tryout/baselines/baselines/ppo1/'
            ) + files[i]
            print(src)
            #dest = os.path.join('/home/nfunk/results_NEW/ppo1/') + dirname
            dest = dirname + "src_code/"
            if (first):
                os.makedirs(dest)
                first = False
            print(dest)
            shutil.copy2(src, dest)
        # brute force copy normal env file at end of copying process:
        src = os.path.join(
            '/home/nfunk/Code_MA/ppoc_off_tryout/nfunk/envs_nf/pendulum_nf.py')
        shutil.copy2(src, dest)
    ###

    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    max_action = env.action_space.high

    # add the dimension in the observation space!
    ob_space.shape = ((ob_space.shape[0] + ac_space.shape[0]), )
    print(ob_space.shape)
    print(ac_space.shape)
    #input ("wait here where the spaces are printed!!!")
    pi = policy_func("pi", ob_space,
                     ac_space)  # Construct network for new policy
    oldpi = policy_func("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return
    pol_ov_op_ent = tf.placeholder(dtype=tf.float32,
                                   shape=None)  # Empirical return

    # option = tf.placeholder(dtype=tf.int32, shape=[None])

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    # pdb.set_trace()
    ob = U.get_placeholder_cached(name="ob")
    option = U.get_placeholder_cached(name="option")
    term_adv = U.get_placeholder(name='term_adv',
                                 dtype=tf.float32,
                                 shape=[None])

    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    atarg_clip = atarg  #tf.clip_by_value(atarg,-10,10)
    surr1 = ratio * atarg_clip  #atarg # surrogate from conservative policy iteration
    surr2 = U.clip(ratio, 1.0 - clip_param,
                   1.0 + clip_param) * atarg_clip  #atarg #
    pol_surr = -U.mean(tf.minimum(
        surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)

    #vf_loss = U.mean(tf.square(tf.clip_by_value(pi.vpred - ret, -10.0, 10.0)))
    vf_loss = U.mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    term_loss = pi.tpred * term_adv

    force_pi_loss = U.mean(
        tf.square(
            tf.clip_by_value(pi.op_pi, 1e-5, 1.0) -
            tf.constant([[0.05, 0.95]])))

    log_pi = tf.log(tf.clip_by_value(pi.op_pi, 1e-5, 1.0))
    #log_pi = tf.Print(log_pi, [log_pi, tf.shape(tf.transpose(log_pi))])
    old_log_pi = tf.log(tf.clip_by_value(oldpi.op_pi, 1e-5, 1.0))
    entropy = -tf.reduce_sum(pi.op_pi * log_pi, reduction_indices=1)

    ratio_pol_ov_op = tf.exp(
        tf.transpose(log_pi)[option[0]] -
        tf.transpose(old_log_pi)[option[0]])  # pnew / pold
    term_adv_clip = term_adv  #tf.clip_by_value(term_adv,-10,10)
    surr1_pol_ov_op = ratio_pol_ov_op * term_adv_clip  # surrogate from conservative policy iteration
    surr2_pol_ov_op = U.clip(ratio_pol_ov_op, 1.0 - clip_param,
                             1.0 + clip_param) * term_adv_clip  #
    pol_surr_pol_ov_op = -U.mean(
        tf.minimum(surr1_pol_ov_op,
                   surr2_pol_ov_op))  # PPO's pessimistic surrogate (L^CLIP)

    op_loss = pol_surr_pol_ov_op - pol_ov_op_ent * tf.reduce_sum(entropy)
    #op_loss = pol_surr_pol_ov_op

    #total_loss += force_pi_loss
    total_loss += op_loss

    var_list = pi.get_trainable_variables()
    term_list = var_list[6:8]
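    # Note: term_list slices two variables out of var_list by position
    # (indices 6:8), presumably the termination network's parameters; it is not
    # used further below (termloss differentiates w.r.t. the full var_list).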

    lossandgrad = U.function(
        [ob, ac, atarg, ret, lrmult, option, term_adv, pol_ov_op_ent],
        losses + [U.flatgrad(total_loss, var_list)])
    termloss = U.function([ob, option, term_adv],
                          [U.flatgrad(term_loss, var_list)
                           ])  # Since we will use a different step size.
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult, option], losses)

    U.initialize()
    adam.sync()

    saver = tf.train.Saver(max_to_keep=10000)
    saver_best = tf.train.Saver(max_to_keep=1)

    ### More book-keeping
    results = []
    if saves:
        results = open(
            dirname + version_name + '_' + gamename + '_' + str(num_options) +
            'opts_' + '_results.csv', 'w')
        results_best_model = open(
            dirname + version_name + '_' + gamename + '_' + str(num_options) +
            'opts_' + '_bestmodel.csv', 'w')

        out = 'epoch,avg_reward'

        for opt in range(num_options):
            out += ',option {} dur'.format(opt)
        for opt in range(num_options):
            out += ',option {} std'.format(opt)
        for opt in range(num_options):
            out += ',option {} term'.format(opt)
        for opt in range(num_options):
            out += ',option {} adv'.format(opt)
        out += '\n'
        results.write(out)
        # results.write('epoch,avg_reward,option 1 dur, option 2 dur, option 1 term, option 2 term\n')
        results.flush()

    if epoch >= 0:

        dirname = '{}_{}opts_saves/'.format(gamename, num_options)
        print("Loading weights from iteration: " + str(epoch))

        filename = dirname + '{}_epoch_{}.ckpt'.format(gamename, epoch)
        saver.restore(U.get_session(), filename)
    ###

    episodes_so_far = 0
    timesteps_so_far = 0
    global iters_so_far
    iters_so_far = 0
    des_pol_op_ent = 0.1
    max_val = -100000
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0,
         max_seconds > 0]) == 1, "Only one time constraint permitted"

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_batch,
                                     stochastic=True,
                                     num_options=num_options,
                                     saves=saves,
                                     results=results,
                                     rewbuffer=rewbuffer,
                                     dc=dc)

    datas = [0 for _ in range(num_options)]
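    # datas holds one Dataset per option; a 0 entry means no data has been
    # stored for that option yet. Too-small batches are merged with data kept
    # from previous iterations further below, so no samples are discarded.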

    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)

        seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)
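        # add_vtarg_and_adv fills seg["adv"] and seg["tdlamret"] with GAE(lambda)
        # estimates (assuming the standard baselines implementation):
        #   delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
        #   A_t     = delta_t + gamma * lam * A_{t+1},   tdlamret = adv + vpred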

        opt_d = []
        for i in range(num_options):
            dur = np.mean(
                seg['opt_dur'][i]) if len(seg['opt_dur'][i]) > 0 else 0.
            opt_d.append(dur)

        std = []
        for i in range(num_options):
            logstd = np.mean(
                seg['logstds'][i]) if len(seg['logstds'][i]) > 0 else 0.
            std.append(np.exp(logstd))
        print("mean opt dur:", opt_d)
        print("mean op pol:", np.mean(np.array(seg['optpol_p']), axis=0))
        print("mean term p:", np.mean(np.array(seg['term_p']), axis=0))
        print("mean value val:", np.mean(np.array(seg['value_val']), axis=0))

        ob, ac, opts, atarg, tdlamret = seg["ob"], seg["ac"], seg["opts"], seg[
            "adv"], seg["tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy
        if hasattr(pi, "ob_rms_only"):
            pi.ob_rms_only.update(ob[:, :-ac_space.shape[0]]
                                  )  # update running mean/std for policy
        assign_old_eq_new()  # set old parameter values to new parameter values

        if (iters_so_far + 1) % 1000 == 0:
            des_pol_op_ent = des_pol_op_ent / 10

        if iters_so_far % 50 == 0 and wsaves:
            print("weights are saved...")
            filename = dirname + '{}_epoch_{}.ckpt'.format(
                gamename, iters_so_far)
            save_path = saver.save(U.get_session(), filename)

        # adaptively save best run:
        if (np.mean(rewbuffer) > max_val) and wsaves:
            max_val = np.mean(rewbuffer)
            results_best_model.write('epoch: ' + str(iters_so_far) + 'rew: ' +
                                     str(np.mean(rewbuffer)) + '\n')
            results_best_model.flush()
            filename = dirname + 'best.ckpt'
            save_path = saver_best.save(U.get_session(), filename)

        min_batch = 160  # Arbitrary
        t_advs = [[] for _ in range(num_options)]
        for opt in range(num_options):
            indices = np.where(opts == opt)[0]
            print("batch size:", indices.size)
            opt_d[opt] = indices.size
            if not indices.size:
                t_advs[opt].append(0.)
                continue

            ### This part is only necessary when we use options. These checks ensure that no collected trajectories are discarded.
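            # Per-option batch handling, summarized:
            #  - new batch small but stored data already large: store only the
            #    new batch and skip this option's update for now;
            #  - combined size still below min_batch: concatenate and skip;
            #  - combined size reaches min_batch: concatenate and train on it;
            #  - both new batch and stored data exceed min_batch: train on the
            #    new batch only;
            #  - nothing stored yet (datas[opt] == 0): start with the new batch.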
            if datas[opt] != 0:
                if (indices.size < min_batch and datas[opt].n > min_batch):
                    datas[opt] = Dataset(dict(ob=ob[indices],
                                              ac=ac[indices],
                                              atarg=atarg[indices],
                                              vtarg=tdlamret[indices]),
                                         shuffle=not pi.recurrent)
                    t_advs[opt].append(0.)
                    continue

                elif indices.size + datas[opt].n < min_batch:
                    # pdb.set_trace()
                    oldmap = datas[opt].data_map

                    cat_ob = np.concatenate((oldmap['ob'], ob[indices]))
                    cat_ac = np.concatenate((oldmap['ac'], ac[indices]))
                    cat_atarg = np.concatenate(
                        (oldmap['atarg'], atarg[indices]))
                    cat_vtarg = np.concatenate(
                        (oldmap['vtarg'], tdlamret[indices]))
                    datas[opt] = Dataset(dict(ob=cat_ob,
                                              ac=cat_ac,
                                              atarg=cat_atarg,
                                              vtarg=cat_vtarg),
                                         shuffle=not pi.recurrent)
                    t_advs[opt].append(0.)
                    continue

                elif (indices.size + datas[opt].n > min_batch and datas[opt].n
                      < min_batch) or (indices.size > min_batch
                                       and datas[opt].n < min_batch):

                    oldmap = datas[opt].data_map
                    cat_ob = np.concatenate((oldmap['ob'], ob[indices]))
                    cat_ac = np.concatenate((oldmap['ac'], ac[indices]))
                    cat_atarg = np.concatenate(
                        (oldmap['atarg'], atarg[indices]))
                    cat_vtarg = np.concatenate(
                        (oldmap['vtarg'], tdlamret[indices]))
                    datas[opt] = d = Dataset(dict(ob=cat_ob,
                                                  ac=cat_ac,
                                                  atarg=cat_atarg,
                                                  vtarg=cat_vtarg),
                                             shuffle=not pi.recurrent)

                if (indices.size > min_batch and datas[opt].n > min_batch):
                    datas[opt] = d = Dataset(dict(ob=ob[indices],
                                                  ac=ac[indices],
                                                  atarg=atarg[indices],
                                                  vtarg=tdlamret[indices]),
                                             shuffle=not pi.recurrent)

            elif datas[opt] == 0:
                datas[opt] = d = Dataset(dict(ob=ob[indices],
                                              ac=ac[indices],
                                              atarg=atarg[indices],
                                              vtarg=tdlamret[indices]),
                                         shuffle=not pi.recurrent)
            ###

            optim_batchsize = optim_batchsize or ob.shape[0]
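            # Note: the np.clip(..., 10, 10) below pins optim_epochs to 10
            # whenever num_options > 1 (the scaled value is clipped to the
            # degenerate range [10, 10]); with a single option the passed-in
            # optim_epochs is used unchanged.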
            optim_epochs = np.clip(
                np.int(10 * (indices.size /
                             (timesteps_per_batch / num_options))), 10,
                10) if num_options > 1 else optim_epochs
            print("optim epochs:", optim_epochs)
            logger.log("Optimizing...")

            # Here we do a bunch of optimization epochs over the data
            for _ in range(optim_epochs):
                losses = [
                ]  # list of tuples, each of which gives the loss for a minibatch
                for batch in d.iterate_once(optim_batchsize):

                    #tadv,nodc_adv = pi.get_term_adv(batch["ob"],[opt])
                    tadv, nodc_adv = pi.get_opt_adv(batch["ob"], [opt])
                    tadv = tadv if num_options > 1 else np.zeros_like(tadv)
                    t_advs[opt].append(nodc_adv)

                    #if (opt==1):
                    #    *newlosses, grads = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult, [opt], tadv)
                    #else:
                    #    *newlosses, grads = lossandgrad0(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult, [opt], tadv)
                    *newlosses, grads = lossandgrad(batch["ob"], batch["ac"],
                                                    batch["atarg"],
                                                    batch["vtarg"], cur_lrmult,
                                                    [opt], tadv,
                                                    des_pol_op_ent)
                    #*newlosses, grads = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult, [opt], tadv)
                    #termg = termloss(batch["ob"], [opt], tadv)
                    #adam.update(termg[0], 5e-7 * cur_lrmult)
                    adam.update(grads, optim_stepsize * cur_lrmult)
                    losses.append(newlosses)

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()

        ### Book keeping
        if saves:
            out = "{},{}"
            for _ in range(num_options):
                out += ",{},{},{},{}"
            out += "\n"

            info = [iters_so_far, np.mean(rewbuffer)]
            for i in range(num_options):
                info.append(opt_d[i])
            for i in range(num_options):
                info.append(std[i])
            for i in range(num_options):
                info.append(np.mean(np.array(seg['term_p']), axis=0)[i])
            for i in range(num_options):
                info.append(np.mean(t_advs[i]))

            results.write(out.format(*info))
            results.flush()
Exemple #10
0
def learn(
    env,
    policy_func,
    *,
    timesteps_per_batch,  # timesteps per actor per update
    clip_param,
    entcoeff,  # clipping parameter epsilon, entropy coeff
    optim_epochs,
    optim_stepsize,
    optim_batchsize,  # optimization hypers
    gamma,
    lam,  # advantage estimation
    max_timesteps=0,
    max_episodes=0,
    max_iters=0,
    max_seconds=0,  # time constraint
    callback=None,  # you can do anything in the callback, since it takes locals(), globals()
    adam_epsilon=1e-5,
    schedule='constant'  # annealing for stepsize parameters (epsilon and adam)
):
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space,
                     ac_space)  # Construct network for new policy
    oldpi = policy_func("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg  #
    pol_surr = -U.mean(tf.minimum(
        surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = U.mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()
    adam.sync()

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_batch,
                                     stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    rollouts_time = 0
    optimization_time = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0,
         max_seconds > 0]) == 1, "Only one time constraint permitted"

    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)
        a = time.time()

        seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
            "tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate
        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                    shuffle=not pi.recurrent)
        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        assign_old_eq_new()  # set old parameter values to new parameter values
        b = time.time()
        logger.log("Optimizing...")
        logger.log(fmt_row(13, loss_names))
        grad_time = 0.0
        allreduce_time = 0.0
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = [
            ]  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                aa = time.time()
                *newlosses, g = lossandgrad(batch["ob"], batch["ac"],
                                            batch["atarg"], batch["vtarg"],
                                            cur_lrmult)
                bb = time.time()
                adam.update(g, optim_stepsize * cur_lrmult)
                cc = time.time()
                grad_time += bb - aa
                allreduce_time += cc - bb
                losses.append(newlosses)
            logger.log(fmt_row(13, np.mean(losses, axis=0)))

        logger.log("Evaluating losses...")
        losses = []
        for batch in d.iterate_once(optim_batchsize):
            newlosses = compute_losses(batch["ob"], batch["ac"],
                                       batch["atarg"], batch["vtarg"],
                                       cur_lrmult)
            losses.append(newlosses)
        meanlosses, _, _ = mpi_moments(losses, axis=0)
        logger.log(fmt_row(13, meanlosses))
        for (lossval, name) in zipsame(meanlosses, loss_names):
            logger.record_tabular("loss_" + name, lossval)
        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))
        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        logger.record_tabular("GradTime", grad_time)
        logger.record_tabular("AllReduceTime", allreduce_time)
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        c = time.time()
        rollouts_time += (b - a)
        optimization_time += (c - b)
        logger.record_tabular("RolloutsTime", rollouts_time)
        logger.record_tabular("OptimizationTime", optimization_time)
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()
def learn(env, policy_func, *,
        timesteps_per_batch, # timesteps per actor per update
        clip_param, entcoeff, # clipping parameter epsilon, entropy coeff
        optim_epochs, optim_stepsize, optim_batchsize,# optimization hypers
        gamma, lam, # advantage estimation
        max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0,  # time constraint
        callback=None, # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant', # annealing for stepsize parameters (epsilon and adam)
        num_options=1,
        app='',
        saves=False,
        wsaves=False,
        epoch=-1,
        seed=1,
        dc=0
        ):


    optim_batchsize_ideal = optim_batchsize 
    np.random.seed(seed)
    tf.set_random_seed(seed)
    env.seed(seed)

    ### Book-keeping
    gamename = env.spec.id[:-3].lower()
    gamename += 'seed' + str(seed)
    gamename += app
    # version_name defines the name of this training run
    version_name = '25er_alternation_SEPARATE_optimization-ppo-ESCH-1-0-0-nI' 

    dirname = '{}_{}_{}opts_saves/'.format(version_name,gamename,num_options)
    print (dirname)

    # retrieve everything using relative paths. Create a train_results folder where the repo has been cloned
    dirname_rel = os.path.dirname(__file__)
    splitted = dirname_rel.split("/")
    dirname_rel = ("/".join(dirname_rel.split("/")[:len(splitted)-3])+"/")
    dirname = dirname_rel + "train_results/" + dirname

    # if saving -> create the necessary directories
    if wsaves:
        first=True
        if not os.path.exists(dirname):
            os.makedirs(dirname)
            first = False

        # copy also the original files into the folder where the training results are stored

        files = ['pposgd_simple.py','mlp_policy.py','run_mujoco.py']
        first = True
        for i in range(len(files)):
            src = os.path.join(dirname_rel,'baselines/baselines/ppo1/') + files[i]
            print (src)
            #dest = os.path.join('/home/nfunk/results_NEW/ppo1/') + dirname
            dest = dirname + "src_code/"
            if (first):
                os.makedirs(dest)
                first = False
            print (dest)
            shutil.copy2(src,dest)
        # brute-force copy the environment file at the end of the copying process:
        src = os.path.join(dirname_rel, 'nfunk/envs_nf/pendulum_nf.py')
        shutil.copy2(src, dest)
        os.makedirs(dest+"assets/")
        src = os.path.join(dirname_rel,'nfunk/envs_nf/assets/clockwise.png')
        shutil.copy2(src,dest+"assets/")
    ###


    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    max_action = env.action_space.high

    # add the dimension in the observation space!
    ob_space.shape =((ob_space.shape[0] + ac_space.shape[0]),)
    print (ob_space.shape)
    print (ac_space.shape)

    pi = policy_func("pi", ob_space, ac_space) # Construct network for new policy
    oldpi = policy_func("oldpi", ob_space, ac_space) # Network for old policy
    atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function 
    ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return
    pol_ov_op_ent = tf.placeholder(dtype=tf.float32, shape=None) # Entropy coefficient for policy over options


    lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult # Annealed clipping parameter epsilon for PPO


    # set up placeholders for observation, option and termination advantage
    ob = U.get_placeholder_cached(name="ob")
    option = U.get_placeholder_cached(name="option")
    term_adv = U.get_placeholder(name='term_adv', dtype=tf.float32, shape=[None])

    # create variable for action
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    pol_entpen = (-entcoeff) * meanent

    # probability ratio of choosing the action under the new vs. the old policy (PPO)
    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) 
    # advantage of choosing the action
    atarg_clip = atarg
    # surrogate 1:
    surr1 = ratio * atarg_clip #atarg # surrogate from conservative policy iteration
    # surrogate 2:
    surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg_clip 
    # PPO's pessimistic surrogate (L^CLIP)
    pol_surr = - U.mean(tf.minimum(surr1, surr2)) 

    # Loss on the Q-function
    vf_loss = U.mean(tf.square(pi.vpred - ret))
    # calculate the total loss
    total_loss = vf_loss
    intra_op = pol_surr 
    
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    # log-probability of the policy over options (current parameters)
    log_pi = tf.log(tf.clip_by_value(pi.op_pi, 1e-5, 1.0))
    # log-probability of the policy over options (old parameters)
    old_log_pi = tf.log(tf.clip_by_value(oldpi.op_pi, 1e-5, 1.0))
    # calculate entropy of policy over options
    entropy = -tf.reduce_sum(pi.op_pi * log_pi, reduction_indices=1)

    # calculate the ppo update for the policy over options:
    ratio_pol_ov_op = tf.exp(tf.transpose(log_pi)[option[0]] - tf.transpose(old_log_pi)[option[0]]) # pnew / pold
    term_adv_clip = term_adv 
    surr1_pol_ov_op = ratio_pol_ov_op * term_adv_clip # surrogate from conservative policy iteration
    surr2_pol_ov_op = U.clip(ratio_pol_ov_op, 1.0 - clip_param, 1.0 + clip_param) * term_adv_clip #
    pol_surr_pol_ov_op = - U.mean(tf.minimum(surr1_pol_ov_op, surr2_pol_ov_op)) # PPO's pessimistic surrogate (L^CLIP)
    
    op_loss = pol_surr_pol_ov_op - pol_ov_op_ent*tf.reduce_sum(entropy)

    # add loss of policy over options to total loss
    #total_loss += op_loss
    total_loss1 = total_loss + intra_op
    total_loss2 = total_loss + op_loss
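    # total_loss1 trains the value function together with the intra-option
    # (action) policy surrogate; total_loss2 trains it together with the
    # policy-over-options surrogate. The two losses are optimized in
    # alternation (see alternating_frequency below), which appears to be the
    # "SEPARATE optimization" split named in version_name.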

    var_list = pi.get_trainable_variables()
    term_list = var_list[6:8]

    # define function that we will later do gradient descent on
    lossandgrad1 = U.function([ob, ac, atarg, ret, lrmult,option, term_adv,pol_ov_op_ent], losses + [U.flatgrad(total_loss1, var_list)])
    lossandgrad2 = U.function([ob, ac, atarg, ret, lrmult,option, term_adv,pol_ov_op_ent], losses + [U.flatgrad(total_loss2, var_list)])
    
    # define adam optimizer
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    # define function that will assign the current parameters to the old policy
    assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv)
        for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult, option], losses)


    U.initialize()
    adam.sync()


    # Everything needed for training is now defined; execution starts here:

    # initialize "savers" which will store the results
    saver = tf.train.Saver(max_to_keep=10000)
    saver_best = tf.train.Saver(max_to_keep=1)


    ### Define the names of the .csv files that are going to be stored
    results=[]
    if saves:
        results = open(dirname + version_name + '_' + gamename +'_'+str(num_options)+'opts_'+'_results.csv','w')
        results_best_model = open(dirname + version_name + '_' + gamename +'_'+str(num_options)+'opts_'+'_bestmodel.csv','w')


        out = 'epoch,avg_reward'

        for opt in range(num_options): out += ',option {} dur'.format(opt)
        for opt in range(num_options): out += ',option {} std'.format(opt)
        for opt in range(num_options): out += ',option {} term'.format(opt)
        for opt in range(num_options): out += ',option {} adv'.format(opt)
        out+='\n'
        results.write(out)
        results.flush()

    # special case: if an epoch argument is given, load the corresponding model
    if epoch >= 0:
        
        dirname = '{}_{}opts_saves/'.format(gamename,num_options)
        print("Loading weights from iteration: " + str(epoch))

        filename = dirname + '{}_epoch_{}.ckpt'.format(gamename,epoch)
        saver.restore(U.get_session(),filename)
    ###    


    # start training
    episodes_so_far = 0
    timesteps_so_far = 0
    global iters_so_far
    iters_so_far = 0
    alternating_frequency = 25 # defines after how many epochs we switch optimizing between control and communication
    des_pol_op_ent = 0.1    # define policy over options entropy scheduling
    max_val = -100000       # define max_val, this will be updated to always store the best model
    tstart = time.time()
    lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards

    assert sum([max_iters>0, max_timesteps>0, max_episodes>0, max_seconds>0])==1, "Only one time constraint permitted"

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True, num_options=num_options,saves=saves,results=results,rewbuffer=rewbuffer,dc=dc)

    datas = [0 for _ in range(num_options)]

    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult =  max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************"%iters_so_far)

        # Sample (s,a)-Transitions
        seg = seg_gen.__next__()
        # Calculate A(s,a,o) using GAE
        add_vtarg_and_adv(seg, gamma, lam)


        # calculate information for logging
        opt_d = []
        for i in range(num_options):
            dur = np.mean(seg['opt_dur'][i]) if len(seg['opt_dur'][i]) > 0 else 0.
            opt_d.append(dur)

        std = []
        for i in range(num_options):
            logstd = np.mean(seg['logstds'][i]) if len(seg['logstds'][i]) > 0 else 0.
            std.append(np.exp(logstd))
        print("mean opt dur:", opt_d)             
        print("mean op pol:", np.mean(np.array(seg['optpol_p']),axis=0))         
        print("mean term p:", np.mean(np.array(seg['term_p']),axis=0))
        print("mean value val:", np.mean(np.array(seg['value_val']),axis=0))
       

        ob, ac, opts, atarg, tdlamret = seg["ob"], seg["ac"], seg["opts"], seg["adv"], seg["tdlamret"]
        vpredbefore = seg["vpred"] # predicted value function before update
        atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate

        if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy
        if hasattr(pi, "ob_rms_only"): pi.ob_rms_only.update(ob[:,:-ac_space.shape[0]]) # update running mean/std for policy
        assign_old_eq_new() # set old parameter values to new parameter values

        # every 1000 iterations, decay the entropy scheduling coefficient
        #if ((iters_so_far+1)%1000 and (iters_so_far+1)>=2000) == 0:
        if ((iters_so_far+1)%1000) == 0:
            des_pol_op_ent = des_pol_op_ent/10

        # every 50 iterations, save a checkpoint of the current model
        if iters_so_far % 50 == 0 and wsaves:
            print("weights are saved...")
            filename = dirname + '{}_epoch_{}.ckpt'.format(gamename,iters_so_far)
            save_path = saver.save(U.get_session(),filename)

        # adaptively save the best model: if the current mean reward is the highest so far, save it
        if (np.mean(rewbuffer)>max_val) and wsaves:
            max_val = np.mean(rewbuffer)
            results_best_model.write('epoch: '+str(iters_so_far) + 'rew: ' + str(np.mean(rewbuffer)) + '\n')
            results_best_model.flush()
            filename = dirname + 'best.ckpt'
            save_path = saver_best.save(U.get_session(),filename)



        # minimum batch size:
        min_batch=160 
        t_advs = [[] for _ in range(num_options)]
        
        # select all the samples concerning one of the options
        # Note: the update first uses all samples from option 0, then all samples from option 1
        for opt in range(num_options):
            indices = np.where(opts==opt)[0]
            print("batch size:",indices.size)
            opt_d[opt] = indices.size
            if not indices.size:
                t_advs[opt].append(0.)
                continue


            ### This part is only necessary when we use options. These checks ensure that no collected trajectories are discarded.
            if datas[opt] != 0:
                if (indices.size < min_batch and datas[opt].n > min_batch):
                    datas[opt] = Dataset(dict(ob=ob[indices], ac=ac[indices], atarg=atarg[indices], vtarg=tdlamret[indices]), shuffle=not pi.recurrent)
                    t_advs[opt].append(0.)
                    continue

                elif indices.size + datas[opt].n < min_batch:
                    # pdb.set_trace()
                    oldmap = datas[opt].data_map

                    cat_ob = np.concatenate((oldmap['ob'],ob[indices]))
                    cat_ac = np.concatenate((oldmap['ac'],ac[indices]))
                    cat_atarg = np.concatenate((oldmap['atarg'],atarg[indices]))
                    cat_vtarg = np.concatenate((oldmap['vtarg'],tdlamret[indices]))
                    datas[opt] = Dataset(dict(ob=cat_ob, ac=cat_ac, atarg=cat_atarg, vtarg=cat_vtarg), shuffle=not pi.recurrent)
                    t_advs[opt].append(0.)
                    continue

                elif (indices.size + datas[opt].n > min_batch and datas[opt].n < min_batch) or (indices.size > min_batch and datas[opt].n < min_batch):

                    oldmap = datas[opt].data_map
                    cat_ob = np.concatenate((oldmap['ob'],ob[indices]))
                    cat_ac = np.concatenate((oldmap['ac'],ac[indices]))
                    cat_atarg = np.concatenate((oldmap['atarg'],atarg[indices]))
                    cat_vtarg = np.concatenate((oldmap['vtarg'],tdlamret[indices]))
                    datas[opt] = d = Dataset(dict(ob=cat_ob, ac=cat_ac, atarg=cat_atarg, vtarg=cat_vtarg), shuffle=not pi.recurrent)

                if (indices.size > min_batch and datas[opt].n > min_batch):
                    datas[opt] = d = Dataset(dict(ob=ob[indices], ac=ac[indices], atarg=atarg[indices], vtarg=tdlamret[indices]), shuffle=not pi.recurrent)

            elif datas[opt] == 0:
                datas[opt] = d = Dataset(dict(ob=ob[indices], ac=ac[indices], atarg=atarg[indices], vtarg=tdlamret[indices]), shuffle=not pi.recurrent)
            ###


            # define the batchsize of the optimizer:
            optim_batchsize = optim_batchsize or ob.shape[0]
            print("optim epochs:", optim_epochs)
            logger.log("Optimizing...")


            # Here we do a bunch of optimization epochs over the data
            for _ in range(optim_epochs):
                losses = [] # list of tuples, each of which gives the loss for a minibatch
                for batch in d.iterate_once(optim_batchsize):

                    # Calculate advantage for using specific option here
                    tadv,nodc_adv = pi.get_opt_adv(batch["ob"],[opt])
                    tadv = tadv if num_options > 1 else np.zeros_like(tadv)
                    t_advs[opt].append(nodc_adv)

                    # calculate the gradient
                    #VAR 1:
                    #if ((iters_so_far+1)>=2000):
                    #    *newlosses, grads = lossandgrad2(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult, [opt], tadv,des_pol_op_ent)
                    #else:
                    #    *newlosses, grads = lossandgrad1(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult, [opt], tadv,des_pol_op_ent)
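                    # Alternation schedule: when int(iters_so_far /
                    # alternating_frequency) is odd, optimize total_loss2
                    # (value + policy over options); otherwise optimize
                    # total_loss1 (value + intra-option policy).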

                    if (int((iters_so_far)/alternating_frequency)%2==1):
                        *newlosses, grads = lossandgrad2(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult, [opt], tadv,des_pol_op_ent)
                    else:
                        #print ("optim comm always")
                        *newlosses, grads = lossandgrad1(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult, [opt], tadv,des_pol_op_ent)

                    # perform gradient update
                    adam.update(grads, optim_stepsize * cur_lrmult) 
                    losses.append(newlosses)


        # do logging:
        lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if MPI.COMM_WORLD.Get_rank()==0:
            logger.dump_tabular()

        ### Book keeping
        if saves:
            out = "{},{}"
            for _ in range(num_options): out+=",{},{},{},{}"
            out+="\n"
            

            info = [iters_so_far, np.mean(rewbuffer)]
            for i in range(num_options): info.append(opt_d[i])
            for i in range(num_options): info.append(std[i])
            for i in range(num_options): info.append(np.mean(np.array(seg['term_p']),axis=0)[i])
            for i in range(num_options): 
                info.append(np.mean(t_advs[i]))

            results.write(out.format(*info))
            results.flush()
Exemple #12
0
def learn(
        env,
        policy_func,
        timesteps_per_batch,  # timesteps per actor per update
        clip_param,
        entcoeff,  # clipping parameter epsilon, entropy coeff
        optim_epochs,
        optim_stepsize,
        optim_batchsize,  # optimization hypers
        gamma,
        lam,  # advantage estimation
        max_timesteps=0,
        max_episodes=0,
        max_iters=0,
        max_seconds=0,  # time constraint
        callback=None,  # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
        load_model=None,
        action_bias=0.4,
        action_repeat=0,
        action_repeat_rand=False,
        warmup_frames=0,
        target_kl=0.01,
        vf_loss_mult=1,
        vfloss_optim_stepsize=0.003,
        vfloss_optim_batchsize=8,
        vfloss_optim_epochs=10):
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space,
                     ac_space)  # Construct network for new policy
    oldpi = policy_func("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    # Not sure why they anneal clip and learning rate with the same parameter
    #clip_param = clip_param * lrmult # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg  #
    pol_surr = -U.mean(tf.minimum(
        surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = U.mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen
    losses = [pol_surr, pol_entpen, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)
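    # In this variant the value function is trained separately: vf_loss is left
    # out of total_loss above and gets its own flat gradient, its own MpiAdam
    # instance and its own step size / batch size / epoch count
    # (vfloss_optim_*), used in a second optimization loop further below.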

    lossandgrad_vfloss = U.function([ob, ac, atarg, ret], [vf_loss] +
                                    [U.flatgrad(vf_loss, var_list)])
    adam_vfloss = MpiAdam(var_list, epsilon=adam_epsilon)
    compute_vfloss = U.function([ob, ac, atarg, ret], [vf_loss])

    U.initialize()
    adam.sync()
    adam_vfloss.sync()

    if load_model:
        logger.log('Loading model: %s' % load_model)
        pi.load(load_model)

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_batch,
                                     stochastic=True,
                                     action_bias=action_bias,
                                     action_repeat=action_repeat,
                                     action_repeat_rand=action_repeat_rand,
                                     warmup_frames=warmup_frames)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0,
         max_seconds > 0]) == 1, "Only one time constraint permitted"

    ep_rew_file = None
    if MPI.COMM_WORLD.Get_rank() == 0:
        import wandb
        ep_rew_file = open(
            os.path.join(wandb.run.dir, 'episode_rewards.jsonl'), 'w')
        checkpoint_dir = 'checkpoints-%s' % wandb.run.id
        os.mkdir(checkpoint_dir)

    cur_lrmult = 1.0
    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        elif schedule == 'target_kl':
            pass
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)

        seg = seg_gen.next()
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
            "tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate
        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                    shuffle=not pi.recurrent)
        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        assign_old_eq_new()  # set old parameter values to new parameter values
        logger.log("Optimizing...")
        logger.log(fmt_row(13, loss_names))
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = [
            ]  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                result = lossandgrad(batch["ob"], batch["ac"], batch["atarg"],
                                     batch["vtarg"], cur_lrmult)
                newlosses = result[:-1]
                g = result[-1]
                adam.update(g, optim_stepsize * cur_lrmult)
                losses.append(newlosses)
            logger.log(fmt_row(13, np.mean(losses, axis=0)))

        # vfloss optimize
        logger.log("Optimizing value function...")
        logger.log(fmt_row(13, ['vf']))
        for _ in range(vfloss_optim_epochs):
            losses = [
            ]  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(vfloss_optim_batchsize):
                result = lossandgrad_vfloss(batch["ob"], batch["ac"],
                                            batch["atarg"], batch["vtarg"])
                newlosses = result[:-1]
                g = result[-1]
                adam_vfloss.update(g, vfloss_optim_stepsize)
                losses.append(newlosses)
            logger.log(fmt_row(13, np.mean(losses, axis=0)))

        logger.log("Evaluating losses...")
        losses = []
        for batch in d.iterate_once(optim_batchsize):
            newlosses = compute_losses(batch["ob"], batch["ac"],
                                       batch["atarg"], batch["vtarg"],
                                       cur_lrmult)
            newlosses += compute_vfloss(batch["ob"], batch["ac"],
                                        batch["atarg"], batch["vtarg"])
            losses.append(newlosses)
        meanlosses, _, _ = mpi_moments(losses, axis=0)
        logger.log(fmt_row(13, meanlosses))
        for (lossval, name) in zipsame(meanlosses, loss_names + ['vf']):
            logger.record_tabular("loss_" + name, lossval)
        # check kl
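        # target_kl schedule: if the measured mean KL (meanlosses[2], "kl")
        # exceeds 1.1 * target_kl, shrink the learning-rate multiplier by 1.5x;
        # if it falls below target_kl / 1.1, grow it by 1.5x.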
        if schedule == 'target_kl':
            if meanlosses[2] > target_kl * 1.1:
                cur_lrmult /= 1.5
            elif meanlosses[2] < target_kl / 1.1:
                cur_lrmult *= 1.5
        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))
        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        if rewbuffer:
            logger.record_tabular('CurLrMult', cur_lrmult)
            logger.record_tabular('StepSize', optim_stepsize * cur_lrmult)
            logger.record_tabular("EpLenMean", np.mean(lenbuffer))
            logger.record_tabular("EpRewMax", np.max(rewbuffer))
            logger.record_tabular("EpRewMean", np.mean(rewbuffer))
            logger.record_tabular("EpRewMin", np.min(rewbuffer))
            logger.record_tabular("EpThisIter", len(lens))
            episodes_so_far += len(lens)
            timesteps_so_far += sum(lens)
            iters_so_far += 1
            logger.record_tabular("EpisodesSoFar", episodes_so_far)
            logger.record_tabular("TimestepsSoFar", timesteps_so_far)
            time_elapsed = time.time() - tstart
            logger.record_tabular("TimeElapsed", time_elapsed)
            if MPI.COMM_WORLD.Get_rank() == 0:
                import wandb
                ep_rew_file.write('%s\n' % json.dumps({
                    'TimeElapsed': time_elapsed,
                    'Rewards': rews
                }))
                ep_rew_file.flush()
                data = logger.Logger.CURRENT.name2val
                wandb.run.history.add(data)
                summary_data = {}
                for k, v in data.iteritems():
                    if 'Rew' in k:
                        summary_data[k] = v
                wandb.run.summary.update(summary_data)
                pi.save(
                    os.path.join(checkpoint_dir,
                                 'model-%s.ckpt' % (iters_so_far - 1)))

                logger.dump_tabular()
        else:
            logger.log('No episodes complete yet')
def learn(
        env,
        policy_func,
        *,
        timesteps_per_batch,  # timesteps per actor per update
        log_every=None,
        log_dir=None,
        episodes_so_far=0,
        timesteps_so_far=0,
        iters_so_far=0,
        clip_param,
        entcoeff,  # clipping parameter epsilon, entropy coeff
        optim_epochs,
        optim_stepsize,
        optim_batchsize,  # optimization hypers
        gamma,
        lam,  # advantage estimation
        max_timesteps=0,
        max_episodes=0,
        max_iters=0,
        max_seconds=0,  # time constraint
        callback=None,  # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
        **kwargs):
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space,
                     ac_space)  # Construct network for new policy
    oldpi = policy_func("oldpi", ob_space, ac_space)  # Network for old policy
    # Target advantage function (if applicable)
    atarg = tf.placeholder(dtype=tf.float32, shape=[None])
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    # learning rate multiplier, updated with schedule
    lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[])
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg
    pol_surr = -U.mean(tf.minimum(
        surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = U.mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()
    adam.sync()

    # Prepare for rollouts
    # ----------------------------------------
    # GRASPING
    saver = tf.train.Saver(var_list=U.ALREADY_INITIALIZED, max_to_keep=1)
    checkpoint = tf.train.latest_checkpoint(log_dir)
    if checkpoint:
        print("Restoring checkpoint: {}".format(checkpoint))
        saver.restore(U.get_session(), checkpoint)
    if hasattr(env, "set_actor"):

        def actor(obs):
            return pi.act(False, obs)[0]

        env.set_actor(actor)
    if not checkpoint and hasattr(env, "warm_init_eps"):
        pretrain(pi, env)
        saver.save(U.get_session(), osp.join(log_dir, "model"))
    # /GRASPING
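    # The block above restores the latest checkpoint if one exists, exposes the
    # deterministic policy to the environment via set_actor, and, when starting
    # from scratch on an environment that provides warm_init_eps, runs a
    # pretraining phase before saving an initial model.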
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_batch,
                                     stochastic=True)

    tstart = time.time()

    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0,
         max_seconds > 0]) == 1, "Only one time constraint permitted"

    while True:
        if callback:
            callback(locals(), globals())
        should_break = False
        if max_timesteps and timesteps_so_far >= max_timesteps:
            should_break = True
        elif max_episodes and episodes_so_far >= max_episodes:
            should_break = True
        elif max_iters and iters_so_far >= max_iters:
            should_break = True
        elif max_seconds and time.time() - tstart >= max_seconds:
            should_break = True

        if log_every and log_dir:
            if (iters_so_far + 1) % log_every == 0 or should_break:
                # To reduce space, don't specify global step.
                saver.save(U.get_session(), osp.join(log_dir, "model"))

            job_info = {
                'episodes_so_far': episodes_so_far,
                'iters_so_far': iters_so_far,
                'timesteps_so_far': timesteps_so_far
            }
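            # write to a temporary file, flush/fsync, then rename, so readers
            # never observe a half-written job_info.yaml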
            with open(osp.join(log_dir, "job_info_new.yaml"), 'w') as file:
                yaml.dump(job_info, file, default_flow_style=False)
                # Make sure write is instantaneous.
                file.flush()
                os.fsync(file)
            os.rename(osp.join(log_dir, "job_info_new.yaml"),
                      osp.join(log_dir, "job_info.yaml"))

        if should_break:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)

        seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
            "tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()) / (
            atarg.std() + 1e-10)  # standardized advantage function estimate
        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                    shuffle=not pi.recurrent)
        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        assign_old_eq_new()  # set old parameter values to new parameter values
        # logger.log("Optimizing...")
        # logger.log(fmt_row(13, loss_names))
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = [
            ]  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                *newlosses, g = lossandgrad(batch["ob"], batch["ac"],
                                            batch["atarg"], batch["vtarg"],
                                            cur_lrmult)
                adam.update(g, optim_stepsize * cur_lrmult)
                losses.append(newlosses)
            # logger.log(fmt_row(13, np.mean(losses, axis=0)))

        logger.log("Evaluating losses...")
        losses = []
        for batch in d.iterate_once(optim_batchsize):
            newlosses = compute_losses(batch["ob"], batch["ac"],
                                       batch["atarg"], batch["vtarg"],
                                       cur_lrmult)
            losses.append(newlosses)
        meanlosses, _, _ = mpi_moments(losses, axis=0)
        logger.log(fmt_row(13, meanlosses))
        for (lossval, name) in zipsame(meanlosses, loss_names):
            logger.record_tabular("loss_" + name, lossval)
        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))
        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        logger.record_tabular("EpLenMean", np.mean(lens))
        logger.record_tabular("EpRewMean", np.mean(rews))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()
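
The loop above optimizes and logs pol_surr, the negated clipped surrogate. As a quick reference, here is a minimal NumPy sketch of the same L^CLIP computation for one minibatch; the function name and the toy arrays are illustrative and not part of the snippet above.

import numpy as np

def clipped_surrogate(logp_new, logp_old, adv, clip_param=0.2):
    # NumPy analogue of the surr1/surr2/pol_surr terms built with TF ops
    # in these examples (this returns the objective to maximize; the TF
    # code negates it to obtain a loss).
    ratio = np.exp(logp_new - logp_old)  # pi_new / pi_old
    surr1 = ratio * adv
    surr2 = np.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * adv
    return np.mean(np.minimum(surr1, surr2))

# toy check with random log-probabilities and advantages
rng = np.random.default_rng(0)
logp_old = rng.normal(size=64)
logp_new = logp_old + 0.05 * rng.normal(size=64)
adv = rng.normal(size=64)
print(-clipped_surrogate(logp_new, logp_old, adv))  # pol_surr analogue
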
Exemple #14
0
lrmult = tf.placeholder(
    name='lrmult', dtype=tf.float32,
    shape=[])  # learning rate multiplier, updated with schedule
clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

ob = U.get_placeholder_cached(name="ob")
ac = pi.pdtype.sample_placeholder([None])

kloldnew = oldpi.pd.kl(pi.pd)
ent = pi.pd.entropy()
meankl = U.mean(kloldnew)
meanent = U.mean(ent)
pol_entpen = (-entcoeff) * meanent

ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
surr1 = ratio * atarg  # surrogate from conservative policy iteration
surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg  #
pol_surr = -U.mean(tf.minimum(surr1,
                              surr2))  # PPO's pessimistic surrogate (L^CLIP)
vf_loss = U.mean(tf.square(pi.vpred - ret))
total_loss = pol_surr + pol_entpen + vf_loss

losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

var_list = pi.get_trainable_variables()
lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                         losses + [U.flatgrad(total_loss, var_list)])
adam = MpiAdam(var_list, epsilon=adam_epsilon)

assign_old_eq_new = U.function(
    [], [],
    updates=[
        tf.assign(oldv, newv)
        for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())
    ])
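
Every training loop in these examples calls add_vtarg_and_adv(seg, gamma, lam) before building the Dataset, but the function itself is not reproduced here. The following is a hedged sketch of the GAE(lambda) computation it performs in baselines' pposgd_simple; the seg field names ("new", "rew", "vpred", "nextvpred") follow the trajectory generator used by these loops and may differ in a modified fork.

import numpy as np

def add_vtarg_and_adv(seg, gamma, lam):
    # GAE(lambda): fills seg["adv"] with advantage estimates and
    # seg["tdlamret"] with the corresponding value-function targets.
    new = np.append(seg["new"], 0)  # episode-start flags; last element unused
    vpred = np.append(seg["vpred"], seg["nextvpred"])
    T = len(seg["rew"])
    adv = np.zeros(T, dtype=np.float32)
    lastgaelam = 0.0
    for t in reversed(range(T)):
        nonterminal = 1 - new[t + 1]
        delta = seg["rew"][t] + gamma * vpred[t + 1] * nonterminal - vpred[t]
        adv[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
    seg["adv"] = adv
    seg["tdlamret"] = adv + seg["vpred"]
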
Exemple #15
0
def learn(
        env,
        policy_func,
        *,
        timesteps_per_actorbatch,  # timesteps per actor per update
        clip_param,
        entcoeff,  # clipping parameter epsilon, entropy coeff
        optim_epochs,
        optim_stepsize,
        optim_batchsize,  # optimization hypers
        gamma,
        lam,  # advantage estimation
        max_timesteps=0,
        max_episodes=0,
        max_iters=0,
        max_seconds=0,  # time constraint
        callback=None,  # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
        seed=1
):

    # We want to log:
    num_options = 1  # Hacky solution -> lets us reuse the same logging!
    epoch = -1
    saves = True
    wsaves = True

    ### Book-keeping
    gamename = env.spec.id[:-3].lower()
    gamename += 'seed' + str(seed)
    #gamename += app
    version_name = 'officialPPO'

    dirname = '{}_{}_{}opts_saves/'.format(version_name, gamename, num_options)

    # Retrieve everything via relative paths; create a train_results folder where the repo has been cloned.
    dirname_rel = os.path.dirname(__file__)
    splitted = dirname_rel.split("/")
    dirname_rel = ("/".join(dirname_rel.split("/")[:len(splitted) - 3]) + "/")
    dirname = dirname_rel + "train_results/" + dirname

    # Specify the paths where results shall be written to:
    src_code_path = dirname_rel + 'baselines/baselines/ppo1/'
    results_path = dirname_rel
    envs_path = dirname_rel + 'nfunk/envs_nf/'

    print(dirname)
    #input ("wait here after dirname")

    if wsaves:
        first = True
        if not os.path.exists(dirname):
            os.makedirs(dirname)
            first = False
        # while os.path.exists(dirname) and first:
        #     dirname += '0'

        files = ['pposgd_simple.py', 'mlp_policy.py', 'run_mujoco.py']
        first = True
        for i in range(len(files)):
            src = os.path.join(src_code_path, files[i])
            print(src)
            #dest = os.path.join('/home/nfunk/results_NEW/ppo1/') + dirname
            dest = dirname + "src_code/"
            if (first):
                os.makedirs(dest)
                first = False
            print(dest)
            shutil.copy2(src, dest)
        # brute force copy normal env file at end of copying process:
        env_files = ['pendulum_nf.py']
        for i in range(len(env_files)):
            src = os.path.join(envs_path, env_files[i])
            shutil.copy2(src, dest)
        os.makedirs(dest + "assets/")
        src = os.path.join(envs_path + "assets/clockwise.png")
        shutil.copy2(src, dest + "assets/")

    np.random.seed(seed)
    tf.set_random_seed(seed)
    env.seed(seed)

    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space,
                     ac_space)  # Construct network for new policy
    oldpi = policy_func("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg  #
    pol_surr = -U.mean(tf.minimum(
        surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = U.mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()
    adam.sync()

    saver = tf.train.Saver(max_to_keep=10000)
    saver_best = tf.train.Saver(max_to_keep=1)

    ### More book-keeping
    results = []
    if saves:
        results = open(
            dirname + version_name + '_' + gamename + '_' + str(num_options) +
            'opts_' + '_results.csv', 'w')
        results_best_model = open(
            dirname + version_name + '_' + gamename + '_' + str(num_options) +
            'opts_' + '_bestmodel.csv', 'w')

        out = 'epoch,avg_reward'

        for opt in range(num_options):
            out += ',option {} dur'.format(opt)
        for opt in range(num_options):
            out += ',option {} std'.format(opt)
        for opt in range(num_options):
            out += ',option {} term'.format(opt)
        for opt in range(num_options):
            out += ',option {} adv'.format(opt)
        out += '\n'
        results.write(out)
        # results.write('epoch,avg_reward,option 1 dur, option 2 dur, option 1 term, option 2 term\n')
        results.flush()

    if epoch >= 0:

        print("Loading weights from iteration: " + str(epoch))

        filename = dirname + '{}_epoch_{}.ckpt'.format(gamename, epoch)
        saver.restore(U.get_session(), filename)

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_actorbatch,
                                     stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    max_val = -100000
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0,
         max_seconds > 0]) == 1, "Only one time constraint permitted"

    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)

        seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
            "tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate
        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                    shuffle=not pi.recurrent)
        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        assign_old_eq_new()  # set old parameter values to new parameter values

        if iters_so_far % 50 == 0 and wsaves:
            print("weights are saved...")
            filename = dirname + '{}_epoch_{}.ckpt'.format(
                gamename, iters_so_far)
            save_path = saver.save(U.get_session(), filename)

        if rewbuffer and (np.mean(rewbuffer) > max_val) and wsaves:
            max_val = np.mean(rewbuffer)
            results_best_model.write('epoch: ' + str(iters_so_far) + ', rew: ' +
                                     str(np.mean(rewbuffer)) + '\n')
            results_best_model.flush()
            filename = dirname + 'best.ckpt'
            save_path = saver_best.save(U.get_session(), filename)

        logger.log("Optimizing...")
        logger.log(fmt_row(13, loss_names))
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = [
            ]  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                *newlosses, g = lossandgrad(batch["ob"], batch["ac"],
                                            batch["atarg"], batch["vtarg"],
                                            cur_lrmult)
                adam.update(g, optim_stepsize * cur_lrmult)
                losses.append(newlosses)
            logger.log(fmt_row(13, np.mean(losses, axis=0)))

        logger.log("Evaluating losses...")
        losses = []
        for batch in d.iterate_once(optim_batchsize):
            newlosses = compute_losses(batch["ob"], batch["ac"],
                                       batch["atarg"], batch["vtarg"],
                                       cur_lrmult)
            losses.append(newlosses)
        meanlosses, _, _ = mpi_moments(losses, axis=0)
        logger.log(fmt_row(13, meanlosses))
        for (lossval, name) in zipsame(meanlosses, loss_names):
            logger.record_tabular("loss_" + name, lossval)
        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))
        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()

        ### Book keeping
        if saves:
            out = "{},{}"
            #for _ in range(num_options): #out+=",{},{},{},{}"
            out += "\n"

            info = [iters_so_far, np.mean(rewbuffer)]

            results.write(out.format(*info))
            results.flush()
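
Both the clipping range and the Adam step size in this example are multiplied by cur_lrmult, which the 'linear' schedule anneals from 1 to 0 over max_timesteps. A small self-contained sketch of that multiplier (the helper name is illustrative, not part of the snippet):

def lr_multiplier(schedule, timesteps_so_far, max_timesteps):
    # Mirrors the schedule handling inside the training loops above; the
    # result scales both clip_param and the Adam step size.
    if schedule == 'constant':
        return 1.0
    if schedule == 'linear':
        return max(1.0 - float(timesteps_so_far) / max_timesteps, 0.0)
    raise NotImplementedError(schedule)

# halfway through training the effective step size is halved:
assert lr_multiplier('linear', 500_000, 1_000_000) == 0.5
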
Exemple #16
0
def learn(
        env,
        policy_func,
        *,
        timesteps_per_batch,  # timesteps per actor per update
        clip_param,
        entcoeff,  # clipping parameter epsilon, entropy coeff
        optim_epochs,
        optim_stepsize,
        optim_batchsize,  # optimization hypers
        gamma,
        lam,  # advantage estimation
        max_timesteps=0,
        max_episodes=0,
        max_iters=0,
        max_seconds=0,  # time constraint
        callback=None,  # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
        num_options=1,
        app='',
        saves=False,
        wsaves=False,
        epoch=-1,
        seed=1,
        dc=0,
        render=False,
        caption='',
        deoc=False,
        tradeoff=0.1,
        term_mult=1.0,
        lr_mult=1.0,
        tdeoc=False):

    optim_batchsize_ideal = optim_batchsize
    np.random.seed(seed)
    tf.set_random_seed(seed)
    env.seed(seed)

    ### Book-keeping
    if hasattr(env, 'NAME'):
        gamename = env.NAME.lower()
    else:
        gamename = env.spec.id[:-3].lower()

    gamename += '-seed' + str(seed)
    gamename += app

    dirname = '{}_{}opts_saves/'.format(gamename, num_options)

    ### More book-keeping
    results = []
    if tdeoc:
        results_name = caption + 'TDEOC_' + gamename + '_tradeoff' + str(
            tradeoff) + '_dc' + str(dc)
    elif deoc:
        results_name = caption + 'DEOC_' + gamename + '_tradeoff' + str(
            tradeoff) + '_dc' + str(dc)
    else:
        results_name = caption + 'Vanilla_' + gamename + '_dc' + str(dc)

    if epoch >= 0:
        results_name_file = results_name + '_epoch' + str(epoch)
        results_name_file += '_term_mult' + str(term_mult) + '_lr_mult' + str(
            lr_mult) + '_' + str(num_options) + 'opts' + '_results.csv'

    results_name += '_term_mult' + str(term_mult) + '_lr_mult' + str(
        lr_mult) + '_' + str(num_options) + 'opts' + '_results.csv'

    if (epoch < 0 and os.path.exists(results_name)) or (
            epoch >= 0 and os.path.exists(results_name_file)):
        print("Run already saved")
        sys.exit()
    if saves:
        if epoch >= 0:
            print(results_name_file)
            results = open(results_name_file, 'w')
        else:
            print(results_name)
            results = open(results_name, 'w')

        out = 'epoch,avg_reward,avg_entropy,switches'

        for opt in range(num_options):
            out += ',option {} dur'.format(opt)
        for opt in range(num_options):
            out += ',option {} steps'.format(opt)
        for opt in range(num_options):
            out += ',option {} std'.format(opt)
        for opt in range(num_options):
            out += ',option {} term'.format(opt)
        for opt in range(num_options):
            out += ',option {} adv'.format(opt)
        out += '\n'
        results.write(out)
        # results.write('epoch,avg_reward,option 1 dur, option 2 dur, option 1 term, option 2 term\n')
        results.flush()

    if wsaves:
        first = True
        if not os.path.exists(results_name + '_weights'):
            os.makedirs(results_name + '_weights')
            first = False
        # while os.path.exists(dirname) and first:
        #     dirname += '0'

        # files = ['pposgd_simple.py','mlp_policy.py','run_mujoco.py']
        # for i in range(len(files)):
        #     src = os.path.expanduser('') + files[i]
        #     dest = os.path.expanduser('') + results_name+'_weights'
        #     shutil.copy2(src,dest)
    ###

    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space,
                     ac_space)  # Construct network for new policy
    oldpi = policy_func("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    atarg_ent = tf.placeholder(dtype=tf.float32, shape=[
        None
    ])  # Target advantage with pseudo reward function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return
    ret_ent = tf.placeholder(dtype=tf.float32,
                             shape=[None])  # Empirical return Entropy
    diversity = tf.placeholder(dtype=tf.float32, shape=[None])

    # option = tf.placeholder(dtype=tf.int32, shape=[None])

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    # pdb.set_trace()
    ob = U.get_placeholder_cached(name="ob")
    option = U.get_placeholder_cached(name="option")
    term_adv = U.get_placeholder(name='term_adv',
                                 dtype=tf.float32,
                                 shape=[None])

    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * (atarg_ent if deoc else atarg
                     )  # surrogate from conservative policy iteration
    surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * (
        atarg_ent if deoc else atarg)  #
    pol_surr = -U.mean(tf.minimum(
        surr1,
        surr2))  # PPO's pessimistic surrogate (L^CLIP)  (Intra Option update)

    vf_loss = U.mean(tf.square(pi.vpred - ret))  ## Critic (Option critic)

    if deoc:
        vf_loss_ent = U.mean(tf.square(pi.vpred_ent - ret_ent))
        total_loss = pol_surr + pol_entpen + vf_loss + vf_loss_ent
        losses = [pol_surr, pol_entpen, vf_loss, vf_loss_ent, meankl, meanent]
        loss_names = [
            "pol_surr", "pol_entpen", "vf_loss", "vf_ent_loss", "kl", "ent"
        ]
    else:
        total_loss = pol_surr + pol_entpen + vf_loss
        losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
        loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    if tdeoc:
        term_loss = -pi.tpred * diversity  ## Termination loss fn
    else:
        term_loss = pi.tpred * term_adv

    log_pi = tf.log(tf.clip_by_value(pi.op_pi, 1e-20, 1.0))
    entropy = -tf.reduce_sum(pi.op_pi * log_pi, reduction_indices=1)

    op_loss = -tf.reduce_sum(
        log_pi[0][option[0]] * atarg + entropy * 0.1)  #Policy over options

    total_loss += op_loss

    var_list = pi.get_trainable_variables()
    term_list = var_list[8:10]

    lossandgrad = U.function(
        [ob, ac, atarg, atarg_ent, ret, ret_ent, lrmult, option],
        losses + [U.flatgrad(total_loss, var_list)])
    termloss = U.function(([ob, option, term_adv, diversity]
                           if tdeoc else [ob, option, term_adv]),
                          [U.flatgrad(term_loss, var_list)
                           ])  # Since we will use a different step size.
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.compat.v1.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function(
        [ob, ac, atarg, atarg_ent, ret, ret_ent, lrmult, option], losses)

    U.initialize()
    adam.sync()

    saver = tf.compat.v1.train.Saver(max_to_keep=10000)
    dirname = results_name + '_weights/'

    if epoch >= 0:

        dirname = results_name + '_weights/'
        print("Loading weights from iteration: " + str(epoch))

        filename = dirname + '{}.ckpt'.format(epoch)
        saver.restore(U.get_session(), filename)
    ###

    episodes_so_far = 0
    timesteps_so_far = 0
    global iters_so_far
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards
    prewbuffer = deque(maxlen=100)  # rolling buffer for episode pseudo rewards

    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0,
         max_seconds > 0]) == 1, "Only one time constraint permitted"

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_batch,
                                     stochastic=True,
                                     num_options=num_options,
                                     saves=saves,
                                     results=results,
                                     rewbuffer=rewbuffer,
                                     dc=dc)

    datas = [0 for _ in range(num_options)]

    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)

        seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam, deoc=deoc, tradeoff=tradeoff)

        opt_d = [0 for _ in range(num_options)]
        opt_steps = [
        ]  # Mean number of steps taken by an option before termination
        for i in range(num_options):
            dur = np.mean(
                seg['opt_dur'][i]) if len(seg['opt_dur'][i]) > 0 else 0.
            opt_steps.append(dur)

        std = []
        for i in range(num_options):
            logstd = np.mean(
                seg['logstds'][i]) if len(seg['logstds'][i]) > 0 else 0.
            std.append(np.exp(logstd))
        print("mean opt dur:", opt_steps)
        print("mean op pol:", np.mean(np.array(seg['optpol_p']), axis=0))
        print("mean term p:", np.mean(np.array(seg['term_p']), axis=0))
        print("mean value val:", np.mean(np.array(seg['value_val']), axis=0))

        ob, ac, opts, atarg, atarg_ent, tdlamret, tdlamret_ent, diversity = seg[
            "ob"], seg["ac"], seg["opts"], seg["adv"], seg["adv_ent"], seg[
                "tdlamret"], seg["tdlamret_ent"], seg["prew"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate
        diversity = (diversity - diversity.mean()) / diversity.std()
        diversity = diversity * 1e1  # if env.spec.id[:-3].lower()[:9] == "miniworld" else diversity*1e1
        atarg_ent = (atarg_ent if not deoc else
                     (atarg_ent - atarg_ent.mean()) / atarg_ent.std())
        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy
        assign_old_eq_new()  # set old parameter values to new parameter values

        if iters_so_far % 5 == 0 and wsaves:
            print("weights are saved...")
            filename = dirname + '.ckpt'
            save_path = saver.save(U.get_session(), filename)
            if iters_so_far % 500 == 0:
                filename = dirname + '{}.ckpt'.format(iters_so_far)
                save_path = saver.save(U.get_session(), filename)

        min_batch = (160 if num_options < 3 else 200)  # Arbitrary
        t_advs = [[] for _ in range(num_options)]
        for opt in range(num_options):
            indices = np.where(opts == opt)[0]
            print("batch size:", indices.size)
            opt_d[opt] = indices.size
            if not indices.size:
                t_advs[opt].append(0.)
                continue

            ### This part is only necessary when using options; these checks ensure that no collected trajectories are discarded.
            if datas[opt] != 0:
                if (indices.size < min_batch and datas[opt].n > min_batch):
                    datas[opt] = Dataset(dict(ob=ob[indices],
                                              ac=ac[indices],
                                              atarg=atarg[indices],
                                              atarg_ent=atarg_ent[indices],
                                              vtarg=tdlamret[indices],
                                              vtarg_ent=tdlamret_ent[indices],
                                              diversity=diversity[indices]),
                                         shuffle=not pi.recurrent)
                    t_advs[opt].append(0.)
                    continue

                elif indices.size + datas[opt].n < min_batch:
                    # pdb.set_trace()
                    oldmap = datas[opt].data_map

                    cat_ob = np.concatenate((oldmap['ob'], ob[indices]))
                    cat_ac = np.concatenate((oldmap['ac'], ac[indices]))
                    cat_atarg = np.concatenate(
                        (oldmap['atarg'], atarg[indices]))
                    cat_atarg_ent = np.concatenate(
                        (oldmap['atarg_ent'], atarg_ent[indices]))
                    cat_diversity = np.concatenate(
                        (oldmap['diversity'], diversity[indices]))
                    cat_vtarg = np.concatenate(
                        (oldmap['vtarg'], tdlamret[indices]))
                    cat_vtarg_ent = np.concatenate(
                        (oldmap['vtarg_ent'], tdlamret_ent[indices]))
                    datas[opt] = Dataset(dict(ob=cat_ob,
                                              ac=cat_ac,
                                              atarg=cat_atarg,
                                              atarg_ent=cat_atarg_ent,
                                              vtarg=cat_vtarg,
                                              vtarg_ent=cat_vtarg_ent,
                                              diversity=cat_diversity),
                                         shuffle=not pi.recurrent)
                    t_advs[opt].append(0.)
                    continue

                elif (indices.size + datas[opt].n > min_batch and datas[opt].n
                      < min_batch) or (indices.size > min_batch
                                       and datas[opt].n < min_batch):

                    oldmap = datas[opt].data_map
                    cat_ob = np.concatenate((oldmap['ob'], ob[indices]))
                    cat_ac = np.concatenate((oldmap['ac'], ac[indices]))
                    cat_atarg = np.concatenate(
                        (oldmap['atarg'], atarg[indices]))
                    cat_atarg_ent = np.concatenate(
                        (oldmap['atarg_ent'], atarg_ent[indices]))
                    cat_diversity = np.concatenate(
                        (oldmap['diversity'], diversity[indices]))
                    cat_vtarg = np.concatenate(
                        (oldmap['vtarg'], tdlamret[indices]))
                    cat_vtarg_ent = np.concatenate(
                        (oldmap['vtarg_ent'], tdlamret_ent[indices]))
                    datas[opt] = d = Dataset(dict(ob=cat_ob,
                                                  ac=cat_ac,
                                                  atarg=cat_atarg,
                                                  atarg_ent=cat_atarg_ent,
                                                  vtarg=cat_vtarg,
                                                  vtarg_ent=cat_vtarg_ent,
                                                  diversity=cat_diversity),
                                             shuffle=not pi.recurrent)

                if (indices.size > min_batch and datas[opt].n > min_batch):
                    datas[opt] = d = Dataset(dict(
                        ob=ob[indices],
                        ac=ac[indices],
                        atarg=atarg[indices],
                        atarg_ent=atarg_ent[indices],
                        vtarg=tdlamret[indices],
                        vtarg_ent=tdlamret_ent[indices],
                        diversity=diversity[indices]),
                                             shuffle=not pi.recurrent)

            elif datas[opt] == 0:
                datas[opt] = d = Dataset(dict(ob=ob[indices],
                                              ac=ac[indices],
                                              atarg=atarg[indices],
                                              atarg_ent=atarg_ent[indices],
                                              vtarg=tdlamret[indices],
                                              vtarg_ent=tdlamret_ent[indices],
                                              diversity=diversity[indices]),
                                         shuffle=not pi.recurrent)
            ###

            optim_batchsize = optim_batchsize or ob.shape[0]
            # np.clip with equal bounds pins optim_epochs to 10 whenever more
            # than one option is used; int() replaces the deprecated np.int.
            optim_epochs = np.clip(
                int(10 * (indices.size /
                          (timesteps_per_batch / num_options))), 10,
                10) if num_options > 1 else optim_epochs
            print("optim epochs:", optim_epochs)
            logger.log("Optimizing...")

            # Here we do a bunch of optimization epochs over the data
            for _ in range(optim_epochs):
                losses = [
                ]  # list of tuples, each of which gives the loss for a minibatch
                for batch in d.iterate_once(optim_batchsize):

                    tadv, nodc_adv = pi.get_term_adv(batch["ob"], [opt])
                    tadv = tadv if num_options > 1 else np.zeros_like(tadv)
                    t_advs[opt].append(nodc_adv)

                    *newlosses, grads = lossandgrad(batch["ob"], batch["ac"],
                                                    batch["atarg"],
                                                    batch["atarg_ent"],
                                                    batch["vtarg"],
                                                    batch["vtarg_ent"],
                                                    cur_lrmult, [opt])
                    termg = termloss(
                        batch["ob"], [opt], tadv,
                        batch["diversity"]) if tdeoc else termloss(
                            batch["ob"], [opt], tadv)
                    adam.update(termg[0], term_mult * 5e-7 * cur_lrmult)
                    adam.update(grads, lr_mult * optim_stepsize * cur_lrmult)
                    losses.append(newlosses)

        # Record 3d simulations
        if iters_so_far % 50 == 0 and render:
            record_behavior(env,
                            pi,
                            iteration=iters_so_far,
                            stochastic=True,
                            num_opts=num_options,
                            frames=2052,
                            dirname=results_name)

        # Record Trajectories with option distinction
        if iters_so_far % 5 == 0 and render:
            if hasattr(env, 'NAME'):
                record_tmaze(env,
                             pi,
                             iteration=iters_so_far,
                             stochastic=True,
                             num_opts=num_options,
                             frames=2052,
                             dirname=results_name,
                             epoch=epoch)
            elif env.spec.id[:-3].lower(
            ) == "miniworld-oneroom" or env.spec.id[:-3].lower(
            ) == "miniworld-tmaze":
                record_oneroom(env,
                               pi,
                               iteration=iters_so_far,
                               stochastic=True,
                               num_opts=num_options,
                               frames=2052,
                               dirname=results_name,
                               epoch=epoch)

        lrlocal = (seg["ep_lens"], seg["ep_rets"], seg["ent_rets"],
                   seg["opt_switches"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews, prew, switches = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        prewbuffer.extend(prew)
        logger.record_tabular("OptSwitches", np.mean(switches))
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EntropyMean", np.mean(prewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()

        ### Book keeping
        if saves:
            out = "{},{},{},{}"
            for _ in range(num_options):
                out += ",{},{},{},{}"
            out += "\n"

            info = [
                iters_so_far,
                np.mean(rewbuffer),
                np.mean(prewbuffer),
                np.mean(seg["opt_switches"])
            ]
            for i in range(num_options):
                info.append(opt_d[i])
            for i in range(num_options):
                info.append(opt_steps[i])
            for i in range(num_options):
                info.append(std[i])
            for i in range(num_options):
                info.append(np.mean(np.array(seg['term_p']), axis=0)[i])
            for i in range(num_options):
                info.append(np.mean(t_advs[i]))

            results.write(out.format(*info))
            results.flush()
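
The option-critic variant above routes each transition to a per-option Dataset with np.where(opts == opt) and keeps undersized batches around until they reach min_batch. A simplified sketch of that accumulation logic, with plain dicts standing in for the Dataset class (names and the min_batch default are illustrative):

import numpy as np

def accumulate_option_batches(buffers, ob, ac, adv, opts, num_options, min_batch=160):
    # Group transitions by option index; only release a batch for
    # optimization once it holds at least min_batch samples, otherwise
    # keep it in `buffers` and merge it with the next rollout.
    ready = {}
    for opt in range(num_options):
        idx = np.where(opts == opt)[0]
        if idx.size == 0:
            continue
        new = {"ob": ob[idx], "ac": ac[idx], "adv": adv[idx]}
        if opt in buffers:
            new = {k: np.concatenate([buffers[opt][k], new[k]]) for k in new}
        if len(new["ob"]) >= min_batch:
            ready[opt] = new
            buffers.pop(opt, None)
        else:
            buffers[opt] = new
    return ready
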
Exemple #17
0
def learn(
    env,
    test_env,
    policy_func,
    *,
    timesteps_per_batch,  # timesteps per actor per update
    clip_param,
    entcoeff,  # clipping parameter epsilon, entropy coeff
    optim_epochs,
    optim_stepsize,
    optim_batchsize,  # optimization hypers
    gamma,
    lam,  # advantage estimation
    max_timesteps=0,
    max_episodes=0,
    max_iters=0,
    max_seconds=0,  # time constraint
    callback=None,  # you can do anything in the callback, since it takes locals(), globals()
    adam_epsilon=1e-5,
    schedule='constant'  # annealing for stepsize parameters (epsilon and adam)
):
    # Setup losses and stuff
    # ----------------------------------------
    rew_mean = []

    ob_space = env.observation_space
    pro_ac_space = env.action_space
    adv_ac_space = env.adv_action_space

    pro_pi = policy_func("pro_pi", ob_space,
                         pro_ac_space)  # Construct network for new policy
    pro_oldpi = policy_func("pro_oldpi", ob_space,
                            pro_ac_space)  # Network for old policy
    adv_pi = policy_func("adv_pi", ob_space,
                         adv_ac_space)  # Construct network for new adv policy
    adv_oldpi = policy_func("adv_oldpi", ob_space,
                            adv_ac_space)  # Network for old adv policy

    pro_atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    adv_atarg = tf.placeholder(dtype=tf.float32, shape=[None])
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    pro_ac = pro_pi.pdtype.sample_placeholder([None])
    adv_ac = adv_pi.pdtype.sample_placeholder([None])

    pro_kloldnew = pro_oldpi.pd.kl(pro_pi.pd)  # KL divergence between old and new policies
    adv_kloldnew = adv_oldpi.pd.kl(adv_pi.pd)
    pro_ent = pro_pi.pd.entropy()
    adv_ent = adv_pi.pd.entropy()
    pro_meankl = U.mean(pro_kloldnew)
    adv_meankl = U.mean(adv_kloldnew)
    pro_meanent = U.mean(pro_ent)
    adv_meanent = U.mean(adv_ent)
    pro_pol_entpen = (-entcoeff) * pro_meanent
    adv_pol_entpen = (-entcoeff) * adv_meanent

    pro_ratio = tf.exp(pro_pi.pd.logp(pro_ac) -
                       pro_oldpi.pd.logp(pro_ac))  # pnew / pold
    adv_ratio = tf.exp(adv_pi.pd.logp(adv_ac) - adv_oldpi.pd.logp(adv_ac))
    pro_surr1 = pro_ratio * pro_atarg  # surrogate from conservative policy iteration
    adv_surr1 = adv_ratio * adv_atarg
    pro_surr2 = U.clip(pro_ratio, 1.0 - clip_param,
                       1.0 + clip_param) * pro_atarg  #
    adv_surr2 = U.clip(adv_ratio, 1.0 - clip_param,
                       1.0 + clip_param) * adv_atarg
    pro_pol_surr = -U.mean(tf.minimum(
        pro_surr1, pro_surr2))  # PPO's pessimistic surrogate (L^CLIP)
    adv_pol_surr = -U.mean(tf.minimum(adv_surr1, adv_surr2))
    pro_vf_loss = U.mean(tf.square(pro_pi.vpred - ret))
    adv_vf_loss = U.mean(tf.square(adv_pi.vpred - ret))
    pro_total_loss = pro_pol_surr + pro_pol_entpen + pro_vf_loss
    adv_total_loss = adv_pol_surr + adv_pol_entpen + adv_vf_loss
    pro_losses = [
        pro_pol_surr, pro_pol_entpen, pro_vf_loss, pro_meankl, pro_meanent
    ]
    pro_loss_names = [
        "pro_pol_surr", "pro_pol_entpen", "pro_vf_loss", "pro_kl", "pro_ent"
    ]
    adv_losses = [
        adv_pol_surr, adv_pol_entpen, adv_vf_loss, adv_meankl, adv_meanent
    ]
    adv_loss_names = [
        "adv_pol_surr", "adv_pol_entpen", "adv_vf_loss", "adv_kl", "adv_ent"
    ]

    pro_var_list = pro_pi.get_trainable_variables()
    adv_var_list = adv_pi.get_trainable_variables()
    pro_lossandgrad = U.function([ob, pro_ac, pro_atarg, ret, lrmult],
                                 pro_losses +
                                 [U.flatgrad(pro_total_loss, pro_var_list)])
    adv_lossandgrad = U.function([ob, adv_ac, adv_atarg, ret, lrmult],
                                 adv_losses +
                                 [U.flatgrad(adv_total_loss, adv_var_list)])
    pro_adam = MpiAdam(pro_var_list, epsilon=adam_epsilon)
    adv_adam = MpiAdam(adv_var_list, epsilon=adam_epsilon)

    pro_assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv) for (oldv, newv) in zipsame(
                pro_oldpi.get_variables(), pro_pi.get_variables())
        ])
    adv_assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv) for (oldv, newv) in zipsame(
                adv_oldpi.get_variables(), adv_pi.get_variables())
        ])
    pro_compute_losses = U.function([ob, pro_ac, pro_atarg, ret, lrmult],
                                    pro_losses)
    adv_compute_losses = U.function([ob, adv_ac, adv_atarg, ret, lrmult],
                                    adv_losses)

    U.initialize()
    pro_adam.sync()
    adv_adam.sync()

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pro_pi,
                                     adv_pi,
                                     env,
                                     timesteps_per_batch,
                                     stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0,
         max_seconds > 0]) == 1, "Only one time constraint permitted"

    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)

        seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, pro_ac, adv_ac, pro_atarg, adv_atarg, pro_tdlamret, adv_tdlamret = seg[
            "ob"], seg["pro_ac"], seg["adv_ac"], seg["pro_adv"], seg[
                "adv_adv"], seg["pro_tdlamret"], seg["adv_tdlamret"]
        pro_vpredbefore = seg[
            "pro_vpred"]  # predicted value function before update
        adv_vpredbefore = seg["adv_vpred"]
        pro_atarg = (pro_atarg - pro_atarg.mean()) / pro_atarg.std(
        )  # standardized advantage function estimate
        adv_atarg = (adv_atarg - adv_atarg.mean()) / adv_atarg.std()

        # TODO
        d = Dataset(dict(ob=ob, ac=pro_ac, atarg=pro_atarg,
                         vtarg=pro_tdlamret),
                    shuffle=not pro_pi.recurrent)
        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(pro_pi, "ob_rms"):
            pro_pi.ob_rms.update(ob)  # update running mean/std for policy

        pro_assign_old_eq_new(
        )  # set old parameter values to new parameter values

        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            pro_losses = [
            ]  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                *newlosses, g = pro_lossandgrad(batch["ob"], batch["ac"],
                                                batch["atarg"], batch["vtarg"],
                                                cur_lrmult)
                pro_adam.update(g, optim_stepsize * cur_lrmult)
                pro_losses.append(newlosses)

        pro_losses = []
        for batch in d.iterate_once(optim_batchsize):
            newlosses = pro_compute_losses(batch["ob"], batch["ac"],
                                           batch["atarg"], batch["vtarg"],
                                           cur_lrmult)
            pro_losses.append(newlosses)
        pro_meanlosses, _, _ = mpi_moments(pro_losses, axis=0)

        d = Dataset(dict(ob=ob, ac=adv_ac, atarg=adv_atarg,
                         vtarg=adv_tdlamret),
                    shuffle=not adv_pi.recurrent)
        if hasattr(adv_pi, "ob_rms"): adv_pi.ob_rms.update(ob)
        adv_assign_old_eq_new()

        # logger.log(fmt_row(13, adv_loss_names))
        for _ in range(optim_epochs):
            adv_losses = [
            ]  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                *newlosses, g = adv_lossandgrad(batch["ob"], batch["ac"],
                                                batch["atarg"], batch["vtarg"],
                                                cur_lrmult)
                adv_adam.update(g, optim_stepsize * cur_lrmult)
                adv_losses.append(newlosses)

        adv_losses = []
        for batch in d.iterate_once(optim_batchsize):
            newlosses = adv_compute_losses(batch["ob"], batch["ac"],
                                           batch["atarg"], batch["vtarg"],
                                           cur_lrmult)
            adv_losses.append(newlosses)
        adv_meanlosses, _, _ = mpi_moments(adv_losses, axis=0)
        curr_rew = evaluate(pro_pi, test_env)
        rew_mean.append(curr_rew)
        print(curr_rew)

        # logger.record_tabular("ev_tdlam_before", explained_variance(pro_vpredbefore, pro_tdlamret))
        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1

    return np.array(rew_mean)
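
Exemple #17 trains a protagonist policy and an adversary on the same rollouts, alternating their PPO updates every iteration. A schematic sketch of that alternation is below; update_ppo is a stand-in for the lossandgrad/MpiAdam machinery shown above, not a real helper in the snippet.

def adversarial_iteration(seg, update_ppo, optim_epochs):
    # One iteration of the adversarial loop: update the protagonist on its
    # own advantages, then the adversary on its own, from the same segment.
    update_ppo(policy="pro", ob=seg["ob"], ac=seg["pro_ac"],
               atarg=seg["pro_adv"], vtarg=seg["pro_tdlamret"],
               epochs=optim_epochs)
    update_ppo(policy="adv", ob=seg["ob"], ac=seg["adv_ac"],
               atarg=seg["adv_adv"], vtarg=seg["adv_tdlamret"],
               epochs=optim_epochs)
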
Exemple #18
0
    def learn(self):
        """Adapted from baselines/ppo1/pposgd_simple.py"""
        env = self._env
        with self._session as sess:
            ob_space = env.observation_space
            ac_space = env.action_space
            # Construct network for new policy
            pi = policy_fn("pi", ob_space, ac_space)
            # Network for old policy
            oldpi = policy_fn("oldpi", ob_space, ac_space)
            # Target advantage function (if applicable)
            atarg = tf.placeholder(dtype=tf.float32, shape=[None])
            # Empirical return
            ret = tf.placeholder(dtype=tf.float32, shape=[None])
            # learning rate multiplier, updated with schedule
            lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[])
            # Annealed clipping parameter epsilon
            clip_param = self._clip_param * lrmult

            ob = tf_util.get_placeholder_cached(name="ob")
            ac = pi.pdtype.sample_placeholder([None])

            kloldnew = oldpi.pd.kl(pi.pd)
            ent = pi.pd.entropy()
            meankl = tf_util.mean(kloldnew)
            meanent = tf_util.mean(ent)
            pol_entpen = (-self._entcoeff) * meanent

            # pnew / pold
            ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))
            # surrogate from conservative policy iteration
            surr1 = ratio * atarg
            surr2 = tf_util.clip(ratio, 1.0 - clip_param,
                                 1.0 + clip_param) * atarg
            # PPO's pessimistic surrogate (L^CLIP)
            pol_surr = -tf_util.mean(tf.minimum(surr1, surr2))
            vf_loss = tf_util.mean(tf.square(pi.vpred - ret))
            total_loss = pol_surr + pol_entpen + vf_loss
            losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
            loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

            var_list = pi.get_trainable_variables()
            lossandgrad = tf_util.function(
                [ob, ac, atarg, ret, lrmult],
                losses + [tf_util.flatgrad(total_loss, var_list)])
            adam = MpiAdam(var_list, epsilon=self._adam_epsilon)

            assign_old_eq_new = tf_util.function(
                [], [],
                updates=[
                    tf.assign(oldv, newv) for (oldv, newv) in zipsame(
                        oldpi.get_variables(), pi.get_variables())
                ])
            compute_losses = tf_util.function([ob, ac, atarg, ret, lrmult],
                                              losses)

            tf_util.initialize()
            adam.sync()

            # Prepare for rollouts
            # ----------------------------------------
            seg_gen = traj_segment_generator(
                pi,
                env,
                self._timesteps_per_batch,
                stochastic=True,
                rw_sampling_args=self._random_walk_sampling_args)

            if self._load_model and self._save_path is not None:
                state = load_baselines_model(self._save_path)

                episodes_so_far = state.get("num_episodes", 0)
                timesteps_so_far = state.get("num_timesteps", 0)
                iters_so_far = state.get("num_iters", 0)
            else:
                episodes_so_far = 0
                timesteps_so_far = 0
                iters_so_far = 0

            tstart = time.time()

            lenbuffer = deque(maxlen=100)
            rewbuffer = deque(maxlen=100)
            errorbuffer = deque(maxlen=100)
            rwerrorbuffer = deque(maxlen=100)
            errordiffbuffer = deque(maxlen=100)

            time_constraint_satisfied = (sum([
                self._max_iters > 0, self._max_timesteps > 0,
                self._max_episodes > 0, self._max_seconds > 0
            ]) == 1)
            assert time_constraint_satisfied, "Only one time constraint permitted"
            while True:
                if self._callback: self._callback(locals(), globals())

                end = ((self._max_timesteps
                        and timesteps_so_far >= self._max_timesteps)
                       or (self._max_episodes
                           and episodes_so_far >= self._max_episodes)
                       or (self._max_iters and iters_so_far >= self._max_iters)
                       or (self._max_seconds
                           and time.time() - tstart >= self._max_seconds))

                if end:
                    save_baselines_model(self._save_path,
                                         {'num_episodes': episodes_so_far})
                    break
                elif (self._save_path is not None and self._save_freq > 0
                      and episodes_so_far % self._save_freq == 0):
                    save_baselines_model(self._save_path,
                                         {'num_episodes': episodes_so_far})

                if self._schedule == 'constant':
                    cur_lrmult = 1.0
                elif self._schedule == 'linear':
                    cur_lrmult = max(
                        1.0 - float(timesteps_so_far) / self._max_timesteps, 0)
                else:
                    raise NotImplementedError

                logger.log("********** Iteration %i ************" %
                           iters_so_far)

                seg = seg_gen.__next__()
                add_vtarg_and_adv(seg, self._gamma, self._lambda)

                ob, ac, atarg, tdlamret = (seg["ob"], seg["ac"], seg["adv"],
                                           seg["tdlamret"])
                # predicted value function before update
                vpredbefore = seg["vpred"]
                # standardized advantage function estimate
                atarg = (atarg - atarg.mean()) / atarg.std()
                d = Dataset(
                    {
                        'ob': ob,
                        'ac': ac,
                        'atarg': atarg,
                        'vtarg': tdlamret
                    },
                    shuffle=not pi.recurrent)
                optim_batchsize = self._optim_batchsize or ob.shape[0]
                # update running mean/std for policy
                if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob)

                # set old parameter values to new parameter values
                assign_old_eq_new()
                logger.log("Optimizing...")
                logger.log(fmt_row(13, loss_names))
                # Here we do a bunch of optimization epochs over the data
                for _ in range(self._optim_epochs):
                    # list of tuples, each of which gives the loss for a minibatch
                    losses = []
                    for batch in d.iterate_once(optim_batchsize):
                        *newlosses, g = lossandgrad(batch["ob"], batch["ac"],
                                                    batch["atarg"],
                                                    batch["vtarg"], cur_lrmult)
                        adam.update(g, self._optim_stepsize * cur_lrmult)
                        losses.append(newlosses)
                    logger.log(fmt_row(13, np.mean(losses, axis=0)))

                logger.log("Evaluating losses...")
                losses = []
                for batch in d.iterate_once(optim_batchsize):
                    newlosses = compute_losses(batch["ob"], batch["ac"],
                                               batch["atarg"], batch["vtarg"],
                                               cur_lrmult)
                    losses.append(newlosses)
                meanlosses, _, _ = mpi_moments(losses, axis=0)

                logger.log(fmt_row(13, meanlosses))
                for (lossval, name) in zipsame(meanlosses, loss_names):
                    logger.record_tabular("loss_" + name, lossval)
                logger.record_tabular(
                    "ev_tdlam_before",
                    explained_variance(vpredbefore, tdlamret))
                lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
                listoflrpairs = MPI.COMM_WORLD.allgather(
                    lrlocal)  # list of tuples
                lens, rews = map(flatten_lists, zip(*listoflrpairs))
                lenbuffer.extend(lens)
                rewbuffer.extend(rews)

                # handle graph errors
                errors = seg["ep_errors"]
                rw_errors = seg["ep_rw_errors"]
                error_diffs = seg["ep_error_diffs"]
                listoferrors = MPI.COMM_WORLD.allgather(errors)
                listofrwerrors = MPI.COMM_WORLD.allgather(rw_errors)
                listoferrordiffs = MPI.COMM_WORLD.allgather(error_diffs)
                errors = flatten_lists(listoferrors)
                rwerrors = flatten_lists(listofrwerrors)
                errordiffs = flatten_lists(listoferrordiffs)
                errorbuffer.extend(errors)
                rwerrorbuffer.extend(rwerrors)
                errordiffbuffer.extend(errordiffs)

                logger.record_tabular("EpErrorMean", np.mean(errorbuffer))
                logger.record_tabular("EpRwErrorMean", np.mean(rwerrorbuffer))
                logger.record_tabular("EpErrorDiffMean",
                                      np.mean(errordiffbuffer))
                logger.record_tabular("EpLenMean", np.mean(lenbuffer))
                logger.record_tabular("EpRewMean", np.mean(rewbuffer))
                logger.record_tabular("EpThisIter", len(lens))
                episodes_so_far += len(lens)
                timesteps_so_far += sum(lens)
                iters_so_far += 1
                logger.record_tabular("EpisodesSoFar", episodes_so_far)
                logger.record_tabular("TimestepsSoFar", timesteps_so_far)
                logger.record_tabular("TimeElapsed", time.time() - tstart)
                if MPI.COMM_WORLD.Get_rank() == 0:
                    logger.dump_tabular()
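
The Dataset object behind d.iterate_once above comes from the surrounding codebase and is not listed on this page. Purely for illustration, and assuming it shuffles once and then yields aligned minibatches, an equivalent iterator might look like this (a sketch, not the actual class):

import numpy as np

def iterate_minibatches(data, batch_size, shuffle=True):
    # Yield dicts of aligned minibatches, roughly what Dataset.iterate_once does.
    n = len(next(iter(data.values())))
    idx = np.random.permutation(n) if shuffle else np.arange(n)
    for start in range(0, n, batch_size):
        sel = idx[start:start + batch_size]
        yield {key: val[sel] for key, val in data.items()}

# usage sketch:
# for batch in iterate_minibatches({"ob": ob, "ac": ac, "atarg": atarg, "vtarg": tdlamret}, 64):
#     ...
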
def learn(
    env,
    policy_func,
    *,
    timesteps_per_batch,  # timesteps per actor per update
    clip_param,
    entcoeff,  # clipping parameter epsilon, entropy coeff
    optim_epochs,
    optim_stepsize,
    optim_batchsize,  # optimization hypers
    gamma,
    lam,  # advantage estimation
    max_timesteps=0,
    max_episodes=0,
    max_iters=0,
    max_seconds=0,  # time constraint
    callback=None,  # you can do anything in the callback, since it takes locals(), globals()
    adam_epsilon=1e-5,
    schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
    load_model_path,
    test_only,
    stochastic,
    symmetric_training=False,
    obs_names=None,
    single_episode=False,
    horizon_hack=False,
    running_avg_len=100,
    init_three=False,
    actions=None,
    symmetric_training_trick=False,
    seeds_fn=None,
    bootstrap_seeds=False,
):
    global seeds
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space)  # Network for new policy
    old_pi = policy_func("old_pi", ob_space,
                         ac_space)  # Network for old policy
    adv_targ = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return
    mask = tf.placeholder(dtype=tf.bool, shape=[None])  # Mask for the trick

    lr_mult = tf.placeholder(
        name='lr_mult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lr_mult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    st = U.get_placeholder_cached(name="st")
    ac = pi.pdtype.sample_placeholder([None])

    kl = old_pi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    mean_kl = U.mean(tf.boolean_mask(kl, mask))  # Mean over the batch
    mean_ent = U.mean(tf.boolean_mask(ent, mask))
    entropy_penalty = -entcoeff * mean_ent

    ratio = tf.exp(pi.pd.logp(ac) - old_pi.pd.logp(ac))  # pi_new / pi_old
    surr_1 = ratio * adv_targ  # surrogate from conservative policy iteration
    surr_2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * adv_targ  #
    surr_loss = -U.mean(tf.boolean_mask(
        tf.minimum(surr_1, surr_2),
        mask))  # PPO's pessimistic surrogate (L^CLIP), mean over the batch
    vf_loss = U.mean(tf.boolean_mask(tf.square(pi.vpred - ret), mask))
    total_loss = surr_loss + entropy_penalty + vf_loss
    losses = [surr_loss, entropy_penalty, vf_loss, mean_kl, mean_ent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    comp_loss_and_grad = U.function([ob, st, ac, adv_targ, ret, lr_mult, mask],
                                    losses +
                                    [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(old_v, new_v)
            for (old_v,
                 new_v) in zipsame(old_pi.get_variables(), pi.get_variables())
        ])
    comp_loss = U.function([ob, st, ac, adv_targ, ret, lr_mult, mask], losses)

    if init_three:
        assign_init_three_1 = U.function(
            [], [],
            updates=[
                tf.assign(new_v, old_v) for (old_v, new_v) in zipsame(
                    pi.get_orig_variables(), pi.get_part_variables(1))
            ])
        assign_init_three_2 = U.function(
            [], [],
            updates=[
                tf.assign(new_v, old_v) for (old_v, new_v) in zipsame(
                    pi.get_orig_variables(), pi.get_part_variables(2))
            ])

    U.initialize()
    if load_model_path is not None:
        U.load_state(load_model_path)
        if init_three:
            assign_init_three_1()
            assign_init_three_2()
    adam.sync()

    if seeds_fn is not None:
        with open(seeds_fn) as f:
            seeds = [int(seed) for seed in f.readlines()]
    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_batch,
                                     stochastic=stochastic,
                                     single_episode=test_only
                                     or single_episode,
                                     actions=actions,
                                     bootstrap_seeds=bootstrap_seeds)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    len_buffer = deque(
        maxlen=running_avg_len)  # rolling buffer for episode lengths
    rew_buffer = deque(
        maxlen=running_avg_len)  # rolling buffer for episode rewards
    origrew_buffer = deque(
        maxlen=running_avg_len)  # rolling buffer for original episode rewards

    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0,
         max_seconds > 0]) == 1, "Only one time constraint permitted"

    while True:
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)

        seg = seg_gen.__next__()

        add_vtarg_and_adv(seg, gamma, lam, horizon_hack=horizon_hack)

        # ob, ac, adv_targ, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, st, ac, adv_targ, tdlamret = seg["ob"], seg["step"], seg[
            "ac"], seg["adv"], seg["tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update

        if symmetric_training_trick:
            first_75 = st < 75
            mask = ~np.concatenate((np.zeros_like(first_75), first_75))
        else:
            mask = np.concatenate(
                (np.ones_like(st, dtype=bool), np.ones_like(st, dtype=bool)))
        if symmetric_training:
            sym_obss = []
            sym_acc = []
            for i in range(timesteps_per_batch):
                obs = OrderedDict(zip(obs_names, ob[i]))
                sym_obs = obs.copy()
                swap_legs(sym_obs)

                sym_ac = ac[i].copy()
                sym_ac = np.concatenate((sym_ac[9:], sym_ac[:9]))
                sym_obss.append(np.asarray(list(sym_obs.values())))
                sym_acc.append(sym_ac)
            sym_obss = np.asarray(sym_obss)
            sym_acc = np.asarray(sym_acc)

            ob = np.concatenate((ob, sym_obss))
            ac = np.concatenate((ac, sym_acc))
            adv_targ = np.concatenate((adv_targ, adv_targ))
            tdlamret = np.concatenate((tdlamret, tdlamret))
            vpredbefore = np.concatenate((vpredbefore, vpredbefore))
            st = np.concatenate((st, st))

        # Compute stats before updating
        if bootstrap_seeds:
            lrlocal = (seg["ep_lens"], seg["ep_rets"], seg["ep_orig_rets"],
                       seg["easy_seeds"], seg["hard_seeds"])  # local values
            listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
            lens, rews, orig_rews, easy_seeds, hard_seeds = map(
                flatten_lists, zip(*listoflrpairs))
            easy_seeds = [x for x in easy_seeds if x != 0]
            hard_seeds = [x for x in hard_seeds if x != 0]
            print('seeds set sizes:', len(seeds), len(easy_seeds),
                  len(hard_seeds))
            seeds = list((set(seeds) - set(easy_seeds)) | set(hard_seeds))
        else:
            lrlocal = (seg["ep_lens"], seg["ep_rets"], seg["ep_orig_rets"]
                       )  # local values
            listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
            lens, rews, orig_rews = map(flatten_lists, zip(*listoflrpairs))

        len_buffer.extend(lens)
        rew_buffer.extend(rews)
        origrew_buffer.extend(orig_rews)
        logger.record_tabular("Iter", iters_so_far)
        logger.record_tabular("EpLenMean", np.mean(len_buffer))
        logger.record_tabular("EpRewMean", np.mean(rew_buffer))
        logger.record_tabular("EpOrigRewMean", np.mean(origrew_buffer))
        logger.record_tabular("EpOrigRewStd", np.std(origrew_buffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)

        n_completed = 0
        sum_completed = 0
        for ep_len, orig_rew in zip(lens, orig_rews):
            if ep_len == 1000:
                n_completed += 1
                sum_completed += orig_rew
        avg_completed = sum_completed / n_completed if n_completed > 0 else 0
        logger.record_tabular("AvgCompleted", avg_completed)
        perc_completed = 100 * n_completed / len(lens) if len(lens) > 0 else 0
        logger.record_tabular("PercCompleted", perc_completed)

        if callback: callback(locals(), globals())

        adv_targ = (adv_targ - adv_targ.mean()) / adv_targ.std(
        )  # standardized advantage function estimate
        d = Dataset(dict(ob=ob,
                         st=st,
                         ac=ac,
                         atarg=adv_targ,
                         vtarg=tdlamret,
                         mask=mask),
                    shuffle=not pi.recurrent)
        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        assign_old_eq_new()  # set old parameter values to new parameter values
        logger.log("Optimizing...")
        if not test_only:
            logger.log(fmt_row(13, loss_names))
        # Here we do a bunch of optimization epochs over the data. I log results only for the first worker (rank=0)
        for _ in range(optim_epochs):
            losses = [
            ]  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                *batch_losses, grads = comp_loss_and_grad(
                    batch["ob"], batch["st"], batch["ac"], batch["atarg"],
                    batch["vtarg"], cur_lrmult, batch["mask"])
                if not test_only:
                    adam.update(grads, optim_stepsize * cur_lrmult)
                losses.append(batch_losses)
            logger.log(fmt_row(13, np.mean(losses, axis=0)))

        logger.log("Evaluating losses...")
        losses = []
        for batch in d.iterate_once(optim_batchsize):
            batch_losses = comp_loss(batch["ob"], batch["st"], batch["ac"],
                                     batch["atarg"], batch["vtarg"],
                                     cur_lrmult, batch["mask"])
            losses.append(batch_losses)
        meanlosses, _, _ = mpi_moments(losses, axis=0)
        logger.log(fmt_row(13, meanlosses))
        for (lossval, name) in zipsame(meanlosses, loss_names):
            logger.record_tabular("loss_" + name, lossval)
        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))

        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()

        iters_so_far += 1
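
For reference, the masked pessimistic surrogate assembled in the TensorFlow graph above can be restated in a few lines of NumPy. This is an illustrative sketch of the math only, with hypothetical inputs, not code from the example:

import numpy as np

def clipped_surrogate(logp_new, logp_old, adv, mask, clip_eps=0.2):
    # L^CLIP with a boolean mask: -mean over kept samples of min(r*A, clip(r)*A).
    ratio = np.exp(logp_new - logp_old)            # pi_new / pi_old
    surr1 = ratio * adv
    surr2 = np.clip(ratio, 1.0 - clip_eps, 1.0 + clip_eps) * adv
    return -np.mean(np.minimum(surr1, surr2)[mask])

# e.g. clipped_surrogate(np.array([-1.0, -0.5]), np.array([-1.1, -0.4]),
#                        np.array([0.3, -0.2]), np.array([True, True]))
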
Exemple #20
0
def learn(env, policy_func, *,
        timesteps_per_batch, # timesteps per actor per update
        clip_param, entcoeff, # clipping parameter epsilon, entropy coeff
        optim_epochs, optim_stepsize, optim_batchsize,# optimization hypers
        gamma, lam, # advantage estimation
        max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0,  # time constraint
        callback=None, # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant' # annealing for stepsize parameters (epsilon and adam)
        ):
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space) # Construct network for new policy
    oldpi = policy_func("oldpi", ob_space, ac_space) # Network for old policy
    atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return

    lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold
    surr1 = ratio * atarg # surrogate from conservative policy iteration
    surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg #
    pol_surr = - U.mean(tf.minimum(surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = U.mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv)
        for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()
    adam.sync()

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards

    assert sum([max_iters>0, max_timesteps>0, max_episodes>0, max_seconds>0])==1, "Only one time constraint permitted"

    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult =  max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************"%iters_so_far)

        seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
        vpredbefore = seg["vpred"] # predicted value function before update
        atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate
        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent)
        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy

        assign_old_eq_new() # set old parameter values to new parameter values
        logger.log("Optimizing...")
        logger.log(fmt_row(13, loss_names))
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = [] # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult)
                adam.update(g, optim_stepsize * cur_lrmult) 
                losses.append(newlosses)
            logger.log(fmt_row(13, np.mean(losses, axis=0)))

        logger.log("Evaluating losses...")
        losses = []
        for batch in d.iterate_once(optim_batchsize):
            newlosses = compute_losses(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult)
            losses.append(newlosses)            
        meanlosses,_,_ = mpi_moments(losses, axis=0)
        logger.log(fmt_row(13, meanlosses))
        for (lossval, name) in zipsame(meanlosses, loss_names):
            logger.record_tabular("loss_"+name, lossval)
        logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret))
        lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if MPI.COMM_WORLD.Get_rank()==0:
            logger.dump_tabular()
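
add_vtarg_and_adv is called by the learn functions on this page but is not listed here. Below is a sketch of the standard GAE(lambda) recursion it is assumed to implement, with the seg field names ("rew", "vpred", "new", "nextvpred") inferred from how seg is used above:

import numpy as np

def add_vtarg_and_adv_sketch(seg, gamma, lam):
    # Compute GAE(lambda) advantages and lambda-returns in place (assumed behaviour).
    new = np.append(seg["new"], 0)                      # episode-start flags, shifted
    vpred = np.append(seg["vpred"], seg["nextvpred"])   # bootstrap with the next value
    T = len(seg["rew"])
    adv = np.empty(T, dtype=np.float32)
    lastgaelam = 0.0
    for t in reversed(range(T)):
        nonterminal = 1 - new[t + 1]
        delta = seg["rew"][t] + gamma * vpred[t + 1] * nonterminal - vpred[t]
        adv[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
    seg["adv"] = adv
    seg["tdlamret"] = adv + seg["vpred"]
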
    def __init__(self, env, policy, 
                 emb_network, emb_size,
                 clip_param, entcoeff, # clipping parameter epsilon, entropy coeff
                 optim_epochs, optim_stepsize, optim_batchsize,# optimization hypers
                 gamma, lam, # advantage estimation
                 max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0,  # time constraint
                 adam_epsilon=1e-5,
                 schedule='constant',
                 joint_training=False
                 ):
        # Setup variables
        self.optim_epochs = optim_epochs
        self.optim_stepsize = optim_stepsize
        self.optim_batchsize = optim_batchsize
        self.gamma = gamma
        self.lam = lam
        self.max_timesteps = max_timesteps
        self.adam_epsilon = adam_epsilon
        self.schedule = schedule

        # Setup losses and stuff
        # ----------------------------------------
        with tf.name_scope('ppo'):
            ob_space = env.observation_space
            ac_space = env.action_space
            self.pi = policy # Construct network for new policy
            oldpi = Policy("old_policy", env.action_space, joint_training, emb_size, emb_network) # Network for old policy
            atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable)
            ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return

            lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule
            clip_param = clip_param * lrmult # Annealed clipping parameter epsilon

            # ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[None] + list(ob_space.shape))
            if joint_training:
                ob = U.get_placeholder_cached(name="ob_f")
            else:
                ob = U.get_placeholder_cached(name="ob")
            ac = self.pi.pdtype.sample_placeholder([None])

            kloldnew = oldpi.pd.kl(self.pi.pd)
            ent = self.pi.pd.entropy()
            meankl = U.mean(kloldnew)
            meanent = U.mean(ent)
            pol_entpen = (-entcoeff) * meanent

            ratio = tf.exp(self.pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold
            surr1 = ratio * atarg # surrogate from conservative policy iteration
            surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg #
            pol_surr = - U.mean(tf.minimum(surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP)
            vf_loss = U.mean(tf.square(self.pi.vpred - ret))
            self.total_loss = pol_surr + pol_entpen + vf_loss
            losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
            self.loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

            var_list = self.pi.get_trainable_variables()
            self.lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(self.total_loss, var_list)])
            self.adam = MpiAdam(var_list, epsilon=adam_epsilon)

            self.assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv)
                for (oldv, newv) in zipsame(oldpi.get_variables(), self.pi.get_variables())])
            self.compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

            U.initialize()
            self.adam.sync()

        # Prepare for rollouts
        # ----------------------------------------
        self.episodes_so_far = 0
        self.timesteps_so_far = 0
        self.iters_so_far = 0
        self.tstart = time.time()
        self.lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths
        self.rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards
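
Every trainer above applies the same schedule logic: a single multiplier scales both the Adam step size and the clip range each iteration. A compact restatement of that computation, as used in the learn loops:

def lr_multiplier(schedule, timesteps_so_far, max_timesteps):
    # cur_lrmult as computed in the training loops above.
    if schedule == 'constant':
        return 1.0
    if schedule == 'linear':
        return max(1.0 - float(timesteps_so_far) / max_timesteps, 0.0)
    raise NotImplementedError(schedule)

# the same multiplier anneals both quantities:
# effective_clip = clip_param * lr_multiplier(schedule, timesteps_so_far, max_timesteps)
# effective_step = optim_stepsize * lr_multiplier(schedule, timesteps_so_far, max_timesteps)
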
Exemple #22
0
    def __init__(self, env, policy_func, *,
        timesteps_per_batch, # timesteps per actor per update
        clip_param, entcoeff, # clipping parameter epsilon, entropy coeff
        optim_epochs, optim_stepsize, optim_batchsize,# optimization hypers
        gamma, lam, # advantage estimation
        max_timesteps=0, max_episodes=0, max_iters=20, max_seconds=0,  # time constraint
        callback=None, # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-3,
        schedule='linear'):  # annealing for stepsize parameters (epsilon and adam)

        len1 = 2
        max_iters = max_iters
        ob_space = env.observation_space.spaces
        ac_space = env.action_space.spaces
        pi = [policy_func("pi" + str(i), ob_space[i], ac_space[i],placeholder_name="observation"+str(i)) for i in range(len1)]
        oldpi = [policy_func("oldpi" + str(i), ob_space[i], ac_space[i], placeholder_name="observation"+str(i)) for i in range(len1)]
        atarg = [tf.placeholder(dtype=tf.float32, shape=[None]) for i in range(len1)]
        ret = [tf.placeholder(dtype=tf.float32, shape=[None]) for i in range(len1)]
        tdlamret = [[] for i in range(len1)]
        # TODO: restore the schedule-driven lrmult used in the original implementation
        # lrmult = 1.0  # for simplicity, a constant learning rate multiplier could be used instead
        lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule
        clip_param = clip_param * lrmult

        # TODO: I do not fully understand this part; the original implementation uses
        # ob = U.get_placeholder_cached(name="ob")
        # TODO: potential bug: U.get_placeholder_cached appears to use a global cache, so an
        # observation placeholder can only be cached once; later lookups with the same name return
        # the previously cached placeholder, and it is unclear whether name scopes affect this.
        # ob1 = U.get_placeholder_cached(name="observation1") # Note: I am not sure about this point
        # # ob2 = U.get_placeholder_cached(name="observation2")
        # ob1 = U.get_placeholder_cached(name="observation0")  # Note: I am not sure about this point
        # ob2 = U.get_placeholder_cached(name="observation1")
        # TODO: the remaining question is that the pi and oldpi networks both have an observation
        # placeholder named "observation"; even in the original baselines implementation, pi and
        # oldpi presumably do not share that placeholder.

        ob = [U.get_placeholder_cached(name="observation"+str(i)) for i in range(len1)]
        # ac = tuple([pi[i].act(stochastic=True, observation=env.observation_space[i])[0]
        #      for i in range(len1)])
        # TODO: to make the policy work, the observation passed into the pi function was changed to s, which comes from env.reset()
        # s = env.reset()
        # ac = tuple([pi[i].act(stochastic=True, observation=s[i])[0]
        #             for i in range(len1)])

        ac = [pi[i].pdtype.sample_placeholder([None]) for i in range (len1)]
        kloldnew = [oldpi[i].pd.kl(pi[i].pd) for i in range(len1)]
        ent = [pi[i].pd.entropy() for i in range(len1)]
        print("ent1 and ent2 are {} and {}".format(ent[0], ent[1]))
        meankl = [U.mean(kloldnew[i]) for i in range(len1)]
        meanent = [U.mean(ent[i]) for i in range(len1)]

        pol_entpen = [(-entcoeff) * meanent[i] for i in range(len1)]
        ratio = [tf.exp(pi[i].pd.logp(ac[i]) - oldpi[i].pd.logp(ac[i])) for i in range(len1)]
        # ratio = [tf.exp(pi[i].pd.logp(ac) - oldpi[i].pd.logp(ac[i])) for i in range(len1)] #pnew / pold
        surr1 = [ratio[i] * atarg[i] for i in range(len1)]
        # U.clip wraps tf.clip_by_value(t, clip_value_min, clip_value_max, name=None),
        # where t is a Tensor.
        surr2 = [U.clip(ratio[i], 1.0 - clip_param, 1.0 + clip_param) * atarg[i] for i in range(len1)]
        pol_surr = [-U.mean(tf.minimum(surr1[i], surr2[i])) for i in range(len1)]
        vf_loss = [U.mean(tf.square(pi[i].vpred - ret[i])) for i in range(len1)]
        total_loss = [pol_surr[i] + pol_entpen[i] + vf_loss[i] for i in range(len1)]
        # Note: the miscellaneous losses below are built per agent, so they are collected
        # into lists that hold both agents' terms.
        # surr2 = U.clip(ratio[i], 1.0 - clip_param, 1.0 + clip_param)
        # pol_surr = -U.mean(tf.minimum(surr1[i], surr2[i]))
        # vf_loss = U.mean(tf.square(pi[i].vpred - ret[i]))
        # total_loss = pol_surr + pol_entpen + vf_loss

        # TODO: alternatively, the losses are organized as follows:
        losses = [[pol_surr[i], pol_entpen[i], vf_loss[i], meankl[i], meanent[i]] for i in range(len1)]
        loss_names = ["pol_sur", "pol_entpen","vf_loss", "kl", "ent"]
        var_list = [pi[i].get_trainable_variables() for i in range(len1)]

        lossandgrad = [U.function([ob[i], ac[i], atarg[i], ret[i], lrmult], losses[i] + [U.flatgrad(total_loss[i], var_list[i])]) for i in range(len1)]
        adam = [MpiAdam(var_list[i], epsilon=adam_epsilon) for i in range(2)]

        # TODO: this may not work as expected: the result is a list of U.function objects, which will not execute automatically
        # assign_old_eq_new = [U.function([],[], updates=[tf.assign(oldv, newv)
        #     for (oldv, newv) in zipsame(oldpi[i].get_variables(), pi[i].get_variables())]) for i in range(len1)]

        # compute_losses is a function, so it is built once per agent rather than copied;
        # each agent's placeholders are passed into its own instance
        compute_losses = [U.function([ob[i], ac[i], atarg[i], ret[i], lrmult], losses[i]) for i in range(len1)]
        # sess = U.get_session()
        # writer = tf.summary.FileWriter(logdir='log-mlp',graph=sess.graph)
        # when the training iterations end, save the trained model and test the win rate of the two agents.
        pi0_variables = slim.get_variables(scope="pi0")
        pi1_variables = slim.get_variables(scope="pi1")
        parameters_to_save_list0 = [v for v in pi0_variables]
        parameters_to_save_list1 = [v for v in pi1_variables]
        parameters_to_save_list = parameters_to_save_list0 + parameters_to_save_list1
        saver = tf.train.Saver(parameters_to_save_list)
        restore = tf.train.Saver(parameters_to_save_list)
        U.initialize()
        restore.restore(U.get_session(), "saveparameter/15/15.pkl")
        # U.get_session().run  # no-op leftover
        # [adam[i].sync() for i in range(2)]
        adam[0].sync()
        adam[1].sync()
        # Prepare for rollouts
        # ----------------------------------------

        self.max_iters = max_iters
        episodes_so_far = 0
        timesteps_so_far = 0
        self.iters_so_far = 0
        tstart = time.time()

        lenbuffer = [deque(maxlen=100) for i in range(len1)]  # rolling buffer for episode lengths
        rewbuffer = [deque(maxlen=100) for i in range(len1)] # rolling buffer for episode rewards
        parameters_savers = []
        assert sum([max_iters>0, max_timesteps>0, max_episodes>0, max_seconds>0])==1, "Only one time constraint permitted"
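
zipsame is used throughout these examples to pair old and new policy variables. It comes from the shared utilities and is not listed here; a plausible minimal definition, shown only for reference:

def zipsame(*seqs):
    # zip that insists all sequences have the same length.
    length = len(seqs[0])
    assert all(len(seq) == length for seq in seqs[1:]), "sequences differ in length"
    return zip(*seqs)
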
Exemple #23
0
    def __init__(self, a_name, env, policy_func, par):
        # Setup losses and stuff
        # ----------------------------------------
        self.env = env
        self.timesteps_per_actorbatch = par.timesteps_per_actorbatch
        self.optim_epochs = par.optim_epochs
        self.optim_stepsize = par.optim_stepsize
        self.optim_batchsize = par.optim_batchsize  # optimization hypers
        self.gamma = par.gamma
        self.lam = par.lam  # advantage estimation
        self.max_timesteps = par.max_timesteps
        self.max_episodes = par.max_episodes
        self.max_iters = par.max_iters
        self.max_seconds = par.max_seconds  # time constraint
        self.callback = par.callback,  # you can do anything in the callback, since it takes locals(), globals()
        self.adam_epsilon = par.adam_epsilon
        self.schedule = par.schedule  # annealing for stepsize parameters (epsilon and adam)

        self.ob_space = env.observation_space
        self.ac_space = env.action_space
        self.pi = policy_func(
            a_name, self.ob_space,
            self.ac_space)  # Construct network for new policy
        self.oldpi = policy_func("old" + a_name, self.ob_space,
                                 self.ac_space)  # Network for old policy
        self.atarg = tf.placeholder(
            dtype=tf.float32,
            shape=[None])  # Target advantage function (if applicable)
        self.ret = tf.placeholder(dtype=tf.float32,
                                  shape=[None])  # Empirical return

        self.lrmult = tf.placeholder(
            name='lrmult' + a_name, dtype=tf.float32,
            shape=[])  # learning rate multiplier, updated with schedule
        self.clip_param = par.clip_param * self.lrmult  # Annealed clipping parameter epsilon

        obname = str('ob' + str(learning_agent.index2))
        learning_agent.index2 += 1
        self.ob = U.get_placeholder_cached(name=obname)
        self.ac = self.pi.pdtype.sample_placeholder([None])

        self.kloldnew = self.oldpi.pd.kl(self.pi.pd)
        self.ent = self.pi.pd.entropy()
        self.meankl = U.mean(self.kloldnew)
        self.meanent = U.mean(self.ent)
        self.pol_entpen = (-par.entcoeff) * self.meanent

        self.ratio = tf.exp(
            self.pi.pd.logp(self.ac) -
            self.oldpi.pd.logp(self.ac))  # pnew / pold
        surr1 = self.ratio * self.atarg  # surrogate from conservative policy iteration
        surr2 = U.clip(self.ratio, 1.0 - self.clip_param,
                       1.0 + self.clip_param) * self.atarg  #
        self.pol_surr = -U.mean(tf.minimum(
            surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
        self.vf_loss = U.mean(tf.square(self.pi.vpred - self.ret))
        self.total_loss = self.pol_surr + self.pol_entpen + self.vf_loss
        self.losses = [
            self.pol_surr, self.pol_entpen, self.vf_loss, self.meankl,
            self.meanent
        ]
        self.loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

        self.var_list = self.pi.get_trainable_variables()
        self.lossandgrad = U.function(
            [self.ob, self.ac, self.atarg, self.ret, self.lrmult],
            self.losses + [U.flatgrad(self.total_loss, self.var_list)])
        self.adam = MpiAdam(self.var_list, epsilon=self.adam_epsilon)

        self.assign_old_eq_new = U.function(
            [], [],
            updates=[
                tf.assign(oldv, newv) for (oldv, newv) in zipsame(
                    self.oldpi.get_variables(), self.pi.get_variables())
            ])
        self.compute_losses = U.function(
            [self.ob, self.ac, self.atarg, self.ret, self.lrmult], self.losses)

        print(U.get_session())
        U.initialize()

        self.adam.sync()
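
The "ev_tdlam_before" diagnostic logged by these trainers measures how well the value function predicts the lambda-returns. explained_variance itself is not listed on this page; a sketch of the usual definition, 1 - Var(y - ypred) / Var(y):

import numpy as np

def explained_variance(ypred, y):
    # 1 = perfect prediction, 0 = no better than a constant, negative = worse.
    var_y = np.var(y)
    return np.nan if var_y == 0 else 1.0 - np.var(y - ypred) / var_y
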
Exemple #24
0
    def _build_model(
            self,
            input_space,
            action_size,
            policy_func,
            clip_param=0.2,
            entcoeff=0.01,  # clipping parameter epsilon, entropy coeff
            adam_epsilon=1e-5):
        sess = U.get_session()
        if sess is None:
            sess = U.make_session(8)
            sess.__enter__()

        # Setup losses and stuff
        # ----------------------------------------
        with tf.variable_scope(self.scope):
            self.pi = policy_func(
                "pi", input_space,
                action_size)  # Construct network for new policy
            self.oldpi = policy_func("oldpi", input_space,
                                     action_size)  # Network for old policy
            atarg = tf.placeholder(
                dtype=tf.float32,
                shape=[None])  # Target advantage function (if applicable)
            ret = tf.placeholder(dtype=tf.float32,
                                 shape=[None])  # Empirical return

            lrmult = tf.placeholder(
                name='lrmult', dtype=tf.float32,
                shape=[])  # learning rate multiplier, updated with schedule
            clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

            ob = U.get_placeholder_cached(name="ob")
            ac = self.pi.pdtype.sample_placeholder([None])

            kloldnew = self.oldpi.pd.kl(self.pi.pd)
            ent = self.pi.pd.entropy()
            meankl = U.mean(kloldnew)
            meanent = U.mean(ent)
            pol_entpen = (-entcoeff) * meanent

            ratio = tf.exp(self.pi.pd.logp(ac) -
                           self.oldpi.pd.logp(ac))  # pnew / pold
            surr1 = ratio * atarg  # surrogate from conservative policy iteration
            surr2 = U.clip(ratio, 1.0 - clip_param,
                           1.0 + clip_param) * atarg  #
            pol_surr = -U.mean(tf.minimum(
                surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
            vf_loss = U.mean(tf.square(self.pi.vpred - ret))
            total_loss = pol_surr + pol_entpen + vf_loss

            var_list = self.pi.get_trainable_variables()

            # more debug info
            debug_atarg = atarg
            pi_ac = self.pi.pd.logp(ac)
            opi_ac = self.oldpi.pd.logp(ac)
            vpred = U.mean(self.pi.vpred)
            pi_pd = U.mean(self.pi.pd.flatparam())
            opi_pd = self.oldpi.pd.flatparam()[0]
            kl_oldnew = kloldnew[0]
            grads = tf.gradients(total_loss, var_list)

            losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
            debugs = [
                debug_atarg, pi_ac, opi_ac, vpred, pi_pd, opi_pd, kl_oldnew,
                total_loss
            ]

            self.lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                                          losses + debugs + [var_list, grads] +
                                          [U.flatgrad(total_loss, var_list)])
            self.adam = MpiAdam(var_list, epsilon=adam_epsilon)

            self.assign_old_eq_new = U.function(
                [], [],
                updates=[
                    tf.assign(oldv, newv) for (oldv, newv) in zipsame(
                        self.oldpi.get_variables(), self.pi.get_variables())
                ])
            self.compute_losses = U.function([ob, ac, atarg, ret, lrmult],
                                             losses)

            U.initialize()
            self.adam.sync()
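
Several examples call pi.ob_rms.update(ob) to keep observation-normalization statistics current. The ob_rms object is not defined on this page; a simplified, single-process sketch of such a running mean/std tracker (the real version presumably also synchronizes moments across MPI workers):

import numpy as np

class RunningMeanStd:
    # Running mean/variance via the parallel (Chan et al.) update formula.
    def __init__(self, shape):
        self.mean = np.zeros(shape)
        self.var = np.ones(shape)
        self.count = 1e-4

    def update(self, x):
        batch_mean, batch_var, batch_count = x.mean(axis=0), x.var(axis=0), x.shape[0]
        delta = batch_mean - self.mean
        tot = self.count + batch_count
        new_mean = self.mean + delta * batch_count / tot
        m2 = self.var * self.count + batch_var * batch_count \
            + delta ** 2 * self.count * batch_count / tot
        self.mean, self.var, self.count = new_mean, m2 / tot, tot
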
Exemple #25
0
def learn(
        env,
        policy_func,
        *,
        timesteps_per_batch,  # timesteps per actor per update
        clip_param,
        entcoeff,  # clipping parameter epsilon, entropy coeff
        optim_epochs,
        optim_stepsize,
        optim_batchsize,  # optimization hypers
        gamma,
        lam,  # advantage estimation
        max_timesteps=0,
        max_episodes=0,
        max_iters=0,
        max_seconds=0,  # time constraint
        callback=None,  # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
        sym_loss_weight=0.0,
        return_threshold=None,  # terminate learning if the return reaches return_threshold
        op_after_init=None,
        init_policy_params=None,
        policy_scope=None,
        max_threshold=None,
        positive_rew_enforce=False,
        reward_drop_bound=None,
        min_iters=0,
        ref_policy_params=None,
        rollout_length_thershold=None):

    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    if policy_scope is None:
        pi = policy_func("pi", ob_space,
                         ac_space)  # Construct network for new policy
        oldpi = policy_func("oldpi", ob_space,
                            ac_space)  # Network for old policy
    else:
        pi = policy_func(policy_scope, ob_space,
                         ac_space)  # Construct network for new policy
        oldpi = policy_func("old" + policy_scope, ob_space,
                            ac_space)  # Network for old policy

    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    pol_entpen = (-entcoeff) * meanent

    sym_loss = sym_loss_weight * U.mean(
        tf.square(pi.mean - pi.mirrored_mean))  # mirror symmetric loss
    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg  #
    pol_surr = -U.mean(tf.minimum(
        surr1, surr2)) + sym_loss  # PPO's pessimistic surrogate (L^CLIP)

    vf_loss = U.mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent, sym_loss]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent", "sym_loss"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()

    if init_policy_params is not None:
        cur_scope = pi.get_variables()[0].name[0:pi.get_variables()[0].name.
                                               find('/')]
        orig_scope = list(init_policy_params.keys()
                          )[0][0:list(init_policy_params.keys())[0].find('/')]
        for i in range(len(pi.get_variables())):
            assign_op = pi.get_variables()[i].assign(
                init_policy_params[pi.get_variables()[i].name.replace(
                    cur_scope, orig_scope, 1)])
            U.get_session().run(assign_op)
            assign_op = oldpi.get_variables()[i].assign(
                init_policy_params[pi.get_variables()[i].name.replace(
                    cur_scope, orig_scope, 1)])
            U.get_session().run(assign_op)

    if ref_policy_params is not None:
        ref_pi = policy_func("ref_pi", ob_space, ac_space)
        cur_scope = ref_pi.get_variables()[0].name[0:ref_pi.get_variables()[0].
                                                   name.find('/')]
        orig_scope = list(ref_policy_params.keys()
                          )[0][0:list(ref_policy_params.keys())[0].find('/')]
        for i in range(len(ref_pi.get_variables())):
            assign_op = ref_pi.get_variables()[i].assign(
                ref_policy_params[ref_pi.get_variables()[i].name.replace(
                    cur_scope, orig_scope, 1)])
            U.get_session().run(assign_op)
        env.env.env.ref_policy = ref_pi

    adam.sync()

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_batch,
                                     stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0,
         max_seconds > 0]) == 1, "Only one time constraint permitted"

    max_thres_satisfied = max_threshold is None
    adjust_ratio = 0.0
    prev_avg_rew = -1000000
    revert_parameters = {}
    variables = pi.get_variables()
    for i in range(len(variables)):
        cur_val = variables[i].eval()
        revert_parameters[variables[i].name] = cur_val
    revert_data = [0, 0, 0]
    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)

        seg = seg_gen.__next__()

        if reward_drop_bound is not None:
            lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
            listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
            lens, rews = map(flatten_lists, zip(*listoflrpairs))
            lenbuffer.extend(lens)
            rewbuffer.extend(rews)
            revert_iteration = False
            if np.mean(
                    rewbuffer
            ) < prev_avg_rew - reward_drop_bound:  # detect significant drop in performance, revert to previous iteration
                print("Revert Iteration!!!!!")
                revert_iteration = True
            else:
                prev_avg_rew = np.mean(rewbuffer)
            logger.record_tabular("Revert Rew", prev_avg_rew)
            if revert_iteration:  # revert iteration
                for i in range(len(pi.get_variables())):
                    assign_op = pi.get_variables()[i].assign(
                        revert_parameters[pi.get_variables()[i].name])
                    U.get_session().run(assign_op)
                episodes_so_far = revert_data[0]
                timesteps_so_far = revert_data[1]
                iters_so_far = revert_data[2]
                continue
            else:
                variables = pi.get_variables()
                for i in range(len(variables)):
                    cur_val = variables[i].eval()
                    revert_parameters[variables[i].name] = np.copy(cur_val)
                revert_data[0] = episodes_so_far
                revert_data[1] = timesteps_so_far
                revert_data[2] = iters_so_far

        if positive_rew_enforce:
            rewlocal = (seg["pos_rews"], seg["neg_pens"], seg["rew"]
                        )  # local values
            listofrews = MPI.COMM_WORLD.allgather(rewlocal)  # list of tuples
            pos_rews, neg_pens, rews = map(flatten_lists, zip(*listofrews))
            if np.mean(rews) < 0.0:
                #min_id = np.argmin(rews)
                #adjust_ratio = pos_rews[min_id]/np.abs(neg_pens[min_id])
                adjust_ratio = np.max([
                    adjust_ratio,
                    np.mean(pos_rews) / np.abs(np.mean(neg_pens))
                ])
                for i in range(len(seg["rew"])):
                    if np.abs(seg["rew"][i] - seg["pos_rews"][i] -
                              seg["neg_pens"][i]) > 1e-5:
                        print(seg["rew"][i], seg["pos_rews"][i],
                              seg["neg_pens"][i])
                        raise ValueError('Reward wrong!')
                    seg["rew"][i] = seg["pos_rews"][
                        i] + seg["neg_pens"][i] * adjust_ratio
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
            "tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate
        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                    shuffle=not pi.recurrent)
        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        assign_old_eq_new()  # set old parameter values to new parameter values
        logger.log("Optimizing...")
        logger.log(fmt_row(13, loss_names))
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = [
            ]  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                *newlosses, g = lossandgrad(batch["ob"], batch["ac"],
                                            batch["atarg"], batch["vtarg"],
                                            cur_lrmult)
                adam.update(g, optim_stepsize * cur_lrmult)
                losses.append(newlosses)
            logger.log(fmt_row(13, np.mean(losses, axis=0)))
        logger.log("Evaluating losses...")
        losses = []
        for batch in d.iterate_once(optim_batchsize):
            newlosses = compute_losses(batch["ob"], batch["ac"],
                                       batch["atarg"], batch["vtarg"],
                                       cur_lrmult)
            losses.append(newlosses)
        meanlosses, _, _ = mpi_moments(losses, axis=0)
        logger.log(fmt_row(13, meanlosses))
        for (lossval, name) in zipsame(meanlosses, loss_names):
            logger.record_tabular("loss_" + name, lossval)
        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))
        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        if reward_drop_bound is None:
            lenbuffer.extend(lens)
            rewbuffer.extend(rews)
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        logger.record_tabular("Iter", iters_so_far)
        if positive_rew_enforce:
            if adjust_ratio is not None:
                logger.record_tabular("RewardAdjustRatio", adjust_ratio)
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()

        if max_threshold is not None:
            print('Current max return: ', np.max(rewbuffer))
            if np.max(rewbuffer) > max_threshold:
                max_thres_satisfied = True
            else:
                max_thres_satisfied = False

        return_threshold_satisfied = True
        if return_threshold is not None:
            if not (np.mean(rewbuffer) > return_threshold
                    and iters_so_far > min_iters):
                return_threshold_satisfied = False
        rollout_length_thershold_satisfied = True
        if rollout_length_thershold is not None:
            rewlocal = (seg["avg_vels"], seg["rew"])  # local values
            listofrews = MPI.COMM_WORLD.allgather(rewlocal)  # list of tuples
            avg_vels, rews = map(flatten_lists, zip(*listofrews))
            if not (np.mean(lenbuffer) > rollout_length_thershold
                    and np.mean(avg_vels) > 0.5 * env.env.env.final_tv):
                rollout_length_thershold_satisfied = False
        if rollout_length_thershold is not None or return_threshold is not None:
            if rollout_length_thershold_satisfied and return_threshold_satisfied:
                break

    return pi, np.mean(rewbuffer)
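
Each trainer gathers per-episode statistics with MPI.COMM_WORLD.allgather and then flattens the resulting list of per-worker lists before logging. flatten_lists is not listed on this page; a plausible one-line definition, for reference:

def flatten_lists(listoflists):
    # Flatten a gathered list of per-worker lists into one list.
    return [el for sublist in listoflists for el in sublist]

# e.g. lens, rews = map(flatten_lists, zip(*listoflrpairs))
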
Exemple #26
0
def compete_learn(
        env,
        policy_func,
        *,
        timesteps_per_batch,  # timesteps per actor per update
        clip_param,
        entcoeff,  # clipping parameter epsilon, entropy coeff
        optim_epochs,
        optim_stepsize,
        optim_batchsize,  # optimization hypers
        gamma,
        lam,  # advantage estimation
        max_timesteps=0,
        max_episodes=0,
        max_iters=20,
        max_seconds=0,  # time constraint
        callback=None,  # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-3,
        schedule='linear'  # annealing for stepsize parameters (epsilon and adam)
):
    # Setup losses and stuff
    # ----------------------------------------
    # at this stage, the ob_space and reward_space are per-agent tuples
    # TODO: are these tuples appropriate? items in a tuple are not mutable
    # TODO: another way to store the two agents' states is to use tf.variable_scope(scope, reuse=reuse)
    len1 = 2
    ob_space = env.observation_space.spaces
    ac_space = env.action_space.spaces
    pi = [
        policy_func("pi" + str(i),
                    ob_space[i],
                    ac_space[i],
                    placeholder_name="observation" + str(i))
        for i in range(len1)
    ]
    oldpi = [
        policy_func("oldpi" + str(i),
                    ob_space[i],
                    ac_space[i],
                    placeholder_name="observation" + str(i))
        for i in range(len1)
    ]
    atarg = [
        tf.placeholder(dtype=tf.float32, shape=[None]) for i in range(len1)
    ]
    ret = [tf.placeholder(dtype=tf.float32, shape=[None]) for i in range(len1)]
    tdlamret = [[] for i in range(len1)]
    # TODO: restore the schedule-driven lrmult used in the original implementation
    # lrmult = 1.0  # for simplicity, a constant learning rate multiplier could be used instead
    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult

    # TODO: I do not fully understand this part; the original implementation uses
    # ob = U.get_placeholder_cached(name="ob")
    # TODO: potential bug: U.get_placeholder_cached appears to use a global cache, so an
    # observation placeholder can only be cached once; later lookups with the same name return
    # the previously cached placeholder, and it is unclear whether name scopes affect this.
    # ob1 = U.get_placeholder_cached(name="observation1") # Note: I am not sure about this point
    # # ob2 = U.get_placeholder_cached(name="observation2")
    # ob1 = U.get_placeholder_cached(name="observation0")  # Note: I am not sure about this point
    # ob2 = U.get_placeholder_cached(name="observation1")
    # TODO: the remaining question is that the pi and oldpi networks both have an observation
    # placeholder named "observation"; even in the original baselines implementation, pi and
    # oldpi presumably do not share that placeholder.

    ob = [
        U.get_placeholder_cached(name="observation" + str(i))
        for i in range(len1)
    ]
    # ac = tuple([pi[i].act(stochastic=True, observation=env.observation_space[i])[0]
    #      for i in range(len1)])
    # TODO: to make the policy work, the observation passed into the pi function was changed to s, which comes from env.reset()
    # s = env.reset()
    # ac = tuple([pi[i].act(stochastic=True, observation=s[i])[0]
    #             for i in range(len1)])

    ac = [pi[i].pdtype.sample_placeholder([None]) for i in range(len1)]
    kloldnew = [oldpi[i].pd.kl(pi[i].pd) for i in range(len1)]
    ent = [pi[i].pd.entropy() for i in range(len1)]
    print("ent1 and ent2 are {} and {}".format(ent[0], ent[1]))
    meankl = [U.mean(kloldnew[i]) for i in range(len1)]
    meanent = [U.mean(ent[i]) for i in range(len1)]

    pol_entpen = [(-entcoeff) * meanent[i] for i in range(len1)]
    ratio = [
        tf.exp(pi[i].pd.logp(ac[i]) - oldpi[i].pd.logp(ac[i]))
        for i in range(len1)
    ]
    # ratio = [tf.exp(pi[i].pd.logp(ac) - oldpi[i].pd.logp(ac[i])) for i in range(len1)] #pnew / pold
    surr1 = [ratio[i] * atarg[i] for i in range(len1)]
    # U.clip wraps tf.clip_by_value(t, clip_value_min, clip_value_max, name=None),
    # where t is a Tensor.
    surr2 = [
        U.clip(ratio[i], 1.0 - clip_param, 1.0 + clip_param) * atarg[i]
        for i in range(len1)
    ]
    pol_surr = [-U.mean(tf.minimum(surr1[i], surr2[i])) for i in range(len1)]
    vf_loss = [U.mean(tf.square(pi[i].vpred - ret[i])) for i in range(len1)]
    total_loss = [
        pol_surr[i] + pol_entpen[i] + vf_loss[i] for i in range(len1)
    ]
    # Note: the miscellaneous losses below are built per agent, so they are collected
    # into lists that hold both agents' terms.
    # surr2 = U.clip(ratio[i], 1.0 - clip_param, 1.0 + clip_param)
    # pol_surr = -U.mean(tf.minimum(surr1[i], surr2[i]))
    # vf_loss = U.mean(tf.square(pi[i].vpred - ret[i]))
    # total_loss = pol_surr + pol_entpen + vf_loss

    # TODO: alternatively, the losses are organized as follows:
    losses = [[pol_surr[i], pol_entpen[i], vf_loss[i], meankl[i], meanent[i]]
              for i in range(len1)]
    loss_names = ["pol_sur", "pol_entpen", "vf_loss", "kl", "ent"]
    var_list = [pi[i].get_trainable_variables() for i in range(len1)]

    lossandgrad = [
        U.function([ob[i], ac[i], atarg[i], ret[i], lrmult],
                   losses[i] + [U.flatgrad(total_loss[i], var_list[i])])
        for i in range(len1)
    ]
    adam = [MpiAdam(var_list[i], epsilon=adam_epsilon) for i in range(2)]

    # TODO: this may not work as expected: the result is a list of U.function objects, which will not execute automatically
    # assign_old_eq_new = [U.function([],[], updates=[tf.assign(oldv, newv)
    #     for (oldv, newv) in zipsame(oldpi[i].get_variables(), pi[i].get_variables())]) for i in range(len1)]

    # compute_losses is a function, so it should not be copied to copies, nevertheless the parameters should be
    # passed into it as the two agents
    compute_losses = [
        U.function([ob[i], ac[i], atarg[i], ret[i], lrmult], losses[i])
        for i in range(len1)
    ]
    # Build a Saver over both agents' variables (used to restore a pretrained checkpoint here
    # and to save the final model after training).
    pi0_variables = slim.get_variables(scope="pi0")
    pi1_variables = slim.get_variables(scope="pi1")
    parameters_to_save_list = list(pi0_variables) + list(pi1_variables)
    saver = tf.train.Saver(parameters_to_save_list)
    restore = tf.train.Saver(parameters_to_save_list)
    U.initialize()
    restore.restore(U.get_session(), "parameter/500/500.pkl")
    # Initialize agent 1 with agent 0's parameters so both agents start identical.
    U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(pi[1].get_variables(), pi[0].get_variables())
        ])()
    adam[0].sync()
    adam[1].sync()
    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     horizon=timesteps_per_batch,
                                     stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()

    lenbuffer = [deque(maxlen=100)
                 for i in range(len1)]  # rolling buffer for episode lengths
    rewbuffer = [deque(maxlen=100)
                 for i in range(len1)]  # rolling buffer for episode rewards

    parameters_savers = []
    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0,
         max_seconds > 0]) == 1, "Only one time constraint permitted"

    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        # saver.restore()

        logger.log("********** Iteration %i ************" % iters_so_far)

        seg = seg_gen.__next__()
        #TODO: verify that add_vtarg_and_adv fills seg["adv"] and seg["tdlamret"] correctly for both agents
        add_vtarg_and_adv(seg, gamma, lam)

        losses = [[] for i in range(len1)]
        meanlosses = [[] for i in range(len1)]
        for i in range(len1):
            ob[i], ac[i], atarg[i], tdlamret[i] = seg["ob"][i], seg["ac"][
                i], seg["adv"][i], seg["tdlamret"][i]
            # ob_extend = np.expand_dims(ob[i],axis=0)
            # ob[i] = ob_extend
            vpredbefore = seg["vpred"][
                i]  # predicted value function before udpate
            atarg[i] = (atarg[i] - atarg[i].mean()) / atarg[i].std(
            )  # standardized advantage function estimate
            d = Dataset(dict(ob=ob[i],
                             ac=ac[i],
                             atarg=atarg[i],
                             vtarg=tdlamret[i]),
                        shuffle=not pi[i].recurrent)
            optim_batchsize = optim_batchsize or ob[i].shape[0]

            if hasattr(pi[i], "ob_rms"):
                pi[i].ob_rms.update(
                    ob[i])  # update running mean/std for policy

            # The oldpi <- pi assignment below is applied per agent, playing the role of
            # assign_old_eq_new in the single-agent implementation.
            U.function([], [],
                       updates=[
                           tf.assign(oldv, newv) for (oldv, newv) in zipsame(
                               oldpi[i].get_variables(), pi[i].get_variables())
                       ])()
            # set old parameter values to new parameter values
            # Here we do a bunch of optimization epochs over the data
            logger.log("Optimizing the agent{}...".format(i))
            logger.log(fmt_row(13, loss_names))
            for _ in range(optim_epochs):
                losses[i] = []  # list of tuples, each of which gives the loss for a minibatch
                for batch in d.iterate_once(optim_batchsize):
                    *newlosses, g = lossandgrad[i](batch["ob"], batch["ac"],
                                                   batch["atarg"],
                                                   batch["vtarg"], cur_lrmult)
                    adam[i].update(g, optim_stepsize * cur_lrmult)
                    losses[i].append(newlosses)
                logger.log(fmt_row(13, np.mean(losses[i], axis=0)))

            logger.log("Evaluating losses of agent{}...".format(i))
            losses[i] = []
            for batch in d.iterate_once(optim_batchsize):
                newlosses = compute_losses[i](batch["ob"], batch["ac"],
                                              batch["atarg"], batch["vtarg"],
                                              cur_lrmult)
                losses[i].append(newlosses)
            meanlosses[i], _, _ = mpi_moments(losses[i], axis=0)
            logger.log(fmt_row(13, meanlosses[i]))
            for (lossval, name) in zipsame(meanlosses[i], loss_names):
                logger.record_tabular("loss_" + name, lossval)
            logger.record_tabular("ev_tdlam_before{}".format(i),
                                  explained_variance(vpredbefore, tdlamret[i]))

            lrlocal = (seg["ep_lens"][i], seg["ep_rets"][i])  # local values
            listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
            lens, rews = map(flatten_lists, zip(*listoflrpairs))
            lenbuffer[i].extend(lens)
            rewbuffer[i].extend(rews)
            logger.record_tabular("EpLenMean {}".format(i),
                                  np.mean(lenbuffer[i]))
            logger.record_tabular("EpRewMean {}".format(i),
                                  np.mean(rewbuffer[i]))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1

        temp_pi = policy_func("temp_pi" + str(iters_so_far),
                              ob_space[0],
                              ac_space[0],
                              placeholder_name="temp_pi_observation" +
                              str(iters_so_far))
        U.function([], [],
                   updates=[
                       tf.assign(oldv, newv) for (oldv, newv) in zipsame(
                           temp_pi.get_variables(), pi[0].get_variables())
                   ])()
        parameters_savers.append(temp_pi)

        # Every 3 iterations, assign agent 1 a snapshot of agent 0's parameters sampled
        # uniformly from the second half of the snapshots collected so far (a simple
        # self-play opponent pool).
        if iters_so_far % 3 == 0:
            sample_iteration = int(
                np.random.uniform(iters_so_far / 2, iters_so_far))
            print("now assign the {}th parameter of agent0 to agent1".format(
                sample_iteration))
            pi_restore = parameters_savers[sample_iteration]
            U.function([], [],
                       updates=[
                           tf.assign(oldv, newv)
                           for (oldv,
                                newv) in zipsame(pi[1].get_variables(),
                                                 pi_restore.get_variables())
                       ])()

        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()

    # Training has finished: save both agents' parameters with the Saver created above
    # (win-rate evaluation of the two agents can then be run from this checkpoint).
    save_path = saver.save(U.get_session(), "parameter/800/800.pkl")
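
# A minimal NumPy sketch (not from the original example) of the clipped surrogate that every
# `learn` variant in this file builds in TensorFlow. Note that the clipped term is the clipped
# ratio *times the advantage*, and the loss is the negated pessimistic minimum (L^CLIP).
# The function name and the standalone NumPy form are illustrative assumptions, not baselines API.
import numpy as np


def ppo_clipped_surrogate(ratio, adv, clip_param=0.2):
    surr1 = ratio * adv  # unclipped surrogate
    surr2 = np.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * adv  # clipped surrogate
    return -np.mean(np.minimum(surr1, surr2))  # pessimistic surrogate loss


# e.g. ppo_clipped_surrogate(np.array([1.5, 0.7]), np.array([1.0, -1.0])) == -0.2:
# the clip caps the gain at 1.2 for ratio 1.5, and the minimum keeps the harsher -0.8
# for ratio 0.7 with a negative advantage.
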
def learn(
    # =========== modified part begins =========== #
    env_id,
    seed,
    robot,  # robot class with GMM params
    joint_optimization_iters,  # total number of joint optimization iterations
    design_iters,  # number of samples when updating physical design in each joint optimization iteration
    policy_iters,  # number of samples when updating robot policy in each joint optimization iteration
    # ============ modified part ends ============ #
    policy_func,
    *,
    timesteps_per_actorbatch,  # timesteps per actor per update
    clip_param,
    entcoeff,  # clipping parameter epsilon, entropy coeff
    optim_epochs,
    optim_stepsize,
    optim_batchsize,  # optimization hypers
    gamma,
    lam,  # advantage estimation
    max_timesteps=0,
    max_episodes=0,
    max_iters=0,
    max_seconds=0,  # time constraint
    callback=None,  # you can do anything in the callback, since it takes locals(), globals()
    adam_epsilon=1e-5,
    schedule='constant'  # annealing for stepsize parameters (epsilon and adam)
):

    # ================================== modification 1 ================================== #
    """
    input:  replace "env" (env class) with "env_id" (string)
            add "seed" (int)
        reason: to enable gym.make(env_id) during training (the env is re-created when the design changes)
        modification detail: add following lines into learn()
            env = gym.make(env_id)
            env = bench.Monitor(env, logger.get_dir())
            env.seed(seed)
            env.close() # added at the end of learn()
    """
    import roboschool, gym
    from baselines import bench
    env = gym.make(env_id)
    env = bench.Monitor(env, logger.get_dir())
    env.seed(seed)
    # ================================== modification 1 ================================== #

    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space

    # policy_func is the initialization of NN
    # NN structure:
    #   state -> (num_hid_layers) fully-connected layers with (hid_size) units -> (action, predicted value)
    #       num_hid_layers, hid_size: set in the file that calls "learn"
    pi = policy_func("pi", ob_space,
                     ac_space)  # Construct network for new policy
    oldpi = policy_func("oldpi", ob_space, ac_space)  # Network for old policy

    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    # placeholder for "ob"
    # created in mlppolicy.py
    ob = U.get_placeholder_cached(name="ob")
    # placeholder for "ac"
    # in common/distribution.py
    ac = pi.pdtype.sample_placeholder([None])

    # KL divergence and Entropy, defined in common/distribution.py
    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)

    # pol_entpen: entropy bonus that encourages exploration
    # entcoeff: entropy coefficient, defined in PPO paper page 5, Eq. (9)
    pol_entpen = (-entcoeff) * meanent

    # probability ratio, defined in PPO paper page 3
    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold

    # Clipped surrogate objective
    # defined in PPO paper page 3, Eq. (7)
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg
    pol_surr = -U.mean(tf.minimum(
        surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)

    # Value Function Loss: square error loss for ||v_pred - v_target||
    vf_loss = U.mean(tf.square(pi.vpred - ret))

    # total_loss (minimized) = -L^CLIP + value-function loss - entcoeff * entropy,
    # i.e. the negative of the objective in PPO paper page 5, Eq. (9) (with c1 = 1)
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])
    # MPI-synchronized Adam optimizer over the policy's trainable variables
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    # assign_old_eq_new: copy the current policy's parameters into oldpi (oldpi <- pi)
    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])

    # compute_losses evaluates the loss terms without applying gradients; used after each
    # optimization phase to log mean losses
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()
    adam.sync()
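    # Assuming the baselines MpiAdam: sync() broadcasts the root worker's parameters so every
    # MPI worker starts from identical weights, and update() averages gradients across workers.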

    # ================================== modification 2 ================================== #
    for joint_optimization_iter in range(joint_optimization_iters):
        U.save_state('/home/yetong/Desktop/Project/models/model{}.ckpt'.format(
            joint_optimization_iter))
        logger.log("joint optimization progree: {}/{}".format(
            joint_optimization_iter, joint_optimization_iters))
        # ================================== update physical design ================================== #
        if joint_optimization_iter > 20:
            Rewards_plus = np.zeros(design_iters)
            Rewards_minus = np.zeros(design_iters)
            params = robot.sample(design_iters, to_update=True)
            for i, param in enumerate(params):
                robot.modify_file(param)
                env = gym.make(env_id)
                # myenv = env.env

                # pdb.set_trace()
                env = bench.Monitor(env, logger.get_dir())
                R = episode_generator(pi, env, gamma, stochastic=True)
                logger.log("\t update physical design: %d/%d, rew: %f" %
                           (i, 2 * design_iters, R))
                if i % 2 == 0:
                    Rewards_plus[int(i / 2)] = R
                else:
                    Rewards_minus[int(i / 2)] = R
            logger.log("prev_mu: ", robot.params_mu)
            logger.log("prev_sig: ", robot.params_sig)
            robot.update(Rewards_plus, Rewards_minus)
            logger.log("mu: ", robot.params_mu)
            logger.log("sig: ", robot.params_sig)
        # ================================== update policy ================================== #
        # params = robot.sample(design_iters)
        params = [robot.params_mu]
        for param in params:
            # reinitialize env
            robot.modify_file(param)
            env = gym.make(env_id)
            env = bench.Monitor(env, logger.get_dir())
            # ================================== modification 2 ================================== #

            # Prepare for rollouts
            # ----------------------------------------
            seg_gen = traj_segment_generator(pi,
                                             env,
                                             timesteps_per_actorbatch,
                                             stochastic=True)

            episodes_so_far = 0
            timesteps_so_far = 0
            iters_so_far = 0
            tstart = time.time()
            lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
            rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

            assert sum([
                max_iters > 0, max_timesteps > 0, max_episodes > 0,
                max_seconds > 0
            ]) == 1, "Only one time constraint permitted"

            while True:
                if callback: callback(locals(), globals())
                if max_timesteps and timesteps_so_far >= max_timesteps:
                    break
                elif max_episodes and episodes_so_far >= max_episodes:
                    break
                elif max_iters and iters_so_far >= max_iters:
                    break
                elif max_seconds and time.time() - tstart >= max_seconds:
                    break

                # annealing for stepsize parameters (epsilon and adam)
                if schedule == 'constant':
                    cur_lrmult = 1.0
                elif schedule == 'linear':
                    cur_lrmult = max(
                        1.0 - float(timesteps_so_far) / max_timesteps, 0)
                else:
                    raise NotImplementedError

                logger.log("********** Iteration %i ************" %
                           iters_so_far)

                seg = seg_gen.__next__()
                add_vtarg_and_adv(seg, gamma, lam)
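                # Assuming the standard baselines implementation of add_vtarg_and_adv (GAE):
                #   delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_{t+1}) - V(s_t)
                #   adv_t   = delta_t + gamma * lam * (1 - done_{t+1}) * adv_{t+1}
                # and seg["tdlamret"] = seg["adv"] + seg["vpred"], the TD(lambda) return target.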

                # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
                ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg[
                    "adv"], seg["tdlamret"]
                vpredbefore = seg["vpred"]  # predicted value function before update
                atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate
                d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                            shuffle=not pi.recurrent)
                optim_batchsize = optim_batchsize or ob.shape[0]

                if hasattr(pi, "ob_rms"):
                    pi.ob_rms.update(ob)  # update running mean/std for policy

                # oldpi = pi
                # set old parameter values to new parameter values
                assign_old_eq_new()
                logger.log("Optimizing...")
                logger.log(fmt_row(13, loss_names))
                # Here we do a bunch of optimization epochs over the data
                for _ in range(optim_epochs):
                    losses = []  # list of tuples, each of which gives the loss for a minibatch
                    for batch in d.iterate_once(optim_batchsize):
                        *newlosses, g = lossandgrad(batch["ob"], batch["ac"],
                                                    batch["atarg"],
                                                    batch["vtarg"], cur_lrmult)
                        adam.update(g, optim_stepsize * cur_lrmult)
                        losses.append(newlosses)
                    logger.log(fmt_row(13, np.mean(losses, axis=0)))

                logger.log("Evaluating losses...")
                losses = []
                for batch in d.iterate_once(optim_batchsize):
                    newlosses = compute_losses(batch["ob"], batch["ac"],
                                               batch["atarg"], batch["vtarg"],
                                               cur_lrmult)
                    losses.append(newlosses)
                meanlosses, _, _ = mpi_moments(losses, axis=0)
                logger.log(fmt_row(13, meanlosses))
                for (lossval, name) in zipsame(meanlosses, loss_names):
                    logger.record_tabular("loss_" + name, lossval)
                logger.record_tabular(
                    "ev_tdlam_before",
                    explained_variance(vpredbefore, tdlamret))
                lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
                listoflrpairs = MPI.COMM_WORLD.allgather(
                    lrlocal)  # list of tuples
                lens, rews = map(flatten_lists, zip(*listoflrpairs))
                lenbuffer.extend(lens)
                rewbuffer.extend(rews)
                logger.record_tabular("EpLenMean", np.mean(lenbuffer))
                logger.record_tabular("EpRewMean", np.mean(rewbuffer))
                logger.record_tabular("EpThisIter", len(lens))
                episodes_so_far += len(lens)
                timesteps_so_far += sum(lens)
                iters_so_far += 1
                logger.record_tabular("EpisodesSoFar", episodes_so_far)
                logger.record_tabular("TimestepsSoFar", timesteps_so_far)
                logger.record_tabular("TimeElapsed", time.time() - tstart)
                if MPI.COMM_WORLD.Get_rank() == 0:
                    logger.dump_tabular()

    # ================================== modification 1 ================================== #
    env.close()
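
# A hypothetical sketch (not the robot class's actual method) of the kind of update the design
# optimization block above could perform with the paired rewards: an antithetic (plus/minus
# perturbation) estimate of the gradient of expected reward with respect to the design mean.
# All names here (update_design_mean, perturbations, sigma, lr) are illustrative assumptions.
import numpy as np


def update_design_mean(mu, perturbations, rewards_plus, rewards_minus,
                       sigma=0.1, lr=0.01):
    # perturbations: (n, dim) noise used to build the paired designs mu +/- sigma * eps
    # rewards_plus / rewards_minus: returns obtained with the + and - designs, shape (n,)
    grad_est = np.mean((rewards_plus - rewards_minus)[:, None] * perturbations,
                       axis=0) / (2.0 * sigma)
    return mu + lr * grad_est
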
Exemple #28
0
def learn(
    env,
    policy_func,
    *,
    timesteps_per_batch,  # timesteps per actor per update
    clip_param,
    entcoeff,  # clipping parameter epsilon, entropy coeff
    optim_epochs,
    optim_stepsize,
    optim_batchsize,  # optimization hypers
    gamma,
    lam,  # advantage estimation
    max_timesteps=0,
    max_episodes=0,
    max_iters=0,
    max_seconds=0,  # time constraint
    callback=None,  # you can do anything in the callback, since it takes locals(), globals()
    adam_epsilon=1e-5,
    schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
    num_options=1,
    app='',
    saves=False,
    wsaves=False,
    epoch=0,
    seed=1,
    dc=0,
    plots=False,
    w_intfc=True,
    switch=False,
    intlr=1e-4,
    piolr=1e-4,
    fewshot=False,
):

    optim_batchsize_ideal = optim_batchsize
    np.random.seed(seed)
    tf.set_random_seed(seed)

    ### Book-keeping
    if hasattr(env, 'NAME'):
        gamename = env.NAME.lower()
    else:
        gamename = env.spec.id[:-3].lower()

    gamename += 'seed' + str(seed)
    gamename += app

    dirname = '{}_{}opts_saves/'.format(gamename, num_options)

    if wsaves:
        first = True
        if not os.path.exists(dirname):
            os.makedirs(dirname)
            first = False
        # while os.path.exists(dirname) and first:
        #     dirname += '0'

        files = ['pposgd_simple.py', 'cnn_policy.py', 'run_miniw.py']
        for i in range(len(files)):
            src = os.path.expanduser(
                '~/baselines_intfc/baselines/ppoc_int/') + files[i]
            dest = os.path.expanduser(
                '~/baselines_intfc/baselines/ppoc_int/') + dirname
            shutil.copy2(src, dest)
    ###

    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space,
                     ac_space)  # Construct network for new policy
    oldpi = policy_func("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    # option = tf.placeholder(dtype=tf.int32, shape=[None])

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    option = U.get_placeholder_cached(name="option")
    term_adv = U.get_placeholder(name='term_adv',
                                 dtype=tf.float32,
                                 shape=[None])
    op_adv = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # advantage over options
    betas = tf.placeholder(dtype=tf.float32, shape=[None])  # termination probabilities

    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg  #
    pol_surr = -U.mean(tf.minimum(
        surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)

    vf_loss = U.mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    term_loss = pi.tpred * term_adv

    # pi_w = tf.stop_gradient(pi.op_pi)
    pi_w = tf.placeholder(dtype=tf.float32, shape=[None, num_options])
    option_hot = tf.one_hot(option, depth=num_options)
    pi_I = pi.intfc * pi_w / tf.expand_dims(
        tf.reduce_sum(pi.intfc * pi_w, axis=1), 1)
    pi_I = tf.clip_by_value(pi_I, 1e-6, 1 - 1e-6)
    int_loss = -tf.reduce_sum(
        betas * tf.reduce_sum(pi_I * option_hot, axis=1) * op_adv)

    intfc = tf.placeholder(dtype=tf.float32, shape=[None, num_options])
    pi_I = intfc * pi.op_pi / tf.expand_dims(
        tf.reduce_sum(intfc * pi.op_pi, axis=1), 1)
    pi_I = tf.clip_by_value(pi_I, 1e-6, 1 - 1e-6)
    op_loss = -tf.reduce_sum(
        betas * tf.reduce_sum(pi_I * option_hot, axis=1) * op_adv)

    log_pi = tf.log(tf.clip_by_value(pi.op_pi, 1e-20, 1.0))
    op_entropy = -tf.reduce_mean(pi.op_pi * log_pi, reduction_indices=1)
    op_loss -= 0.01 * tf.reduce_sum(op_entropy)
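    # pi_I renormalizes the policy over options by the interest function (intfc) so that only
    # options the interest function favors in the current state keep probability mass; op_loss
    # and int_loss then weight the selected option's renormalized probability by the termination
    # probabilities (betas) and the option advantage (op_adv), with a small entropy bonus on the
    # policy over options.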

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult, option],
                             losses + [U.flatgrad(total_loss, var_list)])
    lossandgrad_vf = U.function([ob, ac, atarg, ret, lrmult, option],
                                losses + [U.flatgrad(vf_loss, var_list)])
    termgrad = U.function([ob, option, term_adv],
                          [U.flatgrad(term_loss, var_list)
                           ])  # Since we will use a different step size.
    opgrad = U.function([ob, option, betas, op_adv, intfc],
                        [U.flatgrad(op_loss, var_list)
                         ])  # Since we will use a different step size.
    intgrad = U.function([ob, option, betas, op_adv, pi_w],
                         [U.flatgrad(int_loss, var_list)
                          ])  # Since we will use a different step size.
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult, option], losses)

    U.initialize()
    adam.sync()

    saver = tf.train.Saver(max_to_keep=10000)

    ### More book-keeping
    results = []
    if saves:
        directory_res = "res_switch150/learnpio/lr{}/".format(
            optim_stepsize) if not fewshot else "res_fewshot/lr{}/".format(
                optim_stepsize)
        if not os.path.exists(directory_res):
            os.makedirs(directory_res)
        if w_intfc:
            results = open(
                directory_res + gamename +
                '_intfc{}_intlr{}_piolr{}'.format(int(w_intfc), intlr, piolr) +
                '_' + str(num_options) + 'opts.csv', 'w')
        else:
            results = open(
                directory_res + gamename +
                '_intfc{}_piolr{}'.format(int(w_intfc), piolr) + '_' +
                str(num_options) + 'opts.csv', 'w')
        out = 'epoch,avg_reward'

        # for opt in range(num_options): out += ',option {} dur'.format(opt)
        # # for opt in range(num_options): out += ',option {} std'.format(opt)
        # for opt in range(num_options): out += ',option {} term'.format(opt)
        # for opt in range(num_options): out += ',option {} adv'.format(opt)
        out += '\n'
        results.write(out)
        # results.write('epoch,avg_reward,option 1 dur, option 2 dur, option 1 term, option 2 term\n')
        results.flush()

    if epoch:

        dirname = '{}_{}opts_saves/'.format(gamename, num_options)
        print("Loading weights from iteration: " + str(epoch))

        filename = dirname + '{}_epoch_{}.ckpt'.format(gamename, epoch)
        saver.restore(U.get_session(), filename)
    ###

    episodes_so_far = 0
    timesteps_so_far = 0
    global iters_so_far
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=10)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=10)  # rolling buffer for episode rewards

    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0,
         max_seconds > 0]) == 1, "Only one time constraint permitted"

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_batch,
                                     stochastic=True,
                                     num_options=num_options,
                                     saves=saves,
                                     results=results,
                                     rewbuffer=rewbuffer,
                                     dc=dc,
                                     epoch=epoch,
                                     seed=seed,
                                     plots=plots,
                                     w_intfc=w_intfc,
                                     switch=switch)

    datas = [0 for _ in range(num_options)]

    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)

        seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam, num_options)

        opt_d = []
        for i in range(num_options):
            dur = np.mean(
                seg['opt_dur'][i]) if len(seg['opt_dur'][i]) > 0 else 0.
            opt_d.append(dur)

        print("mean opt dur:", opt_d)
        print("mean op probs:", np.mean(np.array(seg['op_probs']), axis=0))
        print("mean term p:", np.mean(np.array(seg['term_p']), axis=0))
        print("mean vpreds:", np.mean(np.array(seg['vpred']), axis=0))

        ob, ac, opts, atarg, tdlamret = seg["ob"], seg["ac"], seg["opts"], seg[
            "adv"], seg["tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before udpate
        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy
        assign_old_eq_new()  # set old parameter values to new parameter values

        if iters_so_far % 5 == 0 and wsaves:
            print("weights are saved...")
            filename = dirname + '{}_epoch_{}.ckpt'.format(
                gamename, iters_so_far)
            save_path = saver.save(U.get_session(), filename)

        min_batch = 160  # Arbitrary; choosing this threshold is the main open issue for multi-option training (and for options in general)
        t_advs = [[] for _ in range(num_options)]
        for opt in range(num_options):
            indices = np.where(opts == opt)[0]
            print("batch size:", indices.size)
            opt_d[opt] = indices.size
            if not indices.size:
                t_advs[opt].append(0.)
                continue

            ########## This part is only needed when options are used: the checks below avoid discarding any collected trajectories when an option's batch is too small.
            if datas[opt] != 0:
                if (indices.size < min_batch and datas[opt].n > min_batch):
                    datas[opt] = Dataset(dict(ob=ob[indices],
                                              ac=ac[indices],
                                              atarg=atarg[indices],
                                              vtarg=tdlamret[indices]),
                                         shuffle=not pi.recurrent)
                    t_advs[opt].append(0.)
                    continue
                    # The previous dataset has already been trained on (datas[opt].n > min_batch), so we replace it
                    # and continue without training, since indices.size is too small (indices.size < min_batch).
                    # A too-small dataset causes divergence.

                ##################################################
                elif indices.size + datas[opt].n < min_batch:
                    oldmap = datas[opt].data_map
                    cat_ob = np.concatenate((oldmap['ob'], ob[indices]))
                    cat_ac = np.concatenate((oldmap['ac'], ac[indices]))
                    cat_atarg = np.concatenate(
                        (oldmap['atarg'], atarg[indices]))
                    cat_vtarg = np.concatenate(
                        (oldmap['vtarg'], tdlamret[indices]))
                    datas[opt] = Dataset(dict(ob=cat_ob,
                                              ac=cat_ac,
                                              atarg=cat_atarg,
                                              vtarg=cat_vtarg),
                                         shuffle=not pi.recurrent)
                    t_advs[opt].append(0.)
                    continue
                    # The previous dataset hasn't been trained on (datas[opt].n < min_batch), so we concatenate it with the new samples.
                    # The combination of both (indices.size + datas[opt].n < min_batch) is still insufficient, so we skip training.
                    # A too-small dataset causes divergence.

                ###################################################
                elif (indices.size + datas[opt].n > min_batch
                      and datas[opt].n < min_batch):
                    oldmap = datas[opt].data_map
                    cat_ob = np.concatenate((oldmap['ob'], ob[indices]))
                    cat_ac = np.concatenate((oldmap['ac'], ac[indices]))
                    cat_atarg = np.concatenate(
                        (oldmap['atarg'], atarg[indices]))
                    cat_vtarg = np.concatenate(
                        (oldmap['vtarg'], tdlamret[indices]))
                    datas[opt] = d = Dataset(dict(ob=cat_ob,
                                                  ac=cat_ac,
                                                  atarg=cat_atarg,
                                                  vtarg=cat_vtarg),
                                             shuffle=not pi.recurrent)
                    # The previous dataset hasn't been trained on (datas[opt].n < min_batch), so we concatenate it with the new samples.
                    # The combination of both (indices.size + datas[opt].n > min_batch) is now sufficient for training.

                ##################################################
                if (indices.size > min_batch and datas[opt].n > min_batch):
                    datas[opt] = d = Dataset(dict(ob=ob[indices],
                                                  ac=ac[indices],
                                                  atarg=atarg[indices],
                                                  vtarg=tdlamret[indices]),
                                             shuffle=not pi.recurrent)
                    # The previous dataset has already been trained on (datas[opt].n > min_batch), so we replace it.
                    # The new samples are numerous enough (indices.size > min_batch), so we use them for training.

                ##################################################
            elif datas[opt] == 0:
                datas[opt] = d = Dataset(dict(ob=ob[indices],
                                              ac=ac[indices],
                                              atarg=atarg[indices],
                                              vtarg=tdlamret[indices]),
                                         shuffle=not pi.recurrent)
                # Only useful for the very first iteration of the training process.
            #########
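            # Summary of the cases above: new batch too small but the stored one already trained
            # -> replace and skip; merged data still below min_batch -> merge and skip; merging
            # crosses min_batch -> merge and train; both new and stored batches above min_batch
            # -> replace and train; no stored data yet -> just use the new samples.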

            optim_batchsize = optim_batchsize or ob.shape[0]
            optim_epochs = np.clip(
                np.int(10 * (indices.size /
                             (timesteps_per_batch / num_options))), 10,
                10) if num_options > 1 else optim_epochs
            print("optim epochs:", optim_epochs)
            logger.log("Optimizing...")

            # Here we do a bunch of optimization epochs over the data
            for _ in range(optim_epochs):
                losses = []  # list of tuples, each of which gives the loss for a minibatch
                for batch in d.iterate_once(optim_batchsize):

                    # tadv,nodc_adv = pi.get_term_adv(batch["ob"],[opt])
                    # tadv = tadv if num_options > 1 else np.zeros_like(tadv)
                    # t_advs[opt].append(nodc_adv)
                    if iters_so_far < 150 or not fewshot:
                        *newlosses, grads = lossandgrad(
                            batch["ob"], batch["ac"], batch["atarg"],
                            batch["vtarg"], cur_lrmult, [opt])
                        adam.update(grads, optim_stepsize * cur_lrmult)
                        losses.append(newlosses)
                    else:
                        *newlosses, grads = lossandgrad_vf(
                            batch["ob"], batch["ac"], batch["atarg"],
                            batch["vtarg"], cur_lrmult, [opt])
                        adam.update(grads, optim_stepsize * cur_lrmult)
                        losses.append(newlosses)

        if iters_so_far < 150 or not fewshot:
            termg = termgrad(seg["ob"], seg['opts'], seg["op_adv"])[0]
            adam.update(termg, 5e-7)

            if w_intfc:
                intgrads = intgrad(seg['ob'], seg['opts'], seg["last_betas"],
                                   seg["op_adv"], seg["op_probs"])[0]
                adam.update(intgrads, intlr)

        opgrads = opgrad(seg['ob'], seg['opts'], seg["last_betas"],
                         seg["op_adv"], seg["intfc"])[0]
        adam.update(opgrads, piolr)

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()

        ### Book keeping
        if saves:
            out = "{},{}"
            # for _ in range(num_options): out+=",{},{},{}"
            out += "\n"
            # pdb.set_trace()

            info = [iters_so_far, np.mean(rewbuffer)]

            results.write(out.format(*info))
            results.flush()