Code Example #1
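# NOTE (context): this function is written against an OpenAI Baselines ppo1-style module and
# assumes the usual module-level imports, roughly:
#   import time, tensorflow as tf
#   from collections import deque
#   from mpi4py import MPI
#   import baselines.common.tf_util as U
#   from baselines.common import Dataset, zipsame
#   from baselines.common.mpi_adam import MpiAdam
#   from baselines import logger
# plus the module-local helpers used below: traj_segment_generator,
# traj_segment_generator_eval, add_vtarg_and_adv, flatten_lists, result_record, and Normalizer.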
def learn(
    env,
    test_env,
    policy_fn,
    *,
    timesteps_per_actorbatch,  # timesteps per actor per update
    clip_param,
    entcoeff,  # clipping parameter epsilon, entropy coeff
    optim_epochs,
    optim_stepsize,
    optim_batchsize,  # optimization hypers
    gamma,
    lam,  # advantage estimation
    max_timesteps=0,
    max_episodes=0,
    max_iters=0,
    max_seconds=0,  # time constraint
    callback=None,  # you can do anything in the callback, since it takes locals(), globals()
    adam_epsilon=1e-5,
    schedule='constant'  # annealing for stepsize parameters (epsilon and adam)
):
    # Setup losses and stuff
    # ----------------------------------------

    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_fn("pi", ob_space,
                   ac_space)  # Construct network for new policy
    # import numpy as np
    # print(np.random.get_state()[1][0])
    oldpi = policy_fn("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param,
                             1.0 + clip_param) * atarg  # clipped surrogate
    pol_surr = -tf.reduce_mean(tf.minimum(
        surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()
    adam.sync()

    normalizer = Normalizer(1)
    # Prepare for rollouts
    # ----------------------------------------
    eval_gen = traj_segment_generator_eval(pi,
                                           test_env,
                                           timesteps_per_actorbatch,
                                           stochastic=False)
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_actorbatch,
                                     stochastic=True,
                                     normalizer=normalizer)

    global timesteps_so_far, episodes_so_far, iters_so_far, \
        tstart, lenbuffer, rewbuffer, best_fitness
    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0,
         max_seconds > 0]) == 1, "Only one time constraint permitted"

    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)
        eval_seg = eval_gen.__next__()
        lrlocal = (eval_seg["ep_lens"], eval_seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        if timesteps_so_far == 0:
            result_record()

        seg = seg_gen.__next__()

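        # add_vtarg_and_adv fills seg["adv"] with GAE(lambda) advantage estimates and
        # seg["tdlamret"] with the corresponding lambda-returns used as value-function targets.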
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
            "tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate
        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                    shuffle=not pi.recurrent)
        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        assign_old_eq_new()  # set old parameter values to new parameter values
        # logger.log("Optimizing...")
        # logger.log(fmt_row(13, loss_names))
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = [
            ]  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                *newlosses, g = lossandgrad(batch["ob"], batch["ac"],
                                            batch["atarg"], batch["vtarg"],
                                            cur_lrmult)
                adam.update(g, optim_stepsize * cur_lrmult)
                losses.append(newlosses)
            # logger.log(fmt_row(13, np.mean(losses, axis=0)))
        # logger.log("Current Iteration Training Performance:" + str(np.mean(seg["ep_rets"])))
        # logger.log("Evaluating losses...")
        # losses = []
        # for batch in d.iterate_once(optim_batchsize):
        #     newlosses = compute_losses(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult)
        #     losses.append(newlosses)
        # meanlosses,_,_ = mpi_moments(losses, axis=0)
        # logger.log(fmt_row(13, meanlosses))
        # for (lossval, name) in zipsame(meanlosses, loss_names):
        #     logger.record_tabular("loss_"+name, lossval)
        # logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret))
        # logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        # logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        # logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        # timesteps_so_far += sum(lens)
        # if iters_so_far == 0:
        #     result_record()
        iters_so_far += 1
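A minimal usage sketch for the learn() above, assuming a Baselines-style MlpPolicy and Gym
environments; the environment id and hyperparameter values are illustrative, not taken from
the original module.

import gym
import baselines.common.tf_util as U
from baselines.ppo1.mlp_policy import MlpPolicy

def policy_fn(name, ob_space, ac_space):
    # Two hidden layers of 64 units, as in the standard ppo1 examples.
    return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                     hid_size=64, num_hid_layers=2)

U.make_session(num_cpu=1).__enter__()  # TF session expected by the Baselines helpers
env = gym.make("Hopper-v2")
test_env = gym.make("Hopper-v2")
learn(env, test_env, policy_fn,
      timesteps_per_actorbatch=2048,
      clip_param=0.2, entcoeff=0.0,
      optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
      gamma=0.99, lam=0.95,
      max_timesteps=1_000_000,
      schedule='linear')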
Code Example #2
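# NOTE (context): same Baselines-style module context as Code Example #1 (time, tensorflow as tf,
# numpy as np, itertools, collections, deque, MPI, U, MpiAdam), plus the module-local helpers
# traj_segment_generator, flatten_lists, result_record, and Normalizer. Unlike Example #1, the
# value function and policy here are updated once per environment step rather than per batch.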
def learn(
    env,
    policy_fn,
    *,
    timesteps_per_actorbatch,  # timesteps per actor per update
    clip_param,
    entcoeff,  # clipping parameter epsilon, entropy coeff
    optim_epochs,
    optim_stepsize,
    optim_batchsize,  # optimization hypers
    gamma,
    lam,  # advantage estimation
    max_timesteps=0,
    max_episodes=0,
    max_iters=0,
    max_seconds=0,  # time constraint
    callback=None,  # you can do anything in the callback, since it takes locals(), globals()
    adam_epsilon=1e-5,
    shift=0,
    schedule='constant'  # annealing for stepsize parameters (epsilon and adam)
):
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_fn("pi", ob_space,
                   ac_space)  # Construct network for new policy
    td_v_target = tf.placeholder(dtype=tf.float32, shape=[1, 1])  # V target

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    G_t_inv = tf.placeholder(dtype=tf.float32, shape=[None, None])
    alpha = tf.placeholder(dtype=tf.float32, shape=[1])
    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([])
    adv = tf.placeholder(dtype=tf.float32, shape=[1, 1])
    step = tf.placeholder(dtype=tf.float32, shape=[1])

    vf_loss = tf.reduce_mean(tf.square(pi.vpred - td_v_target))
    vf_losses = [vf_loss]
    vf_loss_names = ["vf_loss"]

    pol_loss = -tf.reduce_mean(adv * pi.pd.logp(ac))
    pol_losses = [pol_loss]
    pol_loss_names = ["pol_loss"]

    var_list = pi.get_trainable_variables()
    vf_var_list = [
        v for v in var_list if v.name.split("/")[1].startswith("vf")
    ]
    pol_var_list = [
        v for v in var_list if v.name.split("/")[1].startswith("pol")
    ]

    compatible_feature = U.flatgrad(pi.pd.neglogp(ac), pol_var_list)

    G_t_inv_next = 1 / (
        1 - alpha) * (G_t_inv - alpha * (G_t_inv * compatible_feature) *
                      tf.transpose(G_t_inv * compatible_feature) /
                      (1 - alpha + alpha * tf.transpose(compatible_feature) *
                       G_t_inv * compatible_feature))
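    # G_t_inv_next expresses a recursive (Sherman-Morrison-style) update for the inverse of a
    # running matrix built from the compatible features; note that it is constructed here but
    # never fetched or fed anywhere else in this function.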

    # Train V function
    vf_lossandgrad = U.function([ob, td_v_target, lrmult], vf_losses +
                                [U.flatgrad(vf_loss, vf_var_list, 20.0)])
    vf_adam = MpiAdam(vf_var_list, epsilon=adam_epsilon)

    # vf_optimizer = tf.train.AdamOptimizer(learning_rate = lrmult, epsilon = adam_epsilon)
    # vf_train_op = vf_optimizer.minimize(vf_loss, vf_var_list)

    # Train Policy
    pol_lossandgrad = U.function([ob, ac, adv, lrmult, td_v_target],
                                 pol_losses +
                                 [U.flatgrad(pol_loss, pol_var_list, 20.0)])
    pol_adam = MpiAdam(pol_var_list, epsilon=adam_epsilon)

    # pol_optimizer = tf.train.AdamOptimizer(learning_rate = 0.1 * lrmult, epsilon = adam_epsilon)
    # pol_train_op = pol_optimizer.minimize(pol_loss, pol_var_list)

    # Computation
    compute_v_pred = U.function([ob], [pi.vpred])
    get_pol_weights_num = np.sum(
        [np.prod(v.get_shape().as_list()) for v in pol_var_list])
    get_compatible_feature = U.function([ob, ac], [compatible_feature])
    # vf_update = U.function([ob, td_v_target], [vf_train_op])
    # pol_update = U.function([ob, ac, adv], [pol_train_op])

    U.initialize()
    vf_adam.sync()
    pol_adam.sync()
    # Prepare for rollouts
    # ----------------------------------------

    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_actorbatch,
                                     stochastic=False)

    global timesteps_so_far, episodes_so_far, iters_so_far, \
        tstart, lenbuffer, rewbuffer, best_fitness
    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards
    Transition = collections.namedtuple(
        "Transition", ["ob", "ac", "reward", "next_ob", "done"])

    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0,
         max_seconds > 0]) == 1, "Only one time constraint permitted"

    # Step-based learning: each pass of this loop corresponds to one episode
    omega_t = np.zeros(get_pol_weights_num)
    normalizer = Normalizer(1)
    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        # logger.log("********** Episode %i ************" % episodes_so_far)

        rac_alpha = optim_stepsize * cur_lrmult
        rac_beta = optim_stepsize * cur_lrmult * 0.05
        #
        # print("rac_alpha=", rac_alpha)
        # print("rac_beta=", rac_beta)
        if timesteps_so_far == 0:
            # result_record()
            seg = seg_gen.__next__()
            lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
            listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
            lens, rews = map(flatten_lists, zip(*listoflrpairs))
            lenbuffer.extend(lens)
            rewbuffer.extend(rews)
            result_record()

        ob = env.reset()
        # episode = []
        cur_ep_ret = 0  # return in current episode
        cur_ep_len = 0  # len of current episode
        ep_rets = []  # returns of completed episodes in this segment
        ep_lens = []  # lengths of ...

        obs = []
        record = False
        for t in itertools.count():
            ac, vpred = pi.act(stochastic=True, ob=ob)
            origin_ac = ac
            ac = np.clip(ac, ac_space.low, ac_space.high)
            obs.append(ob)
            next_ob, rew, done, _ = env.step(ac)
            if env.spec._env_name == "MountainCarContinuous":
                rew = rew - np.abs(next_ob[0] - env.unwrapped.goal_position)
            ac = origin_ac

            # rew = np.clip(rew, -1., 1.)
            # episode.append(Transition(ob=ob.reshape((1, ob.shape[0])), ac=ac.reshape((1, ac.shape[0])), reward=rew, next_ob=next_ob.reshape((1, ob.shape[0])), done=done))

            original_rew = rew
            if env.spec._env_name != "InvertedPendulumBulletEnv":
                normalizer.update(rew)
                rew = normalizer.normalize(rew)
            cur_ep_ret += (original_rew - shift)
            cur_ep_len += 1
            timesteps_so_far += 1
            # Compute v target and TD
            v_target = rew + gamma * np.array(
                compute_v_pred(next_ob.reshape((1, ob.shape[0]))))
            adv = v_target - np.array(
                compute_v_pred(ob.reshape((1, ob.shape[0]))))

            # Update V and Update Policy
            vf_loss, vf_g = vf_lossandgrad(ob.reshape((1, ob.shape[0])),
                                           v_target, rac_alpha)
            vf_adam.update(vf_g, rac_alpha)
            pol_loss, pol_g = pol_lossandgrad(ob.reshape((1, ob.shape[0])), ac,
                                              adv, rac_beta)
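            # compatible_feature below is the gradient of -log pi(ac|ob) w.r.t. the policy
            # parameters; omega_t accumulates the policy gradient while being decayed along
            # phi * phi^T, and is passed to pol_adam in place of pol_g as the update direction.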
            compatible_feature = np.array(
                get_compatible_feature(ob.reshape((1, ob.shape[0])), ac))
            compatible_feature_product = compatible_feature * compatible_feature.T
            omega_t = (np.eye(compatible_feature_product.shape[0]) - 0.1 * rac_alpha * compatible_feature_product).dot(
                omega_t) \
                      + 0.1 * rac_alpha * pol_g

            pol_adam.update(omega_t, rac_beta)
            ob = next_ob
            if timesteps_so_far % 10000 == 0:
                record = True
            if done:
                # print(
                #     "Episode {} - Total reward = {}, Total Steps = {}".format(episodes_so_far, cur_ep_ret, cur_ep_len))
                # ep_rets.append(cur_ep_ret)  # returns of completed episodes in this segment
                # ep_lens.append(cur_ep_len)  # lengths of ..

                # lenbuffer.append(cur_ep_len)
                # rewbuffer.append(cur_ep_ret)

                if hasattr(pi, "ob_rms"):
                    pi.ob_rms.update(np.array(
                        obs))  # update running mean/std for normalization
                iters_so_far += 1
                episodes_so_far += 1
                ob = env.reset()
                if record:
                    seg = seg_gen.__next__()
                    lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
                    listoflrpairs = MPI.COMM_WORLD.allgather(
                        lrlocal)  # list of tuples
                    lens, rews = map(flatten_lists, zip(*listoflrpairs))
                    lenbuffer.extend(lens)
                    rewbuffer.extend(rews)
                    result_record()
                    record = False
                break
Code Example #3
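# NOTE (context): train() relies on module-level state (lenbuffer, rewbuffer, iters_so_far,
# timesteps_so_far, episodes_so_far, tstart) and the helpers evaluate(), result_record(), and
# Normalizer. `policy` must provide sample_deltas(), positive_perturbation(),
# negative_perturbation(), and update(); `hp` must carry main_loop_size, max_timesteps,
# n_directions, horizon, b, and step_size.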
def train(env, policy, normalizer, hp):
    global lenbuffer, rewbuffer, iters_so_far, timesteps_so_far, \
        episodes_so_far, tstart
    tstart = time.time()
    rewbuffer.extend(evaluate(env, normalizer, policy))
    # print(rewbuffer)
    result_record()
    record = False
    rw_normalizer = Normalizer(1)
    for episode in range(hp.main_loop_size):
        cur_lrmult = 1.0
        # cur_lrmult = max(1.0 - float(timesteps_so_far) / (0.5 * hp.max_timesteps), 1e-8)
        if timesteps_so_far >= hp.max_timesteps:
            result_record()
            break
        # init deltas and rewards
        deltas = policy.sample_deltas()
        reward_positive = [0] * hp.n_directions
        reward_negative = [0] * hp.n_directions

        record = False

        # positive directions
        for k in range(hp.n_directions):
            state = env.reset()
            done = False
            num_plays = 0.
            while not done and num_plays < hp.horizon:
                normalizer.observe(state)
                state = normalizer.normalize(state)
                action = policy.positive_perturbation(state, deltas[k])
                action = np.clip(action, env.action_space.low,
                                 env.action_space.high)
                state, reward, done, _ = env.step(action)
                # reward = max(min(reward, 1), -1)
                if env.spec._env_name != "InvertedPendulumBulletEnv":
                    rw_normalizer.update(reward)
                    reward = rw_normalizer.normalize(reward)
                reward_positive[k] += reward
                num_plays += 1
                timesteps_so_far += 1
                if timesteps_so_far % 10000 == 0 and timesteps_so_far > 0:
                    record = True
            episodes_so_far += 1
            if record:
                # print(total_steps)
                rewbuffer.extend(evaluate(env, normalizer, policy))
                # print(rewbuffer)
                # print("Averge Rewards:", np.mean(rewbuffer))
                result_record()
                record = False
        # negative directions
        for k in range(hp.n_directions):
            state = env.reset()
            done = False
            num_plays = 0.
            while not done and num_plays < hp.horizon:
                normalizer.observe(state)
                state = normalizer.normalize(state)
                action = policy.negative_perturbation(state, deltas[k])
                action = np.clip(action, env.action_space.low,
                                 env.action_space.high)
                state, reward, done, _ = env.step(action)
                # reward = max(min(reward, 1), -1)
                if env.spec._env_name != "InvertedPendulumBulletEnv":
                    rw_normalizer.update(reward)
                    reward = rw_normalizer.normalize(reward)
                reward_negative[k] += reward
                num_plays += 1
                timesteps_so_far += 1
                if timesteps_so_far % 10000 == 0 and timesteps_so_far > 0:
                    record = True
            episodes_so_far += 1
            if record:
                # print(total_steps)
                # print(rewbuffer)
                rewbuffer.extend(evaluate(env, normalizer, policy))
                # print("Averge Rewards:", np.mean(rewbuffer))
                result_record()
                record = False
        all_rewards = np.array(reward_negative + reward_positive)
        sigma_r = all_rewards.std()

        # sort rollouts wrt max(r_pos, r_neg) and take (hp.b) best
        scores = {
            k: max(r_pos, r_neg)
            for k, (r_pos,
                    r_neg) in enumerate(zip(reward_positive, reward_negative))
        }
        order = sorted(scores.keys(), key=lambda x: scores[x])[-hp.b:]
        rollouts = [(reward_positive[k], reward_negative[k], deltas[k])
                    for k in order[::-1]]

        hp.step_size = hp.step_size * cur_lrmult
        # update policy:
        policy.update(rollouts, sigma_r)
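A minimal usage sketch for train(). The Hp container below is hypothetical (its fields simply
mirror the attributes the loop reads), the Policy constructor signature is an assumption, and
Policy/Normalizer are taken to be the perturbation-based policy and running normalizer defined
alongside train(); the environment id and values are illustrative.

import types
import gym

# Hypothetical hyperparameter container; field names mirror what train() reads from hp.
hp = types.SimpleNamespace(main_loop_size=1000, max_timesteps=1_000_000,
                           n_directions=16, b=8, horizon=1000, step_size=0.02)
env = gym.make("HalfCheetah-v2")
normalizer = Normalizer(env.observation_space.shape[0])      # per-dimension state normalizer
policy = Policy(env.observation_space.shape[0],              # assumed constructor signature
                env.action_space.shape[0], hp)
train(env, policy, normalizer, hp)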