Code Example #1
def learn(
        env,
        policy_fn,
        *,
        timesteps_per_batch,  # what to train on
        epsilon,
        beta,
        cg_iters,
        gamma,
        lam,  # advantage estimation
        entcoeff=0.0,
        cg_damping=1e-2,
        vf_stepsize=3e-4,
        vf_iters=3,
        max_timesteps=0,
        max_episodes=0,
        max_iters=0,  # time constraint
        callback=None,
        TRPO=False):
    nworkers = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()
    np.set_printoptions(precision=3)
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    discrete_ac_space = isinstance(ac_space, gym.spaces.Discrete)
    print("ob_space: " + str(ob_space))
    print("ac_space: " + str(ac_space))
    pi = policy_fn("pi", ob_space, ac_space)
    oldpi = policy_fn("oldpi", ob_space, ac_space)
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    old_entropy = oldpi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    entbonus = entcoeff * meanent

    vferr = tf.reduce_mean(tf.square(pi.vpred - ret))

    ratio = tf.exp(pi.pd.logp(ac) -
                   oldpi.pd.logp(ac))  # advantage * pnew / pold
    surrgain = tf.reduce_mean(ratio * atarg)

    optimgain = surrgain + entbonus
    losses = [optimgain, meankl, entbonus, surrgain, meanent]
    loss_names = ["optimgain", "meankl", "entloss", "surrgain", "Entropy"]

    dist = meankl

    all_var_list = pi.get_trainable_variables()
    all_var_list = [
        v for v in all_var_list if v.name.split("/")[0].startswith("pi")
    ]
    var_list = [
        v for v in all_var_list if v.name.split("/")[1].startswith("pol")
    ]
    vf_var_list = [
        v for v in all_var_list if v.name.split("/")[1].startswith("vf")
    ]
    vfadam = MpiAdam(vf_var_list)

    get_flat = U.GetFlat(var_list)
    set_from_flat = U.SetFromFlat(var_list)
    klgrads = tf.gradients(dist, var_list)
    flat_tangent = tf.placeholder(dtype=tf.float32,
                                  shape=[None],
                                  name="flat_tan")
    shapes = [var.get_shape().as_list() for var in var_list]
    start = 0
    tangents = []
    for shape in shapes:
        sz = U.intprod(shape)
        tangents.append(tf.reshape(flat_tangent[start:start + sz], shape))
        start += sz
    # gvp = grad(meankl) . tangent; taking the gradient of this scalar again
    # (flatgrad below) yields the Fisher-vector product fvp without ever
    # forming the Fisher matrix explicitly.
    gvp = tf.add_n([
        tf.reduce_sum(g * tangent)
        for (g, tangent) in zipsame(klgrads, tangents)
    ])  #pylint: disable=E1111
    fvp = U.flatgrad(gvp, var_list)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg], losses)
    compute_lossandgrad = U.function([ob, ac, atarg], losses +
                                     [U.flatgrad(optimgain, var_list)])
    compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
    compute_vflossandgrad = U.function([ob, ret],
                                       U.flatgrad(vferr, vf_var_list))

    @contextmanager
    def timed(msg):
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(
                colorize("done in %.3f seconds" % (time.time() - tstart),
                         color='magenta'))
        else:
            yield

    def allmean(x):
        assert isinstance(x, np.ndarray)
        out = np.empty_like(x)
        MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
        out /= nworkers
        return out

    U.initialize()
    th_init = get_flat()
    MPI.COMM_WORLD.Bcast(th_init, root=0)
    set_from_flat(th_init)
    vfadam.sync()
    print("Init param sum", th_init.sum(), flush=True)

    # Initialize eta, omega optimizer
    if discrete_ac_space:
        init_eta = 1
        init_omega = 0.5
        eta_omega_optimizer = EtaOmegaOptimizerDiscrete(
            beta, epsilon, init_eta, init_omega)
    else:
        init_eta = 0.5
        init_omega = 2.0
        # Optimizes the COPOS dual variables: eta for the KL bound (epsilon)
        # and omega for the entropy bound (beta).
        eta_omega_optimizer = EtaOmegaOptimizer(beta, epsilon, init_eta,
                                                init_omega)

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_batch,
                                     stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards

    assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0]) == 1

    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        logger.log("********** Iteration %i ************" % iters_so_far)

        with timed("sampling"):
            seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
            "tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate
        #print(ob[:20])
        #print(ac[:20])

        if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret)
        if hasattr(pi, "ob_rms"):
            print(pi.ob_rms.mean)
            pi.ob_rms.update(ob)  # update running mean/std for policy

        args = seg["ob"], seg["ac"], atarg
        fvpargs = [arr[::5] for arr in args]

        def fisher_vector_product(p):
            return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p

        assign_old_eq_new()  # set old parameter values to new parameter values
        with timed("computegrad"):
            *lossbefore, g = compute_lossandgrad(*args)
        lossbefore = allmean(np.array(lossbefore))
        g = allmean(g)
        if np.allclose(g, 0):
            logger.log("Got zero gradient. not updating")
        else:
            with timed("cg"):
                stepdir = cg(fisher_vector_product,
                             g,
                             cg_iters=cg_iters,
                             verbose=rank == 0)
            assert np.isfinite(stepdir).all()

            if TRPO:
                #
                # TRPO specific code.
                # Find correct step size using line search
                #
                shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
                lm = np.sqrt(shs / epsilon)
                # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
                fullstep = stepdir / lm
                expectedimprove = g.dot(fullstep)
                surrbefore = lossbefore[0]
                stepsize = 1.0
                thbefore = get_flat()
                for _ in range(10):
                    thnew = thbefore + fullstep * stepsize
                    set_from_flat(thnew)
                    meanlosses = surr, kl, *_ = allmean(
                        np.array(compute_losses(*args)))
                    improve = surr - surrbefore
                    logger.log("Expected: %.3f Actual: %.3f" %
                               (expectedimprove, improve))
                    if not np.isfinite(meanlosses).all():
                        logger.log("Got non-finite value of losses -- bad!")
                    elif kl > epsilon * 1.5:
                        logger.log("violated KL constraint. shrinking step.")
                    elif improve < 0:
                        logger.log("surrogate didn't improve. shrinking step.")
                    else:
                        logger.log("Stepsize OK!")
                        break
                    stepsize *= .5
                # for-else branch of the line search (disabled in this variant):
                #     logger.log("couldn't compute a good step")
                #     set_from_flat(thbefore)
                if nworkers > 1 and iters_so_far % 20 == 0:
                    paramsums = MPI.COMM_WORLD.allgather(
                        (thnew.sum(),
                         vfadam.getflat().sum()))  # list of tuples
                    assert all(
                        np.allclose(ps, paramsums[0]) for ps in paramsums[1:])

            else:
                #
                # COPOS specific implementation.
                #
                copos_update_dir = stepdir

                # Split direction into log-linear 'w_theta' and non-linear 'w_beta' parts
                w_theta, w_beta = pi.split_w(copos_update_dir)

                tmp_ob = np.zeros(
                    (1, ) + env.observation_space.shape
                )  # We assume that entropy does not depend on the NN

                # Optimize eta and omega
                if discrete_ac_space:
                    entropy = lossbefore[4]
                    #entropy = - 1/timesteps_per_batch * np.sum(np.sum(pi.get_action_prob(ob) * pi.get_log_action_prob(ob), axis=1))
                    eta, omega = eta_omega_optimizer.optimize(
                        pi.compute_F_w(ob, copos_update_dir),
                        pi.get_log_action_prob(ob), timesteps_per_batch,
                        entropy)
                else:
                    Waa, Wsa = pi.w2W(w_theta)
                    wa = pi.get_wa(ob, w_beta)

                    varphis = pi.get_varphis(ob)

                    #old_ent = old_entropy.eval({oldpi.ob: tmp_ob})[0]
                    old_ent = lossbefore[4]
                    eta, omega = eta_omega_optimizer.optimize(
                        w_theta, Waa, Wsa, wa, varphis, pi.get_kt(),
                        pi.get_prec_matrix(), pi.is_new_policy_valid, old_ent)
                logger.log("Initial eta: " + str(eta) + " and omega: " +
                           str(omega))

                current_theta_beta = get_flat()
                prev_theta, prev_beta = pi.all_to_theta_beta(
                    current_theta_beta)

                if discrete_ac_space:
                    # Do a line search for both theta and beta parameters by adjusting only eta
                    eta = eta_search(w_theta, w_beta, eta, omega, allmean,
                                     compute_losses, get_flat, set_from_flat,
                                     pi, epsilon, args, discrete_ac_space)
                    logger.log("Updated eta, eta: " + str(eta))
                    set_from_flat(pi.theta_beta_to_all(prev_theta, prev_beta))
                    # Find proper omega for new eta. Use old policy parameters first.
                    eta, omega = eta_omega_optimizer.optimize(
                        pi.compute_F_w(ob, copos_update_dir),
                        pi.get_log_action_prob(ob), timesteps_per_batch,
                        entropy, eta)
                    logger.log("Updated omega, eta: " + str(eta) +
                               " and omega: " + str(omega))

                    # do line search for ratio for non-linear "beta" parameter values
                    #ratio = beta_ratio_line_search(w_theta, w_beta, eta, omega, allmean, compute_losses, get_flat, set_from_flat, pi,
                    #                     epsilon, beta, args)
                    # set ratio to 1 if we do not use beta ratio line search
                    ratio = 1
                    #print("ratio from line search: " + str(ratio))
                    cur_theta = (eta * prev_theta +
                                 w_theta.reshape(-1, )) / (eta + omega)
                    cur_beta = prev_beta + ratio * w_beta.reshape(-1, ) / eta
                else:
                    for i in range(2):
                        # Do a line search for both theta and beta parameters by adjusting only eta
                        eta = eta_search(w_theta, w_beta, eta, omega, allmean,
                                         compute_losses, get_flat,
                                         set_from_flat, pi, epsilon, args)
                        logger.log("Updated eta, eta: " + str(eta) +
                                   " and omega: " + str(omega))

                        # Find proper omega for new eta. Use old policy parameters first.
                        set_from_flat(
                            pi.theta_beta_to_all(prev_theta, prev_beta))
                        eta, omega = \
                            eta_omega_optimizer.optimize(w_theta, Waa, Wsa, wa, varphis, pi.get_kt(),
                                                         pi.get_prec_matrix(), pi.is_new_policy_valid, old_ent, eta)
                        logger.log("Updated omega, eta: " + str(eta) +
                                   " and omega: " + str(omega))

                    # Use final policy
                    logger.log("Final eta: " + str(eta) + " and omega: " +
                               str(omega))
                    cur_theta = (eta * prev_theta +
                                 w_theta.reshape(-1, )) / (eta + omega)
                    cur_beta = prev_beta + w_beta.reshape(-1, ) / eta

                paramnew = allmean(pi.theta_beta_to_all(cur_theta, cur_beta))
                set_from_flat(paramnew)
                meanlosses = surr, kl, *_ = allmean(
                    np.array(compute_losses(*args)))
                if nworkers > 1 and iters_so_far % 20 == 0:
                    paramsums = MPI.COMM_WORLD.allgather(
                        (paramnew.sum(),
                         vfadam.getflat().sum()))  # list of tuples
                    assert all(
                        np.allclose(ps, paramsums[0]) for ps in paramsums[1:])
                # COPOS-specific update ends here
        # conjugate-gradient update branch ends here

        for (lossname, lossval) in zip(loss_names, meanlosses):
            logger.record_tabular(lossname, lossval)


        # policy update over; next, fit the value function
        with timed("vf"):
            for _ in range(vf_iters):
                for (mbob, mbret) in dataset.iterbatches(
                    (seg["ob"], seg["tdlamret"]),
                        include_final_partial_batch=False,
                        batch_size=64):
                    g = allmean(compute_vflossandgrad(mbob, mbret))
                    vfadam.update(g, vf_stepsize)

        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        print("Reward max: " + str(max(rewbuffer)))
        print("Reward min: " + str(min(rewbuffer)))

        logger.record_tabular(
            "EpLenMean",
            np.mean(lenbuffer) if len(lenbuffer) > 0 else 0.0)
        logger.record_tabular(
            "EpRewMean",
            np.mean(rewbuffer) if len(rewbuffer) > 0 else 0.0)
        logger.record_tabular(
            "AverageReturn",
            np.mean(rewbuffer) if len(rewbuffer) > 0 else 0.0)
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1

        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)

        if rank == 0:
            logger.dump_tabular()
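
Every example on this page solves F x = g with conjugate gradient, passing `fisher_vector_product` as a black-box matrix-vector product so the Fisher matrix is never materialized. The `cg` helper itself comes from baselines; a minimal sketch of such a solver, assuming only NumPy (the real helper adds residual-norm logging), looks like this:

import numpy as np

def conjugate_gradient(f_Ax, b, cg_iters=10, residual_tol=1e-10):
    """Solve A x = b given only the matrix-vector product f_Ax(v) = A v."""
    x = np.zeros_like(b)
    r = b.copy()                 # residual b - A x (x starts at zero)
    p = r.copy()                 # conjugate search direction
    rdotr = r.dot(r)
    for _ in range(cg_iters):
        Ap = f_Ax(p)
        alpha = rdotr / p.dot(Ap)
        x += alpha * p           # step along the current direction
        r -= alpha * Ap
        new_rdotr = r.dot(r)
        if new_rdotr < residual_tol:
            break
        p = r + (new_rdotr / rdotr) * p   # next A-conjugate direction
        rdotr = new_rdotr
    return x

Here `f_Ax` plays the role of `fisher_vector_product`, which already folds in the `cg_damping * p` regularization term.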
Code Example #2
File: trpo_vrep.py  Project: zijingchi/baselines
def learn(env,
          policy_func,
          rank,
          pretrained,
          pretrained_weight,
          *,
          g_step,
          d_step,
          entcoeff,
          save_per_iter,
          ckpt_dir,
          log_dir,
          timesteps_per_batch,
          task_name,
          gamma,
          lam,
          max_kl,
          cg_iters,
          cg_damping=1e-2,
          vf_stepsize=1e-4,
          vf_iters=3,
          max_timesteps=0,
          max_episodes=0,
          max_iters=0,
          callback=None):

    nworkers = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()
    np.set_printoptions(precision=3)
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func(
        "pi",
        ob_space,
        ac_space,
    )  # reuse=(pretrained_weight != None)
    oldpi = policy_func("oldpi", ob_space, ac_space)
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    #ob = U.get_placeholder_cached(name="ob")
    ob_config = U.get_placeholder_cached(name="ob")
    ob_target = U.get_placeholder_cached(name="goal")
    obs_pos = U.get_placeholder_cached(name="obs_pos")
    #obs_pos2 = U.get_placeholder_cached(name="obs_pos2")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    entbonus = entcoeff * meanent

    vferr = tf.reduce_mean(tf.square(pi.vpred - ret))

    ratio = tf.exp(pi.pd.logp(ac) -
                   oldpi.pd.logp(ac))  # advantage * pnew / pold
    surrgain = tf.reduce_mean(ratio * atarg)

    optimgain = surrgain + entbonus
    losses = [optimgain, meankl, entbonus, surrgain, meanent]
    loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]

    dist = meankl

    all_var_list = pi.get_trainable_variables()
    var_list = [
        v for v in all_var_list if v.name.startswith("pi/pol")
        or v.name.startswith("pi/logstd") or v.name.startswith("pi/obs")
    ]
    vf_var_list = [
        v for v in all_var_list
        if v.name.startswith("pi/vf") or v.name.startswith("pi/obs")
    ]

    vfadam = MpiAdam(vf_var_list)

    get_flat = U.GetFlat(var_list)
    set_from_flat = U.SetFromFlat(var_list)
    klgrads = tf.gradients(dist, var_list)
    flat_tangent = tf.placeholder(dtype=tf.float32,
                                  shape=[None],
                                  name="flat_tan")
    shapes = [var.get_shape().as_list() for var in var_list]
    start = 0
    tangents = []
    for shape in shapes:
        sz = U.intprod(shape)
        tangents.append(tf.reshape(flat_tangent[start:start + sz], shape))
        start += sz
    gvp = tf.add_n([
        tf.reduce_sum(g * tangent)
        for (g, tangent) in zipsame(klgrads, tangents)
    ])  # pylint: disable=E1111
    fvp = U.flatgrad(gvp, var_list)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function([ob_config, ob_target, obs_pos, ac, atarg],
                                losses)
    compute_lossandgrad = U.function(
        [ob_config, ob_target, obs_pos, ac, atarg],
        losses + [U.flatgrad(optimgain, var_list)])
    compute_fvp = U.function(
        [flat_tangent, ob_config, ob_target, obs_pos, ac, atarg], fvp)
    compute_vflossandgrad = U.function([ob_config, ob_target, obs_pos, ret],
                                       U.flatgrad(vferr, vf_var_list))

    @contextmanager
    def timed(msg):
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(
                colorize("done in %.3f seconds" % (time.time() - tstart),
                         color='magenta'))
        else:
            yield

    def allmean(x):
        assert isinstance(x, np.ndarray)
        out = np.empty_like(x)
        MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
        out /= nworkers
        return out

    U.initialize()
    th_init = get_flat()
    MPI.COMM_WORLD.Bcast(th_init, root=0)
    set_from_flat(th_init)
    vfadam.sync()
    if rank == 0:
        print("Init param sum", th_init.sum(), flush=True)

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_batch,
                                     stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards
    true_rewbuffer = deque(maxlen=40)
    max_trm = -5
    true_reward_mean = 0
    assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0]) == 1

    g_loss_stats = stats(loss_names)
    ep_stats = stats(["True_rewards", "Rewards", "Episode_length"])
    # if provide pretrained weight
    if pretrained_weight is not None:
        #U.load_variables(pretrained_weight, variables=pi.get_variables())
        saver = tf.train.Saver()
        saver.restore(tf.get_default_session(), pretrained_weight)

    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break

        # Save model
        if rank == 0 and ckpt_dir is not None and true_reward_mean > max_trm:
            fname = os.path.join(ckpt_dir, task_name)
            os.makedirs(os.path.dirname(fname), exist_ok=True)
            saver = tf.train.Saver()
            saver.save(tf.get_default_session(), fname)
            max_trm = true_reward_mean

        logger.log("********** Iteration %i ************" % iters_so_far)

        def fisher_vector_product(p):
            v1 = allmean(compute_fvp(p, *fvpargs))
            # print("norm(v1):%.2e, norm(p):%.2e, cg_damping:%.2e"%(np.linalg.norm(v1), np.linalg.norm(p), cg_damping))
            return v1 + cg_damping * p

        # ------------------ Update G ------------------
        logger.log("Optimizing Policy...")
        for _ in range(g_step):
            with timed("sampling"):
                seg = seg_gen.__next__()
            add_vtarg_and_adv(seg, gamma, lam)
            # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
            ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
                "tdlamret"]
            vpredbefore = seg[
                "vpred"]  # predicted value function before update
            atarg = (atarg - atarg.mean()) / atarg.std(
            )  # standardized advantage function estimate

            if hasattr(pi, "ob_rms"):
                pi.ob_rms.update(ob)  # update running mean/std for policy
            config, goal, obstacle_pos = [], [], []
            for o in seg["ob"]:
                config.append(o["joint"])
                goal.append(o["target"])
                obstacle_pos.append(o["obstacle_pos1"])
                #obstacle_pos2.append(o["obstacle_pos2"])
            config, goal, obstacle_pos = map(np.array,
                                             [config, goal, obstacle_pos])
            args = config, goal, obstacle_pos, seg["ac"], atarg
            fvpargs = [arr[::5] for arr in args]

            assign_old_eq_new(
            )  # set old parameter values to new parameter values
            with timed("computegrad"):
                *lossbefore, g = compute_lossandgrad(*args)
            lossbefore = allmean(np.array(lossbefore))
            g = allmean(g)
            if np.allclose(g, 0):
                logger.log("Got zero gradient. not updating")
            else:
                with timed("cg"):
                    stepdir = cg(fisher_vector_product,
                                 g,
                                 cg_iters=cg_iters,
                                 verbose=rank == 0)
                    logger.log(
                        'iter:{:d}, norm of g: {:.4f}, error of cg: {:.4f}'.
                        format(
                            cg_iters, np.linalg.norm(g),
                            np.linalg.norm(g -
                                           compute_fvp(stepdir, *fvpargs))))
                assert np.isfinite(stepdir).all()

                shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
                lm = np.sqrt(shs / max_kl)
                # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
                fullstep = stepdir / lm
                expectedimprove = g.dot(fullstep)
                surrbefore = lossbefore[0]
                stepsize = 1.0
                thbefore = get_flat()
                for _ in range(10):
                    thnew = thbefore + fullstep * stepsize
                    set_from_flat(thnew)
                    meanlosses = surr, kl, *_ = allmean(
                        np.array(compute_losses(*args)))
                    improve = surr - surrbefore
                    logger.log("Expected: %.3f Actual: %.3f" %
                               (expectedimprove, improve))
                    if not np.isfinite(meanlosses).all():
                        logger.log("Got non-finite value of losses -- bad!")
                    elif kl > max_kl * 1.5:
                        logger.log("violated KL constraint. shrinking step.")
                    elif improve < 0:
                        logger.log("surrogate didn't improve. shrinking step.")
                    else:
                        logger.log("Stepsize OK!")
                        break
                    stepsize *= .5
                else:
                    logger.log("couldn't compute a good step")
                    set_from_flat(thbefore)
                if nworkers > 1 and iters_so_far % 20 == 0:
                    paramsums = MPI.COMM_WORLD.allgather(
                        (thnew.sum(),
                         vfadam.getflat().sum()))  # list of tuples
                    assert all(
                        np.allclose(ps, paramsums[0]) for ps in paramsums[1:])
            with timed("vf"):
                for _ in range(vf_iters):
                    for (mbob, mbg, mbop, mbret) in dataset.iterbatches(
                        (config, goal, obstacle_pos, seg["tdlamret"]),
                            include_final_partial_batch=False,
                            batch_size=128):
                        if hasattr(pi, "ob_rms"):
                            pi.ob_rms.update(
                                mbob)  # update running mean/std for policy
                        g = allmean(
                            compute_vflossandgrad(mbob, mbg, mbop, mbret))
                        vfadam.update(g, vf_stepsize)

        g_losses = meanlosses
        for (lossname, lossval) in zip(loss_names, meanlosses):
            logger.record_tabular(lossname, lossval)
        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))

        lrlocal = (seg["ep_lens"], seg["ep_rets"], seg["ep_true_rets"]
                   )  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews, true_rets = map(flatten_lists, zip(*listoflrpairs))
        true_rewbuffer.extend(true_rets)
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)

        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpTrueRewMean", np.mean(true_rewbuffer))
        true_reward_mean = np.mean(true_rewbuffer)
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1

        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)

        if rank == 0:
            logger.dump_tabular()
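
The `shs`/`lm` lines in this example (and the others) implement the standard TRPO step rescaling: under the quadratic model KL(theta + s) ~= 0.5 s^T F s, dividing the CG direction by lm = sqrt(shs / max_kl) places the full step exactly on the trust-region boundary. A hedged sketch with a hypothetical name, `scale_to_trust_region`:

import numpy as np

def scale_to_trust_region(stepdir, fisher_vector_product, max_kl):
    # shs approximates the KL cost of the raw step under the quadratic model.
    shs = 0.5 * stepdir.dot(fisher_vector_product(stepdir))
    lm = np.sqrt(shs / max_kl)   # Lagrange multiplier of the KL constraint
    return stepdir / lm          # = stepdir * sqrt(max_kl / shs)

The backtracking loop afterwards halves `stepsize` until the measured KL stays below 1.5 * max_kl and the surrogate actually improves.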
Code Example #3
File: remps_runner.py  Project: stjordanis/remps
def train(
    env: ConfMDP,
    policy: Policy,
    model_approximator: ModelApproximator,
    eval_steps: int = 4,
    eval_freq: int = 5,
    n_trajectories: int = 20,
    iteration_number: int = 2000,
    gamma: float = 1,
    render=False,
    checkpoint_file: str = "tf_checkpoint/general/model.ckpt",
    restore_variables: bool = False,
    save_variables: bool = True,
    logdir: str = None,
    log: bool = False,
    omega=5,
    kappa: float = 1e-5,
    training_set_size: int = 500,
    normalize_data: bool = False,
    dual_reg: float = 0.0,
    policy_reg: float = 0.0,
    exact: bool = False,
    num_processes: int = 1,
    load_data: bool = True,
    **kwargs,
):
    """
    Runner for the REMPS algorithm.
    Sets up logging, initializes the agent, and takes care of fitting the model
    or loading stored data. Executes the main training loop by managing workers.
    :param env: Environment (Conf-MDP)
    :param policy: The agent policy
    :param model_approximator: the approximation of the model or the true model
    :param eval_steps: number of evaluation episodes to run
    :param eval_freq: evaluate every eval_freq iterations
    :param n_trajectories: number of trajectories to collect
    :param iteration_number: number of iterations of REMPS
    :param gamma: discount factor
    :param render: whether to render episodes
    :param checkpoint_file: where to store checkpoints
    :param restore_variables: restore variables or not from checkpoint
    :param save_variables: save variables in checkpoint
    :param logdir: directory containing logs
    :param log: if true, the agent logs the action probabilities
    :param omega: initial environment parameters
    :param kappa: constraint parameter of REMPS
    :param training_set_size: number of samples contained in the training set
    :param normalize_data: Whether to normalize data from the training set
    :param dual_reg: regularization on the dual
    :param policy_reg: regularization on the policy
    :param exact: whether the model approximation is exact or not
    :param num_processes: number of worker processes
    :param load_data: whether to load stored data
    :param kwargs:
    :return:
    """

    # setup logging
    writer = tf.summary.FileWriter(logdir)
    logger.configure(dir=logdir, format_strs=["stdout", "csv"])

    # setup agent
    agent = REMPS(
        policy=policy,
        model=model_approximator,
        env=env,
        kappa=kappa,
        projection_type=Projection.STATE_KERNEL,
        use_features=False,
        training_set_size=training_set_size,
        L2_reg_dual=dual_reg,
        L2_reg_loss=policy_reg,
        exact=exact,
    )

    # create parallel samplers
    # Split work among workers
    n_steps = n_trajectories
    nb_episodes_per_worker = n_steps // num_processes

    inputQs = [Queue() for _ in range(num_processes)]
    outputQ = Queue()
    workers = [
        SamplingWorker(
            policy,
            env,
            nb_episodes_per_worker,
            inputQs[i],
            outputQ,
            env.action_space.n,
            env.observation_space_size,
        )
        for i in range(num_processes)
    ]

    # Start the workers
    for w in workers:
        w.start()

    # Collect data for model fitting
    # torcs model fitting needs to be done before the session initialization
    # due to multiprocessing issues
    if not load_data and isinstance(env, Torcs):
        x, y, avg_rew, ret = collect_data(
            env,
            policy=policy,
            total_n_samples=training_set_size,
            n_params=2,
            initial_port=env.port + 1000,
        )
        logger.log(
            f"Data collection terminated. Avg rew: {np.mean(avg_rew)}, Avg ret: {np.mean(ret)}",
            logger.INFO,
        )

    with U.single_threaded_session() as sess:

        # initialization with session
        agent.initialize(sess, writer, omega)

        # to save variables
        saver = tf.train.Saver()

        # initialize all
        if restore_variables:
            # Add ops to save and restore all the variables.
            saver.restore(sess, tf.train.latest_checkpoint(checkpoint_file))
        else:
            init = tf.global_variables_initializer()
            sess.run(init)

        # make sure all variables are initialized
        sess.run(tf.assert_variables_initialized())

        logger.log("Collecting Data", level=logger.INFO)

        if not load_data and not isinstance(env, Torcs):
            x, y = run_env(
                env,
                episode_count=1,
                bins=200,
                omega_max=30,
                omega_min=1,
                n_samples_per_omega=500,
                policy=agent,
                grid=True,
                total_n_samples=training_set_size,
            )

            # store data in the agent
            agent.store_data(x, y, normalize_data)

            logger.log("Data Stored", logger.INFO)

        # fit the model
        agent.fit()

        logger.log("Model fitted", logger.INFO)

        # set configurable parameters
        env.set_params(omega)

        get_parameters = U.GetFlat(agent.get_policy_params())

        # -------------------------------------
        # --------- Training Loop -------------
        # -------------------------------------

        for n in range(iteration_number):
            states = list()
            next_states = list()
            rewards = list()
            actions_one_hot = list()
            actions = list()
            timesteps = list()
            paths = list()

            # statistics
            wins = 0
            small_vel = 0
            traj = 0
            confort_violation = 0
            reward_list = list()
            policy_ws = get_parameters()

            # Run parallel sampling:
            # for each worker send message sample with
            # policy weights and environment parameters
            for i in range(num_processes):
                inputQs[i].put(("sample", policy_ws, omega))

            # Collect results when ready
            with timed("sampling"):
                for i in range(num_processes):
                    _, stats = outputQ.get()
                    states.extend(stats["states"])
                    paths.extend(stats["paths"])
                    next_states.extend(stats["next_states"])
                    rewards.extend(stats["rewards"])
                    actions_one_hot.extend(stats["actions_one_hot"])
                    actions.extend(stats["actions"])
                    timesteps.extend(stats["timesteps"])
                    reward_list.extend(stats["reward_list"])
                    wins += stats["wins"]
                    small_vel += stats["small_vel"]
                    traj += stats["traj"]
                    confort_violation += stats["confort_violation"]

            samples_data = {
                "actions": np.matrix(actions).transpose(),
                "actions_one_hot": np.array(actions_one_hot),
                "observations": states,
                "paths": paths,
                "rewards": np.transpose(np.expand_dims(np.array(rewards), axis=0)),
                "reward_list": reward_list,
                "timesteps": timesteps,
                "wins": (wins / traj) * 100,
                "omega": omega,
                "traj": traj,
                "confort_violation": confort_violation,
            }

            # print statistics
            logger.log(f"Training steps: {n}", logger.INFO)
            logger.log(f"Number of wins: {wins}", logger.INFO)
            logger.log(f"Percentage of wins: {(wins/n_trajectories)*100}", logger.INFO)
            logger.log(f"Average reward: {np.mean(reward_list)}", logger.INFO)
            logger.log(f"Avg timesteps: {np.mean(timesteps)}")

            # learning routine
            with timed("training"):
                omega = agent.train(samples_data)

            # Configure environments with
            # parameters returned by the agent
            env.set_params(omega)

            # Only TORCS: we kill torcs every 10 iterations due to a memory leak
            if n % 10 == 0 and isinstance(env, Torcs):
                print("Killing torcs")
                os.system("ps | grep torcs | awk '{print $1}' | xargs kill -9")

            # -------------------------------------
            # --------- Evaluation ----------------
            # -------------------------------------
            if ((n + 1) % eval_freq) == 0:

                # for plotting
                eval_rewards = []

                # evaluation loop
                for i in range(eval_steps):

                    logger.log("Evaluating...", logger.INFO)
                    state = env.reset()
                    done = False
                    # gamma_cum is gamma^t
                    gamma_cum = 1
                    cum_reward = 0
                    t = 0

                    # here starts an episode
                    while not done:

                        if render:
                            env.render()

                        # sample one action from the policy
                        action = agent.pi(state[np.newaxis, :], log=log)

                        # observe the next state, reward etc
                        newState, reward, done, info = env.step(action)

                        cum_reward += reward * gamma_cum
                        gamma_cum = gamma * gamma_cum

                        state = newState

                        if done:
                            break

                        t = t + 1

                    eval_rewards.append(cum_reward)

                # save variables
                if save_variables:
                    save_path = saver.save(sess, checkpoint_file)
                    logger.log(f"Steps: {n}", logger.INFO)
                    logger.log(f"Model saved in path: {save_path}", logger.INFO)

        # Close the env
        env.close()

        # save variables
        if save_variables:
            save_path = saver.save(sess, checkpoint_file)
            logger.log(f"Model saved in path: {save_path}")

        # exit workers
        for i in range(num_processes):
            inputQs[i].put(("exit", None, None))
Code Example #4
def learn(
        env,
        policy_fn,
        *,
        timesteps_per_batch,  # what to train on
        max_kl,
        cg_iters,
        gamma,
        lam,  # advantage estimation
        entcoeff=0.0,
        cg_damping=1e-2,
        vf_stepsize=3e-4,
        vf_iters=3,
        max_timesteps=0,
        max_episodes=0,
        max_iters=0,  # time constraint
        callback=None):
    nworkers = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()
    np.set_printoptions(precision=3)
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_fn("pi", ob_space, ac_space)
    oldpi = policy_fn("oldpi", ob_space, ac_space)
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    entbonus = entcoeff * meanent

    vferr = tf.reduce_mean(tf.square(pi.vpred - ret))

    ratio = tf.exp(pi.pd.logp(ac) -
                   oldpi.pd.logp(ac))  # advantage * pnew / pold
    surrgain = tf.reduce_mean(ratio * atarg)

    optimgain = surrgain + entbonus
    losses = [optimgain, meankl, entbonus, surrgain, meanent]
    loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]

    dist = meankl

    all_var_list = pi.get_trainable_variables()
    var_list = [
        v for v in all_var_list if v.name.split("/")[1].startswith("pol")
    ]
    vf_var_list = [
        v for v in all_var_list if v.name.split("/")[1].startswith("vf")
    ]
    vfadam = MpiAdam(vf_var_list)

    get_flat = U.GetFlat(var_list)
    set_from_flat = U.SetFromFlat(var_list)
    klgrads = tf.gradients(dist, var_list)
    flat_tangent = tf.placeholder(dtype=tf.float32,
                                  shape=[None],
                                  name="flat_tan")
    shapes = [var.get_shape().as_list() for var in var_list]
    start = 0
    tangents = []
    for shape in shapes:
        sz = U.intprod(shape)
        tangents.append(tf.reshape(flat_tangent[start:start + sz], shape))
        start += sz
    gvp = tf.add_n([
        tf.reduce_sum(g * tangent)
        for (g, tangent) in zipsame(klgrads, tangents)
    ])  #pylint: disable=E1111
    fvp = U.flatgrad(gvp, var_list)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg], losses)
    compute_lossandgrad = U.function([ob, ac, atarg], losses +
                                     [U.flatgrad(optimgain, var_list)])
    compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
    compute_vflossandgrad = U.function([ob, ret],
                                       U.flatgrad(vferr, vf_var_list))

    @contextmanager
    def timed(msg):
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(
                colorize("done in %.3f seconds" % (time.time() - tstart),
                         color='magenta'))
        else:
            yield

    def allmean(x):
        assert isinstance(x, np.ndarray)
        out = np.empty_like(x)
        MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
        out /= nworkers
        return out

    U.initialize()
    th_init = get_flat()
    MPI.COMM_WORLD.Bcast(th_init, root=0)
    set_from_flat(th_init)
    vfadam.sync()
    print("Init param sum", th_init.sum(), flush=True)

    # Prepare for rollouts
    # ----------------------------------------
    eval_seq = traj_segment_generator_eval(pi,
                                           env,
                                           timesteps_per_batch,
                                           stochastic=False)
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_batch,
                                     stochastic=True,
                                     eval_seq=eval_seq)

    global timesteps_so_far, episodes_so_far, iters_so_far, \
        tstart, lenbuffer, rewbuffer, best_fitness
    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards

    assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0]) == 1

    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        logger.log("********** Iteration %i ************" % iters_so_far)
        if iters_so_far == 0:
            eval_seg = eval_seq.__next__()
            rewbuffer.extend(eval_seg["ep_rets"])
            lenbuffer.extend(eval_seg["ep_lens"])
            result_record()

        with timed("sampling"):
            seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
            "tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate

        if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret)
        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        args = seg["ob"], seg["ac"], atarg
        fvpargs = [arr[::5] for arr in args]

        def fisher_vector_product(p):
            return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p

        assign_old_eq_new()  # set old parameter values to new parameter values
        with timed("computegrad"):
            *lossbefore, g = compute_lossandgrad(*args)
        lossbefore = allmean(np.array(lossbefore))
        g = allmean(g)
        if np.allclose(g, 0):
            logger.log("Got zero gradient. not updating")
        else:
            with timed("cg"):
                stepdir = cg(fisher_vector_product,
                             g,
                             cg_iters=cg_iters,
                             verbose=rank == 0)
            assert np.isfinite(stepdir).all()
            shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
            lm = np.sqrt(shs / max_kl)
            # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
            fullstep = stepdir / lm
            expectedimprove = g.dot(fullstep)
            surrbefore = lossbefore[0]
            stepsize = 1.0
            thbefore = get_flat()
            for _ in range(10):
                thnew = thbefore + fullstep * stepsize
                set_from_flat(thnew)
                meanlosses = surr, kl, *_ = allmean(
                    np.array(compute_losses(*args)))
                improve = surr - surrbefore
                logger.log("Expected: %.3f Actual: %.3f" %
                           (expectedimprove, improve))
                if not np.isfinite(meanlosses).all():
                    logger.log("Got non-finite value of losses -- bad!")
                elif kl > max_kl * 1.5:
                    logger.log("violated KL constraint. shrinking step.")
                elif improve < 0:
                    logger.log("surrogate didn't improve. shrinking step.")
                else:
                    logger.log("Stepsize OK!")
                    break
                stepsize *= .5
            else:
                logger.log("couldn't compute a good step")
                set_from_flat(thbefore)
            if nworkers > 1 and iters_so_far % 20 == 0:
                paramsums = MPI.COMM_WORLD.allgather(
                    (thnew.sum(), vfadam.getflat().sum()))  # list of tuples
                assert all(
                    np.allclose(ps, paramsums[0]) for ps in paramsums[1:])

        # for (lossname, lossval) in zip(loss_names, meanlosses):
        #     logger.record_tabular(lossname, lossval)

        with timed("vf"):
            for _ in range(vf_iters):
                for (mbob, mbret) in dataset.iterbatches(
                    (seg["ob"], seg["tdlamret"]),
                        include_final_partial_batch=False,
                        batch_size=64):
                    g = allmean(compute_vflossandgrad(mbob, mbret))
                    vfadam.update(g, vf_stepsize)

        # logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret))
        #
        # lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values
        # listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples
        # lens, rews = map(flatten_lists, zip(*listoflrpairs))
        # lenbuffer.extend(lens)
        # rewbuffer.extend(rews)
        #
        # logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        # logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        # logger.record_tabular("EpThisIter", len(lens))
        # episodes_so_far += len(lens)
        # timesteps_so_far += sum(lens)
        iters_so_far += 1
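
Every TRPO-style example here calls `add_vtarg_and_adv(seg, gamma, lam)` before standardizing `atarg`. In OpenAI baselines that helper computes GAE(lambda) advantages and TD(lambda) return targets roughly as follows, assuming the segment dict carries "new", "vpred", "nextvpred", and "rew" arrays as the trajectory generators in these examples produce:

import numpy as np

def add_vtarg_and_adv(seg, gamma, lam):
    # GAE(lambda): delta_t = r_t + gamma * V(s_{t+1}) * (1 - done) - V(s_t)
    #              A_t     = delta_t + gamma * lam * (1 - done) * A_{t+1}
    new = np.append(seg["new"], 0)                   # episode-start flags
    vpred = np.append(seg["vpred"], seg["nextvpred"])
    T = len(seg["rew"])
    seg["adv"] = gaelam = np.empty(T, dtype="float32")
    lastgaelam = 0.0
    for t in reversed(range(T)):
        nonterminal = 1 - new[t + 1]
        delta = seg["rew"][t] + gamma * vpred[t + 1] * nonterminal - vpred[t]
        gaelam[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
    seg["tdlamret"] = seg["adv"] + seg["vpred"]      # TD(lambda) value targets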
Code Example #5
def train(env,
          nb_epochs,
          nb_episodes,
          nb_epoch_cycles,
          episode_length,
          nb_train_steps,
          eval_freq,
          save_freq,
          nb_eval_episodes,
          actor,
          critic,
          memory,
          gamma,
          normalize_returns,
          normalize_observations,
          critic_l2_reg,
          action_noise,
          param_noise,
          popart,
          clip_norm,
          batch_size,
          reward_scale,
          action_repeat,
          full,
          exclude_centering_frame,
          visualize,
          fail_reward,
          num_processes,
          num_processes_to_wait,
          num_testing_processes,
          learning_session,
          min_buffer_length,
          integrator_accuracy=5e-5,
          max_env_traj=100,
          tau=0.01):
    """
    Parameters
    ----------
    nb_epochs : the number of epochs to train.

    nb_episodes : the number of episodes for each epoch.

    episode_length : the maximum number of steps for each episode.

    gamma : discount factor.

    tau : soft update coefficient.

    clip_norm : clip on the norm of the gradient.
    """

    assert action_repeat > 0
    assert nb_episodes >= num_processes

    # Get params from learning session
    checkpoint_dir = learning_session.checkpoint_dir
    log_dir = learning_session.log_dir
    training_step = learning_session.last_training_step

    # Initialize DDPG agent (target network and replay buffer)
    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space.shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=None,
                 critic_l2_reg=critic_l2_reg,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale,
                 training_step=training_step)

    # We need max_action because the NN output layer is a tanh.
    # So we must scale it back.
    max_action = env.action_space.high

    # Build Workers
    events = [Event() for _ in range(num_processes)]
    inputQs = [Queue() for _ in range(num_processes)]
    outputQ = Queue()
    # Split work among workers
    nb_episodes_per_worker = nb_episodes // num_processes

    workers = [
        SamplingWorker(i, actor, critic, episode_length,
                       nb_episodes_per_worker, action_repeat, max_action,
                       gamma, tau, normalize_returns, batch_size,
                       normalize_observations, param_noise, critic_l2_reg,
                       popart, clip_norm, reward_scale, events[i], inputQs[i],
                       outputQ, full, exclude_centering_frame,
                       integrator_accuracy, max_env_traj, visualize,
                       fail_reward) for i in range(num_processes)
    ]

    # Run the Workers
    for w in workers:
        w.start()

    # Create Round Robin tester
    tester = RoundRobinTester(
        num_testing_processes, actor, critic, episode_length, nb_eval_episodes,
        action_repeat, max_action, gamma, tau, normalize_returns, batch_size,
        normalize_observations, critic_l2_reg, popart, clip_norm, reward_scale,
        full, exclude_centering_frame, integrator_accuracy, max_env_traj,
        visualize, fail_reward)

    # Start training loop
    with U.single_threaded_session() as sess:
        agent.initialize(sess)

        writer = tf.summary.FileWriter(log_dir)
        writer.add_graph(sess.graph)

        # Initialize writer and statistics
        stats = EvaluationStatistics(tf_session=sess, tf_writer=writer)

        # setup saver
        saver = tf.train.Saver(max_to_keep=10, keep_checkpoint_every_n_hours=2)

        get_parameters = U.GetFlat(actor.trainable_vars)

        global_step = 0
        obs = env.reset()
        agent.reset()

        # Processes waiting for a new sampling task
        waiting_indices = [i for i in range(num_processes)]
        for epoch in range(nb_epochs):
            for cycle in range(nb_epoch_cycles):
                # If we have sampling workers waiting, dispatch a sampling job
                if waiting_indices:
                    actor_ws = get_parameters()
                    # Run parallel sampling
                    for i in waiting_indices:
                        inputQs[i].put(('sample', actor_ws))
                        events[i].set()  # Notify worker: sample baby, sample!
                    waiting_indices.clear()

                # Collect results when ready
                for i in range(num_processes_to_wait):
                    process_index, transitions = outputQ.get()
                    waiting_indices.append(process_index)
                    print('Collecting transition samples from Worker {}/{}'.
                          format(i + 1, num_processes_to_wait))
                    for t in transitions:
                        agent.store_transition(*t)

                # try to collect other samples if available
                for i in range(num_processes):
                    try:
                        process_index, transitions = outputQ.get_nowait()
                        if process_index not in waiting_indices:
                            waiting_indices.append(process_index)
                        print('Collecting transition samples from Worker {}'.
                              format(process_index))
                        for t in transitions:
                            agent.store_transition(*t)
                    except queue.Empty:
                        # No sampling ready, keep on training.
                        pass

                # Training phase
                if agent.memory.nb_entries > min_buffer_length:
                    for _ in range(nb_train_steps):
                        critic_loss, actor_loss = agent.train()
                        agent.update_target_net()

                        # Plot statistics
                        stats.add_critic_loss(critic_loss, global_step)
                        stats.add_actor_loss(actor_loss, global_step)
                        global_step += 1

                    # Evaluation phase
                    if cycle % eval_freq == 0:
                        print("Cycle number: ",
                              cycle + epoch * nb_epoch_cycles)
                        print("Sending testing job...")
                        actor_ws = get_parameters()

                        # Send a testing job
                        tester.test(actor_ws, global_step)

                        # Print stats (if any)
                        tester.log_stats(stats, logger)

                    if cycle % save_freq == 0:
                        # Save weights
                        save_path = saver.save(sess,
                                               checkpoint_dir,
                                               global_step=global_step)
                        print("Model saved in path: %s" % save_path)
                        # Dump learning session
                        learning_session.dump(agent.training_step)
                        print("Learning session dumped to: %s" %
                              str(learning_session.session_path))
                else:
                    print("Not enough entry in memory buffer")

        # Stop workers
        for i in range(num_processes):
            inputQs[i].put(('exit', None))
            events[i].set()  # Notify worker: exit!
        tester.close()  # Stop testing workers
        env.close()
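
The dispatch/collect pattern above (a blocking collect from a fixed number of workers, followed by an opportunistic non-blocking sweep) can be shown in isolation. The sketch below is a minimal stand-in, not this example's actual worker code: the worker body, the queue payloads, and the 'actor_weights' placeholder are all hypothetical.

import multiprocessing as mp
import queue

def worker(index, inputQ, outputQ, event):
    # Hypothetical worker: in the listing, a 'sample' job runs the actor in the env.
    while True:
        event.wait()
        event.clear()
        cmd, payload = inputQ.get()
        if cmd == 'exit':
            break
        transitions = [(index, payload)]  # placeholder transition tuples
        outputQ.put((index, transitions))

if __name__ == '__main__':
    num_processes = 2
    outputQ = mp.Queue()
    inputQs = [mp.Queue() for _ in range(num_processes)]
    events = [mp.Event() for _ in range(num_processes)]
    procs = [mp.Process(target=worker, args=(i, inputQs[i], outputQ, events[i]))
             for i in range(num_processes)]
    for p in procs:
        p.start()
    for i in range(num_processes):            # dispatch one sampling job per worker
        inputQs[i].put(('sample', 'actor_weights'))
        events[i].set()
    ready = [outputQ.get() for _ in range(num_processes)]  # blocking collect
    try:
        extra = outputQ.get_nowait()          # opportunistic extra collect
    except queue.Empty:
        extra = None
    for i in range(num_processes):            # stop workers
        inputQs[i].put(('exit', None))
        events[i].set()
    for p in procs:
        p.join()
    print('collected from workers:', [idx for idx, _ in ready])
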
コード例 #6
0
def learn(make_env, make_policy, *,
          n_episodes,
          horizon,
          delta,
          gamma,
          max_iters,
          sampler=None,
          use_natural_gradient=False,  # can also be 'exact' or 'approximate'
          fisher_reg=1e-2,
          iw_method='is',
          iw_norm='none',
          bound='J',
          line_search_type='parabola',
          save_weights=0,
          improvement_tol=0.,
          center_return=False,
          render_after=None,
          max_offline_iters=100,
          callback=None,
          clipping=False,
          entropy='none',
          positive_return=False,
          reward_clustering='none'):

    np.set_printoptions(precision=3)
    max_samples = horizon * n_episodes

    if line_search_type == 'binary':
        line_search = line_search_binary
    elif line_search_type == 'parabola':
        line_search = line_search_parabola
    else:
        raise ValueError('unknown line_search_type: %s' % line_search_type)

    # Building the environment
    env = make_env()
    ob_space = env.observation_space
    ac_space = env.action_space

    # Building the policy
    pi = make_policy('pi', ob_space, ac_space)
    oldpi = make_policy('oldpi', ob_space, ac_space)

    all_var_list = pi.get_trainable_variables()
    var_list = [v for v in all_var_list if v.name.split('/')[1].startswith('pol')]

    shapes = [U.intprod(var.get_shape().as_list()) for var in var_list]
    n_parameters = sum(shapes)

    # Placeholders
    ob_ = ob = U.get_placeholder_cached(name='ob')
    ac_ = pi.pdtype.sample_placeholder([max_samples], name='ac')
    mask_ = tf.placeholder(dtype=tf.float32, shape=(max_samples), name='mask')
    rew_ = tf.placeholder(dtype=tf.float32, shape=(max_samples), name='rew')
    disc_rew_ = tf.placeholder(dtype=tf.float32, shape=(max_samples), name='disc_rew')
    clustered_rew_ = tf.placeholder(dtype=tf.float32, shape=(n_episodes))
    gradient_ = tf.placeholder(dtype=tf.float32, shape=(n_parameters, 1), name='gradient')
    iter_number_ = tf.placeholder(dtype=tf.int32, name='iter_number')
    losses_with_name = []

    # Policy densities
    target_log_pdf = pi.pd.logp(ac_)
    behavioral_log_pdf = oldpi.pd.logp(ac_)
    log_ratio = target_log_pdf - behavioral_log_pdf

    # Split operations
    disc_rew_split = tf.stack(tf.split(disc_rew_ * mask_, n_episodes))
    rew_split = tf.stack(tf.split(rew_ * mask_, n_episodes))
    log_ratio_split = tf.stack(tf.split(log_ratio * mask_, n_episodes))
    target_log_pdf_split = tf.stack(tf.split(target_log_pdf * mask_, n_episodes))
    behavioral_log_pdf_split = tf.stack(tf.split(behavioral_log_pdf * mask_, n_episodes))
    mask_split = tf.stack(tf.split(mask_, n_episodes))

    # Renyi divergence
    emp_d2_split = tf.stack(tf.split(pi.pd.renyi(oldpi.pd, 2) * mask_, n_episodes))
    emp_d2_cum_split = tf.reduce_sum(emp_d2_split, axis=1)
    empirical_d2 = tf.reduce_mean(tf.exp(emp_d2_cum_split))

    # Return
    ep_return = clustered_rew_ #tf.reduce_sum(mask_split * disc_rew_split, axis=1)
    if clipping:
        rew_split = tf.clip_by_value(rew_split, -1, 1)

    if center_return:
        ep_return = ep_return - tf.reduce_mean(ep_return)
        rew_split = rew_split - (tf.reduce_sum(rew_split) / (tf.reduce_sum(mask_split) + 1e-24))

    discounter = [pow(gamma, i) for i in range(0, horizon)] # Decreasing gamma
    discounter_tf = tf.constant(discounter)
    disc_rew_split = rew_split * discounter_tf

    #tf.add_to_collection('prints', tf.Print(ep_return, [ep_return], 'ep_return_not_clustered', summarize=20))

    # Reward clustering
    '''
    rew_clustering_options = reward_clustering.split(':')
    if reward_clustering == 'none':
        pass # Do nothing
    elif rew_clustering_options[0] == 'global':
        assert len(rew_clustering_options) == 2, "Reward clustering: Provide the correct number of parameters"
        N = int(rew_clustering_options[1])
        tf.add_to_collection('prints', tf.Print(ep_return, [ep_return], 'ep_return', summarize=20))
        global_rew_min = tf.Variable(float('+inf'), trainable=False)
        global_rew_max = tf.Variable(float('-inf'), trainable=False)
        rew_min = tf.reduce_min(ep_return)
        rew_max = tf.reduce_max(ep_return)
        global_rew_min = tf.assign(global_rew_min, tf.minimum(global_rew_min, rew_min))
        global_rew_max = tf.assign(global_rew_max, tf.maximum(global_rew_max, rew_max))
        interval_size = (global_rew_max - global_rew_min) / N
        ep_return = tf.floordiv(ep_return, interval_size) * interval_size
    elif rew_clustering_options[0] == 'batch':
        assert len(rew_clustering_options) == 2, "Reward clustering: Provide the correct number of parameters"
        N = int(rew_clustering_options[1])
        rew_min = tf.reduce_min(ep_return)
        rew_max = tf.reduce_max(ep_return)
        interval_size = (rew_max - rew_min) / N
        ep_return = tf.floordiv(ep_return, interval_size) * interval_size
    elif rew_clustering_options[0] == 'manual':
        assert len(rew_clustering_options) == 4, "Reward clustering: Provide the correct number of parameters"
        N, rew_min, rew_max = map(int, rew_clustering_options[1:])
        print("N:", N)
        print("Min reward:", rew_min)
        print("Max reward:", rew_max)
        interval_size = (rew_max - rew_min) / N
        print("Interval size:", interval_size)
        # Clip to avoid overflow and cluster
        ep_return = tf.clip_by_value(ep_return, rew_min, rew_max)
        ep_return = tf.cast(tf.floordiv(ep_return, interval_size) * interval_size, tf.float32)
        tf.add_to_collection('prints', tf.Print(ep_return, [ep_return], 'ep_return_clustered', summarize=20))
    else:
        raise Exception('Unrecognized reward clustering scheme.')
    '''

    return_mean = tf.reduce_mean(ep_return)
    return_std = U.reduce_std(ep_return)
    return_max = tf.reduce_max(ep_return)
    return_min = tf.reduce_min(ep_return)
    return_abs_max = tf.reduce_max(tf.abs(ep_return))
    return_step_max = tf.reduce_max(tf.abs(rew_split)) # Max step reward
    return_step_mean = tf.abs(tf.reduce_mean(rew_split))
    positive_step_return_max = tf.maximum(0.0, tf.reduce_max(rew_split))
    negative_step_return_max = tf.maximum(0.0, tf.reduce_max(-rew_split))
    return_step_maxmin = tf.abs(positive_step_return_max - negative_step_return_max)

    losses_with_name.extend([(return_mean, 'InitialReturnMean'),
                             (return_max, 'InitialReturnMax'),
                             (return_min, 'InitialReturnMin'),
                             (return_std, 'InitialReturnStd'),
                             (empirical_d2, 'EmpiricalD2'),
                             (return_step_max, 'ReturnStepMax'),
                             (return_step_maxmin, 'ReturnStepMaxmin')])

    if iw_method == 'pdis':
        # log_ratio_split cumulative sum
        log_ratio_cumsum = tf.cumsum(log_ratio_split, axis=1)
        # Exponentiate
        ratio_cumsum = tf.exp(log_ratio_cumsum)
        # Multiply by the step-wise reward (not episode)
        ratio_reward = ratio_cumsum * disc_rew_split
        # Average on episodes
        ratio_reward_per_episode = tf.reduce_sum(ratio_reward, axis=1)
        w_return_mean = tf.reduce_sum(ratio_reward_per_episode, axis=0) / n_episodes
        # Get d2(w0:t) with mask
        d2_w_0t = tf.exp(tf.cumsum(emp_d2_split, axis=1)) * mask_split # LEAVE THIS OUTSIDE
        # Sum d2(w0:t) over timesteps
        episode_d2_0t = tf.reduce_sum(d2_w_0t, axis=1)
        # Sample variance
        J_sample_variance = (1/(n_episodes-1)) * tf.reduce_sum(tf.square(ratio_reward_per_episode - w_return_mean))
        losses_with_name.append((J_sample_variance, 'J_sample_variance'))
        losses_with_name.extend([(tf.reduce_max(ratio_cumsum), 'MaxIW'),
                                 (tf.reduce_min(ratio_cumsum), 'MinIW'),
                                 (tf.reduce_mean(ratio_cumsum), 'MeanIW'),
                                 (U.reduce_std(ratio_cumsum), 'StdIW'),
                                 (tf.reduce_mean(U.reduce_std(ratio_cumsum, axis=0)), 'StdIW_mean')])
        losses_with_name.extend([(tf.reduce_max(d2_w_0t), 'MaxD2w0t'),
                                 (tf.reduce_min(d2_w_0t), 'MinD2w0t'),
                                 (tf.reduce_mean(d2_w_0t), 'MeanD2w0t'),
                                 (U.reduce_std(d2_w_0t), 'StdD2w0t')])
        '''
        # TMP: adding also IS logging to compare
        iw = tf.exp(tf.reduce_sum(log_ratio_split, axis=1))
        iwn = iw / n_episodes
        IS_w_return_mean = tf.reduce_sum(iwn * ep_return)
        IS_J_sample_variance = (1/(n_episodes-1)) * tf.reduce_sum(tf.square(iw * ep_return - w_return_mean))
        losses_with_name.append((IS_J_sample_variance, 'IS_J_sample_variance'))
        losses_with_name.append((IS_w_return_mean, 'IS_ReturnMeanIW'))
        losses_with_name.extend([(tf.reduce_max(iwn), 'IS_MaxIWNorm'),
                                 (tf.reduce_min(iwn), 'IS_MinIWNorm'),
                                 (tf.reduce_mean(iwn), 'IS_MeanIWNorm'),
                                 (U.reduce_std(iwn), 'IS_StdIWNorm'),
                                 (tf.reduce_max(iw), 'IS_MaxIW'),
                                 (tf.reduce_min(iw), 'IS_MinIW'),
                                 (tf.reduce_mean(iw), 'IS_MeanIW'),
                                 (U.reduce_std(iw), 'IS_StdIW')])
        '''

    elif iw_method == 'is':
        iw = tf.exp(tf.reduce_sum(log_ratio_split, axis=1))
        if iw_norm == 'none':
            iwn = iw / n_episodes
            w_return_mean = tf.reduce_sum(iwn * ep_return)
            J_sample_variance = (1/(n_episodes-1)) * tf.reduce_sum(tf.square(iw * ep_return - w_return_mean))
            losses_with_name.append((J_sample_variance, 'J_sample_variance'))
        elif iw_norm == 'sn':
            iwn = iw / tf.reduce_sum(iw)
            w_return_mean = tf.reduce_sum(iwn * ep_return)
        elif iw_norm == 'regression':
            # Get optimized beta
            mean_iw = tf.reduce_mean(iw)
            beta = tf.reduce_sum((iw - mean_iw) * ep_return * iw) / (tf.reduce_sum((iw - mean_iw) ** 2) + 1e-24)
            # Get the estimator
            w_return_mean = tf.reduce_sum(ep_return * iw + beta * (iw - 1)) / n_episodes
        else:
            raise NotImplementedError()
        ess_classic = tf.linalg.norm(iw, 1) ** 2 / tf.linalg.norm(iw, 2) ** 2
        sqrt_ess_classic = tf.linalg.norm(iw, 1) / tf.linalg.norm(iw, 2)
        ess_renyi = n_episodes / empirical_d2
        losses_with_name.extend([(tf.reduce_max(iwn), 'MaxIWNorm'),
                                 (tf.reduce_min(iwn), 'MinIWNorm'),
                                 (tf.reduce_mean(iwn), 'MeanIWNorm'),
                                 (U.reduce_std(iwn), 'StdIWNorm'),
                                 (tf.reduce_max(iw), 'MaxIW'),
                                 (tf.reduce_min(iw), 'MinIW'),
                                 (tf.reduce_mean(iw), 'MeanIW'),
                                 (U.reduce_std(iw), 'StdIW'),
                                 (ess_classic, 'ESSClassic'),
                                 (ess_renyi, 'ESSRenyi')])
    elif iw_method == 'rbis':
        # Get pdfs for episodes
        target_log_pdf_episode = tf.reduce_sum(target_log_pdf_split, axis=1)
        behavioral_log_pdf_episode = tf.reduce_sum(behavioral_log_pdf_split, axis=1)
        # Normalize log probabilities (to avoid overflows as much as possible)
        normalization_factor = tf.reduce_mean(tf.stack([target_log_pdf_episode, behavioral_log_pdf_episode]))
        target_norm_log_pdf_episode = target_log_pdf_episode - normalization_factor
        behavioral_norm_log_pdf_episode = behavioral_log_pdf_episode - normalization_factor
        # Exponentiate
        target_pdf_episode = tf.clip_by_value(tf.cast(tf.exp(target_norm_log_pdf_episode), tf.float64), 1e-300, 1e+300)
        behavioral_pdf_episode = tf.clip_by_value(tf.cast(tf.exp(behavioral_norm_log_pdf_episode), tf.float64), 1e-300, 1e+300)
        tf.add_to_collection('asserts', tf.assert_positive(target_pdf_episode, name='target_pdf_positive'))
        tf.add_to_collection('asserts', tf.assert_positive(behavioral_pdf_episode, name='behavioral_pdf_positive'))
        # Compute the merging matrix (reward-clustering) and the number of clusters
        reward_unique, reward_indexes = tf.unique(ep_return)
        episode_clustering_matrix = tf.cast(tf.one_hot(reward_indexes, n_episodes), tf.float64)
        max_index = tf.reduce_max(reward_indexes) + 1
        trajectories_per_cluster = tf.reduce_sum(episode_clustering_matrix, axis=0)[:max_index]
        tf.add_to_collection('asserts', tf.assert_positive(tf.reduce_sum(episode_clustering_matrix, axis=0)[:max_index], name='clustering_matrix'))
        # Get the clustered pdfs
        clustered_target_pdf = tf.matmul(tf.reshape(target_pdf_episode, (1, -1)), episode_clustering_matrix)[0][:max_index]
        clustered_behavioral_pdf = tf.matmul(tf.reshape(behavioral_pdf_episode, (1, -1)), episode_clustering_matrix)[0][:max_index]
        tf.add_to_collection('asserts', tf.assert_positive(clustered_target_pdf, name='clust_target_pdf_positive'))
        tf.add_to_collection('asserts', tf.assert_positive(clustered_behavioral_pdf, name='clust_behavioral_pdf_positive'))
        # Compute the J
        ratio_clustered = clustered_target_pdf / clustered_behavioral_pdf
        #ratio_reward = tf.cast(ratio_clustered, tf.float32) * reward_unique                                                  # ---- No cluster cardinality
        ratio_reward = tf.cast(ratio_clustered, tf.float32) * reward_unique * tf.cast(trajectories_per_cluster, tf.float32)   # ---- Cluster cardinality
        #w_return_mean = tf.reduce_sum(ratio_reward) / tf.cast(max_index, tf.float32)                                         # ---- No cluster cardinality
        w_return_mean = tf.reduce_sum(ratio_reward) / tf.cast(n_episodes, tf.float32)                                         # ---- Cluster cardinality
        # Divergences
        ess_classic = tf.linalg.norm(ratio_reward, 1) ** 2 / tf.linalg.norm(ratio_reward, 2) ** 2
        sqrt_ess_classic = tf.linalg.norm(ratio_reward, 1) / tf.linalg.norm(ratio_reward, 2)
        ess_renyi = n_episodes / empirical_d2
        # Summaries
        losses_with_name.extend([(tf.reduce_max(ratio_clustered), 'MaxIW'),
                                 (tf.reduce_min(ratio_clustered), 'MinIW'),
                                 (tf.reduce_mean(ratio_clustered), 'MeanIW'),
                                 (U.reduce_std(ratio_clustered), 'StdIW'),
                                 (1-(max_index / n_episodes), 'RewardCompression'),
                                 (ess_classic, 'ESSClassic'),
                                 (ess_renyi, 'ESSRenyi')])
    else:
        raise NotImplementedError()

    if bound == 'J':
        bound_ = w_return_mean
    elif bound == 'std-d2':
        bound_ = w_return_mean - tf.sqrt((1 - delta) / (delta * ess_renyi)) * return_std
    elif bound == 'max-d2':
        var_estimate = tf.sqrt((1 - delta) / (delta * ess_renyi)) * return_abs_max
        bound_ = w_return_mean - tf.sqrt((1 - delta) / (delta * ess_renyi)) * return_abs_max
    elif bound == 'max-ess':
        bound_ = w_return_mean - tf.sqrt((1 - delta) / delta) / sqrt_ess_classic * return_abs_max
    elif bound == 'std-ess':
        bound_ = w_return_mean - tf.sqrt((1 - delta) / delta) / sqrt_ess_classic * return_std
    elif bound == 'pdis-max-d2':
        # Discount factor
        if gamma >= 1:
            discounter = [float(1+2*(horizon-t-1)) for t in range(0, horizon)]
        else:
            def f(t):
                return pow(gamma, 2*t) + (2*pow(gamma,t)*(pow(gamma, t+1) - pow(gamma, horizon))) / (1-gamma)
            discounter = [f(t) for t in range(0, horizon)]
        discounter_tf = tf.constant(discounter)
        mean_episode_d2 = tf.reduce_sum(d2_w_0t, axis=0) / (tf.reduce_sum(mask_split, axis=0) + 1e-24)
        discounted_d2 = mean_episode_d2 * discounter_tf # Discounted d2
        discounted_total_d2 = tf.reduce_sum(discounted_d2, axis=0) # Sum over time
        bound_ = w_return_mean - tf.sqrt((1-delta) * discounted_total_d2 / (delta*n_episodes)) * return_step_max
    elif bound == 'pdis-mean-d2':
        # Discount factor
        if gamma >= 1:
            discounter = [float(1+2*(horizon-t-1)) for t in range(0, horizon)]
        else:
            def f(t):
                return pow(gamma, 2*t) + (2*pow(gamma,t)*(pow(gamma, t+1) - pow(gamma, horizon))) / (1-gamma)
            discounter = [f(t) for t in range(0, horizon)]
        discounter_tf = tf.constant(discounter)
        mean_episode_d2 = tf.reduce_sum(d2_w_0t, axis=0) / (tf.reduce_sum(mask_split, axis=0) + 1e-24)
        discounted_d2 = mean_episode_d2 * discounter_tf # Discounted d2
        discounted_total_d2 = tf.reduce_sum(discounted_d2, axis=0) # Sum over time
        bound_ = w_return_mean - tf.sqrt((1-delta) * discounted_total_d2 / (delta*n_episodes)) * return_step_mean
    else:
        raise NotImplementedError()

    # Policy entropy for exploration
    ent = pi.pd.entropy()
    meanent = tf.reduce_mean(ent)
    losses_with_name.append((meanent, 'MeanEntropy'))
    # Add policy entropy bonus
    if entropy != 'none':
        scheme, v1, v2 = entropy.split(':')
        if scheme == 'step':
            entcoeff = tf.cond(iter_number_ < int(v2), lambda: float(v1), lambda: float(0.0))
            losses_with_name.append((entcoeff, 'EntropyCoefficient'))
            entbonus = entcoeff * meanent
            bound_ = bound_ + entbonus
        elif scheme == 'lin':
            ip = tf.cast(iter_number_ / max_iters, tf.float32)
            entcoeff_decay = tf.maximum(0.0, float(v2) + (float(v1) - float(v2)) * (1.0 - ip))
            losses_with_name.append((entcoeff_decay, 'EntropyCoefficient'))
            entbonus = entcoeff_decay * meanent
            bound_ = bound_ + entbonus
        elif scheme == 'exp':
            ent_f = tf.exp(-tf.abs(tf.reduce_mean(iw) - 1) * float(v2)) * float(v1)
            losses_with_name.append((ent_f, 'EntropyCoefficient'))
            bound_ = bound_ + ent_f * meanent
        else:
            raise Exception('Unrecognized entropy scheme.')

    losses_with_name.append((w_return_mean, 'ReturnMeanIW'))
    losses_with_name.append((bound_, 'Bound'))
    losses, loss_names = map(list, zip(*losses_with_name))

    if use_natural_gradient:
        p = tf.placeholder(dtype=tf.float32, shape=[None])
        target_logpdf_episode = tf.reduce_sum(target_log_pdf_split * mask_split, axis=1)
        grad_logprob = U.flatgrad(tf.stop_gradient(iwn) * target_logpdf_episode, var_list)
        dot_product = tf.reduce_sum(grad_logprob * p)
        hess_logprob = U.flatgrad(dot_product, var_list)
        compute_linear_operator = U.function([p, ob_, ac_, disc_rew_, mask_], [-hess_logprob])

    assign_old_eq_new = U.function([], [], updates=[tf.assign(oldv, newv)
                for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())])

    assert_ops = tf.group(*tf.get_collection('asserts'))
    print_ops = tf.group(*tf.get_collection('prints'))

    compute_lossandgrad = U.function([ob_, ac_, rew_, disc_rew_, clustered_rew_, mask_, iter_number_], losses + [U.flatgrad(bound_, var_list), assert_ops, print_ops])
    compute_grad = U.function([ob_, ac_, rew_, disc_rew_, clustered_rew_, mask_, iter_number_], [U.flatgrad(bound_, var_list), assert_ops, print_ops])
    compute_bound = U.function([ob_, ac_, rew_, disc_rew_, clustered_rew_, mask_, iter_number_], [bound_, assert_ops, print_ops])
    compute_losses = U.function([ob_, ac_, rew_, disc_rew_, clustered_rew_, mask_, iter_number_], losses)
    #compute_temp = U.function([ob_, ac_, rew_, disc_rew_, mask_], [ratio_cumsum, discounted_ratio])

    set_parameter = U.SetFromFlat(var_list)
    get_parameter = U.GetFlat(var_list)

    if sampler is None:  # fall back to a simple on-policy sequential sampler
        seg_gen = traj_segment_generator(pi, env, n_episodes, horizon, stochastic=True)
        sampler = type("SequentialSampler", (object,), {"collect": lambda self, _: seg_gen.__next__()})()

    U.initialize()

    # Starting optimizing

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=n_episodes)
    rewbuffer = deque(maxlen=n_episodes)

    while True:

        iters_so_far += 1

        if render_after is not None and iters_so_far % render_after == 0:
            if hasattr(env, 'render'):
                render(env, pi, horizon)

        if callback:
            callback(locals(), globals())

        if iters_so_far >= max_iters:
            print('Finished...')
            break

        logger.log('********** Iteration %i ************' % iters_so_far)

        theta = get_parameter()

        with timed('sampling'):
            seg = sampler.collect(theta)

        add_disc_rew(seg, gamma)

        lens, rets = seg['ep_lens'], seg['ep_rets']
        lenbuffer.extend(lens)
        rewbuffer.extend(rets)
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)

        #print('------')
        #print(np.reshape(seg['ob'], (n_episodes, horizon, -1))[:,:,0])
        #print(np.reshape(seg['mask'], (n_episodes, horizon)))


        # Get clustered reward
        reward_matrix = np.reshape(seg['disc_rew'] * seg['mask'], (n_episodes, horizon))
        ep_reward = np.sum(reward_matrix, axis=1)
        ep_reward = cluster_rewards(ep_reward, reward_clustering)

        args = ob, ac, rew, disc_rew, clustered_rew, mask, iter_number = (
            seg['ob'], seg['ac'], seg['rew'], seg['disc_rew'],
            ep_reward, seg['mask'], iters_so_far)

        assign_old_eq_new()

        def evaluate_loss():
            loss = compute_bound(*args)
            return loss[0]

        def evaluate_gradient():
            gradient = compute_grad(*args)
            return gradient[0]

        if use_natural_gradient:
            def evaluate_fisher_vector_prod(x):
                return compute_linear_operator(x, *args)[0] + fisher_reg * x

            def evaluate_natural_gradient(g):
                return cg(evaluate_fisher_vector_prod, g, cg_iters=10, verbose=0)
        else:
            evaluate_natural_gradient = None

        with timed('summaries before'):
            logger.record_tabular("Iteration", iters_so_far)
            logger.record_tabular("InitialBound", evaluate_loss())
            logger.record_tabular("EpLenMean", np.mean(lenbuffer))
            logger.record_tabular("EpRewMean", np.mean(rewbuffer))
            logger.record_tabular("EpThisIter", len(lens))
            logger.record_tabular("EpisodesSoFar", episodes_so_far)
            logger.record_tabular("TimestepsSoFar", timesteps_so_far)
            logger.record_tabular("TimeElapsed", time.time() - tstart)

        if save_weights > 0 and iters_so_far % save_weights == 0:
            logger.record_tabular('Weights', str(get_parameter()))
            import pickle
            with open('checkpoint' + str(iters_so_far) + '.pkl', 'wb') as file:
                pickle.dump(theta, file)

        with timed("offline optimization"):
            theta, improvement = optimize_offline(theta,
                                                  set_parameter,
                                                  line_search,
                                                  evaluate_loss,
                                                  evaluate_gradient,
                                                  evaluate_natural_gradient,
                                                  max_offline_ite=max_offline_iters)

        set_parameter(theta)

        with timed('summaries after'):
            meanlosses = np.array(compute_losses(*args))
            for (lossname, lossval) in zip(loss_names, meanlosses):
                logger.record_tabular(lossname, lossval)

        logger.dump_tabular()

    env.close()
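
A side note on the estimator above: with iw_method='is', the per-episode importance weight is the exponential of the summed log-ratios, either divided by n_episodes ('none') or self-normalized ('sn'). The NumPy sketch below reproduces just that arithmetic on made-up log-ratios and returns; it illustrates the estimator, not this function's TF graph.

import numpy as np

def is_return_estimate(log_ratio, ep_return, iw_norm='none'):
    """log_ratio: (n_episodes, horizon) masked target-vs-behavioral log densities;
    ep_return: (n_episodes,) episodic returns."""
    iw = np.exp(log_ratio.sum(axis=1))   # one importance weight per episode
    if iw_norm == 'sn':                  # self-normalized IS
        iwn = iw / iw.sum()
    else:                                # plain IS
        iwn = iw / len(iw)
    return float(np.sum(iwn * ep_return))

rng = np.random.default_rng(0)
log_ratio = rng.normal(0.0, 0.01, size=(10, 50))
ep_return = rng.normal(1.0, 0.1, size=10)
print(is_return_estimate(log_ratio, ep_return, 'sn'))
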
コード例 #7
0
def learn(env,
          policy,
          vf,
          gamma,
          lam,
          timesteps_per_batch,
          num_timesteps,
          animate=False,
          callback=None,
          desired_kl=0.002,
          lr=0.03,
          momentum=0.9):
    ob_dim, ac_dim = policy.ob_dim, policy.ac_dim
    dbpi = GaussianMlpPolicy(ob_dim, ac_dim, 'dbp')
    oldpi = GaussianMlpPolicy(ob_dim, ac_dim, 'oe')
    dboldpi = GaussianMlpPolicy(ob_dim, ac_dim, 'doi')
    # with tf.variable_scope('dbp'):
    # with tf.variable_scope('oe'):
    # with tf.variable_scope('doi'):

    pi = policy

    do_std = U.function([], [pi.std_1a, pi.logstd_1a])

    kloldnew = oldpi.pd.kl(pi.pd)
    dbkloldnew = dboldpi.pd.kl(dbpi.pd)
    dist = meankl = tf.reduce_mean(kloldnew)
    dbkl = tf.reduce_mean(dbkloldnew)
    obfilter = ZFilter(env.observation_space.shape)

    max_pathlength = env.spec.timestep_limit
    stepsize = tf.Variable(initial_value=np.float32(np.array(lr)),
                           name='stepsize')
    inputs, loss, loss_sampled = policy.update_info

    var_list = [v for v in tf.global_variables() if "pi" in v.name]
    db_var_list = [v for v in tf.global_variables() if "dbp" in v.name]
    old_var_list = [v for v in tf.global_variables() if "oe" in v.name]
    db_old_var_list = [v for v in tf.global_variables() if "doi" in v.name]
    print(len(var_list), len(db_var_list), len(old_var_list),
          len(db_old_var_list))
    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv, newv) in zipsame(old_var_list, var_list)
        ])
    assign_db = U.function(
        [], [],
        updates=[
            tf.assign(db, o) for (db, o) in zipsame(db_var_list, var_list)
        ] + [
            tf.assign(dbold, dbnew)
            for (dbold, dbnew) in zipsame(db_old_var_list, old_var_list)
        ])

    assign_old_eq_newr = U.function(
        [], [],
        updates=[
            tf.assign(newv, oldv)
            for (oldv, newv) in zipsame(old_var_list, var_list)
        ])
    # assign_dbr = U.function([], [], updates=
    # [tf.assign(o, db) for (db, o) in zipsame(db_var_list, var_list)] +
    # [tf.assign(dbnew, dbold) for (dbold, dbnew) in zipsame(db_old_var_list, old_var_list)])

    klgrads = tf.gradients(dist, var_list)
    dbklgrads = tf.gradients(dbkl, db_var_list)
    p_grads = [tf.ones_like(v) for v in dbklgrads]

    get_flat = U.GetFlat(var_list)
    get_old_flat = U.GetFlat(old_var_list)
    set_from_flat = U.SetFromFlat(var_list)

    flat_tangent2 = tf.placeholder(dtype=tf.float32,
                                   shape=[None],
                                   name="flat_tan2")
    shapes = [var.get_shape().as_list() for var in var_list]
    start = 0
    tangents2 = []
    for shape in shapes:
        sz = U.intprod(shape)
        tangents2.append(tf.reshape(flat_tangent2[start:start + sz], shape))
        start += sz
    # gvp2 is the gradient-vector product of the KL between the duplicated
    # policies; differentiating it again below gives Hessian-vector products.
    gvp2 = tf.add_n([
        tf.reduce_sum(g * tangent2)
        for (g, tangent2) in zipsame(dbklgrads, tangents2)
    ])
    gvp2_grads = tf.gradients(gvp2, db_var_list)

    neg_term = tf.add_n([
        tf.reduce_sum(g * tangent2)
        for (g, tangent2) in zipsame(gvp2_grads, tangents2)
    ]) / 2.
    ng1 = tf.gradients(neg_term, db_var_list)
    ng2 = tf.gradients(neg_term, db_old_var_list)

    neg_term_grads = [
        a + b for (a, b) in zip(tf.gradients(neg_term, db_var_list),
                                tf.gradients(neg_term, db_old_var_list))
    ]
    neg_term = neg_term_grads
    # neg_term = tf.concat(axis=0, values=[tf.reshape(v, [U.numel(v)]) for v in neg_term_grads])

    pos_term = tf.add_n([
        tf.reduce_sum(g * tangent)
        for (g, tangent) in zipsame(gvp2_grads, p_grads)
    ])
    pos_term_grads = [
        a + b for (a, b) in zip(tf.gradients(pos_term, db_var_list),
                                tf.gradients(pos_term, db_old_var_list))
    ]
    pos_term_sum = tf.add_n([
        tf.reduce_sum(g * tangent)
        for (g, tangent) in zipsame(pos_term_grads, tangents2)
    ])
    pos_term_grads = tf.gradients(pos_term_sum, p_grads)
    pos_term = pos_term_grads
    # pos_term = tf.concat(axis=0, values=[tf.reshape(v, [U.numel(v)]) for v in pos_term_grads])
    geo_term = [(p - n) * 0.5 for p, n in zip(pos_term, neg_term)]

    optim = kfac.KfacOptimizer(learning_rate=stepsize, cold_lr=stepsize*(1-0.9), momentum=momentum,
                               kfac_update=2, epsilon=1e-2, stats_decay=0.99,
                               async_=1,  # 'async' is reserved in Python 3.7+; newer baselines KFAC uses 'async_'
                               cold_iter=1, weight_decay_dict=policy.wd_dict, max_grad_norm=None)
    pi_var_list = []
    for var in tf.trainable_variables():
        if "pi" in var.name:
            pi_var_list.append(var)

    grads = optim.compute_gradients(loss, var_list=pi_var_list)
    update_op, q_runner = optim.minimize(loss,
                                         loss_sampled,
                                         var_list=pi_var_list)
    geo_term = [g1 + g2[0] for g1, g2 in zip(geo_term, grads)]
    geo_grads = list(zip(geo_term, var_list))
    update_geo_op, q_runner_geo = optim.apply_gradients(geo_grads)
    do_update = U.function(inputs, update_op)
    inputs_tangent = list(inputs) + [flat_tangent2]
    do_update_geo = U.function(inputs_tangent, update_geo_op)
    do_get_geo_term = U.function(inputs_tangent, [ng1, ng2])
    U.initialize()

    # start queue runners
    enqueue_threads = []
    coord = tf.train.Coordinator()
    for qr in [q_runner, vf.q_runner, q_runner_geo]:
        assert qr is not None
        enqueue_threads.extend(
            qr.create_threads(tf.get_default_session(),
                              coord=coord,
                              start=True))

    i = 0
    timesteps_so_far = 0
    while True:
        if timesteps_so_far > num_timesteps:
            break
        logger.log("********** Iteration %i ************" % i)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            path = rollout(env,
                           policy,
                           max_pathlength,
                           animate=(len(paths) == 0 and (i % 10 == 0)
                                    and animate),
                           obfilter=obfilter)
            paths.append(path)
            n = pathlength(path)
            timesteps_this_batch += n
            timesteps_so_far += n
            if timesteps_this_batch > timesteps_per_batch:
                break

        # Estimate advantage function
        vtargs = []
        advs = []
        for path in paths:
            rew_t = path["reward"]
            return_t = common.discount(rew_t, gamma)
            vtargs.append(return_t)
            vpred_t = vf.predict(path)
            vpred_t = np.append(vpred_t,
                                0.0 if path["terminated"] else vpred_t[-1])
            delta_t = rew_t + gamma * vpred_t[1:] - vpred_t[:-1]
            adv_t = common.discount(delta_t, gamma * lam)
            advs.append(adv_t)
        # Update value function
        vf.fit(paths, vtargs)

        # Build arrays for policy update
        ob_no = np.concatenate([path["observation"] for path in paths])
        action_na = np.concatenate([path["action"] for path in paths])
        oldac_dist = np.concatenate([path["action_dist"] for path in paths])
        adv_n = np.concatenate(advs)
        standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8)

        assign_old_eq_new()  # set old parameter values to new parameter values
        assign_db()

        # Policy update
        do_update(ob_no, action_na, standardized_adv_n)
        # ft2 = get_flat() - get_old_flat()

        # assign_old_eq_newr() # assign back
        # gnp = do_get_geo_term(ob_no, action_na, standardized_adv_n, ft2)

        # def check_nan(bs):
        #     return [~np.isnan(b).all() for b in bs]

        # print(gnp[0])
        # print('.....asdfasdfadslfkadsjfaksdfalsdkfjaldskf')
        # print(gnp[1])
        # do_update_geo(ob_no, action_na, standardized_adv_n, ft2)

        min_stepsize = np.float32(1e-8)
        max_stepsize = np.float32(1e0)
        # Adjust stepsize
        kl = policy.compute_kl(ob_no, oldac_dist)
        # if kl > desired_kl * 2:
        #     logger.log("kl too high")
        #     tf.assign(stepsize, tf.maximum(min_stepsize, stepsize / 1.5)).eval()
        # elif kl < desired_kl / 2:
        #     logger.log("kl too low")
        #     tf.assign(stepsize, tf.minimum(max_stepsize, stepsize * 1.5)).eval()
        # else:
        #     logger.log("kl just right!")

        logger.record_tabular(
            "EpRewMean", np.mean([path["reward"].sum() for path in paths]))
        logger.record_tabular(
            "EpRewSEM",
            np.std([
                path["reward"].sum() / np.sqrt(len(paths)) for path in paths
            ]))
        logger.record_tabular("EpLenMean",
                              np.mean([pathlength(path) for path in paths]))
        logger.record_tabular("KL", kl)
        print(do_std())
        if callback:
            callback()
        logger.dump_tabular()
        i += 1

    coord.request_stop()
    coord.join(enqueue_threads)
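
The advantage estimation step in the loop above (one-step TD residuals, then a discounted sum with gamma*lam, i.e. GAE) can be reproduced stand-alone. In the sketch below, discount is a plain reimplementation of what the listing's common.discount helper is assumed to do, and the reward and value arrays are made up.

import numpy as np

def discount(x, gamma):
    # Reverse-time discounted cumulative sum: out[t] = x[t] + gamma * out[t+1]
    out = np.zeros(len(x))
    acc = 0.0
    for t in reversed(range(len(x))):
        acc = x[t] + gamma * acc
        out[t] = acc
    return out

def gae(rew_t, vpred_t, gamma, lam, terminated):
    # Bootstrap with 0 on termination, else repeat the last value prediction
    vpred = np.append(vpred_t, 0.0 if terminated else vpred_t[-1])
    delta_t = rew_t + gamma * vpred[1:] - vpred[:-1]
    return discount(delta_t, gamma * lam)

rew = np.ones(5)
vpred = np.linspace(0.5, 1.0, 5)
print(gae(rew, vpred, gamma=0.99, lam=0.97, terminated=True))
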
コード例 #8
0
    def train(self, samples_data, normalize_rewards=False):

        rewards = samples_data["rewards"]
        reward_list = samples_data["reward_list"]
        actions = samples_data["actions"]
        timesteps = samples_data["timesteps"]
        actions_one_hot = samples_data["actions_one_hot"]
        wins = samples_data.get("wins", 0)
        observations = samples_data["observations"]
        obs_flat_and_padded = flat_and_pad(
            samples_data["paths_full"]["states"])
        next_states_flat_and_padded = flat_and_pad(
            samples_data["paths_full"]["next_states_centred"])

        # preprocess rewards
        if normalize_rewards:
            mean_rew = np.mean(rewards)
            std_rew = np.maximum(np.std(rewards), 1e-5)
            for i in range(len(samples_data["paths_full"]["rewards"])):
                samples_data["paths_full"]["rewards"][i] = (
                    samples_data["paths_full"]["rewards"][i] -
                    mean_rew) / std_rew

        rew_flat_and_padded = flat_and_pad(
            samples_data["paths_full"]["rewards"])
        mask = rew_flat_and_padded != 0
        actions_one_hot_flat_and_padded = flat_and_pad(
            samples_data["paths_full"]["actions_one_hot"])
        actions = np.zeros((obs_flat_and_padded.shape[0], 1))
        actions = np.hstack((actions - 1, actions + 1))

        omega_before = self.sess.run(self.model.get_omega())
        variables_before = U.GetFlat(self.policy.trainable_vars)()

        inputs_dict = {
            self.rewards_ph: rew_flat_and_padded,
            self.actions_one_hot_ph: actions_one_hot_flat_and_padded,
            self.observations_ph: obs_flat_and_padded,
            self.next_states_ph: next_states_flat_and_padded,
            self.actions_ph: actions,
            self.returns_ph: reward_list,
            self.timesteps_ph: timesteps,
            self.mask: mask,
        }

        inputs_dict.update(self.model.get_feed_dict())

        _, summary_str, ac_prob, model_prob, log_prob = self.sess.run(
            [
                self.train_op,
                self.summarize,
                self.policy_tf,
                self.model_tf,
                self.log_prob,
            ],
            feed_dict=inputs_dict,
        )
        self.global_step += 1
        self.summary_writer.add_summary(summary_str, self.global_step)

        omega = self.sess.run(self.model.get_omega())
        variables_after = U.GetFlat(self.policy.trainable_vars)()

        delta_variables = variables_after - variables_before
        norm_delta_var = np.linalg.norm(delta_variables)
        delta_omega = omega - omega_before
        norm_delta_omega = np.linalg.norm(delta_omega)
        # theta = self.sess.run(self.policy.getTheta())
        # record all
        logger.record_tabular("ITERATIONS", self.iteration)
        # logger.record_tabular("Theta", theta)
        logger.record_tabular("OmegaBefore", omega_before)
        logger.record_tabular("Omega", omega)
        logger.record_tabular("NormDeltaOmega", norm_delta_omega)
        logger.record_tabular("NormDeltaVar", norm_delta_var)
        logger.record_tabular("DeltaOmega", delta_omega)
        logger.record_tabular("ReturnsMean", np.mean(reward_list))
        logger.record_tabular("ReturnsStd", np.std(reward_list))
        logger.record_tabular("RewardMean", np.mean(rewards))
        logger.record_tabular("RewardStd", np.std(rewards))
        logger.record_tabular("TimestepsMean", np.mean(timesteps))
        logger.record_tabular("TimestepsStd", np.std(timesteps))
        logger.record_tabular("Wins", wins)
        logger.record_tabular("Traj", samples_data["traj"])
        logger.record_tabular("ConfortViolation",
                              samples_data["confort_violation"])
        logger.dump_tabular()
        self.iteration += 1
        return omega
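
The bookkeeping in this method (flatten the trainable variables before and after a step, then log the norm of the difference) is a generic diagnostic for how far a single update moves the policy. A toy version, where get_flat_params and train_step are hypothetical stand-ins for U.GetFlat and the session run:

import numpy as np

class ToyTrainer:
    def __init__(self):
        self.params = np.zeros(4)        # stand-in for the flat policy variables
        self.rng = np.random.default_rng(0)

    def get_flat_params(self):           # stand-in for U.GetFlat(...)()
        return self.params.copy()

    def train_step(self):                # stand-in for sess.run(train_op, ...)
        self.params += self.rng.normal(0.0, 0.1, size=self.params.shape)

trainer = ToyTrainer()
before = trainer.get_flat_params()
trainer.train_step()
after = trainer.get_flat_params()
print('NormDeltaVar:', np.linalg.norm(after - before))
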
コード例 #9
0
def learn(make_env,
          make_policy,
          horizon,
          gamma=0.99,
          max_iters=1000,
          filename=None,
          grid_size=100,
          feature_fun=None,
          plot_bound=False):

    # Build the environment
    env = make_env()
    ob_space = env.observation_space
    ac_space = env.action_space

    # Build the higher level policy
    pi = make_policy('pi', ob_space, ac_space)

    # Get all pi's learnable parameters
    all_var_list = pi.get_trainable_variables()
    var_list = \
        [v for v in all_var_list if v.name.split('/')[1].startswith('higher')]

    # TF functions
    set_parameters = U.SetFromFlat(var_list)
    get_parameters = U.GetFlat(var_list)

    # Generate the grid of parameters to evaluate
    gain_grid = np.linspace(-1, 1, grid_size)
    rho = get_parameters()
    std_too = (len(rho) == 2)
    if std_too:
        grid_size_std = int(grid_size)
        logstd_grid = np.linspace(-4, 0, grid_size_std)
        x, y = np.meshgrid(gain_grid, logstd_grid)
        X = x.reshape((np.prod(x.shape),))
        Y = y.reshape((np.prod(y.shape),))
        rho_grid = list(zip(X, Y))
    else:
        rho_grid = [[x] for x in gain_grid]

    # initialize loop variables
    n_selections = np.zeros(len(rho_grid))
    ret_sums = np.zeros(len(rho_grid))
    regret = 0
    iter = 0

    # Learning loop
    tstart = time.time()
    while True:
        iter += 1

        # Exit the loop once max_iters is reached
        if iter - 1 >= max_iters:
            print('Finished...')
            break

        # Learning iteration
        logger.log('********** Iteration %i ************' % iter)

        ub = []
        ub_best = 0
        i_best = 0
        average_ret = []
        bonus = []
        for i, rho in enumerate(rho_grid):
            if n_selections[i] > 0:
                average_ret_rho = ret_sums[i] / n_selections[i]
                bonus_rho = np.sqrt(2 * np.log(iter) / n_selections[i])
                ub_rho = average_ret_rho + bonus_rho
                ub.append(ub_rho)
                if not std_too:
                    average_ret.append(average_ret_rho)
                    bonus.append(bonus_rho)
            else:
                ub_rho = 1e100
                ub.append(ub_rho)
                average_ret.append(0)
                bonus.append(1e100)
            if ub_rho > ub_best:
                ub_best = ub_rho
                rho_best = rho
                i_best = i
        # Sample actor's parameters from chosen arm
        set_parameters(rho_best)
        _ = pi.resample()

        # Sample a trajectory with the newly parametrized actor
        _, disc_ret, _ = eval_trajectory(
            env, pi, gamma, horizon, feature_fun)
        ret_sums[i_best] += disc_ret
        regret += (0.96512 - disc_ret)  # regret w.r.t. a fixed reference (optimal) return
        n_selections[i_best] += 1

        # Store info about variables of interest
        if env.spec.id == 'LQG1D-v0':
            mu1_actor = pi.eval_actor_mean([[1]])[0][0]
            mu1_higher = pi.eval_higher_mean()[0]
            sigma = pi.eval_higher_std()[0]
            logger.record_tabular("LQGmu1_actor", mu1_actor)
            logger.record_tabular("LQGmu1_higher", mu1_higher)
            logger.record_tabular("LQGsigma_higher", sigma)
        logger.record_tabular("ReturnLastEpisode", disc_ret)
        logger.record_tabular("ReturnMean", sum(ret_sums) / iter)
        logger.record_tabular("Regret", regret)
        logger.record_tabular("Regret/t", regret / iter)
        logger.record_tabular("Iteration", iter)
        logger.record_tabular("TimeElapsed", time.time() - tstart)

        # Plot the profile of the bound and its components
        if plot_bound:
            if std_too:
                ub = np.array(ub).reshape((grid_size_std, grid_size))
                plot3D_bound_profile(x, y, ub, rho_best, ub_best,
                                     iter, filename)
            else:
                plot_bound_profile(gain_grid, ub, average_ret, bonus, rho_best,
                                   ub_best, iter, filename)
        # Print all info in a table
        logger.dump_tabular()

    # Close environment in the end
    env.close()
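
Stripped of the policy evaluation, the arm selection above is the standard UCB1 rule: pick the grid point maximizing average return plus sqrt(2*log(iter)/n_selections), with the 1e100 sentinel forcing each unvisited arm to be tried once. A self-contained toy run follows (the Gaussian arm rewards are made up):

import numpy as np

rng = np.random.default_rng(0)
true_means = np.array([0.1, 0.5, 0.9])   # stand-in for the per-rho returns
n_arms = len(true_means)
n_sel = np.zeros(n_arms)
ret_sums = np.zeros(n_arms)

for t in range(1, 501):
    ub = np.where(n_sel > 0,
                  ret_sums / np.maximum(n_sel, 1)
                  + np.sqrt(2 * np.log(t) / np.maximum(n_sel, 1)),
                  1e100)                  # unplayed arms get pulled first
    best = int(np.argmax(ub))
    ret_sums[best] += rng.normal(true_means[best], 0.1)
    n_sel[best] += 1

print('selections per arm:', n_sel)       # mass concentrates on the best arm
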
コード例 #10
0
def learn(
        env,
        test_env,
        policy_fn,
        *,
        timesteps_per_actorbatch,  # timesteps per actor per update
        clip_param,
        entcoeff,  # clipping parameter epsilon, entropy coeff
        optim_epochs,
        optim_stepsize,
        optim_batchsize,  # optimization hypers
        gamma,
        lam,  # advantage estimation
        # CMA-ES
        max_fitness,  # has to be negative, as CMA-ES considers minimization
        popsize,
        gensize,
        bounds,
        sigma,
        eval_iters,
        max_v_train_iter,
        max_timesteps=0,
        max_episodes=0,
        max_iters=0,
        max_seconds=0,
        # time constraint
        callback=None,
        # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant',
        # annealing for stepsize parameters (epsilon and adam)
        seed,
        env_id):
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_fn("pi", ob_space,
                   ac_space)  # Construct network for new policy
    oldpi = policy_fn("oldpi", ob_space, ac_space)  # Network for old policy
    backup_pi = policy_fn(
        "backup_pi", ob_space, ac_space
    )  # Construct a network for every individual to adapt during the es evolution

    reward = tf.placeholder(dtype=tf.float32, shape=[None])  # step rewards
    pi_params = tf.placeholder(dtype=tf.float32, shape=[None])
    old_pi_params = tf.placeholder(dtype=tf.float32, shape=[None])
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule

    bound_coeff = tf.placeholder(
        name='bound_coeff', dtype=tf.float32,
        shape=[])  # coefficient for the policy bound

    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    next_ob = U.get_placeholder_cached(
        name="next_ob")  # next step observation for updating q function
    ac = U.get_placeholder_cached(
        name="act")  # action placeholder for computing q function
    mean_ac = U.get_placeholder_cached(
        name="mean_act")  # action placeholder for computing q function

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-entcoeff) * meanent

    pi_adv = pi.qpred - pi.vpred
    adv_mean, adv_var = tf.nn.moments(pi_adv, axes=[0])
    normalized_pi_adv = (pi_adv - adv_mean) / tf.sqrt(adv_var)

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * normalized_pi_adv  # surrogate from conservative policy iteration
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param,
                             1.0 + clip_param) * normalized_pi_adv  #
    pol_surr = -tf.reduce_mean(tf.minimum(
        surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)

    qf_loss = tf.reduce_mean(
        tf.square(reward + gamma * pi.mean_qpred - pi.qpred))

    # qf_loss = tf.reduce_mean(U.huber_loss(pi.qpred - tf.stop_gradient(reward + gamma * pi.mean_qpred)))
    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    qf_losses = [qf_loss]
    vf_losses = [vf_loss]

    # pol_loss = -tf.reduce_mean(pi_adv)
    pol_loss = pol_surr + pol_entpen
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    vf_var_list = [
        v for v in var_list if v.name.split("/")[1].startswith("vf")
    ]
    pol_var_list = [
        v for v in var_list if v.name.split("/")[1].startswith("pol")
    ]
    qf_var_list = [
        v for v in var_list if v.name.split("/")[1].startswith("qf")
    ]
    mean_qf_var_list = [
        v for v in var_list if v.name.split("/")[1].startswith("meanqf")
    ]

    vf_lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                                losses + [U.flatgrad(vf_loss, vf_var_list)])
    qf_lossandgrad = U.function(
        [ob, ac, next_ob, mean_ac, lrmult, reward, atarg],
        qf_losses + [U.flatgrad(qf_loss, qf_var_list)])

    qf_adam = MpiAdam(qf_var_list, epsilon=adam_epsilon)

    vf_adam = MpiAdam(vf_var_list, epsilon=adam_epsilon)

    assign_target_q_eq_eval_q = U.function(
        [], [],
        updates=[
            tf.assign(target_q, eval_q)
            for (target_q, eval_q) in zipsame(mean_qf_var_list, qf_var_list)
        ])

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])

    assign_backup_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(backup_v, newv) for (
                backup_v,
                newv) in zipsame(backup_pi.get_variables(), pi.get_variables())
        ])
    assign_new_eq_backup = U.function(
        [], [],
        updates=[
            tf.assign(newv, backup_v)
            for (newv, backup_v
                 ) in zipsame(pi.get_variables(), backup_pi.get_variables())
        ])
    # Compute all losses

    mean_pi_actions = U.function([ob], [pi.pd.mode()])
    compute_pol_losses = U.function([ob, ac, atarg, ret, lrmult],
                                    [pol_loss, pol_surr, pol_entpen, meankl])

    U.initialize()

    get_pi_flat_params = U.GetFlat(pol_var_list)
    set_pi_flat_params = U.SetFromFlat(pol_var_list)

    vf_adam.sync()

    global timesteps_so_far, episodes_so_far, iters_so_far, \
        tstart, lenbuffer, rewbuffer, ppo_timesteps_so_far, best_fitness

    episodes_so_far = 0
    timesteps_so_far = 0
    ppo_timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

    eval_gen = traj_segment_generator_eval(pi,
                                           test_env,
                                           timesteps_per_actorbatch,
                                           stochastic=True)  # For evaluation
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_actorbatch,
                                     stochastic=True,
                                     eval_gen=eval_gen)  # For train V Func

    # Build generator for all solutions
    actors = []
    best_fitness = 0
    for i in range(popsize):
        newActor = traj_segment_generator(pi,
                                          env,
                                          timesteps_per_actorbatch,
                                          stochastic=True,
                                          eval_gen=eval_gen)
        actors.append(newActor)

    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0,
         max_seconds > 0]) == 1, "Only one time constraint permitted"

    while True:
        if max_timesteps and timesteps_so_far >= max_timesteps:
            print("Max time steps")
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            print("Max episodes")
            break
        elif max_iters and iters_so_far >= max_iters:
            print("Max iterations")
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            print("Max time")
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)

        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)

        # Generate new samples
        # Train V func
        for i in range(max_v_train_iter):
            logger.log("Iteration:" + str(iters_so_far) +
                       " - sub-train iter for V func:" + str(i))
            logger.log("Generate New Samples")
            seg = seg_gen.__next__()
            add_vtarg_and_adv(seg, gamma, lam)

            ob, ac, next_ob, atarg, reward, tdlamret, traj_idx = (
                seg["ob"], seg["ac"], seg["next_ob"], seg["adv"],
                seg["rew"], seg["tdlamret"], seg["traj_index"])
            atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate
            d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                        shuffle=not pi.recurrent)
            optim_batchsize = optim_batchsize or ob.shape[0]

            if hasattr(pi, "ob_rms"):
                pi.ob_rms.update(ob)  # update running mean/std for normalization

            assign_old_eq_new()  # set old parameter values to new parameter values
            # Train V function
            logger.log("Training V Func and Evaluating V Func Losses")
            for _ in range(optim_epochs):
                losses = []  # list of tuples, each of which gives the loss for a minibatch
                for batch in d.iterate_once(optim_batchsize):
                    *vf_losses, g = vf_lossandgrad(batch["ob"], batch["ac"],
                                                   batch["atarg"],
                                                   batch["vtarg"], cur_lrmult)
                    vf_adam.update(g, optim_stepsize * cur_lrmult)
                    losses.append(vf_losses)
                logger.log(fmt_row(13, np.mean(losses, axis=0)))

            d_q = Dataset(dict(ob=ob,
                               ac=ac,
                               next_ob=next_ob,
                               reward=reward,
                               atarg=atarg,
                               vtarg=tdlamret),
                          shuffle=not pi.recurrent)

            # Re-train q function
            logger.log("Training Q Func Evaluating Q Func Losses")
            for _ in range(optim_epochs):
                losses = []  # list of tuples, each of which gives the loss for a minibatch
                for batch in d_q.iterate_once(optim_batchsize):
                    *qf_losses, g = qf_lossandgrad(
                        batch["ob"], batch["ac"], batch["next_ob"],
                        mean_pi_actions(batch["ob"])[0], cur_lrmult,
                        batch["reward"], batch["atarg"])
                    qf_adam.update(g, optim_stepsize * cur_lrmult)
                    losses.append(qf_losses)
                logger.log(fmt_row(13, np.mean(losses, axis=0)))

            assign_target_q_eq_eval_q()

        # CMAES Train Policy
        assign_old_eq_new()  # set old parameter values to new parameter values
        assign_backup_eq_new()  # backup current policy
        flatten_weights = get_pi_flat_params()
        opt = cma.CMAOptions()
        opt['tolfun'] = max_fitness
        opt['popsize'] = popsize
        opt['maxiter'] = gensize
        opt['verb_disp'] = 0
        opt['verb_log'] = 0
        opt['seed'] = seed
        opt['AdaptSigma'] = True
        es = cma.CMAEvolutionStrategy(flatten_weights, sigma, opt)
        while True:
            if es.countiter >= gensize:
                logger.log("Max generations for current layer")
                break
            logger.log("Iteration:" + str(iters_so_far) +
                       " - sub-train Generation for Policy:" +
                       str(es.countiter))
            logger.log("Sigma=" + str(es.sigma))
            solutions = es.ask()
            costs = []
            lens = []

            assign_backup_eq_new()  # backup current policy

            for solution in solutions:
                set_pi_flat_params(solution)
                cost = compute_pol_losses(ob, ac, atarg, tdlamret, cur_lrmult)
                costs.append(cost[0])
                assign_new_eq_backup()
            # Weight decay penalty, added element-wise (a bare `costs += l2_decay`
            # on a Python list would extend the list rather than add the penalty)
            l2_decay = compute_weight_decay(0.99, solutions)
            costs = np.asarray(costs) + l2_decay
            costs, real_costs = fitness_rank(costs)
            es.tell_real_seg(solutions=solutions,
                             function_values=costs,
                             real_f=real_costs,
                             segs=None)
            best_solution = es.result[0]
            best_fitness = es.result[1]
            logger.log("Best Solution Fitness:" + str(best_fitness))
            set_pi_flat_params(best_solution)

        iters_so_far += 1
        episodes_so_far += sum(lens)
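
The inner loop above relies on tell_real_seg and fitness_rank from a modified cma/ES toolchain; with the stock cma package the same ask/tell structure looks like the sketch below, run on a toy quadratic cost instead of the policy surrogate (CMA-ES minimizes, which is why max_fitness has to be negative in the listing).

import numpy as np
import cma

x0 = np.zeros(5)                          # stand-in for the flat policy weights
opts = {'popsize': 8, 'verb_disp': 0, 'verb_log': 0, 'seed': 1}
es = cma.CMAEvolutionStrategy(x0, 0.5, opts)
for _ in range(20):                       # a fixed number of generations
    solutions = es.ask()                  # sample a population of candidates
    costs = [float(np.sum(np.square(x - 1.0))) for x in solutions]
    es.tell(solutions, costs)             # rank and update the search distribution
print('best point found:', es.result[0])  # es.result[1] is its cost
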
コード例 #11
0
ファイル: trpo_mpi.py プロジェクト: stjordanis/RED-1
def learn(env,
          policy_func,
          reward_giver,
          expert_dataset,
          rank,
          pretrained,
          pretrained_weight,
          *,
          g_step,
          d_step,
          entcoeff,
          ckpt_dir,
          timesteps_per_batch,
          task_name,
          gamma,
          lam,
          max_kl,
          cg_iters,
          cg_damping=1e-2,
          vf_stepsize=3e-4,
          d_stepsize=3e-4,
          vf_iters=3,
          max_timesteps=0,
          max_episodes=0,
          max_iters=0,
          rnd_iter=200,
          callback=None,
          dyn_norm=False,
          mmd=False):

    nworkers = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()
    np.set_printoptions(precision=3)
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space)
    oldpi = policy_func("oldpi", ob_space, ac_space)
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    entbonus = entcoeff * meanent

    ratio = tf.exp(pi.pd.logp(ac) -
                   oldpi.pd.logp(ac))  # advantage * pnew / pold
    surrgain = tf.reduce_mean(ratio * atarg)

    optimgain = surrgain + entbonus
    losses = [optimgain, meankl, entbonus, surrgain, meanent]
    loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]

    dist = meankl

    all_var_list = pi.get_trainable_variables()
    var_list = [
        v for v in all_var_list
        if v.name.startswith("pi/pol") or v.name.startswith("pi/logstd")
    ]
    vf_var_list = [v for v in all_var_list if v.name.startswith("pi/vff")]
    vfadam = MpiAdam(vf_var_list)

    get_flat = U.GetFlat(var_list)
    set_from_flat = U.SetFromFlat(var_list)
    klgrads = tf.gradients(dist, var_list)
    flat_tangent = tf.placeholder(dtype=tf.float32,
                                  shape=[None],
                                  name="flat_tan")
    shapes = [var.get_shape().as_list() for var in var_list]
    start = 0
    tangents = []
    for shape in shapes:
        sz = U.intprod(shape)
        tangents.append(tf.reshape(flat_tangent[start:start + sz], shape))
        start += sz
    gvp = tf.add_n([
        tf.reduce_sum(g * tangent)
        for (g, tangent) in zipsame(klgrads, tangents)
    ])  # pylint: disable=E1111
    fvp = U.flatgrad(gvp, var_list)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg], losses)
    compute_lossandgrad = U.function([ob, ac, atarg], losses +
                                     [U.flatgrad(optimgain, var_list)])
    compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
    compute_vflossandgrad = pi.vlossandgrad

    def allmean(x):
        assert isinstance(x, np.ndarray)
        out = np.empty_like(x)
        MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
        out /= nworkers
        return out

    U.initialize()
    th_init = get_flat()
    MPI.COMM_WORLD.Bcast(th_init, root=0)
    set_from_flat(th_init)
    vfadam.sync()
    if rank == 0:
        print("Init param sum", th_init.sum(), flush=True)

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     reward_giver,
                                     timesteps_per_batch,
                                     stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards
    true_rewbuffer = deque(maxlen=40)

    assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0]) == 1

    ep_stats = stats(["True_rewards", "Rewards", "Episode_length"])
    # load pretrained weights if provided
    if pretrained_weight is not None:
        U.load_variables(pretrained_weight, variables=pi.get_variables())
    else:
        if not dyn_norm:
            pi.ob_rms.update(expert_dataset[0])

    if not mmd:
        reward_giver.train(*expert_dataset, iter=rnd_iter)
        # inspect the learned reward:
        # for batch in iterbatches(expert_dataset, batch_size=32):
        #     print(reward_giver.get_reward(*batch))

    best = -2000
    save_ind = 0
    max_save = 3
    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break

        logger.log("********** Iteration %i ************" % iters_so_far)

        def fisher_vector_product(p):
            return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p

        # ------------------ Update G ------------------
        # logger.log("Optimizing Policy...")
        for _ in range(g_step):
            seg = seg_gen.__next__()

            # MMD reward
            if mmd:
                reward_giver.set_b2(seg["ob"], seg["ac"])
                seg["rew"] = reward_giver.get_reward(seg["ob"], seg["ac"])

            # report stats and save the policy when it improves
            lrlocal = (seg["ep_lens"], seg["ep_rets"], seg["ep_true_rets"]
                       )  # local values
            listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
            lens, rews, true_rets = map(flatten_lists, zip(*listoflrpairs))
            true_rewbuffer.extend(true_rets)
            lenbuffer.extend(lens)
            rewbuffer.extend(rews)

            true_rew_avg = np.mean(true_rewbuffer)
            logger.record_tabular("EpLenMean", np.mean(lenbuffer))
            logger.record_tabular("EpRewMean", np.mean(rewbuffer))
            logger.record_tabular("EpTrueRewMean", true_rew_avg)
            logger.record_tabular("EpThisIter", len(lens))
            episodes_so_far += len(lens)
            timesteps_so_far += sum(lens)
            iters_so_far += 1

            logger.record_tabular("EpisodesSoFar", episodes_so_far)
            logger.record_tabular("TimestepsSoFar", timesteps_so_far)
            logger.record_tabular("TimeElapsed", time.time() - tstart)
            logger.record_tabular("Best so far", best)

            # Save model
            if ckpt_dir is not None and true_rew_avg >= best:
                best = true_rew_avg
                fname = os.path.join(ckpt_dir, task_name)
                os.makedirs(os.path.dirname(fname), exist_ok=True)
                pi.save_policy(fname + "_" + str(save_ind))
                save_ind = (save_ind + 1) % max_save

            # compute the gradient for the next policy update
            add_vtarg_and_adv(seg, gamma, lam)
            ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
                "tdlamret"]
            vpredbefore = seg[
                "vpred"]  # predicted value function before update
            atarg = (atarg - atarg.mean()) / atarg.std(
            )  # standardized advantage function estimate

            if hasattr(pi, "ob_rms") and dyn_norm:
                pi.ob_rms.update(ob)  # update running mean/std for policy

            args = seg["ob"], seg["ac"], atarg
            fvpargs = [arr[::5] for arr in args]

            assign_old_eq_new(
            )  # set old parameter values to new parameter values
            *lossbefore, g = compute_lossandgrad(*args)
            lossbefore = allmean(np.array(lossbefore))
            g = allmean(g)
            if np.allclose(g, 0):
                logger.log("Got zero gradient. not updating")
            else:
                stepdir = cg(fisher_vector_product,
                             g,
                             cg_iters=cg_iters,
                             verbose=False)
                assert np.isfinite(stepdir).all()
                shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
                lm = np.sqrt(shs / max_kl)
                fullstep = stepdir / lm
                expectedimprove = g.dot(fullstep)
                surrbefore = lossbefore[0]
                stepsize = 1.0
                thbefore = get_flat()
                for _ in range(10):
                    thnew = thbefore + fullstep * stepsize
                    set_from_flat(thnew)
                    meanlosses = surr, kl, *_ = allmean(
                        np.array(compute_losses(*args)))
                    improve = surr - surrbefore
                    logger.log("Expected: %.3f Actual: %.3f" %
                               (expectedimprove, improve))
                    if not np.isfinite(meanlosses).all():
                        logger.log("Got non-finite value of losses -- bad!")
                    elif kl > max_kl * 1.5:
                        logger.log("violated KL constraint. shrinking step.")
                    elif improve < 0:
                        logger.log("surrogate didn't improve. shrinking step.")
                    else:
                        logger.log("Stepsize OK!")
                        break
                    stepsize *= .5
                else:
                    logger.log("couldn't compute a good step")
                    set_from_flat(thbefore)
                if nworkers > 1 and iters_so_far % 20 == 0:
                    paramsums = MPI.COMM_WORLD.allgather(
                        (thnew.sum(),
                         vfadam.getflat().sum()))  # list of tuples
                    assert all(
                        np.allclose(ps, paramsums[0]) for ps in paramsums[1:])
            if pi.use_popart:
                pi.update_popart(tdlamret)
            for _ in range(vf_iters):
                for (mbob, mbret) in dataset.iterbatches(
                    (seg["ob"], seg["tdlamret"]),
                        include_final_partial_batch=False,
                        batch_size=128):
                    if hasattr(pi, "ob_rms") and dyn_norm:
                        pi.ob_rms.update(
                            mbob)  # update running mean/std for policy
                    vfadam.update(allmean(compute_vflossandgrad(mbob, mbret)),
                                  vf_stepsize)

        g_losses = meanlosses
        for (lossname, lossval) in zip(loss_names, meanlosses):
            logger.record_tabular(lossname, lossval)
        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))
        if rank == 0:
            logger.dump_tabular()
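
Every learn variant in this listing calls cg(fisher_vector_product, g, cg_iters=cg_iters) to solve F x = g without ever materializing the Fisher matrix F. For reference, a minimal conjugate-gradient sketch in the style of the baselines helper (the signature is inferred from the call sites; only the matrix-vector product f_Ax is needed):

import numpy as np

def cg(f_Ax, b, cg_iters=10, residual_tol=1e-10, verbose=False):
    # Solve A x = b for a symmetric positive-definite A, given only v -> A v.
    x = np.zeros_like(b)
    r = b.copy()   # residual b - A x (x starts at zero)
    p = r.copy()   # search direction
    rdotr = r.dot(r)
    for i in range(cg_iters):
        z = f_Ax(p)
        alpha = rdotr / p.dot(z)
        x += alpha * p
        r -= alpha * z
        newrdotr = r.dot(r)
        if verbose:
            print("cg iter %d residual %.3e" % (i, newrdotr))
        if newrdotr < residual_tol:
            break
        p = r + (newrdotr / rdotr) * p
        rdotr = newrdotr
    return x
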
コード例 #12
0
def learn(env,
          eval_env,
          policy_func,
          reward_giver,
          expert_dataset,
          rank,
          pretrained,
          pretrained_weight,
          *,
          g_step,
          d_step,
          entcoeff,
          save_per_iter,
          ckpt_dir,
          log_dir,
          timesteps_per_batch,
          evaluation_freq,
          task_name,
          gamma,
          lam,
          max_kl,
          cg_iters,
          cg_damping=1e-2,
          vf_stepsize=3e-4,
          d_stepsize=3e-4,
          vf_iters=3,
          num_epochs=1000,
          callback=None):

    # configure log
    logger.configure(dir=log_dir)

    nworkers = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()
    np.set_printoptions(precision=3)
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi",
                     ob_space,
                     ac_space,
                     reuse=(pretrained_weight != None))
    oldpi = policy_func("oldpi", ob_space, ac_space)
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    entbonus = entcoeff * meanent

    vferr = tf.reduce_mean(tf.square(pi.vpred - ret))

    ratio = tf.exp(pi.pd.logp(ac) -
                   oldpi.pd.logp(ac))  # advantage * pnew / pold
    surrgain = tf.reduce_mean(ratio * atarg)

    optimgain = surrgain + entbonus
    losses = [optimgain, meankl, entbonus, surrgain, meanent]
    loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]

    dist = meankl

    all_var_list = pi.get_trainable_variables()
    var_list = [
        v for v in all_var_list
        if v.name.startswith("pi/pol") or v.name.startswith("pi/logstd")
    ]
    vf_var_list = [v for v in all_var_list if v.name.startswith("pi/vff")]
    # assert len(var_list) == len(vf_var_list) + 1
    d_adam = MpiAdam(reward_giver.get_trainable_variables())
    vfadam = MpiAdam(vf_var_list)

    get_flat = U.GetFlat(var_list)
    set_from_flat = U.SetFromFlat(var_list)
    klgrads = tf.gradients(dist, var_list)
    flat_tangent = tf.placeholder(dtype=tf.float32,
                                  shape=[None],
                                  name="flat_tan")
    shapes = [var.get_shape().as_list() for var in var_list]
    start = 0
    tangents = []
    for shape in shapes:
        sz = U.intprod(shape)
        tangents.append(tf.reshape(flat_tangent[start:start + sz], shape))
        start += sz
    gvp = tf.add_n([
        tf.reduce_sum(g * tangent)
        for (g, tangent) in zipsame(klgrads, tangents)
    ])  # pylint: disable=E1111
    fvp = U.flatgrad(gvp, var_list)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg], losses)
    compute_lossandgrad = U.function([ob, ac, atarg], losses +
                                     [U.flatgrad(optimgain, var_list)])
    compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
    compute_vflossandgrad = U.function([ob, ret],
                                       U.flatgrad(vferr, vf_var_list))

    @contextmanager
    def timed(msg):
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(
                colorize("done in %.3f seconds" % (time.time() - tstart),
                         color='magenta'))
        else:
            yield

    def allmean(x):
        assert isinstance(x, np.ndarray)
        out = np.empty_like(x)
        MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
        out /= nworkers
        return out

    U.initialize()
    th_init = get_flat()
    MPI.COMM_WORLD.Bcast(th_init, root=0)
    set_from_flat(th_init)
    d_adam.sync()
    vfadam.sync()
    if rank == 0:
        print("Init param sum", th_init.sum(), flush=True)

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     reward_giver,
                                     timesteps_per_batch,
                                     stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards
    true_rewbuffer = deque(maxlen=40)

    g_loss_stats = stats(loss_names)
    d_loss_stats = stats(reward_giver.loss_name)
    ep_stats = stats(["True_rewards", "Rewards", "Episode_length"])
    # load pretrained weights if provided
    if pretrained_weight is not None:
        U.load_state(pretrained_weight, var_list=pi.get_variables())

    for epoch in range(num_epochs):
        if callback: callback(locals(), globals())

        # Save model
        if rank == 0 and iters_so_far % save_per_iter == 0 and ckpt_dir is not None:
            fname = os.path.join(ckpt_dir, task_name)
            os.makedirs(os.path.dirname(fname), exist_ok=True)
            saver = tf.train.Saver()
            saver.save(tf.get_default_session(), fname)

        logger.log("********** Epoch %i ************" % epoch)

        def fisher_vector_product(p):
            return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p

        # ------------------ Update G ------------------
        logger.log("Optimizing Policy...")
        total_obs = []
        total_acs = []
        total_ep_rets = []
        total_ep_lens = []
        total_ep_true_rets = []
        for g_step_num in range(g_step):
            with timed("sampling"):
                seg = seg_gen.__next__()

            # Add seg into total_seg
            total_obs.append(seg["ob"])
            total_acs.append(seg["ac"])
            total_ep_rets.append(seg["ep_rets"])
            total_ep_lens.append(seg["ep_lens"])
            total_ep_true_rets.append(seg["ep_true_rets"])

            add_vtarg_and_adv(seg, gamma, lam)
            # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
            ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
                "tdlamret"]
            vpredbefore = seg[
                "vpred"]  # predicted value function before update
            atarg = (atarg - atarg.mean()) / atarg.std(
            )  # standardized advantage function estimate

            if hasattr(pi, "ob_rms"):
                pi.ob_rms.update(ob)  # update running mean/std for policy

            args = seg["ob"], seg["ac"], atarg
            fvpargs = [arr[::5] for arr in args]

            assign_old_eq_new(
            )  # set old parameter values to new parameter values
            with timed("computegrad"):
                *lossbefore, g = compute_lossandgrad(*args)
            lossbefore = allmean(np.array(lossbefore))
            g = allmean(g)
            if np.allclose(g, 0):
                logger.log("Got zero gradient. not updating")
            else:
                with timed("cg"):
                    stepdir = cg(fisher_vector_product,
                                 g,
                                 cg_iters=cg_iters,
                                 verbose=rank == 0)
                assert np.isfinite(stepdir).all()
                shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
                lm = np.sqrt(shs / max_kl)
                # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
                fullstep = stepdir / lm
                expectedimprove = g.dot(fullstep)
                surrbefore = lossbefore[0]
                stepsize = 1.0
                thbefore = get_flat()
                for _ in range(10):
                    thnew = thbefore + fullstep * stepsize
                    set_from_flat(thnew)
                    meanlosses = surr, kl, *_ = allmean(
                        np.array(compute_losses(*args)))
                    improve = surr - surrbefore
                    logger.log("Expected: %.3f Actual: %.3f" %
                               (expectedimprove, improve))
                    if not np.isfinite(meanlosses).all():
                        logger.log("Got non-finite value of losses -- bad!")
                    elif kl > max_kl * 1.5:
                        logger.log("violated KL constraint. shrinking step.")
                    elif improve < 0:
                        logger.log("surrogate didn't improve. shrinking step.")
                    else:
                        logger.log("Stepsize OK!")
                        break
                    stepsize *= .5
                else:
                    logger.log("couldn't compute a good step")
                    set_from_flat(thbefore)
                if nworkers > 1 and iters_so_far % 20 == 0:
                    paramsums = MPI.COMM_WORLD.allgather(
                        (thnew.sum(),
                         vfadam.getflat().sum()))  # list of tuples
                    assert all(
                        np.allclose(ps, paramsums[0]) for ps in paramsums[1:])
            with timed("vf"):
                for _ in range(vf_iters):
                    for (mbob, mbret) in dataset.iterbatches(
                        (seg["ob"], seg["tdlamret"]),
                            include_final_partial_batch=False,
                            batch_size=128):
                        if hasattr(pi, "ob_rms"):
                            pi.ob_rms.update(
                                mbob)  # update running mean/std for policy
                        g = allmean(compute_vflossandgrad(mbob, mbret))
                        vfadam.update(g, vf_stepsize)

            # Evaluate current policy
            if (g_step * epoch + g_step_num) % evaluation_freq == 0:
                evaluate_policy(pi, reward_giver, eval_env,
                                g_step * epoch + g_step_num,
                                timesteps_per_batch, tstart)

        #  ------------------ Update D ------------------
        logger.log("Optimizing Discriminator...")
        total_obs = np.vstack(total_obs)
        total_acs = np.vstack(total_acs)
        total_ep_rets = np.concatenate(total_ep_rets)
        total_ep_lens = np.concatenate(total_ep_lens)
        total_ep_true_rets = np.concatenate(total_ep_true_rets)

        logger.log(fmt_row(13, reward_giver.loss_name))
        ob_expert, ac_expert = expert_dataset.get_next_batch(len(total_obs))
        batch_size = len(total_obs) // d_step
        d_losses = [
        ]  # list of tuples, each of which gives the loss for a minibatch
        for ob_batch, ac_batch in dataset.iterbatches(
            (total_obs, total_acs),
                include_final_partial_batch=False,
                batch_size=batch_size):
            ob_expert, ac_expert = expert_dataset.get_next_batch(len(ob_batch))
            # Update running mean/std for reward_giver
            if hasattr(reward_giver, "obs_rms"):
                reward_giver.obs_rms.update(
                    np.concatenate((ob_batch, ob_expert), 0))
            *newlosses, g = reward_giver.lossandgrad(ob_batch, ac_batch,
                                                     ob_expert, ac_expert)
            d_adam.update(allmean(g), d_stepsize)
            d_losses.append(newlosses)
        logger.log(fmt_row(13, np.mean(d_losses, axis=0)))
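
The step scaling that recurs in every variant (shs, lm, fullstep) follows from the trust-region constraint. With search direction $s$ from conjugate gradient and Fisher matrix $F$, the code computes

    \mathrm{shs} = \tfrac{1}{2}\, s^{\top} F s, \qquad \mathrm{lm} = \sqrt{\mathrm{shs}/\delta}, \qquad \mathrm{fullstep} = s/\mathrm{lm}, \qquad \delta = \texttt{max\_kl},

so the proposed step saturates the quadratic KL approximation: with $\beta = 1/\mathrm{lm}$, $\tfrac{1}{2}(\beta s)^{\top} F (\beta s) = \delta$. The backtracking loop then halves the stepsize until the measured KL stays below $1.5\,\delta$ and the surrogate actually improves.
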
コード例 #13
0
def learn(env, last_ob, last_jpos, run_reach, policy_func, reward_giver, expert_dataset, rank,
          pretrained, pretrained_weight, *,
          g_step, d_step, entcoeff, save_per_iter,
          ckpt_dir, log_dir, timesteps_per_batch, task_name,
          gamma, lam,
          max_kl, cg_iters, cg_damping=1e-2,
          vf_stepsize=3e-4, d_stepsize=3e-4, vf_iters=3,
          max_timesteps=0, max_episodes=0, max_iters=0,
          callback=None
          ):

    nworkers = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()
    np.set_printoptions(precision=3)
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space

    pi = policy_func("pi_grasp", ob_space, ac_space, reuse=(pretrained_weight != None))
    oldpi = policy_func("oldpi", ob_space, ac_space)
    atarg = tf.compat.v1.placeholder(dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
    ret = tf.compat.v1.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])


    # Changes made in order to use TensorBoard (currently disabled)
    # -------------------------------------------
    #train_writer = tf.compat.v1.summary.FileWriter('../../logs/trpo_mpi') # sets log dir to the GailPart folder

    #sess = tf.compat.v1.Session() # create a session?

    # -------------------------------------------

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    entbonus = entcoeff * meanent

    vferr = tf.reduce_mean(tf.square(pi.vpred - ret))

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # advantage * pnew / pold
    surrgain = tf.reduce_mean(ratio * atarg)

    optimgain = surrgain + entbonus
    losses = [optimgain, meankl, entbonus, surrgain, meanent]
    loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]

    dist = meankl

    all_var_list = pi.get_trainable_variables()
    var_list = [v for v in all_var_list if v.name.startswith("pi_grasp/pol") or v.name.startswith("pi_grasp/logstd")]
    vf_var_list = [v for v in all_var_list if v.name.startswith("pi_grasp/vff")]
    assert len(var_list) == len(vf_var_list) + 1
    d_adam = MpiAdam(reward_giver.get_trainable_variables())
    vfadam = MpiAdam(vf_var_list)

    get_flat = U.GetFlat(var_list)
    set_from_flat = U.SetFromFlat(var_list)
    klgrads = tf.gradients(dist, var_list)
    flat_tangent = tf.compat.v1.placeholder(dtype=tf.float32, shape=[None], name="flat_tan")
    shapes = [var.get_shape().as_list() for var in var_list]
    start = 0
    tangents = []
    for shape in shapes:
        sz = U.intprod(shape)
        tangents.append(tf.reshape(flat_tangent[start:start+sz], shape))
        start += sz
    gvp = tf.add_n([tf.reduce_sum(g*tangent) for (g, tangent) in zipsame(klgrads, tangents)])  # pylint: disable=E1111
    fvp = U.flatgrad(gvp, var_list)

    assign_old_eq_new = U.function([], [], updates=[tf.compat.v1.assign(oldv, newv)
                                                    for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())])
    compute_losses = U.function([ob, ac, atarg], losses)
    compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)])
    compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
    compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list))

    @contextmanager
    def timed(msg):
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(colorize("done in %.3f seconds" % (time.time() - tstart), color='magenta'))
        else:
            yield

    def allmean(x):
        assert isinstance(x, np.ndarray)
        out = np.empty_like(x)
        MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
        out /= nworkers
        return out

    U.initialize()
    th_init = get_flat()
    MPI.COMM_WORLD.Bcast(th_init, root=0)
    set_from_flat(th_init)
    d_adam.sync()
    vfadam.sync()
    if rank == 0:
        print("Init param sum", th_init.sum(), flush=True)

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi, last_ob, last_jpos, run_reach, policy_func, env, reward_giver, timesteps_per_batch, stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards
    true_rewbuffer = deque(maxlen=40)

    assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0]) == 1

    g_loss_stats = stats(loss_names)
    d_loss_stats = stats(reward_giver.loss_name)
    ep_stats = stats(["True_rewards", "Rewards", "Episode_length"])
    # load pretrained weights if provided
    if pretrained_weight is not None:
        U.load_state(pretrained_weight, var_list=pi.get_variables())

    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break

        # Save model
        if rank == 0 and iters_so_far % save_per_iter == 0 and ckpt_dir is not None:
            t_name = task_name + "_" + str(iters_so_far)
            fname = os.path.join(ckpt_dir, t_name) # changed from task_name
            os.makedirs(os.path.dirname(fname), exist_ok=True)
            saver = tf.compat.v1.train.Saver()
            saver.save(tf.compat.v1.get_default_session(), fname)

        logger.log("********** Iteration %i ************" % iters_so_far)

        def fisher_vector_product(p):
            return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p
        # ------------------ Update G ------------------
        logger.log("Optimizing Policy...")
        for _ in range(g_step):
            with timed("sampling"):
                seg = seg_gen.__next__()
                #print("trpo_mpi, seg = seg_gen.__next__() call output: ", seg )
            add_vtarg_and_adv(seg, gamma, lam)
            # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
            ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
            vpredbefore = seg["vpred"]  # predicted value function before udpate
            atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate

            if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob)  # update running mean/std for policy

            args = seg["ob"], seg["ac"], atarg
            fvpargs = [arr[::5] for arr in args]

            assign_old_eq_new()  # set old parameter values to new parameter values
            with timed("computegrad"):
                *lossbefore, g = compute_lossandgrad(*args)
            lossbefore = allmean(np.array(lossbefore))
            g = allmean(g)
            if np.allclose(g, 0):
                logger.log("Got zero gradient. not updating")
            else:
                with timed("cg"):
                    stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank == 0)
                assert np.isfinite(stepdir).all()
                shs = .5*stepdir.dot(fisher_vector_product(stepdir))
                lm = np.sqrt(shs / max_kl)
                # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
                fullstep = stepdir / lm
                expectedimprove = g.dot(fullstep)
                surrbefore = lossbefore[0]
                stepsize = 1.0
                thbefore = get_flat()
                for _ in range(10):
                    thnew = thbefore + fullstep * stepsize
                    set_from_flat(thnew)
                    meanlosses = surr, kl, *_ = allmean(np.array(compute_losses(*args)))
                    improve = surr - surrbefore
                    logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve))
                    if not np.isfinite(meanlosses).all():
                        logger.log("Got non-finite value of losses -- bad!")
                    elif kl > max_kl * 1.5:
                        logger.log("violated KL constraint. shrinking step.")
                    elif improve < 0:
                        logger.log("surrogate didn't improve. shrinking step.")
                    else:
                        logger.log("Stepsize OK!")
                        break
                    stepsize *= .5
                else:
                    logger.log("couldn't compute a good step")
                    set_from_flat(thbefore)
                if nworkers > 1 and iters_so_far % 20 == 0:
                    paramsums = MPI.COMM_WORLD.allgather((thnew.sum(), vfadam.getflat().sum()))  # list of tuples
                    assert all(np.allclose(ps, paramsums[0]) for ps in paramsums[1:])
            with timed("vf"):
                for _ in range(vf_iters):
                    for (mbob, mbret) in dataset.iterbatches((seg["ob"], seg["tdlamret"]),
                                                             include_final_partial_batch=False, batch_size=128):
                        if hasattr(pi, "ob_rms"):
                            pi.ob_rms.update(mbob)  # update running mean/std for policy
                        g = allmean(compute_vflossandgrad(mbob, mbret))
                        vfadam.update(g, vf_stepsize)

        g_losses = meanlosses

        #logger.log("trpo_mpi.py, mean losses before logging wiht loss names: \n")
        #logger.log(meanlosses)


        # This is where the nan values are tabulated for some of the entries
        #logger.log("trpo_mpi.py, view whats being printed with (loss_names, lossvalues)")
        for (lossname, lossval) in zip(loss_names, meanlosses):
            logger.record_tabular(lossname, lossval)
        logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret))
        # ------------------ Update D ------------------
        logger.log("Optimizing Discriminator...")
        logger.log(fmt_row(13, reward_giver.loss_name))
        ob_expert, ac_expert = expert_dataset.get_next_batch(len(ob))
        batch_size = len(ob) // d_step
        d_losses = []  # list of tuples, each of which gives the loss for a minibatch
        for ob_batch, ac_batch in dataset.iterbatches((ob, ac),
                                                      include_final_partial_batch=False,
                                                      batch_size=batch_size):
            ob_expert, ac_expert = expert_dataset.get_next_batch(len(ob_batch))
            # update running mean/std for reward_giver
            if hasattr(reward_giver, "obs_rms"): reward_giver.obs_rms.update(np.concatenate((ob_batch, ob_expert), 0))
            *newlosses, g = reward_giver.lossandgrad(ob_batch, ac_batch, ob_expert, ac_expert)
            d_adam.update(allmean(g), d_stepsize)
            d_losses.append(newlosses)
        logger.log(fmt_row(13, np.mean(d_losses, axis=0)))

        # For TensorBoard logging (currently disabled)
        # ---------------------------
        #tf.compat.v1.summary.scalar("Generator Accuracy", tf.convert_to_tensor(np.mean(d_losses, axis=0)[4]))  # position 5
        #tf.compat.v1.summary.scalar("Expert Accuracy", tf.convert_to_tensor(np.mean(d_losses, axis=0)[5]))  # position 6
        #tf.compat.v1.summary.scalar("Entropy Loss", tf.convert_to_tensor(np.mean(d_losses, axis=0)[3]))  # position 4

        #merge = tf.compat.v1.summary.merge_all()  # merge summaries
        #summary = sess.run([merge])
        #train_writer.add_summary(summary, iters_so_far)

        # Is there a need to reset the metric after every epoch? Probably not.
        # ---------------------------

        lrlocal = (seg["ep_lens"], seg["ep_rets"], seg["ep_true_rets"])  # local values, truly confirmed is empty after call
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews, true_rets = map(flatten_lists, zip(*listoflrpairs))
        true_rewbuffer.extend(true_rets)
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)

        # Could it be that the seg locals for lens and rets are omitted because they have no use in the GAIL algorithm?
        # Probably nothing to worry about; check the scalar part.
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))  # can contain NaN values
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))  # can contain NaN values
        logger.record_tabular("EpTrueRewMean", np.mean(true_rewbuffer))  # can contain NaN values
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        #timesteps_so_far += sum(lens)
        timesteps_so_far += seg["steps"]  # changed to match a setup with no finishing condition
        iters_so_far += 1

        #env.reset()  # reset the environment after each iteration; the traj generator checks ob instead

        logger.record_tabular("EpisodesSoFar", episodes_so_far)  # This is 0? If lens (the list of episode lengths) is empty, it makes no sense for this to have a value.
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)

        # I think the entloss, entropy, ev_... entries and the useful ones aren't from the environment invoked through TRPO

        if rank == 0:
            logger.dump_tabular()
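
Each variant reduces per-worker episode statistics with map(flatten_lists, zip(*listoflrpairs)). The helper is a one-line flatten; a sketch matching the baselines utility of that name:

def flatten_lists(listoflists):
    # Concatenate a list of lists into one flat list, preserving order.
    return [el for sublist in listoflists for el in sublist]
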
コード例 #14
0
def render_evaluate(
        env,
        policy_func,
        *,
        timesteps_per_batch,  # what to train on
        max_kl,
        cg_iters,
        gamma,
        lam,  # advantage estimation
        entcoeff=0.0,
        cg_damping=1e-2,
        vf_stepsize=3e-4,
        vf_iters=3,
        max_timesteps=0,
        max_episodes=0,
        max_iters=0,  # time constraint
        callback=None):
    nworkers = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()
    np.set_printoptions(precision=3)
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space)
    oldpi = policy_func("oldpi", ob_space, ac_space)
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    entbonus = entcoeff * meanent

    vferr = U.mean(tf.square(pi.vpred - ret))

    ratio = tf.exp(pi.pd.logp(ac) -
                   oldpi.pd.logp(ac))  # advantage * pnew / pold
    surrgain = U.mean(ratio * atarg)

    optimgain = surrgain + entbonus
    losses = [optimgain, meankl, entbonus, surrgain, meanent]
    loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]

    dist = meankl

    all_var_list = pi.get_trainable_variables()
    var_list = [
        v for v in all_var_list if v.name.split("/")[1].startswith("pol")
    ]
    vf_var_list = [
        v for v in all_var_list if v.name.split("/")[1].startswith("vf")
    ]
    vfadam = MpiAdam(vf_var_list)

    get_flat = U.GetFlat(var_list)
    set_from_flat = U.SetFromFlat(var_list)
    klgrads = tf.gradients(dist, var_list)
    flat_tangent = tf.placeholder(dtype=tf.float32,
                                  shape=[None],
                                  name="flat_tan")
    shapes = [var.get_shape().as_list() for var in var_list]
    start = 0
    tangents = []
    for shape in shapes:
        sz = U.intprod(shape)
        tangents.append(tf.reshape(flat_tangent[start:start + sz], shape))
        start += sz
    gvp = tf.add_n(
        [U.sum(g * tangent) for (g, tangent) in zipsame(klgrads, tangents)])  #pylint: disable=E1111
    fvp = U.flatgrad(gvp, var_list)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg], losses)
    compute_lossandgrad = U.function([ob, ac, atarg], losses +
                                     [U.flatgrad(optimgain, var_list)])
    compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
    compute_vflossandgrad = U.function([ob, ret],
                                       U.flatgrad(vferr, vf_var_list))

    @contextmanager
    def timed(msg):
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(
                colorize("done in %.3f seconds" % (time.time() - tstart),
                         color='magenta'))
        else:
            yield

    def allmean(x):
        assert isinstance(x, np.ndarray)
        out = np.empty_like(x)
        MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
        out /= nworkers
        return out

    # set up saver
    sess = tf.get_default_session()
    saver = tf.train.Saver()

    U.initialize()
    th_init = get_flat()
    MPI.COMM_WORLD.Bcast(th_init, root=0)
    set_from_flat(th_init)
    vfadam.sync()
    print("Init param sum", th_init.sum(), flush=True)

    print("loading pretrained model")
    saver.restore(sess, callback.model_dir)

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_batch,
                                     stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards

    assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0]) == 1

    # Evaluation-only loop: roll out the trained policy and render
    for _ in range(50):
        done = False
        ob = env.reset()
        env.render()
        stochastic = 1
        while not done:
            ac, vpred = pi.act(stochastic, ob)
            ob, rew, done, _ = env.step(ac)
            env.render()

    if rank == 0:
        logger.dump_tabular()

        if callback is not None:
            callback(locals(), globals())
コード例 #15
0
 def get_Flat_variables(self):
     # weights = [v for v in self.get_trainable_variables()]
     return U.GetFlat(self.get_trainable_variables())
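
コード例 #15 and #17 (below) only wrap U.GetFlat. Conceptually, GetFlat concatenates every variable in the list into one flat parameter vector and SetFromFlat performs the inverse scatter; a NumPy-level sketch of that contract, with plain arrays standing in for TensorFlow variables (the real tf_util versions run graph ops instead):

import numpy as np

class GetFlat:
    def __init__(self, var_values):
        self.var_values = var_values  # list of ndarrays standing in for tf variables

    def __call__(self):
        # Flatten and concatenate every variable into one vector.
        return np.concatenate([v.ravel() for v in self.var_values])

class SetFromFlat:
    def __init__(self, var_values):
        self.var_values = var_values

    def __call__(self, theta):
        # Scatter slices of theta back into the individual variables.
        start = 0
        for v in self.var_values:
            v[...] = theta[start:start + v.size].reshape(v.shape)
            start += v.size
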
コード例 #16
0
def learn(
        *,
        network,
        env,
        total_timesteps,
        timesteps_per_batch=1024,  # what to train on
        max_kl=0.002,
        cg_iters=10,
        gamma=0.99,
        lam=1.0,  # advantage estimation
        seed=None,
        ent_coef=0.00,
        cg_damping=1e-2,
        vf_stepsize=3e-4,
        vf_iters=3,
        max_episodes=0,
        max_iters=0,  # time constraint
        callback=None,
        load_path=None,
        num_reward=1,
        **network_kwargs):
    '''
    learn a policy function with TRPO algorithm

    Parameters:
    ----------

    network                 neural network to learn. Can be either string ('mlp', 'cnn', 'lstm', 'lnlstm' for basic types)
                            or function that takes input placeholder and returns tuple (output, None) for feedforward nets
                            or (output, (state_placeholder, state_output, mask_placeholder)) for recurrent nets

    env                     environment (one of the gym environments or wrapped via a baselines.common.vec_env.VecEnv-type class)

    timesteps_per_batch     timesteps per gradient estimation batch

    max_kl                  max KL divergence between old policy and new policy ( KL(pi_old || pi) )

    ent_coef                coefficient of policy entropy term in the optimization objective

    cg_iters                number of iterations of conjugate gradient algorithm

    cg_damping              conjugate gradient damping

    vf_stepsize             learning rate for the adam optimizer used to optimize the value function loss

    vf_iters                number of iterations of value function optimization iterations per each policy optimization step

    total_timesteps           max number of timesteps

    max_episodes            max number of episodes

    max_iters               maximum number of policy optimization iterations

    callback                function to be called with (locals(), globals()) each policy optimization step

    load_path               str, path to load the model from (default: None, i.e. no model is loaded)

    **network_kwargs        keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network

    Returns:
    -------

    learnt model

    '''
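    # A hypothetical invocation, for orientation only -- the environment wrapper
    # and hyperparameter values below are assumptions, not taken from this file:
    #   env = DummyVecEnv([lambda: gym.make("Hopper-v2")])
    #   pi = learn(network='mlp', env=env, total_timesteps=1_000_000,
    #              max_kl=0.002, num_reward=2)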

    if MPI is not None:
        nworkers = MPI.COMM_WORLD.Get_size()
        rank = MPI.COMM_WORLD.Get_rank()
    else:
        nworkers = 1
        rank = 0

    cpus_per_worker = 1
    U.get_session(
        config=tf.ConfigProto(allow_soft_placement=True,
                              inter_op_parallelism_threads=cpus_per_worker,
                              intra_op_parallelism_threads=cpus_per_worker))

    set_global_seeds(seed)
    # build the policy
    policy = build_policy(env,
                          network,
                          value_network='copy',
                          num_reward=num_reward,
                          **network_kwargs)

    process_dir = logger.get_dir()
    save_dir = process_dir.split(
        'Data')[-2] + 'log/l2/seed' + process_dir[-1] + '/'
    os.makedirs(save_dir, exist_ok=True)
    coe_save = []
    impro_save = []
    grad_save = []
    adj_save = []
    coe = np.ones((num_reward)) / num_reward

    np.set_printoptions(precision=3)
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space

    #################################################################
    # ob, ac, ret, atarg are all placeholders
    # ret and atarg should be vectors here
    ob = observation_placeholder(ob_space)

    # build pi and oldpi
    with tf.variable_scope("pi"):
        pi = policy(observ_placeholder=ob)
    with tf.variable_scope("oldpi"):
        oldpi = policy(observ_placeholder=ob)

    # an atarg can be computed for each reward
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32,
                         shape=[None, num_reward])  # Empirical return

    ac = pi.pdtype.sample_placeholder([None])

    # the KL divergence and entropy here do not depend on the reward
    ##################################
    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    # entbonus 是entropy loss
    entbonus = ent_coef * meanent
    #################################

    ###########################################################
    # vferr is used to update the V network
    vferr = tf.reduce_mean(tf.square(pi.vf - ret))
    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))
    # advantage * pnew / pold
    surrgain = tf.reduce_mean(ratio * atarg)

    # optimgain is used to update the policy network; there should be one per reward
    optimgain = surrgain + entbonus
    losses = [optimgain, meankl, entbonus, surrgain, meanent]
    loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]

    ###########################################################
    dist = meankl

    # define the variables to optimize and the Adam optimizer for the V network
    all_var_list = get_trainable_variables("pi")
    # var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")]
    # vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")]
    var_list = get_pi_trainable_variables("pi")
    vf_var_list = get_vf_trainable_variables("pi")

    vfadam = MpiAdam(vf_var_list)

    # a class that flattens the variables into a single vector
    get_flat = U.GetFlat(var_list)

    # a class that slices a vector and assigns the pieces to the variables in var_list
    set_from_flat = U.SetFromFlat(var_list)
    # gradient of the KL divergence
    klgrads = tf.gradients(dist, var_list)

    ####################################################################
    # the flattened tangent vector
    flat_tangent = tf.placeholder(dtype=tf.float32,
                                  shape=[None],
                                  name="flat_tan")

    # split the flattened vector back into per-variable tensors
    shapes = [var.get_shape().as_list() for var in var_list]
    start = 0
    tangents = []
    for shape in shapes:
        sz = U.intprod(shape)
        tangents.append(tf.reshape(flat_tangent[start:start + sz], shape))
        start += sz
    ####################################################################

    ####################################################################
    # sum the products of the KL gradients and the tangents
    gvp = tf.add_n([
        tf.reduce_sum(g * tangent)
        for (g, tangent) in zipsame(klgrads, tangents)
    ])  #pylint: disable=E1111
    # flatten the gradient of gvp into a vector
    fvp = U.flatgrad(gvp, var_list)
    ####################################################################

    # update the old policy with the learned policy
    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(get_variables("oldpi"), get_variables("pi"))
        ])

    # compute the losses
    compute_losses = U.function([ob, ac, atarg], losses)
    # compute the losses and the gradient
    compute_lossandgrad = U.function([ob, ac, atarg], losses +
                                     [U.flatgrad(optimgain, var_list)])
    # compute fvp
    compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
    # compute the value-network gradient
    compute_vflossandgrad = U.function([ob, ret],
                                       U.flatgrad(vferr, vf_var_list))

    @contextmanager
    def timed(msg):
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(
                colorize("done in %.3f seconds" % (time.time() - tstart),
                         color='magenta'))
        else:
            yield

    def allmean(x):
        assert isinstance(x, np.ndarray)
        if MPI is not None:
            out = np.empty_like(x)
            MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
            out /= nworkers
        else:
            out = np.copy(x)

        return out

    # initialize variables
    U.initialize()
    if load_path is not None:
        pi.load(load_path)

    # get the initial parameter vector
    th_init = get_flat()
    if MPI is not None:
        MPI.COMM_WORLD.Bcast(th_init, root=0)

    # assign slices of th_init to the variables in var_list
    set_from_flat(th_init)

    # synchronize
    vfadam.sync()
    print("Init param sum", th_init.sum(), flush=True)

    # Prepare for rollouts
    # ----------------------------------------

    # an iterator that generates rollout data
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_batch,
                                     stochastic=True,
                                     num_reward=num_reward)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()

    # double-ended queues
    lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards

    if sum([max_iters > 0, total_timesteps > 0, max_episodes > 0]) == 0:
        # nothing to be done
        return pi

    assert sum([max_iters>0, total_timesteps>0, max_episodes>0]) < 2, \
        'out of max_iters, total_timesteps, and max_episodes only one should be specified'

    while True:
        if callback: callback(locals(), globals())
        if total_timesteps and timesteps_so_far >= total_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        logger.log("********** Iteration %i ************" % iters_so_far)

        with timed("sampling"):
            seg = seg_gen.__next__()

        # compute cumulative returns
        add_vtarg_and_adv(seg, gamma, lam, num_reward=num_reward)
        # TODO
        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))

        # ob, ac, atarg, tdlamret are all ndarrays
        #ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
        _, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
            "tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before udpate

        # standardize
        atarg = (atarg - np.mean(atarg, axis=0)) / np.std(
            atarg, axis=0)  # standardized advantage function estimate
        #atarg = (atarg) / np.max(np.abs(atarg), axis=0)
        if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret)
        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

    # set old parameter values to new parameter values
        assign_old_eq_new()

        G = None
        S = None
        mr_lossbefore = np.zeros((num_reward, len(loss_names)))
        grad_norm = np.zeros((num_reward + 1))
        for i in range(num_reward):
            args = seg["ob"], seg["ac"], atarg[:, i]
            # a subsample of args, taking every 5th element
            fvpargs = [arr[::5] for arr in args]

            # this function computes the product of the Fisher matrix with a vector p
            def fisher_vector_product(p):
                return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p

            with timed("computegrad of " + str(i + 1) + ".th reward"):
                *lossbefore, g = compute_lossandgrad(*args)
            lossbefore = allmean(np.array(lossbefore))
            mr_lossbefore[i] = lossbefore
            g = allmean(g)
            #print("***************************************************************")
            #print(g)
            if isinstance(G, np.ndarray):
                G = np.vstack((G, g))
            else:
                G = g

            # g is the gradient of the objective
            # use conjugate gradient to obtain the update direction
            if np.allclose(g, 0):
                logger.log("Got zero gradient. not updating")
            else:
                with timed("cg of " + str(i + 1) + ".th reward"):
                    # stepdir is the update direction
                    stepdir = cg(fisher_vector_product,
                                 g,
                                 cg_iters=cg_iters,
                                 verbose=rank == 0)
                    shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
                    lm = np.sqrt(shs / max_kl)
                    # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
                    fullstep = stepdir / lm
                    grad_norm[i] = np.linalg.norm(fullstep)
                assert np.isfinite(stepdir).all()
                if isinstance(S, np.ndarray):
                    S = np.vstack((S, stepdir))
                else:
                    S = stepdir
        try:
            new_coe = get_coefficient(G, S)
            #coe = 0.99 * coe + 0.01 * new_coe
            coe = new_coe
            coe_save.append(coe)
            # adjust the parameter according to the angle between the gradients
            # GG = np.dot(S, S.T)
            # D = np.sqrt(np.diag(1/np.diag(GG)))
            # GG = np.dot(np.dot(D,GG),D)
            # #print('======================================= inner product ====================================')
            # #print(GG)
            # adj = np.sum(GG) / (num_reward ** 2)
            adj = 1
            adj_save.append(adj)
            adj_max_kl = adj * max_kl
            #################################################################
            grad_norm = grad_norm * np.sqrt(adj)
            stepdir = np.dot(coe, S)
            g = np.dot(coe, G)
            lossbefore = np.dot(coe, mr_lossbefore)
            #################################################################

            shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
            lm = np.sqrt(shs / adj_max_kl)
            # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
            fullstep = stepdir / lm
            grad_norm[num_reward] = np.linalg.norm(fullstep)
            grad_save.append(grad_norm)
            expectedimprove = g.dot(fullstep)
            surrbefore = lossbefore[0]
            stepsize = 1.0
            thbefore = get_flat()

            def compute_mr_losses():
                mr_losses = np.zeros((num_reward, len(loss_names)))
                for i in range(num_reward):
                    args = seg["ob"], seg["ac"], atarg[:, i]
                    one_reward_loss = allmean(np.array(compute_losses(*args)))
                    mr_losses[i] = one_reward_loss
                mr_loss = np.dot(coe, mr_losses)
                return mr_loss, mr_losses

            # line search: up to 10 attempts
            for _ in range(10):
                thnew = thbefore + fullstep * stepsize
                set_from_flat(thnew)
                mr_loss_new, mr_losses_new = compute_mr_losses()
                mr_impro = mr_losses_new - mr_lossbefore
                meanlosses = surr, kl, *_ = allmean(np.array(mr_loss_new))
                improve = surr - surrbefore
                logger.log("Expected: %.3f Actual: %.3f" %
                           (expectedimprove, improve))
                if not np.isfinite(meanlosses).all():
                    logger.log("Got non-finite value of losses -- bad!")
                elif kl > adj_max_kl * 1.5:
                    logger.log("violated KL constraint. shrinking step.")
                elif improve < 0:
                    logger.log("surrogate didn't improve. shrinking step.")
                else:
                    logger.log("Stepsize OK!")
                    impro_save.append(np.hstack((mr_impro[:, 0], improve)))
                    break
                stepsize *= .5
            else:
                logger.log("couldn't compute a good step")
                set_from_flat(thbefore)
            if nworkers > 1 and iters_so_far % 20 == 0:
                paramsums = MPI.COMM_WORLD.allgather(
                    (thnew.sum(), vfadam.getflat().sum()))  # list of tuples
                assert all(
                    np.allclose(ps, paramsums[0]) for ps in paramsums[1:])

            for (lossname, lossval) in zip(loss_names, meanlosses):
                logger.record_tabular(lossname, lossval)

            with timed("vf"):
                for _ in range(vf_iters):
                    for (mbob, mbret) in dataset.iterbatches(
                        (seg["ob"], seg["tdlamret"]),
                            include_final_partial_batch=False,
                            batch_size=64):
                        #with tf.Session() as sess:
                        #    sess.run(tf.global_variables_initializer())
                        #    aaa = sess.run(pi.vf,feed_dict={ob:mbob,ret:mbret})
                        #    print("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa")
                        #    print(aaa.shape)
                        #    print(mbret.shape)
                        g = allmean(compute_vflossandgrad(mbob, mbret))
                        vfadam.update(g, vf_stepsize)
        except Exception as e:
            # Skip this iteration's update if anything above failed.
            print('error:', e)
        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        if MPI is not None:
            listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        else:
            listoflrpairs = [lrlocal]

        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)

        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if rank == 0:
            logger.dump_tabular()
        #pdb.set_trace()
    np.save(save_dir + 'coe.npy', coe_save)
    np.save(save_dir + 'grad.npy', grad_save)
    np.save(save_dir + 'improve.npy', impro_save)
    np.save(save_dir + 'adj.npy', adj_save)
    return pi
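This listing follows the standard TRPO step-size recipe, with a coefficient vector coe mixing the per-reward objectives: scale the conjugate-gradient direction so the quadratic KL model 0.5 * s^T F s hits the (adjusted) trust-region radius, then halve the step until the surrogate improves and the KL bound holds. A minimal NumPy sketch of just that step-size logic, under toy stand-ins (the quadratic surrogate and identity Fisher matrix below are assumptions for illustration):

import numpy as np

def kl_constrained_step(stepdir, fvp, surrogate, theta0, max_kl, backtracks=10):
    # fvp(v) returns the Fisher-vector product F @ v; surrogate(theta) is the
    # objective to maximize. Both are estimated from samples in the real code.
    shs = 0.5 * stepdir.dot(fvp(stepdir))   # quadratic KL model: 0.5 * s^T F s
    lm = np.sqrt(shs / max_kl)              # Lagrange multiplier
    fullstep = stepdir / lm                 # reaches KL = max_kl in the model
    surr_before = surrogate(theta0)
    stepsize = 1.0
    for _ in range(backtracks):
        theta = theta0 + stepsize * fullstep
        if surrogate(theta) > surr_before:  # real code also checks KL / finiteness
            return theta
        stepsize *= 0.5
    return theta0                           # no acceptable step found

# Toy check: linear-quadratic surrogate, identity Fisher (assumptions).
g = np.array([1.0, -2.0])
theta = kl_constrained_step(g, lambda v: v,
                            lambda th: g.dot(th) - 0.5 * th.dot(th),
                            np.zeros(2), max_kl=0.01)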
Code example #17
0
    def get_Layer_Flat_variables(self, var_list):
        return U.GetFlat(var_list)
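U.GetFlat builds an op that concatenates every variable in var_list into one 1-D vector, and U.SetFromFlat performs the inverse scatter. A pure-NumPy sketch of the same flatten/unflatten round-trip (the helper names here are illustrative, not baselines APIs):

import numpy as np

def get_flat(params):
    # Concatenate a list of arrays into a single 1-D parameter vector.
    return np.concatenate([p.ravel() for p in params])

def set_from_flat(params, theta):
    # Write a flat vector back into arrays of the original shapes, in place.
    start = 0
    for p in params:
        p[...] = theta[start:start + p.size].reshape(p.shape)
        start += p.size

params = [np.zeros((2, 3)), np.zeros(4)]
theta = np.arange(10, dtype=float)
set_from_flat(params, theta)
assert np.array_equal(get_flat(params), theta)  # lossless round trip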
Code example #18
0
def learn(base_env,
          policy_fn, *,
          max_fitness,  # has to be negative, as CMA-ES considers minimization
          popsize,
          gensize,
          truncation_size,
          sigma,
          eval_iters,
          timesteps_per_actorbatch,
          max_timesteps = 0,
          max_episodes = 0,
          max_iters = 0,
          max_seconds = 0,
          seed = 0
          ):
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = base_env.observation_space
    ac_space = base_env.action_space
    pi = policy_fn("pi", ob_space, ac_space)  # Construct network for new policy
    best_pi = policy_fn("best_pi", ob_space, ac_space)  # Network tracking the best policy found so far
    backup_pi = policy_fn("backup_pi", ob_space, ac_space)  # Network used to restore the current policy after each individual's evaluation

    U.initialize()
    pi_set_from_flat_params = U.SetFromFlat(pi.get_trainable_variables())
    pi_get_flat_params = U.GetFlat(pi.get_trainable_variables())

    global timesteps_so_far, episodes_so_far, iters_so_far, \
        tstart, lenbuffer, rewbuffer,best_fitness
    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen = 100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen = 100)  # rolling buffer for episode rewards


    assign_backup_eq_new = U.function([], [], updates = [tf.assign(backup_v, newv)
                                                      for (backup_v, newv) in zipsame(
            backup_pi.get_variables(), pi.get_variables())])
    assign_new_eq_backup = U.function([], [], updates = [tf.assign(newv, backup_v)
                                                      for (newv, backup_v) in zipsame(
            pi.get_variables(), backup_pi.get_variables())])

    assign_best_eq_pi = U.function([], [], updates = [tf.assign(bestv, newv)
                                                      for (bestv, newv) in zipsame(
            best_pi.get_variables(), pi.get_variables())])


    assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0,
                max_seconds > 0]) == 1, "Only one time constraint permitted"

    # Build generator for all solutions
    eval_seq = traj_segment_generator_eval(best_pi, base_env, timesteps_per_actorbatch, stochastic=False)
    actors = []
    best_fitness = 0
    for i in range(popsize):
        newActor = traj_segment_generator(pi, base_env,
                                          timesteps_per_actorbatch,
                                          stochastic = False,
                                          eval_iters = eval_iters, eval_seq=eval_seq)
        actors.append(newActor)

    flatten_weights = pi_get_flat_params()
    indv_len = len(flatten_weights)
    pop = {}
    pop["solutions"] = np.random.randn(popsize, indv_len)
    pop["parents"] = pop["solutions"][:, truncation_size]
    pop["fitness"] = np.empty([popsize, 1], dtype = float)
    gen_counter = 0
    while True:
        if max_timesteps and timesteps_so_far >= max_timesteps:
            logger.log("Max time steps")
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            logger.log("Max episodes")
            break
        elif max_iters and iters_so_far >= max_iters:
            logger.log("Max iterations (Generations)")
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            logger.log("Max time")
            break
        elif gen_counter >= gensize:
            logger.log("Max iterations (Generations)")
            break

        assign_backup_eq_new() # backup current policy
        assign_best_eq_pi()  # set best_pi equal to the current pi


        cur_lrmult = max(1.0 - float(timesteps_so_far) / max(max_timesteps, 1), 1e-8)

        logger.log("********** Generation %i ************" % iters_so_far)
        if iters_so_far == 0:
            eval_seg = eval_seq.__next__()
            rewbuffer.extend(eval_seg["ep_rets"])
            lenbuffer.extend(eval_seg["ep_lens"])
            result_record()

        ob_segs = None
        for i in range(popsize):
            # First generation
            if gen_counter == 0:
                pop["solutions"][i] = flatten_weights + sigma*cur_lrmult * np.random.normal(0.0, 1.0, indv_len)
                pi_set_from_flat_params(pop["solutions"][i])
                seg = actors[i].__next__()
                pop["fitness"][i] = np.mean(seg["ep_rets"])
            else:
                # The elite (i == 0) is carried over unchanged after the first generation.
                if i != 0:
                    k = np.random.randint(1, truncation_size)
                    pop["solutions"][i] = pop["parents"][k] + sigma*cur_lrmult * np.random.normal(0.0, 1.0, indv_len)
                    pi_set_from_flat_params(pop["solutions"][i])
                    seg = actors[i].__next__()
                    pop["fitness"][i] = np.mean(seg["ep_rets"])

            if ob_segs is None:
                ob_segs = {'ob': np.copy(seg['ob'])}
            else:
                ob_segs['ob'] = np.append(ob_segs['ob'], seg['ob'], axis=0)
            assign_new_eq_backup()

        pop["fitness"], real_costs = fitness_normalization(pop["fitness"])
        fit_idx = pop["fitness"].flatten().argsort()[::-1][:popsize]
        pop["solutions"] = pop["solutions"][fit_idx]
        pop["parents"] = pop["solutions"][:, truncation_size]
        pop["fitness"] = pop["fitness"][fit_idx]
        # print(pop["fitness"])
        # pop["fitness"], real_fitness = fitness_normalization(pop["fitness"][fit_idx])
        # logger.log("Best Solution Fitness:", pop["fitness"][0])
        pi_set_from_flat_params(pop["solutions"][0])

        ob = ob_segs["ob"]
        if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob)  # update running mean/std for observation normalization

        gen_counter += 1
        iters_so_far += 1
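The loop above implements a truncation-selection evolution strategy: sample a population of weight vectors around the current policy, keep the top truncation_size individuals as parents, and breed the next generation by adding Gaussian noise to randomly chosen parents while the elite is preserved. A condensed sketch on a toy objective (the quadratic fitness and the hyperparameter values are assumptions; the listing scores individuals by mean episode return):

import numpy as np

def truncation_es(fitness, dim, popsize=20, truncation_size=5,
                  sigma=0.1, generations=100, seed=0):
    rng = np.random.default_rng(seed)
    parents = rng.standard_normal((truncation_size, dim))
    for _ in range(generations):
        # Each child is a uniformly chosen parent plus Gaussian noise.
        children = parents[rng.integers(truncation_size, size=popsize)] \
                   + sigma * rng.standard_normal((popsize, dim))
        children[0] = parents[0]                      # keep the elite unchanged
        scores = np.array([fitness(c) for c in children])
        parents = children[np.argsort(scores)[::-1][:truncation_size]]
    return parents[0]

best = truncation_es(lambda x: -np.sum((x - 1.0) ** 2), dim=3)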
Code example #19
0
    def _init(self,
              ob_name,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              init_std=1.0,
              gaussian_fixed_var=True):
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        self.varphi_dim = hid_size

        self.ob = utils.get_placeholder(name=ob_name,
                                        dtype=tf.float32,
                                        shape=[sequence_length] +
                                        list(ob_space.shape))
        # self.ob = utils.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))
        # self.ob = tf.placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        with tf.variable_scope('vf'):
            obz = tf.clip_by_value(
                (self.ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
            last_out = obz
            for i in range(num_hid_layers):
                last_out = tf.nn.tanh(
                    tf.layers.dense(
                        last_out,
                        hid_size,
                        name="fc%i" % (i + 1),
                        kernel_initializer=utils.normc_initializer(1.0)))
            self.vpred = tf.layers.dense(
                last_out,
                1,
                name='final',
                kernel_initializer=utils.normc_initializer(1.0))[:, 0]

        with tf.variable_scope('pol'):
            last_out = obz
            # Create 'num_hid_layers' hidden layers
            for i in range(num_hid_layers):
                last_out = tf.nn.tanh(
                    tf.layers.dense(
                        last_out,
                        hid_size,
                        name='fc%i' % (i + 1),
                        kernel_initializer=utils.normc_initializer(1.0)))
            if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
                self.action_dim = ac_space.shape[0]

                # mean = tf.layers.dense(last_out, pdtype.param_shape()[0]//2, name='final', kernel_initializer=U.normc_initializer(0.01))
                # logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer())
                # pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)

                self.dist_diagonal = True
                self.varphi = last_out
                self.varphi_dim = hid_size

                if self.dist_diagonal:
                    stddev_init = np.ones([1, self.action_dim]) * init_std
                    prec_init = 1. / (np.multiply(stddev_init, stddev_init)
                                      )  # 1 x |a|
                    self.prec = tf.get_variable(
                        name="prec",
                        shape=[1, self.action_dim],
                        initializer=tf.constant_initializer(prec_init))
                    kt_init = np.ones([self.varphi_dim, self.action_dim
                                       ]) * 0.5 / self.varphi_dim
                    ktprec_init = kt_init * prec_init
                    self.ktprec = tf.get_variable(
                        name="ktprec",
                        shape=[self.varphi_dim, self.action_dim],
                        initializer=tf.constant_initializer(ktprec_init))
                    kt = tf.divide(self.ktprec, self.prec)
                    mean = tf.matmul(last_out, kt)

                    logstd = tf.log(tf.sqrt(1. / self.prec))
                else:
                    # Not implemented yet
                    raise NotImplementedError

                self.prec_get_flat = utils.GetFlat([self.prec])
                self.prec_set_from_flat = utils.SetFromFlat([self.prec])

                self.ktprec_get_flat = utils.GetFlat([self.ktprec])
                self.ktprec_set_from_flat = utils.SetFromFlat([self.ktprec])

                pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
            else:
                pdparam = tf.layers.dense(
                    last_out,
                    pdtype.param_shape()[0],
                    name='final',
                    kernel_initializer=utils.normc_initializer(0.01))

        self.scope = tf.get_variable_scope().name

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = utils.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = utils.function([stochastic, self.ob], [ac, self.vpred])

        # Get all policy parameters
        vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                 self.scope + '/pol')
        # Remove log-linear parameters ktprec and prec to get only non-linear parameters
        del vars[-1]
        del vars[-1]
        beta_params = vars

        # Flat w_beta
        beta_len = np.sum(
            [np.prod(p.get_shape().as_list()) for p in beta_params])
        w_beta_var = tf.placeholder(dtype=tf.float32, shape=[beta_len])

        # Unflatten w_beta
        beta_shapes = list(map(tf.shape, beta_params))
        w_beta_unflat_var = self.unflatten_tensor_variables(
            w_beta_var, beta_shapes)

        # w_beta^T * \grad_beta \varphi(s)^T
        v = tf.placeholder(dtype=self.varphi.dtype,
                           shape=self.varphi.get_shape(),
                           name="v_in_Rop")
        features_beta = self.alternative_Rop(self.varphi, beta_params,
                                             w_beta_unflat_var, v)

        self.features_beta = utils.function([self.ob, w_beta_var, v],
                                            features_beta)
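Instead of the usual mean/log-std head, this policy parameterizes its Gaussian through a precision: it stores prec and ktprec = kt * prec, recovers the gain matrix as kt = ktprec / prec, and uses mean = varphi(s) @ kt with std = 1 / sqrt(prec). A NumPy sketch of that forward pass (shapes follow the listing; the concrete numbers are assumptions):

import numpy as np

varphi_dim, action_dim, init_std = 64, 6, 1.0

# Initialization mirroring the listing.
stddev_init = np.ones((1, action_dim)) * init_std
prec = 1.0 / stddev_init**2                          # 1 x |a| precisions
kt_init = np.ones((varphi_dim, action_dim)) * 0.5 / varphi_dim
ktprec = kt_init * prec                              # the stored parameter

# Forward pass for one state's features varphi(s).
varphi = np.random.randn(1, varphi_dim)
kt = ktprec / prec                                   # recover the gain matrix
mean = varphi @ kt                                   # 1 x |a| action mean
logstd = np.log(np.sqrt(1.0 / prec))                 # std = prec^(-1/2)

assert mean.shape == (1, action_dim) and logstd.shape == (1, action_dim)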
Code example #20
0
File: weight_hyperpolicy.py  Project: T3p/baselines
    def _init(self,
              ob_space,
              ac_space,
              hid_layers=[],
              deterministic=True,
              diagonal=True,
              use_bias=False,
              use_critic=False,
              seed=None,
              verbose=True,
              zero_init=False):
        """Params:
            ob_space: task observation space
            ac_space : task action space
            hid__layers: list with width of each hidden layer
            deterministic: whether the actor is deterministic
            diagonal: whether the higher order policy has a diagonal covariance
            matrix
            use_bias: whether to include bias in neurons
            use_critic: whether to include a critic network
            seed: optional random seed
        """
        assert isinstance(ob_space, gym.spaces.Box)
        assert len(ac_space.shape) == 1
        self.diagonal = diagonal
        self.use_bias = use_bias
        batch_length = None  #Accepts a sequence of episodes of arbitrary length
        self.observation_space = ob_space
        self.action_space = ac_space
        self.ac_dim = ac_space.shape[0]
        self.ob_dim = ob_space.shape[0]
        self.hid_layers = hid_layers
        self.deterministic = deterministic
        self.use_critic = use_critic
        self.linear = not hid_layers
        self.verbose = verbose

        if seed is not None:
            set_global_seeds(seed)

        self._ob = ob = U.get_placeholder(name="ob",
                                          dtype=tf.float32,
                                          shape=[None] + list(ob_space.shape))

        #Critic (normally not used)
        if use_critic:
            with tf.variable_scope('critic'):
                last_out = ob
                for i, hid_size in enumerate(hid_layers):
                    last_out = tf.nn.tanh(
                        tf.layers.dense(
                            last_out,
                            hid_size,
                            name="fc%i" % (i + 1),
                            kernel_initializer=U.normc_initializer(1.0)))
                self.vpred = tf.layers.dense(
                    last_out,
                    1,
                    name='final',
                    kernel_initializer=U.normc_initializer(1.0))[:, 0]

        #Actor (N.B.: weight initialization is irrelevant)
        with tf.variable_scope('actor'):
            last_out = ob
            for i, hid_size in enumerate(hid_layers):
                #Mlp feature extraction
                last_out = tf.nn.tanh(
                    tf.layers.dense(
                        last_out,
                        hid_size,
                        name='fc%i' % (i + 1),
                        kernel_initializer=tf.initializers.constant(0.),
                        use_bias=use_bias))
            if deterministic and isinstance(ac_space, gym.spaces.Box):
                #Deterministic action selection
                self.actor_mean = actor_mean = tf.layers.dense(
                    last_out,
                    ac_space.shape[0],
                    name='action',
                    kernel_initializer=tf.initializers.constant(0.),
                    use_bias=use_bias)
            else:
                raise NotImplementedError  #Currently supports only deterministic action policies

        #Higher order policy (Gaussian)
        with tf.variable_scope('actor') as scope:
            self.actor_weights = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, \
                                         scope=scope.name)
            self.flat_actor_weights = tf.concat([tf.reshape(w, [-1]) for w in \
                                            self.actor_weights], axis=0) #flatten
            self._n_actor_weights = n_actor_weights = self.flat_actor_weights.shape[
                0]

        with tf.variable_scope('higher'):
            if zero_init:
                higher_mean_init = tf.where(
                    tf.not_equal(self.flat_actor_weights,
                                 tf.constant(0, dtype=tf.float32)),
                    tf.zeros(shape=[n_actor_weights.value]),
                    tf.zeros(shape=[n_actor_weights]))
            else:
                #Initial means sampled from a normal distribution N(0,1)
                higher_mean_init = tf.where(
                    tf.not_equal(self.flat_actor_weights,
                                 tf.constant(0, dtype=tf.float32)),
                    tf.random_normal(shape=[n_actor_weights.value],
                                     stddev=0.01),
                    tf.zeros(shape=[n_actor_weights]))
            self.higher_mean = higher_mean = tf.get_variable(
                name='higher_mean', initializer=higher_mean_init)

            if diagonal:
                #Diagonal covariance matrix; all stds initialized to 0
                self.higher_logstd = higher_logstd = tf.get_variable(
                    name='higher_logstd',
                    shape=[n_actor_weights],
                    initializer=tf.initializers.constant(0.))
                pdparam = tf.concat(
                    [higher_mean, higher_mean * 0. + higher_logstd], axis=0)
                self.pdtype = pdtype = DiagGaussianPdType(
                    n_actor_weights.value)
            else:
                #Cholesky covariance matrix
                self.higher_logstd = higher_logstd = tf.get_variable(
                    name='higher_logstd',
                    shape=[n_actor_weights * (n_actor_weights + 1) // 2],
                    initializer=tf.initializers.constant(0.))
                pdparam = tf.concat([higher_mean, higher_logstd], axis=0)
                self.pdtype = pdtype = CholeskyGaussianPdType(
                    n_actor_weights.value)

        #Sample actor weights
        self.pd = pdtype.pdfromflat(pdparam)
        sampled_actor_params = self.pd.sample()
        symm_sampled_actor_params = self.pd.sample_symmetric()
        self._sample_symm_actor_params = U.function(
            [], list(symm_sampled_actor_params))
        self._sample_actor_params = U.function([], [sampled_actor_params])

        #Assign actor weights
        with tf.variable_scope('actor') as scope:
            actor_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, \
                                         scope=scope.name)

            self._use_sampled_actor_params = U.assignFromFlat(
                actor_params, sampled_actor_params)

            self._set_actor_params = U.SetFromFlat(actor_params)

            self._get_actor_params = U.GetFlat(actor_params)

        #Act
        self._action = action = actor_mean
        self._act = U.function([ob], [action])

        #Higher policy weights
        with tf.variable_scope('higher') as scope:
            self._higher_params = higher_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, \
                                         scope=scope.name)
            self.flat_higher_params = tf.concat([tf.reshape(w, [-1]) for w in \
                                            self._higher_params], axis=0) #flatten
            self._n_higher_params = self.flat_higher_params.shape[0]
            self._get_flat_higher_params = U.GetFlat(higher_params)
            self._set_higher_params = U.SetFromFlat(self._higher_params)

        #Batch PGPE
        self._actor_params_in = actor_params_in = \
                U.get_placeholder(name='actor_params_in',
                                  dtype=tf.float32,
                                  shape=[batch_length] + [n_actor_weights])
        self._rets_in = rets_in = U.get_placeholder(name='returns_in',
                                                    dtype=tf.float32,
                                                    shape=[batch_length])
        ret_mean, ret_var = tf.nn.moments(rets_in, axes=[0])
        ret_std = tf.sqrt(ret_var)  # tf.nn.moments returns variance, not std
        self._get_ret_mean = U.function([self._rets_in], [ret_mean])
        self._get_ret_std = U.function([self._rets_in], [ret_std])
        self._logprobs = logprobs = self.pd.logp(actor_params_in)
        pgpe_times_n = U.flatgrad(logprobs * rets_in, higher_params)
        self._get_pgpe_times_n = U.function([actor_params_in, rets_in],
                                            [pgpe_times_n])

        #One-episode PGPE
        #Used N times to compute the baseline -> can we do better?
        self._one_actor_param_in = one_actor_param_in = U.get_placeholder(
            name='one_actor_param_in',
            dtype=tf.float32,
            shape=[n_actor_weights])
        one_logprob = self.pd.logp(one_actor_param_in)
        score = U.flatgrad(one_logprob, higher_params)
        score_norm = tf.norm(score)
        self._get_score = U.function([one_actor_param_in], [score])
        self._get_score_norm = U.function([one_actor_param_in], [score_norm])

        #Batch off-policy PGPE
        self._probs = tf.exp(logprobs)
        self._behavioral = None
        self._renyi_other = None

        #One episode off-PGPE
        self._one_prob = tf.exp(one_logprob)

        #Renyi computation
        self._det_sigma = tf.exp(tf.reduce_sum(self.higher_logstd))

        #Fisher computation (diagonal case)
        mean_fisher_diag = tf.exp(-2 * self.higher_logstd)
        cov_fisher_diag = mean_fisher_diag * 0 + 2
        self._fisher_diag = tf.concat([mean_fisher_diag, cov_fisher_diag],
                                      axis=0)
        self._get_fisher_diag = U.function([], [self._fisher_diag])

        #Multiple importance sampling
        self._memory = None
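PGPE treats the actor weights themselves as the random variable: a Gaussian hyperpolicy samples whole weight vectors, and the hyperpolicy gradient of the expected return is the score-function estimator E[grad log p(theta) * R(theta)], which is what the pgpe_times_n node computes before division by the batch size. A diagonal-Gaussian sketch (the toy return function is an assumption):

import numpy as np

def pgpe_gradient(mu, logstd, return_fn, n_samples=100, seed=0):
    # Score-function gradient of E[R(theta)], theta ~ N(mu, diag(exp(logstd))^2).
    rng = np.random.default_rng(seed)
    std = np.exp(logstd)
    eps = rng.standard_normal((n_samples, mu.size))
    rets = np.array([return_fn(mu + std * e) for e in eps])
    # grad log N(theta; mu, std): d/dmu = eps / std, d/dlogstd = eps^2 - 1
    g_mu = ((eps / std) * rets[:, None]).mean(axis=0)
    g_logstd = ((eps**2 - 1.0) * rets[:, None]).mean(axis=0)
    return g_mu, g_logstd

g_mu, g_logstd = pgpe_gradient(np.zeros(3), np.zeros(3),
                               lambda th: -np.sum((th - 2.0) ** 2))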
Code example #21
0
def learn_hoof_a2c(
        network,
        env,
        seed=None,
        nsteps=5,
        total_timesteps=int(80e6),
        vf_coef=0.5,
        ent_coef=0.01,
        max_grad_norm=0.5,
        lr=7e-4,
        lrschedule='linear',
        epsilon=1e-5,
        alpha=0.99,
        gamma=0.99,
        log_interval=100,
        load_path=None,  # Baselines default settings till here
        optimiser='RMSProp',
        lr_upper_bound=None,
        ent_upper_bound=None,
        num_lr=None,
        num_ent_coeff=None,
        max_kl=-1.0,  # -1.0 is for no KL constraint
        **network_kwargs):
    '''
    Main entrypoint for the A2C algorithm (here extended with HOOF hyperparameter search). Trains a policy with the given network architecture on the given environment.

    Parameters:
    -----------

    network:            policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
                        specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
                        tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
                        neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
                        See baselines.common/policies.py/lstm for more details on using recurrent nets in policies


    env:                RL environment. Should implement interface similar to VecEnv (baselines.common/vec_env) or be wrapped with DummyVecEnv (baselines.common/vec_env/dummy_vec_env.py)


    seed:               seed to make the random number sequence in the algorithm reproducible. By default is None, which means seed from system noise generator (not reproducible)

    nsteps:             int, number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
                        nenv is number of environment copies simulated in parallel)

    total_timesteps:    int, total number of timesteps to train on (default: 80M)

    max_grad_norm:      float, gradient is clipped to have global L2 norm no more than this value (default: 0.5)

    lr:                 float, learning rate for RMSProp (current implementation has RMSProp hardcoded in) (default: 7e-4)

    gamma:              float, reward discounting parameter (default: 0.99)

    log_interval:       int, specifies how frequently the logs are printed out (default: 100)

    **network_kwargs:   keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
                        For instance, 'mlp' network architecture has arguments num_hidden and num_layers.

    '''

    set_global_seeds(seed)

    # Get the nb of env
    nenvs = env.num_envs
    policy = build_policy(env, network, **network_kwargs)

    # overwrite default params if using HOOF
    if lr_upper_bound is not None:
        lr = 1.0
        lrschedule = 'constant'
    else:
        num_lr = 1

    if ent_upper_bound is None:
        num_ent_coeff = 1

    # Instantiate the model object (that creates step_model and train_model)
    model = HOOF_Model(
        policy=policy,
        env=env,
        nsteps=nsteps,
        optimiser=optimiser,
        ent_coef=ent_coef,
        vf_coef=vf_coef,
        max_grad_norm=max_grad_norm,
        total_timesteps=total_timesteps,
        alpha=alpha,
        epsilon=epsilon  # defaults for RMSProp
    )

    runner = HOOF_Runner(env, model, nsteps=nsteps, gamma=gamma)
    epinfobuf = deque(maxlen=100)

    lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

    # Calculate the batch_size
    nbatch = nenvs * nsteps

    # model helper functions
    model_params = find_trainable_variables("a2c_model")
    get_flat = U.GetFlat(model_params)
    set_from_flat = U.SetFromFlat(model_params)

    # for Gaussian policies
    def kl(new_mean, new_sd, old_mean, old_sd):
        approx_kl = np.log(new_sd / old_sd) + (
            old_sd**2 +
            (old_mean - new_mean)**2) / (2.0 * new_sd**2 + 10**-8) - 0.5
        approx_kl = np.sum(approx_kl, axis=1)
        approx_kl = np.mean(approx_kl)
        return approx_kl

    if max_kl == -1.0:  # set max kl to a high val in case there is no constraint
        max_kl = 10**8

    # Start total timer
    tstart = time.time()

    for update in range(1, int(total_timesteps // nbatch + 1)):
        obs, states, rewards, masks, actions, values, undisc_rwds, epinfos = runner.run(
        )
        epinfobuf.extend(epinfos)
        old_mean, old_sd, old_neg_ll = model.get_mean_std_neg_ll(obs, actions)
        # Advance the LR schedule once per collected timestep.
        for step in range(len(obs)):
            cur_lr = lr.value()

        opt_pol_val = -10**8
        old_params = get_flat()
        rms_weights_before_upd = model.get_opt_state()
        approx_kl = np.zeros((num_ent_coeff, num_lr))
        epv = np.zeros((num_ent_coeff, num_lr))
        rand_lr = lr_upper_bound * np.random.rand(
            num_lr) if lr_upper_bound is not None else [cur_lr]
        rand_lr = np.sort(rand_lr)
        rand_ent_coeff = ent_upper_bound * np.random.rand(
            num_ent_coeff) if ent_upper_bound is not None else [ent_coef]

        for nec in range(num_ent_coeff):
            # reset policy and optimiser
            set_from_flat(old_params)
            model.set_opt_state(rms_weights_before_upd)

            # get grads for loss fn with given entropy coeff
            policy_loss, value_loss, policy_entropy = model.train(
                obs, states, rewards, masks, actions, values,
                rand_ent_coeff[nec])
            new_params = get_flat()
            ent_grads = new_params - old_params

            # enumerate over different LR
            for nlr in range(num_lr):
                new_params = old_params + rand_lr[nlr] * ent_grads
                set_from_flat(new_params)
                new_mean, new_sd, new_neg_ll = model.get_mean_std_neg_ll(
                    obs, actions)
                lik_ratio = np.exp(-new_neg_ll + old_neg_ll)
                est_pol_val = wis_estimate(nenvs, nsteps, undisc_rwds,
                                           lik_ratio)
                approx_kl[nec, nlr] = kl(new_mean, new_sd, old_mean, old_sd)
                epv[nec, nlr] = est_pol_val

                if (nec == 0
                        and nlr == 0) or (est_pol_val > opt_pol_val
                                          and approx_kl[nec, nlr] < max_kl):
                    opt_pol_val = est_pol_val
                    opt_pol_params = get_flat()
                    opt_rms_wts = model.get_opt_state()
                    opt_lr = rand_lr[nlr]
                    opt_ent_coeff = rand_ent_coeff[nec]
                    opt_kl = approx_kl[nec, nlr]

        # update policy and rms prop to optimal wts
        set_from_flat(opt_pol_params)
        model.set_opt_state(opt_rms_wts)

        # Shrink LR search space if too many get rejected
        if lr_upper_bound is not None:
            rejections = np.sum(approx_kl > max_kl) / num_lr
            if rejections > 0.8:
                lr_upper_bound *= 0.8
            if rejections == 0:
                lr_upper_bound *= 1.25

        nseconds = time.time() - tstart

        # Calculate the fps (frame per second)
        fps = int((update * nbatch) / nseconds)
        if update % log_interval == 0 or update == 1:
            # Check whether the value function is a good predictor of the
            # returns (ev close to 1) or worse than predicting nothing (ev <= 0)
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update * nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.record_tabular("opt_lr", float(opt_lr))
            logger.record_tabular("opt_ent_coeff", float(opt_ent_coeff))
            logger.record_tabular("approx_kl", float(opt_kl))
            if lr_upper_bound is not None:
                logger.record_tabular("rejections", rejections)
                logger.record_tabular("lr_ub", lr_upper_bound)
            logger.record_tabular(
                "eprewmean", safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.record_tabular(
                "eplenmean", safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.dump_tabular()
    return model
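HOOF scores every candidate (learning rate, entropy coefficient) pair off-policy, without extra environment samples: it reweights the trajectories just collected by the likelihood ratio between the candidate policy and the behavior policy, then takes a weighted importance sampling (WIS) average of the undiscounted returns. A sketch of what a wis_estimate helper with that signature plausibly computes (this body is a reconstruction for illustration, not the repository's implementation):

import numpy as np

def wis_estimate(nenvs, nsteps, undisc_rwds, lik_ratio):
    # Arrays are flat with length nenvs * nsteps, laid out per environment.
    rwds = np.asarray(undisc_rwds).reshape(nenvs, nsteps)
    ratios = np.asarray(lik_ratio).reshape(nenvs, nsteps)
    traj_w = np.prod(ratios, axis=1)     # importance weight per trajectory
    traj_ret = np.sum(rwds, axis=1)      # undiscounted return per trajectory
    # Self-normalized IS: biased but much lower variance than plain IS.
    return np.sum(traj_w * traj_ret) / (np.sum(traj_w) + 1e-8)

est = wis_estimate(4, 5, np.random.rand(20), np.ones(20))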
Code example #22
0
def ars_optimize(env, policy_func, perturb_mag, learning_rate, eval_epoch,
                 params_per_thread, top_perturb, maxiter, policy_scope="pi",
                 init_policy_params = None, eval_func = evaluate_performance, callback=None,
                 skilldim=3, task_num=5, inner_iters=10
                 ):
    ob_space = env.observation_space
    ac_space = env.action_space
    from gym import spaces
    obs_dim_base = ob_space.low.shape[0] + skilldim
    high = np.inf * np.ones(obs_dim_base)
    low = -high
    skill_ob_space = spaces.Box(low, high)

    np.random.seed(MPI.COMM_WORLD.Get_rank() * 1023+1)

    pi = policy_func(policy_scope, skill_ob_space, ac_space)  # Construct network for new policy

    pol_var_list = [v for v in pi.get_trainable_variables() if 'pol' in v.name and 'logstd' not in v.name]
    pol_var_size = np.sum([np.prod(v.shape) for v in pol_var_list])
    print(pol_var_list)
    print('pol_var_size: ', pol_var_size)

    get_pol_flat = U.GetFlat(pol_var_list)
    set_pol_from_flat = U.SetFromFlat(pol_var_list)

    adam = MpiAdam(pol_var_list)

    U.initialize()

    adam.sync()

    if init_policy_params is not None:
        cur_scope = pi.get_variables()[0].name[0:pi.get_variables()[0].name.find('/')]
        orig_scope = list(init_policy_params.keys())[0][0:list(init_policy_params.keys())[0].find('/')]
        print(cur_scope, orig_scope)
        for i in range(len(pi.get_variables())):
            if pi.get_variables()[i].name.replace(cur_scope, orig_scope, 1) in init_policy_params:
                assign_op = pi.get_variables()[i].assign(
                    init_policy_params[pi.get_variables()[i].name.replace(cur_scope, orig_scope, 1)])
                tf.get_default_session().run(assign_op)

    # Used to construct the task-embedding mapping
    skill_optimizer = UPOptimizer(env, pi, skilldim, eval_num=1, verbose=False, bayesian_opt=True)

    param_dim = len(get_pol_flat())
    episodes_so_far = 0
    iters_so_far = 0
    current_parameters = np.copy(get_pol_flat())
    for it in range(maxiter):
        logger.log("********** Iteration %i ************" % it)

        if callback is not None:
            callback(locals(), globals())
        current_parameters = np.copy(get_pol_flat())

        perturbations = []
        performances = []
        all_obs = []

        if it % inner_iters == 0:
            logger.log("==== Reconstruct task set at iteration %i ======" % it)
            # Construct the task_embedding first
            task_embeddings = []
            for task_id in range(int(np.max([task_num / MPI.COMM_WORLD.Get_size(), 1]))):
                task_parameters = env.env.env.resample_task()
                optimized_embedding = None
                skill_optimizer.reset()
                if skilldim > 0:
                    skill_optimizer.optimize(maxiter=20, max_steps=50000, custom_bound=[-1.0, 1.0])
                    optimized_embedding = skill_optimizer.best_x
                    print(task_parameters, optimized_embedding, skill_optimizer.best_f)
                else:
                    optimized_embedding = []
                task_embeddings.append([task_parameters, optimized_embedding])
                print(optimized_embedding)
            all_task_embeddings = MPI.COMM_WORLD.allgather(task_embeddings)
            task_embeddings = []
            for te in all_task_embeddings:
                task_embeddings += te
            task_embeddings = task_embeddings[0:task_num]

        for sp in range(params_per_thread):
            sampled_perturbation = np.random.normal(0, 1, param_dim)

            positive_perf = []
            negative_perf = []
            for task_emb in task_embeddings:
                pert_param = current_parameters + sampled_perturbation * perturb_mag
                obs, positive_pert_performance, episodes = eval_func(env, pi, pert_param, set_pol_from_flat, eval_epoch, task_emb[0], task_emb[1])
                all_obs += obs
                episodes_so_far += episodes
                positive_perf.append(positive_pert_performance)

                pert_param = current_parameters - sampled_perturbation * perturb_mag
                obs, negative_pert_performance, episodes = eval_func(env, pi, pert_param, set_pol_from_flat, eval_epoch, task_emb[0], task_emb[1])
                all_obs += obs
                episodes_so_far += episodes
                negative_perf.append(negative_pert_performance)

            perturbations.append(sampled_perturbation)
            performances.append([np.mean(positive_perf), np.mean(negative_perf)])
        all_performances = np.concatenate(MPI.COMM_WORLD.allgather(performances), axis=0)
        all_perturbations = np.concatenate(MPI.COMM_WORLD.allgather(perturbations), axis=0)

        max_perf_list = np.max(all_performances, axis=1)
        max_perf_list.sort()
        top_k_perf = max_perf_list[-top_perturb]

        parameter_update = current_parameters * 0
        utilized_rewards = []
        for sp in range(len(all_performances)):
            if np.max(all_performances[sp]) >= top_k_perf:
                parameter_update += all_perturbations[sp] * (all_performances[sp][0] - all_performances[sp][1])
                utilized_rewards.append(all_performances[sp][0])
                utilized_rewards.append(all_performances[sp][1])
        parameter_update /= top_perturb

        lr_scaler = np.std(utilized_rewards)

        if hasattr(pi, "ob_rms"): pi.ob_rms.update(np.array(all_obs))  # update running mean/std for policy

        delta_parameters = parameter_update * learning_rate / lr_scaler
        current_parameters += delta_parameters
        set_pol_from_flat(current_parameters)
        total_episodes_sofar = np.sum(MPI.COMM_WORLD.allgather(episodes_so_far))
        final_performance = []
        for task_emb in task_embeddings:
            obs, perf, episodes = eval_func(env, pi, current_parameters, set_pol_from_flat, eval_epoch, task_emb[0], task_emb[1])
            final_performance.append(perf)
        if MPI.COMM_WORLD.Get_rank()==0:
            logger.record_tabular('EpRewMean', np.mean(final_performance))
            logger.record_tabular('EpisodesSoFar', total_episodes_sofar)
            logger.record_tabular('Parameter magnitude', np.linalg.norm(current_parameters))
            logger.record_tabular('Delta magnitude', np.linalg.norm(delta_parameters))
            logger.dump_tabular()
        iters_so_far += 1

    print('Optimized parameter: ', current_parameters)
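The update above is Augmented Random Search (ARS): evaluate symmetric perturbations theta +/- nu * delta, keep the top directions ranked by max(r+, r-), step along sum(delta * (r+ - r-)), and scale the learning rate by the standard deviation of the rewards that were actually used. A compact single-process sketch (the quadratic reward and hyperparameter values are assumptions; the listing additionally averages over tasks and MPI workers):

import numpy as np

def ars_update(theta, reward_fn, n_dirs=16, top_b=8, nu=0.05, lr=0.02, seed=0):
    rng = np.random.default_rng(seed)
    deltas = rng.standard_normal((n_dirs, theta.size))
    r_pos = np.array([reward_fn(theta + nu * d) for d in deltas])
    r_neg = np.array([reward_fn(theta - nu * d) for d in deltas])
    # Keep the b directions whose best-case reward is highest.
    keep = np.argsort(np.maximum(r_pos, r_neg))[::-1][:top_b]
    used_rewards = np.concatenate([r_pos[keep], r_neg[keep]])
    step = (deltas[keep] * (r_pos[keep] - r_neg[keep])[:, None]).sum(axis=0)
    return theta + lr / (top_b * used_rewards.std() + 1e-8) * step

theta = ars_update(np.zeros(4), lambda th: -np.sum((th - 0.5) ** 2))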
Code example #23
0
def learn(
        *,
        network,
        env,
        eval_env,
        make_eval_env,
        env_id,
        total_timesteps,
        timesteps_per_batch,
        sil_update,
        sil_loss,  # what to train on
        max_kl=0.001,
        cg_iters=10,
        gamma=0.99,
        lam=1.0,  # advantage estimation
        seed=None,
        ent_coef=0.0,
        lr=3e-4,
        cg_damping=1e-2,
        vf_stepsize=3e-4,
        vf_iters=5,
        sil_value=0.01,
        sil_alpha=0.6,
        sil_beta=0.1,
        max_episodes=0,
        max_iters=0,  # time constraint
        callback=None,
        save_interval=0,
        load_path=None,
        # MBL
        # For train mbl
        mbl_train_freq=5,

        # For eval
        num_eval_episodes=5,
        eval_freq=5,
        vis_eval=False,
        #eval_targs=('mbmf',),
        eval_targs=('mf', ),
        quant=2,

        # For mbl.step
        #num_samples=(1500,),
        num_samples=(1, ),
        horizon=(2, ),
        #horizon=(2,1),
        #num_elites=(10,),
        num_elites=(1, ),
        mbl_lamb=(1.0, ),
        mbl_gamma=0.99,
        #mbl_sh=1, # Number of step for stochastic sampling
        mbl_sh=10000,
        #vf_lookahead=-1,
        #use_max_vf=False,
        reset_per_step=(0, ),

        # For get_model
        num_fc=2,
        num_fwd_hidden=500,
        use_layer_norm=False,

        # For MBL
        num_warm_start=int(1e4),
        init_epochs=10,
        update_epochs=5,
        batch_size=512,
        update_with_validation=False,
        use_mean_elites=1,
        use_ent_adjust=0,
        adj_std_scale=0.5,

        # For data loading
        validation_set_path=None,

        # For data collect
        collect_val_data=False,

        # For traj collect
        traj_collect='mf',

        # For profile
        measure_time=True,
        eval_val_err=False,
        measure_rew=True,
        model_fn=None,
        update_fn=None,
        init_fn=None,
        mpi_rank_weight=1,
        comm=None,
        vf_coef=0.5,
        max_grad_norm=0.5,
        log_interval=1,
        nminibatches=4,
        noptepochs=4,
        cliprange=0.2,
        **network_kwargs):
    '''
    Learn a policy function with the TRPO algorithm (this variant adds MBL and SIL components).

    Parameters:
    ----------

    network                 neural network to learn. Can be either string ('mlp', 'cnn', 'lstm', 'lnlstm' for basic types)
                            or function that takes input placeholder and returns tuple (output, None) for feedforward nets
                            or (output, (state_placeholder, state_output, mask_placeholder)) for recurrent nets

    env                     environment (one of the gym environments, or wrapped via a baselines.common.vec_env.VecEnv-type class)

    timesteps_per_batch     timesteps per gradient estimation batch

    max_kl                  max KL divergence between old policy and new policy ( KL(pi_old || pi) )

    ent_coef                coefficient of policy entropy term in the optimization objective

    cg_iters                number of iterations of conjugate gradient algorithm

    cg_damping              conjugate gradient damping

    vf_stepsize             learning rate for adam optimizer used to optimize value function loss

    vf_iters                number of iterations of value function optimization iterations per each policy optimization step

    total_timesteps           max number of timesteps

    max_episodes            max number of episodes

    max_iters               maximum number of policy optimization iterations

    callback                function to be called with (locals(), globals()) each policy optimization step

    load_path               str, path to load the model from (default: None, i.e. no model is loaded)

    **network_kwargs        keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network

    Returns:
    -------

    learnt model

    '''
    if not isinstance(num_samples, tuple): num_samples = (num_samples, )
    if not isinstance(horizon, tuple): horizon = (horizon, )
    if not isinstance(num_elites, tuple): num_elites = (num_elites, )
    if not isinstance(mbl_lamb, tuple): mbl_lamb = (mbl_lamb, )
    if not isinstance(reset_per_step, tuple):
        reset_per_step = (reset_per_step, )
    if validation_set_path is None:
        if collect_val_data:
            validation_set_path = os.path.join(logger.get_dir(), 'val.pkl')
        else:
            validation_set_path = os.path.join('dataset',
                                               '{}-val.pkl'.format(env_id))
    if eval_val_err:
        eval_val_err_path = os.path.join('dataset',
                                         '{}-combine-val.pkl'.format(env_id))
    logger.log(locals())
    logger.log('MBL_SH', mbl_sh)
    logger.log('Traj_collect', traj_collect)

    if MPI is not None:
        nworkers = MPI.COMM_WORLD.Get_size()
        rank = MPI.COMM_WORLD.Get_rank()
    else:
        nworkers = 1
        rank = 0
    cpus_per_worker = 1
    U.get_session(
        config=tf.ConfigProto(allow_soft_placement=True,
                              inter_op_parallelism_threads=cpus_per_worker,
                              intra_op_parallelism_threads=cpus_per_worker))

    set_global_seeds(seed)
    if isinstance(lr, float): lr = constfn(lr)
    else: assert callable(lr)
    if isinstance(cliprange, float): cliprange = constfn(cliprange)
    else: assert callable(cliprange)

    policy = build_policy(env, network, value_network='copy', **network_kwargs)
    nenvs = env.num_envs
    np.set_printoptions(precision=3)
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    nbatch = nenvs * timesteps_per_batch
    nbatch_train = nbatch // nminibatches
    is_mpi_root = (MPI is None or MPI.COMM_WORLD.Get_rank() == 0)

    ob = observation_placeholder(ob_space)
    with tf.variable_scope("pi"):
        pi = policy(observ_placeholder=ob)
        make_model = lambda: Model(
            policy=policy,
            ob_space=ob_space,
            ac_space=ac_space,
            nbatch_act=nenvs,
            nbatch_train=nbatch_train,
            nsteps=timesteps_per_batch,
            ent_coef=ent_coef,
            vf_coef=vf_coef,
            max_grad_norm=max_grad_norm,
            sil_update=sil_update,
            sil_value=sil_value,
            sil_alpha=sil_alpha,
            sil_beta=sil_beta,
            sil_loss=sil_loss,
            #                                    fn_reward=env.process_reward,
            fn_reward=None,
            #                                    fn_obs=env.process_obs,
            fn_obs=None,
            ppo=False,
            prev_pi='pi',
            silm=pi)
        model = make_model()
    with tf.variable_scope("oldpi"):
        oldpi = policy(observ_placeholder=ob)
        make_old_model = lambda: Model(
            policy=policy,
            ob_space=ob_space,
            ac_space=ac_space,
            nbatch_act=nenvs,
            nbatch_train=nbatch_train,
            nsteps=timesteps_per_batch,
            ent_coef=ent_coef,
            vf_coef=vf_coef,
            max_grad_norm=max_grad_norm,
            sil_update=sil_update,
            sil_value=sil_value,
            sil_alpha=sil_alpha,
            sil_beta=sil_beta,
            sil_loss=sil_loss,
            #                                    fn_reward=env.process_reward,
            fn_reward=None,
            #                                    fn_obs=env.process_obs,
            fn_obs=None,
            ppo=False,
            prev_pi='oldpi',
            silm=oldpi)
        old_model = make_old_model()

    # MBL
    # ---------------------------------------
    #viz = Visdom(env=env_id)
    win = None
    eval_targs = list(eval_targs)
    logger.log(eval_targs)

    make_model_f = get_make_mlp_model(num_fc=num_fc,
                                      num_fwd_hidden=num_fwd_hidden,
                                      layer_norm=use_layer_norm)
    mbl = MBL(env=eval_env,
              env_id=env_id,
              make_model=make_model_f,
              num_warm_start=num_warm_start,
              init_epochs=init_epochs,
              update_epochs=update_epochs,
              batch_size=batch_size,
              **network_kwargs)

    val_dataset = {'ob': None, 'ac': None, 'ob_next': None}
    if update_with_validation:
        logger.log('Update with validation')
        val_dataset = load_val_data(validation_set_path)
    if eval_val_err:
        logger.log('Log val error')
        eval_val_dataset = load_val_data(eval_val_err_path)
    if collect_val_data:
        logger.log('Collect validation data')
        val_dataset_collect = []

    def _mf_pi(ob, t=None):
        stochastic = True
        ac, vpred, _, _ = pi.step(ob, stochastic=stochastic)
        return ac, vpred

    def _mf_det_pi(ob, t=None):
        #ac, vpred, _, _ = pi.step(ob, stochastic=False)
        ac, vpred = pi._evaluate([pi.pd.mode(), pi.vf], ob)
        return ac, vpred

    def _mf_ent_pi(ob, t=None):
        mean, std, vpred = pi._evaluate([pi.pd.mode(), pi.pd.std, pi.vf], ob)
        ac = np.random.normal(mean, std * adj_std_scale, size=mean.shape)
        return ac, vpred
# When use_ent_adjust is set, _mbmf_inner_pi samples actions via _mf_ent_pi,
# i.e. with the policy std scaled by adj_std_scale.

    def _mbmf_inner_pi(ob, t=0):
        if use_ent_adjust:
            return _mf_ent_pi(ob)
        else:
            #return _mf_pi(ob)
            if t < mbl_sh: return _mf_pi(ob)
            else: return _mf_det_pi(ob)

    # ---------------------------------------

    # Run multiple configuration once
    all_eval_descs = []

    def make_mbmf_pi(n, h, e, l):
        def _mbmf_pi(ob):
            ac, rew = mbl.step(ob=ob,
                               pi=_mbmf_inner_pi,
                               horizon=h,
                               num_samples=n,
                               num_elites=e,
                               gamma=mbl_gamma,
                               lamb=l,
                               use_mean_elites=use_mean_elites)
            return ac[None], rew

        return Policy(step=_mbmf_pi, reset=None)

    for n in num_samples:
        for h in horizon:
            for l in mbl_lamb:
                for e in num_elites:
                    if 'mbmf' in eval_targs:
                        all_eval_descs.append(('MeanRew', 'MBL_TRPO_SIL',
                                               make_mbmf_pi(n, h, e, l)))
                    #if 'mbmf' in eval_targs: all_eval_descs.append(('MeanRew-n-{}-h-{}-e-{}-l-{}-sh-{}-me-{}'.format(n, h, e, l, mbl_sh, use_mean_elites), 'MBL_TRPO-n-{}-h-{}-e-{}-l-{}-sh-{}-me-{}'.format(n, h, e, l, mbl_sh, use_mean_elites), make_mbmf_pi(n, h, e, l)))
    if 'mf' in eval_targs:
        all_eval_descs.append(
            ('MeanRew', 'TRPO_SIL', Policy(step=_mf_pi, reset=None)))

    logger.log('List of evaluation targets')
    for it in all_eval_descs:
        logger.log(it[0])

    pool = Pool(mp.cpu_count())
    warm_start_done = False
    # ----------------------------------------

    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    entbonus = ent_coef * meanent

    vferr = tf.reduce_mean(tf.square(pi.vf - ret))

    ratio = tf.exp(pi.pd.logp(ac) -
                   oldpi.pd.logp(ac))  # advantage * pnew / pold
    surrgain = tf.reduce_mean(ratio * atarg)

    optimgain = surrgain + entbonus
    losses = [optimgain, meankl, entbonus, surrgain, meanent]
    loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]

    dist = meankl

    all_var_list = get_trainable_variables("pi")
    # var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")]
    # vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")]
    var_list = get_pi_trainable_variables("pi")
    vf_var_list = get_vf_trainable_variables("pi")

    vfadam = MpiAdam(vf_var_list)

    get_flat = U.GetFlat(var_list)
    set_from_flat = U.SetFromFlat(var_list)
    klgrads = tf.gradients(dist, var_list)
    flat_tangent = tf.placeholder(dtype=tf.float32,
                                  shape=[None],
                                  name="flat_tan")
    shapes = [var.get_shape().as_list() for var in var_list]
    start = 0
    tangents = []
    for shape in shapes:
        sz = U.intprod(shape)
        tangents.append(tf.reshape(flat_tangent[start:start + sz], shape))
        start += sz
    gvp = tf.add_n([
        tf.reduce_sum(g * tangent)
        for (g, tangent) in zipsame(klgrads, tangents)
    ])  #pylint: disable=E1111
    fvp = U.flatgrad(gvp, var_list)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(get_variables("oldpi"), get_variables("pi"))
        ])

    compute_losses = U.function([ob, ac, atarg], losses)
    compute_lossandgrad = U.function([ob, ac, atarg], losses +
                                     [U.flatgrad(optimgain, var_list)])
    compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
    compute_vflossandgrad = U.function([ob, ret],
                                       U.flatgrad(vferr, vf_var_list))

    @contextmanager
    def timed(msg):
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(
                colorize("done in %.3f seconds" % (time.time() - tstart),
                         color='magenta'))
        else:
            yield

    def allmean(x):
        assert isinstance(x, np.ndarray)
        out = np.empty_like(x)
        MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
        out /= nworkers
        return out

    U.initialize()
    if load_path is not None:
        pi.load(load_path)

    th_init = get_flat()
    MPI.COMM_WORLD.Bcast(th_init, root=0)
    set_from_flat(th_init)
    vfadam.sync()
    print("Init param sum", th_init.sum(), flush=True)
    # Prepare for rollouts
    # ----------------------------------------
    if traj_collect == 'mf':
        seg_gen = traj_segment_generator(env,
                                         timesteps_per_batch,
                                         model,
                                         stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards

    if sum([max_iters > 0, total_timesteps > 0, max_episodes > 0]) == 0:
        # nothing to be done
        return pi

    assert sum([max_iters>0, total_timesteps>0, max_episodes>0]) < 2, \
        'out of max_iters, total_timesteps, and max_episodes only one should be specified'

    while True:
        if callback: callback(locals(), globals())
        if total_timesteps and timesteps_so_far >= total_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        logger.log("********** Iteration %i ************" % iters_so_far)

        with timed("sampling"):
            seg = seg_gen.__next__()
            if traj_collect == 'mf-random' or traj_collect == 'mf-mb':
                seg_mbl = seg_gen_mbl.__next__()
            else:
                seg_mbl = seg
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
            "tdlamret"]

        # Val data collection
        if collect_val_data:
            for ob_, ac_, ob_next_ in zip(ob[:-1, 0, ...], ac[:-1, ...],
                                          ob[1:, 0, ...]):
                val_dataset_collect.append(
                    (copy.copy(ob_), copy.copy(ac_), copy.copy(ob_next_)))
        # -----------------------------
        # MBL update
        else:
            ob_mbl, ac_mbl = seg_mbl["ob"], seg_mbl["ac"]

            mbl.add_data_batch(ob_mbl[:-1, 0, ...], ac_mbl[:-1, ...],
                               ob_mbl[1:, 0, ...])
            mbl.update_forward_dynamic(require_update=iters_so_far %
                                       mbl_train_freq == 0,
                                       ob_val=val_dataset['ob'],
                                       ac_val=val_dataset['ac'],
                                       ob_next_val=val_dataset['ob_next'])
        # -----------------------------

        if traj_collect == 'mf':
            #if traj_collect == 'mf' or traj_collect == 'mf-random' or traj_collect == 'mf-mb':
            vpredbefore = seg[
                "vpred"]  # predicted value function before update
            model = seg["model"]
            atarg = (atarg - atarg.mean()) / atarg.std(
            )  # standardized advantage function estimate

            if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret)
            if hasattr(pi, "rms"):
                pi.rms.update(ob)  # update running mean/std for policy

            args = seg["ob"], seg["ac"], atarg
            fvpargs = [arr[::5] for arr in args]

            def fisher_vector_product(p):
                return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p

            assign_old_eq_new()  # set old parameter values to new parameter values
            with timed("computegrad"):
                *lossbefore, g = compute_lossandgrad(*args)
            lossbefore = allmean(np.array(lossbefore))
            g = allmean(g)
            if np.allclose(g, 0):
                logger.log("Got zero gradient. not updating")
            else:
                with timed("cg"):
                    stepdir = cg(fisher_vector_product,
                                 g,
                                 cg_iters=cg_iters,
                                 verbose=rank == 0)
                assert np.isfinite(stepdir).all()
                shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
                lm = np.sqrt(shs / max_kl)
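                # fullstep = stepdir / lm rescales the CG direction so that the
                # quadratic KL model 0.5 * step^T F step equals max_kl exactly.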
                # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
                fullstep = stepdir / lm
                expectedimprove = g.dot(fullstep)
                surrbefore = lossbefore[0]
                stepsize = 1.0
                thbefore = get_flat()
                for _ in range(10):
                    thnew = thbefore + fullstep * stepsize
                    set_from_flat(thnew)
                    meanlosses = surr, kl, *_ = allmean(
                        np.array(compute_losses(*args)))
                    improve = surr - surrbefore
                    logger.log("Expected: %.3f Actual: %.3f" %
                               (expectedimprove, improve))
                    if not np.isfinite(meanlosses).all():
                        logger.log("Got non-finite value of losses -- bad!")
                    elif kl > max_kl * 1.5:
                        logger.log("violated KL constraint. shrinking step.")
                    elif improve < 0:
                        logger.log("surrogate didn't improve. shrinking step.")
                    else:
                        logger.log("Stepsize OK!")
                        break
                    stepsize *= .5
                else:
                    logger.log("couldn't compute a good step")
                    set_from_flat(thbefore)
                if nworkers > 1 and iters_so_far % 20 == 0:
                    paramsums = MPI.COMM_WORLD.allgather(
                        (thnew.sum(),
                         vfadam.getflat().sum()))  # list of tuples
                    assert all(
                        np.allclose(ps, paramsums[0]) for ps in paramsums[1:])

            for (lossname, lossval) in zip(loss_names, meanlosses):
                logger.record_tabular(lossname, lossval)

            with timed("vf"):

                for _ in range(vf_iters):
                    for (mbob, mbret) in dataset.iterbatches(
                        (seg["ob"], seg["tdlamret"]),
                            include_final_partial_batch=False,
                            batch_size=64):
                        g = allmean(compute_vflossandgrad(mbob, mbret))
                        vfadam.update(g, vf_stepsize)
            with timed("SIL"):
                lrnow = lr(1.0 - timesteps_so_far / total_timesteps)
                l_loss, sil_adv, sil_samples, sil_nlogp = model.sil_train(
                    lrnow)

            logger.record_tabular("ev_tdlam_before",
                                  explained_variance(vpredbefore, tdlamret))

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        if MPI is not None:
            listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        else:
            listoflrpairs = [lrlocal]
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)

        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1

        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if sil_update > 0:
            logger.record_tabular("SilSamples", sil_samples)

        if rank == 0:
            # MBL evaluation
            if not collect_val_data:
                #set_global_seeds(seed)
                default_sess = tf.get_default_session()

                def multithread_eval_policy(env_, pi_, num_episodes_,
                                            vis_eval_, seed):
                    with default_sess.as_default():
                        if hasattr(env, 'ob_rms') and hasattr(env_, 'ob_rms'):
                            env_.ob_rms = env.ob_rms
                        res = eval_policy(env_, pi_, num_episodes_, vis_eval_,
                                          seed, measure_time, measure_rew)

                        try:
                            env_.close()
                        except Exception:
                            pass
                    return res

                if mbl.is_warm_start_done() and iters_so_far % eval_freq == 0:
                    warm_start_done = mbl.is_warm_start_done()
                    if num_eval_episodes > 0:
                        targs_names = {}
                        with timed('eval'):
                            num_descs = len(all_eval_descs)
                            list_field_names = [e[0] for e in all_eval_descs]
                            list_legend_names = [e[1] for e in all_eval_descs]
                            list_pis = [e[2] for e in all_eval_descs]
                            list_eval_envs = [
                                make_eval_env() for _ in range(num_descs)
                            ]
                            list_seed = [seed for _ in range(num_descs)]
                            list_num_eval_episodes = [
                                num_eval_episodes for _ in range(num_descs)
                            ]
                            print(list_field_names)
                            print(list_legend_names)

                            list_vis_eval = [
                                vis_eval for _ in range(num_descs)
                            ]

                            for i in range(num_descs):
                                field_name = list_field_names[i]
                                legend_name = list_legend_names[i]

                                res = multithread_eval_policy(
                                    list_eval_envs[i], list_pis[i],
                                    list_num_eval_episodes[i],
                                    list_vis_eval[i], seed)
                                #eval_results = pool.starmap(multithread_eval_policy, zip(list_eval_envs, list_pis, list_num_eval_episodes, list_vis_eval,list_seed))

                                #for field_name, legend_name, res in zip(list_field_names, list_legend_names, eval_results):
                                perf, elapsed_time, eval_rew = res
                                logger.record_tabular(field_name, perf)
                                if measure_time:
                                    logger.record_tabular(
                                        'Time-%s' % (field_name), elapsed_time)
                                if measure_rew:
                                    logger.record_tabular(
                                        'SimRew-%s' % (field_name), eval_rew)
                                targs_names[field_name] = legend_name

                    if eval_val_err:
                        fwd_dynamics_err = mbl.eval_forward_dynamic(
                            obs=eval_val_dataset['ob'],
                            acs=eval_val_dataset['ac'],
                            obs_next=eval_val_dataset['ob_next'])
                        logger.record_tabular('FwdValError', fwd_dynamics_err)

                    logger.dump_tabular()
                    # print(logger.get_dir())
                    # print(targs_names)
                    # if num_eval_episodes > 0:
                    #     win = plot(viz, win, logger.get_dir(), targs_names=targs_names, quant=quant, opt='best')
                    # -----------
                    # logger.dump_tabular()
        yield pi

    if collect_val_data:
        with open(validation_set_path, 'wb') as f:
            pickle.dump(val_dataset_collect, f)
        logger.log('Saved {} validation transitions'.format(len(val_dataset_collect)))
Code Example #24
def learn(env_name, make_env, seed, make_policy, *,
          max_iters,
          horizon,
          drho,
          delta,
          gamma,
          multiple_init=None,
          sampler=None,
          feature_fun=None,
          iw_norm='none',
          bound_type='max-ess',
          max_offline_iters=10,
          save_weights=False,
          render_after=None,
          grid_size_1d=None,
          mu_min=None,
          mu_max=None,
          truncated_mise=True,
          delta_t=None,
          k=2,
          filename=None,
          find_optimal_arm=False,
          plot_bound=False,
          plot_ess_profile=False,
          trainable_std=False,
          rescale_ep_return=False):
    """
    Learns a policy from scratch
        make_env: environment maker
        make_policy: policy maker
        horizon: max episode length
        delta: probability of failure
        gamma: discount factor
        max_iters: total number of learning iterations
    """

    # Print options
    np.set_printoptions(precision=3)
    losses_with_name = []

    # Build the environment
    env = make_env()
    ob_space = env.observation_space
    ac_space = env.action_space

    env.seed(seed)

    # Build the higher level target and behavioral policies
    pi = make_policy('pi', ob_space, ac_space)
    oldpi = make_policy('oldpi', ob_space, ac_space)
    logger.record_tabular('NumTrainableParams', int(pi._n_higher_params))

    # Get all pi's learnable parameters
    all_var_list = pi.get_trainable_variables()
    var_list = \
        [v for v in all_var_list if v.name.split('/')[1].startswith('higher')]
    shapes = [U.intprod(var.get_shape().as_list()) for var in var_list]
    d = sum(shapes)
    # Get all oldpi's learnable parameters
    all_var_list_old = oldpi.get_trainable_variables()
    var_list_old = \
        [v for v in all_var_list_old
         if v.name.split('/')[1].startswith('higher')]
    # Get hyperpolicy's logstd
    higher_logstd_list = [pi.get_higher_logstd()]

    # My Placeholders
    actor_params_ = tf.placeholder(shape=[max_iters, pi._n_actor_weights],
                                   name='actor_params', dtype=tf.float32)
    last_actor_param_ = tf.placeholder(shape=(pi._n_actor_weights),
                                       name='last_actor_params',
                                       dtype=tf.float32)
    den_mise_log_ = tf.placeholder(shape=[max_iters], dtype=tf.float32,
                                   name='den_mise')
    renyi_bound_ = tf.placeholder(dtype=tf.float32, name='renyi_bound')
    ret_ = tf.placeholder(dtype=tf.float32, shape=(max_iters), name='ret')
    disc_ret_ = tf.placeholder(dtype=tf.float32, shape=(max_iters),
                               name='disc_ret')
    n_ = tf.placeholder(dtype=tf.float32, name='iter_number')
    n_int = tf.cast(n_, dtype=tf.int32)
    mask_iters_ = tf.placeholder(dtype=tf.float32, shape=(max_iters),
                                 name='mask_iters')
    # grad_ = tf.placeholder(dtype=tf.float32,
    #                        shape=(d, 1), name='grad')

    # Multiple importance weights (with balance heuristic)
    target_log_pdf = tf.reduce_sum(
        pi.pd.independent_logps(actor_params_), axis=1)
    behavioral_log_pdf = tf.reduce_sum(
        oldpi.pd.independent_logps(actor_params_), axis=1)
    behavioral_log_pdf_last_sample = tf.reduce_sum(
        oldpi.pd.independent_logps(last_actor_param_))
    log_ratio = target_log_pdf - den_mise_log_
    miw = tf.exp(log_ratio) * mask_iters_

    den_mise_log_mean = tf.reduce_sum(den_mise_log_) / n_
    den_mise_log_last = den_mise_log_[n_int-1]
    losses_with_name.extend([(den_mise_log_mean, 'DenMISEMeanLog'),
                             (den_mise_log_[0], 'DenMISELogFirst'),
                             (den_mise_log_last, 'DenMISELogLast'),
                             (miw[0], 'IWFirstEpisode'),
                             (miw[n_int-1], 'IWLastEpisode'),
                             (tf.reduce_sum(miw)/n_, 'IWMean'),
                             (tf.reduce_max(miw), 'IWMax'),
                             (tf.reduce_min(miw), 'IWMin')])

    # Return
    ep_return = disc_ret_
    return_mean = tf.reduce_sum(ep_return) / n_
    return_last = ep_return[n_int - 1]
    return_max = tf.reduce_max(ep_return[:n_int])
    return_min = tf.reduce_min(ep_return[:n_int])
    return_abs_max = tf.reduce_max(tf.abs(ep_return[:n_int]))
    regret = n_ * 5 - tf.reduce_sum(ep_return)
    regret_over_t = 5 - return_mean

    losses_with_name.extend([(return_mean, 'ReturnMean'),
                             (return_max, 'ReturnMax'),
                             (return_min, 'ReturnMin'),
                             (return_last, 'ReturnLastEpisode'),
                             (return_abs_max, 'ReturnAbsMax'),
                             (regret, 'Regret'),
                             (regret_over_t, 'Regret/t')])

    # Exponentiated Renyi divergence between the target and one behavioral
    renyi_component = pi.pd.renyi(oldpi.pd)
    renyi_component = tf.cond(tf.is_nan(renyi_component),
                              lambda: tf.constant(np.inf),
                              lambda: renyi_component)
    renyi_component = tf.cond(renyi_component < 0.,
                              lambda: tf.constant(np.inf),
                              lambda: renyi_component)
    renyi_component = tf.exp(renyi_component)
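    # exp(Renyi divergence): NaN or negative divergence estimates were mapped
    # to +inf above, so invalid proposals are maximally penalized in the bound.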

    if truncated_mise:
        # Bound to d2(target || mixture of behaviorals)/n
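        # Importance weights are truncated at M_n = sqrt(n^2 * renyi_bound / log(1/delta));
        # truncation trades a small bias for bounded variance in the MISE estimate.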
        mn = tf.sqrt((n_**2 * renyi_bound_) / tf.log(1 / delta))
        mn_broadcasted = \
            tf.ones(shape=miw.get_shape().as_list(), dtype=np.float32) * mn
        truncated_miw = tf.where(tf.less(miw, mn_broadcasted), miw, mn_broadcasted)
        mise = tf.reduce_sum(truncated_miw * ep_return * mask_iters_)/n_
    else:
        # MISE
        mise = tf.reduce_sum(miw * ep_return * mask_iters_)/n_
        losses_with_name.append((mise, 'MISE'))

    # Bounds
    if delta_t == 'continuous':
        tau = tf.ceil(n_**(1 / k))
        delta_cst = delta
        delta = 6 * delta / ((np.pi * n_)**2 * (1 + tau**d))
    elif delta_t == 'discrete':
        delta_cst = delta
        delta = 3 * delta / ((np.pi * n_)**2 * grid_size_1d)
    elif delta_t is None:
        grid_size_1d = 100  # TODO: fix this hard-coded default
        delta_cst = delta
        delta = tf.constant(delta)
    else:
        raise NotImplementedError
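    # The schedules above split the overall failure probability delta across
    # iterations (and grid points) via a union bound, e.g.
    # sum_n 6*delta / (pi^2 * n^2 * (1 + tau^d)) <= delta.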
    losses_with_name.append((delta, 'Delta'))

    if bound_type == 'J':
        bound = mise
    elif bound_type == 'max-renyi':
        if truncated_mise:
            const = return_abs_max * (np.sqrt(2) + 1 / 3) \
                * tf.sqrt(tf.log(1 / delta))
            exploration_bonus = const * tf.sqrt(renyi_bound_)
            bound = mise + exploration_bonus
        else:
            const = return_abs_max * tf.sqrt(1 / delta - 1)
            exploration_bonus = const * tf.sqrt(renyi_bound_)
            bound = mise + exploration_bonus
    else:
        raise NotImplementedError
    losses_with_name.append((mise, 'BoundMISE'))
    losses_with_name.append((exploration_bonus, 'BoundBonus'))
    losses_with_name.append((bound, 'Bound'))

    # ESS estimation by d2
    ess_d2 = n_ / renyi_bound_
    # ESS estimation by miw norms
    eps = 1e-18  # for eps<1e-18 miw_2=0 if weights are zero
    miw_ess = (tf.exp(log_ratio) + eps) * mask_iters_
    miw_1 = tf.linalg.norm(miw_ess, ord=1)
    miw_2 = tf.linalg.norm(miw_ess, ord=2)
    ess_miw = miw_1**2 / miw_2**2
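    # Standard effective-sample-size estimate: ESS = ||w||_1^2 / ||w||_2^2.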

    # Infos
    losses, loss_names = map(list, zip(*losses_with_name))

    # TF functions
    set_parameters = U.SetFromFlat(var_list)
    get_parameters = U.GetFlat(var_list)
    set_parameters_old = U.SetFromFlat(var_list_old)
    # set_higher_logstd = U.SetFromFlat(higher_logstd_list)
    # set_higher_logstd(np.log([0.15, 0.2]))

    compute_behav = U.function(
        [actor_params_], behavioral_log_pdf)
    compute_behav_last_sample = U.function(
        [last_actor_param_], behavioral_log_pdf_last_sample)
    compute_renyi = U.function(
        [], renyi_component)
    compute_bound = U.function(
        [actor_params_, disc_ret_, ret_, n_,
         mask_iters_, den_mise_log_, renyi_bound_], bound)
    compute_grad = U.function(
        [actor_params_, disc_ret_, ret_, n_,
         mask_iters_, den_mise_log_, renyi_bound_],
        U.flatgrad(bound, var_list))
    compute_return_mean = U.function(
        [actor_params_, disc_ret_, ret_, n_,
         mask_iters_], return_mean)
    compute_losses = U.function(
        [actor_params_, disc_ret_, ret_, n_,
         mask_iters_, den_mise_log_, renyi_bound_], losses)
    compute_roba = U.function(
        [actor_params_, disc_ret_, ret_, n_,
         mask_iters_, den_mise_log_, renyi_bound_],
        [mise, exploration_bonus, ess_d2, ess_miw])

    # Tf initialization
    U.initialize()

    # Store behaviorals' params and their trajectories
    old_rhos_list = []
    all_eps = {}
    all_eps['actor_params'] = np.zeros(shape=[max_iters, pi._n_actor_weights])
    all_eps['disc_ret'] = np.zeros(max_iters)
    all_eps['ret'] = np.zeros(max_iters)
    mask_iters = np.zeros(max_iters)
    # Set learning loop variables
    den_mise = np.zeros(mask_iters.shape).astype(np.float32)
    if delta_t == 'continuous':
        renyi_components_sum = None
    else:
        renyi_components_sum = np.zeros(grid_size_1d**d)
    new_grid = True
    grid_size_1d_old = 0
    iters_so_far = 0
    lens = []
    tstart = time.time()
    # Sample actor's params before entering the learning loop
    rho = get_parameters()
    theta = pi.resample()
    all_eps['actor_params'][iters_so_far, :] = theta
    # Establish grid dimension if needed
    grid_dimension = ob_space.shape[0]
    # Learning loop ###########################################################
    while True:
        iters_so_far += 1
        mask_iters[:iters_so_far] = 1

        # Render one episode
        if render_after is not None and iters_so_far % render_after == 0:
            if hasattr(env, 'render'):
                render(env, pi, horizon)

        # Exit loop in the end
        if iters_so_far - 1 >= max_iters:
            print('Finished...')
            break

        # Learning iteration
        logger.log('********** Iteration %i ************' % iters_so_far)

        # Generate one trajectory
        with timed('sampling'):
            # Sample a trajectory with the newly parametrized actor
            ret, disc_ret, ep_len = eval_trajectory(
                env, pi, gamma, horizon, feature_fun, rescale_ep_return)
            all_eps['ret'][iters_so_far-1] = ret
            all_eps['disc_ret'][iters_so_far-1] = disc_ret
            lens.append(ep_len)

        # Store the parameters of the behavioral hyperpolicy
        old_rhos_list.append(rho)

        with timed('summaries before'):
            logger.record_tabular("Iteration", iters_so_far)
            logger.record_tabular("NumTrajectories", iters_so_far)
            logger.record_tabular("TimestepsSoFar", np.sum(lens))
            logger.record_tabular('AvgEpLen', np.mean(lens))
            logger.record_tabular('MinEpLen', np.min(lens))
            logger.record_tabular("TimeElapsed", time.time() - tstart)

        # Save policy parameters to disk
        if save_weights:
            logger.record_tabular('Weights', str(get_parameters()))
            import pickle
            with open('checkpoint.pkl', 'wb') as f:
                pickle.dump(rho, f)

        # Tensor evaluations

        def evaluate_behav():
            return compute_behav(all_eps['actor_params'])

        def evaluate_behav_last_sample():
            args_behav_last = [all_eps['actor_params'][iters_so_far - 1]]
            return compute_behav_last_sample(*args_behav_last)

        def evaluate_renyi_component():
            return compute_renyi()

        args = all_eps['actor_params'], all_eps['disc_ret'], \
            all_eps['ret'], iters_so_far, mask_iters

        def evaluate_bound(den_mise_log, renyi_bound):
            args_bound = args + (den_mise_log, renyi_bound, )
            return compute_bound(*args_bound)

        def evaluate_grad(den_mise_log, renyi_bound):
            args_grad = args + (den_mise_log, renyi_bound, )
            return compute_grad(*args_grad)

        def evaluate_roba(den_mise_log, renyi_bound):
            args_roba = args + (den_mise_log, renyi_bound, )
            return compute_roba(*args_roba)

        if bound_type == 'J':
            evaluate_renyi = None
        elif bound_type == 'max-renyi':
            evaluate_renyi = evaluate_renyi_component
        else:
            raise NotImplementedError

        with timed("Optimization"):
            if find_optimal_arm:
                pass
            elif multiple_init:
                bound = 0
                improvement = 0
                check = False
                for i in range(multiple_init):
                    rho_init = [np.arctanh(np.random.uniform(
                        pi.min_mean, pi.max_mean))]
                    rho_i, improvement_i, den_mise_log_i, bound_i = \
                        optimize_offline(evaluate_roba, pi,
                                         rho_init, drho,
                                         old_rhos_list,
                                         iters_so_far,
                                         mask_iters, set_parameters,
                                         set_parameters_old,
                                         evaluate_behav, evaluate_renyi,
                                         evaluate_bound,
                                         evaluate_grad,
                                         max_offline_ite=max_offline_iters)
                    if bound_i > bound:
                        check = True
                        rho = rho_i
                        improvement = improvement_i
                        den_mise_log = den_mise_log_i
                if not check:
                    den_mise_log = den_mise_log_i
            else:
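                # Grid-search branch: best_of_grid evaluates the bound on a
                # discretized grid of hyperpolicy parameters and keeps the
                # maximizer, instead of following the bound's gradient as
                # optimize_offline does.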
                if delta_t == 'continuous':
                    grid_size_1d = int(np.ceil(iters_so_far**(1 / k)))
                    if grid_size_1d > grid_size_1d_old:
                        new_grid = True
                        renyi_components_sum = np.zeros(grid_size_1d**d)
                        grid_size_1d_old = grid_size_1d
                rho, improvement, den_mise_log, den_mise, \
                    renyi_components_sum, renyi_bound = \
                    best_of_grid(pi, grid_size_1d, mu_min, mu_max,
                                 grid_dimension, trainable_std,
                                 rho, old_rhos_list,
                                 iters_so_far, mask_iters,
                                 set_parameters, set_parameters_old,
                                 delta_cst, renyi_components_sum,
                                 evaluate_behav, den_mise,
                                 evaluate_behav_last_sample,
                                 evaluate_bound, evaluate_renyi, evaluate_roba,
                                 filename, plot_bound,
                                 plot_ess_profile, delta_t, new_grid)
                new_grid = False
            set_parameters(rho)

        with timed('summaries after'):
            # Sample actor's parameters from hyperpolicy and assign to actor
            if iters_so_far < max_iters:
                theta = pi.resample()
                all_eps['actor_params'][iters_so_far, :] = theta

            if env.spec is not None:
                if env.spec.id == 'LQG1D-v0':
                    mu1_actor = pi.eval_actor_mean([[1]])[0][0]
                    mu1_higher = pi.eval_higher_mean()[0]
                    sigma = pi.eval_higher_std()[0]
                    logger.record_tabular("LQGmu1_actor", mu1_actor)
                    logger.record_tabular("LQGmu1_higher", mu1_higher)
                    logger.record_tabular("LQGsigma_higher", sigma)
                elif env.spec.id == 'MountainCarContinuous-v0':
                    ac1 = pi.eval_actor_mean([[1, 1]])[0][0]
                    mu1_higher = pi.eval_higher_mean()
                    sigma = pi.eval_higher_std()
                    logger.record_tabular("ActionIn1", ac1)
                    logger.record_tabular("MountainCar_mu0_higher", mu1_higher[0])
                    logger.record_tabular("MountainCar_mu1_higher", mu1_higher[1])
                    logger.record_tabular("MountainCar_std0_higher", sigma[0])
                    logger.record_tabular("MountainCar_std1_higher", sigma[1])
            elif env.id is not None:
                if env.id == 'inverted_pendulum':
                    ac1 = pi.eval_actor_mean([[1, 1, 1, 1]])[0][0]
                    mu1_higher = pi.eval_higher_mean()
                    sigma = pi.eval_higher_std()
                    logger.record_tabular("ActionIn1", ac1)
                    logger.record_tabular("InvPendulum_mu0_higher", mu1_higher[0])
                    logger.record_tabular("InvPendulum_mu1_higher", mu1_higher[1])
                    logger.record_tabular("InvPendulum_mu2_higher", mu1_higher[2])
                    logger.record_tabular("InvPendulum_mu3_higher", mu1_higher[3])
                    logger.record_tabular("InvPendulum_std0_higher", sigma[0])
                    logger.record_tabular("InvPendulum_std1_higher", sigma[1])
                    logger.record_tabular("InvPendulum_std2_higher", sigma[2])
                    logger.record_tabular("InvPendulum_std3_higher", sigma[3])
            if find_optimal_arm:
                ret_mean = compute_return_mean(*args)
                logger.record_tabular('ReturnMean', ret_mean)
            else:
                args_losses = args + (den_mise_log, renyi_bound, )
                meanlosses = np.array(compute_losses(*args_losses))
                for (lossname, lossval) in zip(loss_names, meanlosses):
                    logger.record_tabular(lossname, lossval)

        # Print all info in a table
        logger.dump_tabular()

    # Close environment in the end
    env.close()
Code Example #25
def learn(env, policy_fn, *,
          timesteps_per_batch,  # what to train on
          epsilon, beta, cg_iters,
          gamma, lam,  # advantage estimation
          trial, sess,
          method,
          entcoeff=0.0,
          cg_damping=1e-2,
          kl_target=0.01,
          crosskl_coeff=0.01,
          vf_stepsize=3e-4,
          vf_iters=3,
          max_timesteps=0, max_episodes=0, max_iters=0,  # time constraint
          callback=None,
          TRPO=False
          ):
    nworkers = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()
    np.set_printoptions(precision=3)
    # Setup losses and stuff
    # ----------------------------------------
    total_space = env.total_space
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_fn("pi", ob_space, ac_space, ob_name="ob")
    oldpi = policy_fn("oldpi", ob_space, ac_space, ob_name="ob")

    gpi = policy_fn("gpi", total_space, ac_space, ob_name="gob")
    goldpi = policy_fn("goldpi", total_space, ac_space, ob_name="gob")

    atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return

    gatarg = tf.placeholder(dtype=tf.float32, shape=[None])
    gret = tf.placeholder(dtype=tf.float32, shape=[None])

    ob = U.get_placeholder_cached(name="ob")
    gob = U.get_placeholder_cached(name='gob')
    ac = pi.pdtype.sample_placeholder([None])
    crosskl_c = tf.placeholder(dtype=tf.float32, shape=[])
    # crosskl_c = 0.01


    kloldnew = oldpi.pd.kl(pi.pd)
    gkloldnew = goldpi.pd.kl(gpi.pd)

    # TODO: check whether it can work this way
    # crosskl_ob = pi.pd.kl(goldpi.pd)
    # crosskl_gob = gpi.pd.kl(oldpi.pd)
    crosskl_gob = pi.pd.kl(gpi.pd)
    crosskl_ob = gpi.pd.kl(pi.pd)
    # crosskl
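    # The cross-KL terms couple the guided policy gpi (which sees the full
    # state gob) with the final policy pi (which sees only the observation ob),
    # so the two policies can be kept close during training.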


    pdmean = pi.pd.mean
    pdstd = pi.pd.std
    gpdmean = gpi.pd.mean
    gpdstd = gpi.pd.std

    ent = pi.pd.entropy()
    gent = gpi.pd.entropy()

    old_entropy = oldpi.pd.entropy()
    gold_entropy = goldpi.pd.entropy()

    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    meancrosskl = tf.reduce_mean(crosskl_ob)

    # meancrosskl = tf.maximum(tf.reduce_mean(crosskl_ob - 100), 0)

    gmeankl = tf.reduce_mean(gkloldnew)
    gmeanent = tf.reduce_mean(gent)
    gmeancrosskl = tf.reduce_mean(crosskl_gob)

    vferr = tf.reduce_mean(tf.square(pi.vpred - ret))
    gvferr = tf.reduce_mean(tf.square(gpi.vpred - gret))

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold
    gratio = tf.exp(gpi.pd.logp(ac) - goldpi.pd.logp(ac))

    # Ratio objective
    # surrgain = tf.reduce_mean(ratio * atarg)
    # gsurrgain = tf.reduce_mean(gratio * gatarg)

    # Log objective
    surrgain = tf.reduce_mean(pi.pd.logp(ac) * atarg)
    gsurrgain = tf.reduce_mean(gpi.pd.logp(ac) * gatarg)

    # optimgain = surrgain + crosskl_c * meancrosskl
    optimgain = surrgain
    losses = [optimgain, meankl, meancrosskl, surrgain, meanent, tf.reduce_mean(ratio)]
    loss_names = ["optimgain", "meankl", "meancrosskl", "surrgain", "entropy", "ratio"]

    # goptimgain = gsurrgain + crosskl_c * gmeancrosskl
    goptimgain = gsurrgain

    glosses = [goptimgain, gmeankl, gmeancrosskl, gsurrgain, gmeanent, tf.reduce_mean(gratio)]
    gloss_names = ["goptimgain", "gmeankl","gmeancrosskl", "gsurrgain", "gentropy", "gratio"]

    dist = meankl
    gdist = gmeankl

    all_pi_var_list = pi.get_trainable_variables()
    all_var_list = [v for v in all_pi_var_list if v.name.split("/")[0].startswith("pi")]
    var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")]
    vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")]
    vfadam = MpiAdam(vf_var_list)
    poladam = MpiAdam(var_list)


    gall_gpi_var_list = gpi.get_trainable_variables()
    gall_var_list = [v for v in gall_gpi_var_list if v.name.split("/")[0].startswith("gpi")]
    gvar_list = [v for v in gall_var_list if v.name.split("/")[1].startswith("pol")]
    gvf_var_list = [v for v in gall_var_list if v.name.split("/")[1].startswith("vf")]
    gvfadam = MpiAdam(gvf_var_list)
    # gpoladpam = MpiAdam(gvar_list)


    get_flat = U.GetFlat(var_list)
    set_from_flat = U.SetFromFlat(var_list)
    klgrads = tf.gradients(dist, var_list)
    # crossklgrads = tf.gradients(meancrosskl, var_list)

    flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan")
    shapes = [var.get_shape().as_list() for var in var_list]
    start = 0
    tangents = []
    for shape in shapes:
        sz = U.intprod(shape)
        tangents.append(tf.reshape(flat_tangent[start:start+sz], shape))
        start += sz
    gvp = tf.add_n([tf.reduce_sum(g*tangent) for (g, tangent) in zipsame(klgrads, tangents)]) #pylint: disable=E1111
    fvp = U.flatgrad(gvp, var_list)


    gget_flat = U.GetFlat(gvar_list)
    gset_from_flat = U.SetFromFlat(gvar_list)
    gklgrads = tf.gradients(gdist, gvar_list)
    # gcrossklgrads = tf.gradients(gmeancrosskl, gvar_list)

    gflat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="gflat_tan")
    gshapes = [var.get_shape().as_list() for var in gvar_list]
    gstart = 0
    gtangents = []
    for shape in gshapes:
        sz = U.intprod(shape)
        gtangents.append(tf.reshape(gflat_tangent[gstart:gstart+sz], shape))
        gstart += sz
    ggvp = tf.add_n([tf.reduce_sum(g*tangent) for (g, tangent) in zipsame(gklgrads, gtangents)]) #pylint: disable=E1111
    gfvp = U.flatgrad(ggvp, gvar_list)


    assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv)
        for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())])

    gassign_old_eq_new = U.function([], [], updates=[tf.assign(oldv, newv)
        for (oldv, newv) in zipsame(goldpi.get_variables(), gpi.get_variables())])

    compute_losses = U.function([crosskl_c, gob, ob, ac, atarg], losses)
    compute_lossandgrad = U.function([crosskl_c, gob, ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)])
    compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
    compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list))
    compute_crossklandgrad = U.function([ob, gob],U.flatgrad(meancrosskl, var_list))

    gcompute_losses = U.function([crosskl_c, ob, gob, ac, gatarg], glosses)
    gcompute_lossandgrad = U.function([crosskl_c, ob, gob, ac, gatarg], glosses + [U.flatgrad(goptimgain, gvar_list)])
    gcompute_fvp = U.function([gflat_tangent, gob, ac, gatarg], gfvp)
    gcompute_vflossandgrad = U.function([gob, gret], U.flatgrad(gvferr, gvf_var_list))
    # compute_gcrossklandgrad = U.function([gob, ob], U.flatgrad(gmeancrosskl, gvar_list))

    saver = tf.train.Saver()

    @contextmanager
    def timed(msg):
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(colorize("done in %.3f seconds"%(time.time() - tstart), color='magenta'))
        else:
            yield

    def allmean(x):
        assert isinstance(x, np.ndarray)
        out = np.empty_like(x)
        MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
        out /= nworkers
        return out

    U.initialize()

    guided_initilizer(gpol=gvar_list, gvf=gvf_var_list, fpol=var_list, fvf=vf_var_list)

    th_init = get_flat()
    MPI.COMM_WORLD.Bcast(th_init, root=0)
    set_from_flat(th_init)
    vfadam.sync()
    poladam.sync()
    print("Init final policy param sum", th_init.sum(), flush=True)

    gth_init = gget_flat()
    MPI.COMM_WORLD.Bcast(gth_init, root=0)
    gset_from_flat(gth_init)
    gvfadam.sync()
    # gpoladpam.sync()
    print("Init guided policy param sum", gth_init.sum(), flush=True)

    # Initialize eta, omega optimizer
    init_eta = 0.5
    init_omega = 2.0
    eta_omega_optimizer = EtaOmegaOptimizer(beta, epsilon, init_eta, init_omega)



    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi, gpi, env, timesteps_per_batch, stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    num_iters = max_timesteps // timesteps_per_batch
    lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards

    assert sum([max_iters>0, max_timesteps>0, max_episodes>0])==1

    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        logger.log("********** Iteration %i ************"%iters_so_far)

        with timed("sampling"):
            seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
        gob, gatarg, gtdlamret = seg["gob"], seg["gadv"], seg["gtdlamret"]


        vpredbefore = seg["vpred"] # predicted value function before udpate
        gvpredbefore = seg["gvpred"]

        atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate
        gatarg = (gatarg - gatarg.mean()) / gatarg.std()

        if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret)
        if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy

        if hasattr(gpi, "ret_rms"): gpi.ret_rms.update(gtdlamret)
        if hasattr(gpi, "ob_rms"): gpi.ob_rms.update(gob)

        args = crosskl_coeff, seg["gob"], seg["ob"], seg["ac"], atarg
        fvpargs = [arr[::5] for arr in args[2:]]

        gargs = crosskl_coeff, seg["ob"], seg["gob"], seg["ac"], gatarg
        gfvpargs = [arr[::5] for arr in gargs[2:]]

        def fisher_vector_product(p):
            return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p

        def gfisher_vector_product(p):
            return allmean(gcompute_fvp(p, *gfvpargs)) + cg_damping * p

        assign_old_eq_new() # set old parameter values to new parameter values
        gassign_old_eq_new()

        with timed("computegrad"):
            *lossbefore, g = compute_lossandgrad(*args)
            *glossbefore, gg = gcompute_lossandgrad(*gargs)

        lossbefore = allmean(np.array(lossbefore))
        g = allmean(g)

        glossbefore = allmean(np.array(glossbefore))
        gg = allmean(gg)

        if np.allclose(g, 0) or np.allclose(gg, 0):
            logger.log("Got zero gradient. not updating")
        else:
            with timed("cg"):
                stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank==0)
                gstepdir = cg(gfisher_vector_product, gg, cg_iters=cg_iters, verbose=rank==0)
            assert np.isfinite(gstepdir).all()
            assert np.isfinite(stepdir).all()


            if TRPO:
                #
                # TRPO specific code.
                # Find correct step size using line search
                #
                #TODO: also enable guided learning for TRPO
                shs = .5*stepdir.dot(fisher_vector_product(stepdir))
                lm = np.sqrt(shs / epsilon)
                # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
                fullstep = stepdir / lm
                expectedimprove = g.dot(fullstep)
                surrbefore = lossbefore[0]
                stepsize = 1.0
                thbefore = get_flat()
                for _ in range(10):
                    thnew = thbefore + fullstep * stepsize
                    set_from_flat(thnew)
                    meanlosses = surr, kl, *_ = allmean(np.array(compute_losses(*args)))
                    improve = surr - surrbefore
                    logger.log("Expected: %.3f Actual: %.3f"%(expectedimprove, improve))
                    if not np.isfinite(meanlosses).all():
                        logger.log("Got non-finite value of losses -- bad!")
                    elif kl > epsilon * 1.5:
                        logger.log("violated KL constraint. shrinking step.")
                    elif improve < 0:
                        logger.log("surrogate didn't improve. shrinking step.")
                    else:
                        logger.log("Stepsize OK!")
                        break
                    stepsize *= .5
                else:
                    logger.log("couldn't compute a good step")
                    set_from_flat(thbefore)
            else:
                #
                # COPOS specific implementation.
                #

                copos_update_dir = stepdir
                gcopos_update_dir = gstepdir

                # Split direction into log-linear 'w_theta' and non-linear 'w_beta' parts
                w_theta, w_beta = pi.split_w(copos_update_dir)
                gw_theta, gw_beta = gpi.split_w(gcopos_update_dir)

                # q_beta(s,a) = \grad_beta \log \pi(a|s) * w_beta
                #             = features_beta(s) * K^T * Prec * a
                # q_beta = self.target.get_q_beta(features_beta, actions)

                Waa, Wsa = pi.w2W(w_theta)
                wa = pi.get_wa(ob, w_beta)

                gWaa, gWsa = gpi.w2W(gw_theta)
                gwa = gpi.get_wa(gob, gw_beta)

                varphis = pi.get_varphis(ob)
                gvarphis = gpi.get_varphis(gob)

                # Optimize eta and omega
                tmp_ob = np.zeros((1,) + ob_space.shape) # We assume that entropy does not depend on the NN
                old_ent = old_entropy.eval({oldpi.ob: tmp_ob})[0]
                eta, omega = eta_omega_optimizer.optimize(w_theta, Waa, Wsa, wa, varphis, pi.get_kt(),
                                                          pi.get_prec_matrix(), pi.is_new_policy_valid, old_ent)
                logger.log("Initial eta of final policy: " + str(eta) + " and omega: " + str(omega))

                gtmp_ob = np.zeros((1,) + total_space.shape)
                gold_ent = gold_entropy.eval({goldpi.ob: gtmp_ob})[0]
                geta, gomega = eta_omega_optimizer.optimize(gw_theta, gWaa, gWsa, gwa, gvarphis, gpi.get_kt(),
                                                            gpi.get_prec_matrix(), gpi.is_new_policy_valid, gold_ent)
                logger.log("Initial eta of guided policy: " + str(geta) + " and omega: " + str(gomega))

                current_theta_beta = get_flat()
                prev_theta, prev_beta = pi.all_to_theta_beta(current_theta_beta)

                gcurrent_theta_beta = gget_flat()
                gprev_theta, gprev_beta = gpi.all_to_theta_beta(gcurrent_theta_beta)

                for i in range(2):
                    # Do a line search for both theta and beta parameters by adjusting only eta
                    eta = eta_search(w_theta, w_beta, eta, omega, allmean, compute_losses, get_flat, set_from_flat, pi,
                                     epsilon, args)
                    logger.log("Updated eta of final policy, eta: " + str(eta) + " and omega: " + str(omega))

                    # Find proper omega for new eta. Use old policy parameters first.
                    set_from_flat(pi.theta_beta_to_all(prev_theta, prev_beta))
                    eta, omega = \
                        eta_omega_optimizer.optimize(w_theta, Waa, Wsa, wa, varphis, pi.get_kt(),
                                                     pi.get_prec_matrix(), pi.is_new_policy_valid, old_ent, eta)
                    logger.log("Updated omega of final policy, eta: " + str(eta) + " and omega: " + str(omega))

                    geta = eta_search(gw_theta, gw_beta, geta, gomega, allmean, gcompute_losses, gget_flat,
                                      gset_from_flat, gpi, epsilon, gargs)
                    logger.log("updated eta of guided policy, eta:" + str(geta) + "and omega:" + str(gomega))

                    gset_from_flat(gpi.theta_beta_to_all(gprev_theta, gprev_beta))
                    geta, gomega = eta_omega_optimizer.optimize(gw_theta, gWaa, gWsa, gwa, gvarphis,
                                    gpi.get_kt(), gpi.get_prec_matrix(), gpi.is_new_policy_valid, gold_ent, geta)
                    logger.log("Updated omega of guided policy, eta:" + str(geta) + "and omega:" + str(gomega))

                # Use final policy
                logger.log("Final eta of final policy: " + str(eta) + " and omega: " + str(omega))
                logger.log("Final eta of guided policy: " + str(geta) + "and omega:" + str(gomega))

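                # COPOS closed-form update: the log-linear part interpolates the
                # natural parameters, theta <- (eta*theta_old + w_theta) / (eta + omega),
                # while the non-linear part steps as beta <- beta_old + w_beta / eta.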
                cur_theta = (eta * prev_theta + w_theta.reshape(-1, )) / (eta + omega)
                cur_beta = prev_beta + w_beta.reshape(-1, ) / eta
                set_from_flat(pi.theta_beta_to_all(cur_theta, cur_beta))

                gcur_theta = (geta * gprev_theta + gw_theta.reshape(-1, )) / (geta + gomega)
                gcur_beta = gprev_beta + gw_beta.reshape(-1, ) / geta
                gset_from_flat(gpi.theta_beta_to_all(gcur_theta, gcur_beta))

                meanlosses = surr, kl, crosskl, *_ = allmean(np.array(compute_losses(*args)))
                gmeanlosses = gsurr, gkl, gcrosskl, *_ = allmean(np.array(gcompute_losses(*gargs)))

                # poladam.update(allmean(compute_crossklandgrad(ob, gob)), vf_stepsize)
                # gpoladpam.update(allmean(compute_gcrossklandgrad(gob, ob)), vf_stepsize)

                for _ in range(vf_iters):
                    for (mbob, mbgob) in dataset.iterbatches((seg["ob"], seg["gob"]),
                        include_final_partial_batch=False, batch_size=64):
                        g = allmean(compute_crossklandgrad(mbob, mbgob))
                        poladam.update(g, vf_stepsize)


        for (lossname, lossval) in zip(loss_names, meanlosses):
            logger.record_tabular(lossname, lossval)

        for (lossname, lossval) in zip(gloss_names, gmeanlosses):
            logger.record_tabular(lossname, lossval)

        with timed("vf"):
            for _ in range(vf_iters):
                for (mbob, mbret) in dataset.iterbatches((seg["ob"], seg["tdlamret"]),
                include_final_partial_batch=False, batch_size=64):
                    g = allmean(compute_vflossandgrad(mbob, mbret))
                    vfadam.update(g, vf_stepsize)
                for (mbob, mbret) in dataset.iterbatches((seg["gob"], seg["gtdlamret"]),
                include_final_partial_batch=False, batch_size=64):
                    gg = allmean(gcompute_vflossandgrad(mbob, mbret))
                    gvfadam.update(gg, vf_stepsize)

        logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret))
        logger.record_tabular("gev_tdlam_before", explained_variance(gvpredbefore, gtdlamret))

        lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)

        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        logger.record_tabular("CrossKLCoeff :", crosskl_coeff)
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1

        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        logger.record_tabular("Name", method)
        logger.record_tabular("Iteration", iters_so_far)
        logger.record_tabular("trial", trial)

        if rank==0:
            logger.dump_tabular()

        if iters_so_far % 100 == 0 or iters_so_far == 1 or iters_so_far == num_iters:
            # sess = tf.get_default_session()
            checkdir = get_dir(osp.join(logger.get_dir(), 'checkpoints'))
            savepath = osp.join(checkdir, '%.5i.ckpt'%iters_so_far)
            saver.save(sess, save_path=savepath)
            print("save model to path:", savepath)
Code Example #26
File: hoof_no_uvfa_npg.py  Project: supratikp/HOOF
def run_hoof_no_lamgam(
        network,
        env,
        total_timesteps,
        timesteps_per_batch,  # what to train on
        kl_range,
        gamma_range,
        lam_range,  # advantage estimation
        num_kl,
        num_gamma_lam,
        cg_iters=10,
        seed=None,
        ent_coef=0.0,
        cg_damping=1e-2,
        vf_stepsize=3e-4,
        vf_iters=3,
        max_episodes=0,
        max_iters=0,  # time constraint
        callback=None,
        load_path=None,
        **network_kwargs):
    '''
    learn a policy function with TRPO algorithm
    Parameters:
    ----------
    network                 neural network to learn. Can be either string ('mlp', 'cnn', 'lstm', 'lnlstm' for basic types)
                            or function that takes input placeholder and returns tuple (output, None) for feedforward nets
                            or (output, (state_placeholder, state_output, mask_placeholder)) for recurrent nets
    env                     environment (one of the gym environments, or wrapped via a baselines.common.vec_env.VecEnv-type class)
    timesteps_per_batch     timesteps per gradient estimation batch
    kl_range                range of max KL divergence between old and new policy ( KL(pi_old || pi) ); candidate radii are sampled from this range each iteration
    ent_coef                coefficient of policy entropy term in the optimization objective
    cg_iters                number of iterations of conjugate gradient algorithm
    cg_damping              conjugate gradient damping
    vf_stepsize             learning rate for the Adam optimizer used to optimize the value function loss
    vf_iters                number of iterations of value function optimization iterations per each policy optimization step
    total_timesteps           max number of timesteps
    max_episodes            max number of episodes
    max_iters               maximum number of policy optimization iterations
    callback                function to be called with (locals(), globals()) each policy optimization step
    load_path               str, path to load the model from (default: None, i.e. no model is loaded)
    **network_kwargs        keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
    Returns:
    -------
    learnt model
    '''

    MPI = None
    nworkers = 1
    rank = 0

    cpus_per_worker = 1
    U.get_session(
        config=tf.ConfigProto(allow_soft_placement=True,
                              inter_op_parallelism_threads=cpus_per_worker,
                              intra_op_parallelism_threads=cpus_per_worker))

    policy = build_policy(env, network, value_network='copy', **network_kwargs)
    set_global_seeds(seed)

    np.set_printoptions(precision=3)
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space

    # +2 for gamma, lambda
    ob = tf.placeholder(shape=(None, env.observation_space.shape[0] + 2),
                        dtype=env.observation_space.dtype,
                        name='Ob')
    with tf.variable_scope("pi"):
        pi = policy(observ_placeholder=ob)
    with tf.variable_scope("oldpi"):
        oldpi = policy(observ_placeholder=ob)

    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    entbonus = ent_coef * meanent

    vferr = tf.reduce_mean(tf.square(pi.vf - ret))

    ratio = tf.exp(pi.pd.logp(ac) -
                   oldpi.pd.logp(ac))  # advantage * pnew / pold
    surrgain = tf.reduce_mean(ratio * atarg)

    optimgain = surrgain + entbonus
    losses = [optimgain, meankl, entbonus, surrgain, meanent]
    loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]

    dist = meankl

    all_var_list = get_trainable_variables("pi")
    var_list = get_pi_trainable_variables("pi")
    vf_var_list = get_vf_trainable_variables("pi")

    vfadam = MpiAdam(vf_var_list)

    get_flat = U.GetFlat(var_list)
    set_from_flat = U.SetFromFlat(var_list)
    klgrads = tf.gradients(dist, var_list)
    flat_tangent = tf.placeholder(dtype=tf.float32,
                                  shape=[None],
                                  name="flat_tan")
    shapes = [var.get_shape().as_list() for var in var_list]
    start = 0
    tangents = []
    for shape in shapes:
        sz = U.intprod(shape)
        tangents.append(tf.reshape(flat_tangent[start:start + sz], shape))
        start += sz
    gvp = tf.add_n([
        tf.reduce_sum(g * tangent)
        for (g, tangent) in zipsame(klgrads, tangents)
    ])  #pylint: disable=E1111
    fvp = U.flatgrad(gvp, var_list)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(get_variables("oldpi"), get_variables("pi"))
        ])

    compute_ratio = U.function(
        [ob, ac, atarg], ratio)  # IS ratio - used for computing IS weights

    compute_losses = U.function([ob, ac, atarg], losses)
    compute_lossandgrad = U.function([ob, ac, atarg], losses +
                                     [U.flatgrad(optimgain, var_list)])
    compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
    compute_vflossandgrad = U.function([ob, ret],
                                       U.flatgrad(vferr, vf_var_list))

    @contextmanager
    def timed(msg):
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(
                colorize("done in %.3f seconds" % (time.time() - tstart),
                         color='magenta'))
        else:
            yield

    def allmean(x):
        assert isinstance(x, np.ndarray)
        if MPI is not None:
            out = np.empty_like(x)
            MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
            out /= nworkers
        else:
            out = np.copy(x)

        return out

    U.initialize()
    if load_path is not None:
        pi.load(load_path)

    th_init = get_flat()
    if MPI is not None:
        MPI.COMM_WORLD.Bcast(th_init, root=0)

    set_from_flat(th_init)
    vfadam.sync()
    print("Init param sum", th_init.sum(), flush=True)

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator_with_gl(pi,
                                             env,
                                             timesteps_per_batch,
                                             stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards

    if sum([max_iters > 0, total_timesteps > 0, max_episodes > 0]) == 0:
        # nothing to be done
        return pi

    assert sum([max_iters>0, total_timesteps>0, max_episodes>0]) < 2, \
        'out of max_iters, total_timesteps, and max_episodes only one should be specified'

    kl_range = np.atleast_1d(kl_range)
    gamma_range = np.atleast_1d(gamma_range)
    lam_range = np.atleast_1d(lam_range)

    while True:
        if callback: callback(locals(), globals())
        if total_timesteps and timesteps_so_far >= total_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        logger.log("********** Iteration %i ************" % iters_so_far)

        with timed("sampling"):
            seg = seg_gen.__next__()

        thbefore = get_flat()

        rand_gamma = gamma_range[0] + (
            gamma_range[-1] - gamma_range[0]) * np.random.rand(num_gamma_lam)
        rand_lam = lam_range[0] + (
            lam_range[-1] - lam_range[0]) * np.random.rand(num_gamma_lam)
        rand_kl = kl_range[0] + (kl_range[-1] -
                                 kl_range[0]) * np.random.rand(num_kl)
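        # HOOF: sample candidate (gamma, lam) pairs and trust-region radii
        # uniformly from their ranges; each candidate update is scored
        # off-policy below and the best-scoring policy is kept.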

        opt_polval = -10**8
        est_polval = np.zeros((num_gamma_lam, num_kl))
        ob_lam_gam = []
        tdlamret = []
        vpred = []

        for gl in range(num_gamma_lam):
            oblg, vpredbefore, atarg, tdlr = add_vtarg_and_adv_without_gl(
                pi, seg, rand_gamma[gl], rand_lam[gl])

            ob_lam_gam += [oblg]
            tdlamret += [tdlr]
            vpred += [vpredbefore]
            atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate

            pol_ob = np.concatenate(
                (seg['ob'], np.zeros(seg['ob'].shape[:-1] + (2, ))), axis=-1)
            args = pol_ob, seg["ac"], atarg
            fvpargs = [arr[::5] for arr in args]

            def fisher_vector_product(p):
                return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p

            assign_old_eq_new()  # set old parameter values to new parameter values
            with timed("computegrad"):
                *lossbefore, g = compute_lossandgrad(*args)
            lossbefore = allmean(np.array(lossbefore))
            g = allmean(g)
            if np.allclose(g, 0):
                logger.log("Got zero gradient. not updating")
            else:
                with timed("cg"):
                    stepdir = cg(fisher_vector_product,
                                 g,
                                 cg_iters=cg_iters,
                                 verbose=False)
                assert np.isfinite(stepdir).all()
                shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
                surrbefore = lossbefore[0]

                for m, kl in enumerate(rand_kl):
                    lm = np.sqrt(shs / kl)
                    fullstep = stepdir / lm
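                    # Since shs = 0.5 * stepdir^T H stepdir, dividing by
                    # lm = sqrt(shs / kl) scales the step so that the quadratic
                    # KL estimate 0.5 * fullstep^T H fullstep equals this kl.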
                    thnew = thbefore + fullstep
                    set_from_flat(thnew)

                    # compute the IS estimates
                    lik_ratio = compute_ratio(*args)
                    est_polval[gl, m] = wis_estimate(seg, lik_ratio)
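                    # wis_estimate gives a weighted importance sampling estimate
                    # of the candidate policy's value on the sampled segment, so
                    # (gamma, lam, kl) candidates are ranked without new rollouts.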

                    # update best policy found so far
                    if est_polval[gl, m] > opt_polval:
                        opt_polval = est_polval[gl, m]
                        opt_th = thnew
                        opt_kl = kl
                        opt_gamma = rand_gamma[gl]
                        opt_lam = rand_lam[gl]
                        opt_vpredbefore = vpredbefore
                        opt_tdlr = tdlr
                        meanlosses = surr, kl, *_ = allmean(
                            np.array(compute_losses(*args)))
                        improve = surr - surrbefore
                        expectedimprove = g.dot(fullstep)
                    set_from_flat(thbefore)
        logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve))
        set_from_flat(opt_th)

        for (lossname, lossval) in zip(loss_names, meanlosses):
            logger.record_tabular(lossname, lossval)

        ob_lam_gam = np.concatenate(ob_lam_gam, axis=0)
        tdlamret = np.concatenate(tdlamret, axis=0)
        vpred = np.concatenate(vpred, axis=0)
        with timed("vf"):
            for _ in range(vf_iters):
                for (mbob, mbret) in dataset.iterbatches(
                    (ob_lam_gam, tdlamret),
                        include_final_partial_batch=False,
                        batch_size=num_gamma_lam * 64):
                    g = allmean(compute_vflossandgrad(mbob, mbret))
                    vfadam.update(g, vf_stepsize)

        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpred, tdlamret))

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        if MPI is not None:
            listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        else:
            listoflrpairs = [lrlocal]

        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)

        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1

        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        logger.record_tabular("Opt_KL", opt_kl)
        logger.record_tabular("gamma", opt_gamma)
        logger.record_tabular("lam", opt_lam)

        if rank == 0:
            logger.dump_tabular()

    return pi
Code example #27
def learn(
        env,
        make_policy,
        *,
        n_episodes,
        horizon,
        delta,
        gamma,
        max_iters,
        sampler=None,
        use_natural_gradient=False,  # can be 'exact' or 'approximate'
        fisher_reg=1e-2,
        iw_method='is',
        iw_norm='none',
        bound='J',
        line_search_type='parabola',
        save_weights=0,
        improvement_tol=0.,
        center_return=False,
        render_after=None,
        max_offline_iters=100,
        callback=None,
        clipping=False,
        entropy='none',
        positive_return=False,
        reward_clustering='none',
        capacity=10):

    np.set_printoptions(precision=3)
    max_samples = horizon * n_episodes

    if line_search_type == 'binary':
        line_search = line_search_binary
    elif line_search_type == 'parabola':
        line_search = line_search_parabola
    else:
        raise ValueError()

    # Building the environment
    ob_space = env.observation_space
    ac_space = env.action_space

    # Creating the memory buffer
    memory = Memory(capacity=capacity,
                    batch_size=n_episodes,
                    horizon=horizon,
                    ob_space=ob_space,
                    ac_space=ac_space)

    # Building the target policy and saving its parameters
    pi = make_policy('pi', ob_space, ac_space)
    all_var_list = pi.get_trainable_variables()
    var_list = [
        v for v in all_var_list if v.name.split('/')[1].startswith('pol')
    ]
    shapes = [U.intprod(var.get_shape().as_list()) for var in var_list]
    n_parameters = sum(shapes)

    # Building a set of behavioral policies
    behavioral_policies = memory.build_policies(make_policy, pi)

    # Placeholders
    ob_ = ob = U.get_placeholder_cached(name='ob')
    ac_ = pi.pdtype.sample_placeholder([None], name='ac')
    mask_ = tf.placeholder(dtype=tf.float32, shape=(None), name='mask')
    rew_ = tf.placeholder(dtype=tf.float32, shape=(None), name='rew')
    disc_rew_ = tf.placeholder(dtype=tf.float32, shape=(None), name='disc_rew')
    clustered_rew_ = tf.placeholder(dtype=tf.float32, shape=(None))
    gradient_ = tf.placeholder(dtype=tf.float32,
                               shape=(n_parameters, 1),
                               name='gradient')
    iter_number_ = tf.placeholder(dtype=tf.int32, name='iter_number')
    active_policies = tf.placeholder(dtype=tf.float32,
                                     shape=(capacity),
                                     name='active_policies')
    losses_with_name = []

    # Total number of trajectories
    N_total = tf.reduce_sum(active_policies) * n_episodes

    # Split operations
    disc_rew_split = tf.reshape(disc_rew_ * mask_, [-1, horizon])
    rew_split = tf.reshape(rew_ * mask_, [-1, horizon])
    mask_split = tf.reshape(mask_, [-1, horizon])

    # Policy densities
    target_log_pdf = pi.pd.logp(ac_) * mask_
    target_log_pdf_split = tf.reshape(target_log_pdf, [-1, horizon])
    behavioral_log_pdfs = tf.stack([
        bpi.pd.logp(ac_) * mask_ for bpi in memory.policies
    ])  # Shape is (capacity, ntraj*horizon)
    behavioral_log_pdfs_split = tf.reshape(behavioral_log_pdfs,
                                           [memory.capacity, -1, horizon])

    # Compute Renyi divergences, sum over time, then exponentiate
    emp_d2_split = tf.reshape(
        tf.stack([pi.pd.renyi(bpi.pd, 2) * mask_ for bpi in memory.policies]),
        [memory.capacity, -1, horizon])
    emp_d2_split_cum = tf.exp(tf.reduce_sum(emp_d2_split, axis=2))
    # Compute arithmetic and harmonic mean of emp_d2
    emp_d2_mean = tf.reduce_mean(emp_d2_split_cum, axis=1)
    emp_d2_arithmetic = tf.reduce_sum(
        emp_d2_mean * active_policies) / tf.reduce_sum(active_policies)
    emp_d2_harmonic = tf.reduce_sum(active_policies) / tf.reduce_sum(
        1 / emp_d2_mean)

    # Return processing: clipping, centering, discounting
    ep_return = clustered_rew_  #tf.reduce_sum(mask_split * disc_rew_split, axis=1)
    if clipping:
        rew_split = tf.clip_by_value(rew_split, -1, 1)
    if center_return:
        ep_return = ep_return - tf.reduce_mean(ep_return)
        rew_split = rew_split - (tf.reduce_sum(rew_split) /
                                 (tf.reduce_sum(mask_split) + 1e-24))
    discounter = [pow(gamma, i) for i in range(0, horizon)]  # Decreasing gamma
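    # e.g. gamma = 0.99, horizon = 3  ->  discounter = [1.0, 0.99, 0.9801]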
    discounter_tf = tf.constant(discounter)
    disc_rew_split = rew_split * discounter_tf

    # Reward statistics
    return_mean = tf.reduce_mean(ep_return)
    return_std = U.reduce_std(ep_return)
    return_max = tf.reduce_max(ep_return)
    return_min = tf.reduce_min(ep_return)
    return_abs_max = tf.reduce_max(tf.abs(ep_return))
    return_step_max = tf.reduce_max(tf.abs(rew_split))  # Max step reward
    return_step_mean = tf.abs(tf.reduce_mean(rew_split))
    positive_step_return_max = tf.maximum(0.0, tf.reduce_max(rew_split))
    negative_step_return_max = tf.maximum(0.0, tf.reduce_max(-rew_split))
    return_step_maxmin = tf.abs(positive_step_return_max -
                                negative_step_return_max)
    losses_with_name.extend([(return_mean, 'InitialReturnMean'),
                             (return_max, 'InitialReturnMax'),
                             (return_min, 'InitialReturnMin'),
                             (return_std, 'InitialReturnStd'),
                             (emp_d2_arithmetic, 'EmpiricalD2Arithmetic'),
                             (emp_d2_harmonic, 'EmpiricalD2Harmonic'),
                             (return_step_max, 'ReturnStepMax'),
                             (return_step_maxmin, 'ReturnStepMaxmin')])

    if iw_method == 'is':
        # Sum the log prob over time. Shapes: target(Nep, H), behav (Cap, Nep, H)
        target_log_pdf_episode = tf.reduce_sum(target_log_pdf_split, axis=1)
        behavioral_log_pdf_episode = tf.reduce_sum(behavioral_log_pdfs_split,
                                                   axis=2)
        # To avoid numerical instability, compute the inverse ratio
        log_inverse_ratio = behavioral_log_pdf_episode - target_log_pdf_episode
        inverse_ratio = tf.exp(log_inverse_ratio) * tf.expand_dims(
            active_policies, -1)
        iw = 1 / tf.reduce_sum(inverse_ratio, axis=0)

        # Get the probability by exponentiation
        #target_pdf_episode = tf.exp(target_log_pdf_episode)
        #behavioral_pdf_episode = tf.exp(behavioral_log_pdf_episode)
        # Get the denominator by averaging over behavioral policies
        #behavioral_pdf_mixture = tf.reduce_mean(behavioral_pdf_episode, axis=0) + 1e-24
        #iw = target_pdf_episode / behavioral_pdf_mixture
        iwn = iw / n_episodes

        # Compute the J
        w_return_mean = tf.reduce_sum(ep_return * iwn)
        # Empirical D2 of the mixture and relative ESS
        ess_renyi_arithmetic = N_total / emp_d2_arithmetic
        ess_renyi_harmonic = N_total / emp_d2_harmonic
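        # The Renyi effective sample size says roughly how many on-policy
        # trajectories the reweighted batch is worth; it penalizes the bound below.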
        # Log quantities
        losses_with_name.extend([
            (tf.reduce_max(iw), 'MaxIW'), (tf.reduce_min(iw), 'MinIW'),
            (tf.reduce_mean(iw), 'MeanIW'), (U.reduce_std(iw), 'StdIW'),
            (tf.reduce_min(target_log_pdf_episode), 'MinTargetPdf'),
            (tf.reduce_min(behavioral_log_pdf_episode), 'MinBehavPdf'),
            (ess_renyi_arithmetic, 'ESSRenyiArithmetic'),
            (ess_renyi_harmonic, 'ESSRenyiHarmonic')
        ])
    else:
        raise NotImplementedError()

    if bound == 'J':
        bound_ = w_return_mean
    elif bound == 'max-d2-harmonic':
        bound_ = w_return_mean - tf.sqrt(
            (1 - delta) / (delta * ess_renyi_harmonic)) * return_abs_max
    elif bound == 'max-d2-arithmetic':
        bound_ = w_return_mean - tf.sqrt(
            (1 - delta) / (delta * ess_renyi_arithmetic)) * return_abs_max
    else:
        raise NotImplementedError()

    # Policy entropy for exploration
    ent = pi.pd.entropy()
    meanent = tf.reduce_mean(ent)
    losses_with_name.append((meanent, 'MeanEntropy'))
    # Add policy entropy bonus
    if entropy != 'none':
        scheme, v1, v2 = entropy.split(':')
        if scheme == 'step':
            entcoeff = tf.cond(iter_number_ < int(v2), lambda: float(v1),
                               lambda: float(0.0))
            losses_with_name.append((entcoeff, 'EntropyCoefficient'))
            entbonus = entcoeff * meanent
            bound_ = bound_ + entbonus
        elif scheme == 'lin':
            ip = tf.cast(iter_number_ / max_iters, tf.float32)
            entcoeff_decay = tf.maximum(
                0.0,
                float(v2) + (float(v1) - float(v2)) * (1.0 - ip))
            losses_with_name.append((entcoeff_decay, 'EntropyCoefficient'))
            entbonus = entcoeff_decay * meanent
            bound_ = bound_ + entbonus
        elif scheme == 'exp':
            ent_f = tf.exp(
                -tf.abs(tf.reduce_mean(iw) - 1) * float(v2)) * float(v1)
            losses_with_name.append((ent_f, 'EntropyCoefficient'))
            bound_ = bound_ + ent_f * meanent
        else:
            raise Exception('Unrecognized entropy scheme.')

    losses_with_name.append((w_return_mean, 'ReturnMeanIW'))
    losses_with_name.append((bound_, 'Bound'))
    losses, loss_names = map(list, zip(*losses_with_name))
    '''
    if use_natural_gradient:
        p = tf.placeholder(dtype=tf.float32, shape=[None])
        target_logpdf_episode = tf.reduce_sum(target_log_pdf_split * mask_split, axis=1)
        grad_logprob = U.flatgrad(tf.stop_gradient(iwn) * target_logpdf_episode, var_list)
        dot_product = tf.reduce_sum(grad_logprob * p)
        hess_logprob = U.flatgrad(dot_product, var_list)
        compute_linear_operator = U.function([p, ob_, ac_, disc_rew_, mask_], [-hess_logprob])
    '''
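    # NOTE: with the block above commented out, use_natural_gradient=True fails
    # at runtime, since compute_linear_operator is never defined.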

    assert_ops = tf.group(*tf.get_collection('asserts'))
    print_ops = tf.group(*tf.get_collection('prints'))

    compute_lossandgrad = U.function([
        ob_, ac_, rew_, disc_rew_, clustered_rew_, mask_, iter_number_,
        active_policies
    ], losses + [U.flatgrad(bound_, var_list), assert_ops, print_ops])
    compute_grad = U.function([
        ob_, ac_, rew_, disc_rew_, clustered_rew_, mask_, iter_number_,
        active_policies
    ], [U.flatgrad(bound_, var_list), assert_ops, print_ops])
    compute_bound = U.function([
        ob_, ac_, rew_, disc_rew_, clustered_rew_, mask_, iter_number_,
        active_policies
    ], [bound_, assert_ops, print_ops])
    compute_losses = U.function([
        ob_, ac_, rew_, disc_rew_, clustered_rew_, mask_, iter_number_,
        active_policies
    ], losses)
    #compute_temp = U.function([ob_, ac_, rew_, disc_rew_, clustered_rew_, mask_, iter_number_, active_policies], [log_inverse_ratio, inverse_ratio, iw])

    set_parameter = U.SetFromFlat(var_list)
    get_parameter = U.GetFlat(var_list)

    if sampler is None:
        seg_gen = traj_segment_generator(pi,
                                         env,
                                         n_episodes,
                                         horizon,
                                         stochastic=True,
                                         gamma=gamma)
        sampler = type("SequentialSampler", (object, ), {
            "collect": lambda self, _: seg_gen.__next__()
        })()
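        # Fallback sampler: a minimal object whose collect() ignores the passed
        # parameters and simply draws the next on-policy segment.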

    U.initialize()

    # Start optimization
    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=n_episodes)
    rewbuffer = deque(maxlen=n_episodes)

    while True:

        iters_so_far += 1

        if render_after is not None and iters_so_far % render_after == 0:
            if hasattr(env, 'render'):
                render(env, pi, horizon)

        if callback:
            callback(locals(), globals())

        if iters_so_far >= max_iters:
            print('Finished...')
            break

        logger.log('********** Iteration %i ************' % iters_so_far)

        theta = get_parameter()

        with timed('sampling'):
            seg = sampler.collect(theta)

        lens, rets = seg['ep_lens'], seg['ep_rets']
        lenbuffer.extend(lens)
        rewbuffer.extend(rets)
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)

        # Adding batch of trajectories to memory
        memory.add_trajectory_batch(seg)

        # Get multiple batches from memory
        seg_with_memory = memory.get_trajectories()

        # Get clustered reward
        reward_matrix = np.reshape(
            seg_with_memory['disc_rew'] * seg_with_memory['mask'],
            (-1, horizon))
        ep_reward = np.sum(reward_matrix, axis=1)
        ep_reward = cluster_rewards(ep_reward, reward_clustering)

        args = ob, ac, rew, disc_rew, clustered_rew, mask, iter_number, active_policies = (
            seg_with_memory['ob'], seg_with_memory['ac'],
            seg_with_memory['rew'], seg_with_memory['disc_rew'], ep_reward,
            seg_with_memory['mask'], iters_so_far,
            memory.get_active_policies_mask())

        def evaluate_loss():
            loss = compute_bound(*args)
            return loss[0]

        def evaluate_gradient():
            gradient = compute_grad(*args)
            return gradient[0]

        if use_natural_gradient:

            def evaluate_fisher_vector_prod(x):
                return compute_linear_operator(x, *args)[0] + fisher_reg * x

            def evaluate_natural_gradient(g):
                return cg(evaluate_fisher_vector_prod,
                          g,
                          cg_iters=10,
                          verbose=0)
        else:
            evaluate_natural_gradient = None

        with timed('summaries before'):
            logger.record_tabular("Iteration", iters_so_far)
            logger.record_tabular("InitialBound", evaluate_loss())
            logger.record_tabular("EpLenMean", np.mean(lenbuffer))
            logger.record_tabular("EpRewMean", np.mean(rewbuffer))
            logger.record_tabular("EpThisIter", len(lens))
            logger.record_tabular("EpisodesSoFar", episodes_so_far)
            logger.record_tabular("TimestepsSoFar", timesteps_so_far)
            logger.record_tabular("TimeElapsed", time.time() - tstart)

        if save_weights > 0 and iters_so_far % save_weights == 0:
            logger.record_tabular('Weights', str(get_parameter()))
            import pickle
            with open('checkpoint' + str(iters_so_far) + '.pkl', 'wb') as file:
                pickle.dump(theta, file)

        with timed("offline optimization"):
            theta, improvement = optimize_offline(
                theta,
                set_parameter,
                line_search,
                evaluate_loss,
                evaluate_gradient,
                evaluate_natural_gradient,
                max_offline_ite=max_offline_iters)

        set_parameter(theta)

        with timed('summaries after'):
            meanlosses = np.array(compute_losses(*args))
            for (lossname, lossval) in zip(loss_names, meanlosses):
                logger.record_tabular(lossname, lossval)

        logger.dump_tabular()

    env.close()
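
For reference, the per-episode importance weight built in the `iw_method == 'is'` branch above is the inverse of a sum of behavioral-to-target density ratios over the active policies, computed in log space for stability. A small standalone NumPy sketch of the same computation (illustrative only; the function name and shapes are assumptions, not part of the source):

import numpy as np

def mixture_is_weights(target_logp, behav_logp, active):
    """target_logp: (n_ep,) episode log-density under the target policy pi.
    behav_logp: (capacity, n_ep) episode log-densities under stored policies.
    active: (capacity,) 0/1 mask of the policies currently in memory."""
    # Work on the inverse ratio in log space, as the TF graph above does.
    log_inverse_ratio = behav_logp - target_logp[None, :]  # (capacity, n_ep)
    denom = np.sum(np.exp(log_inverse_ratio) * active[:, None], axis=0)
    return 1.0 / denom  # iw_e = pi(tau_e) / sum over active k of p_k(tau_e)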
Code example #28
def learn(*,
        network,
        env,
        total_timesteps,
        timesteps_per_batch=1024, # what to train on
        max_kl=0.001,
        cg_iters=10,
        gamma=0.99,
        lam=1.0, # advantage estimation
        seed=None,
        ent_coef=0.0,
        cg_damping=1e-2,
        vf_stepsize=3e-4,
        vf_iters =3,
        max_episodes=0, max_iters=0,  # time constraint
        callback=None,
        load_path=None,
        **network_kwargs
        ):
    '''
    learn a policy function with TRPO algorithm

    Parameters:
    ----------

    network                 neural network to learn. Can be either string ('mlp', 'cnn', 'lstm', 'lnlstm' for basic types)
                            or function that takes input placeholder and returns tuple (output, None) for feedforward nets
                            or (output, (state_placeholder, state_output, mask_placeholder)) for recurrent nets

    env                     environment (one of the gym environments, or wrapped via a baselines.common.vec_env.VecEnv-type class)

    timesteps_per_batch     timesteps per gradient estimation batch

    max_kl                  max KL divergence between old policy and new policy (KL(pi_old || pi))

    ent_coef                coefficient of policy entropy term in the optimization objective

    cg_iters                number of iterations of conjugate gradient algorithm

    cg_damping              conjugate gradient damping

    vf_stepsize             learning rate for the adam optimizer used to optimize the value function loss

    vf_iters                number of value function optimization iterations per policy optimization step

    total_timesteps         max number of timesteps

    max_episodes            max number of episodes

    max_iters               maximum number of policy optimization iterations

    callback                function to be called with (locals(), globals()) each policy optimization step

    load_path               str, path to load the model from (default: None, i.e. no model is loaded)

    **network_kwargs        keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network

    Returns:
    -------

    learnt model

    '''


    nworkers = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()

    cpus_per_worker = 1
    U.get_session(config=tf.ConfigProto(
            allow_soft_placement=True,
            inter_op_parallelism_threads=cpus_per_worker,
            intra_op_parallelism_threads=cpus_per_worker
    ))


    policy = build_policy(env, network, value_network='copy', **network_kwargs)
    set_global_seeds(seed)

    np.set_printoptions(precision=3)
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space

    ob = observation_placeholder(ob_space)
    with tf.variable_scope("pi"):
        pi = policy(observ_placeholder=ob)
    with tf.variable_scope("oldpi"):
        oldpi = policy(observ_placeholder=ob)

    atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return

    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    entbonus = ent_coef * meanent

    vferr = tf.reduce_mean(tf.square(pi.vf - ret))

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold
    surrgain = tf.reduce_mean(ratio * atarg)

    optimgain = surrgain + entbonus
    losses = [optimgain, meankl, entbonus, surrgain, meanent]
    loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]

    dist = meankl

    all_var_list = get_trainable_variables("pi")
    # var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")]
    # vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")]
    var_list = get_pi_trainable_variables("pi")
    vf_var_list = get_vf_trainable_variables("pi")

    vfadam = MpiAdam(vf_var_list)

    get_flat = U.GetFlat(var_list)
    set_from_flat = U.SetFromFlat(var_list)
    klgrads = tf.gradients(dist, var_list)
    flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan")
    shapes = [var.get_shape().as_list() for var in var_list]
    start = 0
    tangents = []
    for shape in shapes:
        sz = U.intprod(shape)
        tangents.append(tf.reshape(flat_tangent[start:start+sz], shape))
        start += sz
    gvp = tf.add_n([tf.reduce_sum(g*tangent) for (g, tangent) in zipsame(klgrads, tangents)]) #pylint: disable=E1111
    fvp = U.flatgrad(gvp, var_list)

    assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv)
        for (oldv, newv) in zipsame(get_variables("oldpi"), get_variables("pi"))])

    compute_losses = U.function([ob, ac, atarg], losses)
    compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)])
    compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
    compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list))

    @contextmanager
    def timed(msg):
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(colorize("done in %.3f seconds"%(time.time() - tstart), color='magenta'))
        else:
            yield

    def allmean(x):
        assert isinstance(x, np.ndarray)
        out = np.empty_like(x)
        MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
        out /= nworkers
        return out

    U.initialize()
    if load_path is not None:
        pi.load(load_path)

    th_init = get_flat()
    MPI.COMM_WORLD.Bcast(th_init, root=0)
    set_from_flat(th_init)
    vfadam.sync()
    print("Init param sum", th_init.sum(), flush=True)

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards

    if sum([max_iters>0, total_timesteps>0, max_episodes>0])==0:
        # nothing to be done
        return pi

    assert sum([max_iters>0, total_timesteps>0, max_episodes>0]) < 2, \
        'out of max_iters, total_timesteps, and max_episodes only one should be specified'

    while True:
        if callback: callback(locals(), globals())
        if total_timesteps and timesteps_so_far >= total_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        logger.log("********** Iteration %i ************"%iters_so_far)

        with timed("sampling"):
            seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
        vpredbefore = seg["vpred"] # predicted value function before update
        atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate

        if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret)
        if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy

        args = seg["ob"], seg["ac"], atarg
        fvpargs = [arr[::5] for arr in args]
        def fisher_vector_product(p):
            return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p

        assign_old_eq_new() # set old parameter values to new parameter values
        with timed("computegrad"):
            *lossbefore, g = compute_lossandgrad(*args)
        lossbefore = allmean(np.array(lossbefore))
        g = allmean(g)
        if np.allclose(g, 0):
            logger.log("Got zero gradient. not updating")
        else:
            with timed("cg"):
                stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank==0)
            assert np.isfinite(stepdir).all()
            shs = .5*stepdir.dot(fisher_vector_product(stepdir))
            lm = np.sqrt(shs / max_kl)
            # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
            fullstep = stepdir / lm
            expectedimprove = g.dot(fullstep)
            surrbefore = lossbefore[0]
            stepsize = 1.0
            thbefore = get_flat()
            for _ in range(10):
                thnew = thbefore + fullstep * stepsize
                set_from_flat(thnew)
                meanlosses = surr, kl, *_ = allmean(np.array(compute_losses(*args)))
                improve = surr - surrbefore
                logger.log("Expected: %.3f Actual: %.3f"%(expectedimprove, improve))
                if not np.isfinite(meanlosses).all():
                    logger.log("Got non-finite value of losses -- bad!")
                elif kl > max_kl * 1.5:
                    logger.log("violated KL constraint. shrinking step.")
                elif improve < 0:
                    logger.log("surrogate didn't improve. shrinking step.")
                else:
                    logger.log("Stepsize OK!")
                    break
                stepsize *= .5
            else:
                logger.log("couldn't compute a good step")
                set_from_flat(thbefore)
            if nworkers > 1 and iters_so_far % 20 == 0:
                paramsums = MPI.COMM_WORLD.allgather((thnew.sum(), vfadam.getflat().sum())) # list of tuples
                assert all(np.allclose(ps, paramsums[0]) for ps in paramsums[1:])

        for (lossname, lossval) in zip(loss_names, meanlosses):
            logger.record_tabular(lossname, lossval)

        with timed("vf"):

            for _ in range(vf_iters):
                for (mbob, mbret) in dataset.iterbatches((seg["ob"], seg["tdlamret"]),
                include_final_partial_batch=False, batch_size=64):
                    g = allmean(compute_vflossandgrad(mbob, mbret))
                    vfadam.update(g, vf_stepsize)

        logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret))

        lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)

        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1

        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)

        if rank==0:
            logger.dump_tabular()

    return pi
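
A minimal usage sketch for the `learn` above (the import path is hypothetical; adjust it to wherever this function lives in your checkout):

import gym
# from baselines.trpo_mpi.trpo_mpi import learn  # hypothetical path

def main():
    env = gym.make('CartPole-v1')
    # Train a small MLP policy with near-default TRPO hyperparameters.
    pi = learn(network='mlp', env=env, total_timesteps=100_000,
               timesteps_per_batch=1024, max_kl=0.001,
               gamma=0.99, lam=0.98)
    env.close()
    return pi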
Code example #29
File: pois2.py  Project: Umbi14/pois
def learn(env, make_policy, *,
          n_episodes,
          horizon,
          delta,
          gamma,
          max_iters,
          use_natural_gradient=False, # can be 'exact' or 'approximate'
          fisher_reg=1e-2,
          iw_method='is',
          iw_norm='none',
          bound='J',
          line_search_type='parabola',
          save_weights=False,
          improvement_tol=0.,
          center_return=False,
          render_after=None,
          max_offline_iters=100,
          callback=None):

    np.set_printoptions(precision=3)
    max_samples = horizon * n_episodes

    if line_search_type == 'binary':
        line_search = line_search_binary
    elif line_search_type == 'parabola':
        line_search = line_search_parabola
    else:
        raise ValueError()

    # Building the environment
    ob_space = env.observation_space
    ac_space = env.action_space

    # Building the policy
    pi = make_policy('pi', ob_space, ac_space)
    oldpi = make_policy('oldpi', ob_space, ac_space)

    all_var_list = pi.get_trainable_variables()
    var_list = [v for v in all_var_list if v.name.split('/')[1].startswith('pol')]

    shapes = [U.intprod(var.get_shape().as_list()) for var in var_list]
    n_parameters = sum(shapes)

    # Placeholders
    ob_ = ob = U.get_placeholder_cached(name='ob')
    ac_ = pi.pdtype.sample_placeholder([max_samples], name='ac')
    mask_ = tf.placeholder(dtype=tf.float32, shape=(max_samples), name='mask')
    disc_rew_ = tf.placeholder(dtype=tf.float32, shape=(max_samples), name='disc_rew')
    gradient_ = tf.placeholder(dtype=tf.float32, shape=(n_parameters, 1), name='gradient')

    # Policy densities
    target_log_pdf = pi.pd.logp(ac_)
    behavioral_log_pdf = oldpi.pd.logp(ac_)
    log_ratio = target_log_pdf - behavioral_log_pdf

    # Split operations
    disc_rew_split = tf.stack(tf.split(disc_rew_ * mask_, n_episodes))
    log_ratio_split = tf.stack(tf.split(log_ratio * mask_, n_episodes))
    target_log_pdf_split = tf.stack(tf.split(target_log_pdf * mask_, n_episodes))
    mask_split = tf.stack(tf.split(mask_, n_episodes))

    # Renyi divergence
    emp_d2_split = tf.stack(tf.split(pi.pd.renyi(oldpi.pd, 2) * mask_, n_episodes))
    emp_d2_cum_split = tf.reduce_sum(emp_d2_split, axis=1)
    empirical_d2 = tf.reduce_mean(tf.exp(emp_d2_cum_split))

    # Return
    ep_return = tf.reduce_sum(mask_split * disc_rew_split, axis=1)
    if center_return:
        ep_return = ep_return - tf.reduce_mean(ep_return)

    return_mean = tf.reduce_mean(ep_return)
    return_std = U.reduce_std(ep_return)
    return_max = tf.reduce_max(ep_return)
    return_min = tf.reduce_min(ep_return)
    return_abs_max = tf.reduce_max(tf.abs(ep_return))

    if iw_method == 'pdis':
        raise NotImplementedError()
    elif iw_method == 'is':
        iw = tf.exp(tf.reduce_sum(log_ratio_split, axis=1))
        if iw_norm == 'none':
            iwn = iw / n_episodes
            w_return_mean = tf.reduce_sum(iwn * ep_return)
        elif iw_norm == 'sn':
            iwn = iw / tf.reduce_sum(iw)
            w_return_mean = tf.reduce_sum(iwn * ep_return)
        elif iw_norm == 'regression':
            iwn = iw / n_episodes
            mean_iw = tf.reduce_mean(iw)
            beta = tf.reduce_sum((iw - mean_iw) * ep_return * iw) / (tf.reduce_sum((iw - mean_iw) ** 2) + 1e-24)
            w_return_mean = tf.reduce_mean(iw * ep_return - beta * (iw - 1))
        else:
            raise NotImplementedError()

        ess_classic = tf.linalg.norm(iw, 1) ** 2 / tf.linalg.norm(iw, 2) ** 2
        sqrt_ess_classic = tf.linalg.norm(iw, 1) / tf.linalg.norm(iw, 2)
        ess_renyi = n_episodes / empirical_d2
    else:
        raise NotImplementedError()

    if bound == 'J':
        bound_ = w_return_mean
    elif bound == 'std-d2':
        bound_ = w_return_mean - tf.sqrt((1 - delta) / (delta * ess_renyi)) * return_std
    elif bound == 'max-d2':
        bound_ = w_return_mean - tf.sqrt((1 - delta) / (delta * ess_renyi)) * return_abs_max
    elif bound == 'max-ess':
        bound_ = w_return_mean - tf.sqrt((1 - delta) / delta) / sqrt_ess_classic * return_abs_max
    elif bound == 'std-ess':
        bound_ = w_return_mean - tf.sqrt((1 - delta) / delta) / sqrt_ess_classic * return_std
    else:
        raise NotImplementedError()

    losses = [bound_, return_mean, return_max, return_min, return_std, empirical_d2, w_return_mean,
              tf.reduce_max(iwn), tf.reduce_min(iwn), tf.reduce_mean(iwn), U.reduce_std(iwn), tf.reduce_max(iw),
              tf.reduce_min(iw), tf.reduce_mean(iw), U.reduce_std(iw), ess_classic, ess_renyi]
    loss_names = ['Bound', 'InitialReturnMean', 'InitialReturnMax', 'InitialReturnMin', 'InitialReturnStd',
                  'EmpiricalD2', 'ReturnMeanIW', 'MaxIWNorm', 'MinIWNorm', 'MeanIWNorm', 'StdIWNorm',
                  'MaxIW', 'MinIW', 'MeanIW', 'StdIW', 'ESSClassic', 'ESSRenyi']

    if use_natural_gradient:
        p = tf.placeholder(dtype=tf.float32, shape=[None])
        target_logpdf_episode = tf.reduce_sum(target_log_pdf_split * mask_split, axis=1)
        grad_logprob = U.flatgrad(tf.stop_gradient(iwn) * target_logpdf_episode, var_list)
        dot_product = tf.reduce_sum(grad_logprob * p)
        hess_logprob = U.flatgrad(dot_product, var_list)
        compute_linear_operator = U.function([p, ob_, ac_, disc_rew_, mask_], [-hess_logprob])


    assign_old_eq_new = U.function([], [], updates=[tf.assign(oldv, newv)
                for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())])

    compute_lossandgrad = U.function([ob_, ac_, disc_rew_, mask_], losses + [U.flatgrad(bound_, var_list)])
    compute_grad = U.function([ob_, ac_, disc_rew_, mask_], [U.flatgrad(bound_, var_list)])
    compute_bound = U.function([ob_, ac_, disc_rew_, mask_], [bound_])
    compute_losses = U.function([ob_, ac_, disc_rew_, mask_], losses)

    set_parameter = U.SetFromFlat(var_list)
    get_parameter = U.GetFlat(var_list)

    seg_gen = traj_segment_generator(pi, env, n_episodes, horizon, stochastic=True, gamma=gamma)
    sampler = type("SequentialSampler", (object,), {"collect": lambda self, _: seg_gen.__next__()})()

    U.initialize()

    # Start optimization

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=n_episodes)
    rewbuffer = deque(maxlen=n_episodes)

    while True:

        iters_so_far += 1

        if render_after is not None and iters_so_far % render_after == 0:
            if hasattr(env, 'render'):
                render(env, pi, horizon)

        if callback:
            callback(locals(), globals())

        if iters_so_far >= max_iters:
            print('Finished...')
            break

        logger.log('********** Iteration %i ************' % iters_so_far)

        theta = get_parameter()

        with timed('sampling'):
            seg = sampler.collect(theta)

        lens, rets = seg['ep_lens'], seg['ep_rets']

        lenbuffer.extend(lens)
        rewbuffer.extend(rets)
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)

        args = ob, ac, disc_rew, mask = seg['ob'], seg['ac'], seg['disc_rew'], seg['mask']

        assign_old_eq_new()

        def evaluate_loss():
            loss = compute_bound(*args)
            return loss[0]

        def evaluate_gradient():
            gradient = compute_grad(*args)
            return gradient[0]

        if use_natural_gradient:
            def evaluate_fisher_vector_prod(x):
                return compute_linear_operator(x, *args)[0] + fisher_reg * x

            def evaluate_natural_gradient(g):
                return cg(evaluate_fisher_vector_prod, g, cg_iters=10, verbose=0)
        else:
            evaluate_natural_gradient = None

        with timed('summaries before'):
            logger.record_tabular("Itaration", iters_so_far)
            logger.record_tabular("InitialBound", evaluate_loss())
            logger.record_tabular("EpLenMean", np.mean(lenbuffer))
            logger.record_tabular("EpRewMean", np.mean(rewbuffer))
            logger.record_tabular("EpThisIter", len(lens))
            logger.record_tabular("EpisodesSoFar", episodes_so_far)
            logger.record_tabular("TimestepsSoFar", timesteps_so_far)
            logger.record_tabular("TimeElapsed", time.time() - tstart)

        if save_weights:
            logger.record_tabular('Weights', str(get_parameter()))

        with timed("offline optimization"):

            theta, improvement = optimize_offline(theta,
                                                  set_parameter,
                                                  line_search,
                                                  evaluate_loss,
                                                  evaluate_gradient,
                                                  evaluate_natural_gradient,
                                                  max_offline_ite=max_offline_iters)

        set_parameter(theta)

        with timed('summaries after'):
            meanlosses = np.array(compute_losses(*args))
            for (lossname, lossval) in zip(loss_names, meanlosses):
                logger.record_tabular(lossname, lossval)

        logger.dump_tabular()


    env.close()
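
The three `iw_norm` variants above differ only in how the per-episode weights are normalized. A NumPy sketch of the same three estimators (illustrative only; not part of the source):

import numpy as np

def is_return_estimate(iw, ep_return, iw_norm='none'):
    n = len(iw)
    if iw_norm == 'none':        # ordinary IS: unbiased, high variance
        return np.sum((iw / n) * ep_return)
    if iw_norm == 'sn':          # self-normalized IS: biased, lower variance
        return np.sum((iw / np.sum(iw)) * ep_return)
    if iw_norm == 'regression':  # regression / control-variate estimator
        centered = iw - np.mean(iw)
        beta = np.sum(centered * ep_return * iw) / (np.sum(centered ** 2) + 1e-24)
        return np.mean(iw * ep_return - beta * (iw - 1))
    raise NotImplementedError(iw_norm)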
Code example #30
def learn(
        base_env,
        policy_fn,
        *,
        max_fitness,  # has to be negative, as CMA-ES performs minimization
        popsize,
        gensize,
        bounds,
        sigma,
        eval_iters,
        timesteps_per_actorbatch,
        max_timesteps=0,
        max_episodes=0,
        max_iters=0,
        max_seconds=0,
        seed=0):
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = base_env.observation_space
    ac_space = base_env.action_space
    pi = policy_fn("pi", ob_space,
                   ac_space)  # Construct network for new policy
    backup_pi = policy_fn(
        "backup_pi", ob_space, ac_space
    )  # Construct a network for every individual to adapt during the ES evolution

    var_list = pi.get_trainable_variables()
    layer_var_list = []
    for i in range(pi.num_hid_layers):
        layer_var_list.append([
            v for v in var_list
            if v.name.split("/")[2].startswith('fc%i' % (i + 1))
        ])
    logstd_var_list = [
        v for v in var_list if v.name.split("/")[2].startswith("logstd")
    ]
    if len(logstd_var_list) != 0:
        layer_var_list.append(
            [v for v in var_list if v.name.split("/")[2].startswith("final")] +
            logstd_var_list)

    U.initialize()
    layer_set_operate_list = []
    layer_get_operate_list = []
    for var in layer_var_list:
        layer_set_operate_list.append(U.SetFromFlat(var))
        layer_get_operate_list.append(U.GetFlat(var))

    global timesteps_so_far, episodes_so_far, iters_so_far, \
        tstart, lenbuffer, rewbuffer, best_fitness
    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

    assign_backup_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(backup_v, newv) for (
                backup_v,
                newv) in zipsame(backup_pi.get_variables(), pi.get_variables())
        ])
    assign_new_eq_backup = U.function(
        [], [],
        updates=[
            tf.assign(newv, backup_v)
            for (newv, backup_v
                 ) in zipsame(pi.get_variables(), backup_pi.get_variables())
        ])

    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0,
         max_seconds > 0]) == 1, "Only one time constraint permitted"

    # Build generator for all solutions
    seg_gen = traj_segment_generator_eval(backup_pi,
                                          base_env,
                                          timesteps_per_actorbatch,
                                          stochastic=True)
    actors = []
    for i in range(popsize):
        newActor = traj_segment_generator(pi,
                                          base_env,
                                          timesteps_per_actorbatch,
                                          stochastic=True,
                                          eval_iters=eval_iters,
                                          seg_gen=seg_gen)
        actors.append(newActor)
    best_fitness = -np.inf
    opt = cma.CMAOptions()
    opt['tolfun'] = max_fitness
    opt['popsize'] = popsize
    opt['maxiter'] = gensize
    opt['verb_disp'] = 0
    opt['verb_log'] = 0
    # opt['seed'] = seed
    opt['AdaptSigma'] = True
    # opt['bounds'] = bounds
    while True:
        if max_timesteps and timesteps_so_far >= max_timesteps:
            logger.log("Max time steps")
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            logger.log("Max episodes")
            break
        elif max_iters and iters_so_far >= max_iters:
            logger.log("Max iterations")
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            logger.log("Max time")
            break

        # Linearly decay the exploration
        sigma_adapted = max(sigma - float(timesteps_so_far) / max_timesteps, 0)
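        # NOTE: sigma_adapted is unused below; the CMA-ES is created with the
        # original sigma, so this decay has no effect as the code stands.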

        logger.log("********** Iteration %i ************" % iters_so_far)
        eval_seg = seg_gen.__next__()
        rewbuffer.extend(eval_seg["ep_rets"])
        lenbuffer.extend(eval_seg["ep_lens"])
        if iters_so_far == 0:
            result_record()

        for i in range(len(layer_var_list)):
            assign_backup_eq_new()  # backup current policy
            logger.log("Current Layer:" + str(layer_var_list[i]))
            flatten_weights = layer_get_operate_list[i]()
            es = cma.CMAEvolutionStrategy(flatten_weights, sigma, opt)
            costs = None
            best_solution = None

            die_out_count = 0
            while True:
                if es.countiter >= gensize:
                    logger.log("Max generations for current layer")
                    break
                solutions = es.ask()
                ob_segs = None
                segs = []
                costs = []
                lens = []
                for idx, solution in enumerate(solutions):
                    layer_set_operate_list[i](solution)
                    seg = actors[idx].__next__()
                    costs.append(-np.mean(seg["ep_rets"]))
                    lens.append(np.sum(seg["ep_lens"]))
                    segs.append(seg)
                    if ob_segs is None:
                        ob_segs = {'ob': np.copy(seg['ob'])}
                    else:
                        ob_segs['ob'] = np.append(ob_segs['ob'],
                                                  seg['ob'],
                                                  axis=0)
                    assign_new_eq_backup()
                # Weight decay: add a per-solution L2 penalty to the costs
                l2_decay = compute_weight_decay(0.01, solutions)
                costs = np.array(costs) + l2_decay
                costs, real_costs = fitness_normalization(costs)
                es.tell_real_seg(solutions=solutions,
                                 function_values=costs,
                                 real_f=real_costs,
                                 segs=segs)
                best_solution = np.copy(es.result[0])
                best_fitness = -es.result[1]
                rewbuffer.extend(es.result[3]["ep_rets"])
                lenbuffer.extend(es.result[3]["ep_lens"])
                layer_set_operate_list[i](best_solution)
                logger.log("Update the layer")
                logger.log("Generation:", es.countiter)
                logger.log("Best Solution Fitness:", best_fitness)

                ob = ob_segs["ob"]
                if hasattr(pi, "ob_rms"):
                    pi.ob_rms.update(
                        ob
                    )  # update running mean/std for observation normalization
                episodes_so_far += sum(lens)
            es = None
            import gc
            gc.collect()
        iters_so_far += 1