Code Example #1
def dSAC(act,
         policy_logits,
         rewards,
         qf1,
         qf2,
         vf,
         next_vf_target,
         ALPHA,
         GAMMA,
         normalize_advantage=False):
    log_p = -tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=act, logits=policy_logits)
    # Guard the single-critic case: tf.minimum would fail if qf2 is None.
    q_min = qf1 if qf2 is None else tf.minimum(qf1, qf2)
    v_loss = mse(vf, tf.stop_gradient(q_min - log_p / ALPHA))
    qf_target = tf.stop_gradient(rewards + GAMMA * next_vf_target)
    q1_loss = mse(qf1, qf_target)
    if qf2 is None:
        q2_loss = 0.
    else:
        q2_loss = mse(qf2, qf_target)
    q_loss = q1_loss + q2_loss
    advantage = qf1 - vf - log_p / ALPHA
    adv_mean = tf.reduce_mean(advantage)
    tf.summary.scalar("adv_mean", adv_mean)
    advantage_center = advantage - adv_mean
    adv_std = tf.sqrt(tf.reduce_mean(advantage_center**2))
    tf.summary.scalar("adv_std", adv_std)
    if normalize_advantage:
        advantage = advantage_center / tf.maximum(adv_std, 1e-12)
    p_loss = -log_p * tf.stop_gradient(advantage)
    return SACloss(p_loss, q_loss, v_loss)
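SACloss and mse are helpers defined elsewhere in hybug/test_ppo and are not shown on this page (a hedged sketch of mse follows Code Example #2). The return container is presumably a namedtuple holding the three per-element loss terms; a minimal sketch under that assumption:

import collections

# Assumed container; the field names mirror how dSAC builds its return value.
SACloss = collections.namedtuple("SACloss", ["p_loss", "q_loss", "v_loss"])

Note that v_loss regresses vf toward min(qf1, qf2) - log_p / ALPHA, i.e. the soft value target with the entropy term scaled by 1 / ALPHA, and that p_loss is an advantage-weighted negative log-likelihood rather than the reparameterized actor loss used for continuous-action SAC.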
Code Example #2
def dPPOcC(act, policy_logits, behavior_logits, advantage, policy_clip, vf,
           vf_target, value_clip, old_vf):
    a_loss = dPPOc(act=act,
                   policy_logits=policy_logits,
                   behavior_logits=behavior_logits,
                   advantage=advantage,
                   clip=policy_clip)
    c_loss = mse(y_hat=vf,
                 y_target=vf_target,
                 clip=value_clip,
                 clip_center=old_vf)
    return PPOcCloss(a_loss, c_loss)
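dPPOc, PPOcCloss, and the clipped form of mse are not shown on this page. The sketch below is one plausible reading of the call sites: a standard PPO clipped surrogate computed from raw logits, and PPO-style value clipping around old_vf. The signatures are inferred from how they are called here; the exact implementations in hybug/test_ppo may differ.

import collections
import tensorflow as tf

# Field names inferred from how the build_learner examples below access the
# result (losses.p_loss, losses.v_loss).
PPOcCloss = collections.namedtuple("PPOcCloss", ["p_loss", "v_loss"])

def dPPOc(act, policy_logits, behavior_logits, advantage, clip):
    # Element-wise clipped PPO surrogate; the caller applies masks and means.
    advantage = tf.stop_gradient(advantage)
    log_p = -tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=act, logits=policy_logits)
    log_b = -tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=act, logits=behavior_logits)
    ratio = tf.exp(log_p - tf.stop_gradient(log_b))
    clipped_ratio = tf.clip_by_value(ratio, 1.0 - clip, 1.0 + clip)
    return -tf.minimum(ratio * advantage, clipped_ratio * advantage)

def mse(y_hat, y_target, clip=None, clip_center=None):
    # Squared error; with clip and clip_center it becomes PPO-style value
    # clipping around the old value estimate.
    if clip is None or clip_center is None:
        return tf.square(y_hat - y_target)
    clipped = clip_center + tf.clip_by_value(y_hat - clip_center, -clip, clip)
    return tf.maximum(tf.square(y_hat - y_target),
                      tf.square(clipped - y_target))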
Code Example #3
def dAC(act,
        policy_logits,
        advantage,
        vf,
        vf_target,
        value_clip=None,
        old_vf=None):
    ros = IS_from_logits(policy_logits=policy_logits, act=act)
    a_loss = -advantage * ros
    c_loss = mse(y_hat=vf,
                 y_target=vf_target,
                 clip=value_clip,
                 clip_center=old_vf)
    return ACloss(a_loss, c_loss)
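IS_from_logits is also defined elsewhere. Since dAC passes only the current policy's logits, one plausible reading is an importance ratio taken against a gradient-stopped copy of the same policy: the ratio evaluates to 1 but its gradient is the score function, so -advantage * ros behaves like the usual policy-gradient term. A sketch under that assumption (the optional behavior_logits argument is a guess):

import tensorflow as tf

def IS_from_logits(policy_logits, act, behavior_logits=None):
    # Importance ratio pi(a|s) / mu(a|s) from raw logits. Without an explicit
    # behavior policy, the denominator is a stopped copy of pi, so the value
    # is 1.0 while the gradient equals grad log pi(a|s).
    log_p = -tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=act, logits=policy_logits)
    if behavior_logits is None:
        log_mu = tf.stop_gradient(log_p)
    else:
        log_mu = -tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=act, logits=behavior_logits)
    return tf.exp(log_p - log_mu)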
Code Example #4
File: Trainer_r2d2.py, Project: hybug/test_ppo
def build_learner(pre, post, act_space, num_frames):
    global_step = tf.train.get_or_create_global_step()
    init_lr = FLAGS.init_lr
    decay = FLAGS.lr_decay
    warmup_steps = FLAGS.warmup_steps

    global_step_float = tf.cast(global_step, tf.float32)

    lr = tf.train.polynomial_decay(
        init_lr, global_step, FLAGS.total_environment_frames *
        FLAGS.num_replay // (FLAGS.batch_size * FLAGS.seqlen), init_lr / 10.)
    is_warmup = tf.cast(global_step_float < warmup_steps, tf.float32)
    lr = is_warmup * global_step_float / warmup_steps * init_lr + (
        1.0 - is_warmup) * (init_lr * (1.0 - decay) + lr * decay)

    if FLAGS.opt == "adam":
        optimizer = tf.train.AdamOptimizer(lr)
    else:
        optimizer = tf.train.RMSPropOptimizer(lr, epsilon=0.01)

    if FLAGS.zero_init:
        pre["state_in"] = tf.zeros_like(pre["state_in"])

    lstm = tf.compat.v1.keras.layers.LSTM(256,
                                          return_sequences=True,
                                          return_state=True,
                                          name="lstm")
    pre_model = Model(act_space, lstm, "agent", **pre)

    post["state_in"] = tf.stop_gradient(pre_model.state_out)

    post_model = Model(act_space, lstm, "agent", **post)

    if FLAGS.rescale:
        target = rescaleTarget(post_model.r, FLAGS.gamma**FLAGS.n_step,
                               post_model.qa1)
    else:
        target = post_model.r + FLAGS.gamma**FLAGS.n_step * post_model.qa1

    loss = 100. * tf.reduce_mean(
        post_model.slots * mse(post_model.qa, tf.stop_gradient(target)))

    exp_td = post_model.slots * tf.math.pow(
        tf.abs(post_model.qa -
               (post_model.r + FLAGS.gamma**FLAGS.n_step * post_model.qa1)),
        0.9)

    avg_p = tf.reduce_sum(exp_td, axis=-1) / (tf.reduce_sum(post_model.slots,
                                                            axis=-1))
    max_p = tf.reduce_max(exp_td, axis=-1)

    priority = 0.9 * max_p + 0.1 * avg_p
    priority = tf.cast(-10000 * priority, tf.int64)

    train_op = miniOp(optimizer, loss, FLAGS.grad_clip)

    init_target_op = assignOp(
        1.0, {post_model.scope + "_current": post_model.scope + "_target"})
    if FLAGS.smooth_update:
        assign_op = assignOp(
            1.0 / FLAGS.target_update,
            {post_model.scope + "_current": post_model.scope + "_target"})
        dependency = [train_op, assign_op]
    else:
        dependency = [train_op]

    new_frames = tf.reduce_sum(post["slots"])

    with tf.control_dependencies(dependency):
        global_step_and_train = tf.assign_add(global_step, 1)
        num_frames_and_train = tf.assign_add(num_frames, new_frames)

    tf.summary.scalar("learning_rate", lr)
    tf.summary.scalar("all_loss", loss)

    return (num_frames_and_train, global_step_and_train, init_target_op,
            priority)
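rescaleTarget comes from elsewhere in the project. Given the R2D2 setting, it is presumably the invertible value-rescaling target h(r + gamma**n * h_inv(q)) from Pohlen et al. (2018); the sketch below assumes that form, and the eps constant is a guess rather than the repo's value.

import tensorflow as tf

def h(x, eps=1e-3):
    # Invertible value rescaling used by R2D2 / Ape-X style agents.
    return tf.sign(x) * (tf.sqrt(tf.abs(x) + 1.0) - 1.0) + eps * x

def h_inv(x, eps=1e-3):
    # Closed-form inverse of h.
    return tf.sign(x) * (tf.square(
        (tf.sqrt(1.0 + 4.0 * eps * (tf.abs(x) + 1.0 + eps)) - 1.0) /
        (2.0 * eps)) - 1.0)

def rescaleTarget(rewards, discount, q_next):
    # n-step Bellman target computed in the rescaled space:
    # h(r^(n) + gamma**n * h_inv(Q_target(s', a*))).
    return h(rewards + discount * h_inv(q_next))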
Code Example #5
def build_learner(pre, post, ws, act_space, num_frames):
    global_step = tf.train.get_or_create_global_step()
    init_lr = FLAGS.init_lr
    decay = FLAGS.lr_decay
    warmup_steps = FLAGS.warmup_steps
    gamma = FLAGS.gamma
    n_step = FLAGS.n_step
    use_soft = FLAGS.use_soft
    time_scale = FLAGS.time_scale
    use_hrnn = FLAGS.use_hrnn
    use_reward_prediction = FLAGS.use_reward_prediction
    after_rnn = FLAGS.after_rnn
    use_pixel_control = FLAGS.use_pixel_control
    pq_kl_coef = FLAGS.pq_kl_coef
    p_kl_coef = FLAGS.p_kl_coef

    global_step_float = tf.cast(global_step, tf.float32)

    lr = tf.train.polynomial_decay(
        init_lr, global_step,
        FLAGS.total_environment_frames // (FLAGS.batch_size * FLAGS.seqlen),
        init_lr / 10.)
    is_warmup = tf.cast(global_step_float < warmup_steps, tf.float32)
    lr = is_warmup * global_step_float / warmup_steps * init_lr + (
        1.0 - is_warmup) * (init_lr * (1.0 - decay) + lr * decay)
    optimizer = tf.train.AdamOptimizer(lr)

    if FLAGS.zero_init:
        pre["state_in"] = tf.zeros_like(pre["state_in"])

    if use_hrnn:
        rnn = TmpHierRNN(time_scale,
                         64,
                         4,
                         2,
                         8,
                         'lstm',
                         'rmc',
                         return_sequences=True,
                         return_state=True,
                         name="hrnn")
    else:
        rnn = tf.compat.v1.keras.layers.LSTM(256,
                                             return_sequences=True,
                                             return_state=True,
                                             name="lstm")

    pre_model = Model(act_space, gamma, n_step, use_soft, rnn, use_hrnn,
                      use_reward_prediction, after_rnn, use_pixel_control,
                      False, **pre)

    post["state_in"] = tf.stop_gradient(pre_model.state_out)

    post_model = Model(act_space, gamma, n_step, use_soft, rnn, use_hrnn,
                       use_reward_prediction, after_rnn, use_pixel_control,
                       True, **post)

    v_loss = mse(
        post_model.qa,
        tf.stop_gradient(
            rescaleTarget(post_model.n_step_rewards, gamma**n_step,
                          post_model.qa1)))
    # Note: vf_coef is applied once, when the total loss is composed below.
    v_loss = tf.reduce_mean(v_loss * post_model.mask[:, :-n_step] *
                            ws[:, None])

    add_loss = 0.0
    if use_hrnn:
        pq_kl_loss = KL_from_gaussians(post_model.q_mus, post_model.q_sigmas,
                                       post_model.p_mus, post_model.p_sigmas)
        pq_kl_loss = tf.reduce_mean(pq_kl_loss * post_model.mask)

        p_kl_loss = KL_from_gaussians(post_model.p_mus, post_model.p_sigmas,
                                      tf.zeros_like(post_model.p_mus),
                                      0.01 * tf.ones_like(post_model.p_sigmas))
        p_kl_loss = tf.reduce_mean(p_kl_loss * post_model.mask)

        with tf.name_scope("hierarchy_loss"):
            tf.summary.scalar("kl_div_pq", pq_kl_loss)
            tf.summary.scalar("kl_div_prior", p_kl_loss)
        add_loss += pq_kl_coef * pq_kl_loss
        add_loss += p_kl_coef * p_kl_loss

    if use_reward_prediction:
        r_loss = tf.reduce_mean(
            mse(post_model.reward_prediction, post_model.r[:, 1:1 - n_step]) *
            post_model.mask[:, :-n_step])
        tf.summary.scalar("r_loss", r_loss)
        add_loss += r_loss

    if use_pixel_control:
        s = tf.cast(post_model.s[:, :1 - n_step, :, :, :], tf.float32) / 255.0
        target = s[:, 1:, :, :, :] - s[:, :-1, :, :, :]
        shape = get_shape(target)
        target = tf.reshape(
            target,
            (shape[0], shape[1], 4, shape[2] // 4, 4, shape[3] // 4, shape[4]))
        target = tf.reduce_mean(target, axis=(2, 4))
        pixel_loss = tf.reduce_mean(
            mse(post_model.pixel_control, target) *
            post_model.mask[:, :-n_step, None, None, None])
        with tf.name_scope("control_loss"):
            tf.summary.scalar("pixel_control_loss", pixel_loss)
        add_loss += pixel_loss

    loss = FLAGS.vf_coef * v_loss + add_loss

    abs_td = post_model.mask[:, :-n_step] * tf.abs(
        post_model.qa - rescaleTarget(post_model.n_step_rewards, gamma**n_step,
                                      post_model.qa1))
    avg_p = tf.reduce_mean(abs_td, axis=-1)
    max_p = tf.reduce_max(abs_td, axis=-1)
    priority = 0.9 * max_p + 0.1 * avg_p

    beta = tf.train.polynomial_decay(
        0.4, global_step,
        FLAGS.total_environment_frames // (FLAGS.batch_size * FLAGS.seqlen),
        1.0)

    train_op = miniOp(optimizer, loss, FLAGS.grad_clip)

    target_op = assignOp(1.0, {"q": "q_target"})

    dependency = [train_op]
    if use_soft:
        qf_entropy = entropy_from_logits(post_model.qf_logits)
        target_entropy = tf.train.polynomial_decay(
            0.9 * np.log(act_space), global_step,
            FLAGS.total_environment_frames //
            (FLAGS.batch_size * FLAGS.seqlen), 0.5 * np.log(act_space))
        ent_loss = tf.reduce_mean(
            mse(qf_entropy,
                tf.cast(target_entropy, tf.float32)[None, None]))
        with tf.name_scope("ent_loss"):
            tf.summary.scalar("ent_loss", ent_loss)
        ent_op = miniOp(optimizer,
                        ent_loss,
                        grad_clip=FLAGS.grad_clip,
                        var_scope="temperature")
        dependency.append(ent_op)

    new_frames = tf.reduce_sum(post["mask"])

    with tf.control_dependencies(dependency):
        num_frames_and_train = tf.assign_add(num_frames, new_frames)
        global_step_and_train = tf.assign_add(global_step, 1)

    tf.summary.scalar("learning_rate", lr)
    tf.summary.scalar("v_loss", v_loss)
    tf.summary.scalar("all_loss", loss)

    return num_frames_and_train, global_step_and_train, target_op, priority, beta
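KL_from_gaussians supplies the posterior/prior regularizers for the hierarchical RNN. The sketch below assumes it is the standard closed-form KL divergence between diagonal Gaussians parameterized by means and standard deviations, summed over the feature axis so it can be masked per timestep; the reduction axis is an assumption.

import tensorflow as tf

def KL_from_gaussians(q_mus, q_sigmas, p_mus, p_sigmas):
    # KL(N(q_mus, q_sigmas^2) || N(p_mus, p_sigmas^2)), diagonal covariance.
    kl = (tf.math.log(p_sigmas) - tf.math.log(q_sigmas) +
          (tf.square(q_sigmas) + tf.square(q_mus - p_mus)) /
          (2.0 * tf.square(p_sigmas)) - 0.5)
    return tf.reduce_sum(kl, axis=-1)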
Code Example #6
def build_learner(pre, post, act_space, num_frames):
    global_step = tf.train.get_or_create_global_step()
    init_lr = FLAGS.init_lr
    decay = FLAGS.lr_decay
    warmup_steps = FLAGS.warmup_steps
    use_rmc = FLAGS.use_rmc
    use_hrmc = FLAGS.use_hrmc
    use_hrnn = FLAGS.use_hrnn
    use_icm = FLAGS.use_icm
    use_coex = FLAGS.use_coex
    use_reward_prediction = FLAGS.use_reward_prediction
    after_rnn = FLAGS.after_rnn
    use_pixel_control = FLAGS.use_pixel_control
    use_pixel_reconstruction = FLAGS.use_pixel_reconstruction
    pq_kl_coef = FLAGS.pq_kl_coef
    p_kl_coef = FLAGS.p_kl_coef

    global_step_float = tf.cast(global_step, tf.float32)

    lr = tf.train.polynomial_decay(
        init_lr, global_step,
        FLAGS.total_environment_frames // (FLAGS.batch_size * FLAGS.seqlen),
        init_lr / 10.)
    is_warmup = tf.cast(global_step_float < warmup_steps, tf.float32)
    lr = is_warmup * global_step_float / warmup_steps * init_lr + (
        1.0 - is_warmup) * (init_lr * (1.0 - decay) + lr * decay)
    optimizer = tf.train.AdamOptimizer(lr)

    ent_coef = tf.train.polynomial_decay(
        FLAGS.ent_coef, global_step,
        FLAGS.total_environment_frames // (FLAGS.batch_size * FLAGS.seqlen),
        FLAGS.ent_coef / 10.)

    if FLAGS.zero_init:
        pre["state_in"] = tf.zeros_like(pre["state_in"])

    if use_hrnn:
        rnn = TmpHierRNN(4,
                         64,
                         4,
                         2,
                         8,
                         'lstm',
                         'rmc',
                         return_sequences=True,
                         return_state=True,
                         name="hrnn")
    elif use_hrmc:
        rnn = TmpHierRMCRNN(4,
                            64,
                            4,
                            4,
                            return_sequences=True,
                            return_state=True,
                            name="hrmcrnn")
    elif use_rmc:
        rnn = RMCRNN(64,
                     4,
                     4,
                     return_sequences=True,
                     return_state=True,
                     name="rmcrnn")
    else:
        rnn = tf.compat.v1.keras.layers.LSTM(256,
                                             return_sequences=True,
                                             return_state=True,
                                             name="lstm")
    pre_model = Model(act_space, rnn, use_rmc, use_hrmc or use_hrnn,
                      use_reward_prediction, after_rnn,
                      use_pixel_reconstruction, "agent", **pre)

    post["state_in"] = tf.stop_gradient(pre_model.state_out)

    post_model = Model(act_space, rnn, use_rmc, use_hrmc or use_hrnn,
                       use_reward_prediction, after_rnn,
                       use_pixel_reconstruction, "agent", **post)

    tf.summary.scalar("adv_mean", post_model.adv_mean)
    tf.summary.scalar("adv_std", post_model.adv_std)

    losses = dPPOcC(act=post_model.a_t,
                    policy_logits=post_model.current_act_logits,
                    old_policy_logits=post_model.old_act_logits,
                    advantage=post_model.advantage,
                    policy_clip=FLAGS.ppo_clip,
                    vf=post_model.current_value,
                    vf_target=post_model.ret,
                    value_clip=FLAGS.vf_clip,
                    old_vf=post_model.old_current_value)

    entropy_loss = tf.reduce_mean(
        entropy(post_model.current_act_logits) * post_model.slots)

    p_loss = tf.reduce_mean(losses.p_loss * post_model.slots)
    v_loss = tf.reduce_mean(losses.v_loss * post_model.slots)

    add_loss = 0.0
    if use_icm:
        icmloss = icm(post_model.cnn_feature[:, :-1, :],
                      post_model.cnn_feature[:, 1:, :], post_model.a_t[:, :-1],
                      act_space)
        add_loss += 0.2 * tf.reduce_mean(
            icmloss.f_loss * post_model.slots[:, :-1]) + 0.8 * tf.reduce_mean(
                icmloss.i_loss * post_model.slots[:, :-1])
    if use_coex:
        coexloss = coex(post_model.image_feature[:, :-1, :, :, :],
                        post_model.image_feature[:, 1:, :, :, :],
                        post_model.a_t[:, :-1], act_space)
        add_loss += tf.reduce_mean(coexloss * post_model.slots[:, :-1])
    if use_hrmc or use_hrnn:
        pq_kl_loss = KL_from_gaussians(post_model.q_mus, post_model.q_sigmas,
                                       post_model.p_mus, post_model.p_sigmas)
        pq_kl_loss = tf.reduce_mean(pq_kl_loss * post_model.slots)
        tf.summary.scalar("kl_div", pq_kl_loss)
        add_loss += pq_kl_coef * pq_kl_loss

        p_kl_loss = KL_from_gaussians(post_model.p_mus, post_model.p_sigmas,
                                      tf.zeros_like(post_model.p_mus),
                                      0.01 * tf.ones_like(post_model.p_sigmas))
        p_kl_loss = tf.reduce_mean(p_kl_loss * post_model.slots)
        tf.summary.scalar("kl_div_prior", p_kl_loss)
        add_loss += p_kl_coef * p_kl_loss
    if use_reward_prediction:
        r_loss = tf.reduce_mean(
            mse(post_model.reward_prediction, post_model.r_t) *
            post_model.slots)
        tf.summary.scalar("r_loss", r_loss)
        add_loss += r_loss
    if use_pixel_control:
        change_of_cells = tf.reduce_mean(post_model.s_t[:, 1:, :, :, :] -
                                         post_model.s_t[:, :-1, :, :, :],
                                         axis=-1)
        s_shape = get_shape(change_of_cells)
        s_H, s_W = s_shape[2:]
        ctr_H, ctr_W = get_shape(post_model.pixel_control)[2:4]
        change_of_cells = tf.reduce_mean(tf.reshape(
            change_of_cells,
            shape=s_shape[:2] + [ctr_H, s_H // ctr_H, ctr_W, s_W // ctr_W]),
                                         axis=(3, 5))

        ctr = tf.reduce_sum(
            tf.transpose(post_model.pixel_control, perm=(0, 1, 4, 2, 3)) *
            tf.one_hot(post_model.a_t,
                       depth=post_model.act_space,
                       dtype=tf.float32)[:, :, :, None, None],
            axis=2)[:, :-1, :, :]
        ctr_loss = tf.reduce_mean(mse(ctr, change_of_cells))
        tf.summary.scalar("pixel_control_loss", ctr_loss)
        add_loss += ctr_loss
    if use_pixel_reconstruction:
        rec_loss = tf.reduce_mean(
            mse(post_model.pixel_reconstruction, post_model.s_t) *
            post_model.slots[:, :, None, None, None])
        tf.summary.scalar("rec_loss", rec_loss)
        add_loss += rec_loss

    loss = (FLAGS.pi_coef * p_loss + FLAGS.vf_coef * v_loss -
            ent_coef * entropy_loss + add_loss)

    train_op = miniOp(optimizer, loss, FLAGS.grad_clip)

    new_frames = tf.reduce_sum(post["slots"])

    with tf.control_dependencies([train_op]):
        num_frames_and_train = tf.assign_add(num_frames, new_frames)
        global_step_and_train = tf.assign_add(global_step, 1)

    tf.summary.scalar("learning_rate", lr)
    tf.summary.scalar("ent_coef", ent_coef)
    tf.summary.scalar("ent_loss", entropy_loss)
    tf.summary.scalar("p_loss", p_loss)
    tf.summary.scalar("v_loss", v_loss)
    tf.summary.scalar("all_loss", loss)

    return num_frames_and_train, global_step_and_train
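entropy here (and entropy_from_logits in Code Examples #5 and #7) is presumably the categorical entropy computed directly from the raw logits; a minimal sketch of that assumption:

import tensorflow as tf

def entropy_from_logits(logits):
    # Shannon entropy of the softmax distribution, reduced over the action
    # axis; the build_learner code masks and averages the result.
    log_p = tf.nn.log_softmax(logits)
    return -tf.reduce_sum(tf.exp(log_p) * log_p, axis=-1)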
Code Example #7
File: ray_trainer.py, Project: hybug/test_ppo
def build_learner(pre, post, act_space, num_frames, batch_weights):
    global_step = tf.train.get_or_create_global_step()
    init_lr = FLAGS.init_lr
    decay = FLAGS.lr_decay
    warmup_steps = FLAGS.warmup_steps
    gamma = FLAGS.gamma
    n_step = FLAGS.n_step
    time_scale = FLAGS.time_scale
    use_hrnn = FLAGS.use_hrnn
    use_rmc = FLAGS.use_rmc
    use_amc = FLAGS.use_amc
    use_beta = FLAGS.use_beta
    use_retrace = FLAGS.use_retrace
    use_reward_prediction = FLAGS.use_reward_prediction
    after_rnn = FLAGS.after_rnn
    use_pixel_control = FLAGS.use_pixel_control
    pq_kl_coef = FLAGS.pq_kl_coef
    p_kl_coef = FLAGS.p_kl_coef
    pi_coef = FLAGS.pi_coef
    vf_coef = FLAGS.vf_coef
    ent_coef = FLAGS.ent_coef
    qf_coef = FLAGS.qf_coef
    ppo_clip = FLAGS.ppo_clip
    vf_clip = FLAGS.vf_clip

    global_step_float = tf.cast(global_step, tf.float32)

    lr = tf.train.polynomial_decay(
        init_lr, global_step,
        FLAGS.total_environment_frames // (FLAGS.batch_size * FLAGS.seqlen),
        init_lr / 10.)
    is_warmup = tf.cast(global_step_float < warmup_steps, tf.float32)
    lr = is_warmup * global_step_float / warmup_steps * init_lr + (
        1.0 - is_warmup) * (init_lr * (1.0 - decay) + lr * decay)

    ent_coef = tf.train.polynomial_decay(
        ent_coef, global_step,
        FLAGS.total_environment_frames // (FLAGS.batch_size * FLAGS.seqlen),
        ent_coef / 10.)

    optimizer = tf.train.AdamOptimizer(lr)

    if FLAGS.zero_init:
        pre["state_in"] = tf.zeros_like(pre["state_in"])

    if use_hrnn:
        rnn = TmpHierRNN(time_scale,
                         64,
                         4,
                         2,
                         8,
                         'lstm',
                         'rmc',
                         return_sequences=True,
                         return_state=True,
                         name="hrnn")
    elif use_rmc:
        rnn = RMCRNN(64,
                     4,
                     64,
                     return_sequences=True,
                     return_state=True,
                     name="rmc")
    elif use_amc:
        rnn = AMCRNN(64,
                     4,
                     64,
                     return_sequences=True,
                     return_state=True,
                     name="amc")
    else:
        rnn = tf.compat.v1.keras.layers.CuDNNLSTM(256,
                                                  return_sequences=True,
                                                  return_state=True,
                                                  name="lstm")

    pre_model = Model(act_space, gamma, n_step, rnn, use_hrnn, use_rmc,
                      use_amc, use_beta, use_reward_prediction, after_rnn,
                      use_pixel_control, False, **pre)

    post["state_in"] = tf.stop_gradient(pre_model.state_out)

    post_model = Model(act_space, gamma, n_step, rnn, use_hrnn, use_rmc,
                       use_amc, use_beta, use_reward_prediction, after_rnn,
                       use_pixel_control, True, **post)

    tf.summary.scalar("adv_mean", post_model.adv_mean)
    tf.summary.scalar("adv_std", post_model.adv_std)

    if use_retrace:
        q_loss = mse(post_model.qa, post_model.retrace_qs)
    else:
        q_loss = mse(post_model.qa, post_model.n_step_qs)
    # q_loss = mse(
    #     post_model.qa,
    #     tf.stop_gradient(
    #         post_model.current_value[:, :-n_step] + post_model.adv))
    q_loss = tf.reduce_mean(q_loss * post_model.mask[:, :-n_step] *
                            batch_weights[:, None]) + 3.0 * tf.reduce_mean(
                                q_loss * post_model.mask[:, :-n_step] *
                                (1.0 - batch_weights[:, None]))

    ent_loss = tf.reduce_mean(
        entropy_from_logits(post_model.current_act_logits) * post_model.mask *
        batch_weights[:, None])

    losses = dPPOcC(
        act=post_model.a[:, 1:1 - n_step],
        policy_logits=post_model.current_act_logits[:, :-n_step, :],
        behavior_logits=post_model.behavior_logits[:, :-n_step, :],
        advantage=post_model.adv,
        policy_clip=ppo_clip,
        vf=post_model.current_value[:, :-n_step],
        vf_target=post_model.vs,
        value_clip=vf_clip,
        old_vf=post_model.old_vf[:, :-n_step])
    p_loss = tf.reduce_mean(losses.p_loss * post_model.mask[:, :-n_step] *
                            batch_weights[:, None])
    v_loss = tf.reduce_mean(losses.v_loss * post_model.mask[:, :-n_step] *
                            batch_weights[:, None])

    add_loss = 0.0
    if use_hrnn:
        pq_kl_loss = KL_from_gaussians(post_model.q_mus, post_model.q_sigmas,
                                       post_model.p_mus, post_model.p_sigmas)
        pq_kl_loss = tf.reduce_mean(pq_kl_loss * post_model.mask)

        p_kl_loss = KL_from_gaussians(post_model.p_mus, post_model.p_sigmas,
                                      tf.zeros_like(post_model.p_mus),
                                      0.01 * tf.ones_like(post_model.p_sigmas))
        p_kl_loss = tf.reduce_mean(p_kl_loss * post_model.mask)

        with tf.name_scope("hierarchy_loss"):
            tf.summary.scalar("kl_div_pq", pq_kl_loss)
            tf.summary.scalar("kl_div_prior", p_kl_loss)
        add_loss += pq_kl_coef * pq_kl_loss
        add_loss += p_kl_coef * p_kl_loss

    if use_reward_prediction:
        r_loss = tf.reduce_mean(
            mse(post_model.reward_prediction, post_model.r[:, 1:1 - n_step]) *
            post_model.mask[:, :-n_step])
        tf.summary.scalar("r_loss", r_loss)
        add_loss += r_loss

    if use_pixel_control:
        s = tf.cast(post_model.s[:, :1 - n_step, :, :, :], tf.float32) / 255.0
        target = s[:, 1:, :, :, :] - s[:, :-1, :, :, :]
        shape = get_shape(target)
        target = tf.reshape(
            target,
            (shape[0], shape[1], 4, shape[2] // 4, 4, shape[3] // 4, shape[4]))
        target = tf.reduce_mean(target, axis=(2, 4))
        pixel_loss = tf.reduce_mean(
            mse(post_model.pixel_control, target) *
            post_model.mask[:, :-n_step, None, None, None])
        with tf.name_scope("control_loss"):
            tf.summary.scalar("pixel_control_loss", pixel_loss)
        add_loss += pixel_loss

    loss = (qf_coef * q_loss + vf_coef * v_loss + pi_coef * p_loss -
            ent_coef * ent_loss + add_loss)

    # n-step TD error: |Q(s, a) - (r^(n) + gamma**n * Q_target(s', a*))|.
    abs_td = post_model.mask[:, :-n_step] * tf.abs(
        post_model.qa - (post_model.n_step_rewards +
                         gamma**n_step * post_model.qa1))
    avg_p = tf.reduce_mean(abs_td, axis=-1)
    max_p = tf.reduce_max(abs_td, axis=-1)
    priority = 0.9 * max_p + 0.1 * avg_p

    beta = tf.train.polynomial_decay(
        0.4, global_step,
        FLAGS.total_environment_frames // (FLAGS.batch_size * FLAGS.seqlen),
        1.0)

    train_op = miniOp(optimizer, loss, FLAGS.grad_clip)

    if FLAGS.smooth_update:
        init_target_op = assignOp(1.0, {"q": "q_target"})
        target_op = assignOp(1.0 / FLAGS.target_update, {"q": "q_target"})
    else:
        init_target_op = assignOp(1.0, {"q": "q_target"})
        target_op = tf.no_op()

    dependency = [train_op, target_op]

    new_frames = tf.reduce_sum(post["mask"])

    with tf.control_dependencies(dependency):
        num_frames_and_train = tf.assign_add(num_frames, new_frames)
        global_step_and_train = tf.assign_add(global_step, 1)

    tf.summary.scalar("learning_rate", lr)
    tf.summary.scalar("pi_loss", p_loss)
    tf.summary.scalar("q_loss", q_loss)
    tf.summary.scalar("v_loss", v_loss)
    tf.summary.scalar("ent_loss", ent_loss)
    tf.summary.scalar("all_loss", loss)

    return num_frames_and_train, global_step_and_train, init_target_op, priority, beta
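This learner returns per-sequence priorities and an annealed beta, and receives batch_weights back from the replay side. For orientation, a sketch of how a prioritized replay buffer typically turns priorities and beta into importance weights (Schaul et al., 2016); the alpha exponent and eps are illustrative and not taken from test_ppo.

import numpy as np

def per_weights(priorities, beta, alpha=0.9, eps=1e-6):
    # P(i) ~ p_i**alpha; w_i = (N * P(i))**(-beta), normalized by the max so
    # the weights stay in (0, 1] and only scale gradients down.
    p = (np.asarray(priorities, dtype=np.float64) + eps) ** alpha
    prob = p / p.sum()
    w = (len(p) * prob) ** (-beta)
    return (w / w.max()).astype(np.float32)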
Code Example #8
def build_learner(pre, post, act_space, num_frames, samples_from_replayBuffer,
                  buffer_size, capacity):
    global_step = tf.train.get_or_create_global_step()
    init_lr = FLAGS.init_lr
    decay = FLAGS.lr_decay
    warmup_steps = FLAGS.warmup_steps
    gamma = FLAGS.gamma
    use_double = FLAGS.use_double

    global_step_float = tf.cast(global_step, tf.float32)

    lr = tf.train.polynomial_decay(
        init_lr, global_step, FLAGS.total_environment_frames *
        FLAGS.num_replay // (FLAGS.batch_size * FLAGS.seqlen), init_lr / 10.)
    is_warmup = tf.cast(global_step_float < warmup_steps, tf.float32)
    lr = is_warmup * global_step_float / warmup_steps * init_lr + (
        1.0 - is_warmup) * (init_lr * (1.0 - decay) + lr * decay)

    ent_coef = tf.train.polynomial_decay(
        FLAGS.ent_coef, global_step, FLAGS.total_environment_frames *
        FLAGS.num_replay // (FLAGS.batch_size * FLAGS.seqlen),
        FLAGS.ent_coef / 10.)

    if FLAGS.opt == "adam":
        optimizer = tf.train.AdamOptimizer(lr)
    else:
        optimizer = tf.train.RMSPropOptimizer(lr, epsilon=0.01)

    if FLAGS.zero_init:
        pre["state_in"] = tf.zeros_like(pre["state_in"])

    lstm = tf.compat.v1.keras.layers.LSTM(256,
                                          return_sequences=True,
                                          return_state=True,
                                          name="lstm")
    pre_model = Model(act_space, lstm, gamma, use_double, "agent", **pre)

    post["state_in"] = tf.stop_gradient(pre_model.state_out)

    post_model = Model(act_space, lstm, gamma, use_double, "agent", **post)

    if FLAGS.only_vtrace:
        advantage = post_model.vtrace_advantage
    else:
        advantage = tf.cond(buffer_size < capacity,
                            lambda: post_model.advantage,
                            lambda: post_model.vtrace_advantage)
    adv_mean = tf.reduce_mean(advantage)
    tf.summary.scalar("adv_mean", adv_mean)
    advantage = advantage - adv_mean
    adv_std = tf.math.sqrt(tf.reduce_mean(advantage**2))
    tf.summary.scalar("adv_std", adv_std)
    if FLAGS.normalize_advantage:
        advantage = advantage / tf.maximum(adv_std, 1e-12)

    ppo_loss = dPPOcC(post_model.a, post_model.act_logits,
                      post_model.old_act_logits, advantage, FLAGS.ppo_clip,
                      post_model.vf, post_model.v_tar, FLAGS.vf_clip,
                      post_model.v_cur)
    vtrace_loss = dPPOcC(post_model.a, post_model.act_logits,
                         post_model.old_act_logits, advantage, FLAGS.ppo_clip,
                         post_model.vf, post_model.vtrace_vf, FLAGS.vf_clip,
                         post_model.v_cur)
    if FLAGS.only_vtrace:
        p_loss = tf.reduce_mean((vtrace_loss.p_loss *
                                 post_model.slots)[:samples_from_replayBuffer])
    else:
        p_loss = tf.cond(
            buffer_size < capacity,
            lambda: tf.reduce_mean(
                (ppo_loss.p_loss *
                 post_model.slots)[:samples_from_replayBuffer]),
            lambda: tf.reduce_mean(
                (vtrace_loss.p_loss *
                 post_model.slots)[:samples_from_replayBuffer]))
    if FLAGS.only_vtrace:
        v_loss = tf.reduce_mean((vtrace_loss.v_loss *
                                 post_model.slots)[:samples_from_replayBuffer])
    else:
        v_loss = tf.cond(
            buffer_size < capacity,
            lambda: tf.reduce_mean(
                (ppo_loss.v_loss *
                 post_model.slots)[:samples_from_replayBuffer]),
            lambda: tf.reduce_mean(
                (vtrace_loss.v_loss *
                 post_model.slots)[:samples_from_replayBuffer]))

    ent_loss = tf.reduce_mean((entropy(post_model.act_logits) *
                               post_model.slots)[:samples_from_replayBuffer])

    if FLAGS.rescale:
        target = rescaleTarget(post_model.n_step_r, gamma**FLAGS.n_step,
                               post_model.qa1)
    else:
        target = (post_model.n_step_r + gamma**FLAGS.n_step * post_model.qa1)

    q_loss = tf.reduce_mean(
        mse(post_model.qa, tf.stop_gradient(target)) * post_model.slots)

    loss = tf.cond(
        buffer_size < capacity, lambda:
        (FLAGS.qf_coef * q_loss + FLAGS.vf_coef * v_loss + FLAGS.pi_coef *
         p_loss - ent_coef * ent_loss), lambda: FLAGS.qf_coef * q_loss)

    train_op = miniOp(optimizer, loss, FLAGS.grad_clip)

    exp_td = post_model.slots * tf.math.pow(
        tf.abs(post_model.qa -
               (post_model.n_step_r + gamma**FLAGS.n_step * post_model.qa1)),
        0.9)

    avg_p = tf.reduce_sum(exp_td, axis=-1) / (tf.reduce_sum(post_model.slots,
                                                            axis=-1))
    max_p = tf.reduce_max(exp_td, axis=-1)

    priority = 0.9 * max_p + 0.1 * avg_p
    priority = tf.cast(-10000 * priority, tf.int64)

    dependency = [train_op]
    if use_double:
        init_target_op = assignOp(1.0, {"current": "target"})
        if FLAGS.smooth_update:
            assign_op = assignOp(1.0 / FLAGS.target_update,
                                 {"current": "target"})
            dependency += [assign_op]
    else:
        init_target_op = []

    new_frames = tf.reduce_sum(post["slots"])

    with tf.control_dependencies(dependency):
        global_step_and_train = tf.assign_add(global_step, 1)
        num_frames_and_train = tf.assign_add(num_frames, new_frames)

    tf.summary.scalar("learning_rate", lr)
    tf.summary.scalar("all_loss", loss)
    tf.summary.scalar("p_loss", p_loss)
    tf.summary.scalar("q_loss", q_loss)
    tf.summary.scalar("v_loss", v_loss)
    tf.summary.scalar("ent_loss", ent_loss)
    tf.summary.scalar("ent_coef", ent_coef)

    return (num_frames_and_train, global_step_and_train, init_target_op,
            priority)
Code Example #9
File: ray_trainer.py, Project: hybug/test_ppo
def build_learner(pre, post, act_space, num_frames):
    global_step = tf.train.get_or_create_global_step()
    init_lr = FLAGS.init_lr
    decay = FLAGS.lr_decay
    warmup_steps = FLAGS.warmup_steps
    gamma = FLAGS.gamma
    use_hrnn = FLAGS.use_hrnn
    use_reward_prediction = FLAGS.use_reward_prediction
    after_rnn = FLAGS.after_rnn
    use_pixel_control = FLAGS.use_pixel_control
    use_pixel_reconstruction = FLAGS.use_pixel_reconstruction
    pq_kl_coef = FLAGS.pq_kl_coef
    p_kl_coef = FLAGS.p_kl_coef

    global_step_float = tf.cast(global_step, tf.float32)

    lr = tf.train.polynomial_decay(
        init_lr, global_step,
        FLAGS.total_environment_frames // (FLAGS.batch_size * FLAGS.seqlen),
        init_lr / 10.)
    is_warmup = tf.cast(global_step_float < warmup_steps, tf.float32)
    lr = is_warmup * global_step_float / warmup_steps * init_lr + (
        1.0 - is_warmup) * (init_lr * (1.0 - decay) + lr * decay)
    optimizer = tf.train.AdamOptimizer(lr)

    ent_coef = tf.train.polynomial_decay(
        FLAGS.ent_coef, global_step,
        FLAGS.total_environment_frames // (FLAGS.batch_size * FLAGS.seqlen),
        FLAGS.ent_coef / 10.)

    if FLAGS.zero_init:
        pre["state_in"] = tf.zeros_like(pre["state_in"])

    if use_hrnn:
        rnn = TmpHierRNN(4,
                         64,
                         4,
                         2,
                         8,
                         'lstm',
                         'rmc',
                         return_sequences=True,
                         return_state=True,
                         name="hrnn")
    else:
        rnn = tf.compat.v1.keras.layers.LSTM(256,
                                             return_sequences=True,
                                             return_state=True,
                                             name="lstm")
    pre_model = Model(act_space, gamma, rnn, use_hrnn, use_reward_prediction,
                      after_rnn, use_pixel_control, use_pixel_reconstruction,
                      False, **pre)

    post["state_in"] = tf.stop_gradient(pre_model.state_out)

    post_model = Model(act_space, gamma, rnn, use_hrnn, use_reward_prediction,
                       after_rnn, use_pixel_control, use_pixel_reconstruction,
                       True, **post)

    tf.summary.scalar("adv_mean", post_model.adv_mean)
    tf.summary.scalar("adv_std", post_model.adv_std)

    losses = dPPOcC(act=post_model.a,
                    policy_logits=post_model.current_act_logits,
                    behavior_logits=post_model.behavior_logits,
                    advantage=post_model.adv,
                    policy_clip=FLAGS.ppo_clip,
                    vf=post_model.current_value,
                    vf_target=post_model.vs,
                    value_clip=FLAGS.vf_clip,
                    old_vf=post_model.old_vf)

    entropy_loss = tf.reduce_mean(
        entropy(post_model.current_act_logits) * post_model.mask[:, :-1])

    p_loss = tf.reduce_mean(losses.p_loss * post_model.mask[:, :-1])
    v_loss = tf.reduce_mean(losses.v_loss * post_model.mask[:, :-1])

    add_loss = 0.0
    if use_hrnn:
        pq_kl_loss = KL_from_gaussians(post_model.q_mus, post_model.q_sigmas,
                                       post_model.p_mus, post_model.p_sigmas)
        pq_kl_loss = tf.reduce_mean(pq_kl_loss * post_model.mask)

        p_kl_loss = KL_from_gaussians(post_model.p_mus, post_model.p_sigmas,
                                      tf.zeros_like(post_model.p_mus),
                                      0.01 * tf.ones_like(post_model.p_sigmas))
        p_kl_loss = tf.reduce_mean(p_kl_loss * post_model.mask)

        with tf.name_scope("hierarchy_loss"):
            tf.summary.scalar("kl_div_pq", pq_kl_loss)
            tf.summary.scalar("kl_div_prior", p_kl_loss)
        add_loss += pq_kl_coef * pq_kl_loss
        add_loss += p_kl_coef * p_kl_loss
    if use_reward_prediction:
        r_loss = tf.reduce_mean(
            mse(post_model.reward_prediction, post_model.r) *
            post_model.mask[:, :-1])
        tf.summary.scalar("r_loss", r_loss)
        add_loss += r_loss
    if use_pixel_control:
        s = tf.cast(post_model.s, tf.float32) / 255.0
        target = s[:, 1:, :, :, :] - s[:, :-1, :, :, :]
        shape = get_shape(target)
        target = tf.reshape(
            target,
            (shape[0], shape[1], 4, shape[2] // 4, 4, shape[3] // 4, shape[4]))
        target = tf.reduce_mean(target, axis=(2, 4))
        pixel_loss = tf.reduce_mean(
            mse(post_model.pixel_control, target) *
            post_model.mask[:, :-1, None, None, None])
        with tf.name_scope("control_loss"):
            tf.summary.scalar("pixel_control_loss", pixel_loss)
        add_loss += pixel_loss
    if use_pixel_reconstruction:
        s = tf.cast(post_model.s, tf.float32) / 255.0
        rec_loss = tf.reduce_mean(
            mse(post_model.pixel_reconstruction, s[:, :-1, :, :, :]) *
            post_model.mask[:, :-1, None, None, None])
        tf.summary.scalar("rec_loss", rec_loss)
        add_loss += rec_loss

    loss = (FLAGS.pi_coef * p_loss + FLAGS.vf_coef * v_loss -
            ent_coef * entropy_loss + add_loss)

    train_op = miniOp(optimizer, loss, FLAGS.grad_clip)

    new_frames = tf.reduce_sum(post["mask"])

    with tf.control_dependencies([train_op]):
        num_frames_and_train = tf.assign_add(num_frames, new_frames)
        global_step_and_train = tf.assign_add(global_step, 1)

    tf.summary.scalar("learning_rate", lr)
    tf.summary.scalar("ent_coef", ent_coef)
    tf.summary.scalar("ent_loss", entropy_loss)
    tf.summary.scalar("p_loss", p_loss)
    tf.summary.scalar("v_loss", v_loss)
    tf.summary.scalar("all_loss", loss)

    return num_frames_and_train, global_step_and_train