def dSAC(act, policy_logits, rewards, qf1, qf2, vf, next_vf_target, ALPHA,
         GAMMA, normalize_advantage=False):
    # Log-probability of the taken action under the current policy.
    log_p = -tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=act, logits=policy_logits)

    # Twin-Q minimum; fall back to the single critic when qf2 is not used.
    if qf2 is None:
        min_qf = qf1
    else:
        min_qf = tf.minimum(qf1, qf2)

    # Soft value target: min Q minus the (scaled) log-probability term.
    v_loss = mse(vf, tf.stop_gradient(min_qf - log_p / ALPHA))

    # One-step soft Bellman target for the critics.
    qf_target = tf.stop_gradient(rewards + GAMMA * next_vf_target)
    q1_loss = mse(qf1, qf_target)
    if qf2 is None:
        q2_loss = 0.
    else:
        q2_loss = mse(qf2, qf_target)
    q_loss = q1_loss + q2_loss

    advantage = qf1 - vf - log_p / ALPHA
    adv_mean = tf.reduce_mean(advantage)
    tf.summary.scalar("adv_mean", adv_mean)
    advantage_center = advantage - adv_mean
    adv_std = tf.sqrt(tf.reduce_mean(advantage_center**2))
    tf.summary.scalar("adv_std", adv_std)
    if normalize_advantage:
        advantage = advantage_center / tf.maximum(adv_std, 1e-12)

    # Policy-gradient surrogate with the advantage treated as a constant.
    p_loss = -log_p * tf.stop_gradient(advantage)

    return SACloss(p_loss, q_loss, v_loss)
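# --- Sketch of helpers assumed by dSAC (not the repo's actual definitions) ---
# `tf` is the file's existing TensorFlow 1.x import. `SACloss` is a plain
# result container, and `mse` is used both as a bare squared error and, in
# the PPO-style losses below, with optional value clipping around
# `clip_center`. Exact definitions in the repo may differ.
import collections

SACloss = collections.namedtuple("SACloss", ["p_loss", "q_loss", "v_loss"])


def mse(y_hat, y_target, clip=None, clip_center=None):
    error = tf.square(y_hat - y_target)
    if clip is not None and clip_center is not None:
        # Pessimistic (elementwise max) of the raw and the clipped error,
        # as in PPO value-function clipping.
        clipped = clip_center + tf.clip_by_value(y_hat - clip_center,
                                                 -clip, clip)
        error = tf.maximum(error, tf.square(clipped - y_target))
    return error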
def dPPOcC(act, policy_logits, behavior_logits, advantage, policy_clip, vf,
           vf_target, value_clip, old_vf):
    a_loss = dPPOc(act=act,
                   policy_logits=policy_logits,
                   behavior_logits=behavior_logits,
                   advantage=advantage,
                   clip=policy_clip)
    c_loss = mse(y_hat=vf,
                 y_target=vf_target,
                 clip=value_clip,
                 clip_center=old_vf)
    return PPOcCloss(a_loss, c_loss)
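# Sketch of the clipped PPO surrogate `dPPOc` and its result container
# assumed above (an assumption; the repo's definition may differ): the
# importance ratio pi/mu is recovered from the two sets of logits and the
# standard pessimistic clipped objective is returned per timestep.
PPOcCloss = collections.namedtuple("PPOcCloss", ["p_loss", "v_loss"])


def dPPOc(act, policy_logits, behavior_logits, advantage, clip):
    log_p = -tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=act, logits=policy_logits)
    log_mu = -tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=act, logits=behavior_logits)
    ratio = tf.exp(log_p - tf.stop_gradient(log_mu))
    surr = tf.minimum(
        ratio * advantage,
        tf.clip_by_value(ratio, 1.0 - clip, 1.0 + clip) * advantage)
    # Negated so the caller can minimize it directly.
    return -surr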
def dAC(act, policy_logits, advantage, vf, vf_target, value_clip=None,
        old_vf=None):
    ros = IS_from_logits(policy_logits=policy_logits, act=act)
    a_loss = -advantage * ros
    c_loss = mse(y_hat=vf,
                 y_target=vf_target,
                 clip=value_clip,
                 clip_center=old_vf)
    return ACloss(a_loss, c_loss)
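# Result container assumed by dAC (a sketch). Like the other loss helpers,
# dAC returns per-timestep losses; callers mask and reduce them, e.g.
#     losses = dAC(act, logits, adv, vf, vf_target)
#     a_loss = tf.reduce_mean(losses.a_loss * mask)
ACloss = collections.namedtuple("ACloss", ["a_loss", "c_loss"])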
def build_learner(pre, post, act_space, num_frames):
    global_step = tf.train.get_or_create_global_step()
    init_lr = FLAGS.init_lr
    decay = FLAGS.lr_decay
    warmup_steps = FLAGS.warmup_steps

    global_step_float = tf.cast(global_step, tf.float32)

    # Learning-rate schedule: linear warmup, then a blend of the initial rate
    # and a polynomial decay controlled by `decay`.
    lr = tf.train.polynomial_decay(
        init_lr, global_step,
        FLAGS.total_environment_frames * FLAGS.num_replay //
        (FLAGS.batch_size * FLAGS.seqlen),
        init_lr / 10.)
    is_warmup = tf.cast(global_step_float < warmup_steps, tf.float32)
    lr = is_warmup * global_step_float / warmup_steps * init_lr + (
        1.0 - is_warmup) * (init_lr * (1.0 - decay) + lr * decay)

    if FLAGS.opt == "adam":
        optimizer = tf.train.AdamOptimizer(lr)
    else:
        optimizer = tf.train.RMSPropOptimizer(lr, epsilon=0.01)

    if FLAGS.zero_init:
        pre["state_in"] = tf.zeros_like(pre["state_in"])

    lstm = tf.compat.v1.keras.layers.LSTM(
        256, return_sequences=True, return_state=True, name="lstm")
    pre_model = Model(act_space, lstm, "agent", **pre)

    # Burn-in: the `pre` segment only produces the recurrent state; gradients
    # do not flow from the training segment back into it.
    post["state_in"] = tf.stop_gradient(pre_model.state_out)
    post_model = Model(act_space, lstm, "agent", **post)

    # n-step TD target, optionally value-rescaled.
    if FLAGS.rescale:
        target = rescaleTarget(post_model.r, FLAGS.gamma**FLAGS.n_step,
                               post_model.qa1)
    else:
        target = post_model.r + FLAGS.gamma**FLAGS.n_step * post_model.qa1

    loss = 100. * tf.reduce_mean(
        post_model.slots * mse(post_model.qa, tf.stop_gradient(target)))

    # Replay priority: exponentiated TD error, mixed between max and mean
    # over each sequence, then cast to a negated, scaled integer key.
    exp_td = post_model.slots * tf.math.pow(
        tf.abs(post_model.qa -
               (post_model.r + FLAGS.gamma**FLAGS.n_step * post_model.qa1)),
        0.9)
    avg_p = tf.reduce_sum(exp_td, axis=-1) / tf.reduce_sum(post_model.slots,
                                                           axis=-1)
    max_p = tf.reduce_max(exp_td, axis=-1)
    priority = 0.9 * max_p + 0.1 * avg_p
    priority = tf.cast(-10000 * priority, tf.int64)

    train_op = miniOp(optimizer, loss, FLAGS.grad_clip)

    # Hard copy for target-network init; optional soft update every step.
    init_target_op = assignOp(
        1.0, {post_model.scope + "_current": post_model.scope + "_target"})
    if FLAGS.smooth_update:
        assign_op = assignOp(
            1.0 / FLAGS.target_update,
            {post_model.scope + "_current": post_model.scope + "_target"})
        dependency = [train_op, assign_op]
    else:
        dependency = [train_op]

    new_frames = tf.reduce_sum(post["slots"])

    with tf.control_dependencies(dependency):
        global_step_and_train = tf.assign_add(global_step, 1)
        num_frames_and_train = tf.assign_add(num_frames, new_frames)

    tf.summary.scalar("learning_rate", lr)
    tf.summary.scalar("all_loss", loss)

    return (num_frames_and_train, global_step_and_train, init_target_op,
            priority)
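# Sketch of the rescaled n-step target assumed by `rescaleTarget` above: the
# invertible value transform h of Pohlen et al. (as used in R2D2), applied as
# h(r + gamma^n * h^{-1}(q_target)). The epsilon and the exact signature are
# assumptions; the repo's version may differ.
def h(x, eps=1e-2):
    return tf.sign(x) * (tf.sqrt(tf.abs(x) + 1.0) - 1.0) + eps * x


def h_inv(x, eps=1e-2):
    return tf.sign(x) * (tf.square(
        (tf.sqrt(1.0 + 4.0 * eps * (tf.abs(x) + 1.0 + eps)) - 1.0) /
        (2.0 * eps)) - 1.0)


def rescaleTarget(n_step_rewards, discount, bootstrap_q):
    return h(n_step_rewards + discount * h_inv(bootstrap_q))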
def build_learner(pre, post, ws, act_space, num_frames):
    global_step = tf.train.get_or_create_global_step()
    init_lr = FLAGS.init_lr
    decay = FLAGS.lr_decay
    warmup_steps = FLAGS.warmup_steps
    gamma = FLAGS.gamma
    n_step = FLAGS.n_step
    use_soft = FLAGS.use_soft
    time_scale = FLAGS.time_scale
    use_hrnn = FLAGS.use_hrnn
    use_reward_prediction = FLAGS.use_reward_prediction
    after_rnn = FLAGS.after_rnn
    use_pixel_control = FLAGS.use_pixel_control
    pq_kl_coef = FLAGS.pq_kl_coef
    p_kl_coef = FLAGS.p_kl_coef

    global_step_float = tf.cast(global_step, tf.float32)

    # Linear warmup followed by a blend of constant and polynomial decay.
    lr = tf.train.polynomial_decay(
        init_lr, global_step,
        FLAGS.total_environment_frames // (FLAGS.batch_size * FLAGS.seqlen),
        init_lr / 10.)
    is_warmup = tf.cast(global_step_float < warmup_steps, tf.float32)
    lr = is_warmup * global_step_float / warmup_steps * init_lr + (
        1.0 - is_warmup) * (init_lr * (1.0 - decay) + lr * decay)
    optimizer = tf.train.AdamOptimizer(lr)

    if FLAGS.zero_init:
        pre["state_in"] = tf.zeros_like(pre["state_in"])

    if use_hrnn:
        rnn = TmpHierRNN(time_scale, 64, 4, 2, 8, 'lstm', 'rmc',
                         return_sequences=True, return_state=True,
                         name="hrnn")
    else:
        rnn = tf.compat.v1.keras.layers.LSTM(
            256, return_sequences=True, return_state=True, name="lstm")

    pre_model = Model(act_space, gamma, n_step, use_soft, rnn, use_hrnn,
                      use_reward_prediction, after_rnn, use_pixel_control,
                      False, **pre)

    # Burn-in: stop gradients into the training segment's initial state.
    post["state_in"] = tf.stop_gradient(pre_model.state_out)
    post_model = Model(act_space, gamma, n_step, use_soft, rnn, use_hrnn,
                       use_reward_prediction, after_rnn, use_pixel_control,
                       True, **post)

    # Masked, importance-weighted n-step Q loss against the rescaled target
    # (vf_coef is applied once, in the total loss below).
    v_loss = mse(
        post_model.qa,
        tf.stop_gradient(
            rescaleTarget(post_model.n_step_rewards, gamma**n_step,
                          post_model.qa1)))
    v_loss = tf.reduce_mean(v_loss * post_model.mask[:, :-n_step] *
                            ws[:, None])

    add_loss = 0.0
    if use_hrnn:
        pq_kl_loss = KL_from_gaussians(post_model.q_mus, post_model.q_sigmas,
                                       post_model.p_mus, post_model.p_sigmas)
        pq_kl_loss = tf.reduce_mean(pq_kl_loss * post_model.mask)

        p_kl_loss = KL_from_gaussians(
            post_model.p_mus, post_model.p_sigmas,
            tf.zeros_like(post_model.p_mus),
            0.01 * tf.ones_like(post_model.p_sigmas))
        p_kl_loss = tf.reduce_mean(p_kl_loss * post_model.mask)

        with tf.name_scope("hierarchy_loss"):
            tf.summary.scalar("kl_div_pq", pq_kl_loss)
            tf.summary.scalar("kl_div_prior", p_kl_loss)
        add_loss += pq_kl_coef * pq_kl_loss
        add_loss += p_kl_coef * p_kl_loss

    if use_reward_prediction:
        r_loss = tf.reduce_mean(
            mse(post_model.reward_prediction, post_model.r[:, 1:1 - n_step]) *
            post_model.mask[:, :-n_step])
        tf.summary.scalar("r_loss", r_loss)
        add_loss += r_loss

    if use_pixel_control:
        # Auxiliary pixel-control task: predict the 4x4-pooled change between
        # consecutive frames.
        s = tf.cast(post_model.s[:, :1 - n_step, :, :, :], tf.float32) / 255.0
        target = s[:, 1:, :, :, :] - s[:, :-1, :, :, :]
        shape = get_shape(target)
        target = tf.reshape(
            target,
            (shape[0], shape[1], 4, shape[2] // 4, 4, shape[3] // 4,
             shape[4]))
        target = tf.reduce_mean(target, axis=(2, 4))
        pixel_loss = tf.reduce_mean(
            mse(post_model.pixel_control, target) *
            post_model.mask[:, :-n_step, None, None, None])
        with tf.name_scope("control_loss"):
            tf.summary.scalar("pixel_control_loss", pixel_loss)
        add_loss += pixel_loss

    loss = FLAGS.vf_coef * v_loss + add_loss

    # Replay priority from the absolute TD error against the rescaled target.
    abs_td = post_model.mask[:, :-n_step] * tf.abs(
        post_model.qa - rescaleTarget(post_model.n_step_rewards,
                                      gamma**n_step, post_model.qa1))
    avg_p = tf.reduce_mean(abs_td, axis=-1)
    max_p = tf.reduce_max(abs_td, axis=-1)
    priority = 0.9 * max_p + 0.1 * avg_p

    # Anneal the prioritized-replay importance exponent from 0.4 to 1.0.
    beta = tf.train.polynomial_decay(
        0.4, global_step,
        FLAGS.total_environment_frames // (FLAGS.batch_size * FLAGS.seqlen),
        1.0)

    train_op = miniOp(optimizer, loss, FLAGS.grad_clip)

    target_op = assignOp(1.0, {"q": "q_target"})

    dependency = [train_op]
    if use_soft:
        # Soft-Q temperature tuning: regress the policy entropy toward an
        # annealed target entropy, updating only the "temperature" scope.
        qf_entropy = entropy_from_logits(post_model.qf_logits)
        target_entropy = tf.train.polynomial_decay(
            0.9 * np.log(act_space), global_step,
            FLAGS.total_environment_frames //
            (FLAGS.batch_size * FLAGS.seqlen),
            0.5 * np.log(act_space))
        ent_loss = tf.reduce_mean(
            mse(qf_entropy, tf.cast(target_entropy, tf.float32)[None, None]))
        with tf.name_scope("ent_loss"):
            tf.summary.scalar("ent_loss", ent_loss)
        ent_op = miniOp(optimizer, ent_loss,
                        grad_clip=FLAGS.grad_clip,
                        var_scope="temperature")
        dependency.append(ent_op)

    new_frames = tf.reduce_sum(post["mask"])

    with tf.control_dependencies(dependency):
        num_frames_and_train = tf.assign_add(num_frames, new_frames)
        global_step_and_train = tf.assign_add(global_step, 1)

    tf.summary.scalar("learning_rate", lr)
    tf.summary.scalar("v_loss", v_loss)
    tf.summary.scalar("all_loss", loss)

    return (num_frames_and_train, global_step_and_train, target_op, priority,
            beta)
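# Sketch of the categorical entropy helper assumed by the soft-Q branch above
# (an assumption about the repo's `entropy_from_logits`): Shannon entropy of
# softmax(logits), reduced over the action dimension. The variables under the
# "temperature" scope are trained to pull this entropy toward the annealed
# `target_entropy`.
def entropy_from_logits(logits):
    probs = tf.nn.softmax(logits, axis=-1)
    log_probs = tf.nn.log_softmax(logits, axis=-1)
    return -tf.reduce_sum(probs * log_probs, axis=-1)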
def build_learner(pre, post, act_space, num_frames):
    global_step = tf.train.get_or_create_global_step()
    init_lr = FLAGS.init_lr
    decay = FLAGS.lr_decay
    warmup_steps = FLAGS.warmup_steps
    use_rmc = FLAGS.use_rmc
    use_hrmc = FLAGS.use_hrmc
    use_hrnn = FLAGS.use_hrnn
    use_icm = FLAGS.use_icm
    use_coex = FLAGS.use_coex
    use_reward_prediction = FLAGS.use_reward_prediction
    after_rnn = FLAGS.after_rnn
    use_pixel_control = FLAGS.use_pixel_control
    use_pixel_reconstruction = FLAGS.use_pixel_reconstruction
    pq_kl_coef = FLAGS.pq_kl_coef
    p_kl_coef = FLAGS.p_kl_coef

    global_step_float = tf.cast(global_step, tf.float32)

    # Linear warmup followed by a blend of constant and polynomial decay.
    lr = tf.train.polynomial_decay(
        init_lr, global_step,
        FLAGS.total_environment_frames // (FLAGS.batch_size * FLAGS.seqlen),
        init_lr / 10.)
    is_warmup = tf.cast(global_step_float < warmup_steps, tf.float32)
    lr = is_warmup * global_step_float / warmup_steps * init_lr + (
        1.0 - is_warmup) * (init_lr * (1.0 - decay) + lr * decay)
    optimizer = tf.train.AdamOptimizer(lr)

    # Entropy bonus annealed over training.
    ent_coef = tf.train.polynomial_decay(
        FLAGS.ent_coef, global_step,
        FLAGS.total_environment_frames // (FLAGS.batch_size * FLAGS.seqlen),
        FLAGS.ent_coef / 10.)

    if FLAGS.zero_init:
        pre["state_in"] = tf.zeros_like(pre["state_in"])

    if use_hrnn:
        rnn = TmpHierRNN(4, 64, 4, 2, 8, 'lstm', 'rmc',
                         return_sequences=True, return_state=True,
                         name="hrnn")
    elif use_hrmc:
        rnn = TmpHierRMCRNN(4, 64, 4, 4,
                            return_sequences=True, return_state=True,
                            name="hrmcrnn")
    elif use_rmc:
        rnn = RMCRNN(64, 4, 4,
                     return_sequences=True, return_state=True,
                     name="rmcrnn")
    else:
        rnn = tf.compat.v1.keras.layers.LSTM(
            256, return_sequences=True, return_state=True, name="lstm")

    pre_model = Model(act_space, rnn, use_rmc, use_hrmc or use_hrnn,
                      use_reward_prediction, after_rnn,
                      use_pixel_reconstruction, "agent", **pre)

    # Burn-in: stop gradients into the training segment's initial state.
    post["state_in"] = tf.stop_gradient(pre_model.state_out)
    post_model = Model(act_space, rnn, use_rmc, use_hrmc or use_hrnn,
                       use_reward_prediction, after_rnn,
                       use_pixel_reconstruction, "agent", **post)

    tf.summary.scalar("adv_mean", post_model.adv_mean)
    tf.summary.scalar("adv_std", post_model.adv_std)

    losses = dPPOcC(act=post_model.a_t,
                    policy_logits=post_model.current_act_logits,
                    behavior_logits=post_model.old_act_logits,
                    advantage=post_model.advantage,
                    policy_clip=FLAGS.ppo_clip,
                    vf=post_model.current_value,
                    vf_target=post_model.ret,
                    value_clip=FLAGS.vf_clip,
                    old_vf=post_model.old_current_value)

    entropy_loss = tf.reduce_mean(
        entropy(post_model.current_act_logits) * post_model.slots)

    p_loss = tf.reduce_mean(losses.p_loss * post_model.slots)
    v_loss = tf.reduce_mean(losses.v_loss * post_model.slots)

    # Optional auxiliary losses.
    add_loss = 0.0
    if use_icm:
        # Intrinsic curiosity: forward- and inverse-model losses.
        icmloss = icm(post_model.cnn_feature[:, :-1, :],
                      post_model.cnn_feature[:, 1:, :],
                      post_model.a_t[:, :-1], act_space)
        add_loss += 0.2 * tf.reduce_mean(
            icmloss.f_loss * post_model.slots[:, :-1]) + 0.8 * tf.reduce_mean(
                icmloss.i_loss * post_model.slots[:, :-1])
    if use_coex:
        coexloss = coex(post_model.image_feature[:, :-1, :, :, :],
                        post_model.image_feature[:, 1:, :, :, :],
                        post_model.a_t[:, :-1], act_space)
        add_loss += tf.reduce_mean(coexloss * post_model.slots[:, :-1])
    if use_hrmc or use_hrnn:
        pq_kl_loss = KL_from_gaussians(post_model.q_mus, post_model.q_sigmas,
                                       post_model.p_mus, post_model.p_sigmas)
        pq_kl_loss = tf.reduce_mean(pq_kl_loss * post_model.slots)
        tf.summary.scalar("kl_div", pq_kl_loss)
        add_loss += pq_kl_coef * pq_kl_loss

        p_kl_loss = KL_from_gaussians(
            post_model.p_mus, post_model.p_sigmas,
            tf.zeros_like(post_model.p_mus),
            0.01 * tf.ones_like(post_model.p_sigmas))
        p_kl_loss = tf.reduce_mean(p_kl_loss * post_model.slots)
        tf.summary.scalar("kl_div_prior", p_kl_loss)
        add_loss += p_kl_coef * p_kl_loss
    if use_reward_prediction:
        r_loss = tf.reduce_mean(
            mse(post_model.reward_prediction, post_model.r_t) *
            post_model.slots)
        tf.summary.scalar("r_loss", r_loss)
        add_loss += r_loss
    if use_pixel_control:
        # Pixel control: average frame change pooled to the control-map size,
        # compared against the control output for the taken action.
        change_of_cells = tf.reduce_mean(
            post_model.s_t[:, 1:, :, :, :] - post_model.s_t[:, :-1, :, :, :],
            axis=-1)
        s_shape = get_shape(change_of_cells)
        s_H, s_W = s_shape[2:]
        ctr_H, ctr_W = get_shape(post_model.pixel_control)[2:4]
        change_of_cells = tf.reduce_mean(
            tf.reshape(change_of_cells,
                       shape=s_shape[:2] + [ctr_H, s_H // ctr_H,
                                            ctr_W, s_W // ctr_W]),
            axis=(3, 5))
        ctr = tf.reduce_sum(
            tf.transpose(post_model.pixel_control, perm=(0, 1, 4, 2, 3)) *
            tf.one_hot(post_model.a_t,
                       depth=post_model.act_space,
                       dtype=tf.float32)[:, :, :, None, None],
            axis=2)[:, :-1, :, :]
        ctr_loss = tf.reduce_mean(mse(ctr, change_of_cells))
        tf.summary.scalar("pixel_control_loss", ctr_loss)
        add_loss += ctr_loss
    if use_pixel_reconstruction:
        rec_loss = tf.reduce_mean(
            mse(post_model.pixel_reconstruction, post_model.s_t) *
            post_model.slots[:, :, None, None, None])
        tf.summary.scalar("rec_loss", rec_loss)
        add_loss += rec_loss

    loss = (FLAGS.pi_coef * p_loss + FLAGS.vf_coef * v_loss -
            ent_coef * entropy_loss + add_loss)

    train_op = miniOp(optimizer, loss, FLAGS.grad_clip)

    new_frames = tf.reduce_sum(post["slots"])

    with tf.control_dependencies([train_op]):
        num_frames_and_train = tf.assign_add(num_frames, new_frames)
        global_step_and_train = tf.assign_add(global_step, 1)

    tf.summary.scalar("learning_rate", lr)
    tf.summary.scalar("ent_coef", ent_coef)
    tf.summary.scalar("ent_loss", entropy_loss)
    tf.summary.scalar("p_loss", p_loss)
    tf.summary.scalar("v_loss", v_loss)
    tf.summary.scalar("all_loss", loss)

    return num_frames_and_train, global_step_and_train
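# Sketch of the optimization helper `miniOp` used by every build_learner here
# (an assumption about its behavior; the repo's version may differ): clip
# gradients by global norm and apply them, optionally restricted to the
# trainable variables under `var_scope`.
def miniOp(optimizer, loss, grad_clip=None, var_scope=None):
    var_list = tf.trainable_variables(var_scope)
    grads_and_vars = [(g, v)
                      for g, v in optimizer.compute_gradients(
                          loss, var_list=var_list) if g is not None]
    grads, variables = zip(*grads_and_vars)
    if grad_clip is not None:
        grads, _ = tf.clip_by_global_norm(grads, grad_clip)
    return optimizer.apply_gradients(zip(grads, variables))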
def build_learner(pre, post, act_space, num_frames, batch_weights):
    global_step = tf.train.get_or_create_global_step()
    init_lr = FLAGS.init_lr
    decay = FLAGS.lr_decay
    warmup_steps = FLAGS.warmup_steps
    gamma = FLAGS.gamma
    n_step = FLAGS.n_step
    time_scale = FLAGS.time_scale
    use_hrnn = FLAGS.use_hrnn
    use_rmc = FLAGS.use_rmc
    use_amc = FLAGS.use_amc
    use_beta = FLAGS.use_beta
    use_retrace = FLAGS.use_retrace
    use_reward_prediction = FLAGS.use_reward_prediction
    after_rnn = FLAGS.after_rnn
    use_pixel_control = FLAGS.use_pixel_control
    pq_kl_coef = FLAGS.pq_kl_coef
    p_kl_coef = FLAGS.p_kl_coef
    pi_coef = FLAGS.pi_coef
    vf_coef = FLAGS.vf_coef
    ent_coef = FLAGS.ent_coef
    qf_coef = FLAGS.qf_coef
    ppo_clip = FLAGS.ppo_clip
    vf_clip = FLAGS.vf_clip

    global_step_float = tf.cast(global_step, tf.float32)

    # Linear warmup followed by a blend of constant and polynomial decay.
    lr = tf.train.polynomial_decay(
        init_lr, global_step,
        FLAGS.total_environment_frames // (FLAGS.batch_size * FLAGS.seqlen),
        init_lr / 10.)
    is_warmup = tf.cast(global_step_float < warmup_steps, tf.float32)
    lr = is_warmup * global_step_float / warmup_steps * init_lr + (
        1.0 - is_warmup) * (init_lr * (1.0 - decay) + lr * decay)
    ent_coef = tf.train.polynomial_decay(
        ent_coef, global_step,
        FLAGS.total_environment_frames // (FLAGS.batch_size * FLAGS.seqlen),
        ent_coef / 10.)
    optimizer = tf.train.AdamOptimizer(lr)

    if FLAGS.zero_init:
        pre["state_in"] = tf.zeros_like(pre["state_in"])

    if use_hrnn:
        rnn = TmpHierRNN(time_scale, 64, 4, 2, 8, 'lstm', 'rmc',
                         return_sequences=True, return_state=True,
                         name="hrnn")
    elif use_rmc:
        rnn = RMCRNN(64, 4, 64,
                     return_sequences=True, return_state=True, name="rmc")
    elif use_amc:
        rnn = AMCRNN(64, 4, 64,
                     return_sequences=True, return_state=True, name="amc")
    else:
        rnn = tf.compat.v1.keras.layers.CuDNNLSTM(
            256, return_sequences=True, return_state=True, name="lstm")

    pre_model = Model(act_space, gamma, n_step, rnn, use_hrnn, use_rmc,
                      use_amc, use_beta, use_reward_prediction, after_rnn,
                      use_pixel_control, False, **pre)

    # Burn-in: stop gradients into the training segment's initial state.
    post["state_in"] = tf.stop_gradient(pre_model.state_out)
    post_model = Model(act_space, gamma, n_step, rnn, use_hrnn, use_rmc,
                       use_amc, use_beta, use_reward_prediction, after_rnn,
                       use_pixel_control, True, **post)

    tf.summary.scalar("adv_mean", post_model.adv_mean)
    tf.summary.scalar("adv_std", post_model.adv_std)

    # Q loss against Retrace or plain n-step targets, split by batch_weights
    # into two differently weighted terms.
    if use_retrace:
        q_loss = mse(post_model.qa, post_model.retrace_qs)
    else:
        q_loss = mse(post_model.qa, post_model.n_step_qs)
    # q_loss = mse(
    #     post_model.qa,
    #     tf.stop_gradient(
    #         post_model.current_value[:, :-n_step] + post_model.adv))
    q_loss = tf.reduce_mean(
        q_loss * post_model.mask[:, :-n_step] * batch_weights[:, None]
    ) + 3.0 * tf.reduce_mean(
        q_loss * post_model.mask[:, :-n_step] *
        (1.0 - batch_weights[:, None]))

    ent_loss = tf.reduce_mean(
        entropy_from_logits(post_model.current_act_logits) *
        post_model.mask * batch_weights[:, None])

    losses = dPPOcC(
        act=post_model.a[:, 1:1 - n_step],
        policy_logits=post_model.current_act_logits[:, :-n_step, :],
        behavior_logits=post_model.behavior_logits[:, :-n_step, :],
        advantage=post_model.adv,
        policy_clip=ppo_clip,
        vf=post_model.current_value[:, :-n_step],
        vf_target=post_model.vs,
        value_clip=vf_clip,
        old_vf=post_model.old_vf[:, :-n_step])

    p_loss = tf.reduce_mean(losses.p_loss * post_model.mask[:, :-n_step] *
                            batch_weights[:, None])
    v_loss = tf.reduce_mean(losses.v_loss * post_model.mask[:, :-n_step] *
                            batch_weights[:, None])

    add_loss = 0.0
    if use_hrnn:
        pq_kl_loss = KL_from_gaussians(post_model.q_mus, post_model.q_sigmas,
                                       post_model.p_mus, post_model.p_sigmas)
        pq_kl_loss = tf.reduce_mean(pq_kl_loss * post_model.mask)

        p_kl_loss = KL_from_gaussians(
            post_model.p_mus, post_model.p_sigmas,
            tf.zeros_like(post_model.p_mus),
            0.01 * tf.ones_like(post_model.p_sigmas))
        p_kl_loss = tf.reduce_mean(p_kl_loss * post_model.mask)

        with tf.name_scope("hierarchy_loss"):
            tf.summary.scalar("kl_div_pq", pq_kl_loss)
            tf.summary.scalar("kl_div_prior", p_kl_loss)
        add_loss += pq_kl_coef * pq_kl_loss
        add_loss += p_kl_coef * p_kl_loss

    if use_reward_prediction:
        r_loss = tf.reduce_mean(
            mse(post_model.reward_prediction, post_model.r[:, 1:1 - n_step]) *
            post_model.mask[:, :-n_step])
        tf.summary.scalar("r_loss", r_loss)
        add_loss += r_loss

    if use_pixel_control:
        # Auxiliary pixel-control task on 4x4-pooled frame differences.
        s = tf.cast(post_model.s[:, :1 - n_step, :, :, :], tf.float32) / 255.0
        target = s[:, 1:, :, :, :] - s[:, :-1, :, :, :]
        shape = get_shape(target)
        target = tf.reshape(
            target,
            (shape[0], shape[1], 4, shape[2] // 4, 4, shape[3] // 4,
             shape[4]))
        target = tf.reduce_mean(target, axis=(2, 4))
        pixel_loss = tf.reduce_mean(
            mse(post_model.pixel_control, target) *
            post_model.mask[:, :-n_step, None, None, None])
        with tf.name_scope("control_loss"):
            tf.summary.scalar("pixel_control_loss", pixel_loss)
        add_loss += pixel_loss

    loss = (qf_coef * q_loss + vf_coef * v_loss + pi_coef * p_loss -
            ent_coef * ent_loss + add_loss)

    # Replay priority from the absolute n-step TD error.
    abs_td = post_model.mask[:, :-n_step] * tf.abs(
        post_model.qa -
        (post_model.n_step_rewards + gamma**n_step * post_model.qa1))
    avg_p = tf.reduce_mean(abs_td, axis=-1)
    max_p = tf.reduce_max(abs_td, axis=-1)
    priority = 0.9 * max_p + 0.1 * avg_p

    # Anneal the prioritized-replay importance exponent from 0.4 to 1.0.
    beta = tf.train.polynomial_decay(
        0.4, global_step,
        FLAGS.total_environment_frames // (FLAGS.batch_size * FLAGS.seqlen),
        1.0)

    train_op = miniOp(optimizer, loss, FLAGS.grad_clip)

    if FLAGS.smooth_update:
        init_target_op = assignOp(1.0, {"q": "q_target"})
        target_op = assignOp(1.0 / FLAGS.target_update, {"q": "q_target"})
    else:
        init_target_op = assignOp(1.0, {"q": "q_target"})
        target_op = tf.no_op()

    dependency = [train_op, target_op]

    new_frames = tf.reduce_sum(post["mask"])

    with tf.control_dependencies(dependency):
        num_frames_and_train = tf.assign_add(num_frames, new_frames)
        global_step_and_train = tf.assign_add(global_step, 1)

    tf.summary.scalar("learning_rate", lr)
    tf.summary.scalar("pi_loss", p_loss)
    tf.summary.scalar("q_loss", q_loss)
    tf.summary.scalar("v_loss", v_loss)
    tf.summary.scalar("ent_loss", ent_loss)
    tf.summary.scalar("all_loss", loss)

    return (num_frames_and_train, global_step_and_train, init_target_op,
            priority, beta)
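# Sketch of the target-network helper `assignOp` (an assumption about its
# behavior, including that variables in paired scopes line up by creation
# order): for each src_scope -> dst_scope pair, move the destination
# variables toward the source ones with rate `beta` (1.0 is a hard copy,
# 1.0 / target_update a soft/Polyak update).
def assignOp(beta, scope_pairs):
    ops = []
    for src_scope, dst_scope in scope_pairs.items():
        src_vars = tf.global_variables(src_scope)
        dst_vars = tf.global_variables(dst_scope)
        for src, dst in zip(src_vars, dst_vars):
            ops.append(tf.assign(dst, beta * src + (1.0 - beta) * dst))
    return tf.group(*ops)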
def build_learner(pre, post, act_space, num_frames, samples_from_replayBuffer,
                  buffer_size, capacity):
    global_step = tf.train.get_or_create_global_step()
    init_lr = FLAGS.init_lr
    decay = FLAGS.lr_decay
    warmup_steps = FLAGS.warmup_steps
    gamma = FLAGS.gamma
    use_double = FLAGS.use_double

    global_step_float = tf.cast(global_step, tf.float32)

    # Linear warmup followed by a blend of constant and polynomial decay.
    lr = tf.train.polynomial_decay(
        init_lr, global_step,
        FLAGS.total_environment_frames * FLAGS.num_replay //
        (FLAGS.batch_size * FLAGS.seqlen),
        init_lr / 10.)
    is_warmup = tf.cast(global_step_float < warmup_steps, tf.float32)
    lr = is_warmup * global_step_float / warmup_steps * init_lr + (
        1.0 - is_warmup) * (init_lr * (1.0 - decay) + lr * decay)
    ent_coef = tf.train.polynomial_decay(
        FLAGS.ent_coef, global_step,
        FLAGS.total_environment_frames * FLAGS.num_replay //
        (FLAGS.batch_size * FLAGS.seqlen),
        FLAGS.ent_coef / 10.)

    if FLAGS.opt == "adam":
        optimizer = tf.train.AdamOptimizer(lr)
    else:
        optimizer = tf.train.RMSPropOptimizer(lr, epsilon=0.01)

    if FLAGS.zero_init:
        pre["state_in"] = tf.zeros_like(pre["state_in"])

    lstm = tf.compat.v1.keras.layers.LSTM(
        256, return_sequences=True, return_state=True, name="lstm")
    pre_model = Model(act_space, lstm, gamma, use_double, "agent", **pre)

    # Burn-in: stop gradients into the training segment's initial state.
    post["state_in"] = tf.stop_gradient(pre_model.state_out)
    post_model = Model(act_space, lstm, gamma, use_double, "agent", **post)

    # Use the on-policy advantage while the buffer is filling, then switch to
    # the V-trace advantage.
    if FLAGS.only_vtrace:
        advantage = post_model.vtrace_advantage
    else:
        advantage = tf.cond(buffer_size < capacity,
                            lambda: post_model.advantage,
                            lambda: post_model.vtrace_advantage)

    adv_mean = tf.reduce_mean(advantage)
    tf.summary.scalar("adv_mean", adv_mean)
    advantage = advantage - adv_mean
    adv_std = tf.math.sqrt(tf.reduce_mean(advantage**2))
    tf.summary.scalar("adv_std", adv_std)
    if FLAGS.normalize_advantage:
        advantage = advantage / tf.maximum(adv_std, 1e-12)

    ppo_loss = dPPOcC(post_model.a, post_model.act_logits,
                      post_model.old_act_logits, advantage, FLAGS.ppo_clip,
                      post_model.vf, post_model.v_tar, FLAGS.vf_clip,
                      post_model.v_cur)
    vtrace_loss = dPPOcC(post_model.a, post_model.act_logits,
                         post_model.old_act_logits, advantage, FLAGS.ppo_clip,
                         post_model.vf, post_model.vtrace_vf, FLAGS.vf_clip,
                         post_model.v_cur)

    if FLAGS.only_vtrace:
        p_loss = tf.reduce_mean(
            (vtrace_loss.p_loss *
             post_model.slots)[:samples_from_replayBuffer])
    else:
        p_loss = tf.cond(
            buffer_size < capacity,
            lambda: tf.reduce_mean(
                (ppo_loss.p_loss *
                 post_model.slots)[:samples_from_replayBuffer]),
            lambda: tf.reduce_mean(
                (vtrace_loss.p_loss *
                 post_model.slots)[:samples_from_replayBuffer]))

    if FLAGS.only_vtrace:
        v_loss = tf.reduce_mean(
            (vtrace_loss.v_loss *
             post_model.slots)[:samples_from_replayBuffer])
    else:
        v_loss = tf.cond(
            buffer_size < capacity,
            lambda: tf.reduce_mean(
                (ppo_loss.v_loss *
                 post_model.slots)[:samples_from_replayBuffer]),
            lambda: tf.reduce_mean(
                (vtrace_loss.v_loss *
                 post_model.slots)[:samples_from_replayBuffer]))

    ent_loss = tf.reduce_mean(
        (entropy(post_model.act_logits) *
         post_model.slots)[:samples_from_replayBuffer])

    # n-step Q target, optionally value-rescaled.
    if FLAGS.rescale:
        target = rescaleTarget(post_model.n_step_r, gamma**FLAGS.n_step,
                               post_model.qa1)
    else:
        target = post_model.n_step_r + gamma**FLAGS.n_step * post_model.qa1

    q_loss = tf.reduce_mean(
        mse(post_model.qa, tf.stop_gradient(target)) * post_model.slots)

    # Full actor-critic objective while the buffer is filling; Q loss only
    # once it is full.
    loss = tf.cond(
        buffer_size < capacity,
        lambda: (FLAGS.qf_coef * q_loss + FLAGS.vf_coef * v_loss +
                 FLAGS.pi_coef * p_loss - ent_coef * ent_loss),
        lambda: FLAGS.qf_coef * q_loss)

    train_op = miniOp(optimizer, loss, FLAGS.grad_clip)

    # Replay priority: exponentiated TD error, mixed between max and mean
    # over each sequence, then cast to a negated, scaled integer key.
    exp_td = post_model.slots * tf.math.pow(
        tf.abs(post_model.qa -
               (post_model.n_step_r +
                gamma**FLAGS.n_step * post_model.qa1)),
        0.9)
    avg_p = tf.reduce_sum(exp_td, axis=-1) / tf.reduce_sum(post_model.slots,
                                                           axis=-1)
    max_p = tf.reduce_max(exp_td, axis=-1)
    priority = 0.9 * max_p + 0.1 * avg_p
    priority = tf.cast(-10000 * priority, tf.int64)

    dependency = [train_op]
    if use_double:
        init_target_op = assignOp(1.0, {"current": "target"})
        if FLAGS.smooth_update:
            assign_op = assignOp(1.0 / FLAGS.target_update,
                                 {"current": "target"})
            dependency += [assign_op]
    else:
        init_target_op = []

    new_frames = tf.reduce_sum(post["slots"])

    with tf.control_dependencies(dependency):
        global_step_and_train = tf.assign_add(global_step, 1)
        num_frames_and_train = tf.assign_add(num_frames, new_frames)

    tf.summary.scalar("learning_rate", lr)
    tf.summary.scalar("all_loss", loss)
    tf.summary.scalar("p_loss", p_loss)
    tf.summary.scalar("q_loss", q_loss)
    tf.summary.scalar("v_loss", v_loss)
    tf.summary.scalar("ent_loss", ent_loss)
    tf.summary.scalar("ent_coef", ent_coef)

    return (num_frames_and_train, global_step_and_train, init_target_op,
            priority)
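# Sketch of the diagonal-Gaussian KL divergence used by the hierarchical-RNN
# losses above (an assumption; the sigmas are taken to be standard
# deviations): KL(N(q_mus, q_sigmas) || N(p_mus, p_sigmas)), summed over the
# latent axis.
def KL_from_gaussians(q_mus, q_sigmas, p_mus, p_sigmas):
    kl = (tf.log(p_sigmas / q_sigmas) +
          (tf.square(q_sigmas) + tf.square(q_mus - p_mus)) /
          (2.0 * tf.square(p_sigmas)) - 0.5)
    return tf.reduce_sum(kl, axis=-1)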
def build_learner(pre, post, act_space, num_frames):
    global_step = tf.train.get_or_create_global_step()
    init_lr = FLAGS.init_lr
    decay = FLAGS.lr_decay
    warmup_steps = FLAGS.warmup_steps
    gamma = FLAGS.gamma
    use_hrnn = FLAGS.use_hrnn
    use_reward_prediction = FLAGS.use_reward_prediction
    after_rnn = FLAGS.after_rnn
    use_pixel_control = FLAGS.use_pixel_control
    use_pixel_reconstruction = FLAGS.use_pixel_reconstruction
    pq_kl_coef = FLAGS.pq_kl_coef
    p_kl_coef = FLAGS.p_kl_coef

    global_step_float = tf.cast(global_step, tf.float32)

    # Linear warmup followed by a blend of constant and polynomial decay.
    lr = tf.train.polynomial_decay(
        init_lr, global_step,
        FLAGS.total_environment_frames // (FLAGS.batch_size * FLAGS.seqlen),
        init_lr / 10.)
    is_warmup = tf.cast(global_step_float < warmup_steps, tf.float32)
    lr = is_warmup * global_step_float / warmup_steps * init_lr + (
        1.0 - is_warmup) * (init_lr * (1.0 - decay) + lr * decay)
    optimizer = tf.train.AdamOptimizer(lr)

    # Entropy bonus annealed over training.
    ent_coef = tf.train.polynomial_decay(
        FLAGS.ent_coef, global_step,
        FLAGS.total_environment_frames // (FLAGS.batch_size * FLAGS.seqlen),
        FLAGS.ent_coef / 10.)

    if FLAGS.zero_init:
        pre["state_in"] = tf.zeros_like(pre["state_in"])

    if use_hrnn:
        rnn = TmpHierRNN(4, 64, 4, 2, 8, 'lstm', 'rmc',
                         return_sequences=True, return_state=True,
                         name="hrnn")
    else:
        rnn = tf.compat.v1.keras.layers.LSTM(
            256, return_sequences=True, return_state=True, name="lstm")

    pre_model = Model(act_space, gamma, rnn, use_hrnn, use_reward_prediction,
                      after_rnn, use_pixel_control, use_pixel_reconstruction,
                      False, **pre)

    # Burn-in: stop gradients into the training segment's initial state.
    post["state_in"] = tf.stop_gradient(pre_model.state_out)
    post_model = Model(act_space, gamma, rnn, use_hrnn, use_reward_prediction,
                       after_rnn, use_pixel_control, use_pixel_reconstruction,
                       True, **post)

    tf.summary.scalar("adv_mean", post_model.adv_mean)
    tf.summary.scalar("adv_std", post_model.adv_std)

    losses = dPPOcC(act=post_model.a,
                    policy_logits=post_model.current_act_logits,
                    behavior_logits=post_model.behavior_logits,
                    advantage=post_model.adv,
                    policy_clip=FLAGS.ppo_clip,
                    vf=post_model.current_value,
                    vf_target=post_model.vs,
                    value_clip=FLAGS.vf_clip,
                    old_vf=post_model.old_vf)

    entropy_loss = tf.reduce_mean(
        entropy(post_model.current_act_logits) * post_model.mask[:, :-1])

    p_loss = tf.reduce_mean(losses.p_loss * post_model.mask[:, :-1])
    v_loss = tf.reduce_mean(losses.v_loss * post_model.mask[:, :-1])

    add_loss = 0.0
    if use_hrnn:
        pq_kl_loss = KL_from_gaussians(post_model.q_mus, post_model.q_sigmas,
                                       post_model.p_mus, post_model.p_sigmas)
        pq_kl_loss = tf.reduce_mean(pq_kl_loss * post_model.mask)

        p_kl_loss = KL_from_gaussians(
            post_model.p_mus, post_model.p_sigmas,
            tf.zeros_like(post_model.p_mus),
            0.01 * tf.ones_like(post_model.p_sigmas))
        p_kl_loss = tf.reduce_mean(p_kl_loss * post_model.mask)

        with tf.name_scope("hierarchy_loss"):
            tf.summary.scalar("kl_div_pq", pq_kl_loss)
            tf.summary.scalar("kl_div_prior", p_kl_loss)
        add_loss += pq_kl_coef * pq_kl_loss
        add_loss += p_kl_coef * p_kl_loss

    if use_reward_prediction:
        r_loss = tf.reduce_mean(
            mse(post_model.reward_prediction, post_model.r) *
            post_model.mask[:, :-1])
        tf.summary.scalar("r_loss", r_loss)
        add_loss += r_loss

    if use_pixel_control:
        # Auxiliary pixel-control task on 4x4-pooled frame differences.
        s = tf.cast(post_model.s, tf.float32) / 255.0
        target = s[:, 1:, :, :, :] - s[:, :-1, :, :, :]
        shape = get_shape(target)
        target = tf.reshape(
            target,
            (shape[0], shape[1], 4, shape[2] // 4, 4, shape[3] // 4,
             shape[4]))
        target = tf.reduce_mean(target, axis=(2, 4))
        pixel_loss = tf.reduce_mean(
            mse(post_model.pixel_control, target) *
            post_model.mask[:, :-1, None, None, None])
        with tf.name_scope("control_loss"):
            tf.summary.scalar("pixel_control_loss", pixel_loss)
        add_loss += pixel_loss

    if use_pixel_reconstruction:
        s = tf.cast(post_model.s, tf.float32) / 255.0
        rec_loss = tf.reduce_mean(
            mse(post_model.pixel_reconstruction, s[:, :-1, :, :, :]) *
            post_model.mask[:, :-1, None, None, None])
        tf.summary.scalar("rec_loss", rec_loss)
        add_loss += rec_loss

    loss = (FLAGS.pi_coef * p_loss + FLAGS.vf_coef * v_loss -
            ent_coef * entropy_loss + add_loss)

    train_op = miniOp(optimizer, loss, FLAGS.grad_clip)

    new_frames = tf.reduce_sum(post["mask"])

    with tf.control_dependencies([train_op]):
        num_frames_and_train = tf.assign_add(num_frames, new_frames)
        global_step_and_train = tf.assign_add(global_step, 1)

    tf.summary.scalar("learning_rate", lr)
    tf.summary.scalar("ent_coef", ent_coef)
    tf.summary.scalar("ent_loss", entropy_loss)
    tf.summary.scalar("p_loss", p_loss)
    tf.summary.scalar("v_loss", v_loss)
    tf.summary.scalar("all_loss", loss)

    return num_frames_and_train, global_step_and_train
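# Sketches of two small utilities assumed throughout (assumptions about the
# repo's helpers, not their actual definitions): `get_shape` mixes static and
# dynamic dimensions so the pixel-control reshapes work with an unknown batch
# size, and `entropy` is the categorical policy entropy computed from logits.
def get_shape(tensor):
    static = tensor.shape.as_list()
    dynamic = tf.shape(tensor)
    return [dynamic[i] if dim is None else dim for i, dim in enumerate(static)]


def entropy(logits):
    probs = tf.nn.softmax(logits, axis=-1)
    log_probs = tf.nn.log_softmax(logits, axis=-1)
    return -tf.reduce_sum(probs * log_probs, axis=-1)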