def even_input(off, size):
    """Swap every adjacent pair of columns of `off`, starting at column 0.

    Columns (0, 1), (2, 3), ... trade places. When `size` is odd the final
    column has no partner and is left where it is.

    Args:
        off: values to permute; reshaped to `[-1, size]` on the way out, so
            axis 1 is expected to hold `size` entries.
        size: number of entries along axis 1 of `off`.

    Returns:
        The permuted values with shape `[-1, size]`.
    """

    def swap_pairs(values, width):
        # Group the columns two by two, flip each group, then flatten back.
        grouped = array_ops.reshape(values, [-1, width // 2, 2])
        flipped = array_ops.reverse(grouped, [2])
        return array_ops.reshape(flipped, [-1, width])

    def swap_pairs_keep_last(values, width):
        # Set the trailing column aside so the remainder has an even width.
        leading, trailing = array_ops.split(values, [width - 1, 1], 1)
        width -= 1
        swapped = swap_pairs(leading, width)
        return array_ops.concat([swapped, trailing], 1)

    width_is_even = gen_math_ops.equal(gen_math_ops.mod(size, 2), 0)
    return control_flow_ops.cond(
        width_is_even,
        lambda: swap_pairs(off, size),
        lambda: swap_pairs_keep_last(off, size))
def layer_tunable(x, i):
    """Apply layer `i`: a diagonal term plus a pair-swapped off-diagonal term.

    The layer reads its two weight vectors from `diag_vec_list` and
    `off_vec_list` (both come from the enclosing scope; presumably
    TensorArray-like objects -- confirm at the definition site), scales `x`
    element-wise by each, swaps adjacent columns of the off-diagonal product
    and adds the two results.

    The pairing alternates with the layer index: for even `i` the pairs are
    (0, 1), (2, 3), ...; for odd `i` the first column is held back and the
    pairs are (1, 2), (3, 4), .... A column left without a partner stays in
    place.

    Args:
        x: layer input; axis 1 must have a statically known length.
        i: index of the layer to apply.

    Returns:
        A tuple `(layer_output, i + 1)`.
    """
    diag_weights = diag_vec_list.read(i)
    off_weights = off_vec_list.read(i)
    diag = math_ops.multiply(x, diag_weights)
    off = math_ops.multiply(x, off_weights)

    def even_input(values, width):
        # Swap pairs starting at column 0.
        def swap_pairs(block, block_width):
            grouped = array_ops.reshape(block, [-1, block_width // 2, 2])
            flipped = array_ops.reverse(grouped, [2])
            return array_ops.reshape(flipped, [-1, block_width])

        def swap_pairs_keep_last(block, block_width):
            # The trailing column has no partner; re-attach it untouched.
            leading, trailing = array_ops.split(
                block, [block_width - 1, 1], 1)
            block_width -= 1
            swapped = swap_pairs(leading, block_width)
            return array_ops.concat([swapped, trailing], 1)

        width_is_even = gen_math_ops.equal(gen_math_ops.mod(width, 2), 0)
        return control_flow_ops.cond(
            width_is_even,
            lambda: swap_pairs(values, width),
            lambda: swap_pairs_keep_last(values, width))

    def odd_input(values, width):
        # Hold back the first column, then swap pairs in the remainder.
        first, remainder = array_ops.split(values, [1, width - 1], 1)
        width -= 1
        remainder = even_input(remainder, width)
        return array_ops.concat([first, remainder], 1)

    # Static (Python int) width of axis 1.
    size = int(off.get_shape()[1])
    layer_is_even = gen_math_ops.equal(gen_math_ops.mod(i, 2), 0)
    permuted_off = control_flow_ops.cond(
        layer_is_even,
        lambda: even_input(off, size),
        lambda: odd_input(off, size))

    layer_output = diag + permuted_off
    i += 1
    return layer_output, i
def test_ppo_ops_gae(self):
    """Build a clipped-ratio PPO graph with GAE on CartPole-v0 and train it.

    The test seeds numpy, TensorFlow and the gym environment with 42, builds
    a policy/value network under the 'logits' scope and a second policy
    network under 'old_logits', then alternates between collecting rollouts
    and fitting on batches drawn from the replay dataset. It makes no
    assertions: losses and the average evaluation reward are only printed.

    NOTE(review): the extent of the `with` blocks and of the loops below was
    reconstructed from a whitespace-collapsed source -- confirm the nesting
    against version control.
    """
    hparams = PPOTest.hparams

    ops.reset_default_graph()
    np.random.seed(42)
    random_seed.set_random_seed(42)
    env = gym.make('CartPole-v0')
    env.seed(42)

    # Setup the policy and model
    global_step = training_util.get_or_create_global_step()
    deterministic_ph = array_ops.placeholder(
        dtypes.bool, [], name='deterministic')
    exploration_op = learning_rate_decay.exponential_decay(
        hparams.initial_exploration,
        global_step,
        hparams.exploration_decay_steps,
        hparams.exploration_decay_rate)

    state_distribution, state_ph = gym_ops.distribution_from_gym_space(
        env.observation_space, name='state_space')

    # values
    with variable_scope.variable_scope('logits'):
        body_op = mlp(state_ph, hparams.hidden_layers)
        action_distribution, action_value_op = (
            gym_ops.distribution_from_gym_space(
                env.action_space, logits=[body_op], name='action_space'))
        sampled_action = sampling_ops.epsilon_greedy(
            action_distribution, exploration_op, deterministic_ph)
        action_op = array_ops.squeeze(sampled_action)
        # Value head: one ReLU layer on top of the shared body, then a
        # single linear unit.
        value_hidden = core.dense(
            body_op,
            units=hparams.value_units,
            activation=nn_ops.relu,
            use_bias=False)
        value_op = array_ops.squeeze(
            core.dense(value_hidden, units=1, use_bias=False), -1)
    policy_variables = variables.trainable_variables(scope='logits')

    # target
    with variable_scope.variable_scope('old_logits'):
        old_body_op = mlp(state_ph, hparams.hidden_layers)
        old_action_distribution, _old_action_value_op = (
            gym_ops.distribution_from_gym_space(
                env.action_space, logits=[old_body_op], name='action_space'))
    assign_policy_op = shortcuts.assign_scope('logits', 'old_logits')

    # Setup the dataset
    stream = streams.Uniform.from_distributions(
        state_distribution, action_distribution, with_values=True)
    replay_dataset = dataset.ReplayDataset(
        stream, max_sequence_length=hparams.max_sequence_length)
    replay_dataset = replay_dataset.batch(hparams.batch_size)
    replay_op = replay_dataset.make_one_shot_iterator().get_next()

    # Placeholders share two leading unknown dimensions.
    action_ph = array_ops.placeholder(
        stream.action_dtype, [None, None] + stream.action_shape,
        name='action')
    value_ph = array_ops.placeholder(
        stream.reward_dtype, [None, None] + stream.reward_shape,
        name='value')
    reward_ph = array_ops.placeholder(
        stream.reward_dtype, [None, None] + stream.reward_shape,
        name='reward')
    terminal_ph = array_ops.placeholder(
        dtypes.bool, [None, None], name='terminal')
    sequence_length_ph = array_ops.placeholder(
        dtypes.int32, [None, 1], name='sequence_length')
    sequence_length = array_ops.squeeze(sequence_length_ph, -1)

    # Setup the loss/optimization procedure
    # Weight is 0 on terminal steps and 1 everywhere else.
    non_terminal = 1 - math_ops.cast(terminal_ph, reward_ph.dtype)
    advantage_op, return_op = ppo_ops.generalized_advantage_estimate(
        reward_ph,
        value_ph,
        sequence_length,
        max_sequence_length=hparams.max_sequence_length,
        weights=non_terminal,
        discount=hparams.discount,
        lambda_td=hparams.lambda_td)

    # actor loss
    logits_prob = action_distribution.log_prob(action_ph)
    old_logits_prob = old_action_distribution.log_prob(action_ph)
    ratio = math_ops.exp(logits_prob - old_logits_prob)
    clipped_ratio = clip_ops.clip_by_value(
        ratio, 1. - hparams.epsilon, 1. + hparams.epsilon)
    unclipped_objective = ratio * advantage_op
    clipped_objective = clipped_ratio * advantage_op
    actor_loss_op = -math_ops.minimum(unclipped_objective, clipped_objective)

    squared_value_error = math_ops.square(value_op - return_op)
    critic_loss_op = squared_value_error * hparams.value_coeff

    negative_entropy = -action_distribution.entropy(name='entropy')
    entropy_loss_op = negative_entropy * hparams.entropy_coeff

    loss_op = actor_loss_op + critic_loss_op + entropy_loss_op

    # total loss
    # Sum over the last axis, divide by each sequence's length, then average.
    summed_loss = math_ops.reduce_sum(loss_op, axis=-1)
    lengths = math_ops.cast(sequence_length, loss_op.dtype)
    loss_op = math_ops.reduce_mean(summed_loss / lengths)

    adam_optimizer = adam.AdamOptimizer(learning_rate=hparams.learning_rate)
    # NOTE(review): `global_step` is not passed to `minimize` here, so this
    # call does not advance it -- confirm whether something else does.
    minimize_op = adam_optimizer.minimize(loss_op, var_list=policy_variables)

    # NOTE(review): this evaluates `assign_policy_steps % (global_step + 1)`;
    # presumably `(global_step + 1) % assign_policy_steps` was intended --
    # confirm before changing.
    assign_steps = ops.convert_to_tensor(
        hparams.assign_policy_steps, dtype=dtypes.int64)
    next_step = global_step + 1
    should_assign = gen_math_ops.equal(
        gen_math_ops.mod(assign_steps, next_step), 0)
    train_op = control_flow_ops.cond(
        should_assign,
        lambda: control_flow_ops.group(minimize_op, assign_policy_op),
        lambda: minimize_op)

    with self.test_session() as sess:
        sess.run(variables.global_variables_initializer())
        sess.run(assign_policy_op)

        for _iteration in range(hparams.num_iterations):
            # Collect fresh experience into the stream.
            rewards = gym_test_utils.rollout_with_values_on_gym_env(
                sess, env, state_ph, deterministic_ph,
                action_value_op, action_op, value_op,
                num_episodes=hparams.num_episodes,
                stream=stream)

            # Train on batches until the replay dataset reports it is empty.
            while True:
                try:
                    replay = sess.run(replay_op)
                except (errors_impl.InvalidArgumentError,
                        errors_impl.OutOfRangeError):
                    break

                feeds = {
                    state_ph: replay.state,
                    action_ph: replay.action,
                    value_ph: replay.value,
                    reward_ph: replay.reward,
                    terminal_ph: replay.terminal,
                    sequence_length_ph: replay.sequence_length,
                }
                _, loss = sess.run((train_op, loss_op), feed_dict=feeds)
                print(loss)

            # Deterministic evaluation that is not written to the replay.
            rewards = gym_test_utils.rollout_on_gym_env(
                sess, env, state_ph, deterministic_ph,
                action_value_op, action_op,
                num_episodes=hparams.num_episodes,
                deterministic=True, save_replay=False)
            print('average_rewards = {}'.format(
                rewards / hparams.num_episodes))
def test_q_ops_quantile_dqn(self):
    """Build a quantile-regression DQN graph on CartPole-v0 and train it.

    The test seeds numpy, TensorFlow and the gym environment with 42, builds
    an online network under the 'logits' scope and a target network under
    'target_logits', and alternates between collecting rollouts and fitting
    on batches drawn from the replay dataset. The loss is a Huber loss on
    the quantile errors, weighted by `|tau - 1[u < 0]|`. It makes no
    assertions: the average evaluation reward is only printed.

    Fix over the previous version: the target network used to be created
    under the scope 'targets' while the sync op copied 'logits' into
    'target_logits'. Both now use the same scope name.

    NOTE(review): the extent of the `with` blocks and of the loops below was
    reconstructed from a whitespace-collapsed source -- confirm the nesting
    against version control.
    """
    hparams = QTest.hparams
    # One name for both building the target network and syncing into it.
    target_scope = 'target_logits'

    env = gym.make('CartPole-v0')
    ops.reset_default_graph()
    np.random.seed(42)
    random_seed.set_random_seed(42)
    env.seed(42)

    # Setup the policy and model
    global_step = training_util.get_or_create_global_step()
    deterministic_ph = array_ops.placeholder(
        dtypes.bool, [], name='deterministic')
    # Not referenced again in this test; kept so the graph that gets built
    # is unchanged.
    exploration_op = learning_rate_decay.exponential_decay(
        hparams.initial_exploration,
        global_step,
        hparams.exploration_decay_steps,
        hparams.exploration_decay_rate)

    state_distribution, state_ph = gym_ops.distribution_from_gym_space(
        env.observation_space, name='state_space')
    action_distribution, _ = gym_ops.distribution_from_gym_space(
        env.action_space, name='action_space')

    # Setup the dataset
    stream = streams.Uniform.from_distributions(
        state_distribution, action_distribution)

    def build_quantile_head(inputs):
        # MLP body, then a bias-free linear layer reshaped so the last axis
        # indexes quantiles. Returns (quantiles, mean over quantiles).
        hidden = mlp(inputs, hparams.hidden_layers)
        flat = core.dense(
            hidden,
            stream.action_value_shape[-1] * hparams.num_quantiles,
            use_bias=False)
        flat_shape = array_ops.shape(flat)
        quantile_shape = [
            flat_shape[0],
            flat_shape[1],
            stream.action_value_shape[-1],
            hparams.num_quantiles]
        quantiles = gen_array_ops.reshape(flat, quantile_shape)
        return quantiles, math_ops.reduce_mean(quantiles, axis=-1)

    with variable_scope.variable_scope('logits'):
        action_value_op, mean_action_value_op = build_quantile_head(state_ph)
        # Greedy action with respect to the mean over quantiles.
        action_op = math_ops.argmax(mean_action_value_op, axis=-1)
        action_op = array_ops.squeeze(action_op)
    policy_variables = variables.trainable_variables(scope='logits')

    next_state_ph = shortcuts.placeholder_like(
        state_ph, name='next_state_space')
    with variable_scope.variable_scope(target_scope):
        (target_next_action_value_op,
         mean_target_next_action_value_op) = build_quantile_head(
             next_state_ph)
    assign_target_op = shortcuts.assign_scope('logits', target_scope)

    replay_dataset = dataset.ReplayDataset(
        stream, max_sequence_length=hparams.max_sequence_length)
    replay_dataset = replay_dataset.batch(hparams.batch_size)
    replay_op = replay_dataset.make_one_shot_iterator().get_next()

    action_ph = array_ops.placeholder(
        stream.action_dtype, [None, None] + stream.action_shape,
        name='action')
    reward_ph = array_ops.placeholder(
        stream.reward_dtype, [None, None] + stream.reward_shape,
        name='reward')
    terminal_ph = array_ops.placeholder(
        dtypes.bool, [None, None], name='terminal')
    sequence_length_ph = array_ops.placeholder(
        dtypes.int32, [None, 1], name='sequence_length')
    sequence_length = array_ops.squeeze(sequence_length_ph, -1)

    q_value_op, expected_q_value_op = q_ops.expected_q_value(
        array_ops.expand_dims(reward_ph, -1),
        action_ph,
        action_value_op,
        (target_next_action_value_op, mean_target_next_action_value_op),
        # Weight is 0 on terminal steps and 1 everywhere else.
        weights=array_ops.expand_dims(
            1 - math_ops.cast(terminal_ph, reward_ph.dtype), -1),
        discount=hparams.discount)

    u = expected_q_value_op - q_value_op
    loss_op = losses_impl.huber_loss(u, delta=hparams.huber_loss_delta)

    # Quantile midpoints: tau_k = (2k + 1) / (2 * num_quantiles).
    tau_op = (2. * math_ops.range(
        0, hparams.num_quantiles, dtype=u.dtype) + 1) / (
            2. * hparams.num_quantiles)
    loss_op *= math_ops.abs(tau_op - math_ops.cast(u < 0, tau_op.dtype))
    loss_op = math_ops.reduce_mean(loss_op, axis=-1)
    # Sum over the last axis, divide by each sequence's length, then average.
    loss_op = math_ops.reduce_mean(
        math_ops.reduce_sum(loss_op, axis=-1) / math_ops.cast(
            sequence_length, loss_op.dtype))

    optimizer = adam.AdamOptimizer(learning_rate=hparams.learning_rate)
    # NOTE(review): `global_step` is not passed to `minimize` here, so this
    # call does not advance it -- confirm whether something else does.
    train_op = optimizer.minimize(loss_op, var_list=policy_variables)
    # NOTE(review): this evaluates `assign_target_steps % (global_step + 1)`;
    # presumably `(global_step + 1) % assign_target_steps` was intended --
    # confirm before changing.
    train_op = control_flow_ops.cond(
        gen_math_ops.equal(
            gen_math_ops.mod(
                ops.convert_to_tensor(
                    hparams.assign_target_steps, dtype=dtypes.int64),
                (global_step + 1)), 0),
        lambda: control_flow_ops.group(*[train_op, assign_target_op]),
        lambda: train_op)

    with self.test_session() as sess:
        sess.run(variables.global_variables_initializer())
        sess.run(assign_target_op)

        for iteration in range(hparams.num_iterations):
            # Collect fresh experience into the stream.
            rewards = gym_test_utils.rollout_on_gym_env(
                sess, env, state_ph, deterministic_ph,
                mean_action_value_op, action_op,
                num_episodes=hparams.num_episodes,
                stream=stream)

            # Train on batches until the replay dataset reports it is empty.
            while True:
                try:
                    replay = sess.run(replay_op)
                except (errors_impl.InvalidArgumentError,
                        errors_impl.OutOfRangeError):
                    break

                loss, _ = sess.run(
                    (loss_op, train_op),
                    feed_dict={
                        state_ph: replay.state,
                        next_state_ph: replay.next_state,
                        action_ph: replay.action,
                        reward_ph: replay.reward,
                        terminal_ph: replay.terminal,
                        sequence_length_ph: replay.sequence_length,
                    })

            # Deterministic evaluation that is not written to the replay.
            rewards = gym_test_utils.rollout_on_gym_env(
                sess, env, state_ph, deterministic_ph,
                mean_action_value_op, action_op,
                num_episodes=hparams.num_episodes,
                deterministic=True, save_replay=False)
            print('average_rewards = {}'.format(
                rewards / hparams.num_episodes))