Example 1
        def even_input(off, size):

            def even_s(off, size):
                off = array_ops.reshape(off, [-1, size//2, 2])
                off = array_ops.reshape(array_ops.reverse(off, [2]), [-1, size])
                return off

            def odd_s(off, size):
                off, helper = array_ops.split(off, [size-1, 1], 1)
                size -= 1
                off = even_s(off, size)
                off = array_ops.concat([off, helper], 1)
                return off

            off = control_flow_ops.cond(
                gen_math_ops.equal(gen_math_ops.mod(size, 2), 0),
                lambda: even_s(off, size), lambda: odd_s(off, size))
            return off
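
The reshape/reverse trick in even_s swaps each adjacent pair of columns; odd_s keeps the last column fixed and swaps pairs among the rest. Below is a minimal NumPy sketch of the same permutation (illustrative only, not part of the snippet's TensorFlow graph; swap_adjacent_pairs is a hypothetical helper name):

    import numpy as np

    def swap_adjacent_pairs(off):
        size = off.shape[1]
        if size % 2 == 0:
            pairs = off.reshape(-1, size // 2, 2)        # group columns in pairs
            return pairs[:, :, ::-1].reshape(-1, size)   # reverse inside each pair
        # Odd width: the last column stays in place.
        head, tail = off[:, :size - 1], off[:, size - 1:]
        return np.concatenate([swap_adjacent_pairs(head), tail], axis=1)

    x = np.arange(10).reshape(2, 5)
    print(swap_adjacent_pairs(x))
    # [[1 0 3 2 4]
    #  [6 5 8 7 9]]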
Example 2
    def layer_tunable(x, i):

        diag_vec = diag_vec_list.read(i)
        off_vec = off_vec_list.read(i)

        diag = math_ops.multiply(x, diag_vec)
        off = math_ops.multiply(x, off_vec)

        def even_input(off, size):
            def even_s(off, size):
                off = array_ops.reshape(off, [-1, size // 2, 2])
                off = array_ops.reshape(array_ops.reverse(off, [2]),
                                        [-1, size])
                return off

            def odd_s(off, size):
                off, helper = array_ops.split(off, [size - 1, 1], 1)
                size -= 1
                off = even_s(off, size)
                off = array_ops.concat([off, helper], 1)
                return off

            off = control_flow_ops.cond(
                gen_math_ops.equal(gen_math_ops.mod(size, 2), 0),
                lambda: even_s(off, size), lambda: odd_s(off, size))
            return off

        def odd_input(off, size):
            helper, off = array_ops.split(off, [1, size - 1], 1)
            size -= 1
            off = even_input(off, size)
            off = array_ops.concat([helper, off], 1)
            return off

        size = int(off.get_shape()[1])
        off = control_flow_ops.cond(
            gen_math_ops.equal(gen_math_ops.mod(i, 2), 0),
            lambda: even_input(off, size), lambda: odd_input(off, size))

        layer_output = diag + off
        i += 1

        return layer_output, i
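
For reference, each layer computes diag_vec * x plus a permuted copy of off_vec * x, where the permutation swaps adjacent columns and, on odd layer indices, leaves the first column in place. Below is a minimal NumPy sketch of one step for an even layer index; layer_step_even and the concrete weights are illustrative only, not the TensorArrays read above:

    import numpy as np

    def layer_step_even(x, diag_vec, off_vec):
        diag = x * diag_vec                      # element-wise "diagonal" term
        off = x * off_vec                        # element-wise "off-diagonal" term
        size = off.shape[1]
        even = size - size % 2
        pairs = off[:, :even].reshape(-1, even // 2, 2)
        swapped = pairs[:, :, ::-1].reshape(-1, even)
        if size % 2:                             # odd width: last column stays put
            swapped = np.concatenate([swapped, off[:, -1:]], axis=1)
        return diag + swapped

    diag_vec = np.array([1., 2., 3., 4.])
    off_vec = np.array([10., 20., 30., 40.])
    print(layer_step_even(np.ones((1, 4)), diag_vec, off_vec))
    # [[21. 12. 43. 34.]]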
Example 3
    def test_ppo_ops_gae(self):
        ops.reset_default_graph()
        np.random.seed(42)
        random_seed.set_random_seed(42)
        env = gym.make('CartPole-v0')
        env.seed(42)

        # Setup the policy and model
        global_step = training_util.get_or_create_global_step()
        deterministic_ph = array_ops.placeholder(dtypes.bool, [],
                                                 name='deterministic')
        exploration_op = learning_rate_decay.exponential_decay(
            PPOTest.hparams.initial_exploration, global_step,
            PPOTest.hparams.exploration_decay_steps,
            PPOTest.hparams.exploration_decay_rate)

        state_distribution, state_ph = gym_ops.distribution_from_gym_space(
            env.observation_space, name='state_space')

        # values
        with variable_scope.variable_scope('logits'):
            body_op = mlp(state_ph, PPOTest.hparams.hidden_layers)
            action_distribution, action_value_op = gym_ops.distribution_from_gym_space(
                env.action_space, logits=[body_op], name='action_space')
            action_op = array_ops.squeeze(
                sampling_ops.epsilon_greedy(action_distribution,
                                            exploration_op, deterministic_ph))
            body_op = core.dense(body_op,
                                 units=PPOTest.hparams.value_units,
                                 activation=nn_ops.relu,
                                 use_bias=False)
            value_op = array_ops.squeeze(
                core.dense(body_op, units=1, use_bias=False), -1)
        policy_variables = variables.trainable_variables(scope='logits')

        # target
        with variable_scope.variable_scope('old_logits'):
            old_body_op = mlp(state_ph, PPOTest.hparams.hidden_layers)
            old_action_distribution, old_action_value_op = gym_ops.distribution_from_gym_space(
                env.action_space, logits=[old_body_op], name='action_space')
        assign_policy_op = shortcuts.assign_scope('logits', 'old_logits')

        # Setup the dataset
        stream = streams.Uniform.from_distributions(state_distribution,
                                                    action_distribution,
                                                    with_values=True)
        replay_dataset = dataset.ReplayDataset(
            stream, max_sequence_length=PPOTest.hparams.max_sequence_length)
        replay_dataset = replay_dataset.batch(PPOTest.hparams.batch_size)
        replay_op = replay_dataset.make_one_shot_iterator().get_next()

        action_ph = array_ops.placeholder(stream.action_dtype,
                                          [None, None] + stream.action_shape,
                                          name='action')
        value_ph = array_ops.placeholder(stream.reward_dtype,
                                         [None, None] + stream.reward_shape,
                                         name='value')
        reward_ph = array_ops.placeholder(stream.reward_dtype,
                                          [None, None] + stream.reward_shape,
                                          name='reward')
        terminal_ph = array_ops.placeholder(dtypes.bool, [None, None],
                                            name='terminal')
        sequence_length_ph = array_ops.placeholder(dtypes.int32, [None, 1],
                                                   name='sequence_length')
        sequence_length = array_ops.squeeze(sequence_length_ph, -1)

        # Setup the loss/optimization procedure
        advantage_op, return_op = ppo_ops.generalized_advantage_estimate(
            reward_ph,
            value_ph,
            sequence_length,
            max_sequence_length=PPOTest.hparams.max_sequence_length,
            weights=(1 - math_ops.cast(terminal_ph, reward_ph.dtype)),
            discount=PPOTest.hparams.discount,
            lambda_td=PPOTest.hparams.lambda_td)

        # actor loss
        logits_prob = action_distribution.log_prob(action_ph)
        old_logits_prob = old_action_distribution.log_prob(action_ph)
        ratio = math_ops.exp(logits_prob - old_logits_prob)
        clipped_ratio = clip_ops.clip_by_value(ratio,
                                               1. - PPOTest.hparams.epsilon,
                                               1. + PPOTest.hparams.epsilon)
        actor_loss_op = -math_ops.minimum(ratio * advantage_op,
                                          clipped_ratio * advantage_op)
        critic_loss_op = math_ops.square(
            value_op - return_op) * PPOTest.hparams.value_coeff
        entropy_loss_op = -action_distribution.entropy(
            name='entropy') * PPOTest.hparams.entropy_coeff
        loss_op = actor_loss_op + critic_loss_op + entropy_loss_op

        # total loss
        loss_op = math_ops.reduce_mean(
            math_ops.reduce_sum(loss_op, axis=-1) /
            math_ops.cast(sequence_length, loss_op.dtype))

        optimizer = adam.AdamOptimizer(
            learning_rate=PPOTest.hparams.learning_rate)
        train_op = optimizer.minimize(loss_op, var_list=policy_variables)
        train_op = control_flow_ops.cond(
            gen_math_ops.equal(
                gen_math_ops.mod(
                    ops.convert_to_tensor(PPOTest.hparams.assign_policy_steps,
                                          dtype=dtypes.int64),
                    (global_step + 1)), 0),
            lambda: control_flow_ops.group(*[train_op, assign_policy_op]),
            lambda: train_op)

        with self.test_session() as sess:
            sess.run(variables.global_variables_initializer())
            sess.run(assign_policy_op)

            for iteration in range(PPOTest.hparams.num_iterations):
                rewards = gym_test_utils.rollout_with_values_on_gym_env(
                    sess,
                    env,
                    state_ph,
                    deterministic_ph,
                    action_value_op,
                    action_op,
                    value_op,
                    num_episodes=PPOTest.hparams.num_episodes,
                    stream=stream)

                while True:
                    try:
                        replay = sess.run(replay_op)
                    except (errors_impl.InvalidArgumentError,
                            errors_impl.OutOfRangeError):
                        break

                    _, loss = sess.run(
                        (train_op, loss_op),
                        feed_dict={
                            state_ph: replay.state,
                            action_ph: replay.action,
                            value_ph: replay.value,
                            reward_ph: replay.reward,
                            terminal_ph: replay.terminal,
                            sequence_length_ph: replay.sequence_length,
                        })
                    print(loss)

                rewards = gym_test_utils.rollout_on_gym_env(
                    sess,
                    env,
                    state_ph,
                    deterministic_ph,
                    action_value_op,
                    action_op,
                    num_episodes=PPOTest.hparams.num_episodes,
                    deterministic=True,
                    save_replay=False)
                print('average_rewards = {}'.format(
                    rewards / PPOTest.hparams.num_episodes))
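
The test wires ppo_ops.generalized_advantage_estimate into the clipped surrogate loss. Below is a minimal NumPy sketch of the standard GAE recursion for a single finished episode (illustrative only; the op above operates on batched, padded sequences using the weights and sequence_length shown):

    import numpy as np

    def gae(rewards, values, discount=0.99, lambda_td=0.95):
        # Single finished episode, so the bootstrap value after the last step is zero.
        next_values = np.append(values[1:], 0.0)
        deltas = rewards + discount * next_values - values   # one-step TD errors
        advantages = np.zeros_like(rewards)
        running = 0.0
        for t in reversed(range(len(rewards))):
            running = deltas[t] + discount * lambda_td * running
            advantages[t] = running
        returns = advantages + values                        # critic regression targets
        return advantages, returns

    adv, ret = gae(np.array([1., 1., 1.]), np.array([0.5, 0.4, 0.3]))
    print(adv, ret)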
Example 4
  def test_q_ops_quantile_dqn(self):
    env = gym.make('CartPole-v0')
    ops.reset_default_graph()
    np.random.seed(42)
    random_seed.set_random_seed(42)
    env.seed(42)

    # Setup the policy and model
    global_step = training_util.get_or_create_global_step()
    deterministic_ph = array_ops.placeholder(
        dtypes.bool, [], name='deterministic')
    exploration_op = learning_rate_decay.exponential_decay(
        QTest.hparams.initial_exploration,
        global_step,
        QTest.hparams.exploration_decay_steps,
        QTest.hparams.exploration_decay_rate)

    state_distribution, state_ph = gym_ops.distribution_from_gym_space(
        env.observation_space, name='state_space')
    action_distribution, _ = gym_ops.distribution_from_gym_space(
        env.action_space, name='action_space')

    # Setup the dataset
    stream = streams.Uniform.from_distributions(
        state_distribution, action_distribution)

    with variable_scope.variable_scope('logits'):
      action_value_op = mlp(state_ph, QTest.hparams.hidden_layers)
      action_value_op = core.dense(
          action_value_op,
          stream.action_value_shape[-1] * QTest.hparams.num_quantiles,
          use_bias=False)
      action_value_op_shape = array_ops.shape(action_value_op)
      action_value_shape = [
          action_value_op_shape[0],
          action_value_op_shape[1],
          stream.action_value_shape[-1],
          QTest.hparams.num_quantiles]
      action_value_op = gen_array_ops.reshape(action_value_op, action_value_shape)
      mean_action_value_op = math_ops.reduce_mean(action_value_op, axis=-1)
      action_op = math_ops.argmax(mean_action_value_op, axis=-1)
      action_op = array_ops.squeeze(action_op)
    policy_variables = variables.trainable_variables(scope='logits')

    next_state_ph = shortcuts.placeholder_like(state_ph, name='next_state_space')
    with variable_scope.variable_scope('targets'):
      target_next_action_value_op = mlp(next_state_ph, QTest.hparams.hidden_layers)
      target_next_action_value_op = core.dense(
          target_next_action_value_op,
          stream.action_value_shape[-1] * QTest.hparams.num_quantiles,
          use_bias=False)
      target_next_action_value_op_shape = array_ops.shape(target_next_action_value_op)
      target_next_action_value_shape = [
          target_next_action_value_op_shape[0],
          target_next_action_value_op_shape[1],
          stream.action_value_shape[-1],
          QTest.hparams.num_quantiles]
      target_next_action_value_op = gen_array_ops.reshape(
          target_next_action_value_op, target_next_action_value_shape)
      mean_target_next_action_value_op = math_ops.reduce_mean(
          target_next_action_value_op, axis=-1)
    assign_target_op = shortcuts.assign_scope('logits', 'targets')

    replay_dataset = dataset.ReplayDataset(
        stream, max_sequence_length=QTest.hparams.max_sequence_length)
    replay_dataset = replay_dataset.batch(QTest.hparams.batch_size)
    replay_op = replay_dataset.make_one_shot_iterator().get_next()

    action_ph = array_ops.placeholder(
        stream.action_dtype, [None, None] + stream.action_shape, name='action')
    reward_ph = array_ops.placeholder(
        stream.reward_dtype, [None, None] + stream.reward_shape, name='reward')
    terminal_ph = array_ops.placeholder(
        dtypes.bool, [None, None], name='terminal')
    sequence_length_ph = array_ops.placeholder(
        dtypes.int32, [None, 1], name='sequence_length')
    sequence_length = array_ops.squeeze(sequence_length_ph, -1)

    q_value_op, expected_q_value_op = q_ops.expected_q_value(
        array_ops.expand_dims(reward_ph, -1),
        action_ph,
        action_value_op,
        (target_next_action_value_op, mean_target_next_action_value_op),
        weights=array_ops.expand_dims(
            1 - math_ops.cast(terminal_ph, reward_ph.dtype), -1),
        discount=QTest.hparams.discount)

    u = expected_q_value_op - q_value_op
    loss_op = losses_impl.huber_loss(u, delta=QTest.hparams.huber_loss_delta)

    tau_op = (2. * math_ops.range(
        0, QTest.hparams.num_quantiles, dtype=u.dtype) + 1) / (
            2. * QTest.hparams.num_quantiles)

    loss_op *= math_ops.abs(tau_op - math_ops.cast(u < 0, tau_op.dtype))
    loss_op = math_ops.reduce_mean(loss_op, axis=-1)

    loss_op = math_ops.reduce_mean(
        math_ops.reduce_sum(loss_op, axis=-1) / math_ops.cast(
            sequence_length, loss_op.dtype))
    optimizer = adam.AdamOptimizer(
        learning_rate=QTest.hparams.learning_rate)
    train_op = optimizer.minimize(loss_op, var_list=policy_variables)
    train_op = control_flow_ops.cond(
        gen_math_ops.equal(
            gen_math_ops.mod(
                ops.convert_to_tensor(
                    QTest.hparams.assign_target_steps, dtype=dtypes.int64),
                (global_step + 1)), 0),
        lambda: control_flow_ops.group(*[train_op, assign_target_op]),
        lambda: train_op)

    with self.test_session() as sess:
      sess.run(variables.global_variables_initializer())
      sess.run(assign_target_op)

      for iteration in range(QTest.hparams.num_iterations):
        rewards = gym_test_utils.rollout_on_gym_env(
            sess, env, state_ph, deterministic_ph,
            mean_action_value_op, action_op,
            num_episodes=QTest.hparams.num_episodes,
            stream=stream)

        while True:
          try:
            replay = sess.run(replay_op)
          except (errors_impl.InvalidArgumentError, errors_impl.OutOfRangeError):
            break
          loss, _ = sess.run(
              (loss_op, train_op),
              feed_dict={
                state_ph: replay.state,
                next_state_ph: replay.next_state,
                action_ph: replay.action,
                reward_ph: replay.reward,
                terminal_ph: replay.terminal,
                sequence_length_ph: replay.sequence_length,
              })

        rewards = gym_test_utils.rollout_on_gym_env(
            sess, env, state_ph, deterministic_ph,
            mean_action_value_op, action_op,
            num_episodes=QTest.hparams.num_episodes,
            deterministic=True, save_replay=False)
        print('average_rewards = {}'.format(rewards / QTest.hparams.num_episodes))
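
The loss assembled above is a quantile-regression Huber loss: the TD errors u are Huber-smoothed and then weighted asymmetrically by |tau - 1{u < 0}|, with tau taken at the quantile midpoints (2i + 1) / (2N). Below is a minimal NumPy sketch of that weighting (illustrative only, not q_ops or losses_impl):

    import numpy as np

    def quantile_huber_loss(u, num_quantiles, delta=1.0):
        # Huber-smooth the per-quantile TD errors.
        huber = np.where(np.abs(u) <= delta,
                         0.5 * u ** 2,
                         delta * (np.abs(u) - 0.5 * delta))
        # Quantile midpoints: (2i + 1) / (2N) for i = 0..N-1.
        tau = (2. * np.arange(num_quantiles) + 1.) / (2. * num_quantiles)
        # Asymmetric weighting penalizes over- and under-estimates differently.
        return np.mean(huber * np.abs(tau - (u < 0)), axis=-1)

    u = np.array([[0.2, -0.5, 1.5, -2.0]])   # TD errors for 4 quantiles
    print(quantile_huber_loss(u, num_quantiles=4))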