def testLaplacian1D(self):
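        """Checks basic properties of the Laplacian built over 5 ordinal integer actions."""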
        action_spec = tensor_spec.BoundedTensorSpec(dtype=tf.int32,
                                                    shape=(),
                                                    minimum=0,
                                                    maximum=4)
        laplacian_matrix = tf.convert_to_tensor(
            utils.build_laplacian_over_ordinal_integer_actions(action_spec),
            dtype=tf.float32)
        res = tf.matmul(laplacian_matrix, tf.ones([5, 1], dtype=tf.float32))
        # The vector of ones is in the null space of the Laplacian matrix.
        self.assertAllClose(0.0, self.evaluate(tf.norm(res)))

        # The row sum is zero.
        row_sum = tf.reduce_sum(laplacian_matrix, 1)
        self.assertAllClose(0.0, self.evaluate(tf.norm(row_sum)))

        # The column sum is zero.
        column_sum = tf.reduce_sum(laplacian_matrix, 0)
        self.assertAllClose(0.0, self.evaluate(tf.norm(column_sum)))

        # The interior diagonal elements are 2.0 (the two endpoints are 1.0).
        self.assertAllClose(2.0, laplacian_matrix[1, 1])

        laplacian_matrix_expected = np.array([[1.0, -1.0, 0.0, 0.0, 0.0],
                                              [-1.0, 2.0, -1.0, 0.0, 0.0],
                                              [0.0, -1.0, 2.0, -1.0, 0.0],
                                              [0.0, 0.0, -1.0, 2.0, -1.0],
                                              [0.0, 0.0, 0.0, -1.0, 1.0]])
        self.assertAllClose(laplacian_matrix_expected,
                            self.evaluate(laplacian_matrix))
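# A minimal sketch (illustrative only, not the tf_agents implementation) of the
# matrix asserted above: the Laplacian of the path graph over ordinal integer
# actions is the degree matrix minus the adjacency matrix, where consecutive
# actions are the only neighbors. The helper name is hypothetical.
import numpy as np

def path_graph_laplacian(num_actions):
    adjacency = np.zeros((num_actions, num_actions), dtype=np.float32)
    for i in range(num_actions - 1):
        # Consecutive integer actions are adjacent on the path graph.
        adjacency[i, i + 1] = 1.0
        adjacency[i + 1, i] = 1.0
    degree = np.diag(adjacency.sum(axis=1))
    return degree - adjacency

# For num_actions=5 this reproduces `laplacian_matrix_expected` above.
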
def build_laplacian_over_ordinal_integer_actions_from_env(env):
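    """Builds the Laplacian matrix over the ordinal integer actions of `env`."""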
    return utils.build_laplacian_over_ordinal_integer_actions(
        env.action_spec())
def main(unused_argv):
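    """Trains the bandit agent selected by FLAGS.agent on a stationary linear environment."""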
    tf.compat.v1.enable_v2_behavior()  # The trainer only runs with V2 enabled.

    with tf.device('/CPU:0'):  # due to b/128333994
        action_reward_fns = (
            environment_utilities.structured_linear_reward_fn_generator(
                CONTEXT_DIM, NUM_ACTIONS, REWARD_NOISE_VARIANCE))

        env = sspe.StationaryStochasticPyEnvironment(
            functools.partial(
                environment_utilities.context_sampling_fn,
                batch_size=BATCH_SIZE,
                context_dim=CONTEXT_DIM),
            action_reward_fns,
            batch_size=BATCH_SIZE)
        environment = tf_py_environment.TFPyEnvironment(env)

        optimal_reward_fn = functools.partial(
            environment_utilities.tf_compute_optimal_reward,
            per_action_reward_fns=action_reward_fns)

        optimal_action_fn = functools.partial(
            environment_utilities.tf_compute_optimal_action,
            per_action_reward_fns=action_reward_fns)

        if FLAGS.agent == 'LinUCB':
            agent = lin_ucb_agent.LinearUCBAgent(
                time_step_spec=environment.time_step_spec(),
                action_spec=environment.action_spec(),
                alpha=AGENT_ALPHA,
                dtype=tf.float32)
        elif FLAGS.agent == 'epsGreedy':
            laplacian_matrix = utils.build_laplacian_over_ordinal_integer_actions(
                environment.action_spec())

            network = q_network.QNetwork(
                input_tensor_spec=environment.time_step_spec().observation,
                action_spec=environment.action_spec(),
                fc_layer_params=REWARD_NETWORK_LAYER_PARAMS)
            agent = eps_greedy_agent.NeuralEpsilonGreedyAgent(
                time_step_spec=environment.time_step_spec(),
                action_spec=environment.action_spec(),
                reward_network=network,
                optimizer=tf.compat.v1.train.AdamOptimizer(
                    learning_rate=NN_LEARNING_RATE),
                epsilon=EPSILON,
                laplacian_matrix=laplacian_matrix,
                laplacian_smoothing_weight=0.01)
        elif FLAGS.agent == 'LinTS':
            agent = lin_ts_agent.LinearThompsonSamplingAgent(
                time_step_spec=environment.time_step_spec(),
                action_spec=environment.action_spec(),
                alpha=AGENT_ALPHA,
                dtype=tf.float32)
        else:
            raise ValueError('Unsupported agent: {}'.format(FLAGS.agent))

        regret_metric = tf_bandit_metrics.RegretMetric(optimal_reward_fn)
        suboptimal_arms_metric = tf_bandit_metrics.SuboptimalArmsMetric(
            optimal_action_fn)

        trainer.train(
            root_dir=FLAGS.root_dir,
            agent=agent,
            environment=environment,
            training_loops=TRAINING_LOOPS,
            steps_per_loop=STEPS_PER_LOOP,
            additional_metrics=[regret_metric, suboptimal_arms_metric])
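
if __name__ == '__main__':
    # Standard absl entry point for this kind of example script; assumes
    # `from absl import app` is imported at module level alongside FLAGS.
    app.run(main)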