Example #1
def main(unused_argv):
    tf.compat.v1.enable_resource_variables()

    with tf.device('/CPU:0'):  # due to b/128333994
        env = wheel_py_environment.WheelPyEnvironment(DELTA, MU_BASE, STD_BASE,
                                                      MU_HIGH, STD_HIGH,
                                                      BATCH_SIZE)
        environment = tf_py_environment.TFPyEnvironment(env)

        optimal_reward_fn = functools.partial(
            environment_utilities.tf_wheel_bandit_compute_optimal_reward,
            delta=DELTA,
            mu_inside=MU_BASE[0],
            mu_high=MU_HIGH)
        optimal_action_fn = functools.partial(
            environment_utilities.tf_wheel_bandit_compute_optimal_action,
            delta=DELTA)

        if FLAGS.agent == 'LinUCB':
            agent = lin_ucb_agent.LinearUCBAgent(
                time_step_spec=environment.time_step_spec(),
                action_spec=environment.action_spec(),
                alpha=AGENT_ALPHA,
                dtype=tf.float32)
        elif FLAGS.agent == 'LinTS':
            agent = lin_ts_agent.LinearThompsonSamplingAgent(
                time_step_spec=environment.time_step_spec(),
                action_spec=environment.action_spec(),
                alpha=AGENT_ALPHA,
                dtype=tf.float32)
        elif FLAGS.agent == 'epsGreedy':
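            # The Q-network below is used as the reward-estimation network of
            # the neural epsilon-greedy agent.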
            network = q_network.QNetwork(
                input_tensor_spec=environment.time_step_spec().observation,
                action_spec=environment.action_spec(),
                fc_layer_params=LAYERS)
            agent = eps_greedy_agent.NeuralEpsilonGreedyAgent(
                time_step_spec=environment.time_step_spec(),
                action_spec=environment.action_spec(),
                reward_network=network,
                optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
                epsilon=EPSILON)

        regret_metric = tf_bandit_metrics.RegretMetric(optimal_reward_fn)
        suboptimal_arms_metric = tf_bandit_metrics.SuboptimalArmsMetric(
            optimal_action_fn)

        trainer.train(
            root_dir=FLAGS.root_dir,
            agent=agent,
            environment=environment,
            training_loops=TRAINING_LOOPS,
            steps_per_loop=STEPS_PER_LOOP,
            additional_metrics=[regret_metric, suboptimal_arms_metric])
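
These listings are function bodies only; the imports, flag definitions, and module-level constants (DELTA, MU_BASE, BATCH_SIZE, and so on) are omitted. A minimal launcher sketch, assuming an absl-based binary: the flag names root_dir and agent are taken from the code above, while the defaults here are illustrative only.

from absl import app
from absl import flags

flags.DEFINE_string('root_dir', '/tmp/wheel_bandit/',
                    'Root directory for checkpoints and summaries.')
flags.DEFINE_enum('agent', 'LinUCB', ['LinUCB', 'LinTS', 'epsGreedy'],
                  'Which bandit agent to train.')
FLAGS = flags.FLAGS

if __name__ == '__main__':
    app.run(main)
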
Example #2
def main(unused_argv):
    tf.compat.v1.enable_resource_variables()

    with tf.device('/CPU:0'):  # due to b/128333994
        observation_shape = [CONTEXT_DIM]
        overall_shape = [BATCH_SIZE] + observation_shape
        observation_distribution = tfd.Normal(loc=tf.zeros(overall_shape),
                                              scale=tf.ones(overall_shape))
        action_shape = [NUM_ACTIONS]
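        # observation_shape + action_shape == [CONTEXT_DIM, NUM_ACTIONS]: one
        # linear reward model (a column of weights) per arm.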
        observation_to_reward_shape = observation_shape + action_shape
        observation_to_reward_distribution = tfd.Normal(
            loc=tf.zeros(observation_to_reward_shape),
            scale=tf.ones(observation_to_reward_shape))
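        # Note: the scale argument of tfd.Normal is a standard deviation.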
        drift_distribution = tfd.Normal(loc=DRIFT_MEAN, scale=DRIFT_VARIANCE)
        additive_reward_distribution = tfd.Normal(
            loc=tf.zeros(action_shape),
            scale=(REWARD_NOISE_VARIANCE * tf.ones(action_shape)))
        environment_dynamics = dle.DriftingLinearDynamics(
            observation_distribution, observation_to_reward_distribution,
            drift_distribution, additive_reward_distribution)
        environment = nse.NonStationaryStochasticEnvironment(
            environment_dynamics)

        if FLAGS.agent == 'LinUCB':
            agent = lin_ucb_agent.LinearUCBAgent(
                time_step_spec=environment.time_step_spec(),
                action_spec=environment.action_spec(),
                alpha=AGENT_ALPHA,
                gamma=0.95,
                emit_log_probability=False,
                dtype=tf.float32)
        elif FLAGS.agent == 'LinTS':
            agent = lin_ts_agent.LinearThompsonSamplingAgent(
                time_step_spec=environment.time_step_spec(),
                action_spec=environment.action_spec(),
                alpha=AGENT_ALPHA,
                gamma=0.95,
                dtype=tf.float32)

        regret_metric = tf_bandit_metrics.RegretMetric(
            environment.environment_dynamics.compute_optimal_reward)
        suboptimal_arms_metric = tf_bandit_metrics.SuboptimalArmsMetric(
            environment.environment_dynamics.compute_optimal_action)

        trainer.train(
            root_dir=FLAGS.root_dir,
            agent=agent,
            environment=environment,
            training_loops=TRAINING_LOOPS,
            steps_per_loop=STEPS_PER_LOOP,
            additional_metrics=[regret_metric, suboptimal_arms_metric])
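
Example #2 builds a drifting linear bandit: the shapes above imply that each CONTEXT_DIM-sized observation is mapped to per-arm expected rewards through a [CONTEXT_DIM, NUM_ACTIONS] weight matrix, with additive noise and a drift applied over time. A minimal sketch of that linear reward model, using illustrative constants rather than the original values:

import tensorflow as tf

BATCH_SIZE, CONTEXT_DIM, NUM_ACTIONS = 8, 15, 5  # illustrative values
observations = tf.random.normal([BATCH_SIZE, CONTEXT_DIM])
observation_to_reward = tf.random.normal([CONTEXT_DIM, NUM_ACTIONS])
# Per-arm expected rewards, shape [BATCH_SIZE, NUM_ACTIONS].
expected_rewards = tf.matmul(observations, observation_to_reward)
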
Example #3
def main(unused_argv):
    tf.compat.v1.enable_resource_variables()

    with tf.device('/CPU:0'):  # due to b/128333994
        action_reward_fns = (
            environment_utilities.sliding_linear_reward_fn_generator(
                CONTEXT_DIM, NUM_ACTIONS, REWARD_NOISE_VARIANCE))

        env = sspe.StationaryStochasticPyEnvironment(
            functools.partial(
                environment_utilities.context_sampling_fn,
                batch_size=BATCH_SIZE,
                context_dim=CONTEXT_DIM),
            action_reward_fns,
            batch_size=BATCH_SIZE)
        environment = tf_py_environment.TFPyEnvironment(env)

        optimal_reward_fn = functools.partial(
            environment_utilities.tf_compute_optimal_reward,
            per_action_reward_fns=action_reward_fns)

        optimal_action_fn = functools.partial(
            environment_utilities.tf_compute_optimal_action,
            per_action_reward_fns=action_reward_fns)

        if FLAGS.agent == 'LinUCB':
            agent = lin_ucb_agent.LinearUCBAgent(
                time_step_spec=environment.time_step_spec(),
                action_spec=environment.action_spec(),
                alpha=AGENT_ALPHA,
                dtype=tf.float32)
        elif FLAGS.agent == 'LinTS':
            agent = lin_ts_agent.LinearThompsonSamplingAgent(
                time_step_spec=environment.time_step_spec(),
                action_spec=environment.action_spec(),
                alpha=AGENT_ALPHA,
                dtype=tf.float32)

        regret_metric = tf_bandit_metrics.RegretMetric(optimal_reward_fn)
        suboptimal_arms_metric = tf_bandit_metrics.SuboptimalArmsMetric(
            optimal_action_fn)

        trainer.train(
            root_dir=FLAGS.root_dir,
            agent=agent,
            environment=environment,
            training_loops=TRAINING_LOOPS,
            steps_per_loop=STEPS_PER_LOOP,
            additional_metrics=[regret_metric, suboptimal_arms_metric])
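
Example #3 hands StationaryStochasticPyEnvironment a context-sampling callable (built with functools.partial) and a list of per-action reward functions. A hedged sketch of what such callables can look like, assuming the usual interface of this environment; the names and values below are illustrative, not the ones defined in environment_utilities:

import numpy as np

BATCH_SIZE, CONTEXT_DIM = 8, 15  # illustrative values

def sample_contexts():
    # Returns one batch of observations, shape [BATCH_SIZE, CONTEXT_DIM].
    return np.random.randn(BATCH_SIZE, CONTEXT_DIM).astype(np.float32)

def make_linear_reward_fn(weights, noise_std):
    # Each per-action reward function maps a single observation to a scalar reward.
    def reward_fn(observation):
        return float(np.dot(observation, weights) + np.random.normal(scale=noise_std))
    return reward_fn
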
Example #4
    def testTrainerTF1ExportsCheckpoints(self, num_actions, observation_shape,
                                         action_shape, batch_size,
                                         training_loops, steps_per_loop,
                                         learning_rate):
        """Tests TF1 trainer code, checks that expected checkpoints are exported."""
        root_dir = tempfile.mkdtemp(dir=os.getenv('TEST_TMPDIR'))
        environment = get_bounded_reward_random_environment(
            observation_shape, action_shape, batch_size, num_actions)
        agent = exp3_agent.Exp3Agent(
            learning_rate=learning_rate,
            time_step_spec=environment.time_step_spec(),
            action_spec=environment.action_spec())

        trainer.train(root_dir, agent, environment, training_loops,
                      steps_per_loop)
        latest_checkpoint = tf.train.latest_checkpoint(
            os.path.join(root_dir, 'train'))
        expected_checkpoint_regex = '.*ckpt-{}'.format(
            training_loops * batch_size * steps_per_loop)
        self.assertRegex(latest_checkpoint, expected_checkpoint_regex)
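
The assertion above expects the index of the latest checkpoint to equal training_loops * batch_size * steps_per_loop, i.e. the total number of per-environment steps driven by the trainer. A worked instance with illustrative parameter values:

training_loops, batch_size, steps_per_loop = 5, 2, 3
global_step = training_loops * batch_size * steps_per_loop  # 30
expected_checkpoint_regex = '.*ckpt-{}'.format(global_step)  # matches e.g. '.../train/ckpt-30'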