def example_1():
    """
    Example 1: pit several Players against each other on a single Bandit.

    Compares rewards and percentage of optimum action selection between
    various methods, using the 'pit' run mode of Arena. The results are
    produced in the form of two plots.
    """
    arena = Arena('base_problem')

    # Bandit settings, one column per constructor argument, in the order
    # Bandit receives them. Every column has one entry, so exactly one
    # Bandit is created.
    bandit_columns = (
        [10],    # actions
        [1000],  # timesteps
        [2000],  # runs
        [0],     # first considered reward step
        [0],     # initial mean
        [1],     # initial stddev
        [1],     # action stddev
        [0],     # delta mean
        [0],     # delta stddev
    )
    bandits = []
    for bandit_args in zip(*bandit_columns):
        bandits.append(Bandit(*bandit_args))
    arena.add_bandits(bandits)

    # Players are built in the order they are compared: random, epsilon-greedy
    # Q (initial_Q 0 and 5), UCB Q (initial_Q 0 and 5), and gradient players
    # with and without the baseline reward.
    contenders = [RandomPlayer()]
    contenders.extend(
        QPlayer(initial_Q=start_q, epsilon=0.1) for start_q in (0, 5)
    )
    contenders.extend(
        UCBQPlayer(initial_Q=start_q, confidence_level=2) for start_q in (0, 5)
    )
    contenders.extend(
        GradientPlayer(step_size_parameter=0.1, use_baseline_reward=with_baseline)
        for with_baseline in (True, False)
    )
    arena.add_players(contenders)

    # Run the Arena in pit mode.
    arena.run('pit')
def example_2():
    """
    Example 2: parameter study of various Players on a nonstationary Bandit.

    Each Player sweeps one of its own parameters over a base-2 logarithmic
    range. The results are produced in the form of a single plot.

    For a stationary Bandit, set the delta mean and delta stddev settings
    below to 0.
    """
    def pow2_range(low_exp, high_exp, count):
        # `count` values spaced evenly in exponent from 2**low_exp to 2**high_exp.
        return np.logspace(low_exp, high_exp, num=count, base=2.0, dtype=float).tolist()

    arena = Arena('base_problem')

    # Study ranges, one per swept Player parameter.
    eps_range = pow2_range(-7, -1, 7)
    q0_range = pow2_range(-2, 3, 6)
    ucb_c_range = pow2_range(-4, 3, 8)
    step_range = pow2_range(-5, 2, 8)
    # NOTE(review): num=2 yields only the two endpoints 2**-8 and 2**4;
    # confirm whether a denser grid was intended here.
    parameter_range = pow2_range(-8, 4, 2)

    # Bandit settings, one column per constructor argument, in the order
    # Bandit receives them. Every column has one entry, so exactly one
    # Bandit is created.
    bandit_columns = (
        [10],    # actions
        [1000],  # timesteps
        [2000],  # runs
        [0],     # first considered reward step
        [0],     # initial mean
        [1],     # initial stddev
        [1],     # action stddev
        [0],     # delta mean
        [0.01],  # delta stddev (nonzero: nonstationary)
    )
    bandits = []
    for bandit_args in zip(*bandit_columns):
        bandits.append(Bandit(*bandit_args))
    arena.add_bandits(bandits)

    # Players are created in the order they are handed to the Arena. The
    # third positional argument (0.1), where present, is referred to as
    # alpha in the original notes.

    # epsilon greedy, initial_Q = 0 (study epsilon)
    eps_greedy = QPlayer(
        0, eps_range[0],
        study_variable='epsilon', study_range=eps_range)
    # epsilon greedy with alpha 0.1, initial_Q = 0 (study epsilon)
    eps_greedy_alpha = QPlayer(
        0, eps_range[0], 0.1,
        study_variable='epsilon', study_range=eps_range)
    # greedy with alpha 0.1 (study initial_Q)
    greedy_alpha = QPlayer(
        q0_range[0], 0, 0.1,
        study_variable='initial_Q', study_range=q0_range)
    # UCB, initial_Q = 0 (study confidence_level)
    ucb = UCBQPlayer(
        0, ucb_c_range[0],
        study_variable='confidence_level', study_range=ucb_c_range)
    # UCB, initial_Q = 0, alpha = 0.1 (study confidence_level)
    ucb_alpha = UCBQPlayer(
        0, ucb_c_range[0], 0.1,
        study_variable='confidence_level', study_range=ucb_c_range)
    # gradient bandit, baseline setting left at the Player's default
    # (study step_size_parameter)
    gradient = GradientPlayer(
        step_range[0],
        study_variable='step_size_parameter', study_range=step_range)

    arena.add_players([
        eps_greedy,
        eps_greedy_alpha,
        greedy_alpha,
        ucb,
        ucb_alpha,
        gradient,
    ])

    # Run the Arena in parameter study mode.
    arena.run('parameter_study', parameter_range)