def run_specific():
    # This function runs the experiment for one specific set of parameters

    # Experiment parameters
    experiment_parameters = {"max_steps": 300000, "num_runs": 50}

    # Environment parameters
    environment_parameters = {}

    # Agent parameters
    # Each element is an array because we will later sweep over multiple values
    # actor and critic step-sizes are divided by num. tilings inside the agent
    agent_parameters = {
        "num_tilings": [32],
        "num_tiles": [8],
        "actor_step_size": [2**(-2)],
        "critic_step_size": [2**1],
        "avg_reward_step_size": [2**(-6)],
        "num_actions": 3,
        "iht_size": 4096
    }

    current_env = PendulumEnvironment
    current_agent = ActorCriticSoftmaxAgent

    run_experiment(current_env, current_agent, environment_parameters, agent_parameters, experiment_parameters)
    plot_script.plot_result(agent_parameters, 'results_actor_critic')
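# A minimal, illustrative sketch of the step-size scaling mentioned in the comment
# above (the division by the number of tilings happens inside the agent; the variable
# names below are assumptions for illustration, not the agent's actual attributes):
num_tilings = 32
effective_actor_step_size = 2 ** (-2) / num_tilings   # 0.25 / 32 = 0.0078125
effective_critic_step_size = 2 ** 1 / num_tilings     # 2 / 32 = 0.0625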
"start_state": 250, "left_terminal_state": 0, "right_terminal_state": 501, "discount_factor": 1.0 } # Agent parameters # Each element is an array because we will be later sweeping over multiple values agent_parameters = {"num_groups": [10], "step_size": [0.01, 0.05, 0.1]} current_env = RandomWalkEnvironment current_agent = TDAgent run_experiment(current_env, current_agent, environment_parameters, agent_parameters, experiment_parameters) plot_script.plot_result(agent_parameters, 'results') # Is the learned state value plot with step-size=0.01 similar to Figure 9.2 (p.208) in Sutton and Barto? # # (Note that our environment has less states: 500 states and we have done 2000 episodes, and averaged the performance over 50 runs) # # Look at the plot of the learning curve. Does RMSVE decrease over time? # # Would it be possible to reduce RMSVE to 0? # # You should see the RMSVE decrease over time, but the error seems to plateau. It is impossible to reduce RMSVE to 0, because of function approximation (and we do not decay the step-size parameter to zero). With function approximation, the agent has limited resources and has to trade-off the accuracy of one state for another state. # Run the following code to verify your experimental result. # In[34]:
agent_parameters = {
    "num_tilings": [32],
    "num_tiles": [8],
    "actor_step_size": [2**(-2)],
    "critic_step_size": [2**1],
    "avg_reward_step_size": [2**(-6)],
    "num_actions": 3,
    "iht_size": 4096
}

current_env = PendulumEnvironment
current_agent = ActorCriticSoftmaxAgent

run_experiment(current_env, current_agent, environment_parameters, agent_parameters, experiment_parameters)
plot_script.plot_result(agent_parameters, 'results')

# Run the following code to verify your experimental result.

# In[25]:

# ---------------
# Discussion Cell
# ---------------

## Test Code for experimental result ##
filename = 'ActorCriticSoftmax_tilings_32_tiledim_8_actor_ss_0.25_critic_ss_2_avg_reward_ss_0.015625_exp_avg_reward'
agent_exp_avg_reward = np.load('results/{}.npy'.format(filename), allow_pickle=True)
result_med = np.median(agent_exp_avg_reward, axis=0)
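# A quick, optional way to inspect the verification data loaded above: plot the median
# exponentially weighted average reward across runs. This matplotlib snippet is an
# illustrative sketch; the notebook's own plotting goes through plot_script.plot_result.
import matplotlib.pyplot as plt
plt.plot(result_med)
plt.xlabel("recorded step")
plt.ylabel("median exponential average reward")
plt.show()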
# for batchsz in [1, 10, 100, 1000]:
#     print(batchsz)
#     params['batch_size'] = batchsz
#     nm = ''
#     params['name'] = f'Batchsize {batchsz}'

env_infos = {
    'States: only walls': {'state_space': 'no body knowledge'},
    'States: direction 0 or 1': {'state_space': ''},
    'States: coordinates': {'state_space': 'coordinates'},
    'States: no direction': {'state_space': 'no direction'}
}

# for key in env_infos.keys():
#     params['name'] = key
#     env_info = env_infos[key]
#     print(env_info)
#     env = Snake(env_info=env_info)

env = Snake()
sum_of_rewards = train_dqn(ep, env)
results[params['name']] = sum_of_rewards

plot_result(results, direct=True, k=20)
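# The commented-out loop above sketches a sweep over the state representations in
# env_infos. A minimal, guarded version of that sweep is shown below; it is disabled
# by default so the single run above remains the default behavior, and it assumes
# Snake accepts the env_info keyword and that params, ep, and results are already
# defined earlier in the notebook.
run_state_space_sweep = False
if run_state_space_sweep:
    for name, env_info in env_infos.items():
        params['name'] = name
        env = Snake(env_info=env_info)
        results[name] = train_dqn(ep, env)
    plot_result(results, direct=True, k=20)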
environment_parameters = {}

current_env = LunarLanderEnvironment

# Agent parameters
agent_parameters = {
    'network_config': {
        'state_dim': 8,
        'num_hidden_units': 256,
        'num_actions': 4
    },
    'optimizer_config': {
        'step_size': 1e-3,
        'beta_m': 0.9,
        'beta_v': 0.999,
        'epsilon': 1e-8
    },
    'replay_buffer_size': 50000,
    'minibatch_sz': 8,
    'num_replay_updates_per_step': 4,
    'gamma': 0.99,
    'tau': 0.001
}
current_agent = Agent

# run experiment
run_experiment(current_env, current_agent, environment_parameters, agent_parameters, experiment_parameters)

plot_result(["expected_sarsa_agent", "random_agent"])
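# The optimizer_config above holds standard Adam hyperparameters (step_size, beta_m,
# beta_v, epsilon). A minimal NumPy sketch of the generic Adam rule for a single
# weight vector, written as a plain gradient-descent step; this is an illustration
# only, not the agent's own optimizer code, and the names are assumptions.
import numpy as np

def adam_update(w, g, m, v, t, step_size=1e-3, beta_m=0.9, beta_v=0.999, epsilon=1e-8):
    # Exponentially decayed first and second moment estimates of the gradient g.
    m = beta_m * m + (1 - beta_m) * g
    v = beta_v * v + (1 - beta_v) * g ** 2
    # Bias-correct the moments, then scale the step elementwise.
    m_hat = m / (1 - beta_m ** t)
    v_hat = v / (1 - beta_v ** t)
    w = w - step_size * m_hat / (np.sqrt(v_hat) + epsilon)
    return w, m, v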
"num_hidden_units": 100, "step_size": 0.001, "beta_m": 0.9, "beta_v": 0.999, "epsilon": 0.0001, } current_env = RandomWalkEnvironment current_agent = TDAgent # run experiment run_experiment(current_env, current_agent, environment_parameters, agent_parameters, experiment_parameters) # plot result plot_script.plot_result(["td_agent"]) shutil.make_archive('results', 'zip', 'results') # You plotted the learning curve for 1000 episodes. As you can see the RMSVE is still decreasing. Here we provide the pre-computed result for 5000 episodes and 20 runs so that you can see the performance of semi-gradient TD with a neural network after being trained for a long time. # # ![](nn_5000_episodes.png) # # Does semi-gradient TD with a neural network find a good approximation within 5000 episodes? # # As you may remember from the previous assignment, semi-gradient TD with 10-state aggregation converged within 100 episodes. Why is TD with a neural network slower? # # Would it be faster if we decrease the number of hidden units? Or what about if we increase the number of hidden units? # ## 2-2: Compare Performance of Semi-gradient TD with a Neural Network and Semi-gradient TD with Tile-coding