def run_specific():
    # Runs the experiment for a single, specific parameter setting
    # Experiment parameters
    experiment_parameters = {"max_steps": 300000, "num_runs": 50}

    # Environment parameters
    environment_parameters = {}

    # Agent parameters
    #   Each element is an array because we will later sweep over multiple values
    # The actor and critic step-sizes are divided by the number of tilings inside the agent
    agent_parameters = {
        "num_tilings": [32],
        "num_tiles": [8],
        "actor_step_size": [2**(-2)],
        "critic_step_size": [2**1],
        "avg_reward_step_size": [2**(-6)],
        "num_actions": 3,
        "iht_size": 4096
    }

    current_env = PendulumEnvironment
    current_agent = ActorCriticSoftmaxAgent

    run_experiment(current_env, current_agent, environment_parameters,
                   agent_parameters, experiment_parameters)
    plot_script.plot_result(agent_parameters, 'results_actor_critic')
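
# The comment inside run_specific() says the actor and critic step-sizes are divided
# by the number of tilings inside the agent. As a quick illustration (a sketch, not
# part of the original example), the effective per-update step-sizes for that setting
# would be:

num_tilings = 32
print(2**(-2) / num_tilings)   # effective actor step-size: 0.0078125
print(2**1 / num_tilings)      # effective critic step-size: 0.0625
# Per the comment, the average-reward step-size (2**(-6) = 0.015625) is not divided.
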
    "start_state": 250,
    "left_terminal_state": 0,
    "right_terminal_state": 501,
    "discount_factor": 1.0
}

# Agent parameters
# Each element is an array because we will later sweep over multiple values
agent_parameters = {"num_groups": [10], "step_size": [0.01, 0.05, 0.1]}

current_env = RandomWalkEnvironment
current_agent = TDAgent

run_experiment(current_env, current_agent, environment_parameters,
               agent_parameters, experiment_parameters)
plot_script.plot_result(agent_parameters, 'results')
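
# The agent-parameter values above are given as arrays because run_experiment sweeps
# over them. run_experiment's source is not shown in this example; a minimal sketch of
# such a sweep (sweep_agent_parameters is a hypothetical helper, not the assignment's
# API) might look like this:

import itertools

def sweep_agent_parameters(agent_parameters):
    # Yield one flat settings dict per combination of the list-valued entries,
    # keeping scalar entries (e.g. num_actions, iht_size) fixed.
    sweep_keys = [k for k, v in agent_parameters.items() if isinstance(v, list)]
    fixed = {k: v for k, v in agent_parameters.items() if not isinstance(v, list)}
    for combo in itertools.product(*(agent_parameters[k] for k in sweep_keys)):
        settings = dict(fixed)
        settings.update(zip(sweep_keys, combo))
        yield settings

# For {"num_groups": [10], "step_size": [0.01, 0.05, 0.1]} this yields three settings,
# one per step-size.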

# Is the learned state value plot with step-size=0.01 similar to Figure 9.2 (p.208) in Sutton and Barto?
#
# (Note that our environment has fewer states: 500. We ran 2000 episodes and averaged the performance over 50 runs.)
#
# Look at the plot of the learning curve. Does RMSVE decrease over time?
#
# Would it be possible to reduce RMSVE to 0?
#
# You should see the RMSVE decrease over time, but the error seems to plateau. It is impossible to reduce the RMSVE to 0 because of function approximation (and because we do not decay the step-size parameter to zero). With function approximation, the agent has limited resources and must trade off accuracy in one state against accuracy in another.
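
# To make that trade-off concrete, here is a minimal sketch (not the assignment's code)
# of the RMSVE for a state-aggregation value estimate: every state in a group shares one
# weight, so reducing the error for one state in a group can increase it for another.

import numpy as np

def rmsve(true_values, group_weights, group_size, state_distribution):
    # Approximate value of each state is the single weight of its group.
    approx_values = np.repeat(group_weights, group_size)
    squared_error = (true_values - approx_values) ** 2
    return np.sqrt(np.sum(state_distribution * squared_error))

# With 500 states and 10 groups, group_size is 50; true_values and state_distribution
# come from the known dynamics of the 500-state random walk.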

# Run the following code to verify your experimental result.

# In[34]:
Example #3
agent_parameters = {
    "num_tilings": [32],
    "num_tiles": [8],
    "actor_step_size": [2**(-2)],
    "critic_step_size": [2**1],
    "avg_reward_step_size": [2**(-6)],
    "num_actions": 3,
    "iht_size": 4096
}

current_env = PendulumEnvironment
current_agent = ActorCriticSoftmaxAgent

run_experiment(current_env, current_agent, environment_parameters,
               agent_parameters, experiment_parameters)
plot_script.plot_result(agent_parameters, 'results')

# Run the following code to verify your experimental result.

# In[25]:

# ---------------
# Discussion Cell
# ---------------

## Test Code for experimental result ##
import numpy as np
filename = 'ActorCriticSoftmax_tilings_32_tiledim_8_actor_ss_0.25_critic_ss_2_avg_reward_ss_0.015625_exp_avg_reward'
agent_exp_avg_reward = np.load('results/{}.npy'.format(filename),
                               allow_pickle=True)
result_med = np.median(agent_exp_avg_reward, axis=0)
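
# The numbers embedded in the filename above are the swept parameter values written in
# decimal: actor_ss 0.25 = 2**(-2), critic_ss 2 = 2**1, avg_reward_ss 0.015625 = 2**(-6).
# As a quick sketch (not part of the original test cell), the name can be rebuilt from
# those values and checked against the string above:

rebuilt = ("ActorCriticSoftmax_tilings_{}_tiledim_{}_actor_ss_{}"
           "_critic_ss_{}_avg_reward_ss_{}_exp_avg_reward"
           .format(32, 8, 2**(-2), 2**1, 2**(-6)))
assert rebuilt == filename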
Example #4
    # for batchsz in [1, 10, 100, 1000]:
    #     print(batchsz)
    #     params['batch_size'] = batchsz
    #     nm = ''
    #     params['name'] = f'Batchsize {batchsz}'
    env_infos = {
        'States: only walls': {
            'state_space': 'no body knowledge'
        },
        'States: direction 0 or 1': {
            'state_space': ''
        },
        'States: coordinates': {
            'state_space': 'coordinates'
        },
        'States: no direction': {
            'state_space': 'no direction'
        }
    }

    # for key in env_infos.keys():
    #     params['name'] = key
    #     env_info = env_infos[key]
    #     print(env_info)
    #     env = Snake(env_info=env_info)
    env = Snake()
    sum_of_rewards = train_dqn(ep, env)
    results[params['name']] = sum_of_rewards

    plot_result(results, direct=True, k=20)
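
    # The commented-out lines above describe sweeping training over every state-space
    # variant in env_infos. A small sketch of that sweep, wrapped in a function so it
    # does not run by default (Snake, train_dqn, ep, params, results, and plot_result
    # are the names assumed by the surrounding example):
    def sweep_env_infos(env_infos, ep, params, results):
        for key, env_info in env_infos.items():
            params['name'] = key
            env = Snake(env_info=env_info)
            results[key] = train_dqn(ep, env)
        plot_result(results, direct=True, k=20)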
Example #5
environment_parameters = {}

current_env = LunarLanderEnvironment

# Agent parameters
agent_parameters = {
    'network_config': {
        'state_dim': 8,
        'num_hidden_units': 256,
        'num_actions': 4
    },
    'optimizer_config': {
        'step_size': 1e-3,
        'beta_m': 0.9, 
        'beta_v': 0.999,
        'epsilon': 1e-8
    },
    'replay_buffer_size': 50000,
    'minibatch_sz': 8,
    'num_replay_updates_per_step': 4,
    'gamma': 0.99,
    'tau': 0.001
}
current_agent = Agent

# run experiment
run_experiment(current_env, current_agent, environment_parameters, agent_parameters, experiment_parameters)


plot_result(["expected_sarsa_agent", "random_agent"])
Example #6
    "num_hidden_units": 100,
    "step_size": 0.001,
    "beta_m": 0.9,
    "beta_v": 0.999,
    "epsilon": 0.0001,
}

current_env = RandomWalkEnvironment
current_agent = TDAgent

# run experiment
run_experiment(current_env, current_agent, environment_parameters,
               agent_parameters, experiment_parameters)

# plot result
plot_script.plot_result(["td_agent"])

shutil.make_archive('results', 'zip', 'results')

# You plotted the learning curve for 1000 episodes. As you can see, the RMSVE is still decreasing. Here we provide a pre-computed result for 5000 episodes and 20 runs so that you can see how semi-gradient TD with a neural network performs after training for a long time.
#
# ![](nn_5000_episodes.png)
#

# Does semi-gradient TD with a neural network find a good approximation within 5000 episodes?
#
# As you may remember from the previous assignment, semi-gradient TD with 10-state aggregation converged within 100 episodes. Why is TD with a neural network slower?
#
# Would it be faster if we decreased the number of hidden units? What about if we increased the number of hidden units?
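
# For reference when thinking about these questions: both agents perform the
# semi-gradient TD(0) update w <- w + alpha * delta * grad_v(S, w), differing only in
# how the value estimate is represented (state aggregation vs. a neural network). A
# minimal sketch, where v_hat and grad_v_hat are hypothetical stand-ins for the value
# estimate and its gradient with respect to the weights (not the assignment's TDAgent
# API):

def semi_gradient_td_step(weights, state, reward, next_state, terminal,
                          v_hat, grad_v_hat, alpha=0.001, gamma=1.0):
    # The TD target bootstraps from the next state's estimated value unless terminal.
    target = reward if terminal else reward + gamma * v_hat(next_state, weights)
    delta = target - v_hat(state, weights)
    # Semi-gradient: only the current state's estimate is differentiated.
    return [w + alpha * delta * g for w, g in zip(weights, grad_v_hat(state, weights))]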

# ## 2-2: Compare Performance of Semi-gradient TD with a Neural Network and Semi-gradient TD with Tile-coding