def test_run_with_callbacks(self):
    # Set up environment and policy
    env = gym.make('MountainCar-v0')
    pol = vcf.DiscreteRandomControl(env.action_space.n)

    # Define an agent
    phi_1 = vcf.BiasUnit()
    td_1 = vcf.TD(len(phi_1))
    params_1 = {
        'alpha': 0.01,
        'gm': 0.999,
        'gm_p': 0.999,
        'lm': 0.1,
    }
    agent_1 = vcf.Agent(td_1, phi_1, params_1)
    agents = [agent_1]

    # Set up the experiment
    experiment = vcf.LiveExperiment(env, pol, agents)

    # Set up testing callbacks
    cbk = _CheckCallback()

    # Try running the experiment
    num_eps = 10
    max_steps = 10
    experiment.run(num_eps, max_steps, callbacks=[cbk])

    # Check that the callbacks ran properly
    assert cbk.experiment_begin > 0
    assert cbk.experiment_end > 0
    assert cbk.episode_begin > 0
    assert cbk.episode_end > 0
    assert cbk.step_begin > 0
    assert cbk.step_end > 0
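# The `_CheckCallback` helper referenced above is not defined in this excerpt.
# A minimal sketch of what it could look like follows; the hook method names
# (`on_experiment_begin`, `on_step_end`, etc.) are assumed for illustration and
# may differ from the library's actual callback interface.
class _CheckCallback:
    """Counts how often each experiment hook is invoked."""

    def __init__(self):
        self.experiment_begin = 0
        self.experiment_end = 0
        self.episode_begin = 0
        self.episode_end = 0
        self.step_begin = 0
        self.step_end = 0

    def on_experiment_begin(self, *args, **kwargs):
        self.experiment_begin += 1

    def on_experiment_end(self, *args, **kwargs):
        self.experiment_end += 1

    def on_episode_begin(self, *args, **kwargs):
        self.episode_begin += 1

    def on_episode_end(self, *args, **kwargs):
        self.episode_end += 1

    def on_step_begin(self, *args, **kwargs):
        self.step_begin += 1

    def on_step_end(self, *args, **kwargs):
        self.step_end += 1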
def test_run(self):
    # Set up environment and policy
    env = gym.make('MountainCar-v0')
    pol = vcf.DiscreteRandomControl(env.action_space.n)

    # Define an agent
    phi_1 = vcf.BiasUnit()
    td_1 = vcf.TD(len(phi_1))
    params_1 = {
        'alpha': 0.01,
        'gm': 0.999,
        'gm_p': 0.999,
        'lm': 0.1,
    }
    agent_1 = vcf.Agent(td_1, phi_1, params_1)
    agents = [agent_1]

    # Set up the experiment
    experiment = vcf.LiveExperiment(env, pol, agents)

    # Try running the experiment
    num_eps = 10
    max_steps = 10
    experiment.run(num_eps, max_steps, callbacks=[])
def test_terminal_context(self):
    # Set up the agent
    param_funcs = {
        'alpha': 0.05,
        'gm': vcf.Constant(0.9999, 0),
        'gm_p': vcf.Constant(0.9999, 0),
        'lm': 0.1,
    }
    phi = vcf.BinaryVector(10)
    algo = vcf.TD(len(phi))
    agent = vcf.Agent(algo, phi, param_funcs)

    # No base context
    base_ctx = {}
    term_ctx = agent.terminal_context(base_ctx)
    assert isinstance(term_ctx, dict)
    assert term_ctx['done'] == True
    assert term_ctx['r'] == 0
    assert all(term_ctx['xp'] == 0)

    # Nonsense base context (its entries should still be passed through)
    base_ctx = {'__' + str(i): i**2 for i in range(10)}
    term_ctx = agent.terminal_context(base_ctx)
    assert isinstance(term_ctx, dict)
    assert term_ctx['done'] == True
    assert term_ctx['r'] == 0
    assert all(term_ctx['xp'] == 0)
    assert all(key in term_ctx for key in base_ctx.keys())
    assert all(term_ctx[key] == val for key, val in base_ctx.items())
def test_setup(self):
    # Set up the agent
    param_funcs = {
        'alpha': 0.05,
        'gm': vcf.Constant(0.9999, 0),
        'gm_p': vcf.Constant(0.9999, 0),
        'lm': 0.1,
    }
    phi = vcf.BinaryVector(10)
    algo = vcf.TD(len(phi))
    agent = vcf.Agent(algo, phi, param_funcs)
    'gm_p': vcf.Constant(0.999, 0),
    'lm': vcf.Constant(0.01, 0),
}
q_phi = vcf.BinaryVector(ns)
q_algo = vcf.DiscreteQ(len(q_phi), na, epsilon=0.05)
control = vcf.Agent(q_algo, q_phi, q_params)

# Define some other agents that simply learn the value function
phi1 = vcf.BinaryVector(ns)
td_params = {
    'alpha': vcf.Constant(0.01),
    'gm': vcf.Constant(0.999, 0),
    'gm_p': vcf.Constant(0.999, 0),
    'lm': vcf.Constant(0.01, 0),
}
td_agent1 = vcf.Agent(vcf.TD(len(phi1)), phi1, td_params)

phi2 = vcf.BiasUnit()
td_params2 = {
    'alpha': 0.01,
    'gm': 0.9,
    'gm_p': 0.9,
    'lm': 0.9,
}
td_agent2 = vcf.Agent(vcf.TD(len(phi2)), phi2, td_params2)

# Define the agents to update
agents = [control, td_agent1, td_agent2]

# Set up the experiment
experiment = vcf.PolicyEvaluation(env, control, agents=agents)
tiling_3 = vcf.UniformTiling(env.observation_space, 11)
tiling_4 = vcf.UniformTiling(env.observation_space, 19)

# Convert tile indices to binary vectors
bvec_1 = vcf.BinaryVector(tiling_1.high, tiling_1)
bvec_2 = vcf.BinaryVector(tiling_2.high, tiling_2)
bvec_3 = vcf.BinaryVector(tiling_3.high, tiling_3)
bvec_4 = vcf.BinaryVector(tiling_4.high, tiling_4)

# Concatenate binary vectors
phi = vcf.Union(bias_unit, bvec_1, bvec_2, bvec_3, bvec_4)

# Set up agents
nf = len(phi)
na = env.action_space.n

# Control agent, value function learner, delta agent, delta-squared agent
control_agent = vcf.DiscreteQ(nf, na, epsilon=0.05)
value_agent = vcf.TD(nf)
delta_agent = vcf.TD(nf)
square_agent = vcf.TD(nf)

# Initialize the control weights: zero them out, then add random normal noise
control_agent.w *= 0
control_agent.w += np.random.normal(0, 1, control_agent.w.shape)

# Fixed parameters
alpha_0 = 0.05
gamma = 0.999
lmbda = 0.0

# Set up tracking
episodes = []
stepcount = []