def test_collect_Q():
    np.random.seed(88)

    mdp = GridWorld(3, 3, (2, 2))

    eps = Parameter(0.1)
    pi = EpsGreedy(eps)
    alpha = Parameter(0.1)
    agent = SARSA(pi, mdp.info, alpha)

    callback_q = CollectQ(agent.Q)
    callback_max_q = CollectMaxQ(agent.Q, np.array([2]))

    core = Core(agent, mdp, callbacks=[callback_q, callback_max_q])
    core.learn(n_steps=1000, n_steps_per_fit=1, quiet=True)

    V_test = np.array([2.4477574, 0.02246188, 1.6210059, 6.01867052])
    V = callback_q.get()[-1]

    assert np.allclose(V[0, :], V_test)

    V_max = np.array([np.max(x[2, :], axis=-1) for x in callback_q.get()])
    max_q = np.array(callback_max_q.get())

    assert np.allclose(V_max, max_q)
def experiment():
    np.random.seed()

    # MDP
    mdp = generate_simple_chain(state_n=5, goal_states=[2], prob=.8, rew=1,
                                gamma=.9)

    # Policy
    epsilon = Parameter(value=.15)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = Parameter(value=.2)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = QLearning(pi, mdp.info, agent_params)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=10000, n_steps_per_fit=1)
def experiment(boosted):
    np.random.seed(20)

    # MDP
    mdp = CarOnHill()

    # Policy
    epsilon = Parameter(value=1)
    pi = EpsGreedy(epsilon=epsilon)

    # Approximator
    if not boosted:
        approximator_params = dict(
            input_shape=mdp.info.observation_space.shape,
            n_actions=mdp.info.action_space.n,
            n_estimators=50,
            min_samples_split=5,
            min_samples_leaf=2)
    else:
        approximator_params = dict(
            input_shape=mdp.info.observation_space.shape,
            n_actions=mdp.info.action_space.n,
            n_models=3,
            prediction='sum',
            n_estimators=50,
            min_samples_split=5,
            min_samples_leaf=2)
    approximator = ExtraTreesRegressor

    # Agent
    algorithm_params = dict(n_iterations=3, boosted=boosted, quiet=True)
    fit_params = dict()
    agent_params = {'approximator_params': approximator_params,
                    'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = FQI(approximator, pi, mdp.info, agent_params)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=50, n_episodes_per_fit=50, quiet=True)

    # Test
    test_epsilon = Parameter(0)
    agent.policy.set_epsilon(test_epsilon)

    initial_states = np.zeros((9, 2))
    cont = 0
    for i in range(-8, 9, 8):
        for j in range(-8, 9, 8):
            initial_states[cont, :] = [0.125 * i, 0.375 * j]
            cont += 1

    dataset = core.evaluate(initial_states=initial_states, quiet=True)

    return np.mean(compute_J(dataset, mdp.info.gamma))
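# Usage sketch, not in the original source: with the imports assumed by the
# function above, the boosted and plain FQI variants can be compared on their
# mean discounted return.
if __name__ == '__main__':
    J_plain = experiment(boosted=False)
    J_boosted = experiment(boosted=True)
    print('FQI:', J_plain, '- Boosted FQI:', J_boosted)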
def experiment(policy, value):
    np.random.seed(45)

    # MDP
    mdp = generate_taxi('tests/taxi/grid.txt', rew=(0, 1, 5))

    # Policy
    pi = policy(Parameter(value=value))

    # Agent
    learning_rate = Parameter(value=.15)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = SARSA(pi, mdp.info, agent_params)

    # Algorithm
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset]
    core = Core(agent, mdp, callbacks)

    # Train
    n_steps = 2000
    core.learn(n_steps=n_steps, n_steps_per_fit=1, quiet=True)

    return np.sum(np.array(collect_dataset.get())[:, 2]) / float(n_steps)
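# Usage sketch, not in the original source: `policy` is a policy class taking a
# single exploration parameter; EpsGreedy and Boltzmann fit this signature, and
# the values passed below are illustrative.
if __name__ == '__main__':
    print('EpsGreedy:', experiment(EpsGreedy, .15))
    print('Boltzmann:', experiment(Boltzmann, 1.))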
def learn(alg, alg_params):
    mdp = CarOnHill()
    np.random.seed(1)

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Approximator
    approximator_params = dict(input_shape=mdp.info.observation_space.shape,
                               n_actions=mdp.info.action_space.n,
                               n_estimators=50,
                               min_samples_split=5,
                               min_samples_leaf=2)
    approximator = ExtraTreesRegressor

    # Agent
    agent = alg(approximator, pi, mdp.info,
                approximator_params=approximator_params, **alg_params)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=5, n_episodes_per_fit=5)

    test_epsilon = Parameter(0.75)
    agent.policy.set_epsilon(test_epsilon)
    dataset = core.evaluate(n_episodes=2)

    return np.mean(compute_J(dataset, mdp.info.gamma))
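# Usage sketch, not in the original source: `alg` is assumed to be a batch
# algorithm with an FQI-like constructor, and `alg_params` holds its keyword
# arguments; `n_iterations=10` is an illustrative value.
if __name__ == '__main__':
    J = learn(FQI, dict(n_iterations=10))
    print('Mean discounted return:', J)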
def experiment():
    np.random.seed()

    # MDP
    mdp = generate_simple_chain(state_n=5, goal_states=[2], prob=.8, rew=1,
                                gamma=.9)

    # Policy
    epsilon = Parameter(value=.15)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = Parameter(value=.2)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = QLearning(pi, mdp.info, **algorithm_params)

    # Core
    core = Core(agent, mdp)

    # Initial policy evaluation
    dataset = core.evaluate(n_steps=1000)
    J = np.mean(compute_J(dataset, mdp.info.gamma))
    print('J start:', J)

    # Train
    core.learn(n_steps=10000, n_steps_per_fit=1)

    # Final policy evaluation
    dataset = core.evaluate(n_steps=1000)
    J = np.mean(compute_J(dataset, mdp.info.gamma))
    print('J final:', J)
def experiment2():
    np.random.seed(3)
    print('mushroom :')

    # MDP
    mdp = generate_simple_chain(state_n=5, goal_states=[2], prob=.8, rew=1,
                                gamma=.9)

    # Policy
    epsilon = Parameter(value=.15)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = Parameter(value=.2)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = QLearning(pi, mdp.info, agent_params)

    # Algorithm
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    dataset = collect_dataset.get()

    return agent.Q.table
def test_dataset_utils():
    np.random.seed(88)

    mdp = GridWorld(3, 3, (2, 2))

    epsilon = Parameter(value=0.)
    alpha = Parameter(value=0.)
    pi = EpsGreedy(epsilon=epsilon)
    agent = SARSA(pi, mdp.info, alpha)

    core = Core(agent, mdp)

    dataset = core.evaluate(n_episodes=10)

    J = compute_J(dataset, mdp.info.gamma)
    J_test = np.array([1.16106307e-03, 2.78128389e-01, 1.66771817e+00,
                       3.09031544e-01, 1.19725152e-01, 9.84770902e-01,
                       1.06111661e-02, 2.05891132e+00, 2.28767925e+00,
                       4.23911583e-01])
    assert np.allclose(J, J_test)

    L = episodes_length(dataset)
    L_test = np.array([87, 35, 18, 34, 43, 23, 66, 16, 15, 31])
    assert np.array_equal(L, L_test)

    dataset_ep = select_first_episodes(dataset, 3)
    J = compute_J(dataset_ep, mdp.info.gamma)
    assert np.allclose(J, J_test[:3])

    L = episodes_length(dataset_ep)
    assert np.allclose(L, L_test[:3])

    samples = select_random_samples(dataset, 2)
    s, a, r, ss, ab, last = parse_dataset(samples)
    s_test = np.array([[6.], [1.]])
    a_test = np.array([[0.], [1.]])
    r_test = np.zeros(2)
    ss_test = np.array([[3], [4]])
    ab_test = np.zeros(2)
    last_test = np.zeros(2)
    assert np.array_equal(s, s_test)
    assert np.array_equal(a, a_test)
    assert np.array_equal(r, r_test)
    assert np.array_equal(ss, ss_test)
    assert np.array_equal(ab, ab_test)
    assert np.array_equal(last, last_test)

    index = np.sum(L_test[:2]) + L_test[2] // 2
    min_J, max_J, mean_J, n_episodes = compute_metrics(dataset[:index],
                                                       mdp.info.gamma)
    assert min_J == 0.0
    assert max_J == 0.0011610630703530948
    assert mean_J == 0.0005805315351765474
    assert n_episodes == 2
def experiment(n_epochs, n_episodes):
    np.random.seed()

    # MDP
    n_steps = 5000
    mdp = InvertedPendulum(horizon=n_steps)

    # Agent
    n_tilings = 10
    alpha_theta = Parameter(5e-3 / n_tilings)
    alpha_omega = Parameter(0.5 / n_tilings)
    alpha_v = Parameter(0.5 / n_tilings)
    tilings = Tiles.generate(n_tilings, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high + 1e-3)

    phi = Features(tilings=tilings)

    input_shape = (phi.size,)

    mu = Regressor(LinearApproximator, input_shape=input_shape,
                   output_shape=mdp.info.action_space.shape)

    sigma = 1e-1 * np.eye(1)
    policy = GaussianPolicy(mu, sigma)

    agent = COPDAC_Q(policy, mu, mdp.info,
                     alpha_theta, alpha_omega, alpha_v,
                     value_function_features=phi,
                     policy_features=phi)

    # Train
    dataset_callback = CollectDataset()
    visualization_callback = Display(agent._V, mu,
                                     mdp.info.observation_space.low,
                                     mdp.info.observation_space.high,
                                     phi, phi)
    core = Core(agent, mdp, callbacks=[dataset_callback])

    for i in range(n_epochs):
        core.learn(n_episodes=n_episodes, n_steps_per_fit=1, render=False)
        J = compute_J(dataset_callback.get(), gamma=1.0)
        dataset_callback.clean()
        visualization_callback()
        print('Mean Reward at iteration ' + str(i) + ': ' +
              str(np.sum(J) / n_steps / n_episodes))

    print('Press a button to visualize the pendulum...')
    input()
    sigma = 1e-8 * np.eye(1)
    policy.set_sigma(sigma)
    core.evaluate(n_steps=n_steps, render=True)
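# Usage sketch, not in the original source: the epoch and episode counts below
# are illustrative; the experiment prints the average reward per epoch and then
# renders the learned pendulum policy.
if __name__ == '__main__':
    experiment(n_epochs=10, n_episodes=100)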
def test_rq_learning():
    pi, mdp, _ = initialize()
    agent = RQLearning(pi, mdp.info, Parameter(.1), beta=Parameter(.5))

    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_q = np.array([[0.32411217, 2.9698436, 0.46474438, 1.10269504],
                       [2.99505139, 5.217031, 0.40933461, 0.37687883],
                       [0.41942675, 0.32363486, 0., 4.68559],
                       [0., 0., 0., 0.]])

    assert np.allclose(agent.Q.table, test_q)

    agent = RQLearning(pi, mdp.info, Parameter(.1), delta=Parameter(.5))

    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_q = np.array([[1.04081115e-2, 5.14662188e-1, 1.73951634e-2,
                        1.24081875e-01],
                       [0., 2.71, 1.73137500e-4, 4.10062500e-6],
                       [0., 4.50000000e-2, 0., 4.68559],
                       [0., 0., 0., 0.]])

    assert np.allclose(agent.Q.table, test_q)

    agent = RQLearning(pi, mdp.info, Parameter(.1), off_policy=True,
                       beta=Parameter(.5))

    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_q = np.array([[3.55204022, 4.54235939, 3.42601165, 2.95170908],
                       [2.73877031, 3.439, 2.42031528, 2.86634531],
                       [3.43274708, 3.8592342, 3.72637395, 5.217031],
                       [0., 0., 0., 0.]])

    assert np.allclose(agent.Q.table, test_q)

    agent = RQLearning(pi, mdp.info, Parameter(.1), off_policy=True,
                       delta=Parameter(.5))

    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_q = np.array([[0.18947806, 1.57782254, 0.21911489, 1.05197011],
                       [0.82309759, 5.217031, 0.04167492, 0.61472604],
                       [0.23620541, 0.59828262, 1.25299991, 5.217031],
                       [0., 0., 0., 0.]])

    assert np.allclose(agent.Q.table, test_q)
def experiment():
    np.random.seed()

    # MDP
    mdp = CarOnHill()

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Approximator
    approximator_params = dict(input_shape=mdp.info.observation_space.shape,
                               n_actions=mdp.info.action_space.n,
                               n_estimators=50,
                               min_samples_split=5,
                               min_samples_leaf=2)
    approximator = ExtraTreesRegressor

    # Agent
    algorithm_params = dict(n_iterations=20)
    agent = FQI(approximator, pi, mdp.info,
                approximator_params=approximator_params, **algorithm_params)

    # Algorithm
    core = Core(agent, mdp)

    # Render
    core.evaluate(n_episodes=1, render=True)

    # Train
    core.learn(n_episodes=1000, n_episodes_per_fit=1000)

    # Test
    test_epsilon = Parameter(0.)
    agent.policy.set_epsilon(test_epsilon)

    initial_states = np.zeros((289, 2))
    cont = 0
    for i in range(-8, 9):
        for j in range(-8, 9):
            initial_states[cont, :] = [0.125 * i, 0.375 * j]
            cont += 1

    dataset = core.evaluate(initial_states=initial_states)

    # Render
    core.evaluate(n_episodes=3, render=True)

    return np.mean(compute_J(dataset, mdp.info.gamma))
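# Usage sketch, not in the original source: the experiment returns the mean
# discounted return of the learned FQI policy over the 17x17 grid of initial
# states built above.
if __name__ == '__main__':
    print('Mean J:', experiment())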
def test_r_learning():
    pi, mdp, _ = initialize()
    agent = RLearning(pi, mdp.info, Parameter(.1), Parameter(.5))

    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_q = np.array([[-6.19137991, -3.9368055, -5.11544257, -3.43673781],
                       [-2.52319391, 1.92201829, -2.77602918, -2.45972955],
                       [-5.38824415, -2.43019918, -1.09965936, 2.04202511],
                       [0., 0., 0., 0.]])

    assert np.allclose(agent.Q.table, test_q)
def test_r_learning():
    alg = RLearning(pi, mdp.info, Parameter(.1), Parameter(.5))
    # Use the builtin float: np.float is removed in recent NumPy releases.
    alg.Q.table = np.arange(np.prod(mdp.info.size)).reshape(
        mdp.info.size).astype(float)

    alg._update(0, 1, 100, 1, 0)
    alg._update(1, 0, 10, 3, 1)
    alg._update(3, 1, 50, 3, 0)
    alg._update(2, 2, -100, 3, 1)

    test_q = np.array([[0, 11.6, 2, 3],
                       [-.17, 5, 6, 7],
                       [8, 9, -5.77, 11],
                       [12, 13.43, 14, 15]])

    assert np.allclose(alg.Q.table, test_q)
def test_lspi():
    mdp = CartPole()
    np.random.seed(1)

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    basis = [PolynomialBasis()]
    features = Features(basis_list=basis)

    approximator_params = dict(input_shape=(features.size,),
                               output_shape=(mdp.info.action_space.n,),
                               n_actions=mdp.info.action_space.n)
    agent = LSPI(pi, mdp.info, fit_params=dict(),
                 approximator_params=approximator_params, features=features)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=100, n_episodes_per_fit=100)

    w = agent.approximator.get_weights()
    w_test = np.array([-2.23880597, -2.27427603, -2.25])

    assert np.allclose(w, w_test)
def test_sarsa_lambda_continuous_nn():
    pi, _, mdp_continuous = initialize()
    mdp_continuous.seed(1)
    features = Features(
        n_outputs=mdp_continuous.info.observation_space.shape[0])

    approximator_params = dict(
        input_shape=(features.size,),
        output_shape=(mdp_continuous.info.action_space.n,),
        network=Network,
        n_actions=mdp_continuous.info.action_space.n)
    agent = SARSALambdaContinuous(TorchApproximator, pi, mdp_continuous.info,
                                  Parameter(.1), .9, features=features,
                                  approximator_params=approximator_params)

    core = Core(agent, mdp_continuous)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_w = np.array([-0.18968964, 0.4296857, 0.52967095, 0.5674884,
                       -0.12784956, -0.10572472, -0.14546978, -0.67001086,
                       -0.93925357])

    assert np.allclose(agent.Q.get_weights(), test_w)
def test_sarsa_lambda_continuous_linear():
    pi, _, mdp_continuous = initialize()
    mdp_continuous.seed(1)
    n_tilings = 1
    tilings = Tiles.generate(n_tilings, [2, 2],
                             mdp_continuous.info.observation_space.low,
                             mdp_continuous.info.observation_space.high)
    features = Features(tilings=tilings)

    approximator_params = dict(
        input_shape=(features.size,),
        output_shape=(mdp_continuous.info.action_space.n,),
        n_actions=mdp_continuous.info.action_space.n)
    agent = SARSALambdaContinuous(LinearApproximator, pi, mdp_continuous.info,
                                  Parameter(.1), .9, features=features,
                                  approximator_params=approximator_params)

    core = Core(agent, mdp_continuous)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_w = np.array([-16.38428419, 0., -14.31250136, 0., -15.68571525, 0.,
                       -10.15663821, 0., -15.0545445, 0., -8.3683605, 0.])

    assert np.allclose(agent.Q.get_weights(), test_w)
def test_boltzmann():
    np.random.seed(88)
    beta = Parameter(0.1)
    pi = Boltzmann(beta)

    Q = Table((10, 3))
    Q.table = np.random.randn(10, 3)
    pi.set_q(Q)

    s = np.array([2])
    a = np.array([1])

    p_s = pi(s)
    p_s_test = np.array([0.30676679, 0.36223227, 0.33100094])
    assert np.allclose(p_s, p_s_test)

    p_sa = pi(s, a)
    p_sa_test = np.array([0.36223227])
    assert np.allclose(p_sa, p_sa_test)

    a = pi.draw_action(s)
    a_test = 2
    assert a.item() == a_test

    beta_2 = LinearParameter(0.2, 0.1, 2)
    pi.set_beta(beta_2)
    p_sa_2 = pi(s, a)
    assert p_sa_2 < p_sa

    pi.update(s, a)
    p_sa_3 = pi(s, a)
    p_sa_3_test = np.array([0.33100094])
    assert np.allclose(p_sa_3, p_sa_3_test)
def test_sarsa_lambda_continuous():
    n_tilings = 1
    tilings = Tiles.generate(n_tilings, [2, 2],
                             mdp_continuous.info.observation_space.low,
                             mdp_continuous.info.observation_space.high)
    features = Features(tilings=tilings)

    approximator_params = dict(
        input_shape=(features.size,),
        output_shape=(mdp_continuous.info.action_space.n,),
        n_actions=mdp_continuous.info.action_space.n)
    alg = SARSALambdaContinuous(LinearApproximator, pi, mdp_continuous.info,
                                Parameter(.1), .9, features=features,
                                approximator_params=approximator_params)

    s_1 = np.linspace(mdp_continuous.info.observation_space.low[0],
                      mdp_continuous.info.observation_space.high[0], 10)
    s_2 = np.linspace(mdp_continuous.info.observation_space.low[1],
                      mdp_continuous.info.observation_space.high[1], 10)
    for i in s_1:
        for j in s_2:
            alg._update(np.array([i, j]), np.array([1]), 100,
                        np.array([0, 0]), 0)

    test_w = np.array([[0, 0, 0, 0],
                       [320.43, 399.8616, 340.397, 417.218],
                       [0, 0, 0, 0]])

    assert np.allclose(alg.Q.get_weights(), test_w.ravel())
def test_true_online_sarsa_lambda():
    pi, _, mdp_continuous = initialize()
    mdp_continuous.seed(1)
    n_tilings = 1
    tilings = Tiles.generate(n_tilings, [2, 2],
                             mdp_continuous.info.observation_space.low,
                             mdp_continuous.info.observation_space.high)
    features = Features(tilings=tilings)

    approximator_params = dict(
        input_shape=(features.size,),
        output_shape=(mdp_continuous.info.action_space.n,),
        n_actions=mdp_continuous.info.action_space.n)
    agent = TrueOnlineSARSALambda(pi, mdp_continuous.info,
                                  Parameter(.1), .9, features=features,
                                  approximator_params=approximator_params)

    core = Core(agent, mdp_continuous)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_w = np.array([-17.27410736, 0., -15.04386343, 0., -16.6551805, 0.,
                       -11.31383707, 0., -16.11782002, 0., -9.6927357, 0.])

    assert np.allclose(agent.Q.get_weights(), test_w)
def experiment(algorithm_class, decay_exp):
    np.random.seed()

    # MDP
    p = np.load('chain_structure/p.npy')
    rew = np.load('chain_structure/rew.npy')
    mdp = FiniteMDP(p, rew, gamma=.9)

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = ExponentialDecayParameter(value=1., decay_exp=decay_exp,
                                              size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = algorithm_class(pi, mdp.info, **algorithm_params)

    # Algorithm
    collect_Q = CollectQ(agent.approximator)
    callbacks = [collect_Q]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=20000, n_steps_per_fit=1, quiet=True)

    Qs = collect_Q.get_values()

    return Qs
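# Usage sketch, not in the original source: the experiment is parameterized by
# the algorithm class and the decay exponent of the learning rate; QLearning
# with decay_exp=.51 is an illustrative choice.
if __name__ == '__main__':
    Qs = experiment(QLearning, .51)
    print('Collected Q snapshots:', len(Qs))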
def test_true_online_sarsa_lambda():
    n_tilings = 1
    tilings = Tiles.generate(n_tilings, [2, 2],
                             mdp_continuous.info.observation_space.low,
                             mdp_continuous.info.observation_space.high)
    features = Features(tilings=tilings)

    approximator_params = dict(
        input_shape=(features.size,),
        output_shape=(mdp_continuous.info.action_space.n,),
        n_actions=mdp_continuous.info.action_space.n)
    alg = TrueOnlineSARSALambda(pi, mdp_continuous.info,
                                Parameter(.1), .9, features=features,
                                approximator_params=approximator_params)

    s_1 = np.linspace(mdp_continuous.info.observation_space.low[0],
                      mdp_continuous.info.observation_space.high[0], 10)
    s_2 = np.linspace(mdp_continuous.info.observation_space.low[1],
                      mdp_continuous.info.observation_space.high[1], 10)
    for i in s_1:
        for j in s_2:
            alg._update(np.array([i, j]), np.array([1]), 100,
                        np.array([0, 0]), 0)

    test_w = np.array([[0, 0, 0, 0],
                       [927.0283, 798.57594, 876.8018, 705.227],
                       [0, 0, 0, 0]])

    assert np.allclose(alg.Q.get_weights(), test_w.ravel())
def test_eps_greedy():
    np.random.seed(88)
    eps = Parameter(0.1)
    pi = EpsGreedy(eps)

    Q = Table((10, 3))
    Q.table = np.random.randn(10, 3)
    pi.set_q(Q)

    s = np.array([2])
    a = np.array([1])

    p_s = pi(s)
    p_s_test = np.array([0.03333333, 0.93333333, 0.03333333])
    assert np.allclose(p_s, p_s_test)

    p_sa = pi(s, a)
    p_sa_test = np.array([0.93333333])
    assert np.allclose(p_sa, p_sa_test)

    a = pi.draw_action(s)
    a_test = 1
    assert a.item() == a_test

    eps_2 = LinearParameter(0.2, 0.1, 2)
    pi.set_epsilon(eps_2)
    p_sa_2 = pi(s, a)
    assert p_sa_2 < p_sa

    pi.update(s, a)
    pi.update(s, a)
    p_sa_3 = pi(s, a)
    print(eps_2.get_value())
    assert p_sa_3 == p_sa
def experiment():
    np.random.seed()

    # MDP
    mdp = CartPole()

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    basis = [PolynomialBasis()]

    s1 = np.array([-np.pi, 0, np.pi]) * .25
    s2 = np.array([-1, 0, 1])
    for i in s1:
        for j in s2:
            basis.append(GaussianRBF(np.array([i, j]), np.array([1.])))
    features = Features(basis_list=basis)

    fit_params = dict()
    approximator_params = dict(input_shape=(features.size,),
                               output_shape=(mdp.info.action_space.n,),
                               n_actions=mdp.info.action_space.n)
    agent = LSPI(pi, mdp.info, fit_params=fit_params,
                 approximator_params=approximator_params, features=features)

    # Algorithm
    core = Core(agent, mdp)
    core.evaluate(n_episodes=3, render=True)

    # Train
    core.learn(n_episodes=100, n_episodes_per_fit=100)

    # Test
    test_epsilon = Parameter(0.)
    agent.policy.set_epsilon(test_epsilon)

    dataset = core.evaluate(n_episodes=1, quiet=True)

    core.evaluate(n_steps=100, render=True)

    return np.mean(episodes_length(dataset))
def __init__(self, epsilon=None):
    if epsilon is None:
        epsilon = Parameter(0.)

    super(WeightedGaussianPolicy, self).__init__()

    self._epsilon = epsilon
    self._evaluation = False
def experiment():
    np.random.seed(3)

    # MDP
    mdp = generate_simple_chain(state_n=5, goal_states=[2], prob=.8, rew=1,
                                gamma=.9)
    action_space = mdp._mdp_info.action_space
    observation_space = mdp._mdp_info.observation_space
    gamma = mdp._mdp_info.gamma

    # Model Block
    model_block = MBlock(env=mdp, render=False)

    # Policy
    epsilon = Parameter(value=1)
    pi = EpsGreedy(epsilon=epsilon)
    table = Table(mdp.info.size)
    pi.set_q(table)

    # Agents
    mdp_info_agent1 = MDPInfo(observation_space=observation_space,
                              action_space=spaces.Discrete(5), gamma=1,
                              horizon=20)
    mdp_info_agent2 = MDPInfo(observation_space=spaces.Discrete(5),
                              action_space=action_space, gamma=gamma,
                              horizon=10)
    agent1 = SimpleAgent(name='HIGH', mdp_info=mdp_info_agent1, policy=pi)
    agent2 = SimpleAgent(name='LOW', mdp_info=mdp_info_agent2, policy=pi)

    # Control Blocks
    control_block1 = ControlBlock(wake_time=10, agent=agent1,
                                  n_eps_per_fit=None, n_steps_per_fit=1)
    control_block2 = ControlBlock(wake_time=1, agent=agent2,
                                  n_eps_per_fit=None, n_steps_per_fit=1)

    # Algorithm
    blocks = [model_block, control_block1, control_block2]
    order = [0, 1, 2]
    model_block.add_input(control_block2)
    control_block1.add_input(model_block)
    control_block1.add_reward(model_block)
    control_block2.add_input(control_block1)
    control_block2.add_reward(model_block)
    computational_graph = ComputationalGraph(blocks=blocks, order=order)
    core = HierarchicalCore(computational_graph)

    # Train
    core.learn(n_steps=40, quiet=True)

    return
def experiment():
    np.random.seed(3)
    print('hierarchical :')

    # MDP
    mdp = generate_simple_chain(state_n=5, goal_states=[2], prob=.8, rew=1,
                                gamma=.9)

    # Model Block
    model_block = MBlock(env=mdp, render=False)

    # Policy
    epsilon = Parameter(value=.15)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = Parameter(value=.2)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = QLearning(pi, mdp.info, agent_params)

    # Control Block
    control_block = ControlBlock(wake_time=1, agent=agent, n_eps_per_fit=None,
                                 n_steps_per_fit=1)

    # Algorithm
    blocks = [model_block, control_block]
    order = [0, 1]
    model_block.add_input(control_block)
    control_block.add_input(model_block)
    control_block.add_reward(model_block)
    computational_graph = ComputationalGraph(blocks=blocks, order=order)
    core = HierarchicalCore(computational_graph)

    # Train
    core.learn(n_steps=100, quiet=True)

    return agent.Q.table
def __init__(self, n_approximators, epsilon=None):
    if epsilon is None:
        epsilon = Parameter(0.)

    super(WeightedPolicy, self).__init__()

    self._n_approximators = n_approximators
    self._epsilon = epsilon
    self._evaluation = False
def experiment(alpha):
    gym.logger.setLevel(0)
    np.random.seed(386)

    # MDP
    mdp = Gym(name='MountainCar-v0', horizon=10000, gamma=1.)
    mdp.seed(201)

    # Policy
    epsilon = Parameter(value=0.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = Parameter(alpha)
    tilings = Tiles.generate(10, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high)
    features = Features(tilings=tilings)

    approximator_params = dict(input_shape=(features.size,),
                               output_shape=(mdp.info.action_space.n,),
                               n_actions=mdp.info.action_space.n)
    algorithm_params = {'learning_rate': learning_rate, 'lambda': .9}
    fit_params = dict()
    agent_params = {'approximator_params': approximator_params,
                    'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = TrueOnlineSARSALambda(pi, mdp.info, agent_params, features)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=2000, n_steps_per_fit=1, quiet=True)

    # Test
    test_epsilon = Parameter(0.)
    agent.policy.set_epsilon(test_epsilon)

    initial_states = np.array([[0., 0.], [.1, .1]])
    dataset = core.evaluate(initial_states=initial_states, quiet=True)

    return np.mean(compute_J(dataset, 1.))
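# Usage sketch, not in the original source: `alpha` is the learning rate of
# True Online SARSA(lambda); the values below are illustrative.
if __name__ == '__main__':
    for alpha in [.1, .2]:
        print('alpha =', alpha, '-> J =', experiment(alpha))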
def __call__(self, **kwargs):
    dataset = kwargs.get('dataset')
    for step in dataset:
        last = step[-1]
        if last:
            self.counter += 1

    if self.counter % 50 == 0 and self.last_counter != self.counter:
        new_epsilon = Parameter(self._policy._epsilon.get_value() / 1.01)
        self._policy.set_epsilon(new_epsilon)
        self.last_counter = self.counter
def __init__(self, n_approximators, epsilon=None):
    if epsilon is None:
        epsilon = Parameter(0.)

    super(BootPolicy, self).__init__()

    self._n_approximators = n_approximators
    self._epsilon = epsilon
    self._evaluation = False
    self._idx = None
    self.plotter = None