def experiment():
    np.random.seed()

    # MDP
    mdp = generate_simple_chain(state_n=5, goal_states=[2], prob=.8, rew=1,
                                gamma=.9)

    # Policy
    epsilon = Parameter(value=.15)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = Parameter(value=.2)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = QLearning(pi, mdp.info, **algorithm_params)

    # Core
    core = Core(agent, mdp)

    # Initial policy evaluation
    dataset = core.evaluate(n_steps=1000)
    J = np.mean(compute_J(dataset, mdp.info.gamma))
    print('J start:', J)

    # Train
    core.learn(n_steps=10000, n_steps_per_fit=1)

    # Final policy evaluation
    dataset = core.evaluate(n_steps=1000)
    J = np.mean(compute_J(dataset, mdp.info.gamma))
    print('J final:', J)
def experiment(algorithm_class, decay_exp):
    np.random.seed()

    # MDP
    p = np.load('chain_structure/p.npy')
    rew = np.load('chain_structure/rew.npy')
    mdp = FiniteMDP(p, rew, gamma=.9)

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = ExponentialDecayParameter(value=1., decay_exp=decay_exp,
                                              size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = algorithm_class(pi, mdp.info, **algorithm_params)

    # Algorithm
    collect_Q = CollectQ(agent.approximator)
    callbacks = [collect_Q]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=20000, n_steps_per_fit=1, quiet=True)

    Qs = collect_Q.get_values()

    return Qs
def test_collect_Q():
    np.random.seed(88)

    mdp = GridWorld(3, 3, (2, 2))

    eps = Parameter(0.1)
    pi = EpsGreedy(eps)
    alpha = Parameter(0.1)
    agent = SARSA(pi, mdp.info, alpha)

    callback_q = CollectQ(agent.Q)
    callback_max_q = CollectMaxQ(agent.Q, np.array([2]))

    core = Core(agent, mdp, callbacks=[callback_q, callback_max_q])
    core.learn(n_steps=1000, n_steps_per_fit=1, quiet=True)

    V_test = np.array([2.4477574, 0.02246188, 1.6210059, 6.01867052])
    V = callback_q.get()[-1]

    assert np.allclose(V[0, :], V_test)

    V_max = np.array([np.max(x[2, :], axis=-1) for x in callback_q.get()])
    max_q = np.array(callback_max_q.get())

    assert np.allclose(V_max, max_q)
def experiment_others(alg, decay_exp):
    np.random.seed()

    # MDP
    grid_map = "simple_gridmap.txt"
    mdp = GridWorldGenerator(grid_map=grid_map)

    # Policy
    epsilon = ExponentialDecayParameter(value=1, decay_exp=.5,
                                        size=mdp.info.observation_space.size)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    alpha = ExponentialDecayParameter(value=1, decay_exp=decay_exp,
                                      size=mdp.info.size)
    algorithm_params = dict(learning_rate=alpha)
    fit_params = dict()
    agent_params = {'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = alg(pi, mdp.info, agent_params)

    # Algorithm
    collect_max_Q = CollectMaxQ(agent.Q,
                                mdp.convert_to_int(mdp._start, mdp._width))
    collect_dataset = CollectDataset()
    callbacks = [collect_max_Q, collect_dataset]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=10000, n_steps_per_fit=1, quiet=True)

    _, _, reward, _, _, _ = parse_dataset(collect_dataset.get())
    max_Qs = collect_max_Q.get_values()

    return reward, max_Qs
def experiment2():
    np.random.seed(3)
    print('mushroom :')

    # MDP
    mdp = generate_simple_chain(state_n=5, goal_states=[2], prob=.8, rew=1,
                                gamma=.9)

    # Policy
    epsilon = Parameter(value=.15)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = Parameter(value=.2)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = QLearning(pi, mdp.info, agent_params)

    # Algorithm
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    dataset = collect_dataset.get()

    return agent.Q.table
def experiment():
    np.random.seed()

    # MDP
    mdp = generate_simple_chain(state_n=5, goal_states=[2], prob=.8, rew=1,
                                gamma=.9)

    # Policy
    epsilon = Parameter(value=.15)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = Parameter(value=.2)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = QLearning(pi, mdp.info, agent_params)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=10000, n_steps_per_fit=1)
def learn(alg, alg_params):
    mdp = CarOnHill()
    np.random.seed(1)

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Approximator
    approximator_params = dict(input_shape=mdp.info.observation_space.shape,
                               n_actions=mdp.info.action_space.n,
                               n_estimators=50,
                               min_samples_split=5,
                               min_samples_leaf=2)
    approximator = ExtraTreesRegressor

    # Agent
    agent = alg(approximator, pi, mdp.info,
                approximator_params=approximator_params, **alg_params)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=5, n_episodes_per_fit=5)

    test_epsilon = Parameter(0.75)
    agent.policy.set_epsilon(test_epsilon)
    dataset = core.evaluate(n_episodes=2)

    return np.mean(compute_J(dataset, mdp.info.gamma))
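# A hypothetical call to the learn() helper above, assuming a batch algorithm
# that accepts n_iterations as in the FQI snippets of this collection; the
# value 3 is an arbitrary small choice for a quick run.
J = learn(FQI, dict(n_iterations=3))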
def test_lspi():
    mdp = CartPole()
    np.random.seed(1)

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    basis = [PolynomialBasis()]
    features = Features(basis_list=basis)

    approximator_params = dict(input_shape=(features.size,),
                               output_shape=(mdp.info.action_space.n,),
                               n_actions=mdp.info.action_space.n)
    agent = LSPI(pi, mdp.info, fit_params=dict(),
                 approximator_params=approximator_params, features=features)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=100, n_episodes_per_fit=100)

    w = agent.approximator.get_weights()
    w_test = np.array([-2.23880597, -2.27427603, -2.25])

    assert np.allclose(w, w_test)
def experiment(boosted):
    np.random.seed(20)

    # MDP
    mdp = CarOnHill()

    # Policy
    epsilon = Parameter(value=1)
    pi = EpsGreedy(epsilon=epsilon)

    # Approximator
    if not boosted:
        approximator_params = dict(
            input_shape=mdp.info.observation_space.shape,
            n_actions=mdp.info.action_space.n,
            n_estimators=50,
            min_samples_split=5,
            min_samples_leaf=2)
    else:
        approximator_params = dict(
            input_shape=mdp.info.observation_space.shape,
            n_actions=mdp.info.action_space.n,
            n_models=3,
            prediction='sum',
            n_estimators=50,
            min_samples_split=5,
            min_samples_leaf=2)
    approximator = ExtraTreesRegressor

    # Agent
    algorithm_params = dict(n_iterations=3, boosted=boosted, quiet=True)
    fit_params = dict()
    agent_params = {'approximator_params': approximator_params,
                    'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = FQI(approximator, pi, mdp.info, agent_params)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=50, n_episodes_per_fit=50, quiet=True)

    # Test
    test_epsilon = Parameter(0)
    agent.policy.set_epsilon(test_epsilon)

    initial_states = np.zeros((9, 2))
    cont = 0
    for i in range(-8, 9, 8):
        for j in range(-8, 9, 8):
            initial_states[cont, :] = [0.125 * i, 0.375 * j]
            cont += 1

    dataset = core.evaluate(initial_states=initial_states, quiet=True)

    return np.mean(compute_J(dataset, mdp.info.gamma))
def experiment(algorithm_class, decay_exp):
    np.random.seed()

    # MDP
    mdp = GridWorldVanHasselt()

    # Policy
    epsilon = ExponentialDecayParameter(value=1, decay_exp=.5,
                                        size=mdp.info.observation_space.size)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = ExponentialDecayParameter(value=1, decay_exp=decay_exp,
                                              size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = algorithm_class(pi, mdp.info, **algorithm_params)

    # Algorithm
    start = mdp.convert_to_int(mdp._start, mdp._width)
    collect_max_Q = CollectMaxQ(agent.approximator, start)
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset, collect_max_Q]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=10000, n_steps_per_fit=1, quiet=True)

    _, _, reward, _, _, _ = parse_dataset(collect_dataset.get())
    max_Qs = collect_max_Q.get_values()

    return reward, max_Qs
def build_high_level_agent(alg, params, mdp, epsilon):
    pi = EpsGreedy(epsilon=epsilon)
    mdp_info_high = MDPInfo(observation_space=spaces.Discrete(16),
                            action_space=spaces.Discrete(4),
                            gamma=mdp.info.gamma,
                            horizon=100)
    agent = alg(pi, mdp_info_high, **params)

    return agent
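# A hypothetical usage sketch of build_high_level_agent, assuming a TD-style
# constructor like the QLearning one used elsewhere in these snippets and an
# environment `mdp` built beforehand; the parameter values are arbitrary.
high_level_params = dict(learning_rate=Parameter(value=.2))
agent_high = build_high_level_agent(QLearning, high_level_params, mdp,
                                    Parameter(value=.1))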
def test_dataset_utils():
    np.random.seed(88)

    mdp = GridWorld(3, 3, (2, 2))

    epsilon = Parameter(value=0.)
    alpha = Parameter(value=0.)
    pi = EpsGreedy(epsilon=epsilon)

    agent = SARSA(pi, mdp.info, alpha)

    core = Core(agent, mdp)

    dataset = core.evaluate(n_episodes=10)

    J = compute_J(dataset, mdp.info.gamma)
    J_test = np.array([1.16106307e-03, 2.78128389e-01, 1.66771817e+00,
                       3.09031544e-01, 1.19725152e-01, 9.84770902e-01,
                       1.06111661e-02, 2.05891132e+00, 2.28767925e+00,
                       4.23911583e-01])
    assert np.allclose(J, J_test)

    L = episodes_length(dataset)
    L_test = np.array([87, 35, 18, 34, 43, 23, 66, 16, 15, 31])
    assert np.array_equal(L, L_test)

    dataset_ep = select_first_episodes(dataset, 3)
    J = compute_J(dataset_ep, mdp.info.gamma)
    assert np.allclose(J, J_test[:3])

    L = episodes_length(dataset_ep)
    assert np.allclose(L, L_test[:3])

    samples = select_random_samples(dataset, 2)
    s, a, r, ss, ab, last = parse_dataset(samples)
    s_test = np.array([[6.], [1.]])
    a_test = np.array([[0.], [1.]])
    r_test = np.zeros(2)
    ss_test = np.array([[3], [4]])
    ab_test = np.zeros(2)
    last_test = np.zeros(2)
    assert np.array_equal(s, s_test)
    assert np.array_equal(a, a_test)
    assert np.array_equal(r, r_test)
    assert np.array_equal(ss, ss_test)
    assert np.array_equal(ab, ab_test)
    assert np.array_equal(last, last_test)

    index = np.sum(L_test[:2]) + L_test[2] // 2
    min_J, max_J, mean_J, n_episodes = compute_metrics(dataset[:index],
                                                       mdp.info.gamma)
    assert min_J == 0.0
    assert max_J == 0.0011610630703530948
    assert mean_J == 0.0005805315351765474
    assert n_episodes == 2
def build_high_level_agent(alg, params, mdp):
    epsilon = Parameter(value=0.1)
    pi = EpsGreedy(epsilon=epsilon)
    gamma = 1.0
    mdp_info_agentH = MDPInfo(observation_space=spaces.Discrete(400),
                              action_space=spaces.Discrete(8),
                              gamma=gamma,
                              horizon=10000)
    agent = alg(policy=pi, mdp_info=mdp_info_agentH, **params)

    return agent
def learn(alg, alg_params):
    # MDP
    mdp = CartPole()
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    # Policy
    epsilon_random = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon_random)

    # Approximator
    input_shape = mdp.info.observation_space.shape
    approximator_params = dict(
        network=Network if alg is not CategoricalDQN else FeatureNetwork,
        optimizer={'class': optim.Adam, 'params': {'lr': .001}},
        loss=F.smooth_l1_loss,
        input_shape=input_shape,
        output_shape=mdp.info.action_space.size,
        n_actions=mdp.info.action_space.n,
        n_features=2,
        use_cuda=False)

    # Agent
    if alg is not CategoricalDQN:
        agent = alg(TorchApproximator, pi, mdp.info,
                    approximator_params=approximator_params, **alg_params)
    else:
        agent = alg(pi, mdp.info, n_atoms=2, v_min=-1, v_max=1,
                    approximator_params=approximator_params, **alg_params)

    # Algorithm
    core = Core(agent, mdp)

    core.learn(n_steps=500, n_steps_per_fit=5)

    return agent.approximator
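# A hypothetical call to the learn() helper above for a DQN agent; the
# replay-memory and update settings are assumptions chosen to be small,
# loosely following the Acrobot DQN experiments in this collection.
dqn_params = dict(batch_size=50, n_approximators=1, initial_replay_size=50,
                  max_replay_size=500, target_update_frequency=50)
trained_approximator = learn(DQN, dqn_params)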
def experiment1(decay_exp, beta_type):
    np.random.seed()

    # MDP
    p = np.load('p.npy')
    rew = np.load('rew.npy')
    mdp = FiniteMDP(p, rew, gamma=.9)

    # Policy
    epsilon = Parameter(value=1)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    alpha = ExponentialDecayParameter(value=1, decay_exp=decay_exp,
                                      size=mdp.info.size)
    if beta_type == 'Win':
        beta = WindowedVarianceIncreasingParameter(value=1,
                                                   size=mdp.info.size,
                                                   tol=10., window=50)
    else:
        beta = VarianceIncreasingParameter(value=1, size=mdp.info.size,
                                           tol=10.)
    algorithm_params = dict(learning_rate=alpha, beta=beta, off_policy=True)
    fit_params = dict()
    agent_params = {'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = RQLearning(pi, mdp.info, agent_params)

    # Algorithm
    collect_q = CollectQ(agent.Q)
    collect_lr_1 = CollectParameters(beta, np.array([0]))
    collect_lr_5 = CollectParameters(beta, np.array([4]))
    callbacks = [collect_q, collect_lr_1, collect_lr_5]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=20000, n_steps_per_fit=1, quiet=True)

    Qs = collect_q.get_values()
    lr_1 = collect_lr_1.get_values()
    lr_5 = collect_lr_5.get_values()

    return Qs, lr_1, lr_5
def experiment():
    np.random.seed()

    # MDP
    mdp = CarOnHill()

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Approximator
    approximator_params = dict(input_shape=mdp.info.observation_space.shape,
                               n_actions=mdp.info.action_space.n,
                               n_estimators=50,
                               min_samples_split=5,
                               min_samples_leaf=2)
    approximator = ExtraTreesRegressor

    # Agent
    algorithm_params = dict(n_iterations=20)
    agent = FQI(approximator, pi, mdp.info,
                approximator_params=approximator_params, **algorithm_params)

    # Algorithm
    core = Core(agent, mdp)

    # Render
    core.evaluate(n_episodes=1, render=True)

    # Train
    core.learn(n_episodes=1000, n_episodes_per_fit=1000)

    # Test
    test_epsilon = Parameter(0.)
    agent.policy.set_epsilon(test_epsilon)

    initial_states = np.zeros((289, 2))
    cont = 0
    for i in range(-8, 9):
        for j in range(-8, 9):
            initial_states[cont, :] = [0.125 * i, 0.375 * j]
            cont += 1

    dataset = core.evaluate(initial_states=initial_states)

    # Render
    core.evaluate(n_episodes=3, render=True)

    return np.mean(compute_J(dataset, mdp.info.gamma))
def experiment(decay_exp, windowed, tol):
    np.random.seed()

    # MDP
    mdp = GridWorldVanHasselt()

    # Policy
    epsilon = ExponentialDecayParameter(value=1, decay_exp=.5,
                                        size=mdp.info.observation_space.size)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    alpha = ExponentialDecayParameter(value=1, decay_exp=decay_exp,
                                      size=mdp.info.size)
    if windowed:
        beta = WindowedVarianceIncreasingParameter(value=1,
                                                   size=mdp.info.size,
                                                   tol=tol, window=50)
    else:
        beta = VarianceIncreasingParameter(value=1, size=mdp.info.size,
                                           tol=tol)
    algorithm_params = dict(learning_rate=alpha, beta=beta, off_policy=True)
    fit_params = dict()
    agent_params = {'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = RQLearning(pi, mdp.info, agent_params)

    # Algorithm
    collect_max_Q = CollectMaxQ(agent.Q,
                                mdp.convert_to_int(mdp._start, mdp._width))
    collect_dataset = CollectDataset()
    callbacks = [collect_max_Q, collect_dataset]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=10000, n_steps_per_fit=1, quiet=True)

    _, _, reward, _, _, _ = parse_dataset(collect_dataset.get())
    max_Qs = collect_max_Q.get_values()

    return reward, max_Qs
def experiment():
    np.random.seed()

    # MDP
    mdp = CartPole()

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    basis = [PolynomialBasis()]

    s1 = np.array([-np.pi, 0, np.pi]) * .25
    s2 = np.array([-1, 0, 1])
    for i in s1:
        for j in s2:
            basis.append(GaussianRBF(np.array([i, j]), np.array([1.])))
    features = Features(basis_list=basis)

    fit_params = dict()
    approximator_params = dict(input_shape=(features.size,),
                               output_shape=(mdp.info.action_space.n,),
                               n_actions=mdp.info.action_space.n)
    agent = LSPI(pi, mdp.info, fit_params=fit_params,
                 approximator_params=approximator_params, features=features)

    # Algorithm
    core = Core(agent, mdp)
    core.evaluate(n_episodes=3, render=True)

    # Train
    core.learn(n_episodes=100, n_episodes_per_fit=100)

    # Test
    test_epsilon = Parameter(0.)
    agent.policy.set_epsilon(test_epsilon)
    dataset = core.evaluate(n_episodes=1, quiet=True)

    core.evaluate(n_steps=100, render=True)

    return np.mean(episodes_length(dataset))
def experiment():
    np.random.seed(3)
    print('hierarchical :')

    # MDP
    mdp = generate_simple_chain(state_n=5, goal_states=[2], prob=.8, rew=1,
                                gamma=.9)

    # Model Block
    model_block = MBlock(env=mdp, render=False)

    # Policy
    epsilon = Parameter(value=.15)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = Parameter(value=.2)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = QLearning(pi, mdp.info, agent_params)

    # Control Block
    control_block = ControlBlock(wake_time=1, agent=agent, n_eps_per_fit=None,
                                 n_steps_per_fit=1)

    # Algorithm
    blocks = [model_block, control_block]
    order = [0, 1]
    model_block.add_input(control_block)
    control_block.add_input(model_block)
    control_block.add_reward(model_block)
    computational_graph = ComputationalGraph(blocks=blocks, order=order)
    core = HierarchicalCore(computational_graph)

    # Train
    core.learn(n_steps=100, quiet=True)

    return agent.Q.table
def experiment():
    np.random.seed(3)
    print('hierarchical :')

    mdp = GridWorldVanHasselt()

    # Model Block
    model_block = MBlock(env=mdp, render=False)

    # Policy
    epsilon = ExponentialDecayParameter(value=1, decay_exp=.5,
                                        size=mdp.info.observation_space.size)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = ExponentialDecayParameter(value=1., decay_exp=1.,
                                              size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = QLearning(pi, mdp.info, agent_params)

    # Control Block
    control_block = ControlBlock(name='controller', agent=agent,
                                 n_steps_per_fit=1)

    # Algorithm
    blocks = [model_block, control_block]
    order = [0, 1]
    model_block.add_input(control_block)
    control_block.add_input(model_block)
    control_block.add_reward(model_block)
    computational_graph = ComputationalGraph(blocks=blocks, order=order)
    core = HierarchicalCore(computational_graph)

    # Train
    core.learn(n_steps=2000, quiet=True)

    return agent.Q.table
def experiment(alpha):
    gym.logger.setLevel(0)
    np.random.seed(386)

    # MDP
    mdp = Gym(name='MountainCar-v0', horizon=10000, gamma=1.)
    mdp.seed(201)

    # Policy
    epsilon = Parameter(value=0.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = Parameter(alpha)
    tilings = Tiles.generate(10, [10, 10], mdp.info.observation_space.low,
                             mdp.info.observation_space.high)
    features = Features(tilings=tilings)

    approximator_params = dict(input_shape=(features.size,),
                               output_shape=(mdp.info.action_space.n,),
                               n_actions=mdp.info.action_space.n)
    algorithm_params = {'learning_rate': learning_rate, 'lambda': .9}
    fit_params = dict()
    agent_params = {'approximator_params': approximator_params,
                    'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = TrueOnlineSARSALambda(pi, mdp.info, agent_params, features)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=2000, n_steps_per_fit=1, quiet=True)

    # Test
    test_epsilon = Parameter(0.)
    agent.policy.set_epsilon(test_epsilon)

    initial_states = np.array([[0., 0.], [.1, .1]])
    dataset = core.evaluate(initial_states=initial_states, quiet=True)

    return np.mean(compute_J(dataset, 1.))
def test_collect_parameter():
    np.random.seed(88)

    mdp = GridWorld(3, 3, (2, 2))

    eps = ExponentialParameter(value=1, exp=.5,
                               size=mdp.info.observation_space.size)
    pi = EpsGreedy(eps)
    alpha = Parameter(0.1)
    agent = SARSA(pi, mdp.info, alpha)

    callback_eps = CollectParameters(eps, 1)

    core = Core(agent, mdp, callbacks=[callback_eps])
    core.learn(n_steps=10, n_steps_per_fit=1, quiet=True)

    eps_test = np.array([1., 0.70710678, 0.70710678, 0.57735027, 0.57735027,
                         0.57735027, 0.57735027, 0.57735027, 0.57735027,
                         0.57735027])
    eps = callback_eps.get()

    assert np.allclose(eps, eps_test)
def experiment():
    np.random.seed()

    # MDP
    mdp = InvertedPendulum()

    # Policy
    epsilon = Parameter(value=0.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    rbfs = GaussianRBF.generate(10, [10, 10], mdp.info.observation_space.low,
                                mdp.info.observation_space.high)
    features = Features(basis_list=rbfs)

    approximator_params = dict(input_shape=(features.size,),
                               output_shape=(mdp.info.action_space.n,),
                               n_actions=mdp.info.action_space.n)
    algorithm_params = dict(n_iterations=20)  # assumed value for n_iterations
    fit_params = dict()
    agent_params = {'approximator_params': approximator_params,
                    'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = LSPI(pi, mdp.info, agent_params, features)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=1000, n_episodes_per_fit=20)

    # Test
    test_epsilon = Parameter(0.)
    agent.policy.set_epsilon(test_epsilon)
    dataset = core.evaluate(n_episodes=20)

    return np.mean(compute_J(dataset, 1.))
def experiment(alpha):
    np.random.seed()

    # MDP
    mdp = Gym(name='MountainCar-v0', horizon=np.inf, gamma=1.)

    # Policy
    epsilon = Parameter(value=0.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = Parameter(alpha)
    tilings = Tiles.generate(10, [10, 10], mdp.info.observation_space.low,
                             mdp.info.observation_space.high)
    features = Features(tilings=tilings)

    approximator_params = dict(input_shape=(features.size,),
                               output_shape=(mdp.info.action_space.n,),
                               n_actions=mdp.info.action_space.n)
    algorithm_params = {'learning_rate': learning_rate, 'lambda': .9}
    fit_params = dict()
    agent_params = {'approximator_params': approximator_params,
                    'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = TrueOnlineSARSALambda(pi, mdp.info, agent_params, features)

    # Algorithm
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset]
    core = Core(agent, mdp, callbacks=callbacks)

    # Train
    core.learn(n_episodes=20, n_steps_per_fit=1, render=False)

    dataset = collect_dataset.get()

    return np.mean(compute_J(dataset, 1.))
def experiment(alpha):
    np.random.seed()

    # MDP
    mdp = Gym(name='MountainCar-v0', horizon=np.inf, gamma=1.)

    # Policy
    epsilon = Parameter(value=0.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    n_tilings = 10
    tilings = Tiles.generate(n_tilings, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high)
    features = Features(tilings=tilings)

    learning_rate = Parameter(alpha / n_tilings)

    approximator_params = dict(input_shape=(features.size,),
                               output_shape=(mdp.info.action_space.n,),
                               n_actions=mdp.info.action_space.n)
    algorithm_params = {'learning_rate': learning_rate, 'lambda_coeff': .9}

    agent = TrueOnlineSARSALambda(pi, mdp.info,
                                  approximator_params=approximator_params,
                                  features=features, **algorithm_params)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=40, n_steps_per_fit=1, render=False)
    dataset = core.evaluate(n_episodes=1, render=True)

    return np.mean(compute_J(dataset, 1.))
def test_collect_dataset():
    np.random.seed(88)

    callback = CollectDataset()

    mdp = GridWorld(4, 4, (2, 2))

    eps = Parameter(0.1)
    pi = EpsGreedy(eps)
    alpha = Parameter(0.2)
    agent = SARSA(pi, mdp.info, alpha)

    core = Core(agent, mdp, callbacks=[callback])
    core.learn(n_steps=10, n_steps_per_fit=1, quiet=True)

    dataset = callback.get()
    assert len(dataset) == 10

    core.learn(n_steps=5, n_steps_per_fit=1, quiet=True)
    assert len(dataset) == 15

    callback.clean()
    dataset = callback.get()
    assert len(dataset) == 0
def experiment2():
    np.random.seed(3)
    print('mushroom :')

    mdp = GridWorldVanHasselt()

    # Policy
    epsilon = ExponentialDecayParameter(value=1, decay_exp=.5,
                                        size=mdp.info.observation_space.size)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = ExponentialDecayParameter(value=1., decay_exp=1.,
                                              size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = QLearning(pi, mdp.info, agent_params)

    # Algorithm
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=2000, n_steps_per_fit=1, quiet=True)

    dataset = collect_dataset.get()
    VisualizeControlBlock(dataset)

    return agent.Q.table
def experiment():
    np.random.seed()

    # Argument parser
    parser = argparse.ArgumentParser()

    arg_game = parser.add_argument_group('Game')
    arg_game.add_argument("--name",
                          type=str,
                          default='BreakoutDeterministic-v4',
                          help='Gym ID of the Atari game.')
    arg_game.add_argument("--screen-width", type=int, default=84,
                          help='Width of the game screen.')
    arg_game.add_argument("--screen-height", type=int, default=84,
                          help='Height of the game screen.')

    arg_mem = parser.add_argument_group('Replay Memory')
    arg_mem.add_argument("--initial-replay-size", type=int, default=50000,
                         help='Initial size of the replay memory.')
    arg_mem.add_argument("--max-replay-size", type=int, default=500000,
                         help='Max size of the replay memory.')

    arg_net = parser.add_argument_group('Deep Q-Network')
    arg_net.add_argument("--optimizer",
                         choices=['adadelta', 'adam', 'rmsprop',
                                  'rmspropcentered'],
                         default='adam',
                         help='Name of the optimizer to use to learn.')
    arg_net.add_argument("--learning-rate", type=float, default=.00025,
                         help='Learning rate value of the optimizer. Only used'
                              ' in rmspropcentered.')
    arg_net.add_argument("--decay", type=float, default=.95,
                         help='Discount factor for the history coming from the'
                              ' gradient momentum in rmspropcentered.')
    arg_net.add_argument("--epsilon", type=float, default=.01,
                         help='Epsilon term used in rmspropcentered.')

    arg_alg = parser.add_argument_group('Algorithm')
    arg_alg.add_argument("--algorithm", choices=['dqn', 'ddqn', 'adqn'],
                         default='dqn',
                         help='Name of the algorithm. dqn is for standard DQN,'
                              ' ddqn is for Double DQN and adqn is for'
                              ' Averaged DQN.')
    arg_alg.add_argument("--n-approximators", type=int, default=1,
                         help="Number of approximators used in the ensemble"
                              " for Averaged DQN.")
    arg_alg.add_argument("--batch-size", type=int, default=32,
                         help='Batch size for each fit of the network.')
    arg_alg.add_argument("--history-length", type=int, default=4,
                         help='Number of frames composing a state.')
    arg_alg.add_argument("--target-update-frequency", type=int, default=10000,
                         help='Number of learning steps before each update of'
                              ' the target network.')
    arg_alg.add_argument("--evaluation-frequency", type=int, default=250000,
                         help='Number of learning steps before each'
                              ' evaluation. This number represents an epoch.')
    arg_alg.add_argument("--train-frequency", type=int, default=4,
                         help='Number of learning steps before each fit of the'
                              ' neural network.')
    arg_alg.add_argument("--max-steps", type=int, default=50000000,
                         help='Total number of learning steps.')
    arg_alg.add_argument("--final-exploration-frame", type=int,
                         default=1000000,
                         help='Number of steps until the exploration rate'
                              ' stops decreasing.')
    arg_alg.add_argument("--initial-exploration-rate", type=float, default=1.,
                         help='Initial value of the exploration rate.')
    arg_alg.add_argument("--final-exploration-rate", type=float, default=.1,
                         help='Final value of the exploration rate. When it'
                              ' reaches this value, it stays constant.')
    arg_alg.add_argument("--test-exploration-rate", type=float, default=.05,
                         help='Exploration rate used during evaluation.')
    arg_alg.add_argument("--test-samples", type=int, default=125000,
                         help='Number of steps for each evaluation.')
    arg_alg.add_argument("--max-no-op-actions", type=int, default=8,
                         help='Maximum number of no-op actions performed at'
                              ' the beginning of the episodes. The minimum'
                              ' number is history_length. This number is 30 in'
                              ' the DQN Deepmind paper, but they consider the'
                              ' first 30 frames without frame skipping.')
    arg_alg.add_argument("--no-op-action-value", type=int, default=0,
                         help='Value of the no-op action.')

    arg_utils = parser.add_argument_group('Utils')
    arg_utils.add_argument('--load-path', type=str,
                           help='Path of the model to be loaded.')
    arg_utils.add_argument('--save', action='store_true',
                           help='Flag specifying whether to save the model.')
    arg_utils.add_argument('--render', action='store_true',
                           help='Flag specifying whether to render the game.')
    arg_utils.add_argument('--quiet', action='store_true',
                           help='Flag specifying whether to hide the progress'
                                ' bar.')
    arg_utils.add_argument('--debug', action='store_true',
                           help='Flag specifying whether the script has to be'
                                ' run in debug mode.')

    args = parser.parse_args()

    scores = list()

    # Evaluation of the model provided by the user.
    if args.load_path:
        # MDP
        mdp = Atari(args.name, args.screen_width, args.screen_height,
                    ends_at_life=False)

        # Policy
        epsilon_test = Parameter(value=args.test_exploration_rate)
        pi = EpsGreedy(epsilon=epsilon_test)

        # Approximator
        input_shape = (args.screen_height, args.screen_width,
                       args.history_length)
        approximator_params = dict(
            input_shape=input_shape,
            output_shape=(mdp.info.action_space.n,),
            n_actions=mdp.info.action_space.n,
            name='test',
            load_path=args.load_path,
            optimizer={'name': args.optimizer,
                       'lr': args.learning_rate,
                       'decay': args.decay,
                       'epsilon': args.epsilon}
        )
        approximator = ConvNet

        # Agent
        algorithm_params = dict(
            batch_size=1,
            train_frequency=1,
            target_update_frequency=1,
            initial_replay_size=0,
            max_replay_size=0,
            history_length=args.history_length,
            max_no_op_actions=args.max_no_op_actions,
            no_op_action_value=args.no_op_action_value,
            dtype=np.uint8
        )
        agent = DQN(approximator, pi, mdp.info,
                    approximator_params=approximator_params,
                    **algorithm_params)

        # Algorithm
        core_test = Core(agent, mdp)

        # Evaluate model
        pi.set_epsilon(epsilon_test)
        dataset = core_test.evaluate(n_steps=args.test_samples,
                                     render=args.render,
                                     quiet=args.quiet)
        get_stats(dataset)
    else:  # DQN learning run
        # Summary folder
        folder_name = './logs/atari_' + args.algorithm + '_' + args.name +\
            '_' + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

        # Settings
        if args.debug:
            initial_replay_size = 50
            max_replay_size = 500
            train_frequency = 5
            target_update_frequency = 10
            test_samples = 20
            evaluation_frequency = 50
            max_steps = 1000
        else:
            initial_replay_size = args.initial_replay_size
            max_replay_size = args.max_replay_size
            train_frequency = args.train_frequency
            target_update_frequency = args.target_update_frequency
            test_samples = args.test_samples
            evaluation_frequency = args.evaluation_frequency
            max_steps = args.max_steps

        # MDP
        mdp = Atari(args.name, args.screen_width, args.screen_height,
                    ends_at_life=True)

        # Policy
        epsilon = LinearDecayParameter(value=args.initial_exploration_rate,
                                       min_value=args.final_exploration_rate,
                                       n=args.final_exploration_frame)
        epsilon_test = Parameter(value=args.test_exploration_rate)
        epsilon_random = Parameter(value=1)
        pi = EpsGreedy(epsilon=epsilon_random)

        # Approximator
        input_shape = (args.screen_height, args.screen_width,
                       args.history_length)
        approximator_params = dict(
            input_shape=input_shape,
            output_shape=(mdp.info.action_space.n,),
            n_actions=mdp.info.action_space.n,
            folder_name=folder_name,
            optimizer={'name': args.optimizer,
                       'lr': args.learning_rate,
                       'decay': args.decay,
                       'epsilon': args.epsilon}
        )
        approximator = ConvNet

        # Agent
        algorithm_params = dict(
            batch_size=args.batch_size,
            n_approximators=args.n_approximators,
            initial_replay_size=initial_replay_size,
            max_replay_size=max_replay_size,
            history_length=args.history_length,
            train_frequency=train_frequency,
            target_update_frequency=target_update_frequency,
            max_no_op_actions=args.max_no_op_actions,
            no_op_action_value=args.no_op_action_value,
            dtype=np.uint8
        )
        if args.algorithm == 'dqn':
            agent = DQN(approximator, pi, mdp.info,
                        approximator_params=approximator_params,
                        **algorithm_params)
        elif args.algorithm == 'ddqn':
            agent = DoubleDQN(approximator, pi, mdp.info,
                              approximator_params=approximator_params,
                              **algorithm_params)
        elif args.algorithm == 'adqn':
            agent = AveragedDQN(approximator, pi, mdp.info,
                                approximator_params=approximator_params,
                                **algorithm_params)

        # Algorithm
        core = Core(agent, mdp)

        # RUN

        # Fill replay memory with random dataset
        print_epoch(0)
        core.learn(n_steps=initial_replay_size,
                   n_steps_per_fit=initial_replay_size, quiet=args.quiet)

        if args.save:
            agent.approximator.model.save()

        # Evaluate initial policy
        pi.set_epsilon(epsilon_test)
        if args.algorithm == 'ddqn':
            agent.policy.set_q(agent.target_approximator)
        mdp.set_episode_end(False)
        dataset = core.evaluate(n_steps=test_samples, render=args.render,
                                quiet=args.quiet)
        scores.append(get_stats(dataset))
        if args.algorithm == 'ddqn':
            agent.policy.set_q(agent.approximator)

        np.save(folder_name + '/scores.npy', scores)
        for n_epoch in range(1, max_steps // evaluation_frequency + 1):
            print_epoch(n_epoch)
            print('- Learning:')
            # learning step
            pi.set_epsilon(epsilon)
            mdp.set_episode_end(True)
            core.learn(n_steps=evaluation_frequency,
                       n_steps_per_fit=train_frequency, quiet=args.quiet)

            if args.save:
                agent.approximator.model.save()

            print('- Evaluation:')
            # evaluation step
            pi.set_epsilon(epsilon_test)
            if args.algorithm == 'ddqn':
                agent.policy.set_q(agent.target_approximator)
            mdp.set_episode_end(False)
            dataset = core.evaluate(n_steps=test_samples, render=args.render,
                                    quiet=args.quiet)
            scores.append(get_stats(dataset))
            if args.algorithm == 'ddqn':
                agent.policy.set_q(agent.approximator)

            np.save(folder_name + '/scores.npy', scores)

    return scores
def experiment_ghavamzade(alg_high, alg_low, params, subdir, i):
    np.random.seed()

    # Model Block
    mdp = ShipSteering(small=False, n_steps_action=3)

    # State Placeholder
    state_ph = PlaceHolder(name='state_ph')

    # Reward Placeholder
    reward_ph = PlaceHolder(name='reward_ph')

    # Last action Placeholder
    lastaction_ph = PlaceHolder(name='lastaction_ph')

    # FeaturesH
    low_hi = 0
    lim_hi = 1000 + 1e-8
    n_tiles_high = [20, 20]
    n_tilings = 1

    # Discretization Block
    discretization_block = DiscretizationBlock(low=low_hi, high=lim_hi,
                                               n_tiles=n_tiles_high)

    # PolicyH
    epsilon = Parameter(value=0.1)
    piH = EpsGreedy(epsilon=epsilon)

    # AgentH
    learning_rate = params.get('learning_rate_high')

    mdp_info_agentH = MDPInfo(
        observation_space=spaces.Discrete(n_tiles_high[0] * n_tiles_high[1]),
        action_space=spaces.Discrete(8), gamma=1, horizon=10000)

    agentH = alg_high(policy=piH,
                      mdp_info=mdp_info_agentH,
                      learning_rate=learning_rate,
                      lambda_coeff=0.9)

    epsilon_update = EpsilonUpdate(piH)

    # Control Block H
    control_blockH = ControlBlock(name='control block H', agent=agentH,
                                  n_steps_per_fit=1)

    # FeaturesL
    high = [150, 150, np.pi]
    low = [0, 0, -np.pi]
    n_tiles = [5, 5, 10]
    low = np.array(low, dtype=np.float)
    high = np.array(high, dtype=np.float)
    n_tilings = 3

    tilingsL = Tiles.generate(n_tilings=n_tilings, n_tiles=n_tiles, low=low,
                              high=high)

    featuresL = Features(tilings=tilingsL)

    mdp_info_agentL = MDPInfo(
        observation_space=spaces.Box(low=np.array([0, 0]),
                                     high=np.array([150, 150]), shape=(2,)),
        action_space=mdp.info.action_space, gamma=0.99, horizon=10000)

    # Approximators
    input_shape = (featuresL.size,)

    approximator_params = dict(input_dim=input_shape[0])
    approximator1 = Regressor(LinearApproximator, input_shape=input_shape,
                              output_shape=mdp.info.action_space.shape,
                              **approximator_params)
    approximator2 = Regressor(LinearApproximator, input_shape=input_shape,
                              output_shape=mdp.info.action_space.shape,
                              **approximator_params)

    # Policy1
    std1 = np.array([3e-2])
    pi1 = DiagonalGaussianPolicy(mu=approximator1, std=std1)

    # Policy2
    std2 = np.array([3e-2])
    pi2 = DiagonalGaussianPolicy(mu=approximator2, std=std2)

    # Agent1
    learning_rate1 = params.get('learning_rate_low')
    agent1 = alg_low(pi1, mdp_info_agentL, learning_rate1, featuresL)

    # Agent2
    learning_rate2 = params.get('learning_rate_low')
    agent2 = alg_low(pi2, mdp_info_agentL, learning_rate2, featuresL)

    # Termination Conds
    termination_condition1 = TerminationCondition(active_dir='+')
    termination_condition2 = TerminationCondition(active_dir='x')

    low_ep_per_fit = params.get('low_ep_per_fit')

    # Control Block +
    control_block_plus = ControlBlock(
        name='control block 1', agent=agent1, n_eps_per_fit=low_ep_per_fit,
        termination_condition=termination_condition1)

    # Control Block x
    control_block_cross = ControlBlock(
        name='control block 2', agent=agent2, n_eps_per_fit=low_ep_per_fit,
        termination_condition=termination_condition2)

    # Function Block 1: picks state for hi lev ctrl
    function_block1 = fBlock(phi=pick_state, name='f1 pickstate')

    # Function Block 2: maps the env to low lev ctrl state
    function_block2 = fBlock(phi=rototranslate, name='f2 rotot')

    # Function Block 3: holds curr state as ref
    function_block3 = hold_state(name='f3 holdstate')

    # Function Block 4: adds hi lev rew
    function_block4 = addBlock(name='f4 add')

    # Function Block 5: adds low lev rew
    function_block5 = addBlock(name='f5 add')

    # Function Block 6: ext rew of hi lev ctrl
    function_block6 = fBlock(phi=G_high, name='f6 G_hi')

    # Function Block 7: ext rew of low lev ctrl
    function_block7 = fBlock(phi=G_low, name='f7 G_lo')

    # Reward Accumulator H
    reward_acc_H = reward_accumulator_block(gamma=mdp_info_agentH.gamma,
                                            name='reward_acc_H')

    # Selector Block
    function_block8 = fBlock(phi=selector_function, name='f7 G_lo')

    # Mux_Block
    mux_block = MuxBlock(name='mux')
    mux_block.add_block_list([control_block_plus])
    mux_block.add_block_list([control_block_cross])

    # Algorithm
    blocks = [state_ph, reward_ph, lastaction_ph, control_blockH, mux_block,
              function_block1, function_block2, function_block3,
              function_block4, function_block5, function_block6,
              function_block7, function_block8, reward_acc_H,
              discretization_block]

    reward_acc_H.add_input(reward_ph)
    reward_acc_H.add_alarm_connection(control_block_plus)
    reward_acc_H.add_alarm_connection(control_block_cross)

    control_blockH.add_input(discretization_block)
    control_blockH.add_reward(function_block4)
    control_blockH.add_alarm_connection(control_block_plus)
    control_blockH.add_alarm_connection(control_block_cross)

    mux_block.add_input(function_block8)
    mux_block.add_input(function_block2)

    control_block_plus.add_reward(function_block5)
    control_block_cross.add_reward(function_block5)

    function_block1.add_input(state_ph)

    function_block2.add_input(control_blockH)
    function_block2.add_input(state_ph)
    function_block2.add_input(function_block3)

    function_block3.add_input(state_ph)
    function_block3.add_alarm_connection(control_block_plus)
    function_block3.add_alarm_connection(control_block_cross)

    function_block4.add_input(function_block6)
    function_block4.add_input(reward_acc_H)

    function_block5.add_input(function_block7)

    function_block6.add_input(reward_ph)

    function_block7.add_input(control_blockH)
    function_block7.add_input(function_block2)

    function_block8.add_input(control_blockH)

    discretization_block.add_input(function_block1)

    computational_graph = ComputationalGraph(blocks=blocks, model=mdp)
    core = HierarchicalCore(computational_graph)

    # Train
    low_level_dataset_eval1 = list()
    low_level_dataset_eval2 = list()
    dataset_eval = list()

    dataset_eval_run = core.evaluate(n_episodes=ep_per_run)
    # print('distribution parameters: ', distribution.get_parameters())
    J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
    dataset_eval += dataset_eval_run
    print('J at start : ' + str(np.mean(J)))

    for n in range(n_runs):
        print('ITERATION', n)
        core.learn(n_episodes=n_iterations * ep_per_run, skip=True)
        dataset_eval_run = core.evaluate(n_episodes=ep_per_run)
        J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
        print('J at iteration ' + str(n) + ': ' + str(np.mean(J)))
        dataset_eval += dataset_eval_run

        dataset_plus = control_block_plus.dataset.get()
        J_plus = compute_J(dataset_plus, mdp.info.gamma)
        dataset_cross = control_block_cross.dataset.get()
        J_cross = compute_J(dataset_cross, mdp.info.gamma)

        low_level_dataset_eval1.append(dataset_plus)
        low_level_dataset_eval2.append(dataset_cross)

        print('J ll PLUS at iteration ' + str(n) + ': ' +
              str(np.mean(J_plus)))
        print('J ll CROSS at iteration ' + str(n) + ': ' +
              str(np.mean(J_cross)))
        if n == 4:
            control_blockH.callbacks = [epsilon_update]

    # Tile data
    hi_lev_params = agentH.Q.table
    max_q_val = np.zeros(n_tiles_high[0] ** 2)
    act_max_q_val = np.zeros(n_tiles_high[0] ** 2)
    for n in range(n_tiles_high[0] ** 2):
        max_q_val[n] = np.amax(hi_lev_params[n])
        act_max_q_val[n] = np.argmax(hi_lev_params[n])

    mk_dir_recursive('./' + subdir + str(i))

    np.save(subdir + str(i) + '/low_level_dataset1_file',
            low_level_dataset_eval1)
    np.save(subdir + str(i) + '/low_level_dataset2_file',
            low_level_dataset_eval2)
    np.save(subdir + str(i) + '/max_q_val_tiled_file', max_q_val)
    np.save(subdir + str(i) + '/act_max_q_val_tiled_file', act_max_q_val)
    np.save(subdir + str(i) + '/dataset_eval_file', dataset_eval)

    return
def experiment(n_epochs, n_steps, n_steps_test):
    np.random.seed()

    # MDP
    horizon = 1000
    gamma = 0.99
    gamma_eval = 1.
    mdp = Gym('Acrobot-v1', horizon, gamma)

    # Policy
    epsilon = LinearDecayParameter(value=1., min_value=.01, n=5000)
    epsilon_test = Parameter(value=0.)
    epsilon_random = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon_random)

    # Settings
    initial_replay_size = 500
    max_replay_size = 5000
    target_update_frequency = 100
    batch_size = 200
    n_features = 80
    train_frequency = 1

    # Approximator
    input_shape = mdp.info.observation_space.shape
    approximator_params = dict(network=Network,
                               optimizer={'class': optim.Adam,
                                          'params': {'lr': .001}},
                               loss=F.smooth_l1_loss,
                               n_features=n_features,
                               input_shape=input_shape,
                               output_shape=mdp.info.action_space.size,
                               n_actions=mdp.info.action_space.n)

    # Agent
    agent = DQN(PyTorchApproximator, pi, mdp.info,
                approximator_params=approximator_params,
                batch_size=batch_size, n_approximators=1,
                initial_replay_size=initial_replay_size,
                max_replay_size=max_replay_size,
                target_update_frequency=target_update_frequency)

    # Algorithm
    core = Core(agent, mdp)

    core.learn(n_steps=initial_replay_size,
               n_steps_per_fit=initial_replay_size)

    # RUN
    pi.set_epsilon(epsilon_test)
    dataset = core.evaluate(n_steps=n_steps_test, render=False)
    J = compute_J(dataset, gamma_eval)
    print('J: ', np.mean(J))

    for n in range(n_epochs):
        print('Epoch: ', n)
        pi.set_epsilon(epsilon)
        core.learn(n_steps=n_steps, n_steps_per_fit=train_frequency)
        pi.set_epsilon(epsilon_test)
        dataset = core.evaluate(n_steps=n_steps_test, render=False)
        J = compute_J(dataset, gamma_eval)
        print('J: ', np.mean(J))

    print('Press a button to visualize acrobot')
    input()
    core.evaluate(n_episodes=5, render=True)
def experiment():
    np.random.seed()

    # Argument parser
    parser = argparse.ArgumentParser()

    arg_game = parser.add_argument_group('Game')
    arg_game.add_argument("--name",
                          type=str,
                          default='BreakoutDeterministic-v4',
                          help='Gym ID of the Atari game.')
    arg_game.add_argument("--screen-width", type=int, default=84,
                          help='Width of the game screen.')
    arg_game.add_argument("--screen-height", type=int, default=84,
                          help='Height of the game screen.')

    arg_mem = parser.add_argument_group('Replay Memory')
    arg_mem.add_argument("--initial-replay-size", type=int, default=50000,
                         help='Initial size of the replay memory.')
    arg_mem.add_argument("--max-replay-size", type=int, default=500000,
                         help='Max size of the replay memory.')

    arg_net = parser.add_argument_group('Deep Q-Network')
    arg_net.add_argument("--optimizer",
                         choices=['adadelta', 'adam', 'rmsprop',
                                  'rmspropcentered'],
                         default='adam',
                         help='Name of the optimizer to use.')
    arg_net.add_argument("--learning-rate", type=float, default=.00025,
                         help='Learning rate value of the optimizer.')
    arg_net.add_argument("--decay", type=float, default=.95,
                         help='Discount factor for the history coming from the'
                              ' gradient momentum in rmspropcentered and'
                              ' rmsprop.')
    arg_net.add_argument("--epsilon", type=float, default=.01,
                         help='Epsilon term used in rmspropcentered and'
                              ' rmsprop.')

    arg_alg = parser.add_argument_group('Algorithm')
    arg_alg.add_argument("--algorithm", choices=['dqn', 'ddqn', 'adqn'],
                         default='dqn',
                         help='Name of the algorithm. dqn is for standard DQN,'
                              ' ddqn is for Double DQN and adqn is for'
                              ' Averaged DQN.')
    arg_alg.add_argument("--n-approximators", type=int, default=1,
                         help="Number of approximators used in the ensemble"
                              " for Averaged DQN.")
    arg_alg.add_argument("--batch-size", type=int, default=32,
                         help='Batch size for each fit of the network.')
    arg_alg.add_argument("--history-length", type=int, default=4,
                         help='Number of frames composing a state.')
    arg_alg.add_argument("--target-update-frequency", type=int, default=10000,
                         help='Number of collected samples before each update'
                              ' of the target network.')
    arg_alg.add_argument("--evaluation-frequency", type=int, default=250000,
                         help='Number of collected samples before each'
                              ' evaluation. An epoch ends after this number of'
                              ' steps.')
    arg_alg.add_argument("--train-frequency", type=int, default=4,
                         help='Number of collected samples before each fit of'
                              ' the neural network.')
    arg_alg.add_argument("--max-steps", type=int, default=50000000,
                         help='Total number of collected samples.')
    arg_alg.add_argument("--final-exploration-frame", type=int,
                         default=1000000,
                         help='Number of collected samples until the'
                              ' exploration rate stops decreasing.')
    arg_alg.add_argument("--initial-exploration-rate", type=float, default=1.,
                         help='Initial value of the exploration rate.')
    arg_alg.add_argument("--final-exploration-rate", type=float, default=.1,
                         help='Final value of the exploration rate. When it'
                              ' reaches this value, it stays constant.')
    arg_alg.add_argument("--test-exploration-rate", type=float, default=.05,
                         help='Exploration rate used during evaluation.')
    arg_alg.add_argument("--test-samples", type=int, default=125000,
                         help='Number of collected samples for each'
                              ' evaluation.')
    arg_alg.add_argument("--max-no-op-actions", type=int, default=8,
                         help='Maximum number of no-op actions performed at'
                              ' the beginning of the episodes. The minimum'
                              ' number is history_length. This number is'
                              ' reported to be 30 in the DQN Deepmind paper'
                              ' but, since they consider the first 30 frames'
                              ' without frame skipping and the number of'
                              ' skipped frames is generally 4, we set it'
                              ' to 8.')
    arg_alg.add_argument("--no-op-action-value", type=int, default=0,
                         help='Value of the no-op action.')

    arg_utils = parser.add_argument_group('Utils')
    arg_utils.add_argument('--device', type=int, default=None,
                           help='ID of the GPU device to use. If None, CPU is'
                                ' used.')
    arg_utils.add_argument('--load-path', type=str,
                           help='Path of the model to be loaded.')
    arg_utils.add_argument('--save', action='store_true',
                           help='Flag specifying whether to save the model.')
    arg_utils.add_argument('--render', action='store_true',
                           help='Flag specifying whether to render the game.')
    arg_utils.add_argument('--quiet', action='store_true',
                           help='Flag specifying whether to hide the progress'
                                ' bar.')
    arg_utils.add_argument('--debug', action='store_true',
                           help='Flag specifying whether the script has to be'
                                ' run in debug mode.')

    args = parser.parse_args()

    scores = list()

    optimizer = dict()
    if args.optimizer == 'adam':
        optimizer['class'] = optim.Adam
        optimizer['params'] = dict(lr=args.learning_rate)
    elif args.optimizer == 'adadelta':
        optimizer['class'] = optim.Adadelta
        optimizer['params'] = dict(lr=args.learning_rate)
    elif args.optimizer == 'rmsprop':
        optimizer['class'] = optim.RMSprop
        optimizer['params'] = dict(lr=args.learning_rate,
                                   alpha=args.decay,
                                   eps=args.epsilon)
    elif args.optimizer == 'rmspropcentered':
        optimizer['class'] = optim.RMSprop
        optimizer['params'] = dict(lr=args.learning_rate,
                                   alpha=args.decay,
                                   eps=args.epsilon,
                                   centered=True)
    else:
        raise ValueError

    # Evaluation of the model provided by the user.
    if args.load_path:
        # MDP
        mdp = Atari(args.name, args.screen_width, args.screen_height,
                    ends_at_life=False)

        # Policy
        epsilon_test = Parameter(value=args.test_exploration_rate)
        pi = EpsGreedy(epsilon=epsilon_test)

        # Approximator
        input_shape = (args.screen_height, args.screen_width,
                       args.history_length)
        approximator_params = dict(
            network=Network,
            input_shape=input_shape,
            output_shape=(mdp.info.action_space.n,),
            n_actions=mdp.info.action_space.n,
            load_path=args.load_path,
            optimizer=optimizer,
            loss=F.smooth_l1_loss,
            device=args.device
        )
        approximator = PyTorchApproximator

        # Agent
        algorithm_params = dict(
            batch_size=1,
            train_frequency=1,
            target_update_frequency=1,
            initial_replay_size=0,
            max_replay_size=0,
            history_length=args.history_length,
            max_no_op_actions=args.max_no_op_actions,
            no_op_action_value=args.no_op_action_value,
            dtype=np.uint8
        )
        agent = DQN(approximator, pi, mdp.info,
                    approximator_params=approximator_params,
                    **algorithm_params)

        # Algorithm
        core_test = Core(agent, mdp)

        # Evaluate model
        pi.set_epsilon(epsilon_test)
        dataset = core_test.evaluate(n_steps=args.test_samples,
                                     render=args.render,
                                     quiet=args.quiet)
        get_stats(dataset)
    else:  # DQN learning run
        # Summary folder
        folder_name = './logs/atari_' + args.algorithm + '_' + args.name +\
            '_' + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
        pathlib.Path(folder_name).mkdir(parents=True)

        # Settings
        if args.debug:
            initial_replay_size = 50
            max_replay_size = 500
            train_frequency = 5
            target_update_frequency = 10
            test_samples = 20
            evaluation_frequency = 50
            max_steps = 1000
        else:
            initial_replay_size = args.initial_replay_size
            max_replay_size = args.max_replay_size
            train_frequency = args.train_frequency
            target_update_frequency = args.target_update_frequency
            test_samples = args.test_samples
            evaluation_frequency = args.evaluation_frequency
            max_steps = args.max_steps

        # MDP
        mdp = Atari(args.name, args.screen_width, args.screen_height,
                    ends_at_life=True)

        # Policy
        epsilon = LinearDecayParameter(value=args.initial_exploration_rate,
                                       min_value=args.final_exploration_rate,
                                       n=args.final_exploration_frame)
        epsilon_test = Parameter(value=args.test_exploration_rate)
        epsilon_random = Parameter(value=1)
        pi = EpsGreedy(epsilon=epsilon_random)

        # Approximator
        input_shape = (args.screen_height, args.screen_width,
                       args.history_length)
        approximator_params = dict(
            network=Network,
            input_shape=input_shape,
            output_shape=(mdp.info.action_space.n,),
            n_actions=mdp.info.action_space.n,
            folder_name=folder_name,
            optimizer=optimizer,
            loss=F.smooth_l1_loss,
            device=args.device
        )
        approximator = PyTorchApproximator

        # Agent
        algorithm_params = dict(
            batch_size=args.batch_size,
            n_approximators=args.n_approximators,
            initial_replay_size=initial_replay_size,
            max_replay_size=max_replay_size,
            history_length=args.history_length,
            target_update_frequency=target_update_frequency//train_frequency,
            max_no_op_actions=args.max_no_op_actions,
            no_op_action_value=args.no_op_action_value,
            dtype=np.uint8
        )
        if args.algorithm == 'dqn':
            agent = DQN(approximator, pi, mdp.info,
                        approximator_params=approximator_params,
                        **algorithm_params)
        elif args.algorithm == 'ddqn':
            agent = DoubleDQN(approximator, pi, mdp.info,
                              approximator_params=approximator_params,
                              **algorithm_params)
        elif args.algorithm == 'adqn':
            agent = AveragedDQN(approximator, pi, mdp.info,
                                approximator_params=approximator_params,
                                **algorithm_params)

        # Algorithm
        core = Core(agent, mdp)

        # RUN

        # Fill replay memory with random dataset
        print_epoch(0)
        core.learn(n_steps=initial_replay_size,
                   n_steps_per_fit=initial_replay_size, quiet=args.quiet)

        if args.save:
            agent.approximator.model.save()

        # Evaluate initial policy
        pi.set_epsilon(epsilon_test)
        if args.algorithm == 'ddqn':
            agent.policy.set_q(agent.target_approximator)
        mdp.set_episode_end(False)
        dataset = core.evaluate(n_steps=test_samples, render=args.render,
                                quiet=args.quiet)
        scores.append(get_stats(dataset))
        if args.algorithm == 'ddqn':
            agent.policy.set_q(agent.approximator)

        np.save(folder_name + '/scores.npy', scores)
        for n_epoch in range(1, max_steps // evaluation_frequency + 1):
            print_epoch(n_epoch)
            print('- Learning:')
            # learning step
            pi.set_epsilon(epsilon)
            mdp.set_episode_end(True)
            core.learn(n_steps=evaluation_frequency,
                       n_steps_per_fit=train_frequency, quiet=args.quiet)

            if args.save:
                agent.approximator.model.save()

            print('- Evaluation:')
            # evaluation step
            pi.set_epsilon(epsilon_test)
            if args.algorithm == 'ddqn':
                agent.policy.set_q(agent.target_approximator)
            mdp.set_episode_end(False)
            dataset = core.evaluate(n_steps=test_samples, render=args.render,
                                    quiet=args.quiet)
            scores.append(get_stats(dataset))
            if args.algorithm == 'ddqn':
                agent.policy.set_q(agent.approximator)

            np.save(folder_name + '/scores.npy', scores)

    return scores
def experiment(n_epochs, n_steps, n_steps_test):
    np.random.seed()

    # MDP
    horizon = 1000
    gamma = 0.99
    gamma_eval = 1.
    mdp = Gym('Acrobot-v1', horizon, gamma)

    # Policy
    epsilon = LinearDecayParameter(value=1., min_value=.01, n=5000)
    epsilon_test = Parameter(value=0.)
    epsilon_random = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon_random)

    # Settings
    initial_replay_size = 500
    max_replay_size = 5000
    target_update_frequency = 100
    batch_size = 200
    n_features = 80
    train_frequency = 1

    # Approximator
    input_shape = (1,) + mdp.info.observation_space.shape
    approximator_params = dict(network=Network,
                               optimizer={'class': optim.Adam,
                                          'params': {'lr': .001}},
                               loss=F.smooth_l1_loss,
                               n_features=n_features,
                               input_shape=input_shape,
                               output_shape=mdp.info.action_space.size,
                               n_actions=mdp.info.action_space.n)

    # Agent
    agent = DQN(PyTorchApproximator, pi, mdp.info,
                approximator_params=approximator_params,
                batch_size=batch_size, n_approximators=1,
                initial_replay_size=initial_replay_size,
                max_replay_size=max_replay_size, history_length=1,
                target_update_frequency=target_update_frequency,
                max_no_op_actions=0, no_op_action_value=0, dtype=np.float32)

    # Algorithm
    core = Core(agent, mdp)

    core.learn(n_steps=initial_replay_size,
               n_steps_per_fit=initial_replay_size)

    # RUN
    pi.set_epsilon(epsilon_test)
    dataset = core.evaluate(n_steps=n_steps_test, render=False)
    J = compute_J(dataset, gamma_eval)
    print('J: ', np.mean(J))

    for n in range(n_epochs):
        print('Epoch: ', n)
        pi.set_epsilon(epsilon)
        core.learn(n_steps=n_steps, n_steps_per_fit=train_frequency)
        pi.set_epsilon(epsilon_test)
        dataset = core.evaluate(n_steps=n_steps_test, render=False)
        J = compute_J(dataset, gamma_eval)
        print('J: ', np.mean(J))

    print('Press a button to visualize acrobot')
    input()
    core.evaluate(n_episodes=5, render=True)
import numpy as np
from sklearn.ensemble import ExtraTreesRegressor

from mushroom.algorithms.value import FQI
from mushroom.core import Core
from mushroom.environments import CarOnHill
from mushroom.policy import EpsGreedy
from mushroom.utils.dataset import compute_J
from mushroom.utils.parameters import Parameter

mdp = CarOnHill()

# Policy
epsilon = Parameter(value=1.)
pi = EpsGreedy(epsilon=epsilon)

# Approximator
approximator_params = dict(input_shape=mdp.info.observation_space.shape,
                           n_actions=mdp.info.action_space.n,
                           n_estimators=50,
                           min_samples_split=5,
                           min_samples_leaf=2)
approximator = ExtraTreesRegressor

# Agent
agent = FQI(approximator, pi, mdp.info, n_iterations=20,
            approximator_params=approximator_params)

core = Core(agent, mdp)
core.learn(n_episodes=1000, n_episodes_per_fit=1000)
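# Optional follow-up (a minimal sketch, mirroring the evaluation pattern of the
# CarOnHill experiments above): switch to a greedy policy and estimate the mean
# discounted return J over a grid of initial states.
test_epsilon = Parameter(value=0.)
agent.policy.set_epsilon(test_epsilon)

initial_states = np.zeros((289, 2))
cont = 0
for i in range(-8, 9):
    for j in range(-8, 9):
        initial_states[cont, :] = [0.125 * i, 0.375 * j]
        cont += 1

dataset = core.evaluate(initial_states=initial_states)
print('J:', np.mean(compute_J(dataset, mdp.info.gamma)))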