def experiment(alg, params, n_epochs, fit_per_run, ep_per_run):
    np.random.seed()

    # MDP
    mdp = LQR.generate(dimensions=1)

    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    policy = DeterministicPolicy(mu=approximator)

    mu = np.zeros(policy.weights_size)
    sigma = 1e-3 * np.eye(policy.weights_size)
    distribution = GaussianCholeskyDistribution(mu, sigma)

    # Agent
    agent = alg(distribution, policy, mdp.info, **params)

    # Train
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_run)
    print('distribution parameters: ', distribution.get_parameters())
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))

    for i in range(n_epochs):
        core.learn(n_episodes=fit_per_run * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset_eval = core.evaluate(n_episodes=ep_per_run)
        print('distribution parameters: ', distribution.get_parameters())
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        print('J at iteration ' + str(i) + ': ' + str(np.mean(J)))
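# A minimal, illustrative way to run the experiment above. The choice of RWR
# and its beta value are assumptions for this sketch, not values from the
# original script; any distribution-based black-box algorithm with a matching
# constructor (e.g. PGPE, REPS) could be passed instead.
if __name__ == '__main__':
    from mushroom.algorithms.policy_search import RWR

    experiment(RWR, dict(beta=1.), n_epochs=4, fit_per_run=10, ep_per_run=10)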
def test_lspi():
    mdp = CartPole()
    np.random.seed(1)

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    basis = [PolynomialBasis()]
    features = Features(basis_list=basis)

    approximator_params = dict(input_shape=(features.size,),
                               output_shape=(mdp.info.action_space.n,),
                               n_actions=mdp.info.action_space.n)
    agent = LSPI(pi, mdp.info, fit_params=dict(),
                 approximator_params=approximator_params, features=features)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=100, n_episodes_per_fit=100)

    w = agent.approximator.get_weights()
    w_test = np.array([-2.23880597, -2.27427603, -2.25])

    assert np.allclose(w, w_test)
def experiment():
    np.random.seed()

    # MDP
    mdp = generate_simple_chain(state_n=5, goal_states=[2], prob=.8, rew=1,
                                gamma=.9)

    # Policy
    epsilon = Parameter(value=.15)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = Parameter(value=.2)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = QLearning(pi, mdp.info, **algorithm_params)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=10000, n_steps_per_fit=1)
def test_collect_Q():
    np.random.seed(88)

    mdp = GridWorld(3, 3, (2, 2))

    eps = Parameter(0.1)
    pi = EpsGreedy(eps)
    alpha = Parameter(0.1)
    agent = SARSA(pi, mdp.info, alpha)

    callback_q = CollectQ(agent.Q)
    callback_max_q = CollectMaxQ(agent.Q, np.array([2]))

    core = Core(agent, mdp, callbacks=[callback_q, callback_max_q])

    core.learn(n_steps=1000, n_steps_per_fit=1, quiet=True)

    V_test = np.array([2.4477574, 0.02246188, 1.6210059, 6.01867052])
    V = callback_q.get()[-1]

    assert np.allclose(V[0, :], V_test)

    V_max = np.array([np.max(x[2, :], axis=-1) for x in callback_q.get()])
    max_q = np.array(callback_max_q.get())

    assert np.allclose(V_max, max_q)
def learn(alg, alg_params):
    mdp = CarOnHill()
    np.random.seed(1)

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Approximator
    approximator_params = dict(input_shape=mdp.info.observation_space.shape,
                               n_actions=mdp.info.action_space.n,
                               n_estimators=50,
                               min_samples_split=5,
                               min_samples_leaf=2)
    approximator = ExtraTreesRegressor

    # Agent
    agent = alg(approximator, pi, mdp.info,
                approximator_params=approximator_params, **alg_params)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=5, n_episodes_per_fit=5)

    test_epsilon = Parameter(0.75)
    agent.policy.set_epsilon(test_epsilon)
    dataset = core.evaluate(n_episodes=2)

    return np.mean(compute_J(dataset, mdp.info.gamma))
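# Hedged usage sketch for the helper above, assuming FQI is imported from
# mushroom.algorithms.value; the n_iterations value is an arbitrary example.
J = learn(FQI, dict(n_iterations=5, quiet=True))
print('Mean J:', J)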
def test_sarsa_lambda_continuous_linear():
    pi, _, mdp_continuous = initialize()
    mdp_continuous.seed(1)

    n_tilings = 1
    tilings = Tiles.generate(n_tilings, [2, 2],
                             mdp_continuous.info.observation_space.low,
                             mdp_continuous.info.observation_space.high)
    features = Features(tilings=tilings)

    approximator_params = dict(
        input_shape=(features.size,),
        output_shape=(mdp_continuous.info.action_space.n,),
        n_actions=mdp_continuous.info.action_space.n
    )
    agent = SARSALambdaContinuous(LinearApproximator, pi, mdp_continuous.info,
                                  Parameter(.1), .9, features=features,
                                  approximator_params=approximator_params)

    core = Core(agent, mdp_continuous)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_w = np.array([-16.38428419, 0., -14.31250136, 0., -15.68571525, 0.,
                       -10.15663821, 0., -15.0545445, 0., -8.3683605, 0.])

    assert np.allclose(agent.Q.get_weights(), test_w)
def experiment(algorithm_class, decay_exp):
    np.random.seed()

    # MDP
    p = np.load('chain_structure/p.npy')
    rew = np.load('chain_structure/rew.npy')
    mdp = FiniteMDP(p, rew, gamma=.9)

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = ExponentialDecayParameter(value=1., decay_exp=decay_exp,
                                              size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = algorithm_class(pi, mdp.info, **algorithm_params)

    # Algorithm
    collect_Q = CollectQ(agent.approximator)
    callbacks = [collect_Q]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=20000, n_steps_per_fit=1, quiet=True)

    Qs = collect_Q.get_values()

    return Qs
def test_sarsa_lambda_continuous_nn():
    pi, _, mdp_continuous = initialize()
    mdp_continuous.seed(1)

    features = Features(
        n_outputs=mdp_continuous.info.observation_space.shape[0]
    )

    approximator_params = dict(
        input_shape=(features.size,),
        output_shape=(mdp_continuous.info.action_space.n,),
        network=Network,
        n_actions=mdp_continuous.info.action_space.n
    )
    agent = SARSALambdaContinuous(TorchApproximator, pi, mdp_continuous.info,
                                  Parameter(.1), .9, features=features,
                                  approximator_params=approximator_params)

    core = Core(agent, mdp_continuous)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_w = np.array([-0.18968964, 0.4296857, 0.52967095, 0.5674884,
                       -0.12784956, -0.10572472, -0.14546978, -0.67001086,
                       -0.93925357])

    assert np.allclose(agent.Q.get_weights(), test_w)
def test_true_online_sarsa_lambda():
    pi, _, mdp_continuous = initialize()
    mdp_continuous.seed(1)

    n_tilings = 1
    tilings = Tiles.generate(n_tilings, [2, 2],
                             mdp_continuous.info.observation_space.low,
                             mdp_continuous.info.observation_space.high)
    features = Features(tilings=tilings)

    approximator_params = dict(
        input_shape=(features.size,),
        output_shape=(mdp_continuous.info.action_space.n,),
        n_actions=mdp_continuous.info.action_space.n
    )
    agent = TrueOnlineSARSALambda(pi, mdp_continuous.info,
                                  Parameter(.1), .9, features=features,
                                  approximator_params=approximator_params)

    core = Core(agent, mdp_continuous)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_w = np.array([-17.27410736, 0., -15.04386343, 0., -16.6551805, 0.,
                       -11.31383707, 0., -16.11782002, 0., -9.6927357, 0.])

    assert np.allclose(agent.Q.get_weights(), test_w)
def experiment(algorithm_class, decay_exp):
    np.random.seed()

    # MDP
    mdp = GridWorldVanHasselt()

    # Policy
    epsilon = ExponentialDecayParameter(value=1, decay_exp=.5,
                                        size=mdp.info.observation_space.size)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = ExponentialDecayParameter(value=1, decay_exp=decay_exp,
                                              size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = algorithm_class(pi, mdp.info, **algorithm_params)

    # Algorithm
    start = mdp.convert_to_int(mdp._start, mdp._width)
    collect_max_Q = CollectMaxQ(agent.approximator, start)
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset, collect_max_Q]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=10000, n_steps_per_fit=1, quiet=True)

    _, _, reward, _, _, _ = parse_dataset(collect_dataset.get())
    max_Qs = collect_max_Q.get_values()

    return reward, max_Qs
def experiment(boosted):
    np.random.seed(20)

    # MDP
    mdp = CarOnHill()

    # Policy
    epsilon = Parameter(value=1)
    pi = EpsGreedy(epsilon=epsilon)

    # Approximator
    if not boosted:
        approximator_params = dict(
            input_shape=mdp.info.observation_space.shape,
            n_actions=mdp.info.action_space.n,
            n_estimators=50,
            min_samples_split=5,
            min_samples_leaf=2)
    else:
        approximator_params = dict(
            input_shape=mdp.info.observation_space.shape,
            n_actions=mdp.info.action_space.n,
            n_models=3,
            prediction='sum',
            n_estimators=50,
            min_samples_split=5,
            min_samples_leaf=2)
    approximator = ExtraTreesRegressor

    # Agent
    algorithm_params = dict(n_iterations=3, boosted=boosted, quiet=True)
    agent = FQI(approximator, pi, mdp.info,
                approximator_params=approximator_params, **algorithm_params)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=50, n_episodes_per_fit=50, quiet=True)

    # Test
    test_epsilon = Parameter(0)
    agent.policy.set_epsilon(test_epsilon)

    initial_states = np.zeros((9, 2))
    cont = 0
    for i in range(-8, 9, 8):
        for j in range(-8, 9, 8):
            initial_states[cont, :] = [0.125 * i, 0.375 * j]
            cont += 1

    dataset = core.evaluate(initial_states=initial_states, quiet=True)

    return np.mean(compute_J(dataset, mdp.info.gamma))
def learn(alg, alg_params):
    mdp = LQR.generate(dimensions=1)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    approximator_params = dict(input_dim=mdp.info.observation_space.shape)
    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape,
                             params=approximator_params)

    sigma = Regressor(LinearApproximator,
                      input_shape=mdp.info.observation_space.shape,
                      output_shape=mdp.info.action_space.shape,
                      params=approximator_params)

    sigma_weights = 2 * np.ones(sigma.weights_size)
    sigma.set_weights(sigma_weights)

    policy = StateStdGaussianPolicy(approximator, sigma)

    agent = alg(policy, mdp.info, **alg_params)

    core = Core(agent, mdp)

    core.learn(n_episodes=10, n_episodes_per_fit=5)

    return policy
def experiment(policy, value):
    np.random.seed(45)

    # MDP
    mdp = generate_taxi('tests/taxi/grid.txt', rew=(0, 1, 5))

    # Policy
    pi = policy(Parameter(value=value))

    # Agent
    learning_rate = Parameter(value=.15)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = SARSA(pi, mdp.info, **algorithm_params)

    # Algorithm
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset]
    core = Core(agent, mdp, callbacks)

    # Train
    n_steps = 2000
    core.learn(n_steps=n_steps, n_steps_per_fit=1, quiet=True)

    return np.sum(np.array(collect_dataset.get())[:, 2]) / float(n_steps)
def learn(alg):
    mdp = Gym('Pendulum-v0', 200, .99)
    mdp.seed(1)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    # Policy
    policy_class = OrnsteinUhlenbeckPolicy
    policy_params = dict(sigma=np.ones(1) * .2, theta=.15, dt=1e-2)

    # Settings
    initial_replay_size = 500
    max_replay_size = 5000
    batch_size = 200
    n_features = 80
    tau = .001

    # Approximator
    actor_input_shape = mdp.info.observation_space.shape
    actor_params = dict(network=ActorNetwork,
                        n_features=n_features,
                        input_shape=actor_input_shape,
                        output_shape=mdp.info.action_space.shape,
                        use_cuda=False)

    actor_optimizer = {'class': optim.Adam,
                       'params': {'lr': .001}}

    critic_input_shape = (
        actor_input_shape[0] + mdp.info.action_space.shape[0],)
    critic_params = dict(network=CriticNetwork,
                         optimizer={'class': optim.Adam,
                                    'params': {'lr': .001}},
                         loss=F.mse_loss,
                         n_features=n_features,
                         input_shape=critic_input_shape,
                         output_shape=(1,),
                         use_cuda=False)

    # Agent
    agent = alg(mdp.info, policy_class, policy_params, batch_size,
                initial_replay_size, max_replay_size, tau, critic_params,
                actor_params, actor_optimizer)

    # Algorithm
    core = Core(agent, mdp)

    core.learn(n_episodes=10, n_episodes_per_fit=5)

    return agent.policy
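# Hedged example call for the helper above, assuming DDPG is importable from
# mushroom.algorithms.actor_critic and matches the constructor used in learn().
policy = learn(DDPG)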
def test_dataset_utils():
    np.random.seed(88)

    mdp = GridWorld(3, 3, (2, 2))

    epsilon = Parameter(value=0.)
    alpha = Parameter(value=0.)
    pi = EpsGreedy(epsilon=epsilon)
    agent = SARSA(pi, mdp.info, alpha)
    core = Core(agent, mdp)

    dataset = core.evaluate(n_episodes=10)

    J = compute_J(dataset, mdp.info.gamma)
    J_test = np.array([1.16106307e-03, 2.78128389e-01, 1.66771817e+00,
                       3.09031544e-01, 1.19725152e-01, 9.84770902e-01,
                       1.06111661e-02, 2.05891132e+00, 2.28767925e+00,
                       4.23911583e-01])
    assert np.allclose(J, J_test)

    L = episodes_length(dataset)
    L_test = np.array([87, 35, 18, 34, 43, 23, 66, 16, 15, 31])
    assert np.array_equal(L, L_test)

    dataset_ep = select_first_episodes(dataset, 3)
    J = compute_J(dataset_ep, mdp.info.gamma)
    assert np.allclose(J, J_test[:3])

    L = episodes_length(dataset_ep)
    assert np.allclose(L, L_test[:3])

    samples = select_random_samples(dataset, 2)
    s, a, r, ss, ab, last = parse_dataset(samples)
    s_test = np.array([[6.], [1.]])
    a_test = np.array([[0.], [1.]])
    r_test = np.zeros(2)
    ss_test = np.array([[3], [4]])
    ab_test = np.zeros(2)
    last_test = np.zeros(2)
    assert np.array_equal(s, s_test)
    assert np.array_equal(a, a_test)
    assert np.array_equal(r, r_test)
    assert np.array_equal(ss, ss_test)
    assert np.array_equal(ab, ab_test)
    assert np.array_equal(last, last_test)

    index = np.sum(L_test[:2]) + L_test[2] // 2
    min_J, max_J, mean_J, n_episodes = compute_metrics(dataset[:index],
                                                       mdp.info.gamma)
    assert min_J == 0.0
    assert max_J == 0.0011610630703530948
    assert mean_J == 0.0005805315351765474
    assert n_episodes == 2
def experiment(n_epochs, n_episodes):
    np.random.seed()

    # MDP
    n_steps = 5000
    mdp = InvertedPendulum(horizon=n_steps)

    # Agent
    n_tilings = 10
    alpha_theta = Parameter(5e-3 / n_tilings)
    alpha_omega = Parameter(0.5 / n_tilings)
    alpha_v = Parameter(0.5 / n_tilings)
    tilings = Tiles.generate(n_tilings, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high + 1e-3)

    phi = Features(tilings=tilings)

    input_shape = (phi.size,)

    mu = Regressor(LinearApproximator, input_shape=input_shape,
                   output_shape=mdp.info.action_space.shape)

    sigma = 1e-1 * np.eye(1)
    policy = GaussianPolicy(mu, sigma)

    agent = COPDAC_Q(policy, mu, mdp.info,
                     alpha_theta, alpha_omega, alpha_v,
                     value_function_features=phi,
                     policy_features=phi)

    # Train
    dataset_callback = CollectDataset()
    visualization_callback = Display(agent._V, mu,
                                     mdp.info.observation_space.low,
                                     mdp.info.observation_space.high,
                                     phi, phi)
    core = Core(agent, mdp, callbacks=[dataset_callback])

    for i in range(n_epochs):
        core.learn(n_episodes=n_episodes,
                   n_steps_per_fit=1, render=False)
        J = compute_J(dataset_callback.get(), gamma=1.0)
        dataset_callback.clean()
        visualization_callback()
        print('Mean Reward at iteration ' + str(i) + ': ' +
              str(np.sum(J) / n_steps / n_episodes))

    print('Press a button to visualize the pendulum...')
    input()
    sigma = 1e-8 * np.eye(1)
    policy.set_sigma(sigma)
    core.evaluate(n_steps=n_steps, render=True)
def learn(alg, alg_params):
    # MDP
    mdp = CartPole()
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    # Policy
    epsilon_random = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon_random)

    # Approximator
    input_shape = mdp.info.observation_space.shape
    approximator_params = dict(
        network=Network if alg is not CategoricalDQN else FeatureNetwork,
        optimizer={'class': optim.Adam,
                   'params': {'lr': .001}},
        loss=F.smooth_l1_loss,
        input_shape=input_shape,
        output_shape=mdp.info.action_space.size,
        n_actions=mdp.info.action_space.n,
        n_features=2,
        use_cuda=False)

    # Agent
    if alg is not CategoricalDQN:
        agent = alg(TorchApproximator, pi, mdp.info,
                    approximator_params=approximator_params, **alg_params)
    else:
        agent = alg(pi, mdp.info, n_atoms=2, v_min=-1, v_max=1,
                    approximator_params=approximator_params, **alg_params)

    # Algorithm
    core = Core(agent, mdp)

    core.learn(n_steps=500, n_steps_per_fit=5)

    return agent.approximator
def test_sarsa():
    pi, mdp, _ = initialize()
    agent = SARSA(pi, mdp.info, Parameter(.1))

    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_q = np.array([[4.31368701e-2, 3.68037689e-1, 4.14040445e-2,
                        1.64007642e-1],
                       [6.45491436e-1, 4.68559000, 8.07603735e-2,
                        1.67297938e-1],
                       [4.21445838e-2, 3.71538042e-3, 0., 3.439],
                       [0., 0., 0., 0.]])

    assert np.allclose(agent.Q.table, test_q)
def test_q_learning():
    pi, mdp, _ = initialize()
    agent = QLearning(pi, mdp.info, Parameter(.5))

    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_q = np.array([[7.82042542, 8.40151978, 7.64961548, 8.82421875],
                       [8.77587891, 9.921875, 7.29316406, 8.68359375],
                       [7.7203125, 7.69921875, 4.5, 9.84375],
                       [0., 0., 0., 0.]])

    assert np.allclose(agent.Q.table, test_q)
def test_r_learning():
    pi, mdp, _ = initialize()
    agent = RLearning(pi, mdp.info, Parameter(.1), Parameter(.5))

    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_q = np.array([[-6.19137991, -3.9368055, -5.11544257, -3.43673781],
                       [-2.52319391, 1.92201829, -2.77602918, -2.45972955],
                       [-5.38824415, -2.43019918, -1.09965936, 2.04202511],
                       [0., 0., 0., 0.]])

    assert np.allclose(agent.Q.table, test_q)
def test_weighted_q_learning():
    pi, mdp, _ = initialize()
    agent = WeightedQLearning(pi, mdp.info, Parameter(.5))

    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_q = np.array([[7.1592415, 4.07094744, 7.10518702, 8.5467274],
                       [8.08689916, 9.99023438, 5.77871216, 7.51059129],
                       [6.52294537, 0.86087671, 3.70431496, 9.6875],
                       [0., 0., 0., 0.]])

    assert np.allclose(agent.Q.table, test_q)
def test_expected_sarsa():
    pi, mdp, _ = initialize()
    agent = ExpectedSARSA(pi, mdp.info, Parameter(.1))

    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_q = np.array([[0.10221208, 0.48411449, 0.07688765, 0.64002317],
                       [0.58525881, 5.217031, 0.06047094, 0.48214145],
                       [0.08478224, 0.28873536, 0.06543094, 4.68559],
                       [0., 0., 0., 0.]])

    assert np.allclose(agent.Q.table, test_q)
def test_sarsa_lambda_discrete():
    pi, mdp, _ = initialize()
    agent = SARSALambda(pi, mdp.info, Parameter(.1), .9)

    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_q = np.array([[1.88093529, 2.42467354, 1.07390687, 2.39288988],
                       [2.46058746, 4.68559, 1.5661933, 2.56586018],
                       [1.24808966, 0.91948465, 0.47734152, 3.439],
                       [0., 0., 0., 0.]])

    assert np.allclose(agent.Q.table, test_q)
def learn(alg, alg_params):
    class Network(nn.Module):
        def __init__(self, input_shape, output_shape, **kwargs):
            super(Network, self).__init__()

            n_input = input_shape[-1]
            n_output = output_shape[0]

            self._h = nn.Linear(n_input, n_output)

            nn.init.xavier_uniform_(self._h.weight,
                                    gain=nn.init.calculate_gain('relu'))

        def forward(self, state, **kwargs):
            return F.relu(self._h(torch.squeeze(state, 1).float()))

    mdp = Gym('Pendulum-v0', 200, .99)
    mdp.seed(1)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    critic_params = dict(network=Network,
                         optimizer={'class': optim.Adam,
                                    'params': {'lr': 3e-4}},
                         loss=F.mse_loss,
                         input_shape=mdp.info.observation_space.shape,
                         output_shape=(1,))

    policy_params = dict(std_0=1., use_cuda=False)

    policy = GaussianTorchPolicy(Network,
                                 mdp.info.observation_space.shape,
                                 mdp.info.action_space.shape,
                                 **policy_params)

    agent = alg(mdp.info, policy, critic_params, **alg_params)

    core = Core(agent, mdp)

    core.learn(n_episodes=2, n_episodes_per_fit=1)

    return policy
def test_a2c():
    policy_params = dict(std_0=1., n_features=64, use_cuda=False)

    algorithm_params = dict(actor_optimizer={'class': optim.RMSprop,
                                             'params': {'lr': 7e-4,
                                                        'eps': 3e-3}},
                            max_grad_norm=0.5,
                            ent_coeff=0.01)

    mdp = Gym(name='Pendulum-v0', horizon=200, gamma=.99)
    mdp.seed(1)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    critic_params = dict(network=Network,
                         optimizer={'class': optim.RMSprop,
                                    'params': {'lr': 7e-4,
                                               'eps': 1e-5}},
                         loss=F.mse_loss,
                         input_shape=mdp.info.observation_space.shape,
                         output_shape=(1,))

    policy = GaussianTorchPolicy(Network,
                                 mdp.info.observation_space.shape,
                                 mdp.info.action_space.shape,
                                 **policy_params)

    agent = A2C(mdp.info, policy, critic_params, **algorithm_params)

    core = Core(agent, mdp)

    core.learn(n_episodes=10, n_episodes_per_fit=5)

    w = agent.policy.get_weights()
    w_test = np.array([-1.6307759, 1.0356185, -0.34508315, 0.27108294,
                       -0.01047843])

    assert np.allclose(w, w_test)
def experiment(alg, params, n_epochs, n_iterations, ep_per_run):
    np.random.seed()

    # MDP
    mdp = ShipSteering()

    # Policy
    high = [150, 150, np.pi]
    low = [0, 0, -np.pi]
    n_tiles = [5, 5, 6]
    low = np.array(low, dtype=float)
    high = np.array(high, dtype=float)
    n_tilings = 1

    tilings = Tiles.generate(n_tilings=n_tilings, n_tiles=n_tiles, low=low,
                             high=high)

    phi = Features(tilings=tilings)
    input_shape = (phi.size,)

    approximator = Regressor(LinearApproximator, input_shape=input_shape,
                             output_shape=mdp.info.action_space.shape)

    policy = DeterministicPolicy(approximator)

    mu = np.zeros(policy.weights_size)
    sigma = 4e-1 * np.ones(policy.weights_size)
    distribution = GaussianDiagonalDistribution(mu, sigma)

    # Agent
    agent = alg(distribution, policy, mdp.info, features=phi, **params)

    # Train
    print(alg.__name__)
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_run)
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))

    for i in range(n_epochs):
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset_eval = core.evaluate(n_episodes=ep_per_run)
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        print('J at iteration ' + str(i) + ': ' + str(np.mean(J)))
def experiment(alpha):
    gym.logger.setLevel(0)
    np.random.seed(386)

    # MDP
    mdp = Gym(name='MountainCar-v0', horizon=10000, gamma=1.)
    mdp.seed(201)

    # Policy
    epsilon = Parameter(value=0.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = Parameter(alpha)
    tilings = Tiles.generate(10, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high)
    features = Features(tilings=tilings)

    approximator_params = dict(input_shape=(features.size,),
                               output_shape=(mdp.info.action_space.n,),
                               n_actions=mdp.info.action_space.n)
    algorithm_params = {'learning_rate': learning_rate,
                        'lambda_coeff': .9}
    agent = TrueOnlineSARSALambda(pi, mdp.info,
                                  approximator_params=approximator_params,
                                  features=features, **algorithm_params)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=2000, n_steps_per_fit=1, quiet=True)

    # Test
    test_epsilon = Parameter(0.)
    agent.policy.set_epsilon(test_epsilon)

    initial_states = np.array([[0., 0.], [.1, .1]])
    dataset = core.evaluate(initial_states=initial_states, quiet=True)

    return np.mean(compute_J(dataset, 1.))
def experiment():
    np.random.seed()

    # MDP
    mdp = CarOnHill()

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Approximator
    approximator_params = dict(input_shape=mdp.info.observation_space.shape,
                               n_actions=mdp.info.action_space.n,
                               n_estimators=50,
                               min_samples_split=5,
                               min_samples_leaf=2)
    approximator = ExtraTreesRegressor

    # Agent
    algorithm_params = dict(n_iterations=20)
    agent = FQI(approximator, pi, mdp.info,
                approximator_params=approximator_params, **algorithm_params)

    # Algorithm
    core = Core(agent, mdp)

    # Render
    core.evaluate(n_episodes=1, render=True)

    # Train
    core.learn(n_episodes=1000, n_episodes_per_fit=1000)

    # Test
    test_epsilon = Parameter(0.)
    agent.policy.set_epsilon(test_epsilon)

    initial_states = np.zeros((289, 2))
    cont = 0
    for i in range(-8, 9):
        for j in range(-8, 9):
            initial_states[cont, :] = [0.125 * i, 0.375 * j]
            cont += 1

    dataset = core.evaluate(initial_states=initial_states)

    # Render
    core.evaluate(n_episodes=3, render=True)

    return np.mean(compute_J(dataset, mdp.info.gamma))
def experiment():
    np.random.seed()

    # MDP
    mdp = CarOnHill()

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Approximator
    approximator_params = dict(input_shape=mdp.info.observation_space.shape,
                               n_actions=mdp.info.action_space.n,
                               n_estimators=50,
                               min_samples_split=5,
                               min_samples_leaf=2)
    approximator = ExtraTreesRegressor

    # Agent
    algorithm_params = dict(n_iterations=20)
    agent = FQI(approximator, pi, mdp.info,
                approximator_params=approximator_params, **algorithm_params)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=1000, n_episodes_per_fit=1000)

    # Test
    test_epsilon = Parameter(0.)
    agent.policy.set_epsilon(test_epsilon)

    initial_states = np.zeros((289, 2))
    cont = 0
    for i in range(-8, 9):
        for j in range(-8, 9):
            initial_states[cont, :] = [0.125 * i, 0.375 * j]
            cont += 1

    dataset = core.evaluate(initial_states=initial_states)

    return np.mean(compute_J(dataset, mdp.info.gamma))
def experiment():
    np.random.seed()

    # MDP
    mdp = InvertedPendulumDiscrete()

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    basis = [PolynomialBasis()]

    s1 = np.array([-np.pi, 0, np.pi]) * .25
    s2 = np.array([-1, 0, 1])
    for i in s1:
        for j in s2:
            basis.append(GaussianRBF(np.array([i, j]), np.array([1.])))
    features = Features(basis_list=basis)

    fit_params = dict()
    approximator_params = dict(input_shape=(features.size,),
                               output_shape=(mdp.info.action_space.n,),
                               n_actions=mdp.info.action_space.n)
    agent = LSPI(pi, mdp.info, fit_params=fit_params,
                 approximator_params=approximator_params, features=features)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=100, n_episodes_per_fit=100)

    # Test
    test_epsilon = Parameter(0.)
    agent.policy.set_epsilon(test_epsilon)

    dataset = core.evaluate(n_episodes=1, quiet=True)

    return np.mean(episodes_length(dataset))
def test_double_q_learning():
    pi, mdp, _ = initialize()
    agent = DoubleQLearning(pi, mdp.info, Parameter(.5))

    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_q_0 = np.array([[2.6578125, 6.94757812, 3.73359375, 7.171875],
                         [2.25, 7.5, 3.0375, 3.375],
                         [3.0375, 5.4140625, 2.08265625, 8.75],
                         [0., 0., 0., 0.]])
    test_q_1 = np.array([[2.72109375, 4.5, 4.36640625, 6.609375],
                         [4.5, 9.375, 4.49296875, 4.5],
                         [1.0125, 5.0625, 5.625, 8.75],
                         [0., 0., 0., 0.]])

    assert np.allclose(agent.Q[0].table, test_q_0)
    assert np.allclose(agent.Q[1].table, test_q_1)
def experiment(alg, n_epochs, n_iterations, ep_per_run):
    np.random.seed()

    # MDP
    mdp = LQR.generate(dimensions=1)

    approximator_params = dict(input_dim=mdp.info.observation_space.shape)
    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape,
                             params=approximator_params)

    sigma = Regressor(LinearApproximator,
                      input_shape=mdp.info.observation_space.shape,
                      output_shape=mdp.info.action_space.shape,
                      params=approximator_params)

    sigma_weights = 2 * np.ones(sigma.weights_size)
    sigma.set_weights(sigma_weights)

    policy = StateStdGaussianPolicy(approximator, sigma)

    # Agent
    learning_rate = AdaptiveParameter(value=.01)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = alg(policy, mdp.info, **algorithm_params)

    # Train
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_run)
    print('policy parameters: ', policy.get_weights())
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))

    for i in range(n_epochs):
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset_eval = core.evaluate(n_episodes=ep_per_run)
        print('policy parameters: ', policy.get_weights())
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        print('J at iteration ' + str(i) + ': ' + str(np.mean(J)))
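# Illustrative invocation of the policy-gradient experiment above; REINFORCE
# is assumed to be imported from mushroom.algorithms.policy_search, and the
# epoch/iteration counts are example values only.
experiment(REINFORCE, n_epochs=4, n_iterations=10, ep_per_run=25)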
def test_collect_parameter():
    np.random.seed(88)

    mdp = GridWorld(3, 3, (2, 2))

    eps = ExponentialParameter(value=1, exp=.5,
                               size=mdp.info.observation_space.size)
    pi = EpsGreedy(eps)
    alpha = Parameter(0.1)
    agent = SARSA(pi, mdp.info, alpha)

    callback_eps = CollectParameters(eps, 1)

    core = Core(agent, mdp, callbacks=[callback_eps])

    core.learn(n_steps=10, n_steps_per_fit=1, quiet=True)

    eps_test = np.array([1., 0.70710678, 0.70710678, 0.57735027, 0.57735027,
                         0.57735027, 0.57735027, 0.57735027, 0.57735027,
                         0.57735027])
    eps = callback_eps.get()

    assert np.allclose(eps, eps_test)
def experiment():
    np.random.seed()

    # MDP
    mdp = generate_simple_chain(state_n=5, goal_states=[2], prob=.8, rew=1,
                                gamma=.9)

    # Policy
    epsilon = Parameter(value=.15)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = Parameter(value=.2)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = QLearning(pi, mdp.info, **algorithm_params)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=10000, n_steps_per_fit=1)
def test_copdac_q():
    n_steps = 50
    mdp = InvertedPendulum(horizon=n_steps)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    # Agent
    n_tilings = 1
    alpha_theta = Parameter(5e-3 / n_tilings)
    alpha_omega = Parameter(0.5 / n_tilings)
    alpha_v = Parameter(0.5 / n_tilings)
    tilings = Tiles.generate(n_tilings, [2, 2],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high + 1e-3)

    phi = Features(tilings=tilings)

    input_shape = (phi.size,)

    mu = Regressor(LinearApproximator, input_shape=input_shape,
                   output_shape=mdp.info.action_space.shape)

    sigma = 1e-1 * np.eye(1)
    policy = GaussianPolicy(mu, sigma)

    agent = COPDAC_Q(policy, mu, mdp.info,
                     alpha_theta, alpha_omega, alpha_v,
                     value_function_features=phi,
                     policy_features=phi)

    # Train
    core = Core(agent, mdp)

    core.learn(n_episodes=2, n_episodes_per_fit=1)

    w = agent.policy.get_weights()
    w_test = np.array([0, -6.62180045e-7, 0, -4.23972882e-2])

    assert np.allclose(w, w_test)
def flat_experiment(mdp, agent, n_epochs, n_iterations, ep_per_iteration,
                    ep_per_eval):
    np.random.seed()

    J_list = list()
    L_list = list()

    core = Core(agent, mdp)

    dataset = core.evaluate(n_episodes=ep_per_eval, quiet=True)
    J = compute_J(dataset, gamma=mdp.info.gamma)
    J_list.append(np.mean(J))
    L = episodes_length(dataset)
    L_list.append(np.mean(L))

    for n in range(n_epochs):
        core.learn(n_episodes=n_iterations * ep_per_iteration,
                   n_episodes_per_fit=ep_per_iteration, quiet=True)
        dataset = core.evaluate(n_episodes=ep_per_eval, quiet=True)
        J = compute_J(dataset, gamma=mdp.info.gamma)
        J_list.append(np.mean(J))
        L = episodes_length(dataset)
        L_list.append(np.mean(L))
        # print('J', n, ':', J_list[-1])

    return J_list, L_list
def experiment(alpha):
    np.random.seed()

    # MDP
    mdp = Gym(name='MountainCar-v0', horizon=np.inf, gamma=1.)

    # Policy
    epsilon = Parameter(value=0.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    n_tilings = 10
    tilings = Tiles.generate(n_tilings, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high)
    features = Features(tilings=tilings)

    learning_rate = Parameter(alpha / n_tilings)

    approximator_params = dict(input_shape=(features.size,),
                               output_shape=(mdp.info.action_space.n,),
                               n_actions=mdp.info.action_space.n)
    algorithm_params = {'learning_rate': learning_rate,
                        'lambda_coeff': .9}

    agent = TrueOnlineSARSALambda(pi, mdp.info,
                                  approximator_params=approximator_params,
                                  features=features, **algorithm_params)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=40, n_steps_per_fit=1, render=False)
    dataset = core.evaluate(n_episodes=1, render=True)

    return np.mean(compute_J(dataset, 1.))
def experiment(policy, value):
    np.random.seed()

    # MDP
    mdp = generate_taxi('grid.txt')

    # Policy
    pi = policy(Parameter(value=value))

    # Agent
    learning_rate = Parameter(value=.15)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = SARSA(pi, mdp.info, **algorithm_params)

    # Algorithm
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset]
    core = Core(agent, mdp, callbacks)

    # Train
    n_steps = 300000
    core.learn(n_steps=n_steps, n_steps_per_fit=1, quiet=True)

    return np.sum(np.array(collect_dataset.get())[:, 2]) / float(n_steps)
# Imports beyond the two original ones are added here so the snippet runs
# standalone; the module paths follow the mushroom package layout.
import numpy as np
from sklearn.ensemble import ExtraTreesRegressor

from mushroom.algorithms.value import FQI
from mushroom.core import Core
from mushroom.environments import CarOnHill
from mushroom.policy import EpsGreedy
from mushroom.utils.dataset import compute_J
from mushroom.utils.parameters import Parameter

mdp = CarOnHill()

# Policy
epsilon = Parameter(value=1.)
pi = EpsGreedy(epsilon=epsilon)

# Approximator
approximator_params = dict(input_shape=mdp.info.observation_space.shape,
                           n_actions=mdp.info.action_space.n,
                           n_estimators=50,
                           min_samples_split=5,
                           min_samples_leaf=2)
approximator = ExtraTreesRegressor

# Agent
agent = FQI(approximator, pi, mdp.info, n_iterations=20,
            approximator_params=approximator_params)

core = Core(agent, mdp)

core.learn(n_episodes=1000, n_episodes_per_fit=1000)

pi.set_epsilon(Parameter(0.))
initial_state = np.array([[-.5, 0.]])
dataset = core.evaluate(initial_states=initial_state)

print(compute_J(dataset, gamma=mdp.info.gamma))
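# The next snippet (SARSA(lambda) with tile coding) assumes an MDP and an
# epsilon-greedy policy defined earlier in the tutorial; a plausible setup,
# with illustrative values, would be:
mdp = Gym(name='MountainCar-v0', horizon=np.inf, gamma=1.)
epsilon = Parameter(value=0.)
pi = EpsGreedy(epsilon=epsilon)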
# Q-function approximator
n_tilings = 10
tilings = Tiles.generate(n_tilings, [10, 10],
                         mdp.info.observation_space.low,
                         mdp.info.observation_space.high)
features = Features(tilings=tilings)

approximator_params = dict(input_shape=(features.size,),
                           output_shape=(mdp.info.action_space.n,),
                           n_actions=mdp.info.action_space.n)

# Agent
learning_rate = Parameter(.1 / n_tilings)
agent = SARSALambdaContinuous(LinearApproximator, pi, mdp.info,
                              approximator_params=approximator_params,
                              learning_rate=learning_rate,
                              lambda_coeff=.9, features=features)

# Algorithm
collect_dataset = CollectDataset()
callbacks = [collect_dataset]
core = Core(agent, mdp, callbacks=callbacks)

# Train
core.learn(n_episodes=100, n_steps_per_fit=1)

# Evaluate
core.evaluate(n_episodes=1, render=True)
def experiment(n_epochs, n_episodes):
    np.random.seed()

    # MDP
    n_steps = 5000
    mdp = InvertedPendulum(horizon=n_steps)

    # Agent
    n_tilings = 11
    alpha_r = Parameter(.0001)
    alpha_theta = Parameter(.001 / n_tilings)
    alpha_v = Parameter(.1 / n_tilings)
    tilings = Tiles.generate(n_tilings - 1, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high + 1e-3)

    phi = Features(tilings=tilings)

    tilings_v = tilings + Tiles.generate(
        1, [1, 1],
        mdp.info.observation_space.low,
        mdp.info.observation_space.high + 1e-3)
    psi = Features(tilings=tilings_v)

    input_shape = (phi.size,)

    mu = Regressor(LinearApproximator, input_shape=input_shape,
                   output_shape=mdp.info.action_space.shape)

    std = Regressor(LinearApproximator, input_shape=input_shape,
                    output_shape=mdp.info.action_space.shape)

    std_0 = np.sqrt(1.)
    std.set_weights(np.log(std_0) / n_tilings * np.ones(std.weights_size))

    policy = StateLogStdGaussianPolicy(mu, std)

    agent = SAC_AVG(policy, mdp.info,
                    alpha_theta, alpha_v, alpha_r,
                    lambda_par=.5,
                    value_function_features=psi,
                    policy_features=phi)

    # Train
    dataset_callback = CollectDataset()
    display_callback = Display(agent._V, mu, std,
                               mdp.info.observation_space.low,
                               mdp.info.observation_space.high,
                               phi, psi)
    core = Core(agent, mdp, callbacks=[dataset_callback])

    for i in range(n_epochs):
        core.learn(n_episodes=n_episodes,
                   n_steps_per_fit=1, render=False)
        J = compute_J(dataset_callback.get(), gamma=1.)
        dataset_callback.clean()
        display_callback()
        print('Mean Reward at iteration ' + str(i) + ': ' +
              str(np.sum(J) / n_steps / n_episodes))

    print('Press a button to visualize the pendulum...')
    input()
    core.evaluate(n_steps=n_steps, render=True)
def experiment():
    np.random.seed()

    # Argument parser
    parser = argparse.ArgumentParser()

    arg_game = parser.add_argument_group('Game')
    arg_game.add_argument("--name", type=str,
                          default='BreakoutDeterministic-v4',
                          help='Gym ID of the Atari game.')
    arg_game.add_argument("--screen-width", type=int, default=84,
                          help='Width of the game screen.')
    arg_game.add_argument("--screen-height", type=int, default=84,
                          help='Height of the game screen.')

    arg_mem = parser.add_argument_group('Replay Memory')
    arg_mem.add_argument("--initial-replay-size", type=int, default=50000,
                         help='Initial size of the replay memory.')
    arg_mem.add_argument("--max-replay-size", type=int, default=500000,
                         help='Max size of the replay memory.')

    arg_net = parser.add_argument_group('Deep Q-Network')
    arg_net.add_argument("--optimizer",
                         choices=['adadelta', 'adam', 'rmsprop',
                                  'rmspropcentered'],
                         default='adam',
                         help='Name of the optimizer to use.')
    arg_net.add_argument("--learning-rate", type=float, default=.00025,
                         help='Learning rate value of the optimizer.')
    arg_net.add_argument("--decay", type=float, default=.95,
                         help='Discount factor for the history coming from '
                              'the gradient momentum in rmspropcentered and '
                              'rmsprop.')
    arg_net.add_argument("--epsilon", type=float, default=.01,
                         help='Epsilon term used in rmspropcentered and '
                              'rmsprop.')

    arg_alg = parser.add_argument_group('Algorithm')
    arg_alg.add_argument("--algorithm", choices=['dqn', 'ddqn', 'adqn'],
                         default='dqn',
                         help='Name of the algorithm. dqn is for standard '
                              'DQN, ddqn is for Double DQN and adqn is for '
                              'Averaged DQN.')
    arg_alg.add_argument("--n-approximators", type=int, default=1,
                         help="Number of approximators used in the ensemble "
                              "for Averaged DQN.")
    arg_alg.add_argument("--batch-size", type=int, default=32,
                         help='Batch size for each fit of the network.')
    arg_alg.add_argument("--history-length", type=int, default=4,
                         help='Number of frames composing a state.')
    arg_alg.add_argument("--target-update-frequency", type=int, default=10000,
                         help='Number of collected samples before each update '
                              'of the target network.')
    arg_alg.add_argument("--evaluation-frequency", type=int, default=250000,
                         help='Number of collected samples before each '
                              'evaluation. An epoch ends after this number '
                              'of steps.')
    arg_alg.add_argument("--train-frequency", type=int, default=4,
                         help='Number of collected samples before each fit '
                              'of the neural network.')
    arg_alg.add_argument("--max-steps", type=int, default=50000000,
                         help='Total number of collected samples.')
    arg_alg.add_argument("--final-exploration-frame", type=int,
                         default=1000000,
                         help='Number of collected samples until the '
                              'exploration rate stops decreasing.')
    arg_alg.add_argument("--initial-exploration-rate", type=float, default=1.,
                         help='Initial value of the exploration rate.')
    arg_alg.add_argument("--final-exploration-rate", type=float, default=.1,
                         help='Final value of the exploration rate. When it '
                              'reaches this value, it stays constant.')
    arg_alg.add_argument("--test-exploration-rate", type=float, default=.05,
                         help='Exploration rate used during evaluation.')
    arg_alg.add_argument("--test-samples", type=int, default=125000,
                         help='Number of collected samples for each '
                              'evaluation.')
    arg_alg.add_argument("--max-no-op-actions", type=int, default=8,
                         help='Maximum number of no-op actions performed at '
                              'the beginning of the episodes. The minimum '
                              'number is history_length. This number is '
                              'reported to be 30 in the DQN DeepMind paper '
                              'but, since they consider the first 30 frames '
                              'without frame skipping and the number of '
                              'skipped frames is generally 4, we set it '
                              'to 8.')
    arg_alg.add_argument("--no-op-action-value", type=int, default=0,
                         help='Value of the no-op action.')

    arg_utils = parser.add_argument_group('Utils')
    arg_utils.add_argument('--device', type=int, default=None,
                           help='ID of the GPU device to use. If None, CPU '
                                'is used.')
    arg_utils.add_argument('--load-path', type=str,
                           help='Path of the model to be loaded.')
    arg_utils.add_argument('--save', action='store_true',
                           help='Flag specifying whether to save the model.')
    arg_utils.add_argument('--render', action='store_true',
                           help='Flag specifying whether to render the game.')
    arg_utils.add_argument('--quiet', action='store_true',
                           help='Flag specifying whether to hide the '
                                'progress bar.')
    arg_utils.add_argument('--debug', action='store_true',
                           help='Flag specifying whether the script has to '
                                'be run in debug mode.')

    args = parser.parse_args()

    scores = list()

    optimizer = dict()
    if args.optimizer == 'adam':
        optimizer['class'] = optim.Adam
        optimizer['params'] = dict(lr=args.learning_rate)
    elif args.optimizer == 'adadelta':
        optimizer['class'] = optim.Adadelta
        optimizer['params'] = dict(lr=args.learning_rate)
    elif args.optimizer == 'rmsprop':
        optimizer['class'] = optim.RMSprop
        optimizer['params'] = dict(lr=args.learning_rate,
                                   alpha=args.decay,
                                   eps=args.epsilon)
    elif args.optimizer == 'rmspropcentered':
        optimizer['class'] = optim.RMSprop
        optimizer['params'] = dict(lr=args.learning_rate,
                                   alpha=args.decay,
                                   eps=args.epsilon,
                                   centered=True)
    else:
        raise ValueError

    # Evaluation of the model provided by the user.
    if args.load_path:
        # MDP
        mdp = Atari(args.name, args.screen_width, args.screen_height,
                    ends_at_life=False)

        # Policy
        epsilon_test = Parameter(value=args.test_exploration_rate)
        pi = EpsGreedy(epsilon=epsilon_test)

        # Approximator
        input_shape = (args.screen_height, args.screen_width,
                       args.history_length)
        approximator_params = dict(
            network=Network,
            input_shape=input_shape,
            output_shape=(mdp.info.action_space.n,),
            n_actions=mdp.info.action_space.n,
            load_path=args.load_path,
            optimizer=optimizer,
            loss=F.smooth_l1_loss,
            device=args.device
        )

        approximator = PyTorchApproximator

        # Agent
        algorithm_params = dict(
            batch_size=1,
            train_frequency=1,
            target_update_frequency=1,
            initial_replay_size=0,
            max_replay_size=0,
            history_length=args.history_length,
            max_no_op_actions=args.max_no_op_actions,
            no_op_action_value=args.no_op_action_value,
            dtype=np.uint8
        )
        agent = DQN(approximator, pi, mdp.info,
                    approximator_params=approximator_params,
                    **algorithm_params)

        # Algorithm
        core_test = Core(agent, mdp)

        # Evaluate model
        pi.set_epsilon(epsilon_test)
        dataset = core_test.evaluate(n_steps=args.test_samples,
                                     render=args.render,
                                     quiet=args.quiet)
        get_stats(dataset)
    else:  # DQN learning run
        # Summary folder
        folder_name = './logs/atari_' + args.algorithm + '_' + args.name +\
            '_' + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
        pathlib.Path(folder_name).mkdir(parents=True)

        # Settings
        if args.debug:
            initial_replay_size = 50
            max_replay_size = 500
            train_frequency = 5
            target_update_frequency = 10
            test_samples = 20
            evaluation_frequency = 50
            max_steps = 1000
        else:
            initial_replay_size = args.initial_replay_size
            max_replay_size = args.max_replay_size
            train_frequency = args.train_frequency
            target_update_frequency = args.target_update_frequency
            test_samples = args.test_samples
            evaluation_frequency = args.evaluation_frequency
            max_steps = args.max_steps

        # MDP
        mdp = Atari(args.name, args.screen_width, args.screen_height,
                    ends_at_life=True)

        # Policy
        epsilon = LinearDecayParameter(value=args.initial_exploration_rate,
                                       min_value=args.final_exploration_rate,
                                       n=args.final_exploration_frame)
        epsilon_test = Parameter(value=args.test_exploration_rate)
        epsilon_random = Parameter(value=1)
        pi = EpsGreedy(epsilon=epsilon_random)

        # Approximator
        input_shape = (args.screen_height, args.screen_width,
                       args.history_length)
        approximator_params = dict(
            network=Network,
            input_shape=input_shape,
            output_shape=(mdp.info.action_space.n,),
            n_actions=mdp.info.action_space.n,
            folder_name=folder_name,
            optimizer=optimizer,
            loss=F.smooth_l1_loss,
            device=args.device
        )

        approximator = PyTorchApproximator

        # Agent
        algorithm_params = dict(
            batch_size=args.batch_size,
            n_approximators=args.n_approximators,
            initial_replay_size=initial_replay_size,
            max_replay_size=max_replay_size,
            history_length=args.history_length,
            target_update_frequency=target_update_frequency // train_frequency,
            max_no_op_actions=args.max_no_op_actions,
            no_op_action_value=args.no_op_action_value,
            dtype=np.uint8
        )

        if args.algorithm == 'dqn':
            agent = DQN(approximator, pi, mdp.info,
                        approximator_params=approximator_params,
                        **algorithm_params)
        elif args.algorithm == 'ddqn':
            agent = DoubleDQN(approximator, pi, mdp.info,
                              approximator_params=approximator_params,
                              **algorithm_params)
        elif args.algorithm == 'adqn':
            agent = AveragedDQN(approximator, pi, mdp.info,
                                approximator_params=approximator_params,
                                **algorithm_params)

        # Algorithm
        core = Core(agent, mdp)

        # RUN

        # Fill replay memory with random dataset
        print_epoch(0)
        core.learn(n_steps=initial_replay_size,
                   n_steps_per_fit=initial_replay_size, quiet=args.quiet)

        if args.save:
            agent.approximator.model.save()

        # Evaluate initial policy
        pi.set_epsilon(epsilon_test)
        if args.algorithm == 'ddqn':
            agent.policy.set_q(agent.target_approximator)
        mdp.set_episode_end(False)
        dataset = core.evaluate(n_steps=test_samples, render=args.render,
                                quiet=args.quiet)
        scores.append(get_stats(dataset))
        if args.algorithm == 'ddqn':
            agent.policy.set_q(agent.approximator)

        np.save(folder_name + '/scores.npy', scores)
        for n_epoch in range(1, max_steps // evaluation_frequency + 1):
            print_epoch(n_epoch)
            print('- Learning:')
            # learning step
            pi.set_epsilon(epsilon)
            mdp.set_episode_end(True)
            core.learn(n_steps=evaluation_frequency,
                       n_steps_per_fit=train_frequency, quiet=args.quiet)

            if args.save:
                agent.approximator.model.save()

            print('- Evaluation:')
            # evaluation step
            pi.set_epsilon(epsilon_test)
            if args.algorithm == 'ddqn':
                agent.policy.set_q(agent.target_approximator)
            mdp.set_episode_end(False)
            dataset = core.evaluate(n_steps=test_samples, render=args.render,
                                    quiet=args.quiet)
            scores.append(get_stats(dataset))
            if args.algorithm == 'ddqn':
                agent.policy.set_q(agent.approximator)

            np.save(folder_name + '/scores.npy', scores)

    return scores
def experiment(n_epochs, n_steps, n_steps_test):
    np.random.seed()

    # MDP
    horizon = 1000
    gamma = 0.99
    gamma_eval = 1.
    mdp = Gym('Acrobot-v1', horizon, gamma)

    # Policy
    epsilon = LinearDecayParameter(value=1., min_value=.01, n=5000)
    epsilon_test = Parameter(value=0.)
    epsilon_random = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon_random)

    # Settings
    initial_replay_size = 500
    max_replay_size = 5000
    target_update_frequency = 100
    batch_size = 200
    n_features = 80
    train_frequency = 1

    # Approximator
    input_shape = (1,) + mdp.info.observation_space.shape
    approximator_params = dict(network=Network,
                               optimizer={'class': optim.Adam,
                                          'params': {'lr': .001}},
                               loss=F.smooth_l1_loss,
                               n_features=n_features,
                               input_shape=input_shape,
                               output_shape=mdp.info.action_space.size,
                               n_actions=mdp.info.action_space.n)

    # Agent
    agent = DQN(PyTorchApproximator, pi, mdp.info,
                approximator_params=approximator_params,
                batch_size=batch_size,
                n_approximators=1,
                initial_replay_size=initial_replay_size,
                max_replay_size=max_replay_size,
                history_length=1,
                target_update_frequency=target_update_frequency,
                max_no_op_actions=0,
                no_op_action_value=0,
                dtype=np.float32)

    # Algorithm
    core = Core(agent, mdp)

    core.learn(n_steps=initial_replay_size,
               n_steps_per_fit=initial_replay_size)

    # RUN
    pi.set_epsilon(epsilon_test)
    dataset = core.evaluate(n_steps=n_steps_test, render=False)
    J = compute_J(dataset, gamma_eval)
    print('J: ', np.mean(J))

    for n in range(n_epochs):
        print('Epoch: ', n)
        pi.set_epsilon(epsilon)
        core.learn(n_steps=n_steps, n_steps_per_fit=train_frequency)
        pi.set_epsilon(epsilon_test)
        dataset = core.evaluate(n_steps=n_steps_test, render=False)
        J = compute_J(dataset, gamma_eval)
        print('J: ', np.mean(J))

    print('Press a button to visualize acrobot')
    input()
    core.evaluate(n_episodes=5, render=True)
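# Example entry point for the script above; the epoch and step counts are
# illustrative values, not taken from the original.
if __name__ == '__main__':
    experiment(n_epochs=20, n_steps=1000, n_steps_test=2000)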