Example #1
def experiment(alg, params, n_epochs, fit_per_run, ep_per_run):
    np.random.seed()

    # MDP
    mdp = LQR.generate(dimensions=1)

    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    policy = DeterministicPolicy(mu=approximator)

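    # Search distribution over the policy weights (black-box policy search over parameters)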
    mu = np.zeros(policy.weights_size)
    sigma = 1e-3 * np.eye(policy.weights_size)
    distribution = GaussianCholeskyDistribution(mu, sigma)

    # Agent
    agent = alg(distribution, policy, mdp.info, **params)

    # Train
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_run)
    print('distribution parameters: ', distribution.get_parameters())
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))

    for i in range(n_epochs):
        core.learn(n_episodes=fit_per_run * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset_eval = core.evaluate(n_episodes=ep_per_run)
        print('distribution parameters: ', distribution.get_parameters())
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        print('J at iteration ' + str(i) + ': ' + str(np.mean(J)))
Example #2
def test_lspi():
    mdp = CartPole()
    np.random.seed(1)

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
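    # A single polynomial basis feature for the linear Q-function fitted by LSPI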
    basis = [PolynomialBasis()]
    features = Features(basis_list=basis)

    approximator_params = dict(input_shape=(features.size, ),
                               output_shape=(mdp.info.action_space.n, ),
                               n_actions=mdp.info.action_space.n)
    agent = LSPI(pi,
                 mdp.info,
                 fit_params=dict(),
                 approximator_params=approximator_params,
                 features=features)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=100, n_episodes_per_fit=100)

    w = agent.approximator.get_weights()
    w_test = np.array([-2.23880597, -2.27427603, -2.25])

    assert np.allclose(w, w_test)
Example #3
def experiment():
    np.random.seed()

    # MDP
    mdp = generate_simple_chain(state_n=5,
                                goal_states=[2],
                                prob=.8,
                                rew=1,
                                gamma=.9)

    # Policy
    epsilon = Parameter(value=.15)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = Parameter(value=.2)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = QLearning(pi, mdp.info, **algorithm_params)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=10000, n_steps_per_fit=1)
Example #4
def test_collect_Q():
    np.random.seed(88)
    mdp = GridWorld(3, 3, (2, 2))

    eps = Parameter(0.1)
    pi = EpsGreedy(eps)
    alpha = Parameter(0.1)
    agent = SARSA(pi, mdp.info, alpha)

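    # Callbacks collecting the full Q-table and the maximum Q-value of state 2 during learning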
    callback_q = CollectQ(agent.Q)
    callback_max_q = CollectMaxQ(agent.Q, np.array([2]))

    core = Core(agent, mdp, callbacks=[callback_q, callback_max_q])

    core.learn(n_steps=1000, n_steps_per_fit=1, quiet=True)

    V_test = np.array([2.4477574, 0.02246188, 1.6210059, 6.01867052])
    V = callback_q.get()[-1]

    assert np.allclose(V[0, :], V_test)

    V_max = np.array([np.max(x[2, :], axis=-1) for x in callback_q.get()])
    max_q = np.array(callback_max_q.get())

    assert np.allclose(V_max, max_q)
Example #5
def learn(alg, alg_params):
    mdp = CarOnHill()
    np.random.seed(1)

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Approximator
    approximator_params = dict(input_shape=mdp.info.observation_space.shape,
                               n_actions=mdp.info.action_space.n,
                               n_estimators=50,
                               min_samples_split=5,
                               min_samples_leaf=2)
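    # scikit-learn extra-trees ensemble used as the action-value approximator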
    approximator = ExtraTreesRegressor

    # Agent
    agent = alg(approximator, pi, mdp.info,
                approximator_params=approximator_params, **alg_params)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=5, n_episodes_per_fit=5)

    test_epsilon = Parameter(0.75)
    agent.policy.set_epsilon(test_epsilon)
    dataset = core.evaluate(n_episodes=2)

    return np.mean(compute_J(dataset, mdp.info.gamma))
Example #6
def test_sarsa_lambda_continuous_linear():
    pi, _, mdp_continuous = initialize()
    mdp_continuous.seed(1)
    n_tilings = 1
    tilings = Tiles.generate(n_tilings, [2, 2],
                             mdp_continuous.info.observation_space.low,
                             mdp_continuous.info.observation_space.high)
    features = Features(tilings=tilings)

    approximator_params = dict(
        input_shape=(features.size,),
        output_shape=(mdp_continuous.info.action_space.n,),
        n_actions=mdp_continuous.info.action_space.n
    )
    agent = SARSALambdaContinuous(LinearApproximator, pi, mdp_continuous.info,
                                  Parameter(.1), .9, features=features,
                                  approximator_params=approximator_params)

    core = Core(agent, mdp_continuous)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_w = np.array([-16.38428419, 0., -14.31250136, 0., -15.68571525, 0.,
                       -10.15663821, 0., -15.0545445, 0., -8.3683605, 0.])

    assert np.allclose(agent.Q.get_weights(), test_w)
Example #7
def experiment(algorithm_class, decay_exp):
    np.random.seed()

    # MDP
    p = np.load('chain_structure/p.npy')
    rew = np.load('chain_structure/rew.npy')
    mdp = FiniteMDP(p, rew, gamma=.9)

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = ExponentialDecayParameter(value=1., decay_exp=decay_exp,
                                              size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = algorithm_class(pi, mdp.info, **algorithm_params)

    # Algorithm
    collect_Q = CollectQ(agent.approximator)
    callbacks = [collect_Q]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=20000, n_steps_per_fit=1, quiet=True)

    Qs = collect_Q.get_values()

    return Qs
Example #8
def test_sarsa_lambda_continuous_nn():
    pi, _, mdp_continuous = initialize()
    mdp_continuous.seed(1)

    features = Features(
        n_outputs=mdp_continuous.info.observation_space.shape[0]
    )

    approximator_params = dict(
        input_shape=(features.size,),
        output_shape=(mdp_continuous.info.action_space.n,),
        network=Network,
        n_actions=mdp_continuous.info.action_space.n
    )
    agent = SARSALambdaContinuous(TorchApproximator, pi, mdp_continuous.info,
                                  Parameter(.1), .9, features=features,
                                  approximator_params=approximator_params)

    core = Core(agent, mdp_continuous)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_w = np.array([-0.18968964, 0.4296857, 0.52967095, 0.5674884,
                       -0.12784956, -0.10572472, -0.14546978, -0.67001086,
                       -0.93925357])

    assert np.allclose(agent.Q.get_weights(), test_w)
Example #9
def experiment(algorithm_class, decay_exp):
    np.random.seed()

    # MDP
    p = np.load('chain_structure/p.npy')
    rew = np.load('chain_structure/rew.npy')
    mdp = FiniteMDP(p, rew, gamma=.9)

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = ExponentialDecayParameter(value=1.,
                                              decay_exp=decay_exp,
                                              size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = algorithm_class(pi, mdp.info, **algorithm_params)

    # Algorithm
    collect_Q = CollectQ(agent.approximator)
    callbacks = [collect_Q]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=20000, n_steps_per_fit=1, quiet=True)

    Qs = collect_Q.get_values()

    return Qs
Example #10
def test_true_online_sarsa_lambda():
    pi, _, mdp_continuous = initialize()
    mdp_continuous.seed(1)
    n_tilings = 1
    tilings = Tiles.generate(n_tilings, [2, 2],
                             mdp_continuous.info.observation_space.low,
                             mdp_continuous.info.observation_space.high)
    features = Features(tilings=tilings)

    approximator_params = dict(
        input_shape=(features.size,),
        output_shape=(mdp_continuous.info.action_space.n,),
        n_actions=mdp_continuous.info.action_space.n
    )
    agent = TrueOnlineSARSALambda(pi, mdp_continuous.info,
                                  Parameter(.1), .9, features=features,
                                  approximator_params=approximator_params)

    core = Core(agent, mdp_continuous)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_w = np.array([-17.27410736, 0., -15.04386343, 0., -16.6551805, 0.,
                       -11.31383707, 0., -16.11782002, 0., -9.6927357, 0.])

    assert np.allclose(agent.Q.get_weights(), test_w)
Example #11
def experiment(algorithm_class, decay_exp):
    np.random.seed()

    # MDP
    mdp = GridWorldVanHasselt()

    # Policy
    epsilon = ExponentialDecayParameter(value=1, decay_exp=.5,
                                        size=mdp.info.observation_space.size)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = ExponentialDecayParameter(value=1, decay_exp=decay_exp,
                                              size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = algorithm_class(pi, mdp.info, **algorithm_params)

    # Algorithm
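    # Track the maximum Q-value of the initial state during learning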
    start = mdp.convert_to_int(mdp._start, mdp._width)
    collect_max_Q = CollectMaxQ(agent.approximator, start)
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset, collect_max_Q]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=10000, n_steps_per_fit=1, quiet=True)

    _, _, reward, _, _, _ = parse_dataset(collect_dataset.get())
    max_Qs = collect_max_Q.get_values()

    return reward, max_Qs
Example #12
def experiment(boosted):
    np.random.seed(20)

    # MDP
    mdp = CarOnHill()

    # Policy
    epsilon = Parameter(value=1)
    pi = EpsGreedy(epsilon=epsilon)

    # Approximator
    if not boosted:
        approximator_params = dict(
            input_shape=mdp.info.observation_space.shape,
            n_actions=mdp.info.action_space.n,
            n_estimators=50,
            min_samples_split=5,
            min_samples_leaf=2)
    else:
        approximator_params = dict(
            input_shape=mdp.info.observation_space.shape,
            n_actions=mdp.info.action_space.n,
            n_models=3,
            prediction='sum',
            n_estimators=50,
            min_samples_split=5,
            min_samples_leaf=2)

    approximator = ExtraTreesRegressor

    # Agent
    algorithm_params = dict(n_iterations=3, boosted=boosted, quiet=True)
    agent = FQI(approximator, pi, mdp.info,
                approximator_params=approximator_params, **algorithm_params)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=50, n_episodes_per_fit=50, quiet=True)

    # Test
    test_epsilon = Parameter(0)
    agent.policy.set_epsilon(test_epsilon)

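    # 3x3 grid of initial (position, velocity) states used for evaluation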
    initial_states = np.zeros((9, 2))
    cont = 0
    for i in range(-8, 9, 8):
        for j in range(-8, 9, 8):
            initial_states[cont, :] = [0.125 * i, 0.375 * j]
            cont += 1

    dataset = core.evaluate(initial_states=initial_states, quiet=True)

    return np.mean(compute_J(dataset, mdp.info.gamma))
Example #13
def learn(alg, alg_params):
    mdp = LQR.generate(dimensions=1)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    approximator_params = dict(input_dim=mdp.info.observation_space.shape)
    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape,
                             params=approximator_params)

    sigma = Regressor(LinearApproximator,
                      input_shape=mdp.info.observation_space.shape,
                      output_shape=mdp.info.action_space.shape,
                      params=approximator_params)

    sigma_weights = 2 * np.ones(sigma.weights_size)
    sigma.set_weights(sigma_weights)

    policy = StateStdGaussianPolicy(approximator, sigma)

    agent = alg(policy, mdp.info, **alg_params)

    core = Core(agent, mdp)

    core.learn(n_episodes=10, n_episodes_per_fit=5)

    return policy
Example #14
def experiment(policy, value):
    np.random.seed(45)

    # MDP
    mdp = generate_taxi('tests/taxi/grid.txt', rew=(0, 1, 5))

    # Policy
    pi = policy(Parameter(value=value))

    # Agent
    learning_rate = Parameter(value=.15)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = SARSA(pi, mdp.info, **algorithm_params)

    # Algorithm
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset]
    core = Core(agent, mdp, callbacks)

    # Train
    n_steps = 2000
    core.learn(n_steps=n_steps, n_steps_per_fit=1, quiet=True)

    return np.sum(np.array(collect_dataset.get())[:, 2]) / float(n_steps)
Example #15
def learn(alg):
    mdp = Gym('Pendulum-v0', 200, .99)
    mdp.seed(1)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    # Policy
    policy_class = OrnsteinUhlenbeckPolicy
    policy_params = dict(sigma=np.ones(1) * .2, theta=.15, dt=1e-2)

    # Settings
    initial_replay_size = 500
    max_replay_size = 5000
    batch_size = 200
    n_features = 80
    tau = .001

    # Approximator
    actor_input_shape = mdp.info.observation_space.shape
    actor_params = dict(network=ActorNetwork,
                        n_features=n_features,
                        input_shape=actor_input_shape,
                        output_shape=mdp.info.action_space.shape,
                        use_cuda=False)

    actor_optimizer = {'class': optim.Adam, 'params': {'lr': .001}}

    critic_input_shape = (actor_input_shape[0] +
                          mdp.info.action_space.shape[0], )
    critic_params = dict(network=CriticNetwork,
                         optimizer={
                             'class': optim.Adam,
                             'params': {
                                 'lr': .001
                             }
                         },
                         loss=F.mse_loss,
                         n_features=n_features,
                         input_shape=critic_input_shape,
                         output_shape=(1, ),
                         use_cuda=False)

    # Agent
    agent = alg(mdp.info, policy_class, policy_params, batch_size,
                initial_replay_size, max_replay_size, tau, critic_params,
                actor_params, actor_optimizer)

    # Algorithm
    core = Core(agent, mdp)

    core.learn(n_episodes=10, n_episodes_per_fit=5)

    return agent.policy
Example #16
def test_dataset_utils():
    np.random.seed(88)

    mdp = GridWorld(3, 3, (2, 2))
    epsilon = Parameter(value=0.)
    alpha = Parameter(value=0.)
    pi = EpsGreedy(epsilon=epsilon)

    agent = SARSA(pi, mdp.info, alpha)
    core = Core(agent, mdp)

    dataset = core.evaluate(n_episodes=10)

    J = compute_J(dataset, mdp.info.gamma)
    J_test = np.array([
        1.16106307e-03, 2.78128389e-01, 1.66771817e+00, 3.09031544e-01,
        1.19725152e-01, 9.84770902e-01, 1.06111661e-02, 2.05891132e+00,
        2.28767925e+00, 4.23911583e-01
    ])
    assert np.allclose(J, J_test)

    L = episodes_length(dataset)
    L_test = np.array([87, 35, 18, 34, 43, 23, 66, 16, 15, 31])
    assert np.array_equal(L, L_test)

    dataset_ep = select_first_episodes(dataset, 3)
    J = compute_J(dataset_ep, mdp.info.gamma)
    assert np.allclose(J, J_test[:3])

    L = episodes_length(dataset_ep)
    assert np.allclose(L, L_test[:3])

    samples = select_random_samples(dataset, 2)
    s, a, r, ss, ab, last = parse_dataset(samples)
    s_test = np.array([[6.], [1.]])
    a_test = np.array([[0.], [1.]])
    r_test = np.zeros(2)
    ss_test = np.array([[3], [4]])
    ab_test = np.zeros(2)
    last_test = np.zeros(2)
    assert np.array_equal(s, s_test)
    assert np.array_equal(a, a_test)
    assert np.array_equal(r, r_test)
    assert np.array_equal(ss, ss_test)
    assert np.array_equal(ab, ab_test)
    assert np.array_equal(last, last_test)

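    # Keep the first two full episodes plus half of the third one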
    index = np.sum(L_test[:2]) + L_test[2] // 2
    min_J, max_J, mean_J, n_episodes = compute_metrics(dataset[:index],
                                                       mdp.info.gamma)
    assert min_J == 0.0
    assert max_J == 0.0011610630703530948
    assert mean_J == 0.0005805315351765474
    assert n_episodes == 2
Example #17
def experiment(n_epochs, n_episodes):
    np.random.seed()

    # MDP
    n_steps = 5000
    mdp = InvertedPendulum(horizon=n_steps)

    # Agent
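    # Separate step sizes for the policy parameters, the advantage weights and the value function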
    n_tilings = 10
    alpha_theta = Parameter(5e-3 / n_tilings)
    alpha_omega = Parameter(0.5 / n_tilings)
    alpha_v = Parameter(0.5 / n_tilings)
    tilings = Tiles.generate(n_tilings, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high + 1e-3)

    phi = Features(tilings=tilings)

    input_shape = (phi.size,)

    mu = Regressor(LinearApproximator, input_shape=input_shape,
                   output_shape=mdp.info.action_space.shape)

    sigma = 1e-1 * np.eye(1)
    policy = GaussianPolicy(mu, sigma)

    agent = COPDAC_Q(policy, mu, mdp.info,
                     alpha_theta, alpha_omega, alpha_v,
                     value_function_features=phi,
                     policy_features=phi)

    # Train
    dataset_callback = CollectDataset()
    visualization_callback = Display(agent._V, mu,
                                     mdp.info.observation_space.low,
                                     mdp.info.observation_space.high,
                                     phi, phi)
    core = Core(agent, mdp, callbacks=[dataset_callback])

    for i in range(n_epochs):
        core.learn(n_episodes=n_episodes,
                   n_steps_per_fit=1, render=False)
        J = compute_J(dataset_callback.get(), gamma=1.0)
        dataset_callback.clean()
        visualization_callback()
        print('Mean Reward at iteration ' + str(i) + ': ' +
              str(np.sum(J) / n_steps / n_episodes))

    print('Press a button to visualize the pendulum...')
    input()
    sigma = 1e-8 * np.eye(1)
    policy.set_sigma(sigma)
    core.evaluate(n_steps=n_steps, render=True)
Example #18
def learn(alg, alg_params):
    # MDP
    mdp = CartPole()
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    # Policy
    epsilon_random = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon_random)

    # Approximator
    input_shape = mdp.info.observation_space.shape
    approximator_params = dict(
        network=Network if alg is not CategoricalDQN else FeatureNetwork,
        optimizer={
            'class': optim.Adam,
            'params': {
                'lr': .001
            }
        },
        loss=F.smooth_l1_loss,
        input_shape=input_shape,
        output_shape=mdp.info.action_space.size,
        n_actions=mdp.info.action_space.n,
        n_features=2,
        use_cuda=False)

    # Agent
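    # CategoricalDQN is constructed differently: it takes the support of the value distribution (n_atoms, v_min, v_max)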
    if alg is not CategoricalDQN:
        agent = alg(TorchApproximator,
                    pi,
                    mdp.info,
                    approximator_params=approximator_params,
                    **alg_params)
    else:
        agent = alg(pi,
                    mdp.info,
                    n_atoms=2,
                    v_min=-1,
                    v_max=1,
                    approximator_params=approximator_params,
                    **alg_params)

    # Algorithm
    core = Core(agent, mdp)

    core.learn(n_steps=500, n_steps_per_fit=5)

    return agent.approximator
Example #19
def test_sarsa():
    pi, mdp, _ = initialize()
    agent = SARSA(pi, mdp.info, Parameter(.1))

    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_q = np.array([[4.31368701e-2, 3.68037689e-1, 4.14040445e-2, 1.64007642e-1],
                       [6.45491436e-1, 4.68559000, 8.07603735e-2, 1.67297938e-1],
                       [4.21445838e-2, 3.71538042e-3, 0., 3.439],
                       [0., 0., 0., 0.]])

    assert np.allclose(agent.Q.table, test_q)
Example #20
def test_q_learning():
    pi, mdp, _ = initialize()
    agent = QLearning(pi, mdp.info, Parameter(.5))

    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_q = np.array([[7.82042542, 8.40151978, 7.64961548, 8.82421875],
                       [8.77587891, 9.921875, 7.29316406, 8.68359375],
                       [7.7203125, 7.69921875, 4.5, 9.84375],
                       [0., 0., 0., 0.]])

    assert np.allclose(agent.Q.table, test_q)
Example #21
def test_r_learning():
    pi, mdp, _ = initialize()
    agent = RLearning(pi, mdp.info, Parameter(.1), Parameter(.5))

    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_q = np.array([[-6.19137991, -3.9368055, -5.11544257, -3.43673781],
                       [-2.52319391, 1.92201829, -2.77602918, -2.45972955],
                       [-5.38824415, -2.43019918, -1.09965936, 2.04202511],
                       [0., 0., 0., 0.]])

    assert np.allclose(agent.Q.table, test_q)
Example #22
def test_weighted_q_learning():
    pi, mdp, _ = initialize()
    agent = WeightedQLearning(pi, mdp.info, Parameter(.5))

    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_q = np.array([[7.1592415, 4.07094744, 7.10518702, 8.5467274],
                       [8.08689916, 9.99023438, 5.77871216, 7.51059129],
                       [6.52294537, 0.86087671, 3.70431496, 9.6875],
                       [0., 0., 0., 0.]])

    assert np.allclose(agent.Q.table, test_q)
Example #23
def test_expected_sarsa():
    pi, mdp, _ = initialize()
    agent = ExpectedSARSA(pi, mdp.info, Parameter(.1))

    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_q = np.array([[0.10221208, 0.48411449, 0.07688765, 0.64002317],
                       [0.58525881, 5.217031, 0.06047094, 0.48214145],
                       [0.08478224, 0.28873536, 0.06543094, 4.68559],
                       [0., 0., 0., 0.]])

    assert np.allclose(agent.Q.table, test_q)
Example #24
def test_sarsa_lambda_discrete():
    pi, mdp, _ = initialize()
    agent = SARSALambda(pi, mdp.info, Parameter(.1), .9)

    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_q = np.array([[1.88093529, 2.42467354, 1.07390687, 2.39288988],
                       [2.46058746, 4.68559, 1.5661933, 2.56586018],
                       [1.24808966, 0.91948465, 0.47734152, 3.439],
                       [0., 0., 0., 0.]])

    assert np.allclose(agent.Q.table, test_q)
Example #25
def learn(alg, alg_params):
    class Network(nn.Module):
        def __init__(self, input_shape, output_shape, **kwargs):
            super(Network, self).__init__()

            n_input = input_shape[-1]
            n_output = output_shape[0]

            self._h = nn.Linear(n_input, n_output)

            nn.init.xavier_uniform_(self._h.weight,
                                    gain=nn.init.calculate_gain('relu'))

        def forward(self, state, **kwargs):
            return F.relu(self._h(torch.squeeze(state, 1).float()))

    mdp = Gym('Pendulum-v0', 200, .99)
    mdp.seed(1)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    critic_params = dict(network=Network,
                         optimizer={
                             'class': optim.Adam,
                             'params': {
                                 'lr': 3e-4
                             }
                         },
                         loss=F.mse_loss,
                         input_shape=mdp.info.observation_space.shape,
                         output_shape=(1, ))

    policy_params = dict(std_0=1., use_cuda=False)

    policy = GaussianTorchPolicy(Network, mdp.info.observation_space.shape,
                                 mdp.info.action_space.shape, **policy_params)

    agent = alg(mdp.info, policy, critic_params, **alg_params)

    core = Core(agent, mdp)

    core.learn(n_episodes=2, n_episodes_per_fit=1)

    return policy
Example #26
def test_a2c():
    policy_params = dict(std_0=1., n_features=64, use_cuda=False)

    algorithm_params = dict(actor_optimizer={
        'class': optim.RMSprop,
        'params': {
            'lr': 7e-4,
            'eps': 3e-3
        }
    },
                            max_grad_norm=0.5,
                            ent_coeff=0.01)

    mdp = Gym(name='Pendulum-v0', horizon=200, gamma=.99)
    mdp.seed(1)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    critic_params = dict(network=Network,
                         optimizer={
                             'class': optim.RMSprop,
                             'params': {
                                 'lr': 7e-4,
                                 'eps': 1e-5
                             }
                         },
                         loss=F.mse_loss,
                         input_shape=mdp.info.observation_space.shape,
                         output_shape=(1, ))

    policy = GaussianTorchPolicy(Network, mdp.info.observation_space.shape,
                                 mdp.info.action_space.shape, **policy_params)

    agent = A2C(mdp.info, policy, critic_params, **algorithm_params)

    core = Core(agent, mdp)

    core.learn(n_episodes=10, n_episodes_per_fit=5)

    w = agent.policy.get_weights()
    w_test = np.array(
        [-1.6307759, 1.0356185, -0.34508315, 0.27108294, -0.01047843])

    assert np.allclose(w, w_test)
Example #27
def experiment(alg, params, n_epochs, n_iterations, ep_per_run):
    np.random.seed()

    # MDP
    mdp = ShipSteering()

    # Policy
    high = [150, 150, np.pi]
    low = [0, 0, -np.pi]
    n_tiles = [5, 5, 6]
    low = np.array(low, dtype=float)
    high = np.array(high, dtype=float)
    n_tilings = 1

    tilings = Tiles.generate(n_tilings=n_tilings, n_tiles=n_tiles, low=low,
                             high=high)

    phi = Features(tilings=tilings)
    input_shape = (phi.size,)

    approximator = Regressor(LinearApproximator, input_shape=input_shape,
                             output_shape=mdp.info.action_space.shape)

    policy = DeterministicPolicy(approximator)

    mu = np.zeros(policy.weights_size)
    sigma = 4e-1 * np.ones(policy.weights_size)
    distribution = GaussianDiagonalDistribution(mu, sigma)

    # Agent
    agent = alg(distribution, policy, mdp.info, features=phi, **params)

    # Train
    print(alg.__name__)
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_run)
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))

    for i in range(n_epochs):
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset_eval = core.evaluate(n_episodes=ep_per_run)
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        print('J at iteration ' + str(i) + ': ' + str(np.mean(J)))
Example #28
def experiment(alpha):
    gym.logger.setLevel(0)
    np.random.seed(386)

    # MDP
    mdp = Gym(name='MountainCar-v0', horizon=10000, gamma=1.)
    mdp.seed(201)

    # Policy
    epsilon = Parameter(value=0.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = Parameter(alpha)
    tilings = Tiles.generate(10, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high)
    features = Features(tilings=tilings)

    approximator_params = dict(input_shape=(features.size,),
                               output_shape=(mdp.info.action_space.n,),
                               n_actions=mdp.info.action_space.n)
    algorithm_params = {'learning_rate': learning_rate,
                        'lambda_coeff': .9}
    agent = TrueOnlineSARSALambda(pi, mdp.info,
                                  approximator_params=approximator_params,
                                  features=features, **algorithm_params)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=2000, n_steps_per_fit=1, quiet=True)

    # Test
    test_epsilon = Parameter(0.)
    agent.policy.set_epsilon(test_epsilon)

    initial_states = np.array([[0., 0.], [.1, .1]])
    dataset = core.evaluate(initial_states=initial_states, quiet=True)

    return np.mean(compute_J(dataset, 1.))
Example #29
def experiment():
    np.random.seed()

    # MDP
    mdp = CarOnHill()

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Approximator
    approximator_params = dict(input_shape=mdp.info.observation_space.shape,
                               n_actions=mdp.info.action_space.n,
                               n_estimators=50,
                               min_samples_split=5,
                               min_samples_leaf=2)
    approximator = ExtraTreesRegressor

    # Agent
    algorithm_params = dict(n_iterations=20)
    agent = FQI(approximator, pi, mdp.info,
                approximator_params=approximator_params, **algorithm_params)

    # Algorithm
    core = Core(agent, mdp)

    # Render
    core.evaluate(n_episodes=1, render=True)

    # Train
    core.learn(n_episodes=1000, n_episodes_per_fit=1000)

    # Test
    test_epsilon = Parameter(0.)
    agent.policy.set_epsilon(test_epsilon)

    initial_states = np.zeros((289, 2))
    cont = 0
    for i in range(-8, 9):
        for j in range(-8, 9):
            initial_states[cont, :] = [0.125 * i, 0.375 * j]
            cont += 1

    dataset = core.evaluate(initial_states=initial_states)

    # Render
    core.evaluate(n_episodes=3, render=True)

    return np.mean(compute_J(dataset, mdp.info.gamma))
Example #30
def experiment():
    np.random.seed()

    # MDP
    mdp = CarOnHill()

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Approximator
    approximator_params = dict(input_shape=mdp.info.observation_space.shape,
                               n_actions=mdp.info.action_space.n,
                               n_estimators=50,
                               min_samples_split=5,
                               min_samples_leaf=2)
    approximator = ExtraTreesRegressor

    # Agent
    algorithm_params = dict(n_iterations=20)
    agent = FQI(approximator, pi, mdp.info,
                approximator_params=approximator_params, **algorithm_params)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=1000, n_episodes_per_fit=1000)

    # Test
    test_epsilon = Parameter(0.)
    agent.policy.set_epsilon(test_epsilon)

    initial_states = np.zeros((289, 2))
    cont = 0
    for i in range(-8, 9):
        for j in range(-8, 9):
            initial_states[cont, :] = [0.125 * i, 0.375 * j]
            cont += 1

    dataset = core.evaluate(initial_states=initial_states)

    return np.mean(compute_J(dataset, mdp.info.gamma))
Example #31
def experiment():
    np.random.seed()

    # MDP
    mdp = InvertedPendulumDiscrete()

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    basis = [PolynomialBasis()]

    s1 = np.array([-np.pi, 0, np.pi]) * .25
    s2 = np.array([-1, 0, 1])
    for i in s1:
        for j in s2:
            basis.append(GaussianRBF(np.array([i, j]), np.array([1.])))
    features = Features(basis_list=basis)

    fit_params = dict()
    approximator_params = dict(input_shape=(features.size, ),
                               output_shape=(mdp.info.action_space.n, ),
                               n_actions=mdp.info.action_space.n)
    agent = LSPI(pi,
                 mdp.info,
                 fit_params=fit_params,
                 approximator_params=approximator_params,
                 features=features)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=100, n_episodes_per_fit=100)

    # Test
    test_epsilon = Parameter(0.)
    agent.policy.set_epsilon(test_epsilon)

    dataset = core.evaluate(n_episodes=1, quiet=True)

    return np.mean(episodes_length(dataset))
Example #32
def test_double_q_learning():
    pi, mdp, _ = initialize()
    agent = DoubleQLearning(pi, mdp.info, Parameter(.5))

    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_q_0 = np.array([[2.6578125, 6.94757812, 3.73359375, 7.171875],
                         [2.25, 7.5, 3.0375, 3.375],
                         [3.0375, 5.4140625, 2.08265625, 8.75],
                         [0., 0., 0., 0.]])
    test_q_1 = np.array([[2.72109375, 4.5, 4.36640625, 6.609375],
                         [4.5, 9.375, 4.49296875, 4.5],
                         [1.0125, 5.0625, 5.625, 8.75],
                         [0., 0., 0., 0.]])

    assert np.allclose(agent.Q[0].table, test_q_0)
    assert np.allclose(agent.Q[1].table, test_q_1)
Example #33
def experiment(alg, n_epochs, n_iterations, ep_per_run):
    np.random.seed()

    # MDP
    mdp = LQR.generate(dimensions=1)

    approximator_params = dict(input_dim=mdp.info.observation_space.shape)
    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape,
                             params=approximator_params)

    sigma = Regressor(LinearApproximator,
                      input_shape=mdp.info.observation_space.shape,
                      output_shape=mdp.info.action_space.shape,
                      params=approximator_params)

    sigma_weights = 2 * np.ones(sigma.weights_size)
    sigma.set_weights(sigma_weights)

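    # Gaussian policy whose mean and standard deviation are both linear functions of the state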
    policy = StateStdGaussianPolicy(approximator, sigma)

    # Agent
    learning_rate = AdaptiveParameter(value=.01)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = alg(policy, mdp.info, **algorithm_params)

    # Train
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_run)
    print('policy parameters: ', policy.get_weights())
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))

    for i in range(n_epochs):
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset_eval = core.evaluate(n_episodes=ep_per_run)
        print('policy parameters: ', policy.get_weights())
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        print('J at iteration ' + str(i) + ': ' + str(np.mean(J)))
Example #34
def test_collect_parameter():
    np.random.seed(88)
    mdp = GridWorld(3, 3, (2, 2))

    eps = ExponentialParameter(value=1, exp=.5,
                               size=mdp.info.observation_space.size)
    pi = EpsGreedy(eps)
    alpha = Parameter(0.1)
    agent = SARSA(pi, mdp.info, alpha)

    callback_eps = CollectParameters(eps, 1)

    core = Core(agent, mdp, callbacks=[callback_eps])

    core.learn(n_steps=10, n_steps_per_fit=1, quiet=True)

    eps_test = np.array([1., 0.70710678, 0.70710678, 0.57735027, 0.57735027,
                         0.57735027, 0.57735027, 0.57735027, 0.57735027, 0.57735027])
    eps = callback_eps.get()

    assert np.allclose(eps, eps_test)
Example #35
def experiment():
    np.random.seed()

    # MDP
    mdp = generate_simple_chain(state_n=5, goal_states=[2], prob=.8, rew=1,
                                gamma=.9)

    # Policy
    epsilon = Parameter(value=.15)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = Parameter(value=.2)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = QLearning(pi, mdp.info, **algorithm_params)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=10000, n_steps_per_fit=1)
Example #36
def test_copdac_q():
    n_steps = 50
    mdp = InvertedPendulum(horizon=n_steps)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    # Agent
    n_tilings = 1
    alpha_theta = Parameter(5e-3 / n_tilings)
    alpha_omega = Parameter(0.5 / n_tilings)
    alpha_v = Parameter(0.5 / n_tilings)
    tilings = Tiles.generate(n_tilings, [2, 2],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high + 1e-3)

    phi = Features(tilings=tilings)

    input_shape = (phi.size,)

    mu = Regressor(LinearApproximator, input_shape=input_shape,
                   output_shape=mdp.info.action_space.shape)

    sigma = 1e-1 * np.eye(1)
    policy = GaussianPolicy(mu, sigma)

    agent = COPDAC_Q(policy, mu, mdp.info,
                     alpha_theta, alpha_omega, alpha_v,
                     value_function_features=phi,
                     policy_features=phi)

    # Train
    core = Core(agent, mdp)

    core.learn(n_episodes=2, n_episodes_per_fit=1)

    w = agent.policy.get_weights()
    w_test = np.array([0, -6.62180045e-7, 0, -4.23972882e-2])

    assert np.allclose(w, w_test)
Example #37
def flat_experiment(mdp, agent, n_epochs, n_iterations,
                    ep_per_iteration, ep_per_eval):
    np.random.seed()

    J_list = list()
    L_list = list()
    core = Core(agent, mdp)

    dataset = core.evaluate(n_episodes=ep_per_eval, quiet=True)
    J = compute_J(dataset, gamma=mdp.info.gamma)
    J_list.append(np.mean(J))
    L = episodes_length(dataset)
    L_list.append(np.mean(L))

    for n in range(n_epochs):
        core.learn(n_episodes=n_iterations * ep_per_iteration,
                   n_episodes_per_fit=ep_per_iteration, quiet=True)
        dataset = core.evaluate(n_episodes=ep_per_eval, quiet=True)
        J = compute_J(dataset, gamma=mdp.info.gamma)
        J_list.append(np.mean(J))
        L = episodes_length(dataset)
        L_list.append(np.mean(L))
        #print('J', n, ':', J_list[-1])

    return J_list, L_list
Example #38
def experiment(alpha):
    np.random.seed()

    # MDP
    mdp = Gym(name='MountainCar-v0', horizon=np.inf, gamma=1.)

    # Policy
    epsilon = Parameter(value=0.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
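    # Ten overlapping 10x10 tilings over the (position, velocity) state space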
    n_tilings = 10
    tilings = Tiles.generate(n_tilings, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high)
    features = Features(tilings=tilings)

    learning_rate = Parameter(alpha / n_tilings)

    approximator_params = dict(input_shape=(features.size,),
                               output_shape=(mdp.info.action_space.n,),
                               n_actions=mdp.info.action_space.n)
    algorithm_params = {'learning_rate': learning_rate,
                        'lambda_coeff': .9}

    agent = TrueOnlineSARSALambda(pi, mdp.info,
                                  approximator_params=approximator_params,
                                  features=features, **algorithm_params)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=40, n_steps_per_fit=1, render=False)
    dataset = core.evaluate(n_episodes=1, render=True)

    return np.mean(compute_J(dataset, 1.))
Example #39
def experiment(policy, value):
    np.random.seed()

    # MDP
    mdp = generate_taxi('grid.txt')

    # Policy
    pi = policy(Parameter(value=value))

    # Agent
    learning_rate = Parameter(value=.15)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = SARSA(pi, mdp.info, **algorithm_params)

    # Algorithm
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset]
    core = Core(agent, mdp, callbacks)

    # Train
    n_steps = 300000
    core.learn(n_steps=n_steps, n_steps_per_fit=1, quiet=True)

    return np.sum(np.array(collect_dataset.get())[:, 2]) / float(n_steps)
Example #40
from mushroom.utils.dataset import compute_J
from mushroom.utils.parameters import Parameter

mdp = CarOnHill()

# Policy
epsilon = Parameter(value=1.)
pi = EpsGreedy(epsilon=epsilon)

# Approximator
approximator_params = dict(input_shape=mdp.info.observation_space.shape,
                           n_actions=mdp.info.action_space.n,
                           n_estimators=50,
                           min_samples_split=5,
                           min_samples_leaf=2)
approximator = ExtraTreesRegressor

# Agent
agent = FQI(approximator, pi, mdp.info, n_iterations=20,
            approximator_params=approximator_params)

core = Core(agent, mdp)

core.learn(n_episodes=1000, n_episodes_per_fit=1000)

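# Evaluate the learned greedy policy from a single fixed initial state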
pi.set_epsilon(Parameter(0.))
initial_state = np.array([[-.5, 0.]])
dataset = core.evaluate(initial_states=initial_state)

print(compute_J(dataset, gamma=mdp.info.gamma))
Example #41
# Q-function approximator
n_tilings = 10
tilings = Tiles.generate(n_tilings, [10, 10],
                         mdp.info.observation_space.low,
                         mdp.info.observation_space.high)
features = Features(tilings=tilings)

approximator_params = dict(input_shape=(features.size,),
                           output_shape=(mdp.info.action_space.n,),
                           n_actions=mdp.info.action_space.n)

# Agent
learning_rate = Parameter(.1 / n_tilings)

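# SARSA(lambda) with eligibility traces over a linear Q-function approximator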
agent = SARSALambdaContinuous(LinearApproximator, pi, mdp.info,
                              approximator_params=approximator_params,
                              learning_rate=learning_rate,
                              lambda_coeff=.9, features=features)

# Algorithm
collect_dataset = CollectDataset()
callbacks = [collect_dataset]
core = Core(agent, mdp, callbacks=callbacks)

# Train
core.learn(n_episodes=100, n_steps_per_fit=1)

# Evaluate
core.evaluate(n_episodes=1, render=True)
Example #42
def experiment(n_epochs, n_episodes):
    np.random.seed()

    # MDP
    n_steps = 5000
    mdp = InvertedPendulum(horizon=n_steps)

    # Agent
    n_tilings = 11
    alpha_r = Parameter(.0001)
    alpha_theta = Parameter(.001 / n_tilings)
    alpha_v = Parameter(.1 / n_tilings)
    tilings = Tiles.generate(n_tilings-1, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high + 1e-3)

    phi = Features(tilings=tilings)

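    # Value-function features: the policy tilings plus one tiling covering the whole space (a bias term)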
    tilings_v = tilings + Tiles.generate(1, [1, 1],
                                         mdp.info.observation_space.low,
                                         mdp.info.observation_space.high + 1e-3)
    psi = Features(tilings=tilings_v)

    input_shape = (phi.size,)

    mu = Regressor(LinearApproximator, input_shape=input_shape,
                   output_shape=mdp.info.action_space.shape)

    std = Regressor(LinearApproximator, input_shape=input_shape,
                    output_shape=mdp.info.action_space.shape)

    std_0 = np.sqrt(1.)
    std.set_weights(np.log(std_0) / n_tilings * np.ones(std.weights_size))

    policy = StateLogStdGaussianPolicy(mu, std)

    agent = SAC_AVG(policy, mdp.info,
                    alpha_theta, alpha_v, alpha_r,
                    lambda_par=.5,
                    value_function_features=psi,
                    policy_features=phi)

    # Train
    dataset_callback = CollectDataset()
    display_callback = Display(agent._V, mu, std,
                               mdp.info.observation_space.low,
                               mdp.info.observation_space.high,
                               phi, psi)
    core = Core(agent, mdp, callbacks=[dataset_callback])

    for i in range(n_epochs):
        core.learn(n_episodes=n_episodes,
                   n_steps_per_fit=1, render=False)
        J = compute_J(dataset_callback.get(), gamma=1.)
        dataset_callback.clean()
        display_callback()
        print('Mean Reward at iteration ' + str(i) + ': ' +
              str(np.sum(J) / n_steps / n_episodes))

    print('Press a button to visualize the pendulum...')
    input()
    core.evaluate(n_steps=n_steps, render=True)
Example #43
def experiment():
    np.random.seed()

    # Argument parser
    parser = argparse.ArgumentParser()

    arg_game = parser.add_argument_group('Game')
    arg_game.add_argument("--name",
                          type=str,
                          default='BreakoutDeterministic-v4',
                          help='Gym ID of the Atari game.')
    arg_game.add_argument("--screen-width", type=int, default=84,
                          help='Width of the game screen.')
    arg_game.add_argument("--screen-height", type=int, default=84,
                          help='Height of the game screen.')

    arg_mem = parser.add_argument_group('Replay Memory')
    arg_mem.add_argument("--initial-replay-size", type=int, default=50000,
                         help='Initial size of the replay memory.')
    arg_mem.add_argument("--max-replay-size", type=int, default=500000,
                         help='Max size of the replay memory.')

    arg_net = parser.add_argument_group('Deep Q-Network')
    arg_net.add_argument("--optimizer",
                         choices=['adadelta',
                                  'adam',
                                  'rmsprop',
                                  'rmspropcentered'],
                         default='adam',
                         help='Name of the optimizer to use.')
    arg_net.add_argument("--learning-rate", type=float, default=.00025,
                         help='Learning rate value of the optimizer.')
    arg_net.add_argument("--decay", type=float, default=.95,
                         help='Discount factor for the history coming from the'
                              'gradient momentum in rmspropcentered and'
                              'rmsprop')
    arg_net.add_argument("--epsilon", type=float, default=.01,
                         help='Epsilon term used in rmspropcentered and'
                              'rmsprop')

    arg_alg = parser.add_argument_group('Algorithm')
    arg_alg.add_argument("--algorithm", choices=['dqn', 'ddqn', 'adqn'],
                         default='dqn',
                         help='Name of the algorithm. dqn is for standard'
                              'DQN, ddqn is for Double DQN and adqn is for'
                              'Averaged DQN.')
    arg_alg.add_argument("--n-approximators", type=int, default=1,
                         help="Number of approximators used in the ensemble for"
                              "Averaged DQN.")
    arg_alg.add_argument("--batch-size", type=int, default=32,
                         help='Batch size for each fit of the network.')
    arg_alg.add_argument("--history-length", type=int, default=4,
                         help='Number of frames composing a state.')
    arg_alg.add_argument("--target-update-frequency", type=int, default=10000,
                         help='Number of collected samples before each update'
                              'of the target network.')
    arg_alg.add_argument("--evaluation-frequency", type=int, default=250000,
                         help='Number of collected samples before each'
                              'evaluation. An epoch ends after this number of'
                              'steps')
    arg_alg.add_argument("--train-frequency", type=int, default=4,
                         help='Number of collected samples before each fit of'
                              'the neural network.')
    arg_alg.add_argument("--max-steps", type=int, default=50000000,
                         help='Total number of collected samples.')
    arg_alg.add_argument("--final-exploration-frame", type=int, default=1000000,
                         help='Number of collected samples until the exploration'
                              'rate stops decreasing.')
    arg_alg.add_argument("--initial-exploration-rate", type=float, default=1.,
                         help='Initial value of the exploration rate.')
    arg_alg.add_argument("--final-exploration-rate", type=float, default=.1,
                         help='Final value of the exploration rate. When it'
                              'reaches this values, it stays constant.')
    arg_alg.add_argument("--test-exploration-rate", type=float, default=.05,
                         help='Exploration rate used during evaluation.')
    arg_alg.add_argument("--test-samples", type=int, default=125000,
                         help='Number of collected samples for each'
                              'evaluation.')
    arg_alg.add_argument("--max-no-op-actions", type=int, default=8,
                         help='Maximum number of no-op actions performed at the'
                              'beginning of the episodes. The minimum number is'
                              'history_length. This number is reported to be 30'
                              'in the DQN Deepmind paper but, since they'
                              'consider the first 30 frames without frame'
                              'skipping and that the number of skipped frames'
                              'is generally 4, we set it to 8.')
    arg_alg.add_argument("--no-op-action-value", type=int, default=0,
                         help='Value of the no-op action.')

    arg_utils = parser.add_argument_group('Utils')
    arg_utils.add_argument('--device', type=int, default=None,
                           help='ID of the GPU device to use. If None, CPU is '
                                'used.')
    arg_utils.add_argument('--load-path', type=str,
                           help='Path of the model to be loaded.')
    arg_utils.add_argument('--save', action='store_true',
                           help='Flag specifying whether to save the model.')
    arg_utils.add_argument('--render', action='store_true',
                           help='Flag specifying whether to render the game.')
    arg_utils.add_argument('--quiet', action='store_true',
                           help='Flag specifying whether to hide the progress '
                                'bar.')
    arg_utils.add_argument('--debug', action='store_true',
                           help='Flag specifying whether the script has to be '
                                'run in debug mode.')

    args = parser.parse_args()

    scores = list()

    optimizer = dict()
    if args.optimizer == 'adam':
        optimizer['class'] = optim.Adam
        optimizer['params'] = dict(lr=args.learning_rate)
    elif args.optimizer == 'adadelta':
        optimizer['class'] = optim.Adadelta
        optimizer['params'] = dict(lr=args.learning_rate)
    elif args.optimizer == 'rmsprop':
        optimizer['class'] = optim.RMSprop
        optimizer['params'] = dict(lr=args.learning_rate,
                                   alpha=args.decay,
                                   eps=args.epsilon)
    elif args.optimizer == 'rmspropcentered':
        optimizer['class'] = optim.RMSprop
        optimizer['params'] = dict(lr=args.learning_rate,
                                   alpha=args.decay,
                                   eps=args.epsilon,
                                   centered=True)
    else:
        raise ValueError

    # Evaluation of the model provided by the user.
    if args.load_path:
        # MDP
        mdp = Atari(args.name, args.screen_width, args.screen_height,
                    ends_at_life=False)

        # Policy
        epsilon_test = Parameter(value=args.test_exploration_rate)
        pi = EpsGreedy(epsilon=epsilon_test)

        # Approximator
        input_shape = (args.screen_height, args.screen_width,
                       args.history_length)
        approximator_params = dict(
            network=Network,
            input_shape=input_shape,
            output_shape=(mdp.info.action_space.n,),
            n_actions=mdp.info.action_space.n,
            load_path=args.load_path,
            optimizer=optimizer,
            loss=F.smooth_l1_loss,
            device=args.device
        )

        approximator = PyTorchApproximator

        # Agent
        algorithm_params = dict(
            batch_size=1,
            train_frequency=1,
            target_update_frequency=1,
            initial_replay_size=0,
            max_replay_size=0,
            history_length=args.history_length,
            max_no_op_actions=args.max_no_op_actions,
            no_op_action_value=args.no_op_action_value,
            dtype=np.uint8
        )
        agent = DQN(approximator, pi, mdp.info,
                    approximator_params=approximator_params, **algorithm_params)

        # Algorithm
        core_test = Core(agent, mdp)

        # Evaluate model
        pi.set_epsilon(epsilon_test)
        dataset = core_test.evaluate(n_steps=args.test_samples,
                                     render=args.render,
                                     quiet=args.quiet)
        get_stats(dataset)
    else:
        # DQN learning run

        # Summary folder
        folder_name = './logs/atari_' + args.algorithm + '_' + args.name +\
            '_' + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
        pathlib.Path(folder_name).mkdir(parents=True)

        # Settings
        if args.debug:
            initial_replay_size = 50
            max_replay_size = 500
            train_frequency = 5
            target_update_frequency = 10
            test_samples = 20
            evaluation_frequency = 50
            max_steps = 1000
        else:
            initial_replay_size = args.initial_replay_size
            max_replay_size = args.max_replay_size
            train_frequency = args.train_frequency
            target_update_frequency = args.target_update_frequency
            test_samples = args.test_samples
            evaluation_frequency = args.evaluation_frequency
            max_steps = args.max_steps

        # MDP
        mdp = Atari(args.name, args.screen_width, args.screen_height,
                    ends_at_life=True)

        # Policy
        epsilon = LinearDecayParameter(value=args.initial_exploration_rate,
                                       min_value=args.final_exploration_rate,
                                       n=args.final_exploration_frame)
        epsilon_test = Parameter(value=args.test_exploration_rate)
        epsilon_random = Parameter(value=1)
        pi = EpsGreedy(epsilon=epsilon_random)

        # Approximator
        input_shape = (args.screen_height, args.screen_width,
                       args.history_length)
        approximator_params = dict(
            network=Network,
            input_shape=input_shape,
            output_shape=(mdp.info.action_space.n,),
            n_actions=mdp.info.action_space.n,
            folder_name=folder_name,
            optimizer=optimizer,
            loss=F.smooth_l1_loss,
            device=args.device
        )

        approximator = PyTorchApproximator

        # Agent
        algorithm_params = dict(
            batch_size=args.batch_size,
            n_approximators=args.n_approximators,
            initial_replay_size=initial_replay_size,
            max_replay_size=max_replay_size,
            history_length=args.history_length,
            target_update_frequency=target_update_frequency//train_frequency,
            max_no_op_actions=args.max_no_op_actions,
            no_op_action_value=args.no_op_action_value,
            dtype=np.uint8
        )

        if args.algorithm == 'dqn':
            agent = DQN(approximator, pi, mdp.info,
                        approximator_params=approximator_params,
                        **algorithm_params)
        elif args.algorithm == 'ddqn':
            agent = DoubleDQN(approximator, pi, mdp.info,
                              approximator_params=approximator_params,
                              **algorithm_params)
        elif args.algorithm == 'adqn':
            agent = AveragedDQN(approximator, pi, mdp.info,
                                approximator_params=approximator_params,
                                **algorithm_params)

        # Algorithm
        core = Core(agent, mdp)

        # RUN

        # Fill replay memory with random dataset
        print_epoch(0)
        core.learn(n_steps=initial_replay_size,
                   n_steps_per_fit=initial_replay_size, quiet=args.quiet)

        if args.save:
            agent.approximator.model.save()

        # Evaluate initial policy
        pi.set_epsilon(epsilon_test)
        if args.algorithm == 'ddqn':
            agent.policy.set_q(agent.target_approximator)
        mdp.set_episode_end(False)
        dataset = core.evaluate(n_steps=test_samples, render=args.render,
                                quiet=args.quiet)
        scores.append(get_stats(dataset))
        if args.algorithm == 'ddqn':
            agent.policy.set_q(agent.approximator)

        np.save(folder_name + '/scores.npy', scores)
        for n_epoch in range(1, max_steps // evaluation_frequency + 1):
            print_epoch(n_epoch)
            print('- Learning:')
            # learning step
            pi.set_epsilon(epsilon)
            mdp.set_episode_end(True)
            core.learn(n_steps=evaluation_frequency,
                       n_steps_per_fit=train_frequency, quiet=args.quiet)

            if args.save:
                agent.approximator.model.save()

            print('- Evaluation:')
            # evaluation step
            pi.set_epsilon(epsilon_test)
            if args.algorithm == 'ddqn':
                agent.policy.set_q(agent.target_approximator)
            mdp.set_episode_end(False)
            dataset = core.evaluate(n_steps=test_samples, render=args.render,
                                    quiet=args.quiet)
            scores.append(get_stats(dataset))
            if args.algorithm == 'ddqn':
                agent.policy.set_q(agent.approximator)

            np.save(folder_name + '/scores.npy', scores)

    return scores
Example #44
def experiment(n_epochs, n_steps, n_steps_test):
    np.random.seed()

    # MDP
    horizon = 1000
    gamma = 0.99
    gamma_eval = 1.
    mdp = Gym('Acrobot-v1', horizon, gamma)

    # Policy
    epsilon = LinearDecayParameter(value=1., min_value=.01, n=5000)
    epsilon_test = Parameter(value=0.)
    epsilon_random = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon_random)

    # Settings
    initial_replay_size = 500
    max_replay_size = 5000
    target_update_frequency = 100
    batch_size = 200
    n_features = 80
    train_frequency = 1

    # Approximator
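    # Prepend the history length (a single frame) to the observation shape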
    input_shape = (1,) + mdp.info.observation_space.shape
    approximator_params = dict(network=Network,
                               optimizer={'class': optim.Adam,
                                          'params': {'lr': .001}},
                               loss=F.smooth_l1_loss, n_features=n_features,
                               input_shape=input_shape,
                               output_shape=mdp.info.action_space.size,
                               n_actions=mdp.info.action_space.n)

    # Agent
    agent = DQN(PyTorchApproximator, pi, mdp.info,
                approximator_params=approximator_params, batch_size=batch_size,
                n_approximators=1, initial_replay_size=initial_replay_size,
                max_replay_size=max_replay_size, history_length=1,
                target_update_frequency=target_update_frequency,
                max_no_op_actions=0, no_op_action_value=0, dtype=np.float32)

    # Algorithm
    core = Core(agent, mdp)

    core.learn(n_steps=initial_replay_size, n_steps_per_fit=initial_replay_size)

    # RUN
    pi.set_epsilon(epsilon_test)
    dataset = core.evaluate(n_steps=n_steps_test, render=False)
    J = compute_J(dataset, gamma_eval)
    print('J: ', np.mean(J))

    for n in range(n_epochs):
        print('Epoch: ', n)
        pi.set_epsilon(epsilon)
        core.learn(n_steps=n_steps, n_steps_per_fit=train_frequency)
        pi.set_epsilon(epsilon_test)
        dataset = core.evaluate(n_steps=n_steps_test, render=False)
        J = compute_J(dataset, gamma_eval)
        print('J: ', np.mean(J))

    print('Press a button to visualize acrobot')
    input()
    core.evaluate(n_episodes=5, render=True)