Example #1
def test_tensor():
    low = np.array([0., -.5])
    high = np.array([1., .5])
    rbf = PyTorchGaussianRBF.generate([3, 3], low, high)
    features = Features(tensor_list=rbf)

    x = np.random.rand(10, 2) + [0., -.5]

    y = features(x)

    for i, x_i in enumerate(x):
        assert np.allclose(features(x_i), y[i])

    x_1 = x[:, 0].reshape(-1, 1)
    x_2 = x[:, 1].reshape(-1, 1)

    assert np.allclose(features(x_1, x_2), y)

    for i, x_i in enumerate(zip(x_1, x_2)):
        assert np.allclose(features(x_i[0], x_i[1]), y[i])
Example #2
def test_tiles():
    tilings = Tiles.generate(3, [3, 3],
                             np.array([0., -.5]),
                             np.array([1., .5]))
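    # 3 offset tilings, each a 3x3 grid over the box [0, 1] x [-0.5, 0.5]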
    features = Features(tilings=tilings)

    x = np.random.rand(10, 2) + [0., -.5]

    y = features(x)

    for i, x_i in enumerate(x):
        assert np.all(features(x_i) == y[i])

    x_1 = x[:, 0].reshape(-1, 1)
    x_2 = x[:, 1].reshape(-1, 1)

    assert np.all(features(x_1, x_2) == y)

    for i, x_i in enumerate(zip(x_1, x_2)):
        assert np.all(features(x_i[0], x_i[1]) == y[i])
Example #3
def test_basis():
    low = np.array([0., -.5])
    high = np.array([1., .5])
    rbf = GaussianRBF.generate([3, 3], low, high)
    features = Features(basis_list=rbf)

    x = np.random.rand(10, 2) + [0., -.5]

    y = features(x)

    for i, x_i in enumerate(x):
        assert np.all(features(x_i) == y[i])

    x_1 = x[:, 0].reshape(-1, 1)
    x_2 = x[:, 1].reshape(-1, 1)

    assert np.all(features(x_1, x_2) == y)

    for i, x_i in enumerate(zip(x_1, x_2)):
        assert np.all(features(x_i[0], x_i[1]) == y[i])
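Examples #1-#3 exercise the same Features interface with three different feature constructions (a Torch tensor basis, tile coding and plain Gaussian RBFs). The sketch below shows the bare pattern in isolation; the import paths are assumed to match the mushroom package layout used in the later snippets and may differ between library versions.

import numpy as np
from mushroom.features import Features
from mushroom.features.basis import GaussianRBF

low = np.array([0., -.5])
high = np.array([1., .5])

# 3x3 grid of Gaussian radial basis functions over the box [low, high]
rbf = GaussianRBF.generate([3, 3], low, high)
phi = Features(basis_list=rbf)

x = np.random.rand(10, 2) + [0., -.5]  # batch of 10 two-dimensional states
print(phi(x).shape)                    # expected (10, 9): one feature vector per state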
Example #4
def test_sac():
    tilings_v = tilings + Tiles.generate(
        1, [1, 1], mdp.info.observation_space.low,
        mdp.info.observation_space.high + 1e-3)
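    # the extra [1, 1] tiling is a single tile covering the whole state space,
    # i.e. a constant bias feature for the value-function critic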
    psi = Features(tilings=tilings_v)
    agent = StochasticAC(policy,
                         mdp.info,
                         alpha_theta,
                         alpha_v,
                         lambda_par=.5,
                         value_function_features=psi,
                         policy_features=phi)
    agent.fit(dataset)

    w_1 = .09370429
    w_2 = .28141735

    w = agent._V.get_weights()

    assert np.allclose(w_1, w[65])
    assert np.allclose(w_2, w[78])
Example #5
def experiment():
    np.random.seed()

    # MDP
    mdp = InvertedPendulum()

    # Policy
    epsilon = Parameter(value=0.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    rbfs = GaussianRBF.generate([10, 10], mdp.info.observation_space.low,
                                mdp.info.observation_space.high)
    features = Features(basis_list=rbfs)

    approximator_params = dict(input_shape=(features.size, ),
                               output_shape=(mdp.info.action_space.n, ),
                               n_actions=mdp.info.action_space.n)
    algorithm_params = dict(n_iterations=100)  # LSPI iteration budget (assumed value)
    fit_params = dict()
    agent_params = {
        'approximator_params': approximator_params,
        'algorithm_params': algorithm_params,
        'fit_params': fit_params
    }
    agent = LSPI(pi, mdp.info, agent_params, features)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=1000, n_episodes_per_fit=20)

    # Test
    test_epsilon = Parameter(0.)
    agent.policy.set_epsilon(test_epsilon)

    dataset = core.evaluate(n_episodes=20)

    return np.mean(compute_J(dataset, 1.))
Example #6
def test_copdac_q():
    n_steps = 50
    mdp = InvertedPendulum(horizon=n_steps)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    # Agent
    n_tilings = 1
    alpha_theta = Parameter(5e-3 / n_tilings)
    alpha_omega = Parameter(0.5 / n_tilings)
    alpha_v = Parameter(0.5 / n_tilings)
    tilings = Tiles.generate(n_tilings, [2, 2],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high + 1e-3)

    phi = Features(tilings=tilings)

    input_shape = (phi.size,)
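    # one tiling of 2x2 tiles over the 2-d state space, so phi.size == 4,
    # matching the four policy weights checked against w_test below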

    mu = Regressor(LinearApproximator, input_shape=input_shape,
                   output_shape=mdp.info.action_space.shape)

    sigma = 1e-1 * np.eye(1)
    policy = GaussianPolicy(mu, sigma)

    agent = COPDAC_Q(policy, mu, mdp.info,
                     alpha_theta, alpha_omega, alpha_v,
                     value_function_features=phi,
                     policy_features=phi)

    # Train
    core = Core(agent, mdp)

    core.learn(n_episodes=2, n_episodes_per_fit=1)

    w = agent.policy.get_weights()
    w_test = np.array([0, -6.62180045e-7, 0, -4.23972882e-2])

    assert np.allclose(w, w_test)
Example #7
def experiment(alpha):
    np.random.seed()

    # MDP
    mdp = Gym(name='MountainCar-v0', horizon=np.inf, gamma=1.)

    # Policy
    epsilon = Parameter(value=0.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    n_tilings = 10
    tilings = Tiles.generate(n_tilings, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high)
    features = Features(tilings=tilings)

    learning_rate = Parameter(alpha / n_tilings)

    approximator_params = dict(input_shape=(features.size, ),
                               output_shape=(mdp.info.action_space.n, ),
                               n_actions=mdp.info.action_space.n)
    algorithm_params = {'learning_rate': learning_rate, 'lambda_coeff': .9}

    agent = TrueOnlineSARSALambda(pi,
                                  mdp.info,
                                  approximator_params=approximator_params,
                                  features=features,
                                  **algorithm_params)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=40, n_steps_per_fit=1, render=False)
    dataset = core.evaluate(n_episodes=1, render=True)

    return np.mean(compute_J(dataset, 1.))
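The next example solves the same MountainCar task with TrueOnlineSARSALambda, but builds the agent in a different constructor style (a single agent_params dict instead of keyword arguments), presumably from an older version of the library.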
Example #8
def experiment(alpha):
    np.random.seed()

    # MDP
    mdp = Gym(name='MountainCar-v0', horizon=np.inf, gamma=1.)

    # Policy
    epsilon = Parameter(value=0.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = Parameter(alpha)
    tilings = Tiles.generate(10, [10, 10], mdp.info.observation_space.low,
                             mdp.info.observation_space.high)
    features = Features(tilings=tilings)

    approximator_params = dict(input_shape=(features.size, ),
                               output_shape=(mdp.info.action_space.n, ),
                               n_actions=mdp.info.action_space.n)
    algorithm_params = {'learning_rate': learning_rate, 'lambda': .9}
    fit_params = dict()
    agent_params = {
        'approximator_params': approximator_params,
        'algorithm_params': algorithm_params,
        'fit_params': fit_params
    }
    agent = TrueOnlineSARSALambda(pi, mdp.info, agent_params, features)

    # Algorithm
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset]
    core = Core(agent, mdp, callbacks=callbacks)

    # Train
    core.learn(n_episodes=20, n_steps_per_fit=1, render=False)

    dataset = collect_dataset.get()
    return np.mean(compute_J(dataset, 1.))
Example #9
def test_sac_avg():
    alpha_r = Parameter(.0001)
    tilings_v = tilings + Tiles.generate(
        1, [1, 1], mdp.info.observation_space.low,
        mdp.info.observation_space.high + 1e-3)
    psi = Features(tilings=tilings_v)
    agent = StochasticAC_AVG(policy,
                             mdp.info,
                             alpha_theta,
                             alpha_v,
                             alpha_r,
                             lambda_par=.5,
                             value_function_features=psi,
                             policy_features=phi)
    agent.fit(dataset)

    w_1 = .09645764
    w_2 = .28583057

    w = agent._V.get_weights()

    assert np.allclose(w_1, w[65])
    assert np.allclose(w_2, w[78])
Example #10
from mushroom.policy import DeterministicPolicy
from mushroom.utils.parameters import AdaptiveParameter

mdp = ShipSteering()

high = [150, 150, np.pi]
low = [0, 0, -np.pi]
n_tiles = [5, 5, 6]
low = np.array(low, dtype=float)
high = np.array(high, dtype=float)
n_tilings = 1

tilings = Tiles.generate(n_tilings=n_tilings, n_tiles=n_tiles, low=low,
                         high=high)

phi = Features(tilings=tilings)
input_shape = (phi.size,)

approximator = Regressor(LinearApproximator, input_shape=input_shape,
                         output_shape=mdp.info.action_space.shape)

policy = DeterministicPolicy(approximator)

mu = np.zeros(policy.weights_size)
sigma = 4e-1 * np.ones(policy.weights_size)
distribution_test = GaussianDiagonalDistribution(mu, sigma)
agent_test = RWR(distribution_test, policy, mdp.info, beta=1.)
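# RWR performs episodic search directly in policy-weight space, sampling the
# deterministic policy's weights from the diagonal Gaussian defined above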
core = Core(agent_test, mdp)

s = np.arange(10)
a = np.arange(10)
Example #11
from mushroom.policy import EpsGreedy
from mushroom.utils.callbacks import CollectDataset
from mushroom.utils.parameters import Parameter

# MDP
mdp = Gym(name='MountainCar-v0', horizon=np.inf, gamma=1.)

# Policy
epsilon = Parameter(value=0.)
pi = EpsGreedy(epsilon=epsilon)

# Q-function approximator
n_tilings = 10
tilings = Tiles.generate(n_tilings, [10, 10], mdp.info.observation_space.low,
                         mdp.info.observation_space.high)
features = Features(tilings=tilings)

approximator_params = dict(input_shape=(features.size, ),
                           output_shape=(mdp.info.action_space.n, ),
                           n_actions=mdp.info.action_space.n)

# Agent
learning_rate = Parameter(.1 / n_tilings)
algorithm_params = {'learning_rate': learning_rate, 'lambda': .9}
fit_params = dict()
agent_params = {
    'approximator_params': approximator_params,
    'algorithm_params': algorithm_params,
    'fit_params': fit_params
}
agent = SARSALambdaContinuous(LinearApproximator, pi, mdp.info, agent_params,
                              features)
Example #12
def learn(alg):
    n_steps = 50
    mdp = InvertedPendulum(horizon=n_steps)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    # Agent
    n_tilings = 2
    alpha_r = Parameter(.0001)
    alpha_theta = Parameter(.001 / n_tilings)
    alpha_v = Parameter(.1 / n_tilings)
    tilings = Tiles.generate(n_tilings - 1, [1, 1],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high + 1e-3)

    phi = Features(tilings=tilings)

    tilings_v = tilings + Tiles.generate(
        1, [1, 1], mdp.info.observation_space.low,
        mdp.info.observation_space.high + 1e-3)
    psi = Features(tilings=tilings_v)

    input_shape = (phi.size, )

    mu = Regressor(LinearApproximator,
                   input_shape=input_shape,
                   output_shape=mdp.info.action_space.shape)

    std = Regressor(LinearApproximator,
                    input_shape=input_shape,
                    output_shape=mdp.info.action_space.shape)

    std_0 = np.sqrt(1.)
    std.set_weights(np.log(std_0) / n_tilings * np.ones(std.weights_size))

    policy = StateLogStdGaussianPolicy(mu, std)
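    # Gaussian policy whose mean and log-standard deviation are both
    # state-dependent (linear in the tile features phi)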

    if alg is StochasticAC:
        agent = alg(policy,
                    mdp.info,
                    alpha_theta,
                    alpha_v,
                    lambda_par=.5,
                    value_function_features=psi,
                    policy_features=phi)
    elif alg is StochasticAC_AVG:
        agent = alg(policy,
                    mdp.info,
                    alpha_theta,
                    alpha_v,
                    alpha_r,
                    lambda_par=.5,
                    value_function_features=psi,
                    policy_features=phi)

    core = Core(agent, mdp)

    core.learn(n_episodes=2, n_episodes_per_fit=1)

    return policy