Code Example #1
def test_basis_and_tensors():
    low = np.array([0., -.5])
    high = np.array([1., .5])
    basis_rbf = GaussianRBF.generate([3, 3], low, high)
    tensor_rbf = GaussianRBFTensor.generate([3, 3], low, high)
    features_1 = Features(tensor_list=tensor_rbf)
    features_2 = Features(basis_list=basis_rbf)

    x = np.random.rand(10, 2) + [0., -.5]

    y_1 = features_1(x)
    y_2 = features_2(x)

    assert np.allclose(y_1, y_2)
Code Example #2
def test_tensor():
    low = np.array([0., -.5])
    high = np.array([1., .5])
    rbf = GaussianRBFTensor.generate([3, 3], low, high)
    rbf += RandomFourierBasis.generate(0.1, 6, 2)
    features = Features(tensor_list=rbf)

    x = np.random.rand(10, 2) + [0., -.5]

    y = features(x)

    assert y.shape == (10, 15)

    for i, x_i in enumerate(x):
        assert np.allclose(features(x_i), y[i])

    assert np.all(y[:, -1] == 1)

    x_1 = x[:, 0].reshape(-1, 1)
    x_2 = x[:, 1].reshape(-1, 1)

    assert np.allclose(features(x_1, x_2), y)

    for i, x_i in enumerate(zip(x_1, x_2)):
        assert np.allclose(features(x_i[0], x_i[1]), y[i])
Code Example #3
File: test_td.py Project: PuzeLiu/mushroom-rl
def test_true_online_sarsa_lambda():
    pi, _, mdp_continuous = initialize()
    mdp_continuous.seed(1)
    n_tilings = 1
    tilings = Tiles.generate(n_tilings, [2, 2],
                             mdp_continuous.info.observation_space.low,
                             mdp_continuous.info.observation_space.high)
    features = Features(tilings=tilings)

    approximator_params = dict(
        input_shape=(features.size, ),
        output_shape=(mdp_continuous.info.action_space.n, ),
        n_actions=mdp_continuous.info.action_space.n)
    agent = TrueOnlineSARSALambda(mdp_continuous.info,
                                  pi,
                                  Parameter(.1),
                                  .9,
                                  features=features,
                                  approximator_params=approximator_params)

    core = Core(agent, mdp_continuous)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_w = np.array([
        -17.30427303, 0., -13.54157504, 0., -16.82373134, 0., -10.29613337, 0.,
        -14.79470382, 0., -10.50654665, 0.
    ])

    assert np.allclose(agent.Q.get_weights(), test_w)
Code Example #4
File: test_td.py Project: PuzeLiu/mushroom-rl
def test_sarsa_lambda_continuous_linear():
    pi, _, mdp_continuous = initialize()
    mdp_continuous.seed(1)
    n_tilings = 1
    tilings = Tiles.generate(n_tilings, [2, 2],
                             mdp_continuous.info.observation_space.low,
                             mdp_continuous.info.observation_space.high)
    features = Features(tilings=tilings)

    approximator_params = dict(
        input_shape=(features.size, ),
        output_shape=(mdp_continuous.info.action_space.n, ),
        n_actions=mdp_continuous.info.action_space.n)
    agent = SARSALambdaContinuous(mdp_continuous.info,
                                  pi,
                                  LinearApproximator,
                                  Parameter(.1),
                                  .9,
                                  features=features,
                                  approximator_params=approximator_params)

    core = Core(agent, mdp_continuous)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_w = np.array([
        -16.62627886, 0., -13.03033079, 0., -15.93237930, 0., -9.72299176, 0.,
        -13.78884631, 0., -9.92157645, 0.
    ])

    assert np.allclose(agent.Q.get_weights(), test_w)
Code Example #5
def learn_lspi():
    np.random.seed(1)

    # MDP
    mdp = CartPole()

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    basis = [PolynomialBasis()]
    features = Features(basis_list=basis)

    fit_params = dict()
    approximator_params = dict(input_shape=(features.size, ),
                               output_shape=(mdp.info.action_space.n, ),
                               n_actions=mdp.info.action_space.n)
    agent = LSPI(mdp.info,
                 pi,
                 approximator_params=approximator_params,
                 fit_params=fit_params,
                 features=features)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=10, n_episodes_per_fit=10)

    return agent
Code Example #6
File: test_td.py Project: yanxg/mushroom-rl
def test_sarsa_lambda_continuous_linear():
    pi, _, mdp_continuous = initialize()
    mdp_continuous.seed(1)
    n_tilings = 1
    tilings = Tiles.generate(n_tilings, [2, 2],
                             mdp_continuous.info.observation_space.low,
                             mdp_continuous.info.observation_space.high)
    features = Features(tilings=tilings)

    approximator_params = dict(
        input_shape=(features.size,),
        output_shape=(mdp_continuous.info.action_space.n,),
        n_actions=mdp_continuous.info.action_space.n
    )
    agent = SARSALambdaContinuous(mdp_continuous.info, pi, LinearApproximator,
                                  Parameter(.1), .9, features=features,
                                  approximator_params=approximator_params)

    core = Core(agent, mdp_continuous)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_w = np.array([-16.38428419, 0., -14.31250136, 0., -15.68571525, 0.,
                       -10.15663821, 0., -15.0545445, 0., -8.3683605, 0.])

    assert np.allclose(agent.Q.get_weights(), test_w)
Code Example #7
def test_tiles_voronoi():
    tilings_list = [
        VoronoiTiles.generate(3,
                              10,
                              low=np.array([0., -.5]),
                              high=np.array([1., .5])),
        VoronoiTiles.generate(3,
                              10,
                              mu=np.array([.5, -.5]),
                              sigma=np.array([.2, .6]))
    ]

    for tilings in tilings_list:
        features = Features(tilings=tilings)

        x = np.random.rand(10, 2) + [0., -.5]

        y = features(x)

        for i, x_i in enumerate(x):
            assert np.all(features(x_i) == y[i])

        x_1 = x[:, 0].reshape(-1, 1)
        x_2 = x[:, 1].reshape(-1, 1)

        assert np.all(features(x_1, x_2) == y)

        for i, x_i in enumerate(zip(x_1, x_2)):
            assert np.all(features(x_i[0], x_i[1]) == y[i])
            assert features.size == y[i].size
Code Example #8
File: test_td.py Project: yanxg/mushroom-rl
def test_sarsa_lambda_continuous_nn():
    pi, _, mdp_continuous = initialize()
    mdp_continuous.seed(1)

    features = Features(
        n_outputs=mdp_continuous.info.observation_space.shape[0]
    )

    approximator_params = dict(
        input_shape=(features.size,),
        output_shape=(mdp_continuous.info.action_space.n,),
        network=Network,
        n_actions=mdp_continuous.info.action_space.n
    )
    agent = SARSALambdaContinuous(mdp_continuous.info, pi, TorchApproximator,
                                  Parameter(.1), .9, features=features,
                                  approximator_params=approximator_params)

    core = Core(agent, mdp_continuous)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_w = np.array([-0.18968964, 0.4296857, 0.52967095, 0.5674884,
                       -0.12784956, -0.10572472, -0.14546978, -0.67001086,
                       -0.93925357])

    assert np.allclose(agent.Q.get_weights(), test_w)
Code Example #9
File: test_td.py Project: k4ntz/mushroom-rl
def test_true_online_sarsa_lambda_save(tmpdir):
    agent_path = tmpdir / 'agent_{}'.format(datetime.now().strftime("%H%M%S%f"))

    pi, _, mdp_continuous = initialize()
    mdp_continuous.seed(1)
    n_tilings = 1
    tilings = Tiles.generate(n_tilings, [2, 2],
                             mdp_continuous.info.observation_space.low,
                             mdp_continuous.info.observation_space.high)
    features = Features(tilings=tilings)

    approximator_params = dict(
        input_shape=(features.size,),
        output_shape=(mdp_continuous.info.action_space.n,),
        n_actions=mdp_continuous.info.action_space.n
    )
    agent_save = TrueOnlineSARSALambda(mdp_continuous.info, pi,
                                  Parameter(.1), .9, features=features,
                                  approximator_params=approximator_params)

    core = Core(agent_save, mdp_continuous)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    agent_save.save(agent_path)
    agent_load = Agent.load(agent_path)

    for att, method in vars(agent_save).items():
        save_attr = getattr(agent_save, att)
        load_attr = getattr(agent_load, att)

        tu.assert_eq(save_attr, load_attr)
Code Example #10
File: test_td.py Project: k4ntz/mushroom-rl
def test_sarsa_lambda_continuous_nn_save(tmpdir):
    agent_path = tmpdir / 'agent_{}'.format(datetime.now().strftime("%H%M%S%f"))

    pi, _, mdp_continuous = initialize()
    mdp_continuous.seed(1)

    features = Features(
        n_outputs=mdp_continuous.info.observation_space.shape[0]
    )

    approximator_params = dict(
        input_shape=(features.size,),
        output_shape=(mdp_continuous.info.action_space.n,),
        network=Network,
        n_actions=mdp_continuous.info.action_space.n
    )
    agent_save = SARSALambdaContinuous(mdp_continuous.info, pi, TorchApproximator,
                                  Parameter(.1), .9, features=features,
                                  approximator_params=approximator_params)

    core = Core(agent_save, mdp_continuous)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    agent_save.save(agent_path)
    agent_load = Agent.load(agent_path)

    for att, method in vars(agent_save).items():
        save_attr = getattr(agent_save, att)
        load_attr = getattr(agent_load, att)

        tu.assert_eq(save_attr, load_attr)
Code Example #11
def test_lspi():
    np.random.seed(1)

    # MDP
    mdp = CartPole()

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    basis = [PolynomialBasis()]
    features = Features(basis_list=basis)

    fit_params = dict()
    approximator_params = dict(input_shape=(features.size, ),
                               output_shape=(mdp.info.action_space.n, ),
                               n_actions=mdp.info.action_space.n)
    agent = LSPI(mdp.info,
                 pi,
                 approximator_params=approximator_params,
                 fit_params=fit_params,
                 features=features)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=10, n_episodes_per_fit=10)

    w = agent.approximator.get_weights()
    w_test = np.array([-1.00749128, -1.13444655, -0.96620322])

    assert np.allclose(w, w_test)
Code Example #12
File: test_td.py Project: yanxg/mushroom-rl
def test_true_online_sarsa_lambda():
    pi, _, mdp_continuous = initialize()
    mdp_continuous.seed(1)
    n_tilings = 1
    tilings = Tiles.generate(n_tilings, [2, 2],
                             mdp_continuous.info.observation_space.low,
                             mdp_continuous.info.observation_space.high)
    features = Features(tilings=tilings)

    approximator_params = dict(
        input_shape=(features.size,),
        output_shape=(mdp_continuous.info.action_space.n,),
        n_actions=mdp_continuous.info.action_space.n
    )
    agent = TrueOnlineSARSALambda(mdp_continuous.info, pi,
                                  Parameter(.1), .9, features=features,
                                  approximator_params=approximator_params)

    core = Core(agent, mdp_continuous)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_w = np.array([-17.27410736, 0., -15.04386343, 0., -16.6551805, 0.,
                       -11.31383707, 0., -16.11782002, 0., -9.6927357, 0.])

    assert np.allclose(agent.Q.get_weights(), test_w)
Code Example #13
def experiment(n_epochs, n_episodes):
    np.random.seed()

    logger = Logger(COPDAC_Q.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + COPDAC_Q.__name__)

    # MDP
    n_steps = 5000
    mdp = InvertedPendulum(horizon=n_steps)

    # Agent
    n_tilings = 10
    alpha_theta = Parameter(5e-3 / n_tilings)
    alpha_omega = Parameter(0.5 / n_tilings)
    alpha_v = Parameter(0.5 / n_tilings)
    tilings = Tiles.generate(n_tilings, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high + 1e-3)

    phi = Features(tilings=tilings)

    input_shape = (phi.size, )

    mu = Regressor(LinearApproximator,
                   input_shape=input_shape,
                   output_shape=mdp.info.action_space.shape)

    sigma = 1e-1 * np.eye(1)
    policy = GaussianPolicy(mu, sigma)

    agent = COPDAC_Q(mdp.info,
                     policy,
                     mu,
                     alpha_theta,
                     alpha_omega,
                     alpha_v,
                     value_function_features=phi,
                     policy_features=phi)

    # Train
    dataset_callback = CollectDataset()
    visualization_callback = Display(agent._V, mu,
                                     mdp.info.observation_space.low,
                                     mdp.info.observation_space.high, phi, phi)
    core = Core(agent, mdp, callbacks_fit=[dataset_callback])

    for i in trange(n_epochs, leave=False):
        core.learn(n_episodes=n_episodes, n_steps_per_fit=1, render=False)
        J = compute_J(dataset_callback.get(), gamma=1.0)
        dataset_callback.clean()
        visualization_callback()
        logger.epoch_info(i + 1, R_mean=np.sum(J) / n_steps / n_episodes)

    logger.info('Press a button to visualize the pendulum...')
    input()
    sigma = 1e-8 * np.eye(1)
    policy.set_sigma(sigma)
    core.evaluate(n_steps=n_steps, render=True)
Code Example #14
File: test_features.py Project: PuzeLiu/mushroom-rl
def test_random_fourier():
    np.random.seed(1)
    tensor_list = RandomFourierBasis.generate(nu=2.5, n_output=10, input_size=2)

    x = np.array([0.1, 1.4])

    features = Features(tensor_list=tensor_list)

    res = np.array([0.33279073,  0.84292346,  0.03078904, -0.98234737,  0.8367746,
                    0.476112,  0.4179958,  0.99205977,  0.5216869,  1.])

    assert np.allclose(features(x), res)
    assert features.size == res.size
Code Example #15
File: cmac.py Project: ml-research/mushroom-rl
    def __init__(self, tilings, weights=None, output_shape=(1, ), **kwargs):
        """
        Constructor.

        Args:
            tilings (list): list of tilings used to discretize the input
                space; the input shape of the model is computed from them;
            weights (np.ndarray, None): array of weights to initialize the
                weights of the approximator;
            output_shape (tuple, (1,)): the shape of the output of the
                model;
            **kwargs: other parameters of the approximator.

        """
        self._phi = Features(tilings=tilings)
        self._n = len(tilings)

        super().__init__(weights=weights,
                         input_shape=(self._phi.size, ),
                         output_shape=output_shape)

        self._add_save_attr(_phi='pickle', _n='primitive')
Code Example #16
def experiment():
    np.random.seed()

    # MDP
    mdp = CartPole()

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    basis = [PolynomialBasis()]

    s1 = np.array([-np.pi, 0, np.pi]) * .25
    s2 = np.array([-1, 0, 1])
    for i in s1:
        for j in s2:
            basis.append(GaussianRBF(np.array([i, j]), np.array([1.])))
    features = Features(basis_list=basis)

    fit_params = dict()
    approximator_params = dict(input_shape=(features.size, ),
                               output_shape=(mdp.info.action_space.n, ),
                               n_actions=mdp.info.action_space.n)
    agent = LSPI(mdp.info,
                 pi,
                 approximator_params=approximator_params,
                 fit_params=fit_params,
                 features=features)

    # Algorithm
    core = Core(agent, mdp)
    core.evaluate(n_episodes=3, render=True)

    # Train
    core.learn(n_episodes=100, n_episodes_per_fit=100)

    # Test
    test_epsilon = Parameter(0.)
    agent.policy.set_epsilon(test_epsilon)

    dataset = core.evaluate(n_episodes=1, quiet=True)

    core.evaluate(n_steps=100, render=True)

    return np.mean(episodes_length(dataset))
Code Example #17
def test_copdac_q():
    n_steps = 50
    mdp = InvertedPendulum(horizon=n_steps)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    # Agent
    n_tilings = 1
    alpha_theta = Parameter(5e-3 / n_tilings)
    alpha_omega = Parameter(0.5 / n_tilings)
    alpha_v = Parameter(0.5 / n_tilings)
    tilings = Tiles.generate(n_tilings, [2, 2], mdp.info.observation_space.low,
                             mdp.info.observation_space.high + 1e-3)

    phi = Features(tilings=tilings)

    input_shape = (phi.size, )

    mu = Regressor(LinearApproximator,
                   input_shape=input_shape,
                   output_shape=mdp.info.action_space.shape)

    sigma = 1e-1 * np.eye(1)
    policy = GaussianPolicy(mu, sigma)

    agent = COPDAC_Q(mdp.info,
                     policy,
                     mu,
                     alpha_theta,
                     alpha_omega,
                     alpha_v,
                     value_function_features=phi,
                     policy_features=phi)

    # Train
    core = Core(agent, mdp)

    core.learn(n_episodes=2, n_episodes_per_fit=1)

    w = agent.policy.get_weights()
    w_test = np.array([0, -6.62180045e-7, 0, -4.23972882e-2])

    assert np.allclose(w, w_test)
Code Example #18
def test_fourier():
    low = np.array([-1.0, 0.5])
    high = np.array([1.0, 2.5])
    basis_list = FourierBasis.generate(low, high, 5)

    features = Features(basis_list=basis_list)

    x = np.array([0.1, 1.4])

    res = np.array([
        1., -0.15643447, -0.95105652, 0.4539905, 0.80901699, -0.70710678,
        0.15643447, -1., 0.15643447, 0.95105652, -0.4539905, -0.80901699,
        -0.95105652, -0.15643447, 1., -0.15643447, -0.95105652, 0.4539905,
        -0.4539905, 0.95105652, 0.15643447, -1., 0.15643447, 0.95105652,
        0.80901699, 0.4539905, -0.95105652, -0.15643447, 1., -0.15643447,
        0.70710678, -0.80901699, -0.4539905, 0.95105652, 0.15643447, -1.
    ])

    assert np.allclose(features(x), res)
Code Example #19
def test_tiles():
    tilings = Tiles.generate(3, [3, 3], np.array([0., -.5]),
                             np.array([1., .5]))
    features = Features(tilings=tilings)

    x = np.random.rand(10, 2) + [0., -.5]

    y = features(x)

    for i, x_i in enumerate(x):
        assert np.all(features(x_i) == y[i])

    x_1 = x[:, 0].reshape(-1, 1)
    x_2 = x[:, 1].reshape(-1, 1)

    assert np.all(features(x_1, x_2) == y)

    for i, x_i in enumerate(zip(x_1, x_2)):
        assert np.all(features(x_i[0], x_i[1]) == y[i])
Code Example #20
def experiment(alpha):
    np.random.seed()

    # MDP
    mdp = Gym(name='Acrobot-v1', horizon=np.inf, gamma=1.)

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    n_tilings = 10
    tilings = Tiles.generate(n_tilings, [10, 10, 10, 10, 10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high)
    features = Features(tilings=tilings)

    learning_rate = Parameter(alpha / n_tilings)

    approximator_params = dict(input_shape=(features.size, ),
                               output_shape=(mdp.info.action_space.n, ),
                               n_actions=mdp.info.action_space.n)
    algorithm_params = {'learning_rate': learning_rate, 'lambda_coeff': .9}

    agent = TrueOnlineSARSALambda(mdp.info,
                                  pi,
                                  approximator_params=approximator_params,
                                  features=features,
                                  **algorithm_params)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=10, n_steps_per_fit=1, render=True)
    dataset = core.evaluate(n_episodes=1, render=False)
    print(episodes_length(dataset))

    return np.mean(compute_J(dataset, .96))
Code Example #21
def test_tensor():
    low = np.array([0., -.5])
    high = np.array([1., .5])
    rbf = PyTorchGaussianRBF.generate([3, 3], low, high)
    features = Features(tensor_list=rbf)

    x = np.random.rand(10, 2) + [0., -.5]

    y = features(x)

    for i, x_i in enumerate(x):
        assert np.allclose(features(x_i), y[i])

    x_1 = x[:, 0].reshape(-1, 1)
    x_2 = x[:, 1].reshape(-1, 1)

    assert np.allclose(features(x_1, x_2), y)

    for i, x_i in enumerate(zip(x_1, x_2)):
        assert np.allclose(features(x_i[0], x_i[1]), y[i])
Code Example #22
def test_basis():
    low = np.array([0., -.5])
    high = np.array([1., .5])
    rbf = GaussianRBF.generate([3, 3], low, high)
    features = Features(basis_list=rbf)

    x = np.random.rand(10, 2) + [0., -.5]

    y = features(x)

    for i, x_i in enumerate(x):
        assert np.all(features(x_i) == y[i])

    x_1 = x[:, 0].reshape(-1, 1)
    x_2 = x[:, 1].reshape(-1, 1)

    assert np.all(features(x_1, x_2) == y)

    for i, x_i in enumerate(zip(x_1, x_2)):
        assert np.all(features(x_i[0], x_i[1]) == y[i])
Code Example #23
def test_sarsa_lambda_continuous_linear_save():
    pi, _, mdp_continuous = initialize()
    mdp_continuous.seed(1)
    n_tilings = 1
    tilings = Tiles.generate(n_tilings, [2, 2],
                             mdp_continuous.info.observation_space.low,
                             mdp_continuous.info.observation_space.high)
    features = Features(tilings=tilings)

    approximator_params = dict(
        input_shape=(features.size, ),
        output_shape=(mdp_continuous.info.action_space.n, ),
        n_actions=mdp_continuous.info.action_space.n)
    agent_save = SARSALambdaContinuous(mdp_continuous.info,
                                       pi,
                                       LinearApproximator,
                                       Parameter(.1),
                                       .9,
                                       features=features,
                                       approximator_params=approximator_params)

    core = Core(agent_save, mdp_continuous)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    agent_path = './agentdir{}/'.format(datetime.now().strftime("%H%M%S%f"))

    agent_save.save(agent_path)
    agent_load = Agent.load(agent_path)

    shutil.rmtree(agent_path)

    for att, method in agent_save.__dict__.items():
        save_attr = getattr(agent_save, att)
        load_attr = getattr(agent_load, att)

        tu.assert_eq(save_attr, load_attr)
Code Example #24
def experiment(n_epochs, n_episodes):
    np.random.seed()

    # MDP
    n_steps = 5000
    mdp = InvertedPendulum(horizon=n_steps)

    # Agent
    n_tilings = 11
    alpha_r = Parameter(.0001)
    alpha_theta = Parameter(.001 / n_tilings)
    alpha_v = Parameter(.1 / n_tilings)
    tilings = Tiles.generate(n_tilings-1, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high + 1e-3)

    phi = Features(tilings=tilings)

    tilings_v = tilings + Tiles.generate(1, [1, 1],
                                         mdp.info.observation_space.low,
                                         mdp.info.observation_space.high + 1e-3)
    psi = Features(tilings=tilings_v)

    input_shape = (phi.size,)

    mu = Regressor(LinearApproximator, input_shape=input_shape,
                   output_shape=mdp.info.action_space.shape)

    std = Regressor(LinearApproximator, input_shape=input_shape,
                    output_shape=mdp.info.action_space.shape)

    std_0 = np.sqrt(1.)
    std.set_weights(np.log(std_0) / n_tilings * np.ones(std.weights_size))

    policy = StateLogStdGaussianPolicy(mu, std)

    agent = StochasticAC_AVG(mdp.info, policy,
                             alpha_theta, alpha_v, alpha_r,
                             lambda_par=.5,
                             value_function_features=psi,
                             policy_features=phi)

    # Train
    dataset_callback = CollectDataset()
    display_callback = Display(agent._V, mu, std,
                               mdp.info.observation_space.low,
                               mdp.info.observation_space.high,
                               phi, psi)
    core = Core(agent, mdp, callbacks_fit=[dataset_callback])

    for i in range(n_epochs):
        core.learn(n_episodes=n_episodes,
                   n_steps_per_fit=1, render=False)
        J = compute_J(dataset_callback.get(), gamma=1.)
        dataset_callback.clean()
        display_callback()
        print('Mean Reward at iteration ' + str(i) + ': ' +
              str(np.sum(J) / n_steps / n_episodes))

    print('Press a button to visualize the pendulum...')
    input()
    core.evaluate(n_steps=n_steps, render=True)
Code Example #25
File: cmac.py Project: ml-research/mushroom-rl
class CMAC(LinearApproximator):
    """
    This class implements a Cerebellar Model Arithmetic Computer.

    """
    def __init__(self, tilings, weights=None, output_shape=(1, ), **kwargs):
        """
        Constructor.

        Args:
            tilings (list): list of tilings used to discretize the input
                space; the input shape of the model is computed from them;
            weights (np.ndarray, None): array of weights to initialize the
                weights of the approximator;
            output_shape (tuple, (1,)): the shape of the output of the
                model;
            **kwargs: other parameters of the approximator.

        """
        self._phi = Features(tilings=tilings)
        self._n = len(tilings)

        super().__init__(weights=weights,
                         input_shape=(self._phi.size, ),
                         output_shape=output_shape)

        self._add_save_attr(_phi='pickle', _n='primitive')

    def fit(self, x, y, alpha=1.0, **kwargs):
        """
        Fit the model.

        Args:
            x (np.ndarray): input;
            y (np.ndarray): target;
            alpha (float): learning rate;
            **kwargs: other parameters used by the fit method of the
                regressor.

        """
        y_hat = self.predict(x)
        delta_y = np.atleast_2d(y - y_hat)
        if self._w.shape[0] > 1:
            delta_y = delta_y.T

        phi = np.atleast_2d(self._phi(x))
        sum_phi = np.sum(phi, axis=0)
        n = np.sum(phi, axis=1, keepdims=True)
        phi_n = phi / n
        sum_phi[sum_phi == 0] = 1.

        delta_w = delta_y @ phi_n / sum_phi
        self._w += alpha * delta_w

    def predict(self, x, **predict_params):
        """
        Predict.

        Args:
            x (np.ndarray): input;
            **predict_params: other parameters used by the predict method
                of the regressor.

        Returns:
            The predictions of the model.

        """
        prediction = np.ones((x.shape[0], self._w.shape[0]))
        indexes = self._phi.compute_indexes(x)

        if x.shape[0] == 1:
            indexes = list([indexes])

        for i, idx in enumerate(indexes):
            prediction[i] = np.sum(self._w[:, idx], axis=-1)

        return prediction.squeeze()

    def diff(self, state, action=None):
        """
        Compute the derivative of the output w.r.t. ``state``, and ``action``
        if provided.

        Args:
            state (np.ndarray): the state;
            action (np.ndarray, None): the action.

        Returns:
            The derivative of the output w.r.t. ``state``, and ``action``
            if provided.

        """

        phi = self._phi(state)
        return super().diff(phi, action)
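A minimal usage sketch for the CMAC class above; the tiling layout, the training data, and the learning rate are illustrative, and the import path for CMAC assumes the standard mushroom-rl package layout:

import numpy as np

from mushroom_rl.approximators.parametric import CMAC
from mushroom_rl.features.tiles import Tiles

# Three tilings of 3x3 tiles over the box [0., 1.] x [-.5, .5].
tilings = Tiles.generate(3, [3, 3], np.array([0., -.5]), np.array([1., .5]))
approximator = CMAC(tilings)

# Fit a scalar target on a random batch; each call to fit applies one
# normalized CMAC update with learning rate alpha.
x = np.random.rand(10, 2) + [0., -.5]
y = np.sin(2 * np.pi * x[:, 0])
for _ in range(20):
    approximator.fit(x, y, alpha=.5)

# Inspect the residuals after the fit loop.
print(approximator.predict(x) - y)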
Code Example #26
import numpy as np

from mushroom_rl.algorithms.value import SARSALambdaContinuous
from mushroom_rl.approximators.parametric import LinearApproximator
from mushroom_rl.core import Core
from mushroom_rl.environments import Gym
from mushroom_rl.features import Features
from mushroom_rl.features.tiles import Tiles
from mushroom_rl.policy import EpsGreedy
from mushroom_rl.utils.callbacks import CollectDataset
from mushroom_rl.utils.parameters import Parameter

# MDP
mdp = Gym(name='MountainCar-v0', horizon=np.inf, gamma=1.)

# Policy
epsilon = Parameter(value=0.)
pi = EpsGreedy(epsilon=epsilon)

# Q-function approximator
n_tilings = 10
tilings = Tiles.generate(n_tilings, [10, 10], mdp.info.observation_space.low,
                         mdp.info.observation_space.high)
features = Features(tilings=tilings)

# Agent
learning_rate = Parameter(.1 / n_tilings)
approximator_params = dict(input_shape=(features.size, ),
                           output_shape=(mdp.info.action_space.n, ),
                           n_actions=mdp.info.action_space.n)
agent = SARSALambdaContinuous(mdp.info,
                              pi,
                              LinearApproximator,
                              approximator_params=approximator_params,
                              learning_rate=learning_rate,
                              lambda_coeff=.9,
                              features=features)

# Algorithm
collect_dataset = CollectDataset()
core = Core(agent, mdp, callbacks_fit=[collect_dataset])
Code Example #27
def learn(alg):
    n_steps = 50
    mdp = InvertedPendulum(horizon=n_steps)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    # Agent
    n_tilings = 2
    alpha_r = Parameter(.0001)
    alpha_theta = Parameter(.001 / n_tilings)
    alpha_v = Parameter(.1 / n_tilings)
    tilings = Tiles.generate(n_tilings - 1, [1, 1],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high + 1e-3)

    phi = Features(tilings=tilings)

    tilings_v = tilings + Tiles.generate(
        1, [1, 1], mdp.info.observation_space.low,
        mdp.info.observation_space.high + 1e-3)
    psi = Features(tilings=tilings_v)

    input_shape = (phi.size, )

    mu = Regressor(LinearApproximator,
                   input_shape=input_shape,
                   output_shape=mdp.info.action_space.shape)

    std = Regressor(LinearApproximator,
                    input_shape=input_shape,
                    output_shape=mdp.info.action_space.shape)

    std_0 = np.sqrt(1.)
    std.set_weights(np.log(std_0) / n_tilings * np.ones(std.weights_size))

    policy = StateLogStdGaussianPolicy(mu, std)

    if alg is StochasticAC:
        agent = alg(mdp.info,
                    policy,
                    alpha_theta,
                    alpha_v,
                    lambda_par=.5,
                    value_function_features=psi,
                    policy_features=phi)
    elif alg is StochasticAC_AVG:
        agent = alg(mdp.info,
                    policy,
                    alpha_theta,
                    alpha_v,
                    alpha_r,
                    lambda_par=.5,
                    value_function_features=psi,
                    policy_features=phi)

    core = Core(agent, mdp)

    core.learn(n_episodes=2, n_episodes_per_fit=1)

    return agent
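A short driver for learn() above, assuming StochasticAC and StochasticAC_AVG are importable from mushroom_rl.algorithms.actor_critic; the weight printout is illustrative only:

from mushroom_rl.algorithms.actor_critic import StochasticAC, StochasticAC_AVG

# Train both classic actor-critic variants for two episodes each and
# inspect the resulting policy weights.
for alg in (StochasticAC, StochasticAC_AVG):
    agent = learn(alg)
    print(alg.__name__, agent.policy.get_weights())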