Example #1
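These examples are collected without their import blocks. A minimal preamble for the approximator examples might look like the following sketch; the exact module paths are an assumption (they moved between the older mushroom package and mushroom_rl, where PyTorchApproximator was renamed TorchApproximator), and ExampleNet is a small torch.nn.Module defined alongside the original tests:

import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim

from mushroom_rl.approximators import Regressor
from mushroom_rl.approximators.parametric import (LinearApproximator,
                                                  TorchApproximator)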
def test_pytorch_approximator():
    np.random.seed(88)
    torch.manual_seed(88)

    noise = 1e-3**2

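    # Synthetic regression problem: the targets are a noisy linear map
    # of sin(a)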
    a = np.random.rand(1000, 4)

    k = np.random.rand(4, 2)
    b = np.sin(a).dot(k) + np.random.randn(1000, 2) * noise

    approximator = Regressor(PyTorchApproximator,
                             input_shape=(4, ),
                             output_shape=(2, ),
                             network=ExampleNet,
                             optimizer={
                                 'class': optim.Adam,
                                 'params': {}
                             },
                             loss=F.mse_loss,
                             n_neurons=100,
                             n_hidden=1,
                             n_epochs=200,
                             batch_size=100,
                             quiet=True)

    approximator.fit(a, b)

    bhat = approximator.predict(a)
    error = np.linalg.norm(b - bhat, 'fro') / 1000
    error_inf = np.max(np.abs(b - bhat))

    print(b[:10])

    print(bhat[:10])

    print(error_inf)

    assert error < 2e-4

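    # diff returns the gradient of the outputs w.r.t. the weights, with
    # one column per output dimension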
    gradient = approximator.diff(a[0])
    assert gradient.shape[1] == 2

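    # Weights must round-trip exactly through get_weights/set_weights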
    old_weights = approximator.get_weights()
    approximator.set_weights(old_weights)
    new_weights = approximator.get_weights()

    assert np.array_equal(new_weights, old_weights)

    random_weights = np.random.randn(*old_weights.shape).astype(np.float32)
    approximator.set_weights(random_weights)
    random_weight_new = approximator.get_weights()

    assert np.array_equal(random_weights, random_weight_new)
    assert not np.any(np.equal(random_weights, old_weights))

    bhat_random = approximator.predict(a)

    assert not np.array_equal(bhat, bhat_random)
Example #2
def test_pytorch_approximator():
    np.random.seed(1)
    torch.manual_seed(1)

    n_actions = 2
    s = np.random.rand(1000, 4)
    a = np.random.randint(n_actions, size=(1000, 1))
    q = np.random.rand(1000)

    approximator = Regressor(TorchApproximator, input_shape=(4,),
                             output_shape=(2,), n_actions=n_actions,
                             network=ExampleNet,
                             optimizer={'class': optim.Adam,
                                        'params': {}}, loss=F.mse_loss,
                             batch_size=100, quiet=True)

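    # With n_actions set, fit takes (state, action, target) triplets and
    # the regressor behaves as a Q-function approximator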
    approximator.fit(s, a, q, n_epochs=20)

    x_s = np.random.rand(2, 4)
    x_a = np.random.randint(n_actions, size=(2, 1))
    y = approximator.predict(x_s, x_a)
    y_test = np.array([0.37191153, 0.5920861])

    assert np.allclose(y, y_test)

    y = approximator.predict(x_s)
    y_test = np.array([[0.47908658, 0.37191153],
                       [0.5920861, 0.27575058]])

    assert np.allclose(y, y_test)

    gradient = approximator.diff(x_s[0], x_a[0])
    gradient_test = np.array([0., 0., 0., 0., 0.02627479, 0.76513696,
                              0.6672573, 0.35979462, 0., 1.])
    assert np.allclose(gradient, gradient_test)

    gradient = approximator.diff(x_s[0])
    gradient_test = np.array([[0.02627479, 0.], [0.76513696, 0.],
                              [0.6672573, 0.], [0.35979462, 0.],
                              [0., 0.02627479], [0., 0.76513696],
                              [0., 0.6672573], [0., 0.35979462], [1, 0.],
                              [0., 1.]])
    assert np.allclose(gradient, gradient_test)

    old_weights = approximator.get_weights()
    approximator.set_weights(old_weights)
    new_weights = approximator.get_weights()

    assert np.array_equal(new_weights, old_weights)

    random_weights = np.random.randn(*old_weights.shape).astype(np.float32)
    approximator.set_weights(random_weights)
    random_weight_new = approximator.get_weights()

    assert np.array_equal(random_weights, random_weight_new)
    assert not np.any(np.equal(random_weights, old_weights))
Example #3
def build_high_level_agent(alg, params, mdp, mu, sigma):
    features = Features(basis_list=[PolynomialBasis()])
    approximator = Regressor(LinearApproximator,
                             input_shape=(features.size, ),
                             output_shape=(2, ))
    approximator.set_weights(mu)

    pi1 = DiagonalGaussianPolicy(mu=approximator, std=sigma)

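    # High-level actions are 2D targets bounded by the observation limit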
    lim = mdp.info.observation_space.high[0]
    mdp_info_agent = MDPInfo(observation_space=mdp.info.observation_space,
                             action_space=spaces.Box(0, lim, (2, )),
                             gamma=1.0,
                             horizon=100)
    agent = alg(pi1, mdp_info_agent, features=features, **params)

    return agent
Example #4
def build_high_level_agent(alg, params, mdp, mu, std):
    tilings = Tiles.generate(n_tilings=1,
                             n_tiles=[10, 10],
                             low=mdp.info.observation_space.low[:2],
                             high=mdp.info.observation_space.high[:2])
    features = Features(tilings=tilings)

    input_shape = (features.size, )

    mu_approximator = Regressor(LinearApproximator,
                                input_shape=input_shape,
                                output_shape=(1, ))
    std_approximator = Regressor(LinearApproximator,
                                 input_shape=input_shape,
                                 output_shape=(1, ))

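    # Initialize both linear approximators to constant weights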
    w_mu = mu * np.ones(mu_approximator.weights_size)
    mu_approximator.set_weights(w_mu)

    w_std = std * np.ones(std_approximator.weights_size)
    std_approximator.set_weights(w_std)

    pi = StateLogStdGaussianPolicy(mu=mu_approximator,
                                   log_std=std_approximator)

    obs_low = np.array(
        [mdp.info.observation_space.low[0], mdp.info.observation_space.low[1]])
    obs_high = np.array([
        mdp.info.observation_space.high[0], mdp.info.observation_space.high[1]
    ])
    mdp_info_agent1 = MDPInfo(observation_space=spaces.Box(obs_low,
                                                           obs_high,
                                                           shape=(2, )),
                              action_space=spaces.Box(
                                  mdp.info.observation_space.low[2],
                                  mdp.info.observation_space.high[2],
                                  shape=(1, )),
                              gamma=1,
                              horizon=10)
    agent = alg(policy=pi,
                mdp_info=mdp_info_agent1,
                features=features,
                **params)

    return agent
Example #5
def experiment(alg, n_epochs, n_iterations, ep_per_run):
    np.random.seed()

    # MDP
    mdp = LQR.generate(dimensions=1)

    approximator_params = dict(input_dim=mdp.info.observation_space.shape)
    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape,
                             params=approximator_params)

    sigma = Regressor(LinearApproximator,
                      input_shape=mdp.info.observation_space.shape,
                      output_shape=mdp.info.action_space.shape,
                      params=approximator_params)

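    # Initialize the std approximator with constant weights equal to 2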
    sigma_weights = 2 * np.ones(sigma.weights_size)
    sigma.set_weights(sigma_weights)

    policy = StateStdGaussianPolicy(approximator, sigma)

    # Agent
    learning_rate = AdaptiveParameter(value=.01)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = alg(policy, mdp.info, **algorithm_params)

    # Train
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_run)
    print('policy parameters: ', policy.get_weights())
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))

    for i in range(n_epochs):
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset_eval = core.evaluate(n_episodes=ep_per_run)
        print('policy parameters: ', policy.get_weights())
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        print('J at iteration ' + str(i) + ': ' + str(np.mean(J)))
Example #6
def build_mid_level_agent(alg, params, mdp, mu, std):
    mu_approximator = Regressor(LinearApproximator,
                                input_shape=(1, ),
                                output_shape=(2, ))

    w_mu = mu * np.ones(mu_approximator.weights_size)
    mu_approximator.set_weights(w_mu)

    pi = DiagonalGaussianPolicy(mu=mu_approximator, std=std * np.ones(2))

    lim = mdp.info.observation_space.high[0]
    basis = PolynomialBasis()
    features = BasisFeatures(basis=[basis])
    mdp_info_agent1 = MDPInfo(observation_space=spaces.Box(0, 1, (1, )),
                              action_space=spaces.Box(0, lim, (2, )),
                              gamma=1,
                              horizon=10)
    agent = alg(policy=pi,
                mdp_info=mdp_info_agent1,
                features=features,
                **params)

    return agent
Example #7
def server_experiment_small(alg_high, alg_low, params, subdir, i):

    np.random.seed()
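    # NOTE: ep_per_run, eval_run, n_runs and n_iterations are assumed to
    # be module-level experiment settings defined outside this snippet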

    # Model Block
    mdp = ShipSteering(small=False, n_steps_action=3)

    # State Placeholder
    state_ph = PlaceHolder(name='state_ph')

    # Reward Placeholder
    reward_ph = PlaceHolder(name='reward_ph')

    # Last Action Placeholder
    lastaction_ph = PlaceHolder(name='lastaction_ph')

    # Function Block 1
    function_block1 = fBlock(name='f1 (angle difference)',
                             phi=pos_ref_angle_difference)

    # Function Block 2
    function_block2 = fBlock(name='f2 (cost cosine)', phi=cost_cosine)

    # Features
    features = Features(basis_list=[PolynomialBasis()])

    # Policy 1
    sigma1 = np.array([255, 255])
    approximator1 = Regressor(LinearApproximator,
                              input_shape=(features.size, ),
                              output_shape=(2, ))
    approximator1.set_weights(np.array([500, 500]))

    pi1 = DiagonalGaussianPolicy(mu=approximator1, std=sigma1)

    # Policy 2
    pi2 = DeterministicControlPolicy(weights=np.array([0]))
    mu2 = np.zeros(pi2.weights_size)
    sigma2 = 1e-3 * np.ones(pi2.weights_size)
    distribution2 = GaussianDiagonalDistribution(mu2, sigma2)

    # Agent 1
    learning_rate1 = params.get('learning_rate_high')
    lim = 1000
    mdp_info_agent1 = MDPInfo(observation_space=mdp.info.observation_space,
                              action_space=spaces.Box(0, lim, (2, )),
                              gamma=mdp.info.gamma,
                              horizon=100)
    agent1 = alg_high(policy=pi1,
                      mdp_info=mdp_info_agent1,
                      learning_rate=learning_rate1,
                      features=features)

    # Agent 2
    learning_rate2 = params.get('learning_rate_low')
    mdp_info_agent2 = MDPInfo(observation_space=spaces.Box(
        -np.pi, np.pi, (1, )),
                              action_space=mdp.info.action_space,
                              gamma=mdp.info.gamma,
                              horizon=100)
    agent2 = alg_low(distribution=distribution2,
                     policy=pi2,
                     mdp_info=mdp_info_agent2,
                     learning_rate=learning_rate2)

    # Control Block 1
    parameter_callback1 = CollectPolicyParameter(pi1)
    control_block1 = ControlBlock(name='Control Block 1',
                                  agent=agent1,
                                  n_eps_per_fit=ep_per_run,
                                  callbacks=[parameter_callback1])

    # Control Block 2
    parameter_callback2 = CollectDistributionParameter(distribution2)
    control_block2 = ControlBlock(name='Control Block 2',
                                  agent=agent2,
                                  n_eps_per_fit=10,
                                  callbacks=[parameter_callback2])

    # Reward Accumulator
    reward_acc = reward_accumulator_block(gamma=mdp_info_agent1.gamma,
                                          name='reward_acc')

    # Algorithm
    blocks = [
        state_ph, reward_ph, lastaction_ph, control_block1, control_block2,
        function_block1, function_block2, reward_acc
    ]

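    # Wiring: the high-level block is driven by the raw state and the
    # accumulated reward; the low-level block receives the angle
    # difference as state and the cosine cost as reward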
    state_ph.add_input(control_block2)
    reward_ph.add_input(control_block2)
    lastaction_ph.add_input(control_block2)
    control_block1.add_input(state_ph)
    reward_acc.add_input(reward_ph)
    reward_acc.add_alarm_connection(control_block2)
    control_block1.add_reward(reward_acc)
    control_block1.add_alarm_connection(control_block2)
    function_block1.add_input(control_block1)
    function_block1.add_input(state_ph)
    function_block2.add_input(function_block1)

    control_block2.add_input(function_block1)
    control_block2.add_reward(function_block2)
    computational_graph = ComputationalGraph(blocks=blocks, model=mdp)
    core = HierarchicalCore(computational_graph)

    # Train
    low_level_dataset_eval = list()
    dataset_eval = list()

    dataset_eval_run = core.evaluate(n_episodes=eval_run)
    J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))
    dataset_eval += dataset_eval_run

    for n in range(n_runs):
        print('ITERATION', n)
        core.learn(n_episodes=n_iterations * ep_per_run, skip=True)
        dataset_eval_run = core.evaluate(n_episodes=eval_run)
        dataset_eval += dataset_eval_run
        J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
        print('J at iteration ' + str(n) + ': ' + str(np.mean(J)))
        low_level_dataset_eval += control_block2.dataset.get()

    # Save
    parameter_dataset1 = parameter_callback1.get_values()
    parameter_dataset2 = parameter_callback2.get_values()

    mk_dir_recursive('./' + subdir + str(i))

    np.save(subdir + str(i) + '/low_level_dataset_file',
            low_level_dataset_eval)
    np.save(subdir + str(i) + '/parameter_dataset1_file', parameter_dataset1)
    np.save(subdir + str(i) + '/parameter_dataset2_file', parameter_dataset2)
    np.save(subdir + str(i) + '/dataset_eval_file', dataset_eval)
Example #8
def experiment():
    np.random.seed()
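    # NOTE: phi, pick_last_ep and the visualize_* helpers are assumed to
    # be imported or defined at module level outside this snippet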

    # Model Block
    mdp = ShipSteeringMultiGate()

    # State Placeholder
    state_ph = PlaceHolder(name='state_ph')

    # Reward Placeholder
    reward_ph = PlaceHolder(name='reward_ph')

    # Function Block 1
    function_block1 = fBlock(name='f1 (angle difference)', phi=phi)

    # Function Block 2
    function_block2 = squarednormBlock(name='f2 (squared norm)')

    # Function Block 3
    function_block3 = addBlock(name='f3 (summation)')

    # Features
    features = Features(basis_list=[PolynomialBasis()])

    # Policy 1
    sigma1 = np.array([38, 38])
    approximator1 = Regressor(LinearApproximator,
                              input_shape=(features.size, ),
                              output_shape=(2, ))
    approximator1.set_weights(np.array([75, 75]))

    pi1 = DiagonalGaussianPolicy(mu=approximator1, std=sigma1)

    # Policy 2
    sigma2 = Parameter(value=.01)
    approximator2 = Regressor(LinearApproximator,
                              input_shape=(1, ),
                              output_shape=mdp.info.action_space.shape)
    pi2 = GaussianPolicy(mu=approximator2, sigma=sigma2)

    # Agent 1
    learning_rate = AdaptiveParameter(value=10)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {
        'algorithm_params': algorithm_params,
        'fit_params': fit_params
    }
    mdp_info_agent1 = MDPInfo(observation_space=mdp.info.observation_space,
                              action_space=spaces.Box(0, 150, (2, )),
                              gamma=mdp.info.gamma,
                              horizon=50)
    agent1 = GPOMDP(policy=pi1,
                    mdp_info=mdp_info_agent1,
                    params=agent_params,
                    features=features)

    # Agent 2
    learning_rate = AdaptiveParameter(value=.001)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {
        'algorithm_params': algorithm_params,
        'fit_params': fit_params
    }
    mdp_info_agent2 = MDPInfo(observation_space=spaces.Box(
        -np.pi, np.pi, (1, )),
                              action_space=mdp.info.action_space,
                              gamma=mdp.info.gamma,
                              horizon=100)
    agent2 = GPOMDP(policy=pi2,
                    mdp_info=mdp_info_agent2,
                    params=agent_params,
                    features=None)

    # Control Block 1
    parameter_callback1 = CollectPolicyParameter(pi1)
    control_block1 = ControlBlock(name='Control Block 1',
                                  agent=agent1,
                                  n_eps_per_fit=5,
                                  callbacks=[parameter_callback1])

    # Control Block 2
    dataset_callback = CollectDataset()
    parameter_callback2 = CollectPolicyParameter(pi2)
    control_block2 = ControlBlock(
        name='Control Block 2',
        agent=agent2,
        n_eps_per_fit=10,
        callbacks=[dataset_callback, parameter_callback2])

    # Reward Accumulator
    reward_acc = reward_accumulator_block(gamma=mdp_info_agent1.gamma,
                                          name='reward_acc')

    # Algorithm
    blocks = [
        state_ph, reward_ph, control_block1, control_block2, function_block1,
        function_block2, function_block3, reward_acc
    ]
    #order = [0, 1, 7, 2, 4, 5, 6, 3]
    state_ph.add_input(control_block2)
    reward_ph.add_input(control_block2)
    control_block1.add_input(state_ph)
    reward_acc.add_input(reward_ph)
    reward_acc.add_alarm_connection(control_block2)
    control_block1.add_reward(reward_acc)
    control_block1.add_alarm_connection(control_block2)
    function_block1.add_input(control_block1)
    function_block1.add_input(state_ph)
    function_block2.add_input(function_block1)
    function_block3.add_input(function_block2)
    function_block3.add_input(reward_ph)
    control_block2.add_input(function_block1)
    control_block2.add_reward(function_block3)
    computational_graph = ComputationalGraph(blocks=blocks, model=mdp)
    core = HierarchicalCore(computational_graph)

    # Train
    #dataset_learn_visual = core.learn(n_episodes=2000)
    dataset_learn_visual = list()
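    # Keep only the last episode of each learning run to bound memory usage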
    for n in range(4):
        dataset_learn = core.learn(n_episodes=500)
        last_ep_dataset = pick_last_ep(dataset_learn)
        dataset_learn_visual += last_ep_dataset
        del dataset_learn

    # Evaluate
    dataset_eval = core.evaluate(n_episodes=10)

    # Visualize
    low_level_dataset = dataset_callback.get()
    parameter_dataset1 = parameter_callback1.get_values()
    parameter_dataset2 = parameter_callback2.get_values()
    visualize_policy_params(parameter_dataset1, parameter_dataset2)
    visualize_control_block(low_level_dataset, ep_count=20)
    visualize_ship_steering(dataset_learn_visual, name='learn', n_gates=4)

    visualize_ship_steering(dataset_eval, 'evaluate', n_gates=4)
    plt.show()

    return
Example #9
def test_linear_approximator():
    np.random.seed(1)

    # Generic regressor
    a = np.random.rand(1000, 3)

    k = np.random.rand(3, 2)
    b = a.dot(k) + np.random.randn(1000, 2)

    approximator = Regressor(LinearApproximator,
                             input_shape=(3, ),
                             output_shape=(2, ))

    approximator.fit(a, b)

    x = np.random.rand(2, 3)
    y = approximator.predict(x)
    y_test = np.array([[0.57638247, 0.1573216], [0.11388247, 0.24123678]])

    assert np.allclose(y, y_test)

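    # For a linear model, the derivative w.r.t. the weights is the input
    # itself, repeated block-wise per output dimension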
    point = np.random.randn(3, )
    derivative = approximator.diff(point)

    lp = len(point)
    for i in range(derivative.shape[1]):
        assert (derivative[i * lp:(i + 1) * lp, i] == point).all()

    old_weights = approximator.get_weights()
    approximator.set_weights(old_weights)
    new_weights = approximator.get_weights()

    assert np.array_equal(new_weights, old_weights)

    random_weights = np.random.randn(*old_weights.shape).astype(np.float32)
    approximator.set_weights(random_weights)
    random_weight_new = approximator.get_weights()

    assert np.array_equal(random_weights, random_weight_new)
    assert not np.any(np.equal(random_weights, old_weights))

    # Action regressor + Ensemble
    n_actions = 2
    s = np.random.rand(1000, 3)
    a = np.random.randint(n_actions, size=(1000, 1))
    q = np.random.rand(1000)

    approximator = Regressor(LinearApproximator,
                             input_shape=(3, ),
                             n_actions=n_actions,
                             n_models=5)

    approximator.fit(s, a, q)

    x_s = np.random.rand(2, 3)
    x_a = np.random.randint(n_actions, size=(2, 1))
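    # With n_models > 1, predictions can be aggregated across the
    # ensemble ('mean', 'sum' or 'min')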
    y = approximator.predict(x_s, x_a, prediction='mean')
    y_test = np.array([0.49225698, 0.69660881])
    assert np.allclose(y, y_test)

    y = approximator.predict(x_s, x_a, prediction='sum')
    y_test = np.array([2.46128492, 3.48304404])
    assert np.allclose(y, y_test)

    y = approximator.predict(x_s, x_a, prediction='min')
    y_test = np.array([[0.49225698, 0.69660881]])
    assert np.allclose(y, y_test)

    y = approximator.predict(x_s)
    y_test = np.array([[0.49225698, 0.44154141], [0.69660881, 0.69060195]])
    assert np.allclose(y, y_test)

    approximator = Regressor(LinearApproximator,
                             input_shape=(3, ),
                             n_actions=n_actions)

    approximator.fit(s, a, q)

    gradient = approximator.diff(x_s[0], x_a[0])
    gradient_test = np.array([0.88471362, 0.11666548, 0.45466254, 0., 0., 0.])

    assert np.allclose(gradient, gradient_test)