Example 1
def build_bbo_agent(alg, params, std, mdp):
    input_dim = mdp.info.observation_space.shape[0]
    mu = np.zeros(input_dim)
    sigma = std * np.ones(input_dim)
    policy = SegwayControlPolicy(mu)
    dist = GaussianDiagonalDistribution(mu, sigma)
    agent = alg(dist, policy, mdp.info, **params)

    return agent
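A minimal usage sketch for this factory (hypothetical values; PGPE, AdaptiveParameter, Segway, Core, compute_J and the SegwayControlPolicy used inside build_bbo_agent are assumed to be imported or defined as in the original mushroom-based script):

mdp = Segway()
params = dict(learning_rate=AdaptiveParameter(value=0.3))  # illustrative hyperparameter
agent = build_bbo_agent(PGPE, params, std=1., mdp=mdp)

core = Core(agent, mdp)
core.learn(n_episodes=100, n_episodes_per_fit=25)
print(np.mean(compute_J(core.evaluate(n_episodes=10), gamma=mdp.info.gamma)))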
Example 2
def experiment(alg, params, subdir, exp_no):
    # n_runs, n_iterations and ep_per_run are assumed to be module-level
    # globals in the original script
    np.random.seed()

    # MDP
    mdp = ShipSteering(small=True, n_steps_action=3)

    high = [150, 150, np.pi]
    low = [0, 0, -np.pi]
    n_tiles = [5, 5, 6]
    low = np.array(low, dtype=float)
    high = np.array(high, dtype=float)
    n_tilings = 1

    tilings = Tiles.generate(n_tilings=n_tilings,
                             n_tiles=n_tiles,
                             low=low,
                             high=high)
    phi = Features(tilings=tilings)

    input_shape = (phi.size, )

    approximator_params = dict(input_dim=input_shape)
    approximator = Regressor(LinearApproximator,
                             input_shape=input_shape,
                             output_shape=mdp.info.action_space.shape,
                             params=approximator_params)

    policy = DeterministicPolicy(mu=approximator)

    mu = np.zeros(policy.weights_size)
    sigma = 4e-1 * np.ones(policy.weights_size)
    distribution = GaussianDiagonalDistribution(mu, sigma)

    # Agent
    agent = alg(distribution, policy, mdp.info, features=phi, **params)

    # Train
    dataset_eval = list()
    core = Core(agent, mdp)
    dataset_eval_run = core.evaluate(n_episodes=ep_per_run)
    #print('distribution parameters: ', distribution.get_parameters())
    J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))
    dataset_eval += dataset_eval_run

    for n in range(n_runs):
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset_eval_run = core.evaluate(n_episodes=ep_per_run)
        J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
        print('J at iteration ' + str(n) + ': ' + str(np.mean(J)))
        dataset_eval += dataset_eval_run

    mk_dir_recursive('./' + subdir + str(exp_no))
    np.save(subdir + str(exp_no) + '/dataset_eval_file', dataset_eval)
Example 3
def build_bbo_agent(alg, params, mdp):
    phi, approximator = build_approximator(mdp)

    policy = DeterministicPolicy(mu=approximator)

    mu = np.zeros(policy.weights_size)
    sigma = 4e-1 * np.ones(policy.weights_size)
    distribution = GaussianDiagonalDistribution(mu, sigma)

    agent = alg(distribution, policy, mdp.info, features=phi, **params)

    return agent
Example 4
def experiment(alg, params, n_epochs, n_episodes, n_ep_per_fit):
    np.random.seed()

    # MDP
    mdp = Segway()

    # Policy
    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    n_weights = approximator.weights_size
    mu = np.zeros(n_weights)
    sigma = 2e-0 * np.ones(n_weights)
    policy = DeterministicPolicy(approximator)
    dist = GaussianDiagonalDistribution(mu, sigma)

    agent = alg(dist, policy, mdp.info, **params)

    # Train
    print(alg.__name__)
    dataset_callback = CollectDataset()
    core = Core(agent, mdp, callbacks=[dataset_callback])

    for i in range(n_epochs):
        core.learn(n_episodes=n_episodes,
                   n_episodes_per_fit=n_ep_per_fit,
                   render=False)
        J = compute_J(dataset_callback.get(), gamma=mdp.info.gamma)
        dataset_callback.clean()

        p = dist.get_parameters()

        print('mu:    ', p[:n_weights])
        print('sigma: ', p[n_weights:])
        print('Reward at iteration ' + str(i) + ': ' + str(np.mean(J)))

    print('Press ENTER to visualize the segway...')
    input()
    core.evaluate(n_episodes=3, render=True)
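A hypothetical invocation of the experiment above (parameter values are illustrative only, assuming PGPE and AdaptiveParameter are imported as in the other examples):

experiment(PGPE, dict(learning_rate=AdaptiveParameter(value=0.3)),
           n_epochs=25, n_episodes=100, n_ep_per_fit=25)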
Example 5
def experiment(n_epochs, n_iteration, n_ep_per_fit, n_eval_run):
    np.random.seed()

    # MDP
    mdp = SegwayLinearMotion()

    input_dim = mdp.info.observation_space.shape[0]
    mu = np.zeros(input_dim)
    sigma = 2e-0 * np.ones(input_dim)
    policy = SegwayControlPolicy(mu)
    dist = GaussianDiagonalDistribution(mu, sigma)
    beta = 2e-3

    agent = RWR(dist, policy, mdp.info, beta)

    # Train
    core = Core(agent, mdp)

    dataset_eval = core.evaluate(n_episodes=n_eval_run, render=False)
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    print('J at start ', np.mean(J))

    for i in range(n_epochs):
        core.learn(n_episodes=n_iteration * n_ep_per_fit,
                   n_episodes_per_fit=n_ep_per_fit,
                   render=False)

        dataset_eval = core.evaluate(n_episodes=n_eval_run, render=False)
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)

        p = dist.get_parameters()

        print('mu:    ', p[:input_dim])
        print('sigma: ', p[input_dim:])
        print('J at iteration ' + str(i) + ': ' + str(np.mean(J)))

    print('Press ENTER to visualize the segway...')
    input()
    core.evaluate(n_episodes=3, render=True)
Example 6
def test_REPS():
    distribution = GaussianDiagonalDistribution(mu, sigma)
    agent = REPS(distribution, policy, mdp.info, eps=.7, features=phi)

    agent.episode_start()

    agent.fit(dataset)

    w_1 = .76179551
    w_2 = .08787432

    w = agent.policy.get_weights()

    assert np.allclose(w_1, w[10])
    assert np.allclose(w_2, w[18])
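Note: this test and the ones below (test_RWR, test_PGPE) rely on module-level fixtures (mu, sigma, policy, mdp, phi and dataset) defined elsewhere in the test module; Example 12 shows the kind of setup that produces them.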
Example 7
def test_RWR():
    distribution = GaussianDiagonalDistribution(mu, sigma)
    agent = RWR(distribution, policy, mdp.info, beta=1., features=phi)

    agent.episode_start()

    agent.fit(dataset)

    w_1 = 4.24574375e-1
    w_2 = -1.10809513e-1

    w = agent.policy.get_weights()

    assert np.allclose(w_1, w[10])
    assert np.allclose(w_2, w[18])
Example 8
def experiment(alg, params, n_epochs, n_iterations, ep_per_run):
    np.random.seed()

    # MDP
    mdp = ShipSteering()

    # Policy
    high = [150, 150, np.pi]
    low = [0, 0, -np.pi]
    n_tiles = [5, 5, 6]
    low = np.array(low, dtype=float)
    high = np.array(high, dtype=float)
    n_tilings = 1

    tilings = Tiles.generate(n_tilings=n_tilings,
                             n_tiles=n_tiles,
                             low=low,
                             high=high)

    phi = Features(tilings=tilings)
    input_shape = (phi.size, )

    approximator = Regressor(LinearApproximator,
                             input_shape=input_shape,
                             output_shape=mdp.info.action_space.shape)

    policy = DeterministicPolicy(approximator)

    mu = np.zeros(policy.weights_size)
    sigma = 4e-1 * np.ones(policy.weights_size)
    distribution = GaussianDiagonalDistribution(mu, sigma)

    # Agent
    agent = alg(distribution, policy, mdp.info, features=phi, **params)

    # Train
    print(alg.__name__)
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_run)
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))

    for i in range(n_epochs):
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset_eval = core.evaluate(n_episodes=ep_per_run)
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        print('J at iteration ' + str(i) + ': ' + str(np.mean(J)))
Example 9
def test_PGPE():
    distribution = GaussianDiagonalDistribution(mu, sigma)
    agent = PGPE(distribution, policy, mdp.info,
                 learning_rate=AdaptiveParameter(1.5), features=phi)

    agent.episode_start()

    agent.fit(dataset)

    w_1 = .54454343
    w_2 = .5449792

    w = agent.policy.get_weights()

    assert np.allclose(w_1, w[10])
    assert np.allclose(w_2, w[18])
Example 10
def build_low_level_agent(alg, params, mdp):
    features = Features(
        basis_list=[PolynomialBasis(dimensions=[0], degrees=[1])])

    pi = DeterministicControlPolicy(weights=np.array([0]))
    mu = np.zeros(pi.weights_size)
    sigma = 1e-3 * np.ones(pi.weights_size)
    distribution = GaussianDiagonalDistribution(mu, sigma)

    mdp_info_agent2 = MDPInfo(
        observation_space=spaces.Box(-np.pi, np.pi, (1,)),
        action_space=mdp.info.action_space,
        gamma=mdp.info.gamma,
        horizon=100)
    agent = alg(distribution, pi, mdp_info_agent2, features=features, **params)

    return agent
Example 11
def learn(alg, **alg_params):
    np.random.seed(1)
    torch.manual_seed(1)

    # MDP
    mdp = LQR.generate(dimensions=2)

    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    policy = DeterministicPolicy(mu=approximator)

    mu = np.zeros(policy.weights_size)
    sigma = 1e-3 * np.ones(policy.weights_size)
    distribution = GaussianDiagonalDistribution(mu, sigma)

    agent_test = alg(distribution, policy, mdp.info, **alg_params)
    core = Core(agent_test, mdp)

    core.learn(n_episodes=5, n_episodes_per_fit=5)

    return distribution
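The helper returns the updated distribution, so callers can inspect the learned parameters. A hypothetical check (RWR imported as in the surrounding examples):

distribution = learn(RWR, beta=1.)
p = distribution.get_parameters()  # flat vector: means first, then standard deviations
print(p[:p.size // 2], p[p.size // 2:])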
Example 12
# mdp, n_tiles, low and high are assumed to be defined earlier in the
# original test module (cf. the setup in Examples 2 and 8)
n_tilings = 1

tilings = Tiles.generate(n_tilings=n_tilings, n_tiles=n_tiles, low=low,
                         high=high)

phi = Features(tilings=tilings)
input_shape = (phi.size,)

approximator = Regressor(LinearApproximator, input_shape=input_shape,
                         output_shape=mdp.info.action_space.shape)

policy = DeterministicPolicy(approximator)

mu = np.zeros(policy.weights_size)
sigma = 4e-1 * np.ones(policy.weights_size)
distribution_test = GaussianDiagonalDistribution(mu, sigma)
agent_test = RWR(distribution_test, policy, mdp.info, beta=1.)
core = Core(agent_test, mdp)

s = np.arange(10)
a = np.arange(10)
r = np.arange(10)
ss = s + 5
ab = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1])
last = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1])

dataset = list()
for i in range(s.size):
    dataset.append([np.array([s[i]]), np.array([a[i]]), r[i],
                    np.array([ss[i]]), ab[i], last[i]])
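A plausible continuation of this setup, mirroring the fit calls in Examples 6, 7 and 9 (not part of the original snippet):

agent_test.fit(dataset)
print(agent_test.policy.get_weights())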
Example 13
def server_experiment_small(alg_high, alg_low, params, subdir, i):
    # eval_run, n_runs, n_iterations and ep_per_run are assumed to be
    # module-level globals in the original script
    np.random.seed()

    # Model Block
    mdp = ShipSteering(small=False, n_steps_action=3)

    # State Placeholder
    state_ph = PlaceHolder(name='state_ph')

    # Reward Placeholder
    reward_ph = PlaceHolder(name='reward_ph')

    # Last action Placeholder
    lastaction_ph = PlaceHolder(name='lastaction_ph')

    # Function Block 1
    function_block1 = fBlock(name='f1 (angle difference)',
                             phi=pos_ref_angle_difference)

    # Function Block 2
    function_block2 = fBlock(name='f2 (cost cosine)', phi=cost_cosine)

    # Features
    features = Features(basis_list=[PolynomialBasis()])

    # Policy 1
    sigma1 = np.array([255, 255])
    approximator1 = Regressor(LinearApproximator,
                              input_shape=(features.size, ),
                              output_shape=(2, ))
    approximator1.set_weights(np.array([500, 500]))

    pi1 = DiagonalGaussianPolicy(mu=approximator1, std=sigma1)

    # Policy 2
    pi2 = DeterministicControlPolicy(weights=np.array([0]))
    mu2 = np.zeros(pi2.weights_size)
    sigma2 = 1e-3 * np.ones(pi2.weights_size)
    distribution2 = GaussianDiagonalDistribution(mu2, sigma2)

    # Agent 1
    learning_rate1 = params.get('learning_rate_high')
    lim = 1000
    mdp_info_agent1 = MDPInfo(observation_space=mdp.info.observation_space,
                              action_space=spaces.Box(0, lim, (2, )),
                              gamma=mdp.info.gamma,
                              horizon=100)
    agent1 = alg_high(policy=pi1,
                      mdp_info=mdp_info_agent1,
                      learning_rate=learning_rate1,
                      features=features)

    # Agent 2
    learning_rate2 = params.get('learning_rate_low')
    mdp_info_agent2 = MDPInfo(
        observation_space=spaces.Box(-np.pi, np.pi, (1,)),
        action_space=mdp.info.action_space,
        gamma=mdp.info.gamma,
        horizon=100)
    agent2 = alg_low(distribution=distribution2,
                     policy=pi2,
                     mdp_info=mdp_info_agent2,
                     learning_rate=learning_rate2)

    # Control Block 1
    parameter_callback1 = CollectPolicyParameter(pi1)
    control_block1 = ControlBlock(name='Control Block 1',
                                  agent=agent1,
                                  n_eps_per_fit=ep_per_run,
                                  callbacks=[parameter_callback1])

    # Control Block 2
    parameter_callback2 = CollectDistributionParameter(distribution2)
    control_block2 = ControlBlock(name='Control Block 2',
                                  agent=agent2,
                                  n_eps_per_fit=10,
                                  callbacks=[parameter_callback2])

    # Reward Accumulator
    reward_acc = reward_accumulator_block(gamma=mdp_info_agent1.gamma,
                                          name='reward_acc')

    # Algorithm
    blocks = [
        state_ph, reward_ph, lastaction_ph, control_block1, control_block2,
        function_block1, function_block2, reward_acc
    ]

    state_ph.add_input(control_block2)
    reward_ph.add_input(control_block2)
    lastaction_ph.add_input(control_block2)
    control_block1.add_input(state_ph)
    reward_acc.add_input(reward_ph)
    reward_acc.add_alarm_connection(control_block2)
    control_block1.add_reward(reward_acc)
    control_block1.add_alarm_connection(control_block2)
    function_block1.add_input(control_block1)
    function_block1.add_input(state_ph)
    function_block2.add_input(function_block1)

    control_block2.add_input(function_block1)
    control_block2.add_reward(function_block2)
    computational_graph = ComputationalGraph(blocks=blocks, model=mdp)
    core = HierarchicalCore(computational_graph)

    # Train
    low_level_dataset_eval = list()
    dataset_eval = list()

    dataset_eval_run = core.evaluate(n_episodes=eval_run)
    J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))
    dataset_eval += dataset_eval_run

    for n in range(n_runs):
        print('ITERATION', n)
        core.learn(n_episodes=n_iterations * ep_per_run, skip=True)
        dataset_eval_run = core.evaluate(n_episodes=eval_run)
        dataset_eval += dataset_eval_run
        J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
        print('J at iteration ' + str(n) + ': ' + str(np.mean(J)))
        low_level_dataset_eval += control_block2.dataset.get()

    # Save
    parameter_dataset1 = parameter_callback1.get_values()
    parameter_dataset2 = parameter_callback2.get_values()

    mk_dir_recursive('./' + subdir + str(i))

    np.save(subdir + str(i) + '/low_level_dataset_file',
            low_level_dataset_eval)
    np.save(subdir + str(i) + '/parameter_dataset1_file', parameter_dataset1)
    np.save(subdir + str(i) + '/parameter_dataset2_file', parameter_dataset2)
    np.save(subdir + str(i) + '/dataset_eval_file', dataset_eval)
Example 14
def server_experiment_small(alg_high, alg_low, params, subdir, i):
    # eval_run, n_runs, n_iterations and ep_per_run are assumed to be
    # module-level globals in the original script
    np.random.seed()

    # Model Block
    mdp = SegwayLinearMotion(goal_distance=1.0)

    # State Placeholder
    state_ph = PlaceHolder(name='state_ph')

    # Reward Placeholder
    reward_ph = PlaceHolder(name='reward_ph')

    # Last action Placeholder
    lastaction_ph = PlaceHolder(name='lastaction_ph')

    # Function Block 1
    function_block1 = fBlock(name='f1 (pick distance to goal state var)',
                             phi=pick_first_state)

    # Function Block 2
    function_block2 = fBlock(name='f2 (build state)',
                             phi=angle_to_angle_diff_complete_state)

    # Function Block 3
    function_block3 = fBlock(name='f3 (reward low level)', phi=lqr_cost_segway)

    # Function Block 4
    function_block4 = addBlock(name='f4 (add block)')

    # Integrator Block
    error_acc = ErrorAccumulatorBlock(name='error acc')

    # Features
    features1 = Features(basis_list=[PolynomialBasis()])
    approximator1 = Regressor(LinearApproximator,
                              input_shape=(features1.size, ),
                              output_shape=(1, ))

    # Policy 1
    n_weights = approximator1.weights_size
    mu1 = np.zeros(n_weights)
    sigma1 = 2e-0 * np.ones(n_weights)
    pi1 = DeterministicPolicy(approximator1)
    dist1 = GaussianDiagonalDistribution(mu1, sigma1)

    # Agent 1
    eps1 = params.get('eps')
    lim = 2 * np.pi
    mdp_info_agent1 = MDPInfo(observation_space=mdp.info.observation_space,
                              action_space=spaces.Box(0, lim, (1, )),
                              gamma=mdp.info.gamma,
                              horizon=20)

    agent1 = alg_low(distribution=dist1,
                     policy=pi1,
                     features=features1,
                     mdp_info=mdp_info_agent1,
                     eps=eps1)

    # Policy 2
    basis = PolynomialBasis.generate(1, 3)
    features2 = Features(basis_list=basis)
    approximator2 = Regressor(LinearApproximator,
                              input_shape=(features2.size, ),
                              output_shape=(1, ))
    n_weights2 = approximator2.weights_size
    mu2 = np.zeros(n_weights2)
    sigma2 = 2e-0 * np.ones(n_weights2)
    pi2 = DeterministicPolicy(approximator2)
    dist2 = GaussianDiagonalDistribution(mu2, sigma2)

    # Agent 2
    mdp_info_agent2 = MDPInfo(
        observation_space=spaces.Box(low=np.array([-np.pi, -np.pi, -np.pi]),
                                     high=np.array([np.pi, np.pi, np.pi]),
                                     shape=(3,)),
        action_space=mdp.info.action_space,
        gamma=mdp.info.gamma,
        horizon=30)

    agent2 = alg_low(distribution=dist2,
                     policy=pi2,
                     features=features2,
                     mdp_info=mdp_info_agent2,
                     eps=eps1)

    # Control Block 1
    parameter_callback1 = CollectDistributionParameter(dist1)
    control_block1 = ControlBlock(name='Control Block 1',
                                  agent=agent1,
                                  n_eps_per_fit=ep_per_run,
                                  callbacks=[parameter_callback1])

    # Control Block 2
    parameter_callback2 = CollectDistributionParameter(dist2)
    control_block2 = ControlBlock(name='Control Block 2',
                                  agent=agent2,
                                  n_eps_per_fit=20,
                                  callbacks=[parameter_callback2])

    # Reward Accumulator
    reward_acc = reward_accumulator_block(gamma=mdp_info_agent1.gamma,
                                          name='reward_acc')

    # Algorithm
    blocks = [
        state_ph, reward_ph, lastaction_ph, control_block1, control_block2,
        function_block1, function_block2, function_block3, function_block4,
        error_acc, reward_acc
    ]

    state_ph.add_input(control_block2)
    reward_ph.add_input(control_block2)
    lastaction_ph.add_input(control_block2)
    reward_acc.add_input(reward_ph)
    reward_acc.add_alarm_connection(control_block2)
    control_block1.add_input(function_block1)
    control_block1.add_reward(reward_acc)
    control_block1.add_alarm_connection(control_block2)
    control_block2.add_input(function_block2)
    control_block2.add_reward(function_block4)
    function_block1.add_input(state_ph)
    function_block2.add_input(control_block1)
    function_block2.add_input(state_ph)
    function_block3.add_input(function_block2)
    error_acc.add_input(function_block3)
    error_acc.add_alarm_connection(control_block2)
    function_block4.add_input(function_block3)
    function_block4.add_input(error_acc)
    computational_graph = ComputationalGraph(blocks=blocks, model=mdp)
    core = HierarchicalCore(computational_graph)

    # Train
    low_level_dataset_eval = list()
    dataset_eval = list()

    dataset_eval_run = core.evaluate(n_episodes=eval_run, render=True)
    J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))
    dataset_eval += dataset_eval_run

    for n in range(n_runs):
        print('ITERATION', n)
        core.learn(n_episodes=n_iterations * ep_per_run, skip=True)
        dataset_eval_run = core.evaluate(n_episodes=eval_run, render=True)
        dataset_eval += dataset_eval_run
        J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
        print('J at iteration ' + str(n) + ': ' + str(np.mean(J)))