Example #1
def test_PGPE():
    distribution = learn(PGPE, learning_rate=AdaptiveParameter(1.5))
    w = distribution.get_parameters()
    w_test = np.array([
        0.02489092, 0.31062211, 0.2051433, 0.05959651, -0.78302236, 0.77381954,
        0.23676176, -0.29855654
    ])

    assert np.allclose(w, w_test)
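Example #2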
def experiment(alg, n_runs, n_iterations, ep_per_run, use_tensorflow):
    np.random.seed()

    # MDP
    mdp = ShipSteering()

    # Policy
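    # 3x3x6x2 grid of Gaussian basis functions over the ship's x-y position,
    # heading and turning rate.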
    if use_tensorflow:
        tensor_list = gaussian_tensor.generate(
            [3, 3, 6, 2], [[0., 150.], [0., 150.], [-np.pi, np.pi],
                           [-np.pi / 12, np.pi / 12]])

        phi = Features(tensor_list=tensor_list,
                       name='phi',
                       input_dim=mdp.info.observation_space.shape[0])
    else:
        basis = GaussianRBF.generate([3, 3, 6, 2],
                                     [[0., 150.], [0., 150.], [-np.pi, np.pi],
                                      [-np.pi / 12, np.pi / 12]])

        phi = Features(basis_list=basis)

    input_shape = (phi.size, )

    approximator_params = dict(input_dim=phi.size)
    approximator = Regressor(LinearApproximator,
                             input_shape=input_shape,
                             output_shape=mdp.info.action_space.shape,
                             params=approximator_params)

    sigma = np.array([[.05]])
    policy = MultivariateGaussianPolicy(mu=approximator, sigma=sigma)

    # Agent
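    # AdaptiveParameter bounds the norm of each gradient step by the given
    # value, instead of using a fixed step size.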
    learning_rate = AdaptiveParameter(value=.01)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {
        'algorithm_params': algorithm_params,
        'fit_params': fit_params
    }
    agent = alg(policy, mdp.info, agent_params, phi)

    # Train
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_run)
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))

    for i in range(n_runs):
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset_eval = core.evaluate(n_episodes=ep_per_run)
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        print('J at iteration ' + str(i) + ': ' + str(np.mean(J)))

    np.save('ship_steering.npy', dataset_eval)
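A driver along the following lines would run this script; the algorithm choice and the run counts are illustrative assumptions, not part of the original snippet.

# Hypothetical driver: REINFORCE and the counts below are assumptions.
if __name__ == '__main__':
    experiment(REINFORCE, n_runs=25, n_iterations=10, ep_per_run=20,
               use_tensorflow=False)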
Example #3
def test_PGPE():
    distribution = GaussianDiagonalDistribution(mu, sigma)
    agent = PGPE(distribution, policy, mdp.info,
                 learning_rate=AdaptiveParameter(1.5), features=phi)

    agent.episode_start()

    agent.fit(dataset)

    w_1 = .54454343
    w_2 = .5449792

    w = agent.policy.get_weights()

    assert np.allclose(w_1, w[10])
    assert np.allclose(w_2, w[18])
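Example #4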
def experiment(alg, n_epochs, n_iterations, ep_per_run):
    np.random.seed()

    # MDP
    mdp = LQR.generate(dimensions=1)

    approximator_params = dict(input_dim=mdp.info.observation_space.shape)
    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape,
                             params=approximator_params)

    sigma = Regressor(LinearApproximator,
                      input_shape=mdp.info.observation_space.shape,
                      output_shape=mdp.info.action_space.shape,
                      params=approximator_params)

    sigma_weights = 2 * np.ones(sigma.weights_size)
    sigma.set_weights(sigma_weights)

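    # Gaussian policy whose standard deviation is given by a second,
    # state-dependent linear approximator (sigma above).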
    policy = StateStdGaussianPolicy(approximator, sigma)

    # Agent
    learning_rate = AdaptiveParameter(value=.01)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = alg(policy, mdp.info, **algorithm_params)

    # Train
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_run)
    print('policy parameters: ', policy.get_weights())
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))

    for i in range(n_epochs):
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset_eval = core.evaluate(n_episodes=ep_per_run)
        print('policy parameters: ', policy.get_weights())
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        print('J at iteration ' + str(i) + ': ' + str(np.mean(J)))
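A usage sketch for this function follows; the argument values are assumed, and REINFORCE, GPOMDP and eNAC are assumed to be imported from mushroom.

# Hypothetical driver (argument values are illustrative assumptions).
if __name__ == '__main__':
    for alg in [REINFORCE, GPOMDP, eNAC]:
        experiment(alg, n_epochs=4, n_iterations=10, ep_per_run=100)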
Example #5
def experiment(alg, n_runs, n_iterations, ep_per_run):
    np.random.seed()

    # MDP
    mdp = LQR.generate(dimensions=1)

    approximator_params = dict(input_dim=mdp.info.observation_space.shape)
    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape,
                             params=approximator_params)

    sigma = .1 * np.eye(1)
    policy = MultivariateGaussianPolicy(mu=approximator, sigma=sigma)

    # Agent
    learning_rate = AdaptiveParameter(value=.01)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {
        'algorithm_params': algorithm_params,
        'fit_params': fit_params
    }
    agent = alg(policy, mdp.info, agent_params)

    # Train
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_run)
    print('policy parameters: ', policy.get_weights())
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))

    for i in range(n_runs):
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset_eval = core.evaluate(n_episodes=ep_per_run)
        print('policy parameters: ', policy.get_weights())
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        print('J at iteration ' + str(i) + ': ' + str(np.mean(J)))

    np.save('ship_steering.npy', dataset_eval)
Example #6
        ep_per_run_hier = ep_per_epoch // n_iterations_hier

        print('High: ', alg_h.__name__, ' Low: ', alg_l.__name__)
        res = Parallel(n_jobs=n_jobs)(delayed(hierarchical_experiment)
                                      (mdp, agent_l, agent_h,
                                       n_epochs, n_iterations_hier,
                                       ep_per_run_hier, ep_per_eval,
                                       ep_per_run_low)
                                      for _ in range(how_many))
        J, L = parse_joblib(res)
        np.save(subdir + '/J_H_' + alg_h.__name__ + '_' + alg_l.__name__, J)
        np.save(subdir + '/L_H_' + alg_h.__name__ + '_' + alg_l.__name__, L)

    # GHAVAMZADEH
    params_high = {'learning_rate': Parameter(value=8e-2), 'lambda_coeff': 0.9}
    agent_high = build_high_level_ghavamzadeh(QLambdaDiscrete, params_high,
                                              mdp)

    params_low = {'learning_rate': AdaptiveParameter(value=1e-2)}
    agent_cross = build_low_level_ghavamzadeh(GPOMDP, params_low, mdp)
    agent_plus = build_low_level_ghavamzadeh(GPOMDP, params_low, mdp)

    print('ghavamzadeh')
    res = Parallel(n_jobs=n_jobs)(delayed(ghavamzadeh_experiment)(
        mdp, agent_plus, agent_cross, agent_high, n_epochs, ep_per_epoch,
        ep_per_eval, ep_per_run_low_ghavamzadeh) for _ in range(how_many))
    J, L = parse_joblib(res)
    np.save(subdir + '/J_ghavamzadeh', J)
    np.save(subdir + '/L_ghavamzadeh', L)
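Example #7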
phi = Features(tensor_list=tensor_list,  # tensor_list built as in Example #2
               name='phi',
               input_dim=mdp.info.observation_space.shape[0])

input_shape = (phi.size, )

approximator_params = dict(input_dim=phi.size)
approximator = Regressor(LinearApproximator,
                         input_shape=input_shape,
                         output_shape=mdp.info.action_space.shape,
                         params=approximator_params)

sigma = np.eye(2) * 1e-1
policy = MultivariateGaussianPolicy(mu=approximator, sigma=sigma)

# Agent
learning_rate = AdaptiveParameter(value=5)
algorithm_params = dict(learning_rate=learning_rate)
fit_params = dict()
agent_params = {'algorithm_params': algorithm_params, 'fit_params': fit_params}
agent = REINFORCE(policy, mdp.info, agent_params, phi)

# Train
core = Core(agent, mdp)
print('Initial evaluation')
dataset_eval = core.evaluate(n_episodes=ep_per_run)
J = compute_J(dataset_eval, gamma=mdp.info.gamma)
print('J at start : ' + str(np.mean(J)))

for i in range(n_runs):
    print('iteration', i)
    print('learn')
Example #8
    mk_dir_recursive('./' + subdir + str(i))

    np.save(subdir + str(i) + '/low_level_dataset_file',
            low_level_dataset_eval)
    np.save(subdir + str(i) + '/parameter_dataset1_file', parameter_dataset1)
    np.save(subdir + str(i) + '/parameter_dataset2_file', parameter_dataset2)
    np.save(subdir + str(i) + '/dataset_eval_file', dataset_eval)


if __name__ == '__main__':

    subdir = datetime.datetime.now().strftime(
        '%Y-%m-%d_%H-%M-%S') + '_big_hierarchical/'
    alg_high = GPOMDP
    alg_low = PGPE
    learning_rate_high = AdaptiveParameter(value=50)
    learning_rate_low = AdaptiveParameter(value=5e-4)
    n_jobs = -1
    how_many = 1
    n_runs = 25
    n_iterations = 20
    ep_per_run = 40
    eval_run = 50
    mk_dir_recursive('./' + subdir)
    force_symlink('./' + subdir, 'latest')

    params = {
        'learning_rate_high': learning_rate_high,
        'learning_rate_low': learning_rate_low
    }
    np.save(subdir + '/algorithm_params_dictionary', params)
Example #9
        print('ITERATION', n)

        if n == 2:
            control_block1.unset_mask()
        core.learn(n_episodes=n_iterations*n_ep_per_fit, skip=True)
        dataset_eval_run = core.evaluate(n_episodes=eval_run, render=False)
        J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
        print('J at iteration ' + str(n) + ': ' + str(np.mean(J)))
        print('dist H:', dist1.get_parameters())
        print('dist L mu:', dist2.get_parameters()[:3])
        print('dist L sigma:', dist2.get_parameters()[3:])


if __name__ == '__main__':
    learning_rate_high = Parameter(value=1e-5)
    learning_rate_low = AdaptiveParameter(value=1e-1)
    eps_high = 0.05
    eps_low = 0.05
    beta_high = 0.01
    beta_low = 2e-3

    algs_params = [
        #(REPS, REPS, {'eps': eps_high}, {'eps': eps_low}),
        (RWR, RWR, {'beta': beta_high}, {'beta': beta_low}),
        #(PGPE, PGPE, {'learning_rate': learning_rate_high},
        # {'learning_rate': learning_rate_low}),
        #(PGPE, RWR, {'learning_rate': learning_rate_high}, {'beta': beta_low})
    ]

    n_jobs = 1
Example #10
def experiment():
    np.random.seed()

    # Model Block
    mdp = ShipSteeringMultiGate()

    #State Placeholder
    state_ph = PlaceHolder(name='state_ph')

    #Reward Placeholder
    reward_ph = PlaceHolder(name='reward_ph')

    # Function Block 1
    function_block1 = fBlock(name='f1 (angle difference)', phi=phi)

    # Function Block 2
    function_block2 = squarednormBlock(name='f2 (squared norm)')

    # Function Block 3
    function_block3 = addBlock(name='f3 (summation)')

    #Features
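    # A single degree-0 PolynomialBasis yields a constant feature, so the
    # high-level policy mean is simply the weight vector set below.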
    features = Features(basis_list=[PolynomialBasis()])

    # Policy 1
    sigma1 = np.array([38, 38])
    approximator1 = Regressor(LinearApproximator,
                              input_shape=(features.size, ),
                              output_shape=(2, ))
    approximator1.set_weights(np.array([75, 75]))

    pi1 = DiagonalGaussianPolicy(mu=approximator1, sigma=sigma1)

    # Policy 2
    sigma2 = Parameter(value=.01)
    approximator2 = Regressor(LinearApproximator,
                              input_shape=(1, ),
                              output_shape=mdp.info.action_space.shape)
    pi2 = GaussianPolicy(mu=approximator2, sigma=sigma2)

    # Agent 1
    learning_rate = AdaptiveParameter(value=10)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {
        'algorithm_params': algorithm_params,
        'fit_params': fit_params
    }
    mdp_info_agent1 = MDPInfo(observation_space=mdp.info.observation_space,
                              action_space=spaces.Box(0, 150, (2, )),
                              gamma=mdp.info.gamma,
                              horizon=50)
    agent1 = GPOMDP(policy=pi1,
                    mdp_info=mdp_info_agent1,
                    params=agent_params,
                    features=features)

    # Agent 2
    learning_rate = AdaptiveParameter(value=.001)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {
        'algorithm_params': algorithm_params,
        'fit_params': fit_params
    }
    mdp_info_agent2 = MDPInfo(observation_space=spaces.Box(-np.pi, np.pi,
                                                           (1,)),
                              action_space=mdp.info.action_space,
                              gamma=mdp.info.gamma,
                              horizon=100)
    agent2 = GPOMDP(policy=pi2,
                    mdp_info=mdp_info_agent2,
                    params=agent_params,
                    features=None)

    # Control Block 1
    parameter_callback1 = CollectPolicyParameter(pi1)
    control_block1 = ControlBlock(name='Control Block 1',
                                  agent=agent1,
                                  n_eps_per_fit=5,
                                  callbacks=[parameter_callback1])

    # Control Block 2
    dataset_callback = CollectDataset()
    parameter_callback2 = CollectPolicyParameter(pi2)
    control_block2 = ControlBlock(
        name='Control Block 2',
        agent=agent2,
        n_eps_per_fit=10,
        callbacks=[dataset_callback, parameter_callback2])

    #Reward Accumulator
    reward_acc = reward_accumulator_block(gamma=mdp_info_agent1.gamma,
                                          name='reward_acc')

    # Algorithm
    blocks = [
        state_ph, reward_ph, control_block1, control_block2, function_block1,
        function_block2, function_block3, reward_acc
    ]
    #order = [0, 1, 7, 2, 4, 5, 6, 3]
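    # Wiring: the high-level output and the state feed f1 (angle difference);
    # f1 is the low-level agent's input, while f2 (square) and f3 (sum with
    # the environment reward) build the low-level reward signal.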
    state_ph.add_input(control_block2)
    reward_ph.add_input(control_block2)
    control_block1.add_input(state_ph)
    reward_acc.add_input(reward_ph)
    reward_acc.add_alarm_connection(control_block2)
    control_block1.add_reward(reward_acc)
    control_block1.add_alarm_connection(control_block2)
    function_block1.add_input(control_block1)
    function_block1.add_input(state_ph)
    function_block2.add_input(function_block1)
    function_block3.add_input(function_block2)
    function_block3.add_input(reward_ph)
    control_block2.add_input(function_block1)
    control_block2.add_reward(function_block3)
    computational_graph = ComputationalGraph(blocks=blocks, model=mdp)
    core = HierarchicalCore(computational_graph)

    # Train
    #dataset_learn_visual = core.learn(n_episodes=2000)
    dataset_learn_visual = list()
    for n in range(4):
        dataset_learn = core.learn(n_episodes=500)
        last_ep_dataset = pick_last_ep(dataset_learn)
        dataset_learn_visual += last_ep_dataset
        del dataset_learn

    # Evaluate
    dataset_eval = core.evaluate(n_episodes=10)

    # Visualize
    low_level_dataset = dataset_callback.get()
    parameter_dataset1 = parameter_callback1.get_values()
    parameter_dataset2 = parameter_callback2.get_values()
    visualize_policy_params(parameter_dataset1, parameter_dataset2)
    visualize_control_block(low_level_dataset, ep_count=20)
    visualize_ship_steering(dataset_learn_visual, name='learn', n_gates=4)

    visualize_ship_steering(dataset_eval, 'evaluate', n_gates=4)
    plt.show()

    return
Example #11
if __name__ == '__main__':
    how_many = 100
    n_jobs = -1
    n_runs = 25
    n_iterations = 10
    ep_per_run = 20

    algs_and_params = [
        (REPS, {
            'eps': 1.0
        }),
        (RWR, {
            'beta': 0.7
        }),
        (PGPE, {
            'learning_rate': AdaptiveParameter(value=1.5)
        }),
    ]

    base_dir = datetime.datetime.now().strftime(
        '%Y-%m-%d_%H-%M-%S') + '_small_bbo/'
    mk_dir_recursive('./' + base_dir)
    force_symlink(base_dir, './latest')

    for alg, params in algs_and_params:
        subdir = base_dir + alg.__name__ + '/'
        mk_dir_recursive('./' + subdir)

        np.save(subdir + '/algorithm_params_dictionary', params)
        experiment_params = {
            'how_many': how_many,
Example #12
def test_GPOMDP():
    params = dict(learning_rate=AdaptiveParameter(value=.01))
    policy = learn(GPOMDP, params)
    w = np.array([-0.07623939, 2.05232858])

    assert np.allclose(w, policy.get_weights())
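Example #13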
        print(alg.__name__)
        res = Parallel(n_jobs=n_jobs)(delayed(flat_experiment)(mdp,
                                                               agent,
                                                               n_epochs,
                                                               n_iterations_bbo,
                                                               ep_per_run_bbo,
                                                               ep_per_eval)
                                      for _ in range(how_many))
        J, L = parse_joblib(res)
        np.save(subdir + '/J_' + alg.__name__, J)
        np.save(subdir + '/L_' + alg.__name__, L)

    # HIERARCHICAL
    algs_and_params_hier = [
        (GPOMDP, {'learning_rate': AdaptiveParameter(value=10)},
         PGPE, {'learning_rate': AdaptiveParameter(value=5e-4)})
    ]

    mu = np.array([75, 75])
    sigma = np.array([40, 40])

    for alg_h, params_h, alg_l, params_l in algs_and_params_hier:
        agent_h = build_high_level_agent(alg_h, params_h, mdp, mu, sigma)
        agent_l = build_low_level_agent(alg_l, params_l, mdp)

        ep_per_run_hier = ep_per_epoch // n_iterations_hier

        print('High: ', alg_h.__name__, ' Low: ', alg_l.__name__)
        res = Parallel(n_jobs=n_jobs)(delayed(hierarchical_experiment)
                                      (mdp, agent_l, agent_h,
Example #14
def test_REINFORCE():
    params = dict(learning_rate=AdaptiveParameter(value=.01))
    policy = learn(REINFORCE, params)
    w = np.array([-0.0084793, 2.00536528])

    assert np.allclose(w, policy.get_weights())
Example #15
    # MDP
    mdp = ShipSteeringMultiGate(n_steps_action=3, small=True)

    # directory
    name = 'multigate_ship_steering'
    subdir = name + '_' + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')\
             + '/'

    mk_dir_recursive('./' + subdir)
    force_symlink('./' + subdir, name + '_latest')

    # Hierarchical
    algs_and_params_hier = [
        (GPOMDP, {'learning_rate': Parameter(value=1e-5)},
         PGPE, {'learning_rate': AdaptiveParameter(value=5e-4)})
    ]

    for alg_h, params_h, alg_l, params_l in algs_and_params_hier:

        mu = 0
        sigma = 0.15

        agent_h = build_high_level_agent(alg_h, params_h, mdp, mu, sigma)


        agent_l = build_low_level_agent(alg_l, params_l, mdp)

        ep_per_run_hier = ep_per_epoch_train // n_iterations

        print('High: ', alg_h.__name__, ' Low: ', alg_l.__name__)
Example #16
    mdp = ShipSteeringMultiGate(n_steps_action=3, small=True)

    # directory
    name = 'multigate_ship_steering'
    subdir = name + '_' + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')\
             + '/'

    mk_dir_recursive('./' + subdir)
    force_symlink('./' + subdir, name + '_latest')

    # Hierarchical
    algs_and_params_hier = [
        (QLearning, {'learning_rate': Parameter(value=0.6)},
         GPOMDP, {'learning_rate': AdaptiveParameter(value=25) if mdp.small
                  else AdaptiveParameter(value=50)},
         PGPE, {'learning_rate': AdaptiveParameter(value=5e-4)})
    ]

    for alg_h, params_h, alg_m, params_m, alg_l, params_l in algs_and_params_hier:

        epsilon = Parameter(value=1)
        agent_h = build_high_level_agent(alg_h, params_h, mdp, epsilon)

        mu = 250 if mdp.small else 500
        sigma = 125 if mdp.small else 250
        agent_m1 = build_mid_level_agent(alg_m, params_m, mdp, mu, sigma)
        agent_m2 = build_mid_level_agent(alg_m, params_m, mdp, mu, sigma)
        agent_m3 = build_mid_level_agent(alg_m, params_m, mdp, mu, sigma)
Example #17
def test_eNAC():
    params = dict(learning_rate=AdaptiveParameter(value=.01))
    policy = learn(eNAC, params)
    w = np.array([-0.03668018, 2.05112355])

    assert np.allclose(w, policy.get_weights())