Example #1
def experiment(algorithm_class, exp):
    np.random.seed()

    # MDP
    mdp = GridWorldVanHasselt()

    # Policy
    epsilon = ExponentialParameter(value=1,
                                   exp=.5,
                                   size=mdp.info.observation_space.size)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = ExponentialParameter(value=1, exp=exp, size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = algorithm_class(mdp.info, pi, **algorithm_params)

    # Algorithm
    start = mdp.convert_to_int(mdp._start, mdp._width)
    collect_max_Q = CollectMaxQ(agent.Q, start)
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset, collect_max_Q]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=10000, n_steps_per_fit=1, quiet=True)

    _, _, reward, _, _, _ = parse_dataset(collect_dataset.get())
    max_Qs = collect_max_Q.get()

    return reward, max_Qs
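
These functions appear to be MushroomRL experiment scripts. A minimal, hypothetical driver for the snippet above is sketched below; the import paths, the choice of algorithms and the decay exponent are assumptions and may vary with the library version.

import numpy as np
from mushroom_rl.algorithms.value import QLearning, DoubleQLearning

# Illustrative comparison of two TD algorithms on the Van Hasselt grid world;
# exp=.51 is only an example learning-rate decay exponent.
for algorithm_class in (QLearning, DoubleQLearning):
    reward, max_Qs = experiment(algorithm_class, exp=.51)
    print(algorithm_class.__name__, 'mean reward:', np.mean(reward))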
Example #2
def experiment(n_epochs, n_episodes):
    np.random.seed()

    logger = Logger(COPDAC_Q.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + COPDAC_Q.__name__)

    # MDP
    n_steps = 5000
    mdp = InvertedPendulum(horizon=n_steps)

    # Agent
    n_tilings = 10
    alpha_theta = Parameter(5e-3 / n_tilings)
    alpha_omega = Parameter(0.5 / n_tilings)
    alpha_v = Parameter(0.5 / n_tilings)
    tilings = Tiles.generate(n_tilings, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high + 1e-3)

    phi = Features(tilings=tilings)

    input_shape = (phi.size, )

    mu = Regressor(LinearApproximator,
                   input_shape=input_shape,
                   output_shape=mdp.info.action_space.shape)

    sigma = 1e-1 * np.eye(1)
    policy = GaussianPolicy(mu, sigma)

    agent = COPDAC_Q(mdp.info,
                     policy,
                     mu,
                     alpha_theta,
                     alpha_omega,
                     alpha_v,
                     value_function_features=phi,
                     policy_features=phi)

    # Train
    dataset_callback = CollectDataset()
    visualization_callback = Display(agent._V, mu,
                                     mdp.info.observation_space.low,
                                     mdp.info.observation_space.high, phi, phi)
    core = Core(agent, mdp, callbacks_fit=[dataset_callback])

    for i in trange(n_epochs, leave=False):
        core.learn(n_episodes=n_episodes, n_steps_per_fit=1, render=False)
        J = compute_J(dataset_callback.get(), gamma=1.0)
        dataset_callback.clean()
        visualization_callback()
        logger.epoch_info(i + 1, R_mean=np.sum(J) / n_steps / n_episodes)

    logger.info('Press a button to visualize the pendulum...')
    input()
    sigma = 1e-8 * np.eye(1)
    policy.set_sigma(sigma)
    core.evaluate(n_steps=n_steps, render=True)
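
A possible invocation of the function above; the epoch and episode counts are illustrative and not taken from the original script, and Display is assumed to be a visualization callback defined elsewhere in that script.

if __name__ == '__main__':
    # Illustrative values only.
    experiment(n_epochs=25, n_episodes=10)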
Example #3
def experiment(alg, params, n_epochs, n_episodes, n_ep_per_fit):
    np.random.seed()
    print('============ start experiment ============')
    logger = Logger(alg.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + alg.__name__)

    # MDP
    mdp = GraspEnv()
    print('============ mdp ============')

    # Policy
    n_weights = 6
    mu = np.array([-0.5, 0.0, 0.91, m.pi, 0, 0])
    sigma = np.asarray([0.05, 0.05, 0.05, 0.1, 0.1, 0.1])
    policy = Own_policy()
    dist = GaussianDiagonalDistribution(mu, sigma)
    agent = alg(mdp.info, dist, policy, **params)

    # Train
    dataset_callback = CollectDataset()
    core = Core(agent, mdp, callbacks_fit=[dataset_callback])

    for i in range(n_epochs):
        print('================ core learn ================')
        core.learn(n_episodes=n_episodes, n_episodes_per_fit=n_ep_per_fit)

        J = compute_J(dataset_callback.get(), gamma=mdp.info.gamma)
        print('J:', J)
        print('============================')
        dataset_callback.clean()

        # Track the distribution parameters; mu_0..mu_5, avg_sigma and
        # list_J are assumed to be lists defined at module level.
        p = dist.get_parameters()
        print('p:', p)
        mu_0.append(p[:n_weights][0])
        mu_1.append(p[:n_weights][1])
        mu_2.append(p[:n_weights][2])
        mu_3.append(p[:n_weights][3])
        mu_4.append(p[:n_weights][4])
        mu_5.append(p[:n_weights][5])

        current_avg_sigma = np.mean(p[n_weights:])
        avg_sigma.append(current_avg_sigma)

        # record learning curve of cumulative rewards
        logger.epoch_info(i + 1,
                          J=np.mean(J),
                          mu=p[:n_weights],
                          sigma=p[n_weights:])
        list_J.append(np.mean(J))
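
The function above appends to module-level lists (mu_0 … mu_5, avg_sigma, list_J) and expects a black-box policy-search algorithm. A hypothetical setup is sketched below; REPS and its eps hyperparameter, as well as the counts, are assumptions.

from mushroom_rl.algorithms.policy_search import REPS

# Module-level containers the experiment appends to.
mu_0, mu_1, mu_2, mu_3, mu_4, mu_5 = [], [], [], [], [], []
avg_sigma, list_J = [], []

# Illustrative hyperparameters and counts.
experiment(REPS, dict(eps=1.0), n_epochs=20, n_episodes=25, n_ep_per_fit=25)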
Example #4
def experiment(alg, params, n_epochs, n_episodes, n_ep_per_fit):
    np.random.seed()

    logger = Logger(alg.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + alg.__name__)

    # MDP
    mdp = Segway()

    # Policy
    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    n_weights = approximator.weights_size
    mu = np.zeros(n_weights)
    sigma = 2e-0 * np.ones(n_weights)
    policy = DeterministicPolicy(approximator)
    dist = GaussianDiagonalDistribution(mu, sigma)

    agent = alg(mdp.info, dist, policy, **params)

    # Train
    dataset_callback = CollectDataset()
    core = Core(agent, mdp, callbacks_fit=[dataset_callback])

    for i in trange(n_epochs, leave=False):
        core.learn(n_episodes=n_episodes,
                   n_episodes_per_fit=n_ep_per_fit,
                   render=False)
        J = compute_J(dataset_callback.get(), gamma=mdp.info.gamma)
        dataset_callback.clean()

        p = dist.get_parameters()

        logger.epoch_info(i + 1,
                          J=np.mean(J),
                          mu=p[:n_weights],
                          sigma=p[n_weights:])

    logger.info('Press a button to visualize the segway...')
    input()
    core.evaluate(n_episodes=3, render=True)
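
The driver for this function is again not shown. One plausible call, assuming a MushroomRL episodic black-box algorithm such as RWR; the hyperparameter value and the counts are illustrative.

from mushroom_rl.algorithms.policy_search import RWR

# Illustrative: episodic RWR on the Segway task.
experiment(RWR, dict(beta=0.01), n_epochs=25, n_episodes=100, n_ep_per_fit=25)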
Example #5
def experiment(alg, params, n_epochs, n_episodes, n_ep_per_fit):
    np.random.seed()

    # MDP
    mdp = Segway()

    # Policy
    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    n_weights = approximator.weights_size
    mu = np.zeros(n_weights)
    sigma = 2e-0 * np.ones(n_weights)
    policy = DeterministicPolicy(approximator)
    dist = GaussianDiagonalDistribution(mu, sigma)

    agent = alg(mdp.info, dist, policy, **params)

    # Train
    print(alg.__name__)
    dataset_callback = CollectDataset()
    core = Core(agent, mdp, callbacks=[dataset_callback])

    for i in range(n_epochs):
        core.learn(n_episodes=n_episodes,
                   n_episodes_per_fit=n_ep_per_fit,
                   render=False)
        J = compute_J(dataset_callback.get(), gamma=mdp.info.gamma)
        dataset_callback.clean()

        p = dist.get_parameters()

        print('mu:    ', p[:n_weights])
        print('sigma: ', p[n_weights:])
        print('Reward at iteration ' + str(i) + ': ' + str(np.mean(J)))

    print('Press a button to visualize the segway...')
    input()
    core.evaluate(n_episodes=3, render=True)
Example #6
def experiment(policy, value):
    np.random.seed()

    # MDP
    mdp = generate_taxi('grid.txt')

    # Policy
    pi = policy(Parameter(value=value))

    # Agent
    learning_rate = Parameter(value=.15)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = SARSA(mdp.info, pi, **algorithm_params)

    # Algorithm
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset]
    core = Core(agent, mdp, callbacks)

    # Train
    n_steps = 300000
    core.learn(n_steps=n_steps, n_steps_per_fit=1, quiet=True)

    return np.sum(np.array(collect_dataset.get())[:, 2]) / float(n_steps)
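
The snippet above returns the average reward per step for a given exploration policy class. A hypothetical comparison loop follows; the policy classes and parameter values are assumptions, not part of the original.

from mushroom_rl.policy import EpsGreedy, Boltzmann

# Illustrative: compare two exploration strategies on the taxi grid.
for policy, value in [(EpsGreedy, .1), (Boltzmann, 1.)]:
    r = experiment(policy, value)
    print(policy.__name__, 'average reward per step:', r)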
Example #7
def experiment(n_epochs, n_episodes):
    np.random.seed()

    # MDP
    n_steps = 5000
    mdp = InvertedPendulum(horizon=n_steps)

    # Agent
    n_tilings = 11
    alpha_r = Parameter(.0001)
    alpha_theta = Parameter(.001 / n_tilings)
    alpha_v = Parameter(.1 / n_tilings)
    tilings = Tiles.generate(n_tilings-1, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high + 1e-3)

    phi = Features(tilings=tilings)

    tilings_v = tilings + Tiles.generate(1, [1, 1],
                                         mdp.info.observation_space.low,
                                         mdp.info.observation_space.high + 1e-3)
    psi = Features(tilings=tilings_v)

    input_shape = (phi.size,)

    mu = Regressor(LinearApproximator, input_shape=input_shape,
                   output_shape=mdp.info.action_space.shape)

    std = Regressor(LinearApproximator, input_shape=input_shape,
                    output_shape=mdp.info.action_space.shape)

    std_0 = np.sqrt(1.)
    std.set_weights(np.log(std_0) / n_tilings * np.ones(std.weights_size))

    policy = StateLogStdGaussianPolicy(mu, std)

    agent = StochasticAC_AVG(mdp.info, policy,
                             alpha_theta, alpha_v, alpha_r,
                             lambda_par=.5,
                             value_function_features=psi,
                             policy_features=phi)

    # Train
    dataset_callback = CollectDataset()
    display_callback = Display(agent._V, mu, std,
                               mdp.info.observation_space.low,
                               mdp.info.observation_space.high,
                               phi, psi)
    core = Core(agent, mdp, callbacks_fit=[dataset_callback])

    for i in range(n_epochs):
        core.learn(n_episodes=n_episodes,
                   n_steps_per_fit=1, render=False)
        J = compute_J(dataset_callback.get(), gamma=1.)
        dataset_callback.clean()
        display_callback()
        print('Mean Reward at iteration ' + str(i) + ': ' +
              str(np.sum(J) / n_steps / n_episodes))

    print('Press a button to visualize the pendulum...')
    input()
    core.evaluate(n_steps=n_steps, render=True)
Example #8
# Q-function approximator
n_tilings = 10
tilings = Tiles.generate(n_tilings, [10, 10], mdp.info.observation_space.low,
                         mdp.info.observation_space.high)
features = Features(tilings=tilings)

# Agent
learning_rate = Parameter(.1 / n_tilings)
approximator_params = dict(input_shape=(features.size, ),
                           output_shape=(mdp.info.action_space.n, ),
                           n_actions=mdp.info.action_space.n)
agent = SARSALambdaContinuous(mdp.info,
                              pi,
                              LinearApproximator,
                              approximator_params=approximator_params,
                              learning_rate=learning_rate,
                              lambda_coeff=.9,
                              features=features)

# Algorithm
collect_dataset = CollectDataset()
callbacks = [collect_dataset]
core = Core(agent, mdp, callbacks=callbacks)

# Train
core.learn(n_episodes=100, n_steps_per_fit=1)

# Evaluate
core.evaluate(n_episodes=1, render=True)
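
Unlike the previous examples, this snippet is not wrapped in a function: it assumes an environment mdp with a two-dimensional observation space and an exploration policy pi already exist. A minimal, hypothetical preamble is sketched below; the Gym wrapper, the MountainCar task and the zero-epsilon policy are assumptions, and import paths may differ across MushroomRL versions.

from mushroom_rl.environments import Gym
from mushroom_rl.policy import EpsGreedy
from mushroom_rl.utils.parameters import Parameter

# Hypothetical setup for the snippet above: a continuous 2D-state task
# and a greedy policy (epsilon = 0).
mdp = Gym('MountainCar-v0', horizon=10000, gamma=1.)
pi = EpsGreedy(epsilon=Parameter(value=0.))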