Code Example #1
def experiment():
    np.random.seed()

    # MDP
    mdp = generate_simple_chain(state_n=5, goal_states=[2], prob=.8, rew=1,
                                gamma=.9)

    # Policy
    epsilon = Parameter(value=.15)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = Parameter(value=.2)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = QLearning(mdp.info, pi, **algorithm_params)

    # Core
    core = Core(agent, mdp)

    # Initial policy Evaluation
    dataset = core.evaluate(n_steps=1000)
    J = np.mean(compute_J(dataset, mdp.info.gamma))
    print('J start:', J)

    # Train
    core.learn(n_steps=10000, n_steps_per_fit=1)

    # Final Policy Evaluation
    dataset = core.evaluate(n_steps=1000)
    J = np.mean(compute_J(dataset, mdp.info.gamma))
    print('J final:', J)
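Every snippet on this page feeds an evaluation dataset into compute_J. For context, here is a minimal sketch of what compute_J computes, assuming the mushroom-rl dataset layout where each sample is a tuple (state, action, reward, next_state, absorbing, last): it accumulates the discounted return of each episode and returns one value per episode. This is a sketch, not the library code.

def compute_J_sketch(dataset, gamma=1.):
    # Discounted cumulative reward per episode; sample[2] is the reward,
    # sample[-1] is the 'last' flag marking the end of an episode.
    js = list()
    j = 0.
    episode_steps = 0
    for i, sample in enumerate(dataset):
        j += gamma ** episode_steps * sample[2]
        episode_steps += 1
        if sample[-1] or i == len(dataset) - 1:
            js.append(j)
            j = 0.
            episode_steps = 0
    return js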
Code Example #2
def compute_metrics(core, eval_params, agent_builder, cmp_E):
    """
    Function to compute the metrics.

    Args:
        core (Core): the core used to run the evaluation;
        eval_params (dict): parameters for running the evaluation;
        agent_builder (AgentBuilder): the agent builder;
        cmp_E (bool): whether the policy entropy should be computed.

    """
    dataset = core.evaluate(**eval_params)

    # Compute J
    J = np.mean(compute_J(dataset, core.mdp.info.gamma))

    # Compute R
    R = np.mean(compute_J(dataset))

    # Compute Q
    states = get_init_states(dataset)
    Q = agent_builder.compute_Q(agent=core.agent, states=states)

    # Compute Policy Entropy
    E = None
    if cmp_E:
        if agent_builder.compute_entropy_with_states:
            E = core.agent.policy.entropy(parse_dataset(dataset)[0])
        else:
            E = core.agent.policy.entropy()

    return J, R, Q, E
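A hypothetical call might look as follows; core, agent_builder and the evaluation parameters here are illustrative assumptions, not part of the snippet above.

eval_params = dict(n_episodes=10, render=False)
J, R, Q, E = compute_metrics(core, eval_params, agent_builder, cmp_E=True)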
Code Example #3
File: lqr_bbo.py Project: adbmd/mushroom-rl
def experiment(alg, params, n_epochs, fit_per_run, ep_per_run):
    np.random.seed()

    # MDP
    mdp = LQR.generate(dimensions=1)

    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    policy = DeterministicPolicy(mu=approximator)

    mu = np.zeros(policy.weights_size)
    sigma = 1e-3 * np.eye(policy.weights_size)
    distribution = GaussianCholeskyDistribution(mu, sigma)

    # Agent
    agent = alg(mdp.info, distribution, policy, **params)

    # Train
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_run)
    print('distribution parameters: ', distribution.get_parameters())
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))

    for i in range(n_epochs):
        core.learn(n_episodes=fit_per_run * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset_eval = core.evaluate(n_episodes=ep_per_run)
        print('distribution parameters: ', distribution.get_parameters())
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        print('J at iteration ' + str(i) + ': ' + str(np.mean(J)))
Code Example #4
def experiment(alg, env_id, horizon, gamma, n_epochs, n_steps, n_steps_per_fit,
               n_episodes_test, alg_params, policy_params):
    print(alg.__name__)

    mdp = Gym(env_id, horizon, gamma)

    critic_params = dict(network=Network,
                         optimizer={
                             'class': optim.Adam,
                             'params': {
                                 'lr': 3e-4
                             }
                         },
                         loss=F.mse_loss,
                         n_features=32,
                         batch_size=64,
                         input_shape=mdp.info.observation_space.shape,
                         output_shape=(1, ))

    policy = GaussianTorchPolicy(Network, mdp.info.observation_space.shape,
                                 mdp.info.action_space.shape, **policy_params)

    alg_params['critic_params'] = critic_params

    agent = alg(mdp.info, policy, **alg_params)

    core = Core(agent, mdp)

    dataset = core.evaluate(n_episodes=n_episodes_test, render=False)

    J = np.mean(compute_J(dataset, mdp.info.gamma))
    R = np.mean(compute_J(dataset))
    E = agent.policy.entropy()

    tqdm.write('END OF EPOCH 0')
    tqdm.write('J: {}, R: {}, entropy: {}'.format(J, R, E))
    tqdm.write(
        '##################################################################################################'
    )

    for it in trange(n_epochs):
        core.learn(n_steps=n_steps, n_steps_per_fit=n_steps_per_fit)
        dataset = core.evaluate(n_episodes=n_episodes_test, render=False)

        J = np.mean(compute_J(dataset, mdp.info.gamma))
        R = np.mean(compute_J(dataset))
        E = agent.policy.entropy()

        tqdm.write('END OF EPOCH ' + str(it + 1))
        tqdm.write('J: {}, R: {}, entropy: {}'.format(J, R, E))
        tqdm.write(
            '##################################################################################################'
        )

    print('Press a button to visualize')
    input()
    core.evaluate(n_episodes=5, render=True)
Code Example #5
def experiment(alg, env_id, horizon, gamma, n_epochs, n_steps, n_steps_per_fit,
               n_step_test, alg_params, policy_params):
    logger = Logger(alg.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + alg.__name__)

    mdp = Gym(env_id, horizon, gamma)

    critic_params = dict(network=Network,
                         optimizer={
                             'class': optim.RMSprop,
                             'params': {
                                 'lr': 7e-4,
                                 'eps': 1e-5
                             }
                         },
                         loss=F.mse_loss,
                         n_features=64,
                         batch_size=64,
                         input_shape=mdp.info.observation_space.shape,
                         output_shape=(1, ))

    alg_params['critic_params'] = critic_params

    policy = GaussianTorchPolicy(Network, mdp.info.observation_space.shape,
                                 mdp.info.action_space.shape, **policy_params)

    agent = alg(mdp.info, policy, **alg_params)

    core = Core(agent, mdp)

    dataset = core.evaluate(n_steps=n_step_test, render=False)

    J = np.mean(compute_J(dataset, mdp.info.gamma))
    R = np.mean(compute_J(dataset))
    E = agent.policy.entropy()

    logger.epoch_info(0, J=J, R=R, entropy=E)

    for it in trange(n_epochs):
        core.learn(n_steps=n_steps, n_steps_per_fit=n_steps_per_fit)
        dataset = core.evaluate(n_steps=n_step_test, render=False)

        J = np.mean(compute_J(dataset, mdp.info.gamma))
        R = np.mean(compute_J(dataset))
        E = agent.policy.entropy()

        logger.epoch_info(it + 1, J=J, R=R, entropy=E)

    logger.info('Press a button to visualize')
    input()
    core.evaluate(n_episodes=5, render=True)
Code Example #6
def experiment(alg, params, n_epochs, fit_per_epoch, ep_per_fit):
    np.random.seed()

    logger = Logger(alg.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + alg.__name__)

    # MDP
    mdp = ShipSteering()

    # Policy
    high = [150, 150, np.pi]
    low = [0, 0, -np.pi]
    n_tiles = [5, 5, 6]
    low = np.array(low, dtype=float)
    high = np.array(high, dtype=float)
    n_tilings = 1

    tilings = Tiles.generate(n_tilings=n_tilings,
                             n_tiles=n_tiles,
                             low=low,
                             high=high)

    phi = Features(tilings=tilings)
    input_shape = (phi.size, )

    approximator = Regressor(LinearApproximator,
                             input_shape=input_shape,
                             output_shape=mdp.info.action_space.shape)

    policy = DeterministicPolicy(approximator)

    mu = np.zeros(policy.weights_size)
    sigma = 4e-1 * np.ones(policy.weights_size)
    distribution = GaussianDiagonalDistribution(mu, sigma)

    # Agent
    agent = alg(mdp.info, distribution, policy, features=phi, **params)

    # Train
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_fit)
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    logger.epoch_info(0, J=np.mean(J))

    for i in range(n_epochs):
        core.learn(n_episodes=fit_per_epoch * ep_per_fit,
                   n_episodes_per_fit=ep_per_fit)
        dataset_eval = core.evaluate(n_episodes=ep_per_fit)
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        logger.epoch_info(i + 1, J=np.mean(J))
Code Example #7
def experiment(alg, params, n_epochs, n_iterations, ep_per_run):
    np.random.seed()

    # MDP
    mdp = ShipSteering()

    # Policy
    high = [150, 150, np.pi]
    low = [0, 0, -np.pi]
    n_tiles = [5, 5, 6]
    low = np.array(low, dtype=float)
    high = np.array(high, dtype=float)
    n_tilings = 1

    tilings = Tiles.generate(n_tilings=n_tilings,
                             n_tiles=n_tiles,
                             low=low,
                             high=high)

    phi = Features(tilings=tilings)
    input_shape = (phi.size, )

    approximator = Regressor(LinearApproximator,
                             input_shape=input_shape,
                             output_shape=mdp.info.action_space.shape)

    policy = DeterministicPolicy(approximator)

    mu = np.zeros(policy.weights_size)
    sigma = 4e-1 * np.ones(policy.weights_size)
    distribution = GaussianDiagonalDistribution(mu, sigma)

    # Agent
    agent = alg(mdp.info, distribution, policy, features=phi, **params)

    # Train
    print(alg.__name__)
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_run)
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))

    for i in range(n_epochs):
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset_eval = core.evaluate(n_episodes=ep_per_run)
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        print('J at iteration ' + str(i) + ': ' + str(np.mean(J)))
Code Example #8
    def __call__(self, dataset):
        super().__call__(dataset)

        for sample in dataset:
            obs = sample[0]
            action = sample[1]
            reward = sample[2]

            for i in range(len(action)):
                self.action_buffers_list[i].update([action[i]])

            for i in range(obs.size):
                self.observation_buffers_list[i].update([obs[i]])

            self.instant_reward_buffer.update([reward])

        lengths_of_episodes = episodes_length(self._data_list)

        start_index = 0
        for length_of_episode in lengths_of_episodes:
            sub_dataset = self._data_list[start_index:start_index +
                                          length_of_episode]

            episodic_reward = compute_J(sub_dataset)
            self.training_reward_buffer.update([episodic_reward[0]])
            self.episodic_len_buffer_training.update([length_of_episode])

            # advance to the start of the next episode
            start_index += length_of_episode

        self._data_list = self._data_list[start_index:]

        self.plot_window.refresh()
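The episode bookkeeping above relies on episodes_length. A minimal sketch of the idea, assuming the same sample layout with the 'last' flag in the final position of each sample (a sketch, not the library code):

def episodes_length_sketch(dataset):
    # Length of every complete episode in the dataset.
    lengths = list()
    steps = 0
    for sample in dataset:
        steps += 1
        if sample[-1]:
            lengths.append(steps)
            steps = 0
    return lengths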
Code Example #9
def learn(alg, alg_params):
    mdp = CarOnHill()
    np.random.seed(1)

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Approximator
    approximator_params = dict(input_shape=mdp.info.observation_space.shape,
                               n_actions=mdp.info.action_space.n,
                               n_estimators=50,
                               min_samples_split=5,
                               min_samples_leaf=2)
    approximator = ExtraTreesRegressor

    # Agent
    agent = alg(mdp.info,
                pi,
                approximator,
                approximator_params=approximator_params,
                **alg_params)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=5, n_episodes_per_fit=5)

    test_epsilon = Parameter(0.75)
    agent.policy.set_epsilon(test_epsilon)
    dataset = core.evaluate(n_episodes=2)

    return agent, np.mean(compute_J(dataset, mdp.info.gamma))
Code Example #10
def experiment(goal, use_muscles, n_epochs, n_steps, n_episodes_test):
    np.random.seed(1)

    # MDP
    gamma = 0.99
    horizon = 2000
    mdp = create_mdp(gamma, horizon, goal, use_muscles=use_muscles)

    # Agent
    agent = create_SAC_agent(mdp)

    # normalization callback
    normalizer = MinMaxPreprocessor(mdp_info=mdp.info)

    # plotting callback
    plotter = PlotDataset(mdp.info)

    # Algorithm(with normalization and plotting)
    core = Core(agent, mdp, callback_step=plotter, preprocessors=[normalizer])

    # training loop
    for n in range(n_epochs):
        core.learn(n_steps=n_steps, n_steps_per_fit=1)
        dataset = core.evaluate(n_episodes=n_episodes_test, render=True)
        print('Epoch: ', n, '  J: ', np.mean(compute_J(dataset, gamma)),
              '  Len_ep: ', int(np.round(np.mean(episodes_length(dataset)))))

    print('Press a button to visualize humanoid')
    input()
    core.evaluate(n_episodes=10, render=True)
Code Example #11
def experiment(n_epochs, n_episodes):
    np.random.seed()

    logger = Logger(COPDAC_Q.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + COPDAC_Q.__name__)

    # MDP
    n_steps = 5000
    mdp = InvertedPendulum(horizon=n_steps)

    # Agent
    n_tilings = 10
    alpha_theta = Parameter(5e-3 / n_tilings)
    alpha_omega = Parameter(0.5 / n_tilings)
    alpha_v = Parameter(0.5 / n_tilings)
    tilings = Tiles.generate(n_tilings, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high + 1e-3)

    phi = Features(tilings=tilings)

    input_shape = (phi.size, )

    mu = Regressor(LinearApproximator,
                   input_shape=input_shape,
                   output_shape=mdp.info.action_space.shape)

    sigma = 1e-1 * np.eye(1)
    policy = GaussianPolicy(mu, sigma)

    agent = COPDAC_Q(mdp.info,
                     policy,
                     mu,
                     alpha_theta,
                     alpha_omega,
                     alpha_v,
                     value_function_features=phi,
                     policy_features=phi)

    # Train
    dataset_callback = CollectDataset()
    visualization_callback = Display(agent._V, mu,
                                     mdp.info.observation_space.low,
                                     mdp.info.observation_space.high, phi, phi)
    core = Core(agent, mdp, callbacks_fit=[dataset_callback])

    for i in trange(n_epochs, leave=False):
        core.learn(n_episodes=n_episodes, n_steps_per_fit=1, render=False)
        J = compute_J(dataset_callback.get(), gamma=1.0)
        dataset_callback.clean()
        visualization_callback()
        logger.epoch_info(i + 1, R_mean=np.sum(J) / n_steps / n_episodes)

    logger.info('Press a button to visualize the pendulum...')
    input()
    sigma = 1e-8 * np.eye(1)
    policy.set_sigma(sigma)
    core.evaluate(n_steps=n_steps, render=True)
Code Example #12
    def fit(self, dataset):
        Jep = compute_J(dataset, self.mdp_info.gamma)

        Jep = np.array(Jep)
        theta = np.array(self._theta_list)

        self._update(Jep, theta)

        self._theta_list = list()
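This fit pairs each sampled parameter vector in self._theta_list with its episodic return. For context, a minimal sketch of the counterpart episode-start hook that would fill that list, assuming distribution and policy attributes like those in the black-box optimization examples above (a sketch, not the library code):

    def episode_start(self):
        # Sample a new policy parametrization for the coming episode and
        # remember it so fit() can pair it with the episode's return
        theta = self.distribution.sample()
        self._theta_list.append(theta)
        self.policy.set_weights(theta)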
Code Example #13
def experiment(n_epochs, n_iterations, ep_per_run, save_states_to_disk):
    np.random.seed()

    logger = Logger('plot_and_norm_example', results_dir=None)
    logger.strong_line()
    logger.info('Plotting and normalization example')

    # MDP
    mdp = LQR.generate(dimensions=2, max_pos=10., max_action=5., episodic=True)

    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    sigma = Regressor(LinearApproximator,
                      input_shape=mdp.info.observation_space.shape,
                      output_shape=mdp.info.action_space.shape)

    sigma_weights = 2 * np.ones(sigma.weights_size)
    sigma.set_weights(sigma_weights)

    policy = StateStdGaussianPolicy(approximator, sigma)

    # Agent
    optimizer = AdaptiveOptimizer(eps=.01)
    algorithm_params = dict(optimizer=optimizer)
    agent = REINFORCE(mdp.info, policy, **algorithm_params)

    # normalization callback
    prepro = MinMaxPreprocessor(mdp_info=mdp.info)

    # plotting callback
    plotter = PlotDataset(mdp.info, obs_normalized=True)

    # Train
    core = Core(agent, mdp, callback_step=plotter, preprocessors=[prepro])

    # training loop
    for n in range(n_epochs):
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset = core.evaluate(n_episodes=ep_per_run, render=False)
        J = np.mean(compute_J(dataset, mdp.info.gamma))
        logger.epoch_info(n + 1, J=J)

    if save_states_to_disk:
        # save normalization / plot states to disk path
        logger.info('Saving plotting and normalization data')
        os.makedirs("./logs/plot_and_norm", exist_ok=True)
        prepro.save("./logs/plot_and_norm/preprocessor.msh")
        plotter.save_state("./logs/plot_and_norm/plotting_state")

        # load states from disk path
        logger.info('Loading preprocessor and plotter')
        prepro = MinMaxPreprocessor.load(
            "./logs/plot_and_norm/preprocessor.msh")
        plotter.load_state("./logs/plot_and_norm/plotting_state")
Code Example #14
def experiment(goal, use_muscles, n_epochs, n_steps, n_episodes_test):
    np.random.seed(1)

    logger = Logger('SAC', results_dir=None)
    logger.strong_line()
    logger.info('Humanoid Experiment, Algorithm: SAC')

    # MDP
    gamma = 0.99
    horizon = 2000
    mdp = create_mdp(gamma, horizon, goal, use_muscles=use_muscles)

    # Agent
    agent = create_SAC_agent(mdp)

    # normalization callback
    normalizer = MinMaxPreprocessor(mdp_info=mdp.info)

    # plotting callback
    plotter = PlotDataset(mdp.info)

    # Algorithm(with normalization and plotting)
    core = Core(agent, mdp, callback_step=plotter, preprocessors=[normalizer])
    dataset = core.evaluate(n_episodes=n_episodes_test, render=True)

    J = np.mean(compute_J(dataset, gamma))
    L = int(np.round(np.mean(episodes_length(dataset))))

    logger.epoch_info(0, J=J, episode_length=L)

    # training loop
    for n in trange(n_epochs, leave=False):
        core.learn(n_steps=n_steps, n_steps_per_fit=1)
        dataset = core.evaluate(n_episodes=n_episodes_test, render=True)

        J = np.mean(compute_J(dataset, gamma))
        L = int(np.round(np.mean(episodes_length(dataset))))

        logger.epoch_info(n + 1, J=J, episode_length=L)

    logger.info('Press a button to visualize humanoid')
    input()
    core.evaluate(n_episodes=10, render=True)
Code Example #15
File: lqr_pg.py Project: PuzeLiu/mushroom-rl
def experiment(alg, n_epochs, n_iterations, ep_per_run):
    np.random.seed()

    logger = Logger(alg.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + alg.__name__)

    # MDP
    mdp = LQR.generate(dimensions=2, max_action=1., max_pos=1.)

    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    sigma = Regressor(LinearApproximator,
                      input_shape=mdp.info.observation_space.shape,
                      output_shape=mdp.info.action_space.shape)

    sigma_weights = 0.25 * np.ones(sigma.weights_size)
    sigma.set_weights(sigma_weights)

    policy = StateStdGaussianPolicy(approximator, sigma)

    # Agent
    optimizer = AdaptiveOptimizer(eps=1e-2)
    algorithm_params = dict(optimizer=optimizer)
    agent = alg(mdp.info, policy, **algorithm_params)

    # Train
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_run)
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    logger.epoch_info(0,
                      J=np.mean(J),
                      policy_weights=policy.get_weights().tolist())

    for i in trange(n_epochs, leave=False):
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset_eval = core.evaluate(n_episodes=ep_per_run)
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        logger.epoch_info(i + 1,
                          J=np.mean(J),
                          policy_weights=policy.get_weights().tolist())
Code Example #16
def experiment(alg, params, n_epochs, n_episodes, n_ep_per_fit):
    np.random.seed()
    print('============ start experiment ============')
    logger = Logger(alg.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + alg.__name__)

    # MDP
    mdp = GraspEnv()
    print('============ mdp ============')

    # Policy
    n_weights = 6
    mu = np.array([-0.5, 0.0, 0.91, m.pi, 0, 0])
    sigma = np.asarray([0.05, 0.05, 0.05, 0.1, 0.1, 0.1])
    policy = Own_policy()
    dist = GaussianDiagonalDistribution(mu, sigma)
    agent = alg(mdp.info, dist, policy, **params)

    # Train
    # Collect the training dataset to compute the learning curve
    dataset_callback = CollectDataset()
    core = Core(agent, mdp, callbacks_fit=[dataset_callback])

    for i in range(n_epochs):
        print('================ core learn ================')
        core.learn(n_episodes=n_episodes, n_episodes_per_fit=n_ep_per_fit)

        J = compute_J(dataset_callback.get(), gamma=mdp.info.gamma)
        print('J:', J)
        print('============================')
        dataset_callback.clean()

        p = dist.get_parameters()
        print('p:', p)

        # mu_0..mu_5, avg_sigma and list_J are module-level lists that
        # record the learning curve across epochs
        mu_0.append(p[:n_weights][0])
        mu_1.append(p[:n_weights][1])
        mu_2.append(p[:n_weights][2])
        mu_3.append(p[:n_weights][3])
        mu_4.append(p[:n_weights][4])
        mu_5.append(p[:n_weights][5])

        avg_sigma.append(np.mean(p[n_weights:]))

        # record learning curve of cumulative rewards
        logger.epoch_info(i + 1,
                          J=np.mean(J),
                          mu=p[:n_weights],
                          sigma=p[n_weights:])
        list_J.append(np.mean(J))
Code Example #17
def experiment(alg, params, n_epochs, fit_per_epoch, ep_per_fit):
    np.random.seed()

    logger = Logger(alg.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + alg.__name__)

    # MDP
    mdp = LQR.generate(dimensions=1)

    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    policy = DeterministicPolicy(mu=approximator)

    mu = np.zeros(policy.weights_size)
    sigma = 1e-3 * np.eye(policy.weights_size)
    distribution = GaussianCholeskyDistribution(mu, sigma)

    # Agent
    agent = alg(mdp.info, distribution, policy, **params)

    # Train
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_fit)
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    logger.epoch_info(0,
                      J=np.mean(J),
                      distribution_parameters=distribution.get_parameters())

    for i in trange(n_epochs, leave=False):
        core.learn(n_episodes=fit_per_epoch * ep_per_fit,
                   n_episodes_per_fit=ep_per_fit)
        dataset_eval = core.evaluate(n_episodes=ep_per_fit)
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        logger.epoch_info(
            i + 1,
            J=np.mean(J),
            distribution_parameters=distribution.get_parameters())
Code Example #18
def experiment():
    np.random.seed()

    # MDP
    mdp = CarOnHill()

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Approximator
    approximator_params = dict(input_shape=mdp.info.observation_space.shape,
                               n_actions=mdp.info.action_space.n,
                               n_estimators=50,
                               min_samples_split=5,
                               min_samples_leaf=2)
    approximator = ExtraTreesRegressor

    # Agent
    algorithm_params = dict(n_iterations=20)
    agent = FQI(mdp.info,
                pi,
                approximator,
                approximator_params=approximator_params,
                **algorithm_params)

    # Algorithm
    core = Core(agent, mdp)

    # Render
    core.evaluate(n_episodes=1, render=True)

    # Train
    core.learn(n_episodes=1000, n_episodes_per_fit=1000)

    # Test
    test_epsilon = Parameter(0.)
    agent.policy.set_epsilon(test_epsilon)

    initial_states = np.zeros((289, 2))
    cont = 0
    for i in range(-8, 9):
        for j in range(-8, 9):
            initial_states[cont, :] = [0.125 * i, 0.375 * j]
            cont += 1

    dataset = core.evaluate(initial_states=initial_states)

    # Render
    core.evaluate(n_episodes=3, render=True)

    return np.mean(compute_J(dataset, mdp.info.gamma))
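For reference, the nested loop above enumerates a 17 x 17 grid of 289 initial states. An equivalent vectorized construction (a sketch producing the same array in the same order):

positions = 0.125 * np.arange(-8, 9)
velocities = 0.375 * np.arange(-8, 9)
initial_states = np.array([[p, v] for p in positions for v in velocities])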
Code Example #19
File: lqr_pg.py Project: yanxg/mushroom-rl
def experiment(alg, n_epochs, n_iterations, ep_per_run):
    np.random.seed()

    # MDP
    mdp = LQR.generate(dimensions=1)

    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    sigma = Regressor(LinearApproximator,
                      input_shape=mdp.info.observation_space.shape,
                      output_shape=mdp.info.action_space.shape)

    sigma_weights = 2 * np.ones(sigma.weights_size)
    sigma.set_weights(sigma_weights)

    policy = StateStdGaussianPolicy(approximator, sigma)

    # Agent
    learning_rate = AdaptiveParameter(value=.01)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = alg(mdp.info, policy, **algorithm_params)

    # Train
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_run)
    print('policy parameters: ', policy.get_weights())
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))

    for i in range(n_epochs):
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset_eval = core.evaluate(n_episodes=ep_per_run)
        print('policy parameters: ', policy.get_weights())
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        print('J at iteration ' + str(i) + ': ' + str(np.mean(J)))
Code Example #20
def experiment():
    np.random.seed()

    logger = Logger(QLearning.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + QLearning.__name__)

    # MDP
    mdp = generate_simple_chain(state_n=5,
                                goal_states=[2],
                                prob=.8,
                                rew=1,
                                gamma=.9)

    # Policy
    epsilon = Parameter(value=.15)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = Parameter(value=.2)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = QLearning(mdp.info, pi, **algorithm_params)

    # Core
    core = Core(agent, mdp)

    # Initial policy Evaluation
    dataset = core.evaluate(n_steps=1000)
    J = np.mean(compute_J(dataset, mdp.info.gamma))
    logger.info(f'J start: {J}')

    # Train
    core.learn(n_steps=10000, n_steps_per_fit=1)

    # Final Policy Evaluation
    dataset = core.evaluate(n_steps=1000)
    J = np.mean(compute_J(dataset, mdp.info.gamma))
    logger.info(f'J final: {J}')
Code Example #21
def get_stats(dataset, gamma):
    J = compute_J(dataset, gamma)
    abs_idxs = np.cumsum(episodes_length(dataset)) - 1
    S = list()
    for idx in abs_idxs:
        S.append(dataset[idx][3]['info']['is_success'])

    J = np.mean(J)
    S = np.mean(S)

    print('J: ', J)
    print('S: ', S)

    return J, S
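A hypothetical call, assuming the environment writes an 'is_success' flag into the step info of each episode's final sample, as the indexing above requires:

J, S = get_stats(dataset, mdp.info.gamma)
print('success rate:', S)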
Code Example #22
def experiment(n_epochs, n_iterations, ep_per_run, save_states_to_disk):
    np.random.seed()

    # MDP
    mdp = LQR.generate(dimensions=2, max_pos=10., max_action=5., episodic=True)

    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    sigma = Regressor(LinearApproximator,
                      input_shape=mdp.info.observation_space.shape,
                      output_shape=mdp.info.action_space.shape)

    sigma_weights = 2 * np.ones(sigma.weights_size)
    sigma.set_weights(sigma_weights)

    policy = StateStdGaussianPolicy(approximator, sigma)

    # Agent
    learning_rate = AdaptiveParameter(value=.01)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = REINFORCE(mdp.info, policy, **algorithm_params)

    # normalization callback
    prepro = MinMaxPreprocessor(mdp_info=mdp.info)

    # plotting callback
    plotter = PlotDataset(mdp.info, obs_normalized=True)

    # Train
    core = Core(agent, mdp, callback_step=plotter, preprocessors=[prepro])

    # training loop
    for n in range(n_epochs):
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset = core.evaluate(n_episodes=ep_per_run, render=False)
        print('Epoch: ', n, '  J: ', np.mean(compute_J(dataset,
                                                       mdp.info.gamma)))

    if save_states_to_disk:
        # save normalization / plot states to disk path
        os.makedirs("./temp/", exist_ok=True)
        prepro.save_state("./temp/normalization_state")
        plotter.save_state("./temp/plotting_state")

        # load states from disk path
        prepro.load_state("./temp/normalization_state")
        plotter.load_state("./temp/plotting_state")
Code Example #23
File: RL_Learn.py Project: gowun/RL_for_reco
    def compare_model_with_origin(self, initial_states, compared_rewards,
                                  n_samples=10000):
        raw_r = None
        if len(initial_states) > n_samples:
            idx = np.random.choice(len(initial_states), n_samples,
                                   replace=False)
            samples = initial_states[idx]
            if compared_rewards is not None:
                raw_r = np.mean(np.array(compared_rewards)[idx])
        else:
            samples = initial_states
            if compared_rewards is not None:
                raw_r = np.mean(compared_rewards)
        dataset = self.core.evaluate(initial_states=samples)
        J = compute_J(dataset, 1.0)
        learned_r = np.mean(J) / self.env.horizon
        if raw_r is None:
            return learned_r, None, None
        return learned_r, raw_r, learned_r - raw_r
Code Example #24
    def _print_fit_info(self, dataset, x, v_target, old_pol_dist):
        if not self._quiet:
            logging_verr = []
            torch_v_targets = torch.tensor(v_target, dtype=torch.float)
            for idx in range(len(self._V)):
                v_pred = torch.tensor(self._V(x, idx=idx), dtype=torch.float)
                v_err = F.mse_loss(v_pred, torch_v_targets)
                logging_verr.append(v_err.item())

            logging_ent = self.policy.entropy(x)
            new_pol_dist = self.policy.distribution(x)
            logging_kl = torch.mean(torch.distributions.kl.kl_divergence(
                new_pol_dist, old_pol_dist))
            avg_rwd = np.mean(compute_J(dataset))
            tqdm.write("Iterations Results:\n\trewards {} vf_loss {}\n\tentropy {}  kl {}".format(
                avg_rwd, logging_verr, logging_ent, logging_kl))
            tqdm.write(
                '--------------------------------------------------------------------------------------------------')
Code Example #25
def experiment(alg, params, n_epochs, n_episodes, n_ep_per_fit):
    np.random.seed()

    logger = Logger(alg.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + alg.__name__)

    # MDP
    mdp = Segway()

    # Policy
    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    n_weights = approximator.weights_size
    mu = np.zeros(n_weights)
    sigma = 2e-0 * np.ones(n_weights)
    policy = DeterministicPolicy(approximator)
    dist = GaussianDiagonalDistribution(mu, sigma)

    agent = alg(mdp.info, dist, policy, **params)

    # Train
    dataset_callback = CollectDataset()
    core = Core(agent, mdp, callbacks_fit=[dataset_callback])

    for i in trange(n_epochs, leave=False):
        core.learn(n_episodes=n_episodes,
                   n_episodes_per_fit=n_ep_per_fit,
                   render=False)
        J = compute_J(dataset_callback.get(), gamma=mdp.info.gamma)
        dataset_callback.clean()

        p = dist.get_parameters()

        logger.epoch_info(i + 1,
                          J=np.mean(J),
                          mu=p[:n_weights],
                          sigma=p[n_weights:])

    logger.info('Press a button to visualize the segway...')
    input()
    core.evaluate(n_episodes=3, render=True)
Code Example #26
def experiment(alpha):
    np.random.seed()

    # MDP
    mdp = Gym(name='Acrobot-v1', horizon=np.inf, gamma=1.)

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    n_tilings = 10
    tilings = Tiles.generate(n_tilings, [10, 10, 10, 10, 10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high)
    features = Features(tilings=tilings)

    learning_rate = Parameter(alpha / n_tilings)

    approximator_params = dict(input_shape=(features.size, ),
                               output_shape=(mdp.info.action_space.n, ),
                               n_actions=mdp.info.action_space.n)
    algorithm_params = {'learning_rate': learning_rate, 'lambda_coeff': .9}

    agent = TrueOnlineSARSALambda(mdp.info,
                                  pi,
                                  approximator_params=approximator_params,
                                  features=features,
                                  **algorithm_params)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=10, n_steps_per_fit=1, render=True)
    dataset = core.evaluate(n_episodes=1, render=False)
    print(episodes_length(dataset))

    return np.mean(compute_J(dataset, .96))
Code Example #27
File: trpo.py Project: PuzeLiu/mushroom-rl
    def _log_info(self, dataset, x, v_target, old_pol_dist):
        if self._logger:
            logging_verr = []
            torch_v_targets = torch.tensor(v_target, dtype=torch.float)
            for idx in range(len(self._V)):
                v_pred = torch.tensor(self._V(x, idx=idx), dtype=torch.float)
                v_err = F.mse_loss(v_pred, torch_v_targets)
                logging_verr.append(v_err.item())

            logging_ent = self.policy.entropy(x)
            new_pol_dist = self.policy.distribution(x)
            logging_kl = torch.mean(
                torch.distributions.kl.kl_divergence(old_pol_dist,
                                                     new_pol_dist))
            avg_rwd = np.mean(compute_J(dataset))
            msg = "Iteration {}:\n\t\t\t\trewards {} vf_loss {}\n\t\t\t\tentropy {}  kl {}".format(
                self._iter, avg_rwd, logging_verr, logging_ent, logging_kl)

            self._logger.info(msg)
            self._logger.weak_line()
Code Example #28
def experiment(alg, params, n_epochs, n_episodes, n_ep_per_fit):
    np.random.seed()

    # MDP
    mdp = Segway()

    # Policy
    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    n_weights = approximator.weights_size
    mu = np.zeros(n_weights)
    sigma = 2e-0 * np.ones(n_weights)
    policy = DeterministicPolicy(approximator)
    dist = GaussianDiagonalDistribution(mu, sigma)

    agent = alg(mdp.info, dist, policy, **params)

    # Train
    print(alg.__name__)
    dataset_callback = CollectDataset()
    core = Core(agent, mdp, callbacks_fit=[dataset_callback])

    for i in range(n_epochs):
        core.learn(n_episodes=n_episodes,
                   n_episodes_per_fit=n_ep_per_fit,
                   render=False)
        J = compute_J(dataset_callback.get(), gamma=mdp.info.gamma)
        dataset_callback.clean()

        p = dist.get_parameters()

        print('mu:    ', p[:n_weights])
        print('sigma: ', p[n_weights:])
        print('Reward at iteration ' + str(i) + ': ' + str(np.mean(J)))

    print('Press a button to visualize the segway...')
    input()
    core.evaluate(n_episodes=3, render=True)
Code Example #29
def experiment(alg, n_epochs, n_steps, n_episodes_test):
    np.random.seed()

    logger = Logger(alg.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + alg.__name__)

    # MDP
    gamma = 0.99
    habitat_root_path = Habitat.root_path()
    config_file = os.path.join(
        habitat_root_path, 'habitat_baselines/config/rearrange/rl_pick.yaml')
    base_config_file = os.path.join(habitat_root_path,
                                    'configs/tasks/rearrange/pick.yaml')
    wrapper = 'HabitatRearrangeWrapper'
    mdp = Habitat(wrapper, config_file, base_config_file, gamma=gamma)

    # Settings
    initial_replay_size = 64
    max_replay_size = 50000
    batch_size = 64
    n_features = 64
    warmup_transitions = 100
    tau = 0.005
    lr_alpha = 3e-4

    use_cuda = torch.cuda.is_available()

    # Approximator
    actor_input_shape = mdp.info.observation_space.shape
    actor_mu_params = dict(network=ActorNetwork,
                           n_features=n_features,
                           input_shape=actor_input_shape,
                           output_shape=mdp.info.action_space.shape,
                           use_cuda=use_cuda)
    actor_sigma_params = dict(network=ActorNetwork,
                              n_features=n_features,
                              input_shape=actor_input_shape,
                              output_shape=mdp.info.action_space.shape,
                              use_cuda=use_cuda)

    actor_optimizer = {'class': optim.Adam, 'params': {'lr': 3e-4}}

    critic_input_shape = actor_input_shape + mdp.info.action_space.shape
    critic_params = dict(network=CriticNetwork,
                         optimizer={
                             'class': optim.Adam,
                             'params': {
                                 'lr': 3e-4
                             }
                         },
                         loss=F.mse_loss,
                         n_features=n_features,
                         input_shape=critic_input_shape,
                         output_shape=(1, ),
                         use_cuda=use_cuda)

    # Agent
    agent = alg(mdp.info,
                actor_mu_params,
                actor_sigma_params,
                actor_optimizer,
                critic_params,
                batch_size,
                initial_replay_size,
                max_replay_size,
                warmup_transitions,
                tau,
                lr_alpha,
                critic_fit_params=None)

    # Algorithm
    core = Core(agent, mdp)

    # RUN
    dataset = core.evaluate(n_episodes=n_episodes_test, render=False)
    s, *_ = parse_dataset(dataset)

    J = np.mean(compute_J(dataset, mdp.info.gamma))
    R = np.mean(compute_J(dataset))
    E = agent.policy.entropy(s)

    logger.epoch_info(0, J=J, R=R, entropy=E)

    core.learn(n_steps=initial_replay_size,
               n_steps_per_fit=initial_replay_size)

    for n in trange(n_epochs, leave=False):
        core.learn(n_steps=n_steps, n_steps_per_fit=1)
        dataset = core.evaluate(n_episodes=n_episodes_test, render=False)
        s, *_ = parse_dataset(dataset)

        J = np.mean(compute_J(dataset, mdp.info.gamma))
        R = np.mean(compute_J(dataset))
        E = agent.policy.entropy(s)

        logger.epoch_info(n + 1, J=J, R=R, entropy=E)

    logger.info('Press a button to visualize the robot')
    input()
    core.evaluate(n_episodes=5, render=True)
Code Example #30
def experiment(alg, n_epochs, n_steps, n_steps_test):
    np.random.seed()

    # MDP
    horizon = 200
    gamma = 0.99
    mdp = Gym('Pendulum-v0', horizon, gamma)

    # Settings
    initial_replay_size = 64
    max_replay_size = 50000
    batch_size = 64
    n_features = 64
    warmup_transitions = 100
    tau = 0.005
    lr_alpha = 3e-4

    use_cuda = torch.cuda.is_available()

    # Approximator
    actor_input_shape = mdp.info.observation_space.shape
    actor_mu_params = dict(network=ActorNetwork,
                           n_features=n_features,
                           input_shape=actor_input_shape,
                           output_shape=mdp.info.action_space.shape,
                           use_cuda=use_cuda)
    actor_sigma_params = dict(network=ActorNetwork,
                              n_features=n_features,
                              input_shape=actor_input_shape,
                              output_shape=mdp.info.action_space.shape,
                              use_cuda=use_cuda)

    actor_optimizer = {'class': optim.Adam, 'params': {'lr': 3e-4}}

    critic_input_shape = (actor_input_shape[0] +
                          mdp.info.action_space.shape[0], )
    critic_params = dict(network=CriticNetwork,
                         optimizer={
                             'class': optim.Adam,
                             'params': {
                                 'lr': 3e-4
                             }
                         },
                         loss=F.mse_loss,
                         n_features=n_features,
                         input_shape=critic_input_shape,
                         output_shape=(1, ),
                         use_cuda=use_cuda)

    # Agent
    agent = alg(mdp.info,
                actor_mu_params,
                actor_sigma_params,
                actor_optimizer,
                critic_params,
                batch_size,
                initial_replay_size,
                max_replay_size,
                warmup_transitions,
                tau,
                lr_alpha,
                critic_fit_params=None)

    # Algorithm
    core = Core(agent, mdp)

    core.learn(n_steps=initial_replay_size,
               n_steps_per_fit=initial_replay_size)

    # RUN
    dataset = core.evaluate(n_steps=n_steps_test, render=False)
    J = compute_J(dataset, gamma)
    print('J: ', np.mean(J))

    for n in range(n_epochs):
        print('Epoch: ', n)
        core.learn(n_steps=n_steps, n_steps_per_fit=1)
        dataset = core.evaluate(n_steps=n_steps_test, render=False)
        J = compute_J(dataset, gamma)
        print('J: ', np.mean(J))

    print('Press a button to visualize pendulum')
    input()
    core.evaluate(n_episodes=5, render=True)