Example no. 1
    def __call__(self, dataset):
        # The parent callback accumulates the incoming samples into self._data_list.
        super().__call__(dataset)

        # Stream each transition's observations, actions and reward into the
        # live plot buffers.
        for sample in dataset:
            obs = sample[0]
            action = sample[1]
            reward = sample[2]

            for i in range(len(action)):
                self.action_buffers_list[i].update([action[i]])

            for i in range(obs.size):
                self.observation_buffers_list[i].update([obs[i]])

            self.instant_reward_buffer.update([reward])

        # Split the accumulated samples into completed episodes.
        lengths_of_episodes = episodes_length(self._data_list)

        start_index = 0
        for length_of_episode in lengths_of_episodes:
            sub_dataset = self._data_list[start_index:start_index +
                                          length_of_episode]

            # Track the undiscounted return and the length of each episode.
            episodic_reward = compute_J(sub_dataset)
            self.training_reward_buffer.update([episodic_reward[0]])
            self.episodic_len_buffer_training.update([length_of_episode])

            start_index += length_of_episode

        # Keep only the samples of the trailing, still-incomplete episode.
        self._data_list = self._data_list[start_index:]

        self.plot_window.refresh()
Example no. 2
def experiment(goal, use_muscles, n_epochs, n_steps, n_episodes_test):
    np.random.seed(1)

    # MDP
    gamma = 0.99
    horizon = 2000
    mdp = create_mdp(gamma, horizon, goal, use_muscles=use_muscles)

    # Agent
    agent = create_SAC_agent(mdp)

    # normalization preprocessor
    normalizer = MinMaxPreprocessor(mdp_info=mdp.info)

    # plotting callback
    plotter = PlotDataset(mdp.info)

    # Algorithm (with normalization and plotting)
    core = Core(agent, mdp, callback_step=plotter, preprocessors=[normalizer])

    # training loop
    for n in range(n_epochs):
        core.learn(n_steps=n_steps, n_steps_per_fit=1)
        dataset = core.evaluate(n_episodes=n_episodes_test, render=True)
        print('Epoch: ', n, '  J: ', np.mean(compute_J(dataset, gamma)),
              '  Len_ep: ', int(np.round(np.mean(episodes_length(dataset)))))

    print('Press ENTER to visualize the humanoid')
    input()
    core.evaluate(n_episodes=10, render=True)
Example no. 3
def experiment(goal, use_muscles, n_epochs, n_steps, n_episodes_test):
    np.random.seed(1)

    logger = Logger('SAC', results_dir=None)
    logger.strong_line()
    logger.info('Humanoid Experiment, Algorithm: SAC')

    # MDP
    gamma = 0.99
    horizon = 2000
    mdp = create_mdp(gamma, horizon, goal, use_muscles=use_muscles)

    # Agent
    agent = create_SAC_agent(mdp)

    # normalization preprocessor
    normalizer = MinMaxPreprocessor(mdp_info=mdp.info)

    # plotting callback
    plotter = PlotDataset(mdp.info)

    # Algorithm (with normalization and plotting)
    core = Core(agent, mdp, callback_step=plotter, preprocessors=[normalizer])

    # Evaluate the untrained agent to log epoch-0 statistics.
    dataset = core.evaluate(n_episodes=n_episodes_test, render=True)

    J = np.mean(compute_J(dataset, gamma))
    L = int(np.round(np.mean(episodes_length(dataset))))

    logger.epoch_info(0, J=J, episode_length=L)

    # training loop
    for n in trange(n_epochs, leave=False):
        core.learn(n_steps=n_steps, n_steps_per_fit=1)
        dataset = core.evaluate(n_episodes=n_episodes_test, render=True)

        J = np.mean(compute_J(dataset, gamma))
        L = int(np.round(np.mean(episodes_length(dataset))))

        logger.epoch_info(n+1, J=J, episode_length=L)

    logger.info('Press ENTER to visualize the humanoid')
    input()
    core.evaluate(n_episodes=10, render=True)
Example no. 4
def get_stats(dataset, gamma):
    # Discounted return of each completed episode.
    J = compute_J(dataset, gamma)

    # Index of the last transition of each episode.
    abs_idxs = np.cumsum(episodes_length(dataset)) - 1

    # Success flag stored in the final transition of each episode.
    S = list()
    for idx in abs_idxs:
        S.append(dataset[idx][3]['info']['is_success'])

    J = np.mean(J)
    S = np.mean(S)

    print('J: ', J)
    print('S: ', S)

    return J, S
Example no. 5
def experiment():
    np.random.seed()

    # MDP
    mdp = CartPole()

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent: a constant bias feature plus a 3x3 grid of Gaussian RBFs over
    # the two state variables.
    basis = [PolynomialBasis()]

    s1 = np.array([-np.pi, 0, np.pi]) * .25
    s2 = np.array([-1, 0, 1])
    for i in s1:
        for j in s2:
            basis.append(GaussianRBF(np.array([i, j]), np.array([1.])))
    features = Features(basis_list=basis)

    fit_params = dict()
    approximator_params = dict(input_shape=(features.size, ),
                               output_shape=(mdp.info.action_space.n, ),
                               n_actions=mdp.info.action_space.n)
    agent = LSPI(mdp.info,
                 pi,
                 approximator_params=approximator_params,
                 fit_params=fit_params,
                 features=features)

    # Algorithm
    core = Core(agent, mdp)

    # Visual check of the initial (random) policy.
    core.evaluate(n_episodes=3, render=True)

    # Train
    core.learn(n_episodes=100, n_episodes_per_fit=100)

    # Test
    test_epsilon = Parameter(0.)
    agent.policy.set_epsilon(test_epsilon)

    dataset = core.evaluate(n_episodes=1, quiet=True)

    core.evaluate(n_steps=100, render=True)

    return np.mean(episodes_length(dataset))
Example no. 6
def experiment(alpha):
    np.random.seed()

    # MDP
    mdp = Gym(name='Acrobot-v1', horizon=np.inf, gamma=1.)

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    n_tilings = 10
    tilings = Tiles.generate(n_tilings, [10, 10, 10, 10, 10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high)
    features = Features(tilings=tilings)

    learning_rate = Parameter(alpha / n_tilings)

    approximator_params = dict(input_shape=(features.size, ),
                               output_shape=(mdp.info.action_space.n, ),
                               n_actions=mdp.info.action_space.n)
    algorithm_params = {'learning_rate': learning_rate, 'lambda_coeff': .9}

    agent = TrueOnlineSARSALambda(mdp.info,
                                  pi,
                                  approximator_params=approximator_params,
                                  features=features,
                                  **algorithm_params)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=10, n_steps_per_fit=1, render=True)
    dataset = core.evaluate(n_episodes=1, render=False)
    print(episodes_length(dataset))

    # Note: the return is computed with a discount of .96, although the MDP
    # itself is undiscounted (gamma=1.).
    return np.mean(compute_J(dataset, .96))