Example #1
def main(
    render: bool = False,
    eps: float = 1.0,
    n_actions: int = 11,
    n_estimators: int = 20
):
    # Environment

    gym.logger.set_level(40)
    env = gym.make('InvertedDoublePendulumPyBulletEnv-v0')

    # Rendering

    if render:
        env.render()
        env.reset()

        # /!\ Only works in our modified version of PyBullet Gym
        env.camera.env._p.resetDebugVisualizerCamera(2, 0, -20, [0, 0, 0])

    # Setup

    gamma = 0.95
    max_steps = 1000
    n_evaluate = 20  # number of episodes to evaluate model

    B_r = 10  # maximum possible reward
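    # Horizon N: presumably chosen so that the FQI suboptimality bound
    # 2 * B_r * gamma^N / (1 - gamma)^2 <= eps is met (cf. Ernst et al., 2005),
    # i.e. N >= log_gamma(eps * (1 - gamma)^2 / (2 * B_r)).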
    N = math.ceil(math.log((eps / (2 * B_r)) * (1. - gamma) ** 2, gamma))

    print(f'N = {N}')

    rewards = []

    # Discrete actions

    actions = np.linspace(
        env.action_space.low[0],
        env.action_space.high[0],
        n_actions
    )

    # Agent

    agent = FQI(
        U=actions,
        U_dim=env.action_space.shape[0],
        gamma=gamma,
        n_estimators=n_estimators
    )
    agent.fill(env, 3000, max_steps)  # generate 3000 transitions to start

    # Training

    for _ in tqdm(range(N)):
        # Train the model

        agent.optimize()

        # Evaluate

        evals = []

        for _ in range(n_evaluate):
            x = env.reset()
            cr = 0  # cumulative reward

            for step in range(max_steps):
                u = agent.action(x)
                x_prime, r, done, _ = env.step(u)

                # Store the transition in the agent's memory
                agent.memory.append((x, u[0], r, x_prime, done))

                x = x_prime

                if done:
                    break

                cr += (gamma ** step) * r

            evals.append(cr)

        print(f'Memory size: {len(agent.memory)}')

        rewards.append(evals)

    # Export results

    rewards = np.array(rewards)

    mean = np.mean(rewards, axis=1)
    std = np.std(rewards, axis=1)

    plt.plot(mean)
    plt.fill_between(
        range(N),
        mean - std,
        mean + std,
        alpha=0.3
    )

    plt.xlabel('N')
    plt.ylabel(r'$J^{\mu}$')

    plt.savefig(f'fqi_J_{n_actions}_{n_estimators}.pdf')
    plt.close()
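
The script above only uses the FQI agent through its constructor, fill, optimize, action and the memory list; the actual implementation is not included in the example. The following is a minimal interface sketch consistent with that usage. The extra-trees regressor, the random-policy fill() and the greedy-only action() are assumptions, not the original code.

# Minimal interface sketch of the FQI agent assumed by the script above.
# NOT the original implementation: the extra-trees regressor and the random
# exploration in fill() are assumptions based on common FQI setups.
import random

import numpy as np
from sklearn.ensemble import ExtraTreesRegressor


class FQI:
    def __init__(self, U, U_dim, gamma, n_estimators):
        self.U = U                # discrete action set
        self.U_dim = U_dim        # dimension of the action space
        self.gamma = gamma
        self.model = ExtraTreesRegressor(n_estimators=n_estimators)
        self.memory = []          # list of (x, u, r, x', done) tuples
        self.trained = False

    def fill(self, env, n, max_steps):
        """Collect n transitions with a uniformly random policy."""
        while len(self.memory) < n:
            x, step, done = env.reset(), 0, False
            while not done and step < max_steps:
                u = random.choice(self.U)
                x_prime, r, done, _ = env.step([u])
                self.memory.append((x, u, r, x_prime, done))
                x, step = x_prime, step + 1

    def optimize(self):
        """One FQI iteration: regress r + gamma * max_u' Q(x', u')."""
        x, u, r, x_prime, done = map(np.array, zip(*self.memory))
        y = r.astype(float)
        if self.trained:
            q_prime = np.stack([
                self.model.predict(
                    np.column_stack([x_prime, np.full(len(x_prime), a)])
                )
                for a in self.U
            ], axis=-1)
            y += self.gamma * q_prime.max(axis=-1) * (1 - done)
        self.model.fit(np.column_stack([x, u]), y)
        self.trained = True

    def action(self, x):
        """Greedy action w.r.t. the current Q-estimate."""
        if not self.trained:
            return np.array([random.choice(self.U)])
        q = self.model.predict(
            np.column_stack([np.tile(x, (len(self.U), 1)), self.U])
        )
        return np.array([self.U[int(np.argmax(q))]])
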
Example #2
                for key, zz in {
                        'q_-4': qq[..., 0],
                        'q_+4': qq[..., 1],
                        'mu': mu_hat
                }.items():

                    plt.pcolormesh(p,
                                   s,
                                   zz.T,
                                   cmap='coolwarm_r',
                                   vmin=-1,
                                   vmax=1,
                                   rasterized=True)
                    plt.xlabel(r'$p$')
                    plt.ylabel(r'$s$')

                    if 'q' in key:
                        plt.colorbar()

                    plt.savefig(
                        f'4_{generator.__name__}_{stop}_{method.__name__}_{key}.pdf'
                    )
                    plt.close()

                ### Compute J^mû_N'

                N_prime = math.ceil(math.log((eps / B_r), gamma))

                trajectories = samples(policify(mu_hat), N_prime)
                j_hat = expected_return(trajectories, N_prime)
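
The fragment above depends on policify, samples and expected_return, which are defined elsewhere. For illustration, a hypothetical policify that turns the gridded greedy action table mu_hat into a callable policy by nearest-grid-point lookup could look like the sketch below; the explicit p_grid/s_grid arguments and the (p, s) state layout are assumptions (the original call takes only mu_hat, presumably capturing the grids from the enclosing scope).

# Hypothetical policify helper: wrap a gridded action table into a policy.
# The (p, s) state layout and the explicit grid arguments are assumptions.
import numpy as np


def policify(mu_hat, p_grid, s_grid):
    def mu(state):
        p_val, s_val = state
        i = int(np.abs(p_grid - p_val).argmin())  # nearest grid index along p
        j = int(np.abs(s_grid - s_val).argmin())  # nearest grid index along s
        return mu_hat[i, j]
    return mu
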
Example #3
def main(render: bool = False,
         n_episodes: int = 500,
         discrete: int = None,
         n_layers: int = 1,
         gamma: float = 0.95,
         activation_id: str = 'relu'):
    # Environment

    gym.logger.set_level(40)
    env = gym.make('InvertedDoublePendulumPyBulletEnv-v0')

    # Rendering

    if render:
        env.render()
        env.reset()

        # /!\ Only works in our modified version of PyBullet Gym
        env.camera.env._p.resetDebugVisualizerCamera(2, 0, -20, [0, 0, 0])

    # Setup

    max_steps = 1000
    n_evaluate = 50  # number of episodes to evaluate model

    rewards = []

    # Agent

    activations = {'relu': nn.ReLU, 'elu': nn.ELU}

    activation = activations.get(activation_id)

    agent = DDPG(env,
                 gamma=gamma,
                 discrete=discrete,
                 n_layers=n_layers,
                 activation=activation)
    noise = OrnsteinUhlenbeck(env.action_space)

    # Training

    for _ in tqdm(range(n_episodes)):
        x = env.reset()
        noise.reset()

        # Simulate the episode until terminal state or max number of steps

        for step in range(max_steps):
            u = agent.action(x)

            if discrete is None:
                u = noise.action(u)

            x_prime, r, done, _ = env.step(u)

            # Save transition

            agent.memory.push((torch.tensor(x).float(), u[0], r,
                               torch.tensor(x_prime).float(), done))

            # Optimization

            if agent.memory.is_ready():
                agent.optimize()

            x = x_prime

            # If terminal state, stop the episode

            if done:
                break

        # Evaluation

        evals = []

        for _ in range(n_evaluate):
            x = env.reset()
            cr = 0  # cumulative reward

            for step in range(max_steps):
                u = agent.action(x)
                x, r, done, _ = env.step(u)

                if done:
                    break

                cr += (gamma**step) * r

            evals.append(cr)

        rewards.append(evals)

    # Export results

    rewards = np.array(rewards)

    mean = np.mean(rewards, axis=1)
    std = np.std(rewards, axis=1)

    plt.plot(mean)
    plt.fill_between(range(n_episodes), mean - std, mean + std, alpha=0.3)

    plt.xlabel('Episode')
    plt.ylabel(r'$J^{\mu}$')

    plt.savefig(f'ddpg_J_{discrete}_{n_layers}_{gamma}.pdf')
    plt.close()
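
The DDPG script relies on an OrnsteinUhlenbeck exploration-noise helper exposing reset() and action(u). The original class is not shown; a minimal sketch of such a process, using the hyper-parameters commonly paired with DDPG (mu = 0, theta = 0.15, sigma = 0.2, which are assumptions here), could look like this.

# Hypothetical Ornstein-Uhlenbeck exploration noise, as commonly used with DDPG.
# The theta/sigma values are assumptions, not taken from the original code.
import numpy as np


class OrnsteinUhlenbeck:
    def __init__(self, action_space, mu=0.0, theta=0.15, sigma=0.2):
        self.low = action_space.low
        self.high = action_space.high
        self.mu = mu * np.ones(action_space.shape)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # Restart the noise process at its mean
        self.state = np.copy(self.mu)

    def action(self, u):
        # One Euler step of the OU process, then perturb and clip the action
        dx = self.theta * (self.mu - self.state)
        dx += self.sigma * np.random.randn(*self.state.shape)
        self.state = self.state + dx
        return np.clip(u + self.state, self.low, self.high)
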
Example #4
# 2.b Apply

if __name__ == '__main__':
    from plots import plt

    ## Choose N

    N = math.ceil(math.log((eps / B_r), gamma))
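    # Presumably N satisfies gamma^N * B_r <= eps, i.e. the discounted reward
    # at horizon N is below the tolerance eps.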

    print('N =', N)

    ## Compute J^mu_N

    ### Simulate 50 trajectories

    trajectories = samples(stepback, N, 50)

    ### Plot

    ns = list(range(1, N))
    J = []

    for n in ns:
        J.append(expected_return(trajectories, n))

    plt.plot(ns, J)
    plt.xlabel(r'$N$')
    plt.ylabel(r'$J^\mu_N$')
    plt.savefig('2_expected_return.pdf')
    plt.close()
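
samples and expected_return are not shown in this example. For illustration, an expected_return consistent with how it is called here would be a Monte-Carlo estimate of the truncated return J^mu_n averaged over trajectories; in the sketch below, the (x, u, r, x') transition format and the gamma default are assumptions.

# Hypothetical expected_return: Monte-Carlo estimate of the truncated return
# J^mu_n = E[sum_{t < n} gamma^t r_t]. The transition tuple format and the
# gamma default are assumptions.
def expected_return(trajectories, n, gamma=0.95):
    returns = []
    for trajectory in trajectories:
        cr = 0.0
        for t, (_, _, r, _) in enumerate(trajectory[:n]):
            cr += (gamma ** t) * r
        returns.append(cr)
    return sum(returns) / len(returns)
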
Example #5
            qq = qq.reshape(gridshape)
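            # Greedy policy on the grid: map the argmax index over the two
            # Q-values (0 or 1) to the corresponding action (-1 or +1)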
            mu_hat = 2 * qq.argmax(axis=-1) - 1

            if n == transitions[-1]:
                plts = {'q0': qq[..., 0], 'q1': qq[..., 1], 'mu': mu_hat}

                for name, values in plts.items():
                    plt.pcolormesh(p,
                                   s,
                                   values.T,
                                   cmap='coolwarm_r',
                                   vmin=-1,
                                   vmax=1,
                                   rasterized=True)
                    plt.xlabel(r'$p$')
                    plt.ylabel(r'$s$')

                    if 'q' in name:
                        plt.colorbar()

                    plt.savefig(f'5_{name}_{key}.pdf')
                    plt.close()

            ### Compute J^mû_N'

            trajectories = samples(policify(mu_hat), N_prime)
            j_hat = expected_return(trajectories, N_prime)

            print('J^mû_N =', j_hat)

            js[key].append(j_hat)