    'mu': mu_hat
}.items():
    plt.pcolormesh(p, s, zz.T, cmap='coolwarm_r', vmin=-1, vmax=1, rasterized=True)
    plt.xlabel(r'$p$')
    plt.ylabel(r'$s$')

    if 'q' in key:
        plt.colorbar()

    plt.savefig(
        f'4_{generator.__name__}_{stop}_{method.__name__}_{key}.pdf'
    )
    plt.close()

### Compute J^mû_N'

N_prime = math.ceil(math.log((eps / B_r), gamma))
trajectories = samples(policify(mu_hat), N_prime)
j_hat = expected_return(trajectories, N_prime)

print('J^mû_N =', j_hat)
print()
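# The helpers `samples`, `policify` and `expected_return` are defined elsewhere in
# the project and are not shown here. The function below is only a minimal sketch
# of a Monte Carlo estimate of the truncated return J^mu_N, assuming
# (hypothetically) that each trajectory is a sequence of (x, u, r, x_prime)
# transitions; it is not the project's actual implementation.
def expected_return_sketch(trajectories, N, gamma=0.95):
    returns = []

    for trajectory in trajectories:
        cr = 0.  # discounted cumulative reward of this trajectory

        for t, (x, u, r, x_prime) in enumerate(trajectory[:N]):
            cr += (gamma ** t) * r

        returns.append(cr)

    # Average over the sampled trajectories
    return sum(returns) / len(returns)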
def main(
    render: bool = False,
    eps: float = 1.0,
    n_actions: int = 11,
    n_estimators: int = 20
):
    # Environment
    gym.logger.set_level(40)
    env = gym.make('InvertedDoublePendulumPyBulletEnv-v0')

    # Rendering
    if render:
        env.render()
        env.reset()

        # /!\ Only works in our modified version of PyBullet Gym
        env.camera.env._p.resetDebugVisualizerCamera(2, 0, -20, [0, 0, 0])

    # Setup
    gamma = 0.95
    max_steps = 1000
    n_evaluate = 20  # number of episodes to evaluate the model
    B_r = 10  # maximum possible reward

    N = math.ceil(math.log((eps / (2 * B_r)) * (1. - gamma) ** 2, gamma))
    print(f'N = {N}')

    rewards = []

    # Discrete actions
    actions = np.linspace(
        env.action_space.low[0],
        env.action_space.high[0],
        n_actions
    )

    # Agent
    agent = FQI(
        U=actions,
        U_dim=env.action_space.shape[0],
        gamma=gamma,
        n_estimators=n_estimators
    )
    agent.fill(env, 3000, max_steps)  # generate 3000 transitions to start

    # Training
    for _ in tqdm(range(N)):
        # Train the model
        agent.optimize()

        # Evaluate
        evals = []

        for _ in range(n_evaluate):
            x = env.reset()
            cr = 0  # cumulative reward

            for step in range(max_steps):
                u = agent.action(x)
                x_prime, r, done, _ = env.step(u)

                # Store the transition in the agent's transition list
                agent.memory.append((x, u[0], r, x_prime, done))

                x = x_prime
                cr += (gamma ** step) * r  # accumulate before the terminal check

                if done:
                    break

            evals.append(cr)

        print(f'Memory size: {len(agent.memory)}')
        rewards.append(evals)

    # Export results
    rewards = np.array(rewards)
    mean = np.mean(rewards, axis=1)
    std = np.std(rewards, axis=1)

    plt.plot(mean)
    plt.fill_between(
        range(N),
        mean - std,
        mean + std,
        alpha=0.3
    )
    plt.xlabel('N')
    plt.ylabel(r'$J^{\mu}$')
    plt.savefig(f'fqi_J_{n_actions}_{n_estimators}.pdf')
    plt.close()
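# The FQI class is defined elsewhere in the project. The function below is only a
# rough sketch of the fitted Q-iteration update that `agent.optimize()` performs,
# assuming (hypothetically) that the memory holds (x, u, r, x_prime, done) tuples
# and that an extra-trees regressor approximates Q; names and hyperparameters are
# illustrative, not the project's actual code.
import numpy as np
from sklearn.ensemble import ExtraTreesRegressor


def fqi_iteration(memory, model, actions, gamma=0.95, n_estimators=20):
    # Regressor inputs: state-action pairs
    X = np.array([np.append(x, u) for x, u, _, _, _ in memory])
    r = np.array([r for _, _, r, _, _ in memory])
    done = np.array([d for _, _, _, _, d in memory], dtype=bool)

    if model is None:
        # First iteration: Q_1 is fitted on the immediate rewards
        y = r
    else:
        # Bootstrapped targets: r + gamma * max_u' Q_{k-1}(x', u')
        x_prime = np.array([xp for _, _, _, xp, _ in memory])
        q_next = np.column_stack([
            model.predict(np.column_stack([x_prime, np.full(len(x_prime), u)]))
            for u in actions
        ])
        y = r + gamma * ~done * q_next.max(axis=1)

    new_model = ExtraTreesRegressor(n_estimators=n_estimators, n_jobs=-1)
    new_model.fit(X, y)

    return new_model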
def main(
    render: bool = False,
    n_episodes: int = 500,
    discrete: int = None,
    n_layers: int = 1,
    gamma: float = 0.95,
    activation_id: str = 'relu'
):
    # Environment
    gym.logger.set_level(40)
    env = gym.make('InvertedDoublePendulumPyBulletEnv-v0')

    # Rendering
    if render:
        env.render()
        env.reset()

        # /!\ Only works in our modified version of PyBullet Gym
        env.camera.env._p.resetDebugVisualizerCamera(2, 0, -20, [0, 0, 0])

    # Setup
    max_steps = 1000
    n_evaluate = 50  # number of episodes to evaluate the model

    rewards = []

    # Agent
    activations = {'relu': nn.ReLU, 'elu': nn.ELU}
    activation = activations.get(activation_id)

    agent = DDPG(
        env,
        gamma=gamma,
        discrete=discrete,
        n_layers=n_layers,
        activation=activation
    )
    noise = OrnsteinUhlenbeck(env.action_space)

    # Training
    for _ in tqdm(range(n_episodes)):
        x = env.reset()
        noise.reset()

        # Simulate the episode until a terminal state or the maximum number of steps
        for step in range(max_steps):
            u = agent.action(x)

            if discrete is None:
                u = noise.action(u)

            x_prime, r, done, _ = env.step(u)

            # Save the transition
            agent.memory.push((
                torch.tensor(x).float(),
                u[0],
                r,
                torch.tensor(x_prime).float(),
                done
            ))

            # Optimization
            if agent.memory.is_ready():
                agent.optimize()

            x = x_prime

            # If terminal state, stop the episode
            if done:
                break

        # Evaluation
        evals = []

        for _ in range(n_evaluate):
            x = env.reset()
            cr = 0  # cumulative reward

            for step in range(max_steps):
                u = agent.action(x)
                x, r, done, _ = env.step(u)

                cr += (gamma ** step) * r  # accumulate before the terminal check

                if done:
                    break

            evals.append(cr)

        rewards.append(evals)

    # Export results
    rewards = np.array(rewards)
    mean = np.mean(rewards, axis=1)
    std = np.std(rewards, axis=1)

    plt.plot(mean)
    plt.fill_between(
        range(n_episodes),
        mean - std,
        mean + std,
        alpha=0.3
    )
    plt.xlabel('Episode')
    plt.ylabel(r'$J^{\mu}$')
    plt.savefig(f'ddpg_J_{discrete}_{n_layers}_{gamma}.pdf')
    plt.close()
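# The OrnsteinUhlenbeck exploration noise used above is defined elsewhere in the
# project. The class below is only a minimal sketch of such a process with the
# usual DDPG-style parameters (mu, theta, sigma); it is illustrative and not
# necessarily the project's implementation.
import numpy as np


class OrnsteinUhlenbeckSketch:
    def __init__(self, action_space, mu=0., theta=0.15, sigma=0.2):
        self.low = action_space.low
        self.high = action_space.high
        self.mu = mu
        self.theta = theta
        self.sigma = sigma

        self.reset()

    def reset(self):
        # Restart the process at its mean
        self.state = np.full(self.low.shape, self.mu)

    def action(self, u):
        # Mean-reverting update with Gaussian increments
        dx = self.theta * (self.mu - self.state)
        dx += self.sigma * np.random.randn(*self.state.shape)
        self.state = self.state + dx

        # Perturb the action and keep it within the action bounds
        return np.clip(u + self.state, self.low, self.high)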