import gym
import numpy as np
import torch
import matplotlib.pyplot as plt
from tqdm import tqdm, trange

# RandomAgent, AgentQ and ExperienceReplayBuffer are assumed to be provided by
# other modules of this repository; their import paths are not guessed here.


def fill_buffer():
    """Fill the replay buffer with experiences collected by a random agent.

    Relies on the module-level `env`, `buffer`, `L` and `m`.
    """
    agent = RandomAgent(m)
    # Reset environment data
    while len(buffer) < L:
        done = False
        state = env.reset()
        while not done:
            action = agent.forward(state)
            next_state, reward, done, _ = env.step(action)
            buffer.append((state, action, reward, next_state, done))
            state = next_state
    print('Buffer filled!')
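# NOTE: `ExperienceReplayBuffer` is defined elsewhere in this repository; the
# class below is only an assumed sketch of the interface used in this file
# (append(experience), len(), and sample_batch(n) returning one tuple per field).
from collections import deque


class ExperienceReplayBufferSketch:
    """Fixed-size FIFO buffer of (state, action, reward, next_state, done) tuples."""

    def __init__(self, maximum_length):
        self.buffer = deque(maxlen=maximum_length)

    def append(self, experience):
        # experience = (state, action, reward, next_state, done)
        self.buffer.append(experience)

    def __len__(self):
        return len(self.buffer)

    def sample_batch(self, n):
        # Sample n distinct experiences uniformly at random
        if n > len(self.buffer):
            raise IndexError('Tried to sample too many elements from the buffer!')
        indices = np.random.choice(len(self.buffer), size=n, replace=False)
        batch = [self.buffer[i] for i in indices]
        # Unzip into separate tuples of states, actions, rewards, next_states, dones
        states, actions, rewards, next_states, dones = zip(*batch)
        return states, actions, rewards, next_states, dones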
def run_sim(agent=None):
    env = gym.make('LunarLanderContinuous-v2')
    env.reset()

    # Parameters
    N_episodes = 50                               # Number of episodes
    n_ep_running_average = 50                     # Running average of 50 episodes
    n_actions = len(env.action_space.high)        # Action dimensionality
    dim_state = len(env.observation_space.high)   # State dimensionality

    # We will use this list to compute the average episodic reward
    episode_reward_list = []    # Contains the total reward per episode

    if agent is None:
        actor = RandomAgent(n_actions)
    else:
        actor = agent

    for i in range(N_episodes):
        # Reset environment data and initialize variables
        done = False
        state = env.reset()
        total_episode_reward = 0.
        while not done:
            # Choose an action: at random if no agent is given,
            # otherwise from the trained actor network
            if agent is None:
                action = actor.forward(state)
            else:
                action = actor.forward(
                    torch.tensor([state], dtype=torch.float32))[0].cpu().detach().numpy()
            next_state, reward, done, _ = env.step(action)

            # Update episode reward
            total_episode_reward += reward

            # Update state for next iteration
            state = next_state

        # Append episode reward
        episode_reward_list.append(total_episode_reward)

    # Close environment
    env.close()
    return episode_reward_list
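# NOTE: `RandomAgent` is defined elsewhere in this repository; the class below
# only illustrates the assumed interface: forward(state) ignores the state and
# returns a uniformly random action in [-1, 1]^m, the action range of
# LunarLanderContinuous-v2.
class RandomAgentSketch:
    """Agent that acts uniformly at random; used to pre-fill the replay buffer."""

    def __init__(self, n_actions):
        self.n_actions = n_actions

    def forward(self, state):
        # Sample each action component uniformly in [-1, 1]
        return np.clip(-1 + 2 * np.random.rand(self.n_actions), -1, 1)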
# Import and initialize the Lunar Lander environment
env = gym.make('LunarLanderContinuous-v2')
env.reset()

# Parameters
N_episodes = 100               # Number of episodes to run for training
discount_factor = 0.95         # Value of gamma
n_ep_running_average = 50      # Running average of 50 episodes
m = len(env.action_space.high) # Dimensionality of the action

# Reward
episode_reward_list = []       # Used to save episodes reward
episode_number_of_steps = []

# Agent initialization
agent = RandomAgent(m)

# Training process
EPISODES = trange(N_episodes, desc='Episode: ', leave=True)

for i in EPISODES:
    # Reset environment data
    done = False
    state = env.reset()
    total_episode_reward = 0.
    t = 0
    while not done:
        # Take a random action
        action = agent.forward(state)

        # Get next state and reward. The done variable
        # will be True if you reached the goal position,
        # False otherwise
        next_state, reward, done, _ = env.step(action)

        # Update episode reward and state for the next iteration
        total_episode_reward += reward
        state = next_state
        t += 1

    # Append episode reward and total number of steps
    episode_reward_list.append(total_episode_reward)
    episode_number_of_steps.append(t)

# Close environment
env.close()
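# NOTE: `running_average` is used by the training loop and plots below. The
# implementation here is an assumed sketch (running mean over the last N
# entries, zeros until N entries exist); if the repository already provides
# its own helper, prefer that one.
def running_average(x, N):
    """Running average of the last N elements of the list/array x."""
    if len(x) >= N:
        y = np.copy(x)
        y[N - 1:] = np.convolve(x, np.ones(N) / N, mode='valid')
    else:
        y = np.zeros_like(x)
    return y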
def main():
    dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Using", dev)

    # Import and initialize the Lunar Lander environment
    env = gym.make('LunarLanderContinuous-v2')
    env.reset()

    # Parameters
    N_episodes = 300                              # Number of episodes to run for training
    discount_factor = 0.99                        # Value of gamma
    n_ep_running_average = 50                     # Running average of 50 episodes
    dim_state = len(env.observation_space.high)   # State dimensionality
    m = len(env.action_space.high)                # Dimensionality of the action
    lr_actor = 5e-5                               # Actor network learning rate
    lr_critic = 5e-4                              # Critic network learning rate
    d = 2                                         # Policy update frequency
    tau = 1e-3                                    # Soft-update constant tau
    mu = 0.15                                     # Noise parameters
    sigma = 0.2
    # L (replay buffer size) and N (training batch size) are assumed to be
    # module-level constants defined elsewhere in this repository.

    # Reward
    episode_reward_list = []       # Used to save episodes reward
    episode_number_of_steps = []

    # Agent initialization
    agent = AgentQ(m, dim_state, lr_actor, lr_critic, N_episodes,
                   discount_factor, mu, sigma, tau, dev)

    # Initialize the buffer by filling it with experiences from a random agent
    buffer = ExperienceReplayBuffer(maximum_length=L)
    random_agent = RandomAgent(m)
    state = env.reset()
    for _ in tqdm(range(L)):
        # Take a random action
        action = random_agent.forward(state)  # Compute a random action
        next_state, reward, done, _ = env.step(action)
        experience = (state, action, reward, next_state, done)  # Create the experience
        buffer.append(experience)  # Append the experience to the buffer
        state = next_state
        if done:
            state = env.reset()

    # Training process
    EPISODES = trange(N_episodes, desc='Episode: ', leave=True)

    for i in EPISODES:
        # Reset environment data and initialize variables
        done = False
        state = env.reset()
        total_episode_reward = 0.
        t = 0
        agent.n = np.zeros(agent.m)  # Reset the exploration noise in each episode
        while not done:
            # Take a noisy action from the actor network
            action = agent.forward(state, None, grad=False)

            # Get next state and reward. The done variable
            # will be True if you reached the goal position,
            # False otherwise
            next_state, reward, done, _ = env.step(action)

            experience = (state, action, reward, next_state, done)  # Create the experience
            buffer.append(experience)  # Append the experience to the buffer

            if len(buffer) >= N:
                # Sample N elements from the buffer
                states, actions, rewards, next_states, dones = buffer.sample_batch(n=N)
                actions = torch.tensor(actions, dtype=torch.float32, device=dev)
                mask = torch.tensor(np.multiply(dones, 1), device=dev).reshape(-1, 1)

                # Compute the TD targets from the target networks and update the critic
                Q_prime = agent.forward_target(next_states)
                rewards_tensor = torch.tensor(rewards, device=dev).reshape(-1, 1)
                targets = (rewards_tensor +
                           (1 - mask) * discount_factor * Q_prime).type(torch.float32)
                values = agent.forward(states, actions, grad=True)
                agent.backward(values, targets)

                # Delayed policy (actor) update every d steps
                if t % d == 0:
                    agent.policy_backward(states)

            # Update episode reward
            total_episode_reward += reward

            # Update state for next iteration
            state = next_state
            t += 1
            agent.noise()  # Update the exploration noise

        # Append episode reward and total number of steps
        episode_reward_list.append(total_episode_reward)
        episode_number_of_steps.append(t)

        # Close environment
        env.close()

        # Updates the tqdm update bar with fresh information
        # (episode number, total reward of the last episode, total number of steps
        # of the last episode, average reward, average number of steps)
        EPISODES.set_description(
            "Episode {} - Reward/Steps: {:.1f}/{} - Avg. Reward/Steps: {:.1f}/{}".format(
                i, total_episode_reward, t,
                running_average(episode_reward_list, n_ep_running_average)[-1],
                running_average(episode_number_of_steps, n_ep_running_average)[-1]))
    # Save networks
    torch.save(agent.actor_network, 'neural-network-2-actor.pth')
    torch.save(agent.critic_network, 'neural-network-2-critic.pth')

    # Plot rewards and steps
    fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(16, 9))
    episodes = [i for i in range(1, N_episodes + 1)]

    ax[0].plot(episodes, episode_reward_list, label='Episode reward')
    ax[0].plot(episodes,
               running_average(episode_reward_list, n_ep_running_average),
               label='Avg. episode reward')
    ax[0].set_xlabel('Episodes')
    ax[0].set_ylabel('Total reward')
    ax[0].set_title('Total Reward vs Episodes')
    ax[0].legend()
    ax[0].grid(alpha=0.3)

    ax[1].plot(episodes, episode_number_of_steps, label='Steps per episode')
    ax[1].plot(episodes,
               running_average(episode_number_of_steps, n_ep_running_average),
               label='Avg. number of steps per episode')
    ax[1].set_xlabel('Episodes')
    ax[1].set_ylabel('Total number of steps')
    ax[1].set_title('Total number of steps vs Episodes')
    ax[1].legend()
    ax[1].grid(alpha=0.3)

    plt.savefig('Result_problem2.png')
    plt.show()
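# NOTE: The helpers below are assumed sketches of what AgentQ's exploration
# noise update (agent.noise(), agent.n) and its target-network update with tau
# are expected to do, following the standard DDPG recipe with the mu, sigma and
# tau values used in main(). They are illustrative only; the actual AgentQ
# implementation in this repository may differ.


def ou_noise_step(n, mu, sigma):
    """One noise step n_t = -mu * n_{t-1} + w_t with w_t ~ N(0, sigma^2 I);
    the resulting vector is added to the actor's action during exploration."""
    return -mu * n + np.random.normal(0.0, sigma, size=n.shape)


def soft_update(target_network, network, tau):
    """Soft (Polyak) update of the target parameters:
    theta_target <- tau * theta + (1 - tau) * theta_target."""
    for target_param, param in zip(target_network.parameters(), network.parameters()):
        target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)


if __name__ == '__main__':
    main()
    # Optional sanity check of the trained actor (assumes the file saved above):
    # actor = torch.load('neural-network-2-actor.pth')
    # print('Average evaluation reward:', np.mean(run_sim(actor)))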