def __init__(self, env: gym.Env):
    self.env = env
    self.observations = self.env.observation_space.shape
    self.actions = self.env.action_space.n
    self.gamma = 0.95
    self.learning_rate_actor = 1e-3  # 0.001
    self.learning_rate_critic = 5e-3  # 0.005
    self.actor = Actor(self.observations, self.actions, self.learning_rate_actor)
    # Note: in this first version the critic is still given the action count;
    # the later listings replace it with a single value output (num_values = 1).
    self.critic = Critic(self.observations, self.actions, self.learning_rate_critic)
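A quick check of the sizes these hyperparameters resolve to, assuming the environment is CartPole-v1 (the 500-step reward cap in the later training loops suggests this is the target environment):

# Assumed environment: CartPole-v1.
import gym

env = gym.make("CartPole-v1")
print(env.observation_space.shape)  # (4,)
print(env.action_space.n)           # 2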
def __init__(self, env: gym.Env):
    self.env = env
    self.num_observations = self.env.observation_space.shape
    self.num_actions = self.env.action_space.n
    self.num_values = 1
    self.gamma = 0.95
    self.learning_rate_actor = 1e-3  # 0.001
    self.learning_rate_critic = 5e-3  # 0.005
    self.actor = Actor(self.num_observations, self.num_actions, self.learning_rate_actor)
    self.critic = Critic(self.num_observations, self.num_values, self.learning_rate_critic)
    self.batch_size = 16
    self.memory_size = 16
    self.memory = collections.deque(maxlen=self.memory_size)
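The listings here rely on a handful of imports, the ACTOR_PATH/CRITIC_PATH constants, and Actor/Critic wrapper classes that are not shown. A minimal sketch of those pieces, assuming a TensorFlow/Keras setup; layer sizes, activations, and file names are assumptions for illustration, not the original implementation:

# Minimal sketch of the pieces the Agent listings assume (illustrative only).
import collections
import random
from typing import Deque

import gym
import matplotlib.pyplot as plt
import numpy as np
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

ACTOR_PATH = "actor_cartpole.h5"    # assumed file name
CRITIC_PATH = "critic_cartpole.h5"  # assumed file name


class Actor:
    """Policy network: state -> action probabilities."""

    def __init__(self, num_observations, num_actions, learning_rate):
        input_ = Input(shape=num_observations)
        x = Dense(24, activation="relu")(input_)
        x = Dense(24, activation="relu")(x)
        output = Dense(num_actions, activation="softmax")(x)
        self.model = Model(inputs=input_, outputs=output)
        # Cross-entropy against the advantage-weighted action slot gives the
        # policy-gradient signal -advantage * log pi(a|s).
        self.model.compile(loss="categorical_crossentropy",
                           optimizer=Adam(learning_rate=learning_rate))

    def __call__(self, state):
        return self.model.predict(state, verbose=0)

    def fit(self, states, advantages):
        self.model.fit(states, advantages, verbose=0)

    def save_model(self, path):
        self.model.save_weights(path)

    def load_model(self, path):
        self.model.load_weights(path)


class Critic:
    """Value network: state -> scalar state value V(s)."""

    def __init__(self, num_observations, num_values, learning_rate):
        input_ = Input(shape=num_observations)
        x = Dense(24, activation="relu")(input_)
        x = Dense(24, activation="relu")(x)
        output = Dense(num_values, activation="linear")(x)
        self.model = Model(inputs=input_, outputs=output)
        self.model.compile(loss="mse", optimizer=Adam(learning_rate=learning_rate))

    def __call__(self, state):
        return self.model.predict(state, verbose=0)

    def fit(self, states, values):
        self.model.fit(states, values, verbose=0)

    def save_model(self, path):
        self.model.save_weights(path)

    def load_model(self, path):
        self.model.load_weights(path)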
class Agent:
    def __init__(self, env: gym.Env):
        self.env = env
        self.observations = self.env.observation_space.shape
        self.actions = self.env.action_space.n
        self.gamma = 0.95
        self.learning_rate_actor = 1e-3  # 0.001
        self.learning_rate_critic = 5e-3  # 0.005
        self.actor = Actor(self.observations, self.actions, self.learning_rate_actor)
        self.critic = Critic(self.observations, self.actions, self.learning_rate_critic)

    def get_action(self, state: np.ndarray):
        # Still the epsilon-greedy selection from the earlier DQN agent; it relies on
        # self.epsilon and self.dqn, which this constructor no longer defines.
        if np.random.rand() <= self.epsilon:
            return np.random.randint(self.actions)
        else:
            return np.argmax(self.dqn(state))

    def train(self, num_episodes: int):
        last_rewards: Deque = collections.deque(maxlen=5)
        best_reward_mean = 0.0
        for episode in range(1, num_episodes + 1):
            total_reward = 0.0
            state = self.env.reset()
            state = np.reshape(state, newshape=(1, -1)).astype(np.float32)
            while True:
                action = self.get_action(state)
                next_state, reward, done, _ = self.env.step(action)
                next_state = np.reshape(next_state, newshape=(1, -1)).astype(np.float32)
                if done and total_reward < 499:
                    reward = -100.0
                self.remember(state, action, reward, next_state, done)
                self.replay()
                total_reward += reward
                state = next_state
                if done:
                    if total_reward < 500:
                        total_reward += 100.0
                    print(f"Episode: {episode} Reward: {total_reward} Epsilon: {self.epsilon}")
                    last_rewards.append(total_reward)
                    current_reward_mean = np.mean(last_rewards)
                    if current_reward_mean > best_reward_mean:
                        best_reward_mean = current_reward_mean
                        self.actor.save_model(ACTOR_PATH)
                        self.critic.save_model(CRITIC_PATH)
                        print(f"New best mean: {best_reward_mean}")
                        if best_reward_mean > 400:
                            return
                    break

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def replay(self):
        # Still the DQN replay step: it needs self.memory, self.train_start,
        # self.batch_size, self.dqn and self.target_dqn from the previous agent.
        if len(self.memory) < self.train_start:
            return
        minibatch = random.sample(self.memory, self.batch_size)
        states, actions, rewards, states_next, dones = zip(*minibatch)
        states = np.concatenate(states).astype(np.float32)
        states_next = np.concatenate(states_next).astype(np.float32)
        q_values = self.dqn(states)
        q_values_next = self.target_dqn(states_next)
        for i in range(self.batch_size):
            a = actions[i]
            done = dones[i]
            if done:
                q_values[i][a] = rewards[i]
            else:
                q_values[i][a] = rewards[i] + self.gamma * np.max(q_values_next[i])
        self.dqn.fit(states, q_values)

    def play(self, num_episodes: int, render: bool = True):
        self.actor.load_model(ACTOR_PATH)
        self.critic.load_model(CRITIC_PATH)
        for episode in range(1, num_episodes + 1):
            total_reward = 0.0
            state = self.env.reset()
            state = np.reshape(state, newshape=(1, -1)).astype(np.float32)
            while True:
                if render:
                    self.env.render()
                action = self.get_action(state)
                next_state, reward, done, _ = self.env.step(action)
                next_state = np.reshape(next_state, newshape=(1, -1)).astype(np.float32)
                total_reward += reward
                state = next_state
                if done:
                    print(f"Episode: {episode} Reward: {total_reward}")
                    break
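The replay() method in this transitional listing still builds the usual DQN regression target: the Q-value of the taken action is pushed toward r + gamma * max_a' Q_target(s', a'). A tiny numeric sketch of that target with made-up values:

# Illustrative DQN target as built in replay() above (made-up numbers).
import numpy as np

gamma = 0.95
reward = 1.0
q_values_next = np.array([0.4, 0.7])                  # target-network output for s'
td_target = reward + gamma * np.max(q_values_next)    # 1.0 + 0.95 * 0.7 = 1.665
print(td_target)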
class Agent:
    def __init__(self, env: gym.Env):
        self.env = env
        self.num_observations = self.env.observation_space.shape
        self.num_actions = self.env.action_space.n
        self.num_values = 1
        self.gamma = 0.95
        self.learning_rate_actor = 1e-3  # 0.001
        self.learning_rate_critic = 5e-3  # 0.005
        self.actor = Actor(
            self.num_observations, self.num_actions, self.learning_rate_actor
        )
        self.critic = Critic(
            self.num_observations, self.num_values, self.learning_rate_critic
        )

    def get_action(self, state: np.ndarray):
        # Sample an action from the actor's softmax policy instead of epsilon-greedy.
        policy = self.actor(state)[0]
        action = np.random.choice(self.num_actions, p=policy)
        return action

    def update_policy(self, state, action, reward, next_state, done):
        values = np.zeros(shape=(1, self.num_values))  # (1, 1)
        advantages = np.zeros(shape=(1, self.num_actions))  # (1, 2)
        value = self.critic(state)[0]
        next_value = self.critic(next_state)[0]
        if done:
            advantages[0][action] = reward - value
            values[0][0] = reward
        else:
            # Advantage = TD target minus V(s); the critic regresses toward the TD target.
            advantages[0][action] = (reward + self.gamma * next_value) - value
            values[0][0] = reward + self.gamma * next_value
        self.actor.fit(state, advantages)
        self.critic.fit(state, values)

    def train(self, num_episodes: int):
        last_rewards: Deque = collections.deque(maxlen=5)
        for episode in range(1, num_episodes + 1):
            total_reward = 0.0
            state = self.env.reset()
            state = np.reshape(state, newshape=(1, -1)).astype(np.float32)
            while True:
                action = self.get_action(state)
                next_state, reward, done, _ = self.env.step(action)
                next_state = np.reshape(next_state, newshape=(1, -1)).astype(np.float32)
                if done and total_reward < 499:
                    reward = -100.0
                self.update_policy(state, action, reward, next_state, done)
                total_reward += reward
                state = next_state
                if done:
                    if total_reward < 500:
                        total_reward += 100.0
                    last_rewards.append(total_reward)
                    current_reward_mean = np.mean(last_rewards)
                    print(f"Episode: {episode} Reward: {total_reward} MeanReward: {current_reward_mean}")
                    if current_reward_mean > 400:
                        self.actor.save_model(ACTOR_PATH)
                        self.critic.save_model(CRITIC_PATH)
                        return
                    break
        self.actor.save_model(ACTOR_PATH)
        self.critic.save_model(CRITIC_PATH)

    def play(self, num_episodes: int, render: bool = True):
        self.actor.load_model(ACTOR_PATH)
        self.critic.load_model(CRITIC_PATH)
        for episode in range(1, num_episodes + 1):
            total_reward = 0.0
            state = self.env.reset()
            state = np.reshape(state, newshape=(1, -1)).astype(np.float32)
            while True:
                if render:
                    self.env.render()
                action = self.get_action(state)
                next_state, reward, done, _ = self.env.step(action)
                next_state = np.reshape(next_state, newshape=(1, -1)).astype(np.float32)
                total_reward += reward
                state = next_state
                if done:
                    print(f"Episode: {episode} Reward: {total_reward}")
                    break
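For a single non-terminal transition, update_policy builds the TD target for the critic and the advantage for the actor. A small numeric sketch of that calculation; the values are made up for illustration:

# Worked example of the targets computed in update_policy (illustrative numbers).
gamma = 0.95
reward = 1.0
value = 0.8        # critic estimate V(s)
next_value = 0.9   # critic estimate V(s')

td_target = reward + gamma * next_value  # 1.855 -> critic regression target
advantage = td_target - value            # 1.055 -> actor's signal for the taken action
print(td_target, advantage)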
class Agent:
    def __init__(self, env: gym.Env):
        self.env = env
        self.num_observations = self.env.observation_space.shape
        self.num_actions = self.env.action_space.n
        self.num_values = 1
        self.gamma = 0.95
        self.learning_rate_actor = 1e-3  # 0.001
        self.learning_rate_critic = 5e-3  # 0.005
        self.actor = Actor(self.num_observations, self.num_actions, self.learning_rate_actor)
        self.critic = Critic(self.num_observations, self.num_values, self.learning_rate_critic)
        self.batch_size = 16
        self.memory_size = 16
        self.memory = collections.deque(maxlen=self.memory_size)

    def get_action(self, state: np.ndarray):
        # Sample an action from the actor's softmax policy.
        policy = self.actor(state)[0]
        action = np.random.choice(self.num_actions, p=policy)
        return action

    def update_policy(self):
        # Batched update over the collected transitions; see
        # https://github.com/cedricmoullet/CAS_AI_2020_2021/blob/main/20210202_DQN/cartPoleDqnAgent.py
        values = np.zeros(shape=(self.batch_size, self.num_values))  # (16, 1)
        advantages = np.zeros(shape=(self.batch_size, self.num_actions))  # (16, 2)
        states, actions, rewards, states_next, dones = zip(*self.memory)
        states = np.concatenate(states)
        states_next = np.concatenate(states_next)
        value = self.critic(states)
        next_value = self.critic(states_next)
        for i in range(self.batch_size):
            action = actions[i]
            done = dones[i]
            if done:
                advantages[i][action] = rewards[i] - value[i]
                values[i][0] = rewards[i]
            else:
                advantages[i][action] = (rewards[i] + self.gamma * next_value[i]) - value[i]
                # Critic regresses toward the TD target r + gamma * V(s'),
                # as in the single-step version above.
                values[i][0] = rewards[i] + self.gamma * next_value[i]
        self.actor.fit(states, advantages)
        self.critic.fit(states, values)

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def train(self, num_episodes: int):
        last_rewards: Deque = collections.deque(maxlen=5)
        all_rewards = []
        for episode in range(1, num_episodes + 1):
            self.memory.clear()
            total_reward = 0.0
            state = self.env.reset()
            state = np.reshape(state, newshape=(1, -1)).astype(np.float32)
            mem = 0
            while True:
                action = self.get_action(state)
                next_state, reward, done, _ = self.env.step(action)
                next_state = np.reshape(next_state, newshape=(1, -1)).astype(np.float32)
                if done and total_reward < 499:
                    reward = -100.0
                self.remember(state, action, reward, next_state, done)
                mem += 1
                if mem == self.memory_size:
                    # Update once the buffer holds a full batch, then start collecting again.
                    self.update_policy()
                    mem = 0
                    self.memory.clear()
                total_reward += reward
                state = next_state
                if done:
                    if total_reward < 500:
                        total_reward += 100.0
                    last_rewards.append(total_reward)
                    current_reward_mean = np.mean(last_rewards)
                    all_rewards.append(total_reward)
                    print(f"Episode: {episode} Reward: {total_reward} MeanReward: {current_reward_mean}")
                    if current_reward_mean > 400:
                        self.actor.save_model(ACTOR_PATH)
                        self.critic.save_model(CRITIC_PATH)
                        return
                    break
        self.actor.save_model(ACTOR_PATH)
        self.critic.save_model(CRITIC_PATH)
        plt.plot(all_rewards)
        plt.show()

    def play(self, num_episodes: int, render: bool = True):
        self.actor.load_model(ACTOR_PATH)
        self.critic.load_model(CRITIC_PATH)
        episode_rewards = []
        for episode in range(1, num_episodes + 1):
            total_reward = 0.0
            state = self.env.reset()
            state = np.reshape(state, newshape=(1, -1)).astype(np.float32)
            while True:
                if render:
                    self.env.render()
                action = self.get_action(state)
                next_state, reward, done, _ = self.env.step(action)
                next_state = np.reshape(next_state, newshape=(1, -1)).astype(np.float32)
                total_reward += reward
                state = next_state
                if done:
                    print(f"Episode: {episode} Reward: {total_reward}")
                    episode_rewards.append(total_reward)
                    break
        print(f"Mean reward from {num_episodes} episodes = {np.mean(episode_rewards)}")
        print(f"Best reward = {np.max(episode_rewards)}")
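A minimal entry point for training and then evaluating the agent might look like the following; the environment id and the episode counts are assumptions, not taken from the listings above:

# Hypothetical entry point (environment id and episode counts are assumptions).
if __name__ == "__main__":
    env = gym.make("CartPole-v1")
    agent = Agent(env)
    agent.train(num_episodes=1000)
    agent.play(num_episodes=10, render=True)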