class Agent:
    """DQN agent supporting Double DQN, Dueling networks, noisy nets, n-step returns,
    prioritized replay and distributional (categorical) Q-learning."""

    def __init__(self,
                 action_spec: dm_env.specs.DiscreteArray,
                 observation_spec: dm_env.specs.Array,
                 device: torch.device,
                 settings: dict) -> None:
        """
        Initializes the agent, constructs the qnet and the q_target, initializes the optimizer and ReplayMemory.
        Args:
            action_spec(dm_env.specs.DiscreteArray): description of the action space of the environment
            observation_spec(dm_env.specs.Array): description of observations from the environment
            device(torch.device): device used for the computations ("cpu" or "cuda")
            settings(dict): dictionary with settings
        """
        self.device = device
        action_size = action_spec.num_values
        state_size = np.prod(observation_spec.shape)
        self.action_size = action_size
        self.state_size = state_size
        self.batch_size = settings['batch_size']
        self.noisy_nets = settings['qnet_settings']['noisy_nets']

        self.distributional = settings["qnet_settings"]["distributional"]
        if self.distributional:
            # Currently the distributional agent always uses Dueling DQN
            self.qnet = DistributionalDuelDQN(state_size, action_size, settings['qnet_settings'], device).to(device)
            self.q_target = DistributionalDuelDQN(state_size, action_size, settings['qnet_settings'], device).to(device)
            vmin, vmax = settings["qnet_settings"]["vmin"], settings["qnet_settings"]["vmax"]
            number_atoms = settings["qnet_settings"]["number_atoms"]
            self.distribution_updater = DistributionUpdater(vmin, vmax, number_atoms)
        else:
            if settings["duelling_dqn"]:
                self.qnet = DuelDQN(state_size, action_size, settings['qnet_settings']).to(device)
                self.q_target = DuelDQN(state_size, action_size, settings['qnet_settings']).to(device)
            else:
                self.qnet = Dqn(state_size, action_size, settings['qnet_settings']).to(device)
                self.q_target = Dqn(state_size, action_size, settings['qnet_settings']).to(device)

        self.q_target.load_state_dict(self.qnet.state_dict())
        self.optimizer = optim.Adam(self.qnet.parameters(), lr=settings['lr'])

        self.epsilon = settings["epsilon_start"]
        self.decay = settings["epsilon_decay"]
        self.epsilon_min = settings["epsilon_min"]
        self.gamma = settings['gamma']

        self.start_optimization = settings["start_optimization"]
        self.update_qnet_every = settings["update_qnet_every"]
        self.update_target_every = settings["update_target_every"]
        self.number_steps = 0
        self.ddqn = settings["ddqn"]

        # Initialize replay memory
        self.prioritized_replay = settings["prioritized_buffer"]
        if self.prioritized_replay:
            self.memory = PrioritizedReplayMemory(device, settings["buffer_size"], self.gamma, settings["n_steps"],
                                                  settings["alpha"], settings["beta0"], settings["beta_increment"])
        else:
            self.memory = ReplayMemory(device, settings["buffer_size"], self.gamma, settings["n_steps"])
        return

    def policy(self, timestep: dm_env.TimeStep) -> int:
        """
        Returns an action following an epsilon-greedy policy.
        Args:
            timestep(dm_env.TimeStep): The current timestep from the environment
        Returns:
            int: The chosen action.
        """
        observation = np.array(timestep.observation).flatten()
        observation = torch.from_numpy(observation).float().to(self.device)
        self.number_steps += 1

        if not self.noisy_nets:
            self.update_epsilon()

        if np.random.rand() < self.epsilon:
            return int(np.random.choice(self.action_size))
        else:
            return int(self.qnet.get_max_action(observation))

    def update_epsilon(self) -> None:
        """
        Decays epsilon until self.epsilon_min
        Returns:
            None
        """
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.decay

    @staticmethod
    def calc_loss(q_observed: torch.Tensor,
                  q_target: torch.Tensor,
                  weights: torch.Tensor) -> typing.Tuple[torch.Tensor, np.ndarray]:
        """
        Returns the mean weighted MSE loss and the loss for each sample
        Args:
            q_observed(torch.Tensor): calculated q_value
            q_target(torch.Tensor): target q-value
            weights: weights of the batch samples
        Returns:
            tuple(torch.Tensor, np.ndarray): mean squared error loss, loss for each individual sample
        """
        losses = functional.mse_loss(q_observed, q_target, reduction='none')
        loss = (weights * losses).sum() / weights.sum()
        return loss, losses.cpu().detach().numpy() + 1e-8

    @staticmethod
    def calc_distributional_loss(dist: torch.Tensor,
                                 proj_dist: torch.Tensor,
                                 weights: torch.Tensor,
                                 ) -> typing.Tuple[torch.Tensor, np.ndarray]:
        """
        Calculates the distributional loss (cross-entropy between the projected target
        distribution and the predicted distribution).
        Args:
            dist(torch.Tensor): The observed distribution (logits)
            proj_dist: The projected target distribution
            weights: weights of the batch samples
        Returns:
            tuple(torch.Tensor, np.ndarray): mean cross-entropy loss, loss for each individual sample
        """
        losses = -functional.log_softmax(dist, dim=1) * proj_dist
        losses = weights * losses.sum(dim=1)
        return losses.mean(), losses.cpu().detach().numpy() + 1e-8

    def update(self, step: dm_env.TimeStep, action: int, next_step: dm_env.TimeStep) -> None:
        """
        Adds experience to the replay memory, performs an optimization step and updates the q_target neural network.
        Args:
            step(dm_env.TimeStep): Current observation from the environment
            action(int): The action that was performed by the agent.
            next_step(dm_env.TimeStep): Next observation from the environment
        Returns:
            None
        """
        observation = np.array(step.observation).flatten()
        next_observation = np.array(next_step.observation).flatten()
        done = next_step.last()
        exp = Experience(observation, action, next_step.reward, next_step.discount, next_observation, 0, done)
        self.memory.add(exp)

        if self.memory.number_samples() < self.start_optimization:
            return

        if self.number_steps % self.update_qnet_every == 0:
            s0, a0, n_step_reward, discount, s1, _, dones, indices, weights = self.memory.sample_batch(self.batch_size)
            if not self.distributional:
                self.optimization_step(s0, a0, n_step_reward, discount, s1, indices, weights)
            else:
                self.distributional_optimization_step(s0, a0, n_step_reward, discount, s1, dones, indices, weights)

        if self.number_steps % self.update_target_every == 0:
            self.q_target.load_state_dict(self.qnet.state_dict())
        return

    def optimization_step(self,
                          s0: torch.Tensor,
                          a0: torch.Tensor,
                          n_step_reward: torch.Tensor,
                          discount: torch.Tensor,
                          s1: torch.Tensor,
                          indices: typing.Optional[torch.Tensor],
                          weights: typing.Optional[torch.Tensor]) -> None:
        """
        Calculates the Bellman update and updates the qnet.
        Args:
            s0(torch.Tensor): current state
            a0(torch.Tensor): current action
            n_step_reward(torch.Tensor): n-step reward
            discount(torch.Tensor): discount factor
            s1(torch.Tensor): next state
            indices(torch.Tensor): batch indices, needed for prioritized replay. Not used yet.
            weights(torch.Tensor): weights needed for prioritized replay
        Returns:
            None
        """
        with torch.no_grad():
            if self.noisy_nets:
                self.q_target.reset_noise()
                self.qnet.reset_noise()

            # Calculating the target values
            next_q_vals = self.q_target(s1)
            if self.ddqn:
                a1 = torch.argmax(self.qnet(s1), dim=1).unsqueeze(-1)
                next_q_val = next_q_vals.gather(1, a1).squeeze()
            else:
                next_q_val = torch.max(next_q_vals, dim=1).values
            q_target = n_step_reward.squeeze() + self.gamma * discount.squeeze() * next_q_val

        # Getting the observed q-values
        if self.noisy_nets:
            self.qnet.reset_noise()
        q_observed = self.qnet(s0).gather(1, a0.long()).squeeze()

        # Calculating the losses
        if not self.prioritized_replay:
            weights = torch.ones(self.batch_size, device=self.device)
        critic_loss, batch_loss = self.calc_loss(q_observed, q_target, weights)

        # Backpropagation of the gradients
        self.optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.qnet.parameters(), 5)
        self.optimizer.step()

        # Update replay memory
        self.memory.update_priorities(indices, batch_loss)
        return

    def distributional_optimization_step(self,
                                         s0: torch.Tensor,
                                         a0: torch.Tensor,
                                         n_step_reward: torch.Tensor,
                                         discount: torch.Tensor,
                                         s1: torch.Tensor,
                                         dones: torch.Tensor,
                                         indices: typing.Optional[torch.Tensor],
                                         weights: typing.Optional[torch.Tensor]) -> None:
        """
        Calculates the Bellman update and updates the qnet for the distributional agent.
        Args:
            s0(torch.Tensor): current state
            a0(torch.Tensor): current action
            n_step_reward(torch.Tensor): n-step reward
            discount(torch.Tensor): discount factor
            s1(torch.Tensor): next state
            dones(torch.Tensor): done flags of the batch samples
            indices(torch.Tensor): batch indices, needed for prioritized replay. Not used yet.
            weights(torch.Tensor): weights needed for prioritized replay
        Returns:
            None
        """
        with torch.no_grad():
            gamma = self.gamma * discount
            if self.noisy_nets:
                self.q_target.reset_noise()
                self.qnet.reset_noise()

            # Calculating the target distributions
            next_dists, next_q_vals = self.q_target.calc(s1)
            if self.ddqn:
                a1 = self.qnet.get_max_action(s1)
            else:
                a1 = torch.argmax(next_q_vals, dim=1)  # indices of the greedy actions
            distributions = next_dists[range(self.batch_size), a1]
            distributions = functional.softmax(distributions, dim=1)

            # Project the target distribution onto the fixed support
            q_target = self.distribution_updater.update_distribution(
                distributions.cpu().detach().numpy(),
                n_step_reward.cpu().detach().numpy(),
                dones.cpu().detach().numpy(),
                gamma.cpu().detach().numpy())
            q_target = torch.tensor(q_target).to(self.device)

        # Getting the observed q-value distributions
        if self.noisy_nets:
            self.qnet.reset_noise()
        q_observed = self.qnet(s0)
        q_observed = q_observed[range(self.batch_size), a0.squeeze().long()]

        # Calculating the losses
        if not self.prioritized_replay:
            weights = torch.ones(self.batch_size, device=self.device)
        critic_loss, batch_loss = self.calc_distributional_loss(q_observed, q_target, weights)

        # Backpropagation of the gradients
        self.optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.qnet.parameters(), 5)
        self.optimizer.step()

        # Update replay memory
        self.memory.update_priorities(indices, batch_loss)
        return
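# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module). It shows one
# plausible way to drive Agent against a dm_env-style environment. The helper
# name `run_dqn_example`, the `make_env` callable and all concrete setting
# values are assumptions; only the dictionary keys correspond to what
# Agent.__init__ actually reads, and `qnet_settings` will typically need
# further keys required by the Q-network classes (layer sizes etc.).
# ---------------------------------------------------------------------------
def run_dqn_example(make_env, device, number_episodes=10):
    settings = {
        "batch_size": 32,
        "lr": 1e-4,
        "gamma": 0.99,
        "epsilon_start": 1.0,
        "epsilon_decay": 0.999,
        "epsilon_min": 0.05,
        "start_optimization": 1000,
        "update_qnet_every": 4,
        "update_target_every": 1000,
        "ddqn": True,
        "duelling_dqn": True,
        "prioritized_buffer": False,
        "buffer_size": 100000,
        "n_steps": 3,
        "alpha": 0.6,            # only read when prioritized_buffer is True
        "beta0": 0.4,            # only read when prioritized_buffer is True
        "beta_increment": 1e-5,  # only read when prioritized_buffer is True
        "qnet_settings": {
            "noisy_nets": False,
            "distributional": False,
            # "vmin", "vmax" and "number_atoms" are only read when
            # "distributional" is True.
        },
    }
    env = make_env()  # assumed to return a dm_env.Environment
    agent = Agent(env.action_spec(), env.observation_spec(), device, settings)

    for _ in range(number_episodes):
        timestep = env.reset()
        while not timestep.last():
            action = agent.policy(timestep)
            next_timestep = env.step(action)
            agent.update(timestep, action, next_timestep)
            timestep = next_timestep
    return agent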
class T3DAgent:
    """TD3 (Twin Delayed DDPG) agent for continuous-control Unity ML-Agents environments."""

    def __init__(self, env, brain, brain_name, device, settings):
        """
        Initializes the agent, constructs the actor and critic networks together with
        their target copies, the optimizers and the replay memory.
        Args:
            env: the environment (a Unity ML-Agents style interface is assumed by the reset/step calls)
            brain: brain whose vector_action_space_size defines the action dimension
            brain_name(str): name of the brain used to index the environment info
            device(torch.device): device used for the computations
            settings(dict): dictionary with settings
        """
        self.env = env
        self.brain_name = brain_name
        self.device = device
        action_size = brain.vector_action_space_size
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        state_size = states.shape[1]
        self.action_size = action_size
        self.state_size = state_size
        self.batch_size = settings['batch_size']

        # Initialize actor local and target networks
        self.actor_local = Actor(state_size, action_size, settings['actor_settings']).to(device)
        self.actor_target = Actor(state_size, action_size, settings['actor_settings']).to(device)
        self.actor_target.load_state_dict(self.actor_local.state_dict())
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=settings['lr_actor'])

        # Initialize critic networks
        self.critic_local = Critic(state_size, action_size, settings['critic_settings']).to(device)
        self.critic_target = Critic(state_size, action_size, settings['critic_settings']).to(device)
        self.critic_target.load_state_dict(self.critic_local.state_dict())
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=settings['lr_critic'])

        # Save some of the settings into class member variables
        self.pretrain_steps = settings['pretrain_steps']
        self.gamma = settings['gamma']
        self.tau = settings['tau']
        self.action_noise = settings['action_noise']
        self.action_clip = settings['action_clip']
        self.target_action_noise = settings['target_action_noise']
        self.target_noise_clip = settings['target_noise_clip']
        self.optimize_every = settings['optimize_critic_every']

        # Initialize replay memory and episode generator
        self.memory = ReplayMemory(device, settings['buffer_size'])
        self.generator = self.play_episode()
        self.number_steps = 0
        return

    def get_action_noise(self):
        return self.action_noise

    def set_action_noise(self, std):
        self.action_noise = std
        return

    def pretrain(self):
        """Fills the replay memory with experience gathered from uniformly random actions."""
        # The idea of using a pretrain phase before starting regular episodes
        # is from https://github.com/whiterabbitobj/Continuous_Control/
        print("Random sampling of " + str(self.pretrain_steps) + " steps")
        env = self.env
        brain_name = self.brain_name
        env_info = env.reset(train_mode=True)[brain_name]
        number_agents = env_info.vector_observations.shape[0]

        for _ in range(self.pretrain_steps):
            actions = []
            states = env_info.vector_observations
            for _ in range(number_agents):
                actions.append(np.random.uniform(-1, 1, self.action_size))
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done
            for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones):
                self.memory.add(Experience(state, action, reward, next_state, done))
            if np.any(dones):
                env_info = env.reset(train_mode=True)[brain_name]

    def play_episode(self, train_mode=True):
        """Generator that steps the environment with the current policy plus exploration noise.
        Yields (mean_reward, std_reward) over agents when an episode finishes and (-1, -1) otherwise."""
        # The idea of generating episodes in an "experience generator" is from
        # "Deep Reinforcement Learning Hands-On" by Maxim Lapan
        print("Starting episode generator")

        # Initialize the environment
        env = self.env
        brain_name = self.brain_name
        env_info = env.reset(train_mode=train_mode)[brain_name]

        # Initialize episode_rewards and get the first state
        episode_rewards = []

        # Run episode step by step
        while True:
            states = env_info.vector_observations
            with torch.no_grad():
                actions = self.actor_local(
                    torch.from_numpy(states).float().to(self.device)).cpu().detach().numpy()
            actions += self.action_noise * np.random.normal(size=actions.shape)
            actions = np.clip(actions, -self.action_clip, self.action_clip)

            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done
            episode_rewards.append(rewards)
            for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones):
                self.memory.add(Experience(state, action, reward, next_state, done))

            if np.any(dones):
                agent_reward = np.sum(episode_rewards, axis=0)
                std_reward = np.std(agent_reward)
                mean_reward = np.mean(agent_reward)
                episode_rewards = []
                env_info = env.reset(train_mode=True)[brain_name]
                yield mean_reward, std_reward
            else:
                yield -1, -1

    def take_step(self, train_mode=True):
        """Advances the episode generator by one environment step."""
        return next(self.generator, train_mode)

    def learn(self):
        """Samples a batch from the replay memory and performs one critic (and possibly actor) update."""
        self.number_steps += 1
        if self.memory.number_samples() <= self.batch_size:
            return

        # states, actions, rewards, next states, dones
        s0, a0, r, s1, d = self.memory.sample_batch(self.batch_size)
        critic_loss_a, critic_loss_b = self.optimize_critic(s0, a0, r, s1, d)
        actor_loss = self.optimize_actor(s0)
        return actor_loss, critic_loss_a, critic_loss_b

    def optimize_actor(self, s0):
        """Delayed policy update: actor and target networks are only updated every optimize_every steps."""
        # Calc policy loss
        if self.number_steps % self.optimize_every == 0:
            a0_pred = self.actor_local(s0)
            actor_loss = -self.critic_local.get_qa(s0, a0_pred).mean()

            # Update actor nn
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # Slow update of the target networks
            self.slow_update(self.tau)
            return -actor_loss.cpu().detach().numpy()
        return 0

    def optimize_critic(self, s0, a0, r, s1, d):
        """Performs one TD3 critic update using clipped double-Q targets and target policy smoothing."""
        # The ideas of adding noise to the next action a1, and of a critic loss that uses qa_expected and
        # qb_expected at the same time, are from the implementation of the authors of the TD3 manuscript
        # at https://github.com/sfujim/TD3/
        with torch.no_grad():
            # Calculate the smoothed target action and the clipped double-Q target
            noise = torch.randn_like(a0).to(self.device)
            noise = noise * torch.tensor(self.target_action_noise).expand_as(noise).to(self.device)
            noise = noise.clamp(-self.target_noise_clip, self.target_noise_clip)
            a1 = (self.actor_target(s1) + noise).clamp(-self.action_clip, self.action_clip)

            qa_target, qb_target = self.critic_target(s1, a1)
            q_target = torch.min(qa_target, qb_target)
            q_target = r + self.gamma * (1.0 - d) * q_target

        qa_expected, qb_expected = self.critic_local(s0, a0)
        critic_loss_a = functional.mse_loss(qa_expected, q_target)
        critic_loss_b = functional.mse_loss(qb_expected, q_target)
        critic_loss = critic_loss_a + critic_loss_b

        # Update critic nn
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        return critic_loss_a.cpu().detach().numpy(), critic_loss_b.cpu().detach().numpy()

    def slow_update(self, tau):
        """Soft update of the target networks: target <- tau * local + (1 - tau) * target."""
        for target_par, local_par in zip(self.actor_target.parameters(), self.actor_local.parameters()):
            target_par.data.copy_(tau * local_par.data + (1.0 - tau) * target_par.data)
        for target_par, local_par in zip(self.critic_target.parameters(), self.critic_local.parameters()):
            target_par.data.copy_(tau * local_par.data + (1.0 - tau) * target_par.data)
        return

    def load_nets(self, actor_file_path, critic_file_path):
        """Loads the actor and critic network weights and sets them to evaluation mode."""
        self.actor_local.load_state_dict(torch.load(actor_file_path))
        self.actor_local.eval()
        self.critic_local.load_state_dict(torch.load(critic_file_path))
        self.critic_local.eval()
        return

    def save_nets(self, model_save_path):
        """Saves the actor and critic network weights."""
        actor_path = model_save_path + "_actor_net.pt"
        torch.save(self.actor_local.state_dict(), actor_path)
        critic_path = model_save_path + "_critic_net.pt"
        torch.save(self.critic_local.state_dict(), critic_path)
        return
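# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module). It shows one
# plausible training loop for T3DAgent. The helper name `run_td3_example`,
# the way the brain is obtained and all concrete setting values are
# assumptions based on the legacy Unity ML-Agents interface this class uses;
# `actor_settings` and `critic_settings` must additionally contain whatever
# keys the Actor and Critic classes expect (e.g. hidden layer sizes).
# ---------------------------------------------------------------------------
def run_td3_example(env, device, number_episodes=100):
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    settings = {
        "batch_size": 128,
        "buffer_size": 1000000,
        "lr_actor": 1e-4,
        "lr_critic": 1e-3,
        "gamma": 0.99,
        "tau": 1e-3,
        "pretrain_steps": 1000,
        "action_noise": 0.1,
        "action_clip": 1.0,
        "target_action_noise": 0.2,
        "target_noise_clip": 0.5,
        "optimize_critic_every": 2,
        "actor_settings": {},   # fill in with the keys Actor expects
        "critic_settings": {},  # fill in with the keys Critic expects
    }
    agent = T3DAgent(env, brain, brain_name, device, settings)
    agent.pretrain()  # fill the replay memory with random experience first

    for episode in range(number_episodes):
        while True:
            # The generator yields (-1, -1) until an episode finishes
            mean_reward, std_reward = agent.take_step()
            agent.learn()
            if mean_reward != -1:
                print("episode {}: mean reward {:.2f} (std {:.2f})".format(episode, mean_reward, std_reward))
                break
    return agent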