class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, random_seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network 1 (w/ Target Network1) self.critic1_local = Critic(state_size, action_size, random_seed).to(device) self.critic1_target = Critic(state_size, action_size, random_seed).to(device) self.critic1_optimizer = optim.Adam(self.critic1_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Critic Network 2 (w/ Target Network2) self.critic2_local = Critic(state_size, action_size, random_seed).to(device) self.critic2_target = Critic(state_size, action_size, random_seed).to(device) self.critic2_optimizer = optim.Adam(self.critic2_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Noise process self.noise = OUNoise(action_size) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) # Initialize time step (for updating every UPDATE_EVERY and LEARN_EVERY steps) self.t_step = 0 def step(self, states, actions, rewards, next_states, dones): """Save experience in replay memory.""" for state, action, reward, next_state, done in zip( states, actions, rewards, next_states, dones): self.memory.add(state, action, reward, next_state, done) # Learn, if enough samples are available in memory if len(self.memory) > BATCH_SIZE: self.learn() def act(self, state): """Returns actions for given states as per current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() action += self.noise.sample() return np.clip(action, -1, 1) def reset(self): self.noise.reset() def learn(self): """Update policy and value parameters using given batch of experience tuples. Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value """ self.t_step += 1 states, actions, rewards, next_states, dones = self.memory.sample() # ---------------------------- update critic ---------------------------- # # Target Policy Smoothing Regularization: add a small amount of clipped random noises to the selected action if POLICY_NOISE > 0.0: noise = torch.empty_like(actions).data.normal_( 0, POLICY_NOISE).to(device) noise = noise.clamp(-POLICY_NOISE_CLIP, POLICY_NOISE_CLIP) # Get predicted next-state actions and Q values from target models actions_next = (self.actor_target(next_states) + noise).clamp( -1., 1.) 
else: # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) # Clipped double-Q: take the minimum of the two target critics to mitigate overestimation Q1_target = self.critic1_target(next_states, actions_next) Q2_target = self.critic2_target(next_states, actions_next) Q_targets_next = torch.min(Q1_target, Q2_target) # Compute Q targets for current states (y_i) Q_targets = rewards + (1 - dones) * GAMMA * Q_targets_next # Compute critic1 loss Q1_expected = self.critic1_local(states, actions) critic1_loss = F.mse_loss(Q1_expected, Q_targets) # Minimize the loss self.critic1_optimizer.zero_grad() critic1_loss.backward(retain_graph=True) torch.nn.utils.clip_grad_norm_(self.critic1_local.parameters(), 1) self.critic1_optimizer.step() # Compute critic2 loss Q2_expected = self.critic2_local(states, actions) critic2_loss = F.mse_loss(Q2_expected, Q_targets) # Minimize the loss self.critic2_optimizer.zero_grad() critic2_loss.backward(retain_graph=True) torch.nn.utils.clip_grad_norm_(self.critic2_local.parameters(), 1) self.critic2_optimizer.step() # Delayed Policy Updates if self.t_step % UPDATE_ACTOR_EVERY == 0: # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic1_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic1_local, self.critic1_target, TAU) self.soft_update(self.critic2_local, self.critic2_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) def save_models(self): torch.save(self.actor_local.state_dict(), actor_solved_model) torch.save(self.critic1_local.state_dict(), critic1_solved_model) torch.save(self.critic2_local.state_dict(), critic2_solved_model)
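# --- Hedged sketch: module-level hyperparameters assumed by the TD3-style Agent above. The class
# references these names (BUFFER_SIZE, BATCH_SIZE, GAMMA, TAU, LR_ACTOR, LR_CRITIC, WEIGHT_DECAY,
# POLICY_NOISE, POLICY_NOISE_CLIP, UPDATE_ACTOR_EVERY, device) but this snippet does not define them;
# the values below are illustrative TD3-style defaults, not the author's actual configuration.
import torch

BUFFER_SIZE = int(1e6)    # replay buffer size
BATCH_SIZE = 128          # minibatch size
GAMMA = 0.99              # discount factor
TAU = 1e-3                # soft-update interpolation factor
LR_ACTOR = 1e-4           # actor learning rate
LR_CRITIC = 1e-3          # critic learning rate
WEIGHT_DECAY = 0.0        # L2 weight decay for the critic optimizers
POLICY_NOISE = 0.2        # std of target-policy smoothing noise (0.0 disables smoothing)
POLICY_NOISE_CLIP = 0.5   # clipping range for the smoothing noise
UPDATE_ACTOR_EVERY = 2    # delayed policy updates: actor and targets updated every N learn() calls
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")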
class AgentGroupVersion3(BaseAgent): def __init__(self, agent_list, action_size, learn_period=10, learn_sampling_num=20, buffer_size=int(1e6), batch_size=128, random_seed=0): super().__init__() if len(agent_list) == 0: raise Exception('len(agent_list) = 0') self.agent_list = agent_list self.learn_period = learn_period self.learn_sampling_num = learn_sampling_num self.batch_size = batch_size self.memory = ReplayBuffer(action_size, buffer_size, batch_size, random_seed, device) self.time_step = 0 # debugging constant self.__debug_num_agents = len(agent_list) self.__debug_state_size = agent_list[0].state_size self.__debug_action_size = agent_list[0].action_size def act(self, states, add_noise=True): """ Predict actions given states. Args: states (numpy.array): states.shape[0] = num_agents Returns: actions (numpy.array): actions.shape[0] = num_agents. """ # assert (states.shape[0] == self.__debug_num_agents), 'Mismatch dim of states.shape[0]' actions = None for s, agent in zip(states, self.agent_list): s = np.expand_dims(s, axis=0) # pdb.set_trace() action = agent.act(s) # expand dim from (2,) to (1, 2) # action = np.expand_dims(action, axis=0) if actions is None: actions = action else: actions = np.append(actions, action, axis=0) # pdb.set_trace() # assert (actions.shape[0] == self.__debug_num_agents), 'Mismatch dim of actions.shape[0]' # assert (actions.shape[0] == self.__debug_action_size), 'Mismatch dim of actions.shape[0]' return actions def step(self, states, actions, rewards, next_states, dones): # flatten states, action, rewards, next_states, dones p = pack_experience(states, actions, rewards, next_states, dones) # pdb.set_trace() self.memory.add(*p) # pdb.set_trace() if (len(self.memory) > self.batch_size) and (self.time_step % self.learn_period == 0): for _ in range(self.learn_sampling_num): for agent in self.agent_list: # pdb.set_trace() # Note: experiences.shape[0] = batch_size experiences = self.memory.sample() # pdb.set_trace() agent.step(*experiences) # update targets in each agent. for agent in self.agent_list: agent.update_targets() # pdb.set_trace() self.time_step += 1 def reset(self): for agent in self.agent_list: agent.reset() def model_dicts(self): merged_dicts = {} for agent in self.agent_list: merged_dicts = {**merged_dicts, **agent.model_dicts()} return merged_dicts
class DDPGAgentVersion1(BaseAgent): def __init__(self, state_size, action_size, random_seed, lr_actor=1e-2, lr_critic=1e-2, fc1_units=128, fc2_units=128, buffer_size=int(1e6), batch_size=50, gamma=0.95, tau=1e-2, max_norm=1.0, learn_period=100, learn_sampling_num=50, adam_critic_weight_decay=0.0, name=None, exploration_mu=0.0, exploration_sigma=0.2, exploration_theta=0.15): """Initialize an Agent object. Args: state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed """ super().__init__() self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) self.max_norm = max_norm self.learn_period = learn_period self.learn_sampling_num = learn_sampling_num self.actor_local = DDPGActorVersion1(state_size, action_size, random_seed, fc1_units=fc1_units, fc2_units=fc2_units).to(device) self.actor_target = DDPGActorVersion1(state_size, action_size, random_seed, fc1_units=fc1_units, fc2_units=fc2_units).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor) # Critic Network (w/ Target Network) self.critic_local = DDPGCriticVersion1(state_size, action_size, random_seed, fcs1_units=fc1_units, fc2_units=fc2_units).to(device) self.critic_target = DDPGCriticVersion1(state_size, action_size, random_seed, fcs1_units=fc1_units, fc2_units=fc2_units).to(device) self.critic_optimizer = optim.Adam( self.critic_local.parameters(), lr=lr_critic, weight_decay=adam_critic_weight_decay) # Noise process for action # Noise process # self.exploration_mu = 0 # self.exploration_theta = 0.15 # (Timothy Lillicrap, 2016) # self.exploration_sigma = 0.2 # (Timothy Lillicrap, 2016) self.exploration_mu = exploration_mu self.exploration_theta = exploration_theta # (Timothy Lillicrap, 2016) self.exploration_sigma = exploration_sigma # (Timothy Lillicrap, 2016) self.noise = OUNoise(action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma) # Replay memory self.memory = ReplayBuffer(action_size, buffer_size, batch_size, random_seed, device) self.gamma = gamma # soft update parameter self.tau = tau self.batch_size = batch_size self.name = name self.time_step = 0 def step(self, state, action, reward, next_state, done): self.time_step += 1 """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward self.memory.add(state, action, reward, next_state, done) # Learn, if enough samples are available in memory if (len(self.memory) > self.batch_size) and (self.time_step % self.learn_period == 0): for _ in range(self.learn_sampling_num): experiences = self.memory.sample() self.learn(experiences, self.gamma) def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.noise.sample() return np.clip(action, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + gamma * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Args: experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # train critic # critic loss: TD error between Q_target (1-step bootstrapped) and Q_local (current estimate) actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) self.critic_optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), self.max_norm) self.critic_optimizer.step() # train actor (policy gradient) actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # update critic_target self.soft_update(self.critic_local, self.critic_target, self.tau) # update actor_target self.soft_update(self.actor_local, self.actor_target, self.tau) #------ update noise ---# self.noise.reset() def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Args: local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) def model_dicts(self): return { 'agent_{}_actor'.format(self.name): self.actor_target, 'agent_{}_critic'.format(self.name): self.critic_target }
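# --- Hedged sketch: an Ornstein-Uhlenbeck noise process consistent with how OUNoise is called
# above (OUNoise(action_size, mu, theta, sigma), plus .sample() and .reset()). The original class
# is defined elsewhere in the repository; treat this as an illustrative stand-in, not the author's code.
import copy
import numpy as np

class OUNoise:
    """Temporally correlated exploration noise for continuous actions."""

    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the OU process one step and return the new state as the noise sample."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(len(x))
        self.state = x + dx
        return self.state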
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, random_seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Noise process self.noise = OUNoise(action_size) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) def step(self, states, actions, rewards, next_states, dones): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones): self.memory.add(state, action, reward, next_state, done) # Learn, if enough samples are available in memory if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.noise.sample() return np.clip(action, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data) def save_models(self): torch.save(self.actor_local.state_dict(), actor_solved_model) torch.save(self.critic_local.state_dict(), critic_solved_model)
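# --- Hedged sketch: a uniform replay buffer consistent with how ReplayBuffer is used by the DDPG
# agent above (ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed), add(), sample(), len()).
# The original class lives elsewhere; this illustrative stand-in assumes sample() returns
# (states, actions, rewards, next_states, dones) as float tensors on a module-level `device`.
import random
from collections import deque, namedtuple

import numpy as np
import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")  # assumed module-level device

class ReplayBuffer:
    """Fixed-size buffer that stores experience tuples and samples them uniformly at random."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
        self.seed = random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in experiences])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences])).float().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences]).astype(np.uint8)).float().to(device)
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        return len(self.memory)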
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed, gamma=0.99, step_size=1, dueling_dqn=False): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network if dueling_dqn: print("Use dueling dqn") self.qnetwork_local = NoisyDuelingDQN(state_size, action_size, seed).to(device) self.qnetwork_target = NoisyDuelingDQN(state_size, action_size, seed).to(device) else: print("Use non-dueling dqn") self.qnetwork_local = DQN(state_size, action_size, seed).to(device) self.qnetwork_target = DQN(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 self.gamma = gamma self.step_size = step_size def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences) def act(self, state): """Returns actions for given state as per current policy. Params ====== state (array_like): current state """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() return np.argmax(action_values.cpu().data.numpy()) def learn(self, experiences): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # Compute and minimize loss # Get max predicted Q values (for next states) from target model Q_targets_next = self.qnetwork_target(next_states).detach().max( 1)[0].unsqueeze(1) # Compute Q targets for current states ## gamma ^ step_size for nstep dqn Q_targets = rewards + (pow(self.gamma, self.step_size) * Q_targets_next * (1 - dones)) # Get expected Q values from local model Q_expected = self.qnetwork_local(states).gather(1, actions) # Compute loss loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class DoubleDDQNAgent(): "Implements a Double Dueling DQN Agent" def __init__(self, state_size, action_size, seed, checkpoint=None): """ Constructor :param state_size: dimension of each state :param action_size: dimension of each action :param seed: random seed :param checkpoint: optional checkpoint dict to resume from """ self.state_size = state_size self.action_size = action_size self.seed = np.random.seed(seed) # As for any DQN implementation we create a local and a target network. # In this case we use the DuelingDQN implementation for both networks self.qnetwork_local = DuelingDQNetwork(state_size, action_size, seed, fc1_units=FC1_UNITS, fc2_units=FC2_UNITS).to(device) self.qnetwork_target = DuelingDQNetwork(state_size, action_size, seed, fc1_units=FC1_UNITS, fc2_units=FC2_UNITS).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) if checkpoint: # If we have a checkpoint, load its state into the networks and the optimizer print('Using Checkpoint...') self.qnetwork_local.load_state_dict(checkpoint['local_state_dict']) self.qnetwork_target.load_state_dict( checkpoint['target_state_dict']) self.optimizer.load_state_dict(checkpoint['optimizer']) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): """ Main step function for the agent. Every UPDATE_EVERY time steps it runs a learning step :param state: :param action: :param reward: :param next_state: :param done: """ # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, eps=0.): """Returns actions using an epsilon-greedy approach. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if np.random.rand() > eps: return action_values.max(dim=1)[1].item() else: return np.random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Variable]): A batch of experiences gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # Get expected Q values from local model Q_expected = self.qnetwork_local(states).gather(1, actions) # For Double DQN, use the local network to pick the best action for each next state q_local_argmax = self.qnetwork_local(next_states).max(1)[1].unsqueeze(1) # Evaluate the next states with the target network, using the best actions estimated by the local network Q_targets_next = self.qnetwork_target(next_states).detach().gather( 1, q_local_argmax) # Compute Q targets for current states Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute loss loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. 
θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
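# --- Hedged sketch: saving a checkpoint compatible with the `checkpoint` argument that
# DoubleDDQNAgent.__init__ expects (keys 'local_state_dict', 'target_state_dict', 'optimizer').
# The helper name and file path are illustrative, not part of the original code.
import torch

def save_checkpoint(agent, path="double_ddqn_checkpoint.pth"):
    """Persist the agent's networks and optimizer so training can be resumed later."""
    checkpoint = {
        "local_state_dict": agent.qnetwork_local.state_dict(),
        "target_state_dict": agent.qnetwork_target.state_dict(),
        "optimizer": agent.optimizer.state_dict(),
    }
    torch.save(checkpoint, path)

# Resuming (illustrative usage):
# checkpoint = torch.load("double_ddqn_checkpoint.pth", map_location=device)
# agent = DoubleDDQNAgent(state_size, action_size, seed, checkpoint=checkpoint)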
class Agent(AgentABC): def __init__(self, state_size, action_size, num_agents, random_seed): """ Initialize an DDPG Agent object. :param state_size (int): dimension of each state :param action_size (int): dimension of each action :param num_agents (int): number of agents in environment ot use ddpg :param random_seed (int): random seed """ super().__init__(state_size, action_size, num_agents, random_seed) self.state_size = state_size self.action_size = action_size self.num_agents = num_agents self.seed = random.seed(random_seed) # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Noise process for each agent self.noise = OUNoise((num_agents, action_size), random_seed) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) # debug of the MSE critic loss self.mse_error_list = [] def step(self, states, actions, rewards, next_states, dones): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward for agent in range(self.num_agents): self.memory.add(states[agent, :], actions[agent, :], rewards[agent], next_states[agent, :], dones[agent]) # Learn, if enough samples are available in memory if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences) self.debug_loss = np.mean(self.mse_error_list) def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) acts = np.zeros((self.num_agents, self.action_size)) self.actor_local.eval() with torch.no_grad(): for agent in range(self.num_agents): acts[agent, :] = self.actor_local( state[agent, :]).cpu().data.numpy() self.actor_local.train() if add_noise: noise = self.noise.sample() acts += noise return np.clip(acts, -1, 1) def reset(self): """ see abstract class """ super().reset() self.noise.reset() self.mse_error_list = [] def learn(self, experiences): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards.view(BATCH_SIZE, -1) + (GAMMA * Q_targets_next * (1 - dones).view(BATCH_SIZE, -1)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) self.mse_error_list.append(critic_loss.detach().cpu().numpy()) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) @staticmethod def soft_update(local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) def load_weights(self, directory_path): """ see abstract class """ super().load_weights(directory_path) self.actor_target.load_state_dict( torch.load(os.path.join(directory_path, an_filename), map_location=device)) self.critic_target.load_state_dict( torch.load(os.path.join(directory_path, cn_filename), map_location=device)) self.actor_local.load_state_dict( torch.load(os.path.join(directory_path, an_filename), map_location=device)) self.critic_local.load_state_dict( torch.load(os.path.join(directory_path, cn_filename), map_location=device)) def save_weights(self, directory_path): """ see abstract class """ super().save_weights(directory_path) torch.save(self.actor_local.state_dict(), os.path.join(directory_path, an_filename)) torch.save(self.critic_local.state_dict(), os.path.join(directory_path, cn_filename)) def save_mem(self, directory_path): """ see abstract class """ super().save_mem(directory_path) self.memory.save(os.path.join(directory_path, "ddpg_memory")) def load_mem(self, directory_path): """ see abstract class """ super().load_mem(directory_path) self.memory.load(os.path.join(directory_path, "ddpg_memory"))
class DDPG(): """Reinforcement Learning agent that learns using DDPG.""" def __init__(self, state_size, action_size, actor_lr, critic_lr, random_seed, mu, theta, sigma, buffer_size, batch_size, gamma, tau, n_time_steps, n_learn_updates, device): self.state_size = state_size self.action_size = action_size self.actor_lr = actor_lr self.critic_lr = critic_lr # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, name="Actor_local") self.actor_target = Actor(state_size, action_size, name="Actor_target") self.actor_optimizer = Adam(learning_rate=self.actor_lr) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, name="Critic_local") self.critic_target = Critic(state_size, action_size, name="Critic_target") self.critic_optimizer = Adam(learning_rate=self.critic_lr) # Initialize target model parameters with local model parameters self.critic_target.model.set_weights( self.critic_local.model.get_weights()) self.actor_target.model.set_weights( self.actor_local.model.get_weights()) # Noise process self.noise = OUNoise(action_size, random_seed, mu, theta, sigma) # Replay memory self.batch_size = int(batch_size) self.buffer_size = int(buffer_size) self.memory = ReplayBuffer(self.buffer_size, self.batch_size, random_seed) # Algorithm parameters self.gamma = gamma # discount factor self.tau = tau # for soft update of target parameters self.n_time_steps = n_time_steps # number of time steps before updating network parameters self.n_learn_updates = n_learn_updates # number of updates per learning step # Device self.device = device def reset(self): """Reset the agent.""" self.noise.reset() def step(self, time_step, state, action, reward, next_state, done): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward self.memory.add(state[:], action[:], reward, next_state[:], done) if time_step % self.n_time_steps != 0: return # Learn, if enough samples are available in memory if len(self.memory) > self.batch_size: # Train the network for a number of epochs specified by the parameter for i in range(self.n_learn_updates): experiences = self.memory.sample() self.learn(experiences, self.gamma) def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = np.expand_dims(state, axis=0) action = self._act_tf(tf.constant(state)) action = action.numpy()[0] if add_noise: action += self.noise.sample() action = action.clip(-1, 1) return action @tf.function def _act_tf(self, state): return self.actor_local.model(state) def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences : tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ self._learn_tf(experiences, tf.constant(self.gamma, dtype=tf.float64)) @tf.function def _learn_tf(self, experiences, gamma): states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # with tf.GradientTape() as tape: # Get predicted next-state actions and Q values from target models actions_next = self.actor_target.model(next_states) Q_targets_next = self.critic_target.model( [next_states, actions_next]) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local.model([states, actions]) critic_loss = MSE(Q_expected, Q_targets) # Minimize the loss critic_grad = tape.gradient( critic_loss, self.critic_local.model.trainable_variables) self.critic_optimizer.apply_gradients( zip(critic_grad, self.critic_local.model.trainable_variables)) # ---------------------------- update actor ---------------------------- # with tf.GradientTape() as tape: # Compute actor loss actions_pred = self.actor_local.model(states) actor_loss = -tf.reduce_mean( self.critic_local.model([states, actions_pred])) # Minimize the loss actor_grad = tape.gradient(actor_loss, self.actor_local.model.trainable_variables) self.actor_optimizer.apply_gradients( zip(actor_grad, self.actor_local.model.trainable_variables)) # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local.model, self.critic_target.model, self.tau) self.soft_update(self.actor_local.model, self.actor_target.model, self.tau) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: TF2 model target_model: TF2 model tau (float): interpolation parameter """ for target_var, local_var in zip(target_model.weights, local_model.weights): target_var.assign(tau * local_var + (1.0 - tau) * target_var)
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # get targets by doing a forward pass of the next states in the target network self.qnetwork_target.eval() with torch.no_grad(): Q_targets_next = torch.max(self.qnetwork_target.forward(next_states), dim=1, keepdim=True)[0] # distinguish the cases in which next states are terminal and those which are not # for the first case the targets are only the one-step rewards Q_targets = rewards + (GAMMA * Q_targets_next * (1 - dones)) # get outputs by forward pass of states in the local network # Note: our qnetwork for a given state all action values for that state. # However, for each state we know what action to do, so we gather all corresponding action values self.qnetwork_local.train() Q_expected = self.qnetwork_local.forward(states).gather(1, actions) # compute the mean squared error of the Bellman Eq. loss = F.mse_loss(Q_expected, Q_targets) # clear gradients buffer from previous iteration self.optimizer.zero_grad() # backprop error through local network loss.backward() # update weights of local network by taking one SGD step self.optimizer.step() # update target network by copying the latest weights of the locat network self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. 
θ_target = tau*θ_local + (1 - tau)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
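# --- Hedged sketch: a typical training loop for the epsilon-greedy DQN Agent above. The Gym-style
# environment (reset()/step()), the episode counts, and the epsilon schedule are illustrative
# assumptions, not part of the original snippet.
def train(agent, env, n_episodes=2000, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    """Run episodes, act epsilon-greedily, and feed transitions to agent.step()."""
    scores = []
    eps = eps_start
    for i_episode in range(1, n_episodes + 1):
        state = env.reset()
        score = 0
        for _ in range(max_t):
            action = agent.act(state, eps)                        # epsilon-greedy action
            next_state, reward, done, _ = env.step(action)        # Gym-style transition
            agent.step(state, action, reward, next_state, done)   # store experience and (maybe) learn
            state = next_state
            score += reward
            if done:
                break
        scores.append(score)
        eps = max(eps_end, eps_decay * eps)                       # decay exploration over episodes
    return scores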
class DQNAgent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, args, device): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.hidden_size = args.hidden_size self.seed = args.seed self.args = args self.device = device assert self.args.agent in ['dqn', 'double_dqn', 'dueling_dqn'],\ "invalid agent name" if self.args.agent == "double_dqn": print("Implementing Double DQN!") elif self.args.agent == "dueling_dqn": print("Implementing Dueling DQN!") else: print("Implementing DQN") # Q-Network if self.args.agent == "dueling_dqn": self.qnetwork_local = DuelingQNetwork(state_size, action_size, self.hidden_size, self.seed).to(device) self.qnetwork_target = DuelingQNetwork(state_size, action_size, self.hidden_size, self.seed).to(device) else: self.qnetwork_local = QNetwork(state_size, action_size, self.hidden_size, self.seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, self.hidden_size, self.seed).to(device) print("Agent Architecture") print(self.qnetwork_local) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.args.lr) # Replay memory self.memory = ReplayBuffer(action_size, args.buffer_size, args.batch_size, self.seed, self.device) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = args.update_frequency def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_FREQUENCY time steps. self.t_step = (self.t_step + 1) % self.args.update_frequency if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > self.args.batch_size: experiences = self.memory.sample() self.learn(experiences, self.args.gamma) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(self.device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. 
Params ====== experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # Get max predicted Q values (for next states) from target model if self.args.agent == "double_dqn": next_actions = torch.argmax(self.qnetwork_local(next_states), dim=1).unsqueeze(1) Q_targets_next = self.qnetwork_target(next_states).gather(1, next_actions) else: Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1) # Compute Q targets for current states Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Get expected Q values from local model Q_expected = self.qnetwork_local(states).gather(1, actions) # Compute loss loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, self.args.tau) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
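# --- Hedged sketch: the command-line arguments DQNAgent reads from `args` above (agent,
# hidden_size, seed, lr, buffer_size, batch_size, update_frequency, gamma, tau). The default
# values are illustrative assumptions, not the author's settings.
import argparse

def build_args():
    parser = argparse.ArgumentParser(description="DQN / Double DQN / Dueling DQN agent")
    parser.add_argument("--agent", choices=["dqn", "double_dqn", "dueling_dqn"], default="dqn")
    parser.add_argument("--hidden_size", type=int, default=64)
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--lr", type=float, default=5e-4)
    parser.add_argument("--buffer_size", type=int, default=int(1e5))
    parser.add_argument("--batch_size", type=int, default=64)
    parser.add_argument("--update_frequency", type=int, default=4)
    parser.add_argument("--gamma", type=float, default=0.99)
    parser.add_argument("--tau", type=float, default=1e-3)
    return parser.parse_args()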
class Agent(AgentABC): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, num_agents, random_seed): """Initialize an MADDPG Agent object. Params ====== :param state_size: dimension of each state :param action_size: dimension of each action :param num_agents: number of inner agents :param random_seed: random seed """ super().__init__(state_size, action_size, num_agents, random_seed) self.state_size = state_size self.action_size = action_size self.num_agents = num_agents self.seed = random.seed(random_seed) self.actors_local = [] self.actors_target = [] self.actor_optimizers = [] self.critics_local = [] self.critics_target = [] self.critic_optimizers = [] for i in range(num_agents): # Actor Network (w/ Target Network) self.actors_local.append( Actor(state_size, action_size, random_seed).to(device)) self.actors_target.append( Actor(state_size, action_size, random_seed).to(device)) self.actor_optimizers.append( optim.Adam(self.actors_local[i].parameters(), lr=LR_ACTOR)) # Critic Network (w/ Target Network) self.critics_local.append( Critic(num_agents * state_size, num_agents * action_size, random_seed).to(device)) self.critics_target.append( Critic(num_agents * state_size, num_agents * action_size, random_seed).to(device)) self.critic_optimizers.append( optim.Adam(self.critics_local[i].parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)) # Noise process for each agent self.noise = OUNoise((num_agents, action_size), random_seed) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) # debugging variables self.step_count = 0 self.mse_error_list = [] def step(self, states, actions, rewards, next_states, dones): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward self.memory.add(states, actions, rewards, next_states, dones) # Learn, if enough samples are available in memory # in order to add some stability to the learning, we don't modify weights every turn. self.step_count += 1 if (self.step_count % UPDATE_EVERY) == 0: # learn every #UPDATE_EVERY steps for i in range(NUM_UPDATES): # update #NUM_UPDATES times if len(self.memory) > 1000: experiences = self.memory.sample() self.learn(experiences) self.debug_loss = np.mean(self.mse_error_list) self.update_target_networks() def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) acts = np.zeros((self.num_agents, self.action_size)) for agent in range(self.num_agents): self.actors_local[agent].eval() with torch.no_grad(): acts[agent, :] = self.actors_local[agent]( state[agent, :]).cpu().data.numpy() self.actors_local[agent].train() if add_noise: acts += self.noise.sample() return np.clip(acts, -1, 1) def reset(self): """ see abstract class """ super().reset() self.noise.reset() self.mse_error_list = [] def learn(self, experiences): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_full_state, actors_target(next_partial_state) ) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states_batched, actions_batched, rewards, next_states_batched, dones = experiences states_concated = states_batched.view( [BATCH_SIZE, self.num_agents * self.state_size]) next_states_concated = next_states_batched.view( [BATCH_SIZE, self.num_agents * self.state_size]) actions_concated = actions_batched.view( [BATCH_SIZE, self.num_agents * self.action_size]) for agent in range(self.num_agents): actions_next_batched = [ self.actors_target[i](next_states_batched[:, i, :]) for i in range(self.num_agents) ] actions_next_whole = torch.cat(actions_next_batched, 1) # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models q_targets_next = self.critics_target[agent](next_states_concated, actions_next_whole) # Compute Q targets for current states (y_i) q_targets = rewards[:, agent].view( BATCH_SIZE, -1) + (GAMMA * q_targets_next * (1 - dones[:, agent].view(BATCH_SIZE, -1))) # Compute critic loss q_expected = self.critics_local[agent](states_concated, actions_concated) critic_loss = F.mse_loss(q_expected, q_targets) # Minimize the loss self.critic_optimizers[agent].zero_grad() critic_loss.backward() self.critic_optimizers[agent].step() # save the error for statistics self.mse_error_list.append(critic_loss.detach().cpu().numpy()) # ---------------------------- update actor ---------------------------- # action_i = self.actors_local[agent](states_batched[:, agent, :]) actions_pred = actions_batched.clone() actions_pred[:, agent, :] = action_i actions_pred_whole = actions_pred.view(BATCH_SIZE, -1) # Compute actor loss actor_loss = -self.critics_local[agent](states_concated, actions_pred_whole).mean() # Minimize the loss self.actor_optimizers[agent].zero_grad() actor_loss.backward() self.actor_optimizers[agent].step() def update_target_networks(self): # ----------------------- update target networks ----------------------- # for agent in range(self.num_agents): self.soft_update(self.critics_local[agent], self.critics_target[agent], TAU) self.soft_update(self.actors_local[agent], self.actors_target[agent], TAU) @staticmethod def soft_update(local_model, target_model, tau): """Soft update model parameters. 
θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) def load_weights(self, directory_path): """ see abstract class """ super().load_weights(directory_path) actor_weights = os.path.join(directory_path, an_filename) critic_weights = os.path.join(directory_path, cn_filename) for agent in range(self.num_agents): self.actors_target[agent].load_state_dict( torch.load(actor_weights + "_" + str(agent), map_location=device)) self.critics_target[agent].load_state_dict( torch.load(critic_weights + "_" + str(agent), map_location=device)) self.actors_local[agent].load_state_dict( torch.load(actor_weights + "_" + str(agent), map_location=device)) self.critics_local[agent].load_state_dict( torch.load(critic_weights + "_" + str(agent), map_location=device)) def save_weights(self, directory_path): """ see abstract class """ super().save_weights(directory_path) actor_weights = os.path.join(directory_path, an_filename) critic_weights = os.path.join(directory_path, cn_filename) for agent in range(self.num_agents): torch.save(self.actors_local[agent].state_dict(), actor_weights + "_" + str(agent)) torch.save(self.critics_local[agent].state_dict(), critic_weights + "_" + str(agent)) def save_mem(self, directory_path): """ see abstract class """ super().save_mem(directory_path) self.memory.save(os.path.join(directory_path, memory_filename)) def load_mem(self, directory_path): """ see abstract class """ super().load_mem(directory_path) self.memory.load(os.path.join(directory_path, memory_filename))
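# --- Hedged sketch: the batch layout that the MADDPG Agent.learn() above appears to assume
# memory.sample() returns, inferred from its .view() reshapes and per-agent indexing. This is an
# illustrative sanity check, not part of the original code.
def check_experience_shapes(experiences, batch_size, num_agents, state_size, action_size):
    """Verify that a sampled batch matches the shapes learn() expects."""
    states, actions, rewards, next_states, dones = experiences
    assert states.shape == (batch_size, num_agents, state_size)        # states_batched
    assert next_states.shape == (batch_size, num_agents, state_size)   # next_states_batched
    assert actions.shape == (batch_size, num_agents, action_size)      # actions_batched
    assert rewards.shape[:2] == (batch_size, num_agents)               # indexed as rewards[:, agent]
    assert dones.shape[:2] == (batch_size, num_agents)                 # indexed as dones[:, agent]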
class Agent: """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed, double_DQN=False, prioritized_replay=False, dueling_networks=False): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed double_DQN (bool) : use double DQN prioritized_replay (bool): used prioritized_replay """ self.state_size = state_size self.action_size = action_size self.seed = seed self.tau = TAU self.double_DQN = double_DQN self.prioritized_replay = prioritized_replay self.dueling_networks = dueling_networks if self.dueling_networks: # Q-Networks - Local, Target Neural Nets self.qnetwork_local = DuelingQNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = DuelingQNetwork(state_size, action_size, seed).to(device) self.qnetwork_target.eval() else: # Q-Networks - Local, Target Neural Nets self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target.eval() # Use optimizer to update the "local" neural net self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) if self.prioritized_replay: prioritized_params = { 'a': 0.6, 'b': 0.4, 'b_inc_rate': 1.001, 'e': 0.01 } self.memory = PrioritizedReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed, device, prioritized_params) else: # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed, device) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def act(self, state, eps=0.): """ Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection eval() -notify all your layers that you are in eval mode, that way, batchnorm or dropout layers will work in eval mode instead of training mode. no_grad() - impacts the autograd engine and deactivate it. It will reduce memory usage and speed up computations but you won’t be able to backprop. """ # Process state to a GPU tensor, increases dimension on x-axis (dim=0) state = torch.from_numpy(state).float() state = state.unsqueeze(0).to(device) self.qnetwork_local.eval() # Evaluation Mode with torch.no_grad(): # No Gradient Descent # Returns vector of action values action_values = self.qnetwork_local.foward(state) # Epsilon-greedy action selection rand_from_0_to_1 = random.random() if rand_from_0_to_1 > eps: greedy_action_to_cpu = action_values.cpu().data.numpy() action = np.argmax(greedy_action_to_cpu) # get max value index else: action = random.choice(np.arange(self.action_size)) self.qnetwork_local.train() # Back to train mode return int(action.item()) def step(self, state, action, reward, next_state, done): """ Process a step from time step t to t+1 by updating agent models. 
Params ====== state (continuous): current state before action action (discrete): action taken for the given state reward (int): reward received after performing action next_state (continuous): state achieved at timestep t+1 done (bool): episode completed on this timestep """ if self.prioritized_replay: models = { 'local': self.qnetwork_local, 'target': self.qnetwork_target, 'GAMMA': GAMMA } self.memory.add(state, action, reward, next_state, done, models) else: self.memory.add(state, action, reward, next_state, done) # Increase counter until we are ready to take an update step self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def learn(self, experiences, gamma): """ Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ # Note: these tensors make up a batch of BATCH_SIZE experiences # Unpack experience batch if self.prioritized_replay: states, actions, rewards, next_states, dones, update_factors = experiences else: states, actions, rewards, next_states, dones = experiences ########################### # Double DQN Modification # ########################### if self.double_DQN: # Get predicted Q values (for next states) from the local model q_prime = self.qnetwork_local.forward(next_states) # For the batch, store the greedy action for Double DQN greedy_action_next = q_prime.max(dim=1, keepdim=True)[1] # Choose the value of the action that gives the max return q_prime = q_prime.detach().max(1)[0].unsqueeze(1) # For Double DQN, evaluate the greedy actions chosen by the local network using the target network DDQN_q_prime = self.qnetwork_target.forward(next_states).detach() DDQN_q_prime = DDQN_q_prime.gather(1, greedy_action_next) not_done_bool = (1 - dones) # If done, no need to include next return td_target = rewards + (gamma * DDQN_q_prime) * not_done_bool ################# # Standard DQN # ################# else: # Get max predicted Q values (for next states) from target model q_prime = self.qnetwork_target.forward(next_states) # Choose the value of the action that gives the max return q_prime = q_prime.detach().max(1)[0].unsqueeze(1) not_done_bool = (1 - dones) # If done, no need to include next return td_target = rewards + (gamma * q_prime) * not_done_bool # This is the model we will update q_expected = self.qnetwork_local.forward(states) # Gathers the expected values for each action q_expected = q_expected.gather(1, actions) if self.prioritized_replay: q_expected *= update_factors td_target *= update_factors # Compute the loss, minimize the loss loss = F.mse_loss(td_target, q_expected) self.optimizer.zero_grad() # reset gradient loss.backward() # Calculate the gradient self.optimizer.step() # Update weights # Update the target model parameters (Soft Update) # Soft Update: Factor in local parameter changes by a factor of TAU # Rather than updating every C steps, this helps inch closer to the local parameters for target_param, local_param in zip(self.qnetwork_target.parameters(), self.qnetwork_local.parameters()): upd_wghts = ((1.0 - self.tau) * target_param.data) + (self.tau * local_param.data) target_param.data.copy_(upd_wghts)
class ExperimentSetup(): def __init__(self, algorithm, env_name, sess, random_seed): self.algorithm = algorithm self.sess = sess self.ep_ave_max_q = 0 self.env = ThrowEnvWrapper(make(env_name, reward_type='dense')) self.env.seed(random_seed) self.dmp_trajectory = None def setup_experiment(self, args): if 'ppo' in self.algorithm: self.setup_ppo(args) if 'dmp' in self.algorithm: self.setup_dmp(args) if 'ddpg' in self.algorithm: self.setup_ddpg(args) def setup_ppo(self, args=None): sess = self.sess # TODO: Use same timesteps as in dmp or take them from args self.timesteps = 100 print('INFO: ----------Setup PPO') # TODO: maybe pass dmp args def setup_dmp(self, args=None): # 1-dimensional since joint can only move in one axis -> up/down axis self.dmp_trajectory = np.array([[0.0, 0.0, 0.0], [0.0, -.15, .15]]) y_des = np.array(self.dmp_trajectory).T y_des -= y_des[:, 0][:, None] self.dmp = pydmps.dmp_discrete.DMPs_discrete(n_dmps=2, n_bfs=200, ay=np.ones(2) * 10.0) self.dmp.imitate_path(y_des=y_des) self.dmp.timesteps = int(self.dmp.timesteps / 5) def setup_ddpg(self, args): sess = self.sess tf.set_random_seed(int(args['random_seed'])) # Fetch environment state and action space properties state_dim = self.env.observation_space["observation"].shape[0] action_dim = self.env.action_space.shape[0] action_bound = self.env.action_space.high # Ensure action bound is symmetric assert (all(self.env.action_space.high - self.env.action_space.low)) self.actor = ActorNetwork(sess, state_dim, action_dim, action_bound, float(args['actor_lr']), float(args['tau']), int(args['minibatch_size'])) self.critic = CriticNetwork(sess, state_dim, action_dim, float(args['critic_lr']), float(args['tau']), float(args['gamma']), self.actor.get_num_trainable_vars()) self.actor_noise = OrnsteinUhlenbeckActionNoise( mu=np.zeros(action_dim)) # Set up summary Ops self.summary_ops, self.summary_vars = build_summaries() sess.run(tf.global_variables_initializer()) # Initialize target network weights self.actor.update_target_network() self.critic.update_target_network() # Initialize replay memory self.replay_buffer = ReplayBuffer(int(args['buffer_size']), int(args['random_seed'])) # Needed to enable BatchNorm. # This hurts the performance on Pendulum but could be useful # in other environments. 
        tflearn.is_training(True)

    def update_replay_buffer(self, state, action, next_state, reward, terminal):
        r_state = np.reshape(state, (self.actor.s_dim,))
        r_action = np.reshape(action, (self.actor.a_dim,))
        r_next_state = np.reshape(next_state, (self.actor.s_dim,))
        self.replay_buffer.add(r_state, r_action, reward, terminal, r_next_state)

    def learn_ddpg_minibatch(self, args):
        # Keep adding experience to the memory until there are at least minibatch-size samples
        if self.replay_buffer.size() > int(args['minibatch_size']):
            s_batch, a_batch, r_batch, t_batch, s2_batch = \
                self.replay_buffer.sample_batch(int(args['minibatch_size']))

            # Calculate targets
            target_q = self.critic.predict_target(
                s2_batch, self.actor.predict_target(s2_batch))

            y_i = []
            for k in range(int(args['minibatch_size'])):
                if t_batch[k]:
                    y_i.append(r_batch[k])
                else:
                    y_i.append(r_batch[k] + self.critic.gamma * target_q[k])

            # Update the critic given the targets
            predicted_q_value, _ = self.critic.train(
                s_batch, a_batch,
                np.reshape(y_i, (int(args['minibatch_size']), 1)))

            self.ep_ave_max_q += np.amax(predicted_q_value)

            # Update the actor policy using the sampled gradient
            a_outs = self.actor.predict(s_batch)
            grads = self.critic.action_gradients(s_batch, a_outs)
            self.actor.train(s_batch, grads[0])

            # Update target networks
            self.actor.update_target_network()
            self.critic.update_target_network()
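# OrnsteinUhlenbeckActionNoise is instantiated in setup_ddpg() above but not defined in
# this section. A common implementation (this sketch follows the widely used
# OpenAI Baselines version and is an assumption, not necessarily the class used here)
# generates temporally correlated noise for exploration in continuous action spaces.
import numpy as np


class OrnsteinUhlenbeckActionNoise:
    def __init__(self, mu, sigma=0.2, theta=0.15, dt=1e-2, x0=None):
        self.mu = mu
        self.sigma = sigma
        self.theta = theta
        self.dt = dt
        self.x0 = x0
        self.reset()

    def __call__(self):
        # dx = theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1)
        x = (self.x_prev
             + self.theta * (self.mu - self.x_prev) * self.dt
             + self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape))
        self.x_prev = x
        return x

    def reset(self):
        self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu)


# Example: noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(action_dim)); action += noise()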
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""

    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
        self.action_range = self.action_high - self.action_low

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2 * self.action_range
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters (CartPole)
        # self.gamma = 0.99  # discount factor
        # self.tau = 0.01    # for soft update of target parameters

        # Algorithm parameters (Quadcopter)
        self.gamma = 0.99  # discount factor
        self.tau = 0.01    # for soft update of target parameters

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state, enable_exploration):
        """Returns actions for the given state(s) as per the current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        noise = np.zeros(self.action_size)
        if enable_exploration:
            noise = self.noise.sample()
        return list(action + noise)

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element
        # (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences
                            if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train the critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train the actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)

    def load_model(self, actor_filename, critic_filename):
        self.actor_local.load_model(actor_filename)
        self.critic_local.load_model(critic_filename)
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())

    def save_model(self, actor_filename, critic_filename):
        self.actor_local.save_model(actor_filename)
        self.critic_local.save_model(critic_filename)
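# The ReplayBuffer used by the DDPG class above is not shown in this section. A minimal
# sketch that is consistent with how it is called here -- ReplayBuffer(buffer_size,
# batch_size), add(state, action, reward, next_state, done), sample() returning
# experience namedtuples with .state/.action/... fields, and len() -- could look like
# this. It is an assumption for illustration, not necessarily the original class.
import random
from collections import deque, namedtuple


class ReplayBuffer:
    """Fixed-size buffer to store experience tuples."""

    def __init__(self, buffer_size, batch_size):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"])

    def add(self, state, action, reward, next_state, done):
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self, batch_size=None):
        # Uniformly sample a batch of experiences
        return random.sample(self.memory, k=batch_size or self.batch_size)

    def __len__(self):
        return len(self.memory)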
def train(sess, env, actor, critic, actor_noise, buffer_size, min_batch, ep):
    sess.run(tf.global_variables_initializer())

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(buffer_size, 0)

    max_episodes = ep
    max_steps = 3000
    score_list = []

    for i in range(max_episodes):
        state = env.reset()
        score = 0

        for j in range(max_steps):
            # env.render()
            action = actor.predict(np.reshape(state, (1, actor.s_dim))) + actor_noise()
            next_state, reward, done, info = env.step(action[0])
            replay_buffer.add(np.reshape(state, (actor.s_dim,)),
                              np.reshape(action, (actor.a_dim,)),
                              reward, done,
                              np.reshape(next_state, (actor.s_dim,)))

            # Roll the episode forward regardless of whether we learn on this step
            state = next_state
            score += reward

            # Update the networks in batches once enough samples are stored
            if replay_buffer.size() >= min_batch:
                states, actions, rewards, dones, next_states = \
                    replay_buffer.sample_batch(min_batch)

                # Calculate targets
                target_q = critic.predict_target(next_states,
                                                 actor.predict_target(next_states))
                y = []
                for k in range(min_batch):
                    y.append(rewards[k] + critic.gamma * target_q[k] * (1 - dones[k]))

                # Update the critic given the targets
                predicted_q_value, _ = critic.train(states, actions,
                                                    np.reshape(y, (min_batch, 1)))

                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(states)
                grads = critic.action_gradients(states, a_outs)
                actor.train(states, grads[0])

                # Update target networks
                actor.update_target_network()
                critic.update_target_network()

            if done:
                print('Reward: {} | Episode: {}/{}'.format(int(score), i, max_episodes))
                break

        score_list.append(score)
        avg = np.mean(score_list[-100:])
        print("Average of last 100 episodes: {0:.2f} \n".format(avg))

        if avg > 200:
            print('Task Completed')
            break

    return score_list
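# The batch-style ReplayBuffer consumed by train() (and by ExperimentSetup above) is
# also not defined in this section. A minimal sketch consistent with how it is called --
# ReplayBuffer(buffer_size, seed), add(s, a, r, t, s2), size(), and sample_batch(n)
# returning five stacked arrays -- is given below as an assumption; the original
# implementation may differ.
import random
from collections import deque

import numpy as np


class ReplayBuffer:
    def __init__(self, buffer_size, random_seed=0):
        self.buffer = deque(maxlen=buffer_size)
        random.seed(random_seed)

    def add(self, s, a, r, t, s2):
        # Store one transition: (state, action, reward, terminal flag, next state)
        self.buffer.append((s, a, r, t, s2))

    def size(self):
        return len(self.buffer)

    def sample_batch(self, batch_size):
        # Uniformly sample transitions and stack each field into a numpy array
        batch = random.sample(self.buffer, min(batch_size, len(self.buffer)))
        s, a, r, t, s2 = map(np.array, zip(*batch))
        return s, a, r, t, s2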