def test_actor_single_input(self):
    """Test actor forward() against a single state vector"""
    actor = Actor(state_size=5, action_size=3, seed=0).to(DEVICE)
    state = torch.Tensor([[0.1, 0.5, 1.0, 0.1, 0.5]]).to(DEVICE)
    actor.eval()
    action = actor.forward(state).to(DEVICE)
    self.assertEqual((1, 3), action.size())
def test_actor_multiple_input(self):
    """Test actor forward() against multiple state vectors"""
    actor = Actor(state_size=3, action_size=2, seed=0).to(DEVICE)
    states = torch.Tensor([[0.0, 0.0, 1.0],
                           [1.0, 0.0, 1.0],
                           [0.0, 1.0, 1.0],
                           [0.0, 0.0, 1.0],
                           [0.0, 1.0, 1.0]]).to(DEVICE)
    actor.eval()
    actions = actor.forward(states).to(DEVICE)
    self.assertEqual((5, 2), actions.size())
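A companion check for the critic could look like the sketch below. It is not part of the original test suite: the constructor Critic(state_size, action_size, seed) matches how the agent code builds its critics, but the (1, 1) output shape is an assumption, since the agent code only shows that critic_local(states, actions) returns per-row Q-values.

def test_critic_single_input(self):
    """Hypothetical test: critic forward() against a single state/action pair.
    Assumes the critic returns one Q-value per input row, i.e. shape (1, 1)."""
    critic = Critic(5, 3, 0).to(DEVICE)  # Critic(state_size, action_size, seed)
    state = torch.Tensor([[0.1, 0.5, 1.0, 0.1, 0.5]]).to(DEVICE)
    action = torch.Tensor([[0.2, -0.4, 0.9]]).to(DEVICE)
    critic.eval()
    with torch.no_grad():
        q_value = critic(state, action)  # forward(states, actions) -> Q-values
    self.assertEqual((1, 1), q_value.size())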
def __init__(self, max_velocity, state_size, action_size):
    """Creates an agent used to train and test multiple copies of it

    :param max_velocity: maximum velocity of the agent
    :param state_size: dimensionality of the state vector
    :param action_size: dimensionality of the action vector"""
    self.__max_velocity = max_velocity
    self.actor_local = Actor(state_size, action_size, 0)
    self.critic_local = Critic(state_size, action_size, 0)
    self.steps = []
    self.reset_calls = 0
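The steps list and reset_calls counter above suggest this agent is a test double that records how the training loop drives it. Purely as an illustration (these methods are not shown in the original), call-recording implementations consistent with those attributes might look like:

# Hypothetical call-recording methods, consistent with the attributes
# initialised above; the stub's real behaviour is not shown in this section.
def reset(self):
    """Counts how many times the training loop resets the agent"""
    self.reset_calls += 1

def step(self, states, actions, env_info):
    """Records every training step handed to the stub"""
    self.steps.append((states, actions, env_info))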
class Agent:
    """Policy gradient agent to train and act in a distributed environment"""

    # pylint: disable=no-member, too-many-instance-attributes
    def __init__(self, state_size, action_size, num_agents):
        """Create an instance of Agent

        :param state_size: state vector dimension
        :param action_size: action vector dimension
        :param num_agents: number of parallel environment copies the agent controls"""
        random_seed = 5
        self.__step_counter = 0
        self.__eps = EPS_START
        self.actor_local = Actor(state_size, action_size, random_seed).to(DEVICE)
        self.__actor_target = Actor(state_size, action_size, random_seed + 1).to(DEVICE)
        self.__actor_optimizer = optim.Adam(self.actor_local.parameters())
        self.critic_local = Critic(state_size, action_size, random_seed + 2).to(DEVICE)
        self.__critic_target = Critic(state_size, action_size, random_seed + 3).to(DEVICE)
        self.__critic_optimizer = optim.Adam(self.critic_local.parameters())
        self.__memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, random_seed + 4)
        # Noise process
        self.__noises = [
            OUNoise(action_size, random_seed + i) for i in range(num_agents)
        ]

    def reset(self):
        """The method is called at the beginning of each episode"""
        for noise in self.__noises:
            noise.reset()

    def step(self, states, actions, env_info):
        """Performs a training step

        :param states: current states of environments
        :param actions: actions which were taken by the agent upon states
        :param env_info: info of agent states after applying actions"""
        # Save experiences / rewards
        self.__memory.add(states, actions, env_info)
        self.__step_counter += 1
        # Learn, if enough samples are available in memory
        if len(self.__memory) > BATCH_SIZE and 0 == self.__step_counter % 2:
            experiences = self.__memory.sample()
            self.__learn(experiences, GAMMA)

    def act(self, states, add_noise):
        """Calculates action vectors from state vectors for multiple environments

        :param states: state vectors from multiple environments
        :param add_noise: if True, adds a noise vector to each action
        :return: action vectors for multiple environments"""
        torch_states = torch.from_numpy(states).float().to(DEVICE)
        self.actor_local.eval()
        with torch.no_grad():
            actions = self.actor_local(torch_states).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            for action, noise in zip(actions, self.__noises):
                action += self.__eps * noise.sample()
            self.__eps = max(EPS_END, EPS_DECAY * self.__eps)
        return np.clip(actions, -1.0, 1.0)

    def __learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor"""
        states, actions, rewards, next_states, dones = experiences

        # Update critic
        # Get predicted next-state actions and Q values from target models
        actions_next = self.__actor_target(next_states)
        q_targets_next = self.__critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        q_targets = rewards + (gamma * q_targets_next * (1 - dones))
        # Compute critic loss
        q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(q_expected, q_targets)
        # Minimize the loss
        self.__critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.__critic_optimizer.step()

        # Update actor
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.__actor_optimizer.zero_grad()
        actor_loss.backward()
        self.__actor_optimizer.step()

        # Update target networks
        _soft_update(self.critic_local, self.__critic_target, TAU)
        _soft_update(self.actor_local, self.__actor_target, TAU)
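The target networks are refreshed through _soft_update, which is referenced above but not defined in this section. A minimal sketch, assuming the standard DDPG Polyak-averaging rule θ_target ← τ·θ_local + (1 - τ)·θ_target that the call signature suggests:

def _soft_update(local_model, target_model, tau):
    """Blend local network weights into the target network (Polyak averaging).

    θ_target = τ * θ_local + (1 - τ) * θ_target

    :param local_model: network whose weights are read
    :param target_model: network whose weights are updated in place
    :param tau: interpolation factor, 0 < tau << 1"""
    for target_param, local_param in zip(target_model.parameters(),
                                         local_model.parameters()):
        target_param.data.copy_(
            tau * local_param.data + (1.0 - tau) * target_param.data)

Because TAU is typically small, the target networks trail the local networks slowly, which keeps the bootstrapped Q-targets computed in __learn() from chasing a rapidly moving estimate.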