    def test_actor_single_input(self):
        """Test actor forward() against a single state vector"""
        actor = Actor(state_size=5, action_size=3, seed=0).to(DEVICE)

        state = torch.Tensor([[0.1, 0.5, 1.0, 0.1, 0.5]]).to(DEVICE)

        actor.eval()
        with torch.no_grad():
            action = actor(state)
        self.assertEqual((1, 3), action.size())
    def test_actor_multiple_input(self):
        """Test actor forward() against multiple state vectors"""
        actor = Actor(state_size=3, action_size=2, seed=0).to(DEVICE)

        states = torch.Tensor([[0.0, 0.0, 1.0],
                               [1.0, 0.0, 1.0],
                               [0.0, 1.0, 1.0],
                               [0.0, 0.0, 1.0],
                               [0.0, 1.0, 1.0]]).to(DEVICE)

        actor.eval()
        with torch.no_grad():
            actions = actor(states)
        self.assertEqual((5, 2), actions.size())
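
# The Actor network itself is not included in this snippet. Below is a minimal
# sketch consistent with the tests above: only the (state_size, action_size,
# seed) constructor signature and the (batch, action_size) output shape are
# taken from the tests; the hidden layer sizes and the tanh output head are
# assumptions.
import torch
import torch.nn as nn
import torch.nn.functional as F


class Actor(nn.Module):
    """Maps state vectors to action vectors in [-1, 1] (sketch)"""

    def __init__(self, state_size, action_size, seed,
                 fc1_units=128, fc2_units=128):
        super().__init__()
        self.seed = torch.manual_seed(seed)
        self.fc1 = nn.Linear(state_size, fc1_units)
        self.fc2 = nn.Linear(fc1_units, fc2_units)
        self.fc3 = nn.Linear(fc2_units, action_size)

    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        return torch.tanh(self.fc3(x))
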
    def __init__(self, max_velocity, state_size, action_size):
        """Creates an agent to train and test its multiple copies
        :param max_velocity: maximum velocity of the agent
        :param state_size: dimensionality of the state vector
        :param action_size: dimensionality of the action vector"""

        self.__max_velocity = max_velocity
        self.actor_local = Actor(state_size, action_size, 0)
        self.critic_local = Critic(state_size, action_size, 0)
        self.steps = []
        self.reset_calls = 0
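
# The Agent class below assumes module-level imports and hyperparameters that
# are not shown in this snippet (DEVICE, BUFFER_SIZE, BATCH_SIZE, GAMMA, TAU,
# EPS_START, EPS_END, EPS_DECAY). A plausible configuration is sketched here;
# the concrete values are assumptions, not the original settings.
import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim

DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

BUFFER_SIZE = int(1e6)  # replay buffer capacity (assumed)
BATCH_SIZE = 128        # minibatch size (assumed)
GAMMA = 0.99            # discount factor (assumed)
TAU = 1e-3              # soft-update interpolation factor (assumed)
EPS_START = 1.0         # initial scale of the exploration noise (assumed)
EPS_END = 0.01          # minimum noise scale (assumed)
EPS_DECAY = 0.999       # multiplicative noise decay applied in act() (assumed)
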
class Agent:
    """Policy gradient agent to train and act in a distributed environment"""

    # pylint: disable=no-member, too-many-instance-attributes

    def __init__(self, state_size, action_size, num_agents):
        """Create an instance of Agent
        :param state_size: state vector dimension
        :param action_size: action vector dimension"""

        random_seed = 5

        self.__step_counter = 0
        self.__eps = EPS_START

        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(DEVICE)

        self.__actor_target = Actor(state_size, action_size,
                                    random_seed + 1).to(DEVICE)

        self.__actor_optimizer = optim.Adam(self.actor_local.parameters())

        self.critic_local = Critic(state_size, action_size,
                                   random_seed + 2).to(DEVICE)

        self.__critic_target = Critic(state_size, action_size,
                                      random_seed + 3).to(DEVICE)

        self.__critic_optimizer = optim.Adam(self.critic_local.parameters())

        self.__memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, random_seed + 4)

        # Noise process
        self.__noises = [
            OUNoise(action_size, random_seed + i) for i in range(num_agents)
        ]

    def reset(self):
        """The method is called in the beginning of each episode"""
        for noise in self.__noises:
            noise.reset()

    def step(self, states, actions, env_info):
        """Performs a training step
        :param states: current states of environments
        :param actions: actions which were taken by the agent upon states.
        :param env_info: Info of agent states after applying actions
        """

        # Save experiences / rewards
        self.__memory.add(states, actions, env_info)

        self.__step_counter += 1

        # Learn every second step, once enough samples are available in memory
        if len(self.__memory) > BATCH_SIZE and self.__step_counter % 2 == 0:
            experiences = self.__memory.sample()
            self.__learn(experiences, GAMMA)

    def act(self, states, add_noise):
        """Calculates action vectors from state vectors for multiple
        environments
        :param states: state vectors from multiple environments
        :param add_noise: if True, adds noise vector
        :return: action vectors for multiple environments"""

        torch_states = torch.from_numpy(states).float().to(DEVICE)

        self.actor_local.eval()

        with torch.no_grad():
            actions = self.actor_local(torch_states).cpu().data.numpy()

        self.actor_local.train()

        if add_noise:
            for action, noise in zip(actions, self.__noises):
                action += self.__eps * noise.sample()

            self.__eps = max(EPS_END, EPS_DECAY * self.__eps)

        return np.clip(actions, -1.0, 1.0)

    def __learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience
        tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor"""

        states, actions, rewards, next_states, dones = experiences

        # Update critic
        # Get predicted next-state actions and Q values from target models
        actions_next = self.__actor_target(next_states)
        q_targets_next = self.__critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        q_targets = rewards + (gamma * q_targets_next * (1 - dones))
        # Compute critic loss
        q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(q_expected, q_targets)
        # Minimize the loss
        self.__critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.__critic_optimizer.step()

        # Update actor
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.__actor_optimizer.zero_grad()
        actor_loss.backward()
        self.__actor_optimizer.step()

        # Update target networks
        _soft_update(self.critic_local, self.__critic_target, TAU)
        _soft_update(self.actor_local, self.__actor_target, TAU)
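

# _soft_update and OUNoise are referenced above but not included in this
# snippet. Minimal sketches are given below, assuming the standard DDPG
# soft-update rule and a standard Ornstein-Uhlenbeck process; the original
# implementations and parameter values may differ.
import numpy as np


def _soft_update(local_model, target_model, tau):
    """Blend the target network towards the local network:
    θ_target = τ * θ_local + (1 - τ) * θ_target
    :param local_model: network whose weights are blended in
    :param target_model: network updated in place
    :param tau: interpolation factor, 0 < tau <= 1"""
    for target_param, local_param in zip(target_model.parameters(),
                                         local_model.parameters()):
        target_param.data.copy_(tau * local_param.data +
                                (1.0 - tau) * target_param.data)


class OUNoise:
    """Ornstein-Uhlenbeck noise process (sketch; mu/theta/sigma are assumed)"""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.__rng = np.random.default_rng(seed)
        self.state = self.mu.copy()

    def reset(self):
        """Reset the internal state back to the mean"""
        self.state = self.mu.copy()

    def sample(self):
        """Advance the process one step and return the new noise vector"""
        dx = self.theta * (self.mu - self.state) \
            + self.sigma * self.__rng.standard_normal(len(self.state))
        self.state = self.state + dx
        return self.state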