Example #1
    def learn(self, experiences):
        """Update critics and actors"""
        rewards = to_tensor(experiences['reward']).float().to(self.device)
        dones = to_tensor(experiences['done']).type(torch.int).to(self.device)
        states = to_tensor(experiences['state']).float().to(self.device)
        actions = to_tensor(experiences['action']).to(self.device)
        next_states = to_tensor(experiences['next_state']).float().to(self.device)
        assert rewards.shape == dones.shape == (self.batch_size, 1)
        assert states.shape == next_states.shape == (self.batch_size, self.state_size)
        assert actions.shape == (self.batch_size, self.action_size)

        indices = None
        if hasattr(self.buffer, 'priority_update'):  # When using PER buffer
            indices = experiences['index']
        loss_critic = self.compute_value_loss(states, actions, next_states, rewards, dones, indices)

        # Value (critic) optimization
        self.critic_optimizer.zero_grad()
        loss_critic.backward()
        nn.utils.clip_grad_norm_(self.critic.parameters(), self.max_grad_norm_critic)
        self.critic_optimizer.step()
        self._loss_critic = float(loss_critic.item())

        # Policy (actor) optimization
        loss_actor = self.compute_policy_loss(states)
        self.actor_optimizer.zero_grad()
        loss_actor.backward()
        nn.utils.clip_grad_norm_(self.actor.parameters(), self.max_grad_norm_actor)
        self.actor_optimizer.step()
        self._loss_actor = float(loss_actor.item())

        # Networks gradual sync
        soft_update(self.target_actor, self.actor, self.tau)
        soft_update(self.target_critic, self.critic, self.tau)
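
Several of the examples end by calling `soft_update` to blend the online network into its target copy. The helper itself is not shown here; below is a minimal Polyak-averaging sketch, assuming the signature `soft_update(target, source, tau)` used in the calls above.

import torch.nn as nn

def soft_update(target: nn.Module, source: nn.Module, tau: float) -> None:
    """Polyak averaging: target <- tau * source + (1 - tau) * target."""
    for target_param, source_param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(tau * source_param.data + (1.0 - tau) * target_param.data)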
Example #2
    def step(self, state, action, reward, next_state, done) -> None:
        """Letting the agent to take a step.

        On some steps the agent will initiate learning step. This is dependent on
        the `update_freq` value.

        Parameters:
            state: S(t)
            action: A(t)
            reward: R(t)
            next_state: S(t+1)
            done: (bool) Whether the state is terminal.

        """
        self.iteration += 1
        state = to_tensor(self.state_transform(state)).float().to("cpu")
        next_state = to_tensor(self.state_transform(next_state)).float().to("cpu")
        reward = self.reward_transform(reward)

        # Delay adding to buffer to account for n_steps (particularly the reward)
        self.n_buffer.add(state=state.numpy(), action=[int(action)], reward=[reward], done=[done], next_state=next_state.numpy())
        if not self.n_buffer.available:
            return

        self.buffer.add(**self.n_buffer.get().get_dict())

        if self.iteration < self.warm_up:
            return

        if len(self.buffer) >= self.batch_size and (self.iteration % self.update_freq) == 0:
            for _ in range(self.number_updates):
                self.learn(self.buffer.sample())

            # Update networks only once - sync local & target
            soft_update(self.target_net, self.net, self.tau)
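
The `n_buffer` above delays writes to the replay buffer so that the stored reward spans `n_steps` transitions. A rough sketch of such an accumulator is below, assuming the `add`/`available`/`get` interface seen above; the real buffer returns an experience object with `get_dict()` and handles episode boundaries, both of which this sketch omits.

from collections import deque

class NStepBuffer:
    """Minimal n-step accumulator sketch (interface assumed from the usage above)."""

    def __init__(self, n_steps: int, gamma: float):
        self.n_steps = n_steps
        self.gamma = gamma
        self.queue = deque(maxlen=n_steps)

    def add(self, **experience) -> None:
        self.queue.append(experience)

    @property
    def available(self) -> bool:
        # Only release an experience once n_steps transitions have been collected.
        return len(self.queue) == self.n_steps

    def get(self) -> dict:
        # Discounted sum of the next n rewards; state/action come from the oldest
        # transition, next_state/done from the newest one.
        reward = sum(exp['reward'][0] * self.gamma ** i for i, exp in enumerate(self.queue))
        first, last = self.queue[0], self.queue[-1]
        self.queue.popleft()
        return dict(state=first['state'], action=first['action'], reward=[reward],
                    done=last['done'], next_state=last['next_state'])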
Example #3
    def learn(self, experiences: Dict[str, List]) -> None:
        """
        Parameters:
            experiences: Contains all experiences for the agent. Typically sampled from the memory buffer.
                Five keys are expected, i.e. `state`, `action`, `reward`, `next_state`, `done`.
                Each key contains an array, and all arrays must have the same length.

        """
        rewards = to_tensor(experiences['reward']).float().to(self.device)
        dones = to_tensor(experiences['done']).type(torch.int).to(self.device)
        states = to_tensor(experiences['state']).float().to(self.device)
        next_states = to_tensor(experiences['next_state']).float().to(self.device)
        actions = to_tensor(experiences['action']).type(torch.long).to(self.device)
        assert rewards.shape == dones.shape == (self.batch_size, 1)
        assert states.shape == next_states.shape == (self.batch_size, self.state_size)
        assert actions.shape == (self.batch_size, 1)  # Discrete domain

        with torch.no_grad():
            prob_next = self.target_net.act(next_states)
            q_next = (prob_next * self.z_atoms).sum(-1) * self.z_delta
            if self.using_double_q:
                duel_prob_next = self.net.act(next_states)
                a_next = torch.argmax((duel_prob_next * self.z_atoms).sum(-1), dim=-1)
            else:
                a_next = torch.argmax(q_next, dim=-1)

            prob_next = prob_next[self.__batch_indices, a_next, :]

        m = self.net.dist_projection(rewards, 1 - dones, self.gamma ** self.n_steps, prob_next)
        assert m.shape == (self.batch_size, self.num_atoms)

        log_prob = self.net(states, log_prob=True)
        assert log_prob.shape == (self.batch_size, self.action_size, self.num_atoms)
        log_prob = log_prob[self.__batch_indices, actions.squeeze(), :]
        assert log_prob.shape == m.shape == (self.batch_size, self.num_atoms)

        # Per-sample cross-entropy error; the loss is the batch mean
        error = -torch.sum(m * log_prob, 1)
        assert error.shape == (self.batch_size,)
        loss = error.mean()
        assert loss >= 0

        self.optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(self.net.parameters(), self.max_grad_norm)
        self.optimizer.step()
        self._loss = float(loss.item())

        if hasattr(self.buffer, 'priority_update'):
            assert (~torch.isnan(error)).any()
            self.buffer.priority_update(experiences['index'], error.detach().cpu().numpy())

        # Update networks - sync local & target
        soft_update(self.target_net, self.net, self.tau)
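
`self.net.dist_projection` projects the Bellman-updated atom support back onto the fixed atoms, as in categorical (C51) DQN. Its implementation is not shown; the free-standing sketch below illustrates the standard projection, with `z_atoms`, `v_min` and `v_max` passed explicitly (an assumption, since the real method presumably reads them from the network).

import torch

def dist_projection(rewards, masks, discount, prob_next, z_atoms, v_min, v_max):
    """Standard C51 projection sketch; the network method above may differ in signature."""
    batch_size, num_atoms = prob_next.shape
    z_delta = (v_max - v_min) / (num_atoms - 1)

    # Bellman-updated atom positions, clamped to the support [v_min, v_max].
    Tz = (rewards + masks * discount * z_atoms.view(1, -1)).clamp(v_min, v_max)
    b = (Tz - v_min) / z_delta
    lower, upper = b.floor().long(), b.ceil().long()
    # Keep probability mass when b lands exactly on an atom (lower == upper).
    lower[(upper > 0) & (lower == upper)] -= 1
    upper[(lower < num_atoms - 1) & (lower == upper)] += 1

    m = torch.zeros_like(prob_next)
    offset = torch.arange(batch_size, device=prob_next.device).unsqueeze(1) * num_atoms
    m.view(-1).index_add_(0, (lower + offset).view(-1), (prob_next * (upper.float() - b)).view(-1))
    m.view(-1).index_add_(0, (upper + offset).view(-1), (prob_next * (b - lower.float())).view(-1))
    return m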
Example #4
def test_to_tensor_numpy():
    # Assign
    int_l = np.array([0, 1, 2, 3])
    float_l = np.random.random(4)

    # Act
    int_t = to_tensor(int_l)
    float_t = to_tensor(float_l)

    # Assert
    assert torch.equal(torch.tensor(int_l), int_t)
    assert torch.equal(torch.tensor(float_l), float_t)
Example #5
    def learn(self, experiences, agent_name: str) -> None:
        """update the critics and actors of all the agents """

        # TODO: Just look at this mess.
        agent_number = list(self.agents).index(agent_name)
        agent_rewards = to_tensor(experiences['reward']).select(1, agent_number).unsqueeze(-1).float().to(self.device)
        agent_dones = to_tensor(experiences['done']).select(1, agent_number).unsqueeze(-1).type(torch.int).to(self.device)
        states = to_tensor(experiences['state']).to(self.device).view(self.batch_size, self.num_agents, self.state_size)
        actions = to_tensor(experiences['action']).to(self.device)
        next_states = to_tensor(experiences['next_state']).float().to(self.device).view(self.batch_size, self.num_agents, self.state_size)
        flat_states = states.view(-1, self.num_agents*self.state_size)
        flat_next_states = next_states.view(-1, self.num_agents*self.state_size)
        flat_actions = actions.view(-1, self.num_agents*self.action_size)
        assert agent_rewards.shape == agent_dones.shape == (self.batch_size, 1)
        assert states.shape == next_states.shape == (self.batch_size, self.num_agents, self.state_size)
        assert actions.shape == (self.batch_size, self.num_agents, self.action_size)
        assert flat_actions.shape == (self.batch_size, self.num_agents*self.action_size)

        agent = self.agents[agent_name]

        next_actions = actions.detach().clone()
        next_actions.data[:, agent_number] = agent.target_actor(next_states[:, agent_number, :])
        assert next_actions.shape == (self.batch_size, self.num_agents, self.action_size)

        # critic loss
        Q_target_next = self.target_critic(flat_next_states, self.__flatten_actions(next_actions))
        Q_target = agent_rewards + (self.gamma * Q_target_next * (1 - agent_dones))
        Q_expected = self.critic(flat_states, flat_actions)
        loss_critic = F.mse_loss(Q_expected, Q_target)

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        loss_critic.backward()
        if self.gradient_clip:
            nn.utils.clip_grad_norm_(self.critic.parameters(), self.gradient_clip)
        self.critic_optimizer.step()
        self._loss_critic = float(loss_critic.mean().item())

        # Compute actor loss
        pred_actions = actions.detach().clone()
        # pred_actions.data[:, agent_number] = agent.actor(flat_states)
        pred_actions.data[:, agent_number] = agent.actor(states[:, agent_number, :])

        loss_actor = -self.critic(flat_states, self.__flatten_actions(pred_actions)).mean()
        agent.actor_optimizer.zero_grad()
        loss_actor.backward()
        agent.actor_optimizer.step()
        self._loss_actor[agent_name] = loss_actor.mean().item()
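
`self.__flatten_actions` is not shown; given the shape assertions above, it presumably just collapses the per-agent action dimension so the centralized critic sees one flat vector per sample. A sketch under that assumption:

    def __flatten_actions(self, actions):
        # (batch, num_agents, action_size) -> (batch, num_agents * action_size)
        return actions.contiguous().view(-1, self.num_agents * self.action_size)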
Example #6
    def act(self, state, epsilon: float = 0.):
        actions = []
        logprobs = []
        values = []
        state = to_tensor(state).view(self.num_workers,
                                      self.state_size).float().to(self.device)
        for worker in range(self.num_workers):
            actor_est = self.actor.act(state[worker].unsqueeze(0))
            assert not torch.any(torch.isnan(actor_est))

            dist = self.policy(actor_est)
            action = dist.sample()
            value = self.critic.act(
                state[worker].unsqueeze(0))  # Shape: (1, 1)
            logprob = self.policy.log_prob(dist, action)  # Shape: (1,)
            values.append(value)
            logprobs.append(logprob)

            if self.is_discrete:  # *Technically* it's the max of Softmax but that's monotonic.
                action = int(torch.argmax(action))
            else:
                action = torch.clamp(action * self.action_scale,
                                     self.action_min, self.action_max)
                action = action.cpu().numpy().flatten().tolist()
            actions.append(action)

        self.local_memory_buffer['value'] = torch.cat(values)
        self.local_memory_buffer['logprob'] = torch.stack(logprobs)
        assert len(actions) == self.num_workers
        return actions if self.num_workers > 1 else actions[0]
Example #7
    def target_act(self, staten, noise: float = 0.0):
        with torch.no_grad():
            staten = to_tensor(staten).float().to(self.device)
            action = self.target_actor(staten) + noise * self.noise.sample()
            return torch.clamp(action, self.action_min,
                               self.action_max).cpu().numpy().astype(
                                   np.float32)
Example #8
    def act(self, state, epsilon: float = 0.) -> List[float]:
        """
        Returns actions for given state as per current policy.

        Parameters:
            state: Current available state from the environment.
            epsilon: Epsilon value in the epsilon-greedy policy.

        """
        actions = []
        state = to_tensor(state).view(self.num_workers,
                                      self.state_size).float().to(self.device)
        for worker in range(self.num_workers):
            if self._rng.random() < epsilon:
                action = self.action_scale * (torch.rand(self.action_size) -
                                              0.5)
            else:
                action_seed = self.actor.act(state[worker].view(1, -1))
                action_dist = self.policy(action_seed)
                action = action_dist.sample()
                action *= self.action_scale
                action = torch.clamp(action.squeeze(), self.action_min,
                                     self.action_max).cpu()
            actions.append(action.tolist())

        assert len(actions) == self.num_workers
        return actions
Example #9
    def act(self, state, epsilon: float = 0.0) -> List[float]:
        """
        Returns actions for given state as per current policy.

        Parameters:
            state: Current available state from the environment.
            epsilon: Epsilon value in the epsilon-greedy policy.

        """
        state = to_tensor(state).float().to(self.device)
        if self._rng.random() < epsilon:
            action = self.action_scale * (torch.rand(self.action_size) - 0.5)

        else:
            action_seed = self.actor.act(state).view(1, -1)
            action_dist = self.policy(action_seed)
            action = action_dist.sample()
            action *= self.action_scale
            action = action.squeeze()

        # Purely for logging
        self._display_dist = self.target_critic.act(
            state, action.to(self.device)).squeeze().cpu()
        self._display_dist = F.softmax(self._display_dist, dim=0)

        return torch.clamp(action, self.action_min,
                           self.action_max).cpu().tolist()
Example #10
def test_to_tensor_list_tensors():
    # Assign
    int_l = [torch.tensor([0, 1, 2, 3]), torch.tensor([1, 2, 5, 10])]
    float_l = [
        torch.tensor([0.5, 1.1, 2.9, 3.0]),
        torch.tensor([0.1, 0.1, 0.1, 10.0])
    ]

    # Act
    int_t = to_tensor(int_l)
    float_t = to_tensor(float_l)

    # Assert
    assert torch.equal(torch.stack(int_l), int_t)
    assert torch.equal(torch.stack(float_l), float_t)

    assert int_t.shape == (2, 4)
    assert float_t.shape == (2, 4)
Example #11
def test_to_tensor_tensor():
    # Assign
    t = torch.tensor([0, 1, 2, 3])

    # Act
    new_t = to_tensor(t)

    # Assert
    assert torch.equal(new_t, t)
Example #12
def test_to_tensor_list():
    # Assign
    int_l = [0, 1, 2, 3]
    float_l = [0.5, 1.1, 2.9, 3.0]
    int_l_shape = [list(range(4 * i, 4 * (i + 1))) for i in range(5)]

    # Act
    int_t = to_tensor(int_l)
    float_t = to_tensor(float_l)
    int_t_shape = to_tensor(int_l_shape)

    # Assert
    assert torch.equal(torch.tensor(int_l), int_t)
    assert torch.equal(torch.tensor(float_l), float_t)
    assert torch.equal(torch.tensor(int_l_shape), int_t_shape)

    assert int_t.shape == (4, )
    assert float_t.shape == (4, )
    assert int_t_shape.shape == (5, 4)
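
Examples #4, #10, #11 and #12 pin down the behaviour `to_tensor` is expected to have: tensors pass through unchanged, lists of tensors are stacked, and everything else goes through `torch.tensor`. A minimal sketch that satisfies these tests (the library's version likely handles more input types and dtypes):

import torch

def to_tensor(x):
    """Minimal sketch consistent with the tests above."""
    if isinstance(x, torch.Tensor):
        return x
    if isinstance(x, (list, tuple)) and len(x) > 0 and isinstance(x[0], torch.Tensor):
        return torch.stack(list(x))
    return torch.tensor(x)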
Example #13
    def learn(self, experiences):
        """Update critics and actors"""
        rewards = to_tensor(experiences['reward']).float().to(
            self.device).unsqueeze(1)
        dones = to_tensor(experiences['done']).type(torch.int).to(
            self.device).unsqueeze(1)
        states = to_tensor(experiences['state']).float().to(self.device)
        actions = to_tensor(experiences['action']).to(self.device)
        next_states = to_tensor(experiences['next_state']).float().to(
            self.device)

        if (self.iteration % self.update_freq) == 0:
            self._update_value_function(states, actions, rewards, next_states,
                                        dones)

        if (self.iteration % self.update_policy_freq) == 0:
            self._update_policy(states)

            soft_update(self.target_actor, self.actor, self.tau)
            soft_update(self.target_critic, self.critic, self.tau)
Example #14
    def act(self, obs, noise: float = 0.0) -> List[float]:
        """Acting on the observations. Returns action.

        Returns:
            action: (list float) Action values.
        """
        obs = to_tensor(obs).float().to(self.device)
        action = self.actor(obs)
        action += noise * self.noise.sample()
        action = torch.clamp(action * self.action_scale, self.action_min,
                             self.action_max)
        return action.cpu().numpy().tolist()
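
`self.noise.sample()` (also used in Examples #7 and #22) supplies exploration noise for the deterministic actor. The noise class is not shown; DDPG-style agents commonly use an Ornstein-Uhlenbeck process, sketched below with assumed default parameters.

import torch

class OUNoise:
    """Ornstein-Uhlenbeck exploration-noise sketch."""

    def __init__(self, size: int, mu: float = 0.0, theta: float = 0.15, sigma: float = 0.2):
        self.mu = mu * torch.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.state = self.mu.clone()

    def reset(self) -> None:
        self.state = self.mu.clone()

    def sample(self) -> torch.Tensor:
        # Mean-reverting random walk: dx = theta * (mu - x) + sigma * N(0, 1)
        dx = self.theta * (self.mu - self.state) + self.sigma * torch.randn_like(self.state)
        self.state = self.state + dx
        return self.state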
Example #15
    def learn(self, samples):
        """update the critics and actors of all the agents """

        rewards = to_tensor(samples['reward']).float().to(self.device).view(
            self.batch_size, 1)
        dones = to_tensor(samples['done']).int().to(self.device).view(
            self.batch_size, 1)
        states = to_tensor(samples['state']).float().to(self.device).view(
            self.batch_size, self.state_size)
        next_states = to_tensor(samples['next_state']).float().to(
            self.device).view(self.batch_size, self.state_size)
        actions = to_tensor(samples['action']).to(self.device).view(
            self.batch_size, self.action_size)

        # Critic (value) update
        for _ in range(self.critic_number_updates):
            value_loss, error = self.compute_value_loss(
                states, actions, rewards, next_states, dones)
            self.critic_optimizer.zero_grad()
            value_loss.backward()
            nn.utils.clip_grad_norm_(self.critic_params,
                                     self.max_grad_norm_critic)
            self.critic_optimizer.step()
            self._loss_critic = value_loss.item()

        # Actor (policy) update
        for _ in range(self.actor_number_updates):
            policy_loss = self.compute_policy_loss(states)
            self.actor_optimizer.zero_grad()
            policy_loss.backward()
            nn.utils.clip_grad_norm_(self.actor_params,
                                     self.max_grad_norm_actor)
            self.actor_optimizer.step()
            self._loss_actor = policy_loss.item()

        if hasattr(self.memory, 'priority_update'):
            assert any(~torch.isnan(error))
            self.memory.priority_update(samples['index'], error.abs())

        soft_update(self.target_double_critic, self.double_critic, self.tau)
Example #16
    def train(self):
        """
        Main loop that initiates the training.
        """
        experiences = self.buffer.all_samples()
        rewards = to_tensor(experiences['reward']).to(self.device)
        dones = to_tensor(experiences['done']).type(torch.int).to(self.device)
        states = to_tensor(experiences['state']).to(self.device)
        actions = to_tensor(experiences['action']).to(self.device)
        values = to_tensor(experiences['value']).to(self.device)
        logprobs = to_tensor(experiences['logprob']).to(self.device)
        assert rewards.shape == dones.shape == values.shape == logprobs.shape
        assert states.shape == (
            self.rollout_length, self.num_workers,
            self.state_size), f"Wrong states shape: {states.shape}"
        assert actions.shape == (
            self.rollout_length, self.num_workers,
            self.action_size), f"Wrong action shape: {actions.shape}"

        with torch.no_grad():
            if self.using_gae:
                next_value = self.critic.act(states[-1])
                advantages = compute_gae(rewards, dones, values, next_value,
                                         self.gamma, self.gae_lambda)
                advantages = normalize(advantages)
                returns = advantages + values
                # returns = normalize(advantages + values)
                assert advantages.shape == returns.shape == values.shape
            else:
                returns = revert_norm_returns(rewards, dones, self.gamma)
                returns = returns.float()
                advantages = normalize(returns - values)
                assert advantages.shape == returns.shape == values.shape

        for _ in range(self.num_epochs):
            idx = 0
            self.kl_div = 0
            while idx < self.rollout_length:
                _states = states[idx:idx + self.batch_size].view(
                    -1, self.state_size).detach()
                _actions = actions[idx:idx + self.batch_size].view(
                    -1, self.action_size).detach()
                _logprobs = logprobs[idx:idx + self.batch_size].view(
                    -1, 1).detach()
                _returns = returns[idx:idx + self.batch_size].view(-1,
                                                                   1).detach()
                _advantages = advantages[idx:idx + self.batch_size].view(
                    -1, 1).detach()
                idx += self.batch_size
                self.learn(
                    (_states, _actions, _logprobs, _returns, _advantages))

            self.kl_div = abs(
                self.kl_div) / (self.actor_number_updates *
                                self.rollout_length / self.batch_size)
            if self.kl_div > self.target_kl * 1.75:
                self.kl_beta = min(2 * self.kl_beta, 1e2)  # Max 100
            if self.kl_div < self.target_kl / 1.75:
                self.kl_beta = max(0.5 * self.kl_beta, 1e-6)  # Min 0.000001
            self._metrics['policy/kl_beta'] = self.kl_beta
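
`compute_gae` is not shown; below is a generic Generalized Advantage Estimation sketch matching the positional call above, assuming `rewards`, `dones` and `values` share a `(rollout_length, num_workers, 1)` layout and `next_value` is the bootstrap value for the step after the rollout.

import torch

def compute_gae(rewards, dones, values, next_value, gamma=0.99, lamb=0.9):
    """GAE sketch; the signature is assumed from the call above."""
    masks = 1 - dones.float()
    values = torch.cat((values, next_value.unsqueeze(0)))
    advantages = torch.zeros_like(rewards)
    gae = torch.zeros_like(next_value)
    # Walk the rollout backwards, accumulating discounted TD residuals.
    for t in reversed(range(rewards.shape[0])):
        delta = rewards[t] + gamma * values[t + 1] * masks[t] - values[t]
        gae = delta + gamma * lamb * masks[t] * gae
        advantages[t] = gae
    return advantages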
Example #17
    def learn(self, experiences: Dict[str, list]) -> None:
        """Updates agent's networks based on provided experience.

        Parameters:
            experiences: Samples experiences from the experience buffer.

        """
        rewards = to_tensor(experiences['reward']).type(torch.float32).to(
            self.device)
        dones = to_tensor(experiences['done']).type(torch.int).to(self.device)
        states = to_tensor(experiences['state']).type(torch.float32).to(
            self.device)
        next_states = to_tensor(experiences['next_state']).type(
            torch.float32).to(self.device)
        actions = to_tensor(experiences['action']).type(torch.long).to(
            self.device)

        with torch.no_grad():
            Q_targets_next = self.target_net.act(next_states).detach()
            if self.using_double_q:
                _a = torch.argmax(self.net(next_states), dim=-1).unsqueeze(-1)
                max_Q_targets_next = Q_targets_next.gather(1, _a)
            else:
                max_Q_targets_next = Q_targets_next.max(1)[0].unsqueeze(1)
        Q_targets = rewards + self.n_buffer.n_gammas[
            -1] * max_Q_targets_next * (1 - dones)
        Q_expected: torch.Tensor = self.net(states).gather(1, actions)
        loss = F.mse_loss(Q_expected, Q_targets)

        self.optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(self.net.parameters(), self.max_grad_norm)
        self.optimizer.step()
        self._loss = float(loss.item())

        if hasattr(self.buffer, 'priority_update'):
            error = Q_expected - Q_targets
            assert any(~torch.isnan(error))
            self.buffer.priority_update(experiences['index'], error.abs())
Example #18
    def learn(self, experiences) -> None:
        """Update critics and actors"""
        rewards = to_tensor(experiences['reward']).float().to(
            self.device).unsqueeze(1)
        dones = to_tensor(experiences['done']).type(torch.int).to(
            self.device).unsqueeze(1)
        states = to_tensor(experiences['state']).float().to(self.device)
        actions = to_tensor(experiences['action']).to(self.device)
        next_states = to_tensor(experiences['next_state']).float().to(
            self.device)
        assert rewards.shape == dones.shape == (self.batch_size, 1)
        assert states.shape == next_states.shape == (self.batch_size,
                                                     self.state_size)
        assert actions.shape == (self.batch_size, self.action_size)

        # Value (critic) optimization
        loss_critic = self.compute_value_loss(states, actions, next_states,
                                              rewards, dones)
        self.critic_optimizer.zero_grad()
        loss_critic.backward()
        nn.utils.clip_grad_norm_(self.critic.parameters(),
                                 self.max_grad_norm_critic)
        self.critic_optimizer.step()
        self._loss_critic = float(loss_critic.item())

        # Policy (actor) optimization
        loss_actor = self.compute_policy_loss(states)
        self.actor_optimizer.zero_grad()
        loss_actor.backward()
        nn.utils.clip_grad_norm_(self.actor.parameters(),
                                 self.max_grad_norm_actor)
        self.actor_optimizer.step()
        self._loss_actor = loss_actor.item()

        # Soft update target weights
        soft_update(self.target_actor, self.actor, self.tau)
        soft_update(self.target_critic, self.critic, self.tau)
Example #19
    def act(self,
            state,
            epsilon: float = 0.0,
            deterministic=False) -> List[float]:
        if self.iteration < self.warm_up or self._rng.random() < epsilon:
            random_action = torch.rand(self.action_size) * (
                self.action_max - self.action_min) + self.action_min
            return random_action.cpu().tolist()

        state = to_tensor(state).view(1,
                                      self.state_size).float().to(self.device)
        proto_action = self.actor(state)
        action = self.policy(proto_action, deterministic)

        return action.flatten().tolist()
Example #20
    def act(self, state, eps: float = 0.) -> int:
        """
        Returns actions for given state as per current policy.

        Parameters:
            state: Current available state from the environment.
            eps: Epsilon value in the epsilon-greedy policy.

        """
        # Epsilon-greedy action selection
        if self._rng.random() < eps:
            return self._rng.randint(0, self.action_size-1)

        state = to_tensor(self.state_transform(state)).float().unsqueeze(0).to(self.device)
        # state = to_tensor(self.state_transform(state)).float().to(self.device)
        self.dist_probs = self.net.act(state)
        q_values = (self.dist_probs * self.z_atoms).sum(-1)
        return int(q_values.argmax(-1))  # Action maximizes state-action value Q(s, a)
Example #21
    def act(self, state, eps: float = 0.) -> int:
        """Returns actions for given state as per current policy.

        Parameters:
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection

        Returns:
            Categorical value for the action.

        """
        # Epsilon-greedy action selection
        if self._rng.random() < eps:
            return self._rng.randint(0, self.action_size - 1)

        state = to_tensor(self.state_transform(state)).float()
        state = state.unsqueeze(0).to(self.device)
        action_values = self.net.act(state)
        return int(torch.argmax(action_values.cpu()))
Example #22
    def act(self,
            state,
            epsilon: float = 0.0,
            training_mode=True) -> List[float]:
        """
        Agent acting on observations.

        When the training_mode is True (default) a noise is added to each action.
        """
        # Epsilon greedy
        if self._rng.random() < epsilon:
            rnd_actions = torch.rand(self.action_size) * (
                self.action_max - self.action_min) + self.action_min
            return rnd_actions.tolist()

        with torch.no_grad():
            state = to_tensor(state).float().to(self.device)
            action = self.actor(state)
            if training_mode:
                action += self.noise.sample()
            return (self.action_scale * torch.clamp(action, self.action_min,
                                                    self.action_max)).tolist()