Example #1
    def explore(self, actor_critic):
        """
        Explore an environment by taking a sequence of actions and saving the results in the memory.

        Parameters
        ----------
        actor_critic : ActorCritic
            The actor-critic model to use to explore.
        """
        state = torch.FloatTensor(self.env.env.state)
        trajectory = []
        for step in range(MAX_STEPS_BEFORE_UPDATE):
            # Sample a discrete action from the categorical policy output and
            # keep the action probabilities as exploration statistics.
            action_probabilities, *_ = actor_critic(Variable(state))
            action = action_probabilities.multinomial()
            action = action.data
            exploration_statistics = action_probabilities.data.view(1, -1)
            next_state, reward, done, _ = self.env.step(action.numpy()[0])
            next_state = torch.from_numpy(next_state).float()
            if self.render:
                self.env.render()
            transition = replay_memory.Transition(
                states=state.view(1, -1),
                actions=action.view(1, -1),
                rewards=torch.FloatTensor([[reward]]),
                next_states=next_state.view(1, -1),
                done=torch.FloatTensor([[done]]),
                exploration_statistics=exploration_statistics)
            self.buffer.add(transition)
            trajectory.append(transition)
            if done:
                self.env.reset()
                break
            else:
                state = next_state
        return trajectory
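Both explore variants above assume a replay_memory module that exposes a Transition record and a buffer with an add method. A minimal sketch of what such a module might look like, built from a plain namedtuple and a bounded deque, follows; the field names are taken from the Transition(...) calls in Examples #1 and #2 (Example #3 uses the singular names state, action, reward, next_state instead), and everything else is an assumption rather than the original implementation.

# Hypothetical sketch of the replay_memory module assumed above; only the
# field names come from the examples, the rest is an assumption.
import random
from collections import deque, namedtuple

Transition = namedtuple(
    "Transition",
    ["states", "actions", "rewards", "next_states", "done",
     "exploration_statistics"])


class ReplayBuffer:
    """Bounded FIFO buffer with uniform sampling."""

    def __init__(self, capacity=100000):
        self.memory = deque(maxlen=capacity)

    def add(self, transition):
        self.memory.append(transition)

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)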
Example #2
    def explore(self, actor_critic, noise_ratio=0.):
        """
        Explore an environment by taking a sequence of actions and saving the results in the memory.

        Parameters
        ----------
        actor_critic : ActorCritic
            The actor-critic model to use to explore.
        noise_ratio : float in [0, 1], optional
            What fraction of the action should be exploration noise?
        """
        #state = torch.FloatTensor(self.env.env.state)
        if self.m_state is None:
            self.m_state = self.env.reset()
        state = torch.FloatTensor(self.m_state)
        trajectory = []
        for step in range(MAX_STEPS_BEFORE_UPDATE):
            policy_mean, *_ = actor_critic(Variable(state))
            policy_logsd = actor_critic.policy_logsd
            action = torch.normal(policy_mean.data,
                                  torch.exp(policy_logsd.data))

            # Blend the policy sample with exploration noise.
            noise_mean, noise_sd = self.noise.sampling_parameters()
            noise = torch.from_numpy(self.noise.sample()).float()
            action = noise_ratio * noise + (1. - noise_ratio) * action
            sampling_mean = noise_ratio * torch.from_numpy(
                noise_mean).float() + (1. - noise_ratio) * policy_mean.data
            sampling_logsd = 0.5 * torch.log(
                noise_ratio**2 * torch.from_numpy(noise_sd).float().pow(2) +
                (1. - noise_ratio)**2 * torch.exp(2 * policy_logsd.data))
            exploration_statistics = torch.cat(
                [sampling_mean.view(1, -1),
                 sampling_logsd.view(1, -1)], dim=1)

            low = float(self.env.action_space.low[0])
            high = float(self.env.action_space.high[0])
            scaled_action = low + (high - low) * torch.sigmoid(action)
            next_state, reward, done, _ = self.env.step(scaled_action.numpy())
            next_state = torch.from_numpy(next_state).float()
            if self.render:
                self.env.render()
            transition = replay_memory.Transition(
                states=state.view(1, -1),
                actions=action.view(1, -1),
                rewards=torch.FloatTensor([[reward]]),
                next_states=next_state.view(1, -1),
                done=torch.FloatTensor([[done]]),
                exploration_statistics=exploration_statistics)
            self.buffer.add(transition)
            trajectory.append(transition)
            if done:
                self.m_state = self.env.reset()
                break
            else:
                state = next_state
        return trajectory
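The exploration-noise blend in Example #2 relies on the fact that a weighted sum of two independent Gaussian samples is itself Gaussian: the means combine linearly and the variances combine with squared weights, which is exactly what the sampling_mean and sampling_logsd lines compute. A small self-contained sketch of that calculation in plain NumPy (a hypothetical helper, not part of the original code):

# Hypothetical helper illustrating the mixing formula used above.
import numpy as np


def blended_gaussian(policy_mean, policy_sd, noise_mean, noise_sd, noise_ratio):
    """Parameters of w*noise + (1-w)*policy for independent Gaussians."""
    w = noise_ratio
    mean = w * noise_mean + (1.0 - w) * policy_mean
    # Var(wX + (1-w)Y) = w^2 Var(X) + (1-w)^2 Var(Y) for independent X, Y.
    sd = np.sqrt(w ** 2 * noise_sd ** 2 + (1.0 - w) ** 2 * policy_sd ** 2)
    return mean, sd


# With noise_ratio=0 this reduces to the policy distribution,
# with noise_ratio=1 to the noise distribution.
print(blended_gaussian(0.0, 1.0, 2.0, 0.5, 0.25))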
Example #3
    def optimize_model(self):
        if len(self.memory) < self.batch_size:
            return
        transitions = self.memory.sample(self.batch_size)
        # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
        # detailed explanation). This converts batch-array of Transitions
        # to Transition of batch-arrays.
        batch = replay_memory.Transition(*zip(*transitions))

        # Compute a mask of non-final states and concatenate the batch elements
        # (a final state would've been the one after which simulation ended)
        non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
                                              batch.next_state)), device=self.device, dtype=torch.bool)
        non_final_next_states = torch.cat([s for s in batch.next_state
                                                    if s is not None])
        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)
        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
        # columns of actions taken. These are the actions which would've been taken
        # for each batch state according to policy_net
        state_action_values = self.policy_net(state_batch).gather(1, action_batch)

        # Compute V(s_{t+1}) for all next states.
        # Expected values of actions for non_final_next_states are computed based
        # on the "older" target_net; selecting their best reward with max(1)[0].
        # This is merged based on the mask, such that we'll have either the expected
        # state value or 0 in case the state was final.
        next_state_values = torch.zeros(self.batch_size, device=self.device)
        next_state_values[non_final_mask] = self.target_net(non_final_next_states).max(1)[0].detach()
        # Compute the expected Q values
        expected_state_action_values = (next_state_values * self.gamma) + reward_batch

        # Compute Huber loss
        loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(1))

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()
    def optimize_model(self):
        if self.test_mode:
            print("Testing Mode")
            return
        if len(self.memory) < self.batch_size:
            print("Skipping:", len(self.memory))
            return

        transitions = self.memory.sample(self.batch_size)
        # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
        # detailed explanation). This converts batch-array of Transitions
        # to Transition of batch-arrays.
        batch = replay_memory.Transition(*zip(*transitions))

        # Compute a mask of non-final states and concatenate the batch elements
        # (a final state would've been the one after which simulation ended)
        non_final_mask = torch.tensor(tuple(
            map(lambda s: s is not None, batch.next_state)),
                                      device=self.device,
                                      dtype=torch.bool)
        non_final_next_states = torch.cat(
            [s.view(1, -1) for s in batch.next_state if s is not None], dim=0)
        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward).float()

        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
        # columns of actions taken. These are the actions which would've been taken
        # for each batch state according to policy_net
        state_action_values = self.policy_net(state_batch).gather(
            1, action_batch)

        # Compute V(s_{t+1}) for all next states.
        # Expected values of actions for non_final_next_states are computed based
        # on the "older" target_net; selecting their best reward with max(1)[0].
        # This is merged based on the mask, such that we'll have either the expected
        # state value or 0 in case the state was final.
        next_state_values = torch.zeros(self.batch_size, device=self.device)
        next_state_values[non_final_mask] = self.target_net(
            non_final_next_states).view(-1, self.n_actions).max(1)[0].detach()
        # Compute the expected Q values
        expected_state_action_values = (next_state_values *
                                        self.gamma) + reward_batch

        # Compute MSE loss
        loss = F.mse_loss(state_action_values,
                          expected_state_action_values.unsqueeze(1))
        if self.loss_count == 0:
            # Log a detached scalar so the computation graph is not retained.
            self.loss_graph.append(loss.item())
            if len(self.rewards_cache) > 10:
                self.rewards_graph.append(np.mean(np.array(
                    self.rewards_cache)))
                self.rewards_cache = []
            self.saveGraph()
            self.saveModel('/home/alvin/Desktop/MRSD_ws/rl_model.pt')
        self.loss_count += 1
        if self.loss_count == 20:
            self.loss_count = 0

        if self.loss_count % self.target_update == 0:
            print("Update Target Network")
            self.updateTargetNetwork()
        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        print("Optimizing")
        self.optimizer.step()
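The core of both optimize_model variants is the bootstrapped target r + gamma * max_a Q_target(s', a), where the maximum is zeroed out for terminal states via the non-final mask. The following toy snippet isolates just that masking step, with random tensors standing in for the network outputs; it is an illustration under assumed shapes, not the original agent code.

# Toy illustration of the masked bootstrap target; tensors are random stand-ins.
import torch

batch_size, n_actions, gamma = 4, 2, 0.99

non_final_mask = torch.tensor([True, False, True, True])   # sample 1 is terminal
q_next = torch.rand(int(non_final_mask.sum()), n_actions)  # Q_target(s', .) for non-final states
reward_batch = torch.rand(batch_size)

# max_a Q_target(s', a) for non-final states, 0 for terminal ones.
next_state_values = torch.zeros(batch_size)
next_state_values[non_final_mask] = q_next.max(1)[0]

# Terminal states contribute only their immediate reward.
expected_state_action_values = reward_batch + gamma * next_state_values
print(expected_state_action_values)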