Code Example #1
    def learn(self, replay_buffer, env):
        self.it += 1

        # sample a minibatch of transitions from the replay buffer
        states, actions, next_states, rewards, dones = replay_buffer.sample(self.batch_size)

        states = states.squeeze()
        actions = actions.squeeze()
        next_states = next_states.squeeze()
        rewards = rewards.squeeze()
        dones = dones.squeeze()

        # discrete states are stored as indices; convert them to one-hot
        # vectors before feeding them to the networks
        if env.has_discrete_state_space():
            states = to_one_hot_encoding(states, self.state_dim)
            next_states = to_one_hot_encoding(next_states, self.state_dim)

        # Double DQN: the online network selects the greedy next action,
        # the target network evaluates it
        q_values = self.model(states)
        next_q_values = self.model(next_states)
        next_q_state_values = self.model_target(next_states)

        # Q(s, a) for the actions actually taken
        q_value = q_values.gather(1, actions.long().unsqueeze(1)).squeeze(1)
        next_q_value = next_q_state_values.gather(1, next_q_values.max(1)[1].unsqueeze(1)).squeeze(1)
        # TD target: y = r + gamma * (1 - done) * Q_target(s', argmax_a Q_online(s', a))
        expected_q_value = rewards + self.gamma * next_q_value * (1 - dones)
        loss = F.mse_loss(q_value, expected_q_value.detach())

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # soft (Polyak) target network update: theta_target <- tau * theta + (1 - tau) * theta_target
        for target_param, param in zip(self.model_target.parameters(), self.model.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)
        return loss
Code Example #2
    def reset(self):
        self.state = torch.tensor(self.reset_env.reset())
        # keep the stored state consistent with state_dim: convert between
        # one-hot and index representations as needed
        if len(self.state) > self.state_dim:
            self.state = from_one_hot_encoding(self.state)
        elif len(self.state) < self.state_dim:
            self.state = to_one_hot_encoding(self.state, self.state_dim)
        return self.state
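
The snippets rely on to_one_hot_encoding and from_one_hot_encoding helpers that are not shown here. A minimal sketch of what such helpers might look like, assuming states are integer indices (or tensors of indices) and 1-D one-hot float vectors; this is an assumption, not the project's actual implementation:

import torch
import torch.nn.functional as F

def to_one_hot_encoding(state, state_dim):
    # Hypothetical helper: turn an integer index (or a tensor of indices)
    # into a float one-hot vector of length state_dim.
    state = torch.as_tensor(state, dtype=torch.long)
    return F.one_hot(state, num_classes=state_dim).float()

def from_one_hot_encoding(state):
    # Hypothetical inverse: recover the index from a one-hot vector.
    return torch.argmax(state, dim=-1, keepdim=True).float()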
Code Example #3
    def select_train_action(self, state, env):
        # epsilon-greedy exploration during training
        if random.random() < self.eps:
            return env.get_random_action()
        else:
            if env.has_discrete_state_space():
                state = to_one_hot_encoding(state, self.state_dim)
            qvals = self.model(state.to(self.device))
            return torch.argmax(qvals).unsqueeze(0).detach()
Code Example #4
    def step(self, action, state=None):
        if self.is_virtual_env():
            # the virtual environment works directly on torch tensors on its own device
            reward_sum = None

            if self.has_discrete_action_space():
                action = to_one_hot_encoding(action, self.get_action_dim())

            if state is None:
                for i in range(self.same_action_num):
                    state, reward, done = self.env.step(
                        action=action.to(self.env.device))
                    if reward_sum is None:
                        reward_sum = reward
                    else:
                        reward_sum += reward
                    # TODO: proper handling of the done flag for a batch of states/actions if same_action_num > 1
            # required for the histogram experiment
            else:
                for i in range(self.same_action_num):
                    state, reward, done = self.env.step(
                        action=action.to(self.env.device),
                        state=state.to(self.env.device))
                    if reward_sum is None:
                        reward_sum = reward
                    else:
                        reward_sum += reward
                    # TODO: proper handling of the done flag for a batch of states/actions if same_action_num > 1

            # todo: to device?
            reward = reward_sum.to("cpu")
            next_state = state.to("cpu")
            done = done.to("cpu")

            if self.has_discrete_state_space():
                next_state = from_one_hot_encoding(next_state)

            return next_state, reward, done

        else:
            # real (gym) environment: expects numpy actions and returns numpy/scalar values
            action = action.cpu().detach().numpy()
            if self.has_discrete_action_space():
                action = action.astype(int)[0]

            reward_sum = 0
            for i in range(self.same_action_num):
                state, reward, done, _ = self.env.step(action)
                reward_sum += reward
                if done:
                    break

            next_state_torch = torch.tensor(state,
                                            device="cpu",
                                            dtype=torch.float32)
            reward_torch = torch.tensor(reward_sum,
                                        device="cpu",
                                        dtype=torch.float32)
            done_torch = torch.tensor(done, device="cpu", dtype=torch.float32)

            if next_state_torch.dim() == 0:
                next_state_torch = next_state_torch.unsqueeze(0)

            return next_state_torch, reward_torch, done_torch
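
For context, the pieces above could be driven by an outer loop along the following lines. This is only a sketch: the agent, env_wrapper, and replay_buffer objects, the replay_buffer.add() signature, and the loop bounds are assumptions, not taken from the source.

def train(agent, env_wrapper, replay_buffer, num_episodes, max_steps):
    # Hypothetical driver loop; replay_buffer.add() is assumed to store
    # transitions in the same order that replay_buffer.sample() returns
    # them in learn() above.
    for episode in range(num_episodes):
        state = env_wrapper.reset()
        for t in range(max_steps):
            action = agent.select_train_action(state, env_wrapper)
            next_state, reward, done = env_wrapper.step(action)
            replay_buffer.add(state, action, next_state, reward, done)
            agent.learn(replay_buffer, env_wrapper)
            state = next_state
            if done:
                break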
Code Example #5
    def select_test_action(self, state, env):
        # greedy (deterministic) action selection for evaluation, no exploration
        if env.has_discrete_state_space():
            state = to_one_hot_encoding(state, self.state_dim)
        qvals = self.model(state.to(self.device))
        return torch.argmax(qvals).unsqueeze(0).detach()
Code Example #6
    def _calc_reward(self, state, next_state, reward, info):
        if 'TimeLimit.truncated' in info:  # remove additional information from wrapper
            info.pop('TimeLimit.truncated')

        reward_torch = torch.tensor(reward,
                                    device=self.device,
                                    dtype=torch.float32)

        if isinstance(state, int) or len(state) < self.state_dim:
            state_torch = to_one_hot_encoding(state, self.state_dim)
            next_state_torch = to_one_hot_encoding(next_state, self.state_dim)
        else:
            state_torch = torch.tensor(state,
                                       device=self.device,
                                       dtype=torch.float32)
            next_state_torch = torch.tensor(next_state,
                                            device=self.device,
                                            dtype=torch.float32)

        if self.reward_env_type == 0:
            # unmodified environment reward
            reward_res = reward_torch

        elif self.reward_env_type == 1:
            # potential-based shaping term only: gamma * phi(s') - phi(s),
            # with reward_net acting as the potential phi
            reward_res = self.gamma * self.reward_net(
                next_state_torch) - self.reward_net(state_torch)

        elif self.reward_env_type == 2:
            # environment reward plus the shaping term
            reward_res = reward_torch + self.gamma * self.reward_net(
                next_state_torch) - self.reward_net(state_torch)

        elif self.reward_env_type == 3 or self.reward_env_type == 4:
            if not info:
                raise ValueError('No info dict provided by environment')

            info_torch = torch.tensor(list(info.values()),
                                      device=self.device,
                                      dtype=torch.float32)
            input_state = torch.cat(
                (state_torch.to(self.device), info_torch.to(self.device)),
                dim=state_torch.dim() - 1)
            input_state_next = torch.cat(
                (next_state_torch.to(self.device), info_torch.to(self.device)),
                dim=state_torch.dim() - 1)

            if self.reward_env_type == 3:
                reward_res = self.gamma * self.reward_net(
                    input_state_next) - self.reward_net(input_state)
            elif self.reward_env_type == 4:
                reward_res = reward_torch + self.gamma * self.reward_net(
                    input_state_next) - self.reward_net(input_state)

        elif self.reward_env_type == 5:
            reward_res = self.reward_net(next_state_torch)

        elif self.reward_env_type == 6:
            reward_res = reward_torch + self.reward_net(next_state_torch)

        elif self.reward_env_type == 7 or self.reward_env_type == 8:
            if not info:
                raise ValueError('No info dict provided by environment')

            info_torch = torch.tensor(list(info.values()),
                                      device=self.device,
                                      dtype=torch.float32)
            input_state = torch.cat(
                (state_torch.to(self.device), info_torch.to(self.device)),
                dim=state_torch.dim() - 1)
            input_state_next = torch.cat(
                (next_state_torch.to(self.device), info_torch.to(self.device)),
                dim=state_torch.dim() - 1)

            if self.reward_env_type == 7:
                reward_res = self.reward_net(input_state_next)
            elif self.reward_env_type == 8:
                reward_res = reward_torch + self.reward_net(input_state_next)

        elif self.reward_env_type == 101 or self.reward_env_type == 102:
            if not info:
                raise ValueError('No info dict provided by environment')

            info_torch = torch.tensor(list(info.values()),
                                      device=self.device,
                                      dtype=torch.float32)

            if self.reward_env_type == 101:
                reward_res = self.reward_net(info_torch)
            elif self.reward_env_type == 102:
                reward_res = reward_torch + self.reward_net(info_torch)

        else:
            raise ValueError(
                f'Unknown reward_env_type: {self.reward_env_type}')

        return reward_res.item()
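
Read together, the reward_env_type branches fall into a few families: types 1-4 compute the potential-based shaping term gamma * phi(s') - phi(s) with reward_net as the (learned) potential phi, optionally added to the environment reward, while types 5-8 and 101/102 use reward_net as a direct reward predictor on the next state, the info-augmented state, or the info vector alone. A condensed, hypothetical restatement of that logic (not part of the source):

def combine_reward(reward_env_type, r, phi_s, phi_s_next, gamma):
    # phi_s / phi_s_next stand for reward_net applied to the (possibly
    # info-augmented) current and next state; for types 101/102 the net
    # sees only the info vector. r is the environment reward.
    if reward_env_type == 0:
        return r                               # unmodified reward
    if reward_env_type in (1, 3):
        return gamma * phi_s_next - phi_s      # shaping term only
    if reward_env_type in (2, 4):
        return r + gamma * phi_s_next - phi_s  # reward + shaping term
    if reward_env_type in (5, 7, 101):
        return phi_s_next                      # learned reward only
    if reward_env_type in (6, 8, 102):
        return r + phi_s_next                  # reward + learned reward
    raise ValueError(f'Unknown reward_env_type: {reward_env_type}')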