Example #1
    def step(self, reward, observation):
        # Phase 1: act only with the DQN agent.
        if self.episode_counter < episodes_only_dqn:
            action = BaseDynaAgent.step(self, reward, observation)
        # Phase 2: act only with the MCTS agent.
        elif self.episode_counter < episodes_only_dqn + episodes_only_mcts:
            action = MCTSAgent.step(self, reward, observation)
        # Phase 3: alternate between the two agents on even/odd episodes.
        else:
            if self.episode_counter % 2 == 0:
                action = BaseDynaAgent.step(self, reward, observation)
            else:
                action = MCTSAgent.step(self, reward, observation)
        return action
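
Example #1 switches agents by episode count, but the thresholds episodes_only_dqn and episodes_only_mcts are not defined in the snippet. A minimal, self-contained sketch of the same three-phase schedule, with hypothetical threshold values:

def choose_agent(episode_counter, episodes_only_dqn=100, episodes_only_mcts=100):
    """Return which agent acts in this episode (thresholds are illustrative)."""
    if episode_counter < episodes_only_dqn:
        return 'dqn'     # phase 1: DQN only
    if episode_counter < episodes_only_dqn + episodes_only_mcts:
        return 'mcts'    # phase 2: MCTS only
    return 'dqn' if episode_counter % 2 == 0 else 'mcts'  # phase 3: alternate

With the defaults above, episodes 0-99 use the DQN agent, episodes 100-199 use MCTS, and later episodes alternate by parity.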
Example #2
    def step(self, reward, observation):
        self.time_step += 1
        if self.episode_counter % 2 == 0:

            self.state = self.getStateRepresentation(observation)
            self.action = self.policy(self.state)

            reward = torch.tensor([reward], device=self.device)

            with torch.no_grad():
                real_prev_action = self.action_list[self.prev_action.item()]
                prev_state_value = self.getStateActionValue(
                    self.prev_state, real_prev_action).item()
                # max_a Q(s', a): max(1)[0] is the maximum value itself;
                # max(1)[1] would be the argmax index, not a value
                state_value = self._vf['q']['network'](self.state).max(1)[0].item()
                td_error = reward.item() + self.gamma * state_value - prev_state_value
                self.update_average_td_error(td_error)

            # store the new transition in buffer
            self.updateTransitionBuffer(
                utils.transition(self.prev_state, self.prev_action, reward,
                                 self.state, self.action, False,
                                 self.time_step, 0))

            # update target
            if self._target_vf['counter'] >= self._target_vf['update_rate']:
                self.setTargetValueFunction(self._vf['q'], 'q')
                # self.setTargetValueFunction(self._vf['s'], 's')

            # update value function with the buffer
            if self._vf['q']['training']:
                if len(self.transition_buffer) >= self._vf['q']['batch_size']:
                    transition_batch = self.getTransitionFromBuffer(
                        n=self._vf['q']['batch_size'])
                    self.updateValueFunction(transition_batch, 'q')
            if self._vf['s']['training']:
                if len(self.transition_buffer) >= self._vf['s']['batch_size']:
                    transition_batch = self.getTransitionFromBuffer(
                        n=self._vf['s']['batch_size'])
                    self.updateValueFunction(transition_batch, 's')

            # train/plan with model
            self.trainModel()
            self.plan()

            self.updateStateRepresentation()

            self.prev_state = self.getStateRepresentation(observation)
            self.prev_action = self.action  # alternatively, self.policy(self.state) could be called again here

            action = self.action_list[self.prev_action.item()]
        else:
            action = MCTSAgent.step(self, reward, observation)

        return action
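
The torch.no_grad() block in Example #2 tracks the one-step TD error r + gamma * max_a Q(s', a) - Q(s, a). A minimal sketch of that computation with a toy Q-network (the network shape, discount factor, and tensor values here are illustrative, not from the example):

import torch
import torch.nn as nn

q_net = nn.Linear(4, 2)            # toy Q-network: 4-dim state, 2 actions
gamma = 0.99                       # illustrative discount factor

prev_state = torch.randn(1, 4)
state = torch.randn(1, 4)
prev_action = torch.tensor([[0]])  # index of the action taken in prev_state
reward = torch.tensor([1.0])

with torch.no_grad():
    # Q(s, a) for the action actually taken
    prev_q = q_net(prev_state).gather(1, prev_action).item()
    # max_a Q(s', a): max(1)[0] is the value, max(1)[1] the argmax index
    next_q = q_net(state).max(1)[0].item()
    td_error = reward.item() + gamma * next_q - prev_q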
Example #3
    def step(self, reward, observation):
        if self.episode_counter % 2 == 0:
            self.time_step += 1

            self.state = self.getStateRepresentation(observation)
            self.action = self.policy(self.state)

            # update target
            if self._target_vf['counter'] >= self._target_vf['update_rate']:
                self.setTargetValueFunction(self._vf['q'], 'q')
                # self.setTargetValueFunction(self._vf['s'], 's')

            # update value function with the buffer
            if self._vf['q']['training']:
                if len(self.transition_buffer) >= self._vf['q']['batch_size']:
                    transition_batch = self.getTransitionFromBuffer(
                        n=self._vf['q']['batch_size'])
                    self.updateValueFunction(transition_batch, 'q')
            if self._vf['s']['training']:
                if len(self.transition_buffer) >= self._vf['s']['batch_size']:
                    transition_batch = self.getTransitionFromBuffer(
                        n=self._vf['s']['batch_size'])
                    self.updateValueFunction(transition_batch, 's')

            # train/plan with model
            self.trainModel()
            self.plan()

            self.updateStateRepresentation()

            self.prev_state = self.getStateRepresentation(observation)
            self.prev_action = self.action  # alternatively, self.policy(self.state) could be called again here

            action = self.action_list[self.prev_action.item()]
        else:
            action = MCTSAgent.step(self, reward, observation)

        return action
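
Examples #2 and #3 refresh the target network whenever self._target_vf['counter'] reaches self._target_vf['update_rate']. A minimal sketch of that periodic hard-update pattern, assuming a simple counter and two copies of the network (class and parameter names here are hypothetical):

import torch.nn as nn

class TargetSync:
    """Hard-update the target network every update_rate learning steps."""
    def __init__(self, q_net, target_net, update_rate=500):  # rate is illustrative
        self.q_net, self.target_net = q_net, target_net
        self.update_rate, self.counter = update_rate, 0

    def step(self):
        self.counter += 1
        if self.counter >= self.update_rate:
            self.target_net.load_state_dict(self.q_net.state_dict())
            self.counter = 0

q_net, target_net = nn.Linear(4, 2), nn.Linear(4, 2)
sync = TargetSync(q_net, target_net)
sync.step()   # called once per learning step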
Example #4
    def step(self, reward, observation):
        # Delegate action selection entirely to the MCTS agent.
        action = MCTSAgent.step(self, reward, observation)
        return action
Example #5
    def step(self, reward, observation):
        if self.episode_counter % 2 == 0:
            self.time_step += 1

            self.state = self.getStateRepresentation(observation)
            self.action = self.policy(self.state)

            reward = torch.tensor([reward], device=self.device)

            # store the new transition in buffer
            self.updateTransitionBuffer(
                utils.transition(self.prev_state, self.prev_action, reward,
                                 self.state, self.action, False,
                                 self.time_step, 0))

            # update target
            if self._target_vf['counter'] >= self._target_vf['update_rate']:
                self.setTargetValueFunction(self._vf['q'], 'q')
                # self.setTargetValueFunction(self._vf['s'], 's')

            # update value function with the buffer
            if self._vf['q']['training']:
                if len(self.transition_buffer) >= self._vf['q']['batch_size']:
                    transition_batch = self.getTransitionFromBuffer(
                        n=self._vf['q']['batch_size'])
                    self.updateValueFunction(transition_batch, 'q')
            if self._vf['s']['training']:
                if len(self.transition_buffer) >= self._vf['s']['batch_size']:
                    transition_batch = self.getTransitionFromBuffer(
                        n=self._vf['s']['batch_size'])
                    self.updateValueFunction(transition_batch, 's')

            # train/plan with model
            self.trainModel()
            self.plan()

            self.updateStateRepresentation()

            self.prev_state = self.getStateRepresentation(observation)
            self.prev_action = self.action  # alternatively, self.policy(self.state) could be called again here

            action = self.action_list[self.prev_action.item()]
        else:
            action = MCTSAgent.step(self, reward, observation)
            # Also store the MCTS transition so the DQN value function can
            # learn off-policy from MCTS experience.
            prev_action_index = self.getActionIndex(self.mcts_prev_action)
            prev_action_torch = torch.tensor([prev_action_index],
                                             device=self.device,
                                             dtype=torch.long).view(1, 1)
            reward = torch.tensor([reward], device=self.device).float()
            state_torch = self.getStateRepresentation(observation)
            self.updateTransitionBuffer(
                utils.transition(self.mcts_prev_state, prev_action_torch,
                                 reward, state_torch, None, False,
                                 self.time_step, 0))
            # the next action is unknown for MCTS transitions, hence None above
            self.mcts_prev_state = state_torch
            self.mcts_prev_action = action

            # update target
            if self._target_vf['counter'] >= self._target_vf['update_rate']:
                self.setTargetValueFunction(self._vf['q'], 'q')
                # self.setTargetValueFunction(self._vf['s'], 's')

            # update value function with the buffer
            if self._vf['q']['training']:
                if len(self.transition_buffer) >= self._vf['q']['batch_size']:
                    transition_batch = self.getTransitionFromBuffer(
                        n=self._vf['q']['batch_size'])
                    self.updateValueFunction(transition_batch, 'q')
            if self._vf['s']['training']:
                if len(self.transition_buffer) >= self._vf['s']['batch_size']:
                    transition_batch = self.getTransitionFromBuffer(
                        n=self._vf['s']['batch_size'])
                    self.updateValueFunction(transition_batch, 's')
        return action
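
Example #5 differs from Example #3 in that the MCTS branch also writes its transitions into the shared replay buffer, so the DQN value function can learn off-policy from MCTS trajectories. A minimal sketch of that shared-buffer pattern, using an illustrative namedtuple in place of utils.transition:

from collections import namedtuple, deque

# Illustrative stand-in for utils.transition used in the examples above.
Transition = namedtuple(
    'Transition',
    ['prev_state', 'prev_action', 'reward', 'state',
     'action', 'is_terminal', 'time_step', 'error'])

buffer = deque(maxlen=10000)   # shared replay buffer (size is illustrative)

def store(prev_state, prev_action, reward, state, time_step):
    """Both the DQN branch and the MCTS branch push into the same buffer;
    the next action is unknown for MCTS transitions, hence action=None."""
    buffer.append(Transition(prev_state, prev_action, reward, state,
                             None, False, time_step, 0))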