Example no. 1
    def select_action(self, observation):
        """Uniformly sample one action per setpoint, thermostat, and blind
        actuator, and record the index of each sampled value."""
        stpt_actions = []
        therm_actions = []
        blind_actions = []
        idxs = []

        # Sample from each discrete action space; keep the index of the
        # sampled value so it can be logged alongside the observation below.
        for _ in range(self.num_sat_actions):
            rand = np.random.choice(self.stpt_action_space)
            stpt_actions.append(rand)
            idxs.append(np.where(self.stpt_action_space == rand)[0].item())
        for _ in range(self.num_therm_actions):
            rand = np.random.choice(self.therm_action_space)
            therm_actions.append(rand)
            idxs.append(np.where(self.therm_action_space == rand)[0].item())
        for _ in range(self.num_blind_actions):
            rand = np.random.choice(self.blind_action_space)
            blind_actions.append(rand)
            idxs.append(np.where(self.blind_action_space == rand)[0].item())

        # Pair each setpoint action with the value returned by augment_ma.
        sat_actions_tups = []
        for a in stpt_actions:
            action_stpt, sat_sp = augment_ma(observation, a)
            sat_actions_tups.append((action_stpt, sat_sp))

        # Record the sampled indices in the dict-like second element of the
        # observation.
        for i, idx in enumerate(idxs):
            observation[1][f"Action idx {i}"] = idx

        return sat_actions_tups, therm_actions, blind_actions, idxs
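
The random policy above samples a value with np.random.choice and then recovers its index with np.where. A self-contained sketch (the action-space values here are made up) showing that sampling the index first is equivalent and skips the equality search:

import numpy as np

stpt_action_space = np.linspace(-2.0, 2.0, 9)    # hypothetical setpoint deltas

idx = np.random.randint(len(stpt_action_space))  # sample the index directly
action = stpt_action_space[idx]                  # then look the value up

# Equivalent to the pattern used in the example:
# action = np.random.choice(stpt_action_space)
# idx = np.where(stpt_action_space == action)[0].item()
print(idx, action)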

Example no. 2
    def agent_start(self, state):
        # Query the old (behaviour) policy for an action index and a blind
        # action index; the rollout memory is passed so the policy can record
        # the sample.
        action, blind_action = self.policy_old.act(state[0], self.memory)

        self.last_action = action
        self.last_state = state
        # Map the sampled indices back to concrete action values.
        if self.discrete:
            action = self.action_space[action]
        blind_action = self.blind_action_space[blind_action]

        action_stpt, sat_sp = augment_ma(state, action)
        return action_stpt, sat_sp, blind_action
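
The sampling itself is delegated to policy_old.act(state, memory), whose implementation is not part of this listing. As a rough, hedged sketch, an act method in this style usually samples an index from a categorical head and stores the sample in the passed-in memory buffer; the layer sizes and memory attributes below are assumptions, not taken from the example:

import torch
import torch.nn as nn
from torch.distributions import Categorical

class TinyDiscretePolicy(nn.Module):
    # Illustrative stand-in for policy_old: maps a state vector to a
    # categorical distribution over action indices and records the sample in
    # a rollout memory (states/actions/logprobs fields are assumptions).
    def __init__(self, state_dim, n_actions):
        super().__init__()
        self.actor = nn.Sequential(nn.Linear(state_dim, 64), nn.Tanh(),
                                   nn.Linear(64, n_actions), nn.Softmax(dim=-1))

    def act(self, state, memory):
        probs = self.actor(torch.as_tensor(state, dtype=torch.float32))
        dist = Categorical(probs)
        action = dist.sample()
        memory.states.append(state)
        memory.actions.append(action)
        memory.logprobs.append(dist.log_prob(action))
        return action.item()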

Example no. 3
    def choose_action(self, observation):
        # Epsilon-greedy selection: exploit the Q-network with probability
        # 1 - epsilon, otherwise pick a random action from the action space.
        if np.random.random() > self.epsilon:
            state, _, _ = observation

            actions = self.q_eval.forward(state)
            action_idx = T.argmax(actions).item()
            action = self.action_space[action_idx]
        else:
            action = np.random.choice(self.action_space)

        action, sat_sp = augment_ma(observation, action)

        return action, sat_sp

Example no. 4
    def choose_action(self, observation):
        # Same epsilon-greedy rule as above: the greedy branch queries the
        # evaluation Q-network and converts the argmax index to an action.
        if np.random.random() > self.epsilon:
            state, _, _ = observation

            actions = self.q_eval.forward(state)
            action_idx = T.argmax(actions).item()
            action = self.action_space[action_idx]
        else:
            action = np.random.choice(self.action_space)

        action, sat_sp = augment_ma(observation, action)

        return action, sat_sp
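
The two choose_action examples above implement the same epsilon-greedy rule; the decay of self.epsilon happens elsewhere in those agents. A self-contained sketch of the rule with a simple linear decay (the action space, schedule constants, and q_values stand-in are all made up):

import numpy as np

rng = np.random.default_rng(0)
action_space = np.array([-1.0, 0.0, 1.0])    # hypothetical setpoint deltas
eps, eps_min, eps_dec = 1.0, 0.01, 1e-3      # hypothetical decay schedule

def q_values(state):
    # Stand-in for self.q_eval.forward(state): ignores the state and returns
    # random Q-values, just to make the sketch runnable.
    return rng.normal(size=len(action_space))

for step in range(5):
    state = rng.normal(size=4)               # dummy observation
    if rng.random() > eps:                   # exploit: greedy over Q-values
        action = action_space[np.argmax(q_values(state))]
    else:                                    # explore: uniform random action
        action = rng.choice(action_space)
    eps = max(eps_min, eps - eps_dec)        # decay epsilon after each step
    print(step, action, round(eps, 3))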

Example no. 5
    def select_action(self, observation, evaluate=False):
        """Select a continuous action vector, uniformly at random during the
        initial exploration phase, otherwise from the learned policy.

        :param observation: tuple whose first element is the state tensor
        :param evaluate: if True, use the evaluation action from policy.sample
                         (its third return value) instead of a stochastic sample
        :return: action is for the reward function;
                 sat_sp is what's used by env_step and by the model for training
        """
        actions = []
        if self.start_steps > self.total_numsteps:
            # Warm-up: sample each action dimension uniformly from its bounds.
            for i in range(self.action_space.shape[0]):
                a = np.random.uniform(self.action_space[i].min(),
                                      self.action_space[i].max())
                actions.append(a)
        else:
            state, _, _ = observation
            state = torch.FloatTensor(state.float()).to(
                self.device).unsqueeze(0)
            if not evaluate:
                action, _, _ = self.policy.sample(state)
            else:
                _, _, action = self.policy.sample(state)
            for i in range(self.action_space.shape[0]):
                a = action.detach().cpu().numpy()[0][i]
                actions.append(a)

        # Split the flat action vector into SAT-setpoint, thermostat, and
        # blind segments.
        sat_actions = actions[0:self.num_sat_actions]
        therm_actions = actions[self.num_sat_actions:self.num_therm_actions +
                                self.num_sat_actions]
        blind_actions = actions[self.num_therm_actions + self.num_sat_actions:]

        sat_actions_tups = []
        for a in sat_actions:
            action_stpt, sat_sp = augment_ma(observation, a)
            sat_actions_tups.append((action_stpt, sat_sp))
        if len(sat_actions) == 0:
            # this is hacky but makes the parsing in the main file cleaner
            sat_actions_tups.append(([], []))

        return sat_actions_tups, therm_actions, blind_actions, actions
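
The slicing in this example (and in Example no. 7 below) assumes the flat action vector is ordered as SAT-setpoint actions, then thermostat actions, then blind actions. A small stand-alone version of that split, with made-up counts and values:

import numpy as np

def split_actions(actions, num_sat, num_therm):
    # Mirrors the slicing in the example: a flat action vector is split into
    # SAT-setpoint, thermostat, and blind segments.
    sat = actions[:num_sat]
    therm = actions[num_sat:num_sat + num_therm]
    blinds = actions[num_sat + num_therm:]
    return sat, therm, blinds

# Hypothetical layout: 1 SAT action, 2 thermostat actions, the rest blinds.
flat = np.array([0.3, 21.5, 22.0, 0.8, 0.1])
print(split_actions(flat, num_sat=1, num_therm=2))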

Example no. 6
    def select_action(self, observation):
        # Epsilon-greedy over two Q-value heads: one for setpoint actions,
        # one for blind actions.
        if np.random.random() > self.epsilon:
            state, _, _ = observation

            actions_stpt, actions_blinds = self.q_eval.forward(state)

            # setpoint actions
            actions_stpt_idx = T.argmax(actions_stpt).item()
            action_stpt = self.action_space[0][actions_stpt_idx]

            # blind actions
            actions_blinds_idx = T.argmax(actions_blinds).item()
            action_blinds = self.action_space[1][actions_blinds_idx]

        else:
            action_stpt = np.random.choice(self.action_space[0])
            action_blinds = np.random.choice(self.action_space[1])

        action_stpt, sat_sp = augment_ma(observation, action_stpt)

        return action_stpt, sat_sp, action_blinds
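
This example expects self.q_eval.forward to return two tensors, one Q-vector per action head. A minimal sketch of a network with that interface (the layer sizes and dimensions are assumptions, not taken from the example):

import torch
import torch.nn as nn

class TwoHeadQNetwork(nn.Module):
    # Illustrative Q-network matching the call pattern above: forward(state)
    # returns one Q-vector for setpoint actions and one for blind actions.
    def __init__(self, state_dim, n_stpt_actions, n_blind_actions):
        super().__init__()
        self.shared = nn.Sequential(nn.Linear(state_dim, 64), nn.ReLU())
        self.stpt_head = nn.Linear(64, n_stpt_actions)
        self.blind_head = nn.Linear(64, n_blind_actions)

    def forward(self, state):
        h = self.shared(state)
        return self.stpt_head(h), self.blind_head(h)

# Usage sketch: two Q-vectors, one argmax per head.
net = TwoHeadQNetwork(state_dim=10, n_stpt_actions=9, n_blind_actions=4)
q_stpt, q_blind = net(torch.zeros(10))
print(q_stpt.argmax().item(), q_blind.argmax().item())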

Example no. 7
    def agent_start(self, state):
        # Query the old policy for one index per discrete action dimension.
        action_idx = self.policy_old.act(state[0], self.memory)

        self.last_action = action_idx[:]
        self.last_state = state
        # Map each sampled index to the corresponding value in its action space.
        actions = []
        for i, action in enumerate(action_idx):
            actions.append(self.action_space[i][action])

        # Split the flat action list into SAT-setpoint, thermostat, and blind
        # segments.
        sat_actions = actions[:self.num_sat_actions]
        therm_actions = actions[self.num_sat_actions:self.num_therm_actions +
                                self.num_sat_actions]
        blind_actions = actions[self.num_therm_actions + self.num_sat_actions:]

        sat_actions_tups = []
        for action in sat_actions:
            action_stpt, sat_sp = augment_ma(state, action)
            sat_actions_tups.append((action_stpt, sat_sp))
        if len(sat_actions) == 0:
            # this is hacky but makes the parsing in the main file cleaner
            sat_actions_tups.append(([], []))

        return sat_actions_tups, therm_actions, blind_actions
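
Here policy_old.act returns one index per discrete action dimension, and self.action_space[i][action] turns each index into a concrete value. A self-contained sketch of that mapping with hypothetical per-dimension action spaces:

import numpy as np

# One discrete action space per actuator, one sampled index per space.
# The spaces and the sampled indices below are hypothetical.
action_space = [
    np.linspace(-2.0, 2.0, 9),     # SAT setpoint deltas
    np.array([20.0, 21.0, 22.0]),  # thermostat setpoints
    np.array([0.0, 0.5, 1.0]),     # blind positions
]

action_idx = [np.random.randint(len(space)) for space in action_space]
actions = [action_space[i][idx] for i, idx in enumerate(action_idx)]
print(action_idx, actions)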