# These examples assume module-level imports of copy, math, numpy as np, and
# torch, plus grid-world helpers such as bound_action, get_action, and
# get_movement (a sketch of those helpers follows the first example).
    def choose_action(self, explore):
        possible_actions = list(copy.copy(Action.SET))
        no_collision_ls = [1.0 for _ in Action.SET]

        for a in Action.SET:
            next_state = self._move_coordinate(self.states[self.index], a)
            next_state = bound_action(next_state, self.world_size[0],
                                      self.world_size[1])

            for agent in range(self.num_agents):
                if agent == self.index:
                    continue

                cur_agent_state = self.states[agent]
                if np.linalg.norm(cur_agent_state -
                                  next_state) > 1.0 + self.collision_distance:
                    continue

                movement = np.rint(next_state - cur_agent_state)
                action = get_action(movement, self.world_size)
                if action == -1:
                    continue

                state_key = (int(cur_agent_state[0]), int(cur_agent_state[1]))
                if state_key in self.action_counts[agent]:
                    # Empirical probability that this agent steps into the
                    # candidate cell, from its observed action counts.
                    counts = self.action_counts[agent][state_key]
                    no_collision_ls[a] *= 1 - counts[action] / np.sum(counts)
                else:
                    # No data for this state yet: fall back to a fixed prior.
                    no_collision_ls[a] *= 0.75

        for i, l in enumerate(no_collision_ls):
            if l < self.collision_threshold and i in possible_actions:
                possible_actions.remove(i)

        q = self.target(
            torch.tensor(self.states[self.index],
                         dtype=torch.double)).tolist()

        if len(possible_actions) == 0:
            best_action = np.argmax(no_collision_ls)
            self.action_traj.append(best_action)
            return best_action

        best_q = -math.inf
        best_action = Action.UP
        for action in possible_actions:
            if q[action] > best_q:
                best_q = q[action]
                best_action = action

        if explore or np.random.binomial(1, self.eps) == 1:
            best_action = possible_actions[np.random.choice(
                len(possible_actions))]

        self.action_traj.append(best_action)
        return best_action
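
The method above (and the examples below) relies on a few grid-world helpers that are not part of the listing: Action.SET, get_movement, get_action, and bound_action. The following is a minimal sketch of what they could look like, inferred purely from how they are called; the integer action encoding, the axis convention, and SET being a set are assumptions (the -1 sentinel matches its use above).

import numpy as np


class Action:
    # Assumed encoding: the four grid moves, indexed 0..3.  SET is assumed to
    # be a set because the examples call .remove() on copies of it.
    UP, DOWN, LEFT, RIGHT = 0, 1, 2, 3
    SET = {UP, DOWN, LEFT, RIGHT}


# Assumed unit displacement of each action.
_MOVEMENTS = {
    Action.UP: np.array([0.0, 1.0]),
    Action.DOWN: np.array([0.0, -1.0]),
    Action.LEFT: np.array([-1.0, 0.0]),
    Action.RIGHT: np.array([1.0, 0.0]),
}


def get_movement(action):
    """Return the (dx, dy) displacement for an action index."""
    return _MOVEMENTS[action]


def get_action(movement, world_size):
    """Inverse of get_movement: map a rounded displacement back to an action
    index, or -1 if it is not a single-step move (world_size unused here)."""
    for action, delta in _MOVEMENTS.items():
        if np.array_equal(np.rint(movement), delta):
            return action
    return -1


def bound_action(state, x_max, y_max, interval=None):
    """Clip a state to the rectangle [0, x_max] x [0, y_max]; if interval is
    given, also snap the result onto that grid spacing."""
    state = np.clip(np.asarray(state, dtype=float), 0.0, [x_max, y_max])
    if interval:
        state = np.rint(state / interval) * interval
    return state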
Example 2
    def step(self, actions):
        assert len(actions) == self.num_agents, "number of agents mismatch"
        mu, sigma = 0, 0.1
        for agent, action in actions:
            self.states[agent] = self.states[agent] \
                + get_movement(action) \
                + np.random.normal(mu, sigma, 2)
            self.states[agent] = bound_action(self.states[agent],
                                              self.world_shape[0],
                                              self.world_shape[1])
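
The unpacking "for agent, action in actions" suggests that step expects an iterable of (agent_index, action) pairs. A minimal usage sketch under that assumption; the environment class name and constructor arguments are hypothetical.

# Hypothetical environment instance with two agents.
env = MultiAgentGridWorld(num_agents=2, world_shape=(10.0, 10.0))
env.step([(0, Action.UP), (1, Action.LEFT)])  # one noisy move per agent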
Example 3
    def _returnable(self, state):
        for a in Action.SET:
            next_state = self._move_coordinate(state, a)
            next_state = bound_action(next_state, self.world_size[0], self.world_size[1])
            reward, std = self.reward_gp.predict(np.array([next_state]),
                                                 return_std=True)
            reward = reward[0]
            std = std[0]
            if reward - self.beta * std > self.reward_threshold:
                return True

        return False
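
Here reward_gp is used like a scikit-learn Gaussian process regressor, and _returnable asks whether any neighbouring state has a lower confidence bound (mean minus beta times the standard deviation) above reward_threshold. A minimal sketch of how such a GP could be fitted and queried; the kernel and the training data are illustrative assumptions.

import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF

# Fit a GP reward model on a few observed (state, reward) samples (made up).
states = np.array([[0.0, 0.0], [1.0, 2.0], [3.0, 1.0]])
rewards = np.array([0.2, 0.8, 0.5])
reward_gp = GaussianProcessRegressor(kernel=RBF(length_scale=1.0))
reward_gp.fit(states, rewards)

# Lower confidence bound of the kind used by _returnable and choose_action.
beta = 2.0
mean, std = reward_gp.predict(np.array([[2.0, 2.0]]), return_std=True)
lcb = mean[0] - beta * std[0]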
Example 4
    def choose_action(self, explore=True):
        possible_actions = copy.copy(Action.SET)
        action_next_states = []
        reward_ls = []
        reward_uncertainty = []
        no_collision_ls = [1.0 for _ in Action.SET]

        for a in Action.SET:
            next_state = self._move_coordinate(self.states[self.index], a)
            next_state = bound_action(next_state, self.world_size[0],
                                      self.world_size[1])
            reward, std = self.reward_gp.predict(np.array([next_state]),
                                                 return_std=True)
            reward = reward[0]
            std = std[0]
            action_next_states += [(a, next_state)]
            reward_ls += [reward - self.beta * std]
            reward_uncertainty += [std]

        for action, next_state in action_next_states:
            for agent in range(self.num_agents):
                if agent == self.index:
                    continue

                cur_agent_state = self.states[agent]
                if np.linalg.norm(cur_agent_state -
                                  next_state) > 1.0 + self.collision_distance:
                    continue

                movement = np.rint(next_state - cur_agent_state)
                a = get_action(movement, self.world_size)
                if a == -1:
                    continue

                no_collision_ls[action] *= 0.75

        for i, l in enumerate(reward_ls):
            if l < self.reward_threshold or not self._returnable(
                    action_next_states[i][1]):
                possible_actions.remove(i)

        for i, l in enumerate(no_collision_ls):
            if l < self.collision_threshold and i in possible_actions:
                possible_actions.remove(i)

        if len(possible_actions) == 0:
            return np.argmax(reward_ls)

        # Among the remaining safe actions, explore the one whose reward
        # estimate is most uncertain.
        return max(possible_actions, key=lambda a: reward_uncertainty[a])
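
The no_collision_ls bookkeeping treats every nearby agent as an independent collision risk and multiplies the per-agent probabilities of not colliding before comparing against collision_threshold. A toy illustration of that product rule with made-up numbers:

# Estimated probability of each neighbouring agent moving into the cell.
p_collide_per_agent = [0.25, 0.10]      # illustrative values
no_collision = 1.0
for p in p_collide_per_agent:
    no_collision *= (1.0 - p)           # independence assumption
# 0.75 * 0.90 = 0.675, to be compared against the collision threshold.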
Example 5
    def choose_action(self, explore=False):
        possible_actions = copy.copy(Action.SET)
        action_next_states = []
        reward_ls = []
        reward_uncertainty = []
        no_collision_ls = [1.0 for _ in Action.SET]
        best_action = Action.UP

        for a in Action.SET:
            next_state = self._move_coordinate(self.states[self.index], a)
            next_state = bound_action(next_state, self.world_size[0], self.world_size[1])
            reward, std = self.reward_gp.predict(np.array([next_state]), return_std=True)
            reward = reward[0]
            std = std[0]
            action_next_states += [(a, next_state)]
            reward_ls += [reward - self.beta * std]
            reward_uncertainty += [std]

        for action, next_state in action_next_states:
            for agent in range(self.num_agents):
                if agent == self.index:
                    continue

                cur_agent_state = self.states[agent]
                if np.linalg.norm(cur_agent_state - next_state) > 1.0 + self.collision_distance:
                    continue

                movement = np.rint(next_state - cur_agent_state)
                a = get_action(movement, self.world_size)
                if a == -1:
                    continue

                a_prob = self._get_policy(agent, a)
                no_collision_ls[action] *= (1 - a_prob)

        for i, l in enumerate(reward_ls):
            if l < self.reward_threshold or not self._returnable(action_next_states[i][1]):
                possible_actions.remove(i)

        for i, l in enumerate(no_collision_ls):
            if l < self.collision_threshold and i in possible_actions:
                possible_actions.remove(i)

        possible_actions = list(possible_actions)
        if explore or np.random.binomial(1, self.eps) == 1:
            most_uncertain_action = Action.UP
            largest_uncertainty = -math.inf
            for action in possible_actions:
                if reward_uncertainty[action] > largest_uncertainty:
                    most_uncertain_action = action
                    largest_uncertainty = reward_uncertainty[action]

            best_action = most_uncertain_action
        else:
            best_q_action = Action.UP
            best_q = -math.inf
            q_values = self.target(
                torch.tensor(self.states[self.index], dtype=torch.double)
            ).tolist()
            for action in possible_actions:
                if q_values[action] > best_q:
                    best_q_action = action
                    best_q = q_values[action]
            best_action = best_q_action

        if len(possible_actions) == 0:
            # joint_prob = np.array(reward_ls) * np.array(no_collision_ls)
            # best_action = np.argmax(joint_prob)
            best_action = np.argmax(no_collision_ls)

        self.action_traj += [best_action]
        return best_action
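
    # Hypothetical sketch (not part of the original listing) of the
    # _get_policy helper used in choose_action above: it is assumed to return
    # the empirical probability that `agent` takes `action` from its current
    # integer-rounded state, based on the same action_counts bookkeeping seen
    # in the first example.
    def _get_policy(self, agent, action):
        state_key = (int(self.states[agent][0]), int(self.states[agent][1]))
        counts = self.action_counts[agent].get(state_key)
        if counts is None:
            return 0.25  # uniform prior over the four actions (assumption)
        return counts[action] / np.sum(counts)
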
    def _discrete_value_iteration(self):
        diff = 10000
        x_range = int(self.world_size[0] / self.discrete_interval + 1)
        y_range = int(self.world_size[1] / self.discrete_interval + 1)
        temp_vf = copy.deepcopy(self.discrete_value_function)
        temp_vf_u = copy.deepcopy(self.discrete_value_function_u)
        temp_vf_l = copy.deepcopy(self.discrete_value_function_l)

        while diff > 0.01:
            cur_diff = 0.0
            for i in range(x_range):
                for j in range(y_range):
                    # Value Iteration Update
                    x_coord = i * self.discrete_interval
                    y_coord = j * self.discrete_interval
                    state = [x_coord, y_coord]
                    cur_reward, cur_std = self.reward_gp.predict(
                        np.array([state]), return_std=True)
                    cur_reward = cur_reward[0]
                    cur_std = cur_std[0]
                    # Remember the previous estimate so the convergence check
                    # below measures the change made during this sweep.
                    old_value = temp_vf[i, j]
                    cur_value = -1000000.0

                    best_next_state = state
                    for action in Action.SET:
                        new_state = bound_action(
                            self._move_coordinate(state, action),
                            x_range * self.discrete_interval,
                            y_range * self.discrete_interval,
                            interval=self.discrete_interval,
                        )
                        new_state_x = int(new_state[0] /
                                          self.discrete_interval)
                        new_state_y = int(new_state[1] /
                                          self.discrete_interval)
                        new_value = cur_reward \
                            + self.gamma * temp_vf[new_state_x, new_state_y]

                        if new_value > cur_value:
                            best_next_state = new_state
                            temp_vf[i, j] = new_value
                            cur_value = new_value

                    # Compute new confidence bounds
                    best_next_state_x = int(best_next_state[0] /
                                            self.discrete_interval)
                    best_next_state_y = int(best_next_state[1] /
                                            self.discrete_interval)
                    next_value_var = (
                        temp_vf_u[best_next_state_x, best_next_state_y] -
                        temp_vf_l[best_next_state_x, best_next_state_y]) / 2
                    std_dev = np.sqrt((self.beta**2) * (cur_std**2) +
                                      self.gamma**2 * next_value_var)
                    temp_vf_u[i, j] = temp_vf[i, j] + std_dev
                    temp_vf_l[i, j] = temp_vf[i, j] - std_dev

                    cur_diff = max(cur_diff, abs(temp_vf[i, j] - old_value))

            diff = cur_diff

        self.discrete_value_function = temp_vf
        self.discrete_value_function_u = temp_vf_u
        self.discrete_value_function_l = temp_vf_l
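
For reference, a self-contained sketch of the same kind of tabular value iteration, using the max |V_new(s) - V_old(s)| stopping rule from the loop above on a toy reward grid with deterministic four-neighbour moves; the grid, discount, and tolerance are illustrative.

import numpy as np


def value_iteration(rewards, gamma=0.9, tol=0.01):
    """Tabular value iteration on a grid with four-neighbour moves."""
    nx, ny = rewards.shape
    values = np.zeros_like(rewards, dtype=float)
    diff = np.inf
    while diff > tol:
        diff = 0.0
        for i in range(nx):
            for j in range(ny):
                old_value = values[i, j]
                # Successor states: the in-bounds four-neighbourhood.
                neighbours = [(min(i + 1, nx - 1), j), (max(i - 1, 0), j),
                              (i, min(j + 1, ny - 1)), (i, max(j - 1, 0))]
                values[i, j] = rewards[i, j] + gamma * max(
                    values[x, y] for x, y in neighbours)
                diff = max(diff, abs(values[i, j] - old_value))
    return values


# Toy usage: a 3x3 grid with a single rewarding cell.
grid_rewards = np.zeros((3, 3))
grid_rewards[2, 2] = 1.0
print(value_iteration(grid_rewards))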