def choose_action(self, explore):
    possible_actions = list(copy.copy(Action.SET))
    no_collision_ls = [1.0 for _ in Action.SET]
    for a in Action.SET:
        next_state = self._move_coordinate(self.states[self.index], a)
        next_state = bound_action(next_state, self.world_size[0],
                                  self.world_size[1])
        for agent in range(self.num_agents):
            if agent == self.index:
                continue
            cur_agent_state = self.states[agent]
            if np.linalg.norm(cur_agent_state - next_state) > 1.0 + self.collision_distance:
                continue
            movement = np.rint(next_state - cur_agent_state)
            action = get_action(movement, self.world_size)
            if action == -1:
                continue
            state_key = (int(cur_agent_state[0]), int(cur_agent_state[1]))
            if state_key in self.action_counts[agent]:
                counts = self.action_counts[agent][state_key]
                # Empirical probability that the other agent takes the action
                # that would move it into next_state.
                no_collision_ls[a] *= 1 - counts[action] / np.sum(counts)
            else:
                # No counts recorded for this state: assume a uniform 0.25
                # probability of the colliding action.
                no_collision_ls[a] *= 0.75
    # Discard actions whose estimated collision-free probability is too low.
    for i, l in enumerate(no_collision_ls):
        if l < self.collision_threshold and i in possible_actions:
            possible_actions.remove(i)
    q = self.target(
        torch.tensor(self.states[self.index], dtype=torch.double)).tolist()
    if len(possible_actions) == 0:
        # Every action is risky: fall back to the least collision-prone one.
        best_action = np.argmax(no_collision_ls)
        self.action_traj.append(best_action)
        return best_action
    best_q = -math.inf
    best_action = Action.UP
    for action in possible_actions:
        if q[action] > best_q:
            best_q = q[action]
            best_action = action
    # Epsilon-greedy exploration over the remaining safe actions.
    if explore or np.random.binomial(1, self.eps) == 1:
        best_action = possible_actions[np.random.choice(len(possible_actions))]
    self.action_traj.append(best_action)
    return best_action
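# Hedged sketch of the `self.target` interface assumed by choose_action above:
# a torch module in double precision mapping a 2-D state to one Q-value per
# action. The architecture and layer sizes are illustrative placeholders, not
# the project's actual network.
import torch
import torch.nn as nn

num_actions = 4  # assuming a 4-connected grid, i.e. len(Action.SET) == 4
target = nn.Sequential(
    nn.Linear(2, 64),
    nn.ReLU(),
    nn.Linear(64, num_actions),
).double()

q_values = target(torch.tensor([3.0, 4.0], dtype=torch.double)).tolist()
assert len(q_values) == num_actions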
def step(self, actions):
    assert len(actions) == self.num_agents, "number of agents mismatch"
    mu, sigma = 0, 0.1
    for agent, action in actions:
        # Apply the chosen movement plus Gaussian transition noise,
        # then clip the new state back into the world.
        self.states[agent] = self.states[agent] \
            + get_movement(action) \
            + np.random.normal(mu, sigma, 2)
        self.states[agent] = bound_action(self.states[agent],
                                          self.world_shape[0],
                                          self.world_shape[1])
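# Hedged sketch of the helper API the functions here rely on (Action,
# get_movement, get_action, bound_action). These definitions are assumptions
# for illustration only; the real helpers live elsewhere in the code base and
# may differ in detail (e.g. wrap-around handling via world_size).
import numpy as np

class Action:
    UP, DOWN, LEFT, RIGHT = 0, 1, 2, 3
    SET = {UP, DOWN, LEFT, RIGHT}

_MOVES = {
    Action.UP: np.array([0.0, 1.0]),
    Action.DOWN: np.array([0.0, -1.0]),
    Action.LEFT: np.array([-1.0, 0.0]),
    Action.RIGHT: np.array([1.0, 0.0]),
}

def get_movement(action):
    # Unit displacement associated with a discrete action.
    return _MOVES[action]

def get_action(movement, world_size):
    # Inverse of get_movement: return -1 if the rounded displacement
    # does not correspond to a single unit step.
    for a, m in _MOVES.items():
        if np.array_equal(np.rint(movement), m):
            return a
    return -1

def bound_action(state, x_max, y_max, interval=None):
    # Clip a continuous state back into the rectangular world and, if an
    # interval is given, snap it onto the discretisation grid.
    state = np.clip(np.asarray(state, dtype=float), [0.0, 0.0], [x_max, y_max])
    if interval is not None:
        state = np.round(state / interval) * interval
    return state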
def _returnable(self, state):
    # A state is returnable if at least one successor is still safe under the
    # pessimistic (lower-confidence-bound) reward estimate.
    for a in Action.SET:
        next_state = self._move_coordinate(state, a)
        next_state = bound_action(next_state, self.world_size[0],
                                  self.world_size[1])
        reward, std = self.reward_gp.predict(np.array([next_state]),
                                             return_std=True)
        reward = reward[0]
        std = std[0]
        if reward - self.beta * std > self.reward_threshold:
            return True
    return False
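# Hedged usage sketch of the lower-confidence-bound test behind _returnable,
# assuming reward_gp is a scikit-learn GaussianProcessRegressor. The kernel,
# training data, and threshold values below are placeholders, not values taken
# from the code above.
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF

X_train = np.array([[0.0, 0.0], [1.0, 1.0], [2.0, 0.0]])
y_train = np.array([0.2, 0.8, 0.1])
reward_gp = GaussianProcessRegressor(kernel=RBF(length_scale=1.0)).fit(X_train, y_train)

beta, reward_threshold = 2.0, 0.0
mean, std = reward_gp.predict(np.array([[1.5, 0.5]]), return_std=True)
# A successor state counts as safe only if its pessimistic reward estimate
# still clears the threshold.
is_safe = mean[0] - beta * std[0] > reward_threshold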
def choose_action(self, explore=True):
    possible_actions = copy.copy(Action.SET)
    action_next_states = []
    reward_ls = []
    reward_uncertainty = []
    no_collision_ls = [1.0 for _ in Action.SET]
    for a in Action.SET:
        next_state = self._move_coordinate(self.states[self.index], a)
        next_state = bound_action(next_state, self.world_size[0],
                                  self.world_size[1])
        reward, std = self.reward_gp.predict(np.array([next_state]),
                                             return_std=True)
        reward = reward[0]
        std = std[0]
        action_next_states += [(a, next_state)]
        reward_ls += [reward - self.beta * std]
        reward_uncertainty += [std]
    for action, next_state in action_next_states:
        for agent in range(self.num_agents):
            if agent == self.index:
                continue
            cur_agent_state = self.states[agent]
            if np.linalg.norm(cur_agent_state - next_state) > 1.0 + self.collision_distance:
                continue
            movement = np.rint(next_state - cur_agent_state)
            a = get_action(movement, self.world_size)
            if a == -1:
                continue
            # Without a policy model, assume a 0.25 chance that the other
            # agent takes the colliding action.
            no_collision_ls[action] *= 0.75
    for i, l in enumerate(reward_ls):
        if l < self.reward_threshold or not self._returnable(action_next_states[i][1]):
            possible_actions.remove(i)
    for i, l in enumerate(no_collision_ls):
        if l < self.collision_threshold and i in possible_actions:
            possible_actions.remove(i)
    if len(possible_actions) == 0:
        # No action satisfies the safety constraints: fall back to the most
        # promising one under the pessimistic reward estimate.
        return np.argmax(reward_ls)
    # Pure exploration: pick the safe action with the largest reward uncertainty.
    return max(possible_actions, key=lambda a: reward_uncertainty[a])
def choose_action(self, explore=False):
    possible_actions = copy.copy(Action.SET)
    action_next_states = []
    reward_ls = []
    reward_uncertainty = []
    no_collision_ls = [1.0 for _ in Action.SET]
    for a in Action.SET:
        next_state = self._move_coordinate(self.states[self.index], a)
        next_state = bound_action(next_state, self.world_size[0],
                                  self.world_size[1])
        reward, std = self.reward_gp.predict(np.array([next_state]),
                                             return_std=True)
        reward = reward[0]
        std = std[0]
        action_next_states += [(a, next_state)]
        reward_ls += [reward - self.beta * std]
        reward_uncertainty += [std]
    for action, next_state in action_next_states:
        for agent in range(self.num_agents):
            if agent == self.index:
                continue
            cur_agent_state = self.states[agent]
            if np.linalg.norm(cur_agent_state - next_state) > 1.0 + self.collision_distance:
                continue
            movement = np.rint(next_state - cur_agent_state)
            a = get_action(movement, self.world_size)
            if a == -1:
                continue
            # Probability that the other agent takes the action that would
            # move it into next_state, estimated from its learned policy.
            a_prob = self._get_policy(agent, a)
            no_collision_ls[action] *= (1 - a_prob)
    for i, l in enumerate(reward_ls):
        if l < self.reward_threshold or not self._returnable(action_next_states[i][1]):
            possible_actions.remove(i)
    for i, l in enumerate(no_collision_ls):
        if l < self.collision_threshold and i in possible_actions:
            possible_actions.remove(i)
    possible_actions = list(possible_actions)
    if len(possible_actions) == 0:
        # No safe action remains: fall back to the least collision-prone one.
        # joint_prob = np.array(reward_ls) * np.array(no_collision_ls)
        # best_action = np.argmax(joint_prob)
        best_action = np.argmax(no_collision_ls)
        self.action_traj += [best_action]
        return best_action
    if explore or np.random.binomial(1, self.eps) == 1:
        # Exploration: pick the safe action with the largest reward uncertainty.
        most_uncertain_action = Action.UP
        largest_uncertainty = -math.inf
        for action in possible_actions:
            if reward_uncertainty[action] > largest_uncertainty:
                most_uncertain_action = action
                largest_uncertainty = reward_uncertainty[action]
        best_action = most_uncertain_action
    else:
        # Exploitation: pick the safe action with the largest Q-value.
        best_q_action = Action.UP
        best_q = -math.inf
        q_values = self.target(
            torch.tensor(self.states[self.index], dtype=torch.double)).tolist()
        for action in possible_actions:
            if q_values[action] > best_q:
                best_q_action = action
                best_q = q_values[action]
        best_action = best_q_action
    self.action_traj += [best_action]
    return best_action
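# Hedged sketch of the _get_policy helper used above, assuming it returns an
# empirical estimate of the probability that `agent` takes `action` from its
# current (discretised) state, mirroring the action-count logic in the first
# choose_action variant. The 0.25 default for unseen states is an assumption.
def _get_policy(self, agent, action, default=0.25):
    state = self.states[agent]
    state_key = (int(state[0]), int(state[1]))
    counts = self.action_counts[agent].get(state_key)
    if counts is None or np.sum(counts) == 0:
        return default
    return counts[action] / np.sum(counts)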
def _discrete_value_iteration(self):
    diff = 10000
    x_range = int(self.world_size[0] / self.discrete_interval + 1)
    y_range = int(self.world_size[1] / self.discrete_interval + 1)
    temp_vf = copy.deepcopy(self.discrete_value_function)
    temp_vf_u = copy.deepcopy(self.discrete_value_function_u)
    temp_vf_l = copy.deepcopy(self.discrete_value_function_l)
    while diff > 0.01:
        cur_diff = 0.0
        for i in range(x_range):
            for j in range(y_range):
                # Value-iteration update for the discretised state (i, j)
                x_coord = i * self.discrete_interval
                y_coord = j * self.discrete_interval
                state = [x_coord, y_coord]
                cur_reward, cur_std = self.reward_gp.predict(
                    np.array([state]), return_std=True)
                cur_reward = cur_reward[0]
                cur_std = cur_std[0]
                old_value = temp_vf[i, j]
                cur_value = -1000000.0
                best_next_state = state
                for action in Action.SET:
                    new_state = bound_action(
                        self._move_coordinate(state, action),
                        x_range * self.discrete_interval,
                        y_range * self.discrete_interval,
                        interval=self.discrete_interval,
                    )
                    new_state_x = int(new_state[0] / self.discrete_interval)
                    new_state_y = int(new_state[1] / self.discrete_interval)
                    new_value = cur_reward \
                        + self.gamma * temp_vf[new_state_x, new_state_y]
                    if new_value > cur_value:
                        best_next_state = new_state
                        cur_value = new_value
                temp_vf[i, j] = cur_value
                # Compute new confidence bounds by propagating the GP reward
                # uncertainty and the chosen successor's value interval.
                best_next_state_x = int(best_next_state[0] / self.discrete_interval)
                best_next_state_y = int(best_next_state[1] / self.discrete_interval)
                next_value_var = (
                    temp_vf_u[best_next_state_x, best_next_state_y]
                    - temp_vf_l[best_next_state_x, best_next_state_y]) / 2
                std_dev = np.sqrt((self.beta**2) * (cur_std**2)
                                  + self.gamma**2 * next_value_var)
                temp_vf_u[i, j] = temp_vf[i, j] + std_dev
                temp_vf_l[i, j] = temp_vf[i, j] - std_dev
                # Track the largest change against the value before the update.
                cur_diff = max(cur_diff, abs(temp_vf[i, j] - old_value))
        diff = cur_diff
    self.discrete_value_function = temp_vf
    self.discrete_value_function_u = temp_vf_u
    self.discrete_value_function_l = temp_vf_l
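# Hedged numerical sketch of the confidence-bound propagation used in
# _discrete_value_iteration: the value interval at a cell combines the GP
# reward uncertainty with the discounted interval of the chosen successor.
# All numbers below are illustrative placeholders.
import numpy as np

beta, gamma = 2.0, 0.9
cur_std = 0.3            # GP predictive std of the reward at the current cell
next_value_var = 0.5     # half-width of the successor cell's value interval
value = 1.2              # greedy Bellman backup stored in temp_vf[i, j]

std_dev = np.sqrt(beta ** 2 * cur_std ** 2 + gamma ** 2 * next_value_var)
upper, lower = value + std_dev, value - std_dev   # temp_vf_u / temp_vf_l entries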