# Third-party imports used by the agents in this listing. Action, get_movement,
# get_action, bound_action, MLP, ReplayMemory, Transition, Agent, and device
# are project-level helpers that are not shown in this listing.
import copy
import math
from typing import List

import numpy as np
import torch
import torch.optim as optim
from scipy.optimize import minimize
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel


class MultiAgentPlanner():

    def __init__(self, index, reward_threshold, collision_threshold,
                 world_size, states, num_agents, collision_distance):
        self.index = index
        self.name = 'multi safe q agent'
        self.world_size = world_size
        self.states = states
        self.num_agents = num_agents
        self.rewards = []
        kernel = RBF(length_scale=world_size,
                     length_scale_bounds=[(1e1, 1e5), (1e1, 1e5),
                                          (1e1, 1e5)]) \
            + WhiteKernel(noise_level=1)
        self.reward_gp = GaussianProcessRegressor(kernel=kernel)
        self.reward_threshold = reward_threshold
        self.collision_threshold = collision_threshold
        self.collision_distance = collision_distance
        self.trajs = [[] for _ in range(num_agents)]
        self.my_states = []
        self.action_traj = []
        self.buffer = ReplayMemory(10000)
        self.gamma = 0.9
        self.beta = 1
        self.dimensions = [3, 50, 50, 7]
        self.dqn = MLP(self.dimensions).double()
        self.dqn_l = MLP(self.dimensions).double()
        self.dqn_u = MLP(self.dimensions).double()
        self.optimizer = optim.RMSprop(self.dqn.parameters())
        self.optimizer_l = optim.RMSprop(self.dqn_l.parameters())
        self.optimizer_u = optim.RMSprop(self.dqn_u.parameters())
        self.target = MLP(self.dimensions).double()
        self.target.load_state_dict(self.dqn.state_dict())
        self.target.eval()
        self.target_l = MLP(self.dimensions).double()
        self.target_l.load_state_dict(self.dqn_l.state_dict())
        self.target_l.eval()
        self.target_u = MLP(self.dimensions).double()
        self.target_u.load_state_dict(self.dqn_u.state_dict())
        self.target_u.eval()
        self.loss_fn = torch.nn.MSELoss(reduction='sum')
        self.lr = 1e-3
        self.epsilons = [0. for _ in range(num_agents)]
        self.tau_exploits = [1. for _ in range(num_agents)]
        self.tau_explores = [1. for _ in range(num_agents)]
        self.num_collisions = 0
        self.num_unsafe = 0
        self.eps = 0.1
        self.cum_rewards = 0
        self.target_usage = 0

    def choose_action(self, explore=False):
        possible_actions = copy.copy(Action.SET)
        action_next_states = []
        reward_ls = []
        reward_uncertainty = []
        no_collision_ls = [1.0 for _ in Action.SET]
        best_action = Action.STAY

        # Lower confidence bound on the reward of each candidate next state.
        for a in Action.SET:
            next_state = self._move_coordinate(self.states[self.index], a)
            reward, std = self.reward_gp.predict(np.array([next_state]),
                                                 return_std=True)
            reward = reward[0]
            std = std[0]
            action_next_states += [(a, next_state)]
            reward_ls += [reward - self.beta * std]
            reward_uncertainty += [std]

        # Estimate the probability that each candidate action avoids a
        # collision, using the upper bound on the other agents' policies.
        for action, next_state in action_next_states:
            for agent in range(self.num_agents):
                if agent == self.index:
                    continue
                cur_agent_state = self.states[agent]
                for agent_action in Action.SET:
                    possible_next_agent_state = cur_agent_state + get_movement(
                        agent_action)
                    # Skip agent moves that cannot lead to a collision.
                    if np.linalg.norm(possible_next_agent_state -
                                      next_state) >= self.collision_distance:
                        continue
                    a_prob = self._get_policy(agent, agent_action)
                    no_collision_ls[action] *= (1 - a_prob)

        # A reward-threshold filter (using _returnable) is disabled here:
        # for i, l in enumerate(reward_ls):
        #     if l <= self.reward_threshold or not self._returnable(
        #             action_next_states[i][1]):
        #         possible_actions.remove(i)

        # Remove actions whose no-collision probability falls below the
        # collision threshold.
        for i, l in enumerate(no_collision_ls):
            if l < self.collision_threshold and i in possible_actions:
                possible_actions.remove(i)
        possible_actions = list(possible_actions)

        if explore or np.random.binomial(1, self.eps) == 1:
            if len(possible_actions) > 0:
                best_action = possible_actions[np.random.choice(
                    len(possible_actions))]
        else:
            best_q_action = Action.STAY
            best_q = -math.inf
            q_values = self.target(
                torch.tensor(self.states[self.index],
                             dtype=torch.double)).tolist()
            for action in possible_actions:
                if q_values[action] > best_q:
                    best_q_action = action
                    best_q = q_values[action]
            best_action = best_q_action

        if len(possible_actions) == 0:
            # No action is certified safe: fall back to the action with the
            # highest no-collision probability.
            best_action = np.argmax(no_collision_ls)

        self.action_traj += [best_action]
        return best_action

    def update_buffer(self, reward, states):
        self.buffer.push(
            torch.tensor([self.states[self.index]], dtype=torch.double),
            torch.tensor([[self.action_traj[-1]]], dtype=torch.long),
            torch.tensor([reward], dtype=torch.double),
            torch.tensor([states[self.index]], dtype=torch.double))
        # Keep a sliding window of recent observations for the reward GP.
        if len(self.rewards) > 50:
            self.rewards.pop(0)
            self.my_states.pop(0)
        self.rewards += [reward]
        self.states = states
        for i, state in enumerate(states):
            if i == self.index:
                continue
            if np.linalg.norm(state -
                              states[self.index]) < self.collision_distance:
                self.num_collisions += 1
                break
        if reward < self.reward_threshold:
            self.num_unsafe += 1
        for i in range(self.num_agents):
            self.trajs[i] += [states[i]]
            if i == self.index:
                self.my_states += [states[i]]
        self.cum_rewards += reward

    def reset(self, states):
        self.target.load_state_dict(self.dqn.state_dict())
        self.target.eval()
        self.target_u.load_state_dict(self.dqn_u.state_dict())
        self.target_u.eval()
        self.target_l.load_state_dict(self.dqn_l.state_dict())
        self.target_l.eval()
        self.trajs = [[] for _ in range(self.num_agents)]
        self.action_traj = []
        self.states = states

    def learn_from_buffer(self):
        self.reward_gp.fit(self.my_states, self.rewards)
        self._value_func_estimate()
        for agent in range(self.num_agents):
            if agent == self.index:
                continue
            self._optimize_parameters(agent)

    def _value_func_estimate(self):
        if len(self.buffer) < 32:
            return
        self.target_usage += 1
        transitions = self.buffer.sample(32)
        batch = Transition(*zip(*transitions))
        state_batch = torch.cat(batch.state)
        reward_batch = torch.cat(batch.reward)

        # Pessimistic and optimistic reward targets from the GP posterior.
        reward_l_batch = []
        reward_u_batch = []
        for state in state_batch:
            cur_state = state.tolist()
            reward, std = self.reward_gp.predict(np.array([cur_state]),
                                                 return_std=True)
            reward_l_batch.append(
                torch.tensor([reward[0] - self.beta * std[0]],
                             dtype=torch.double))
            reward_u_batch.append(
                torch.tensor([reward[0] + self.beta * std[0]],
                             dtype=torch.double))
        reward_l_batch = torch.cat(reward_l_batch)
        reward_u_batch = torch.cat(reward_u_batch)

        action_batch = torch.cat(batch.action)
        next_state_batch = torch.cat(batch.next_state)

        # Standard DQN update on the observed reward.
        state_action_values = self.dqn(state_batch).gather(1, action_batch)
        next_state_values = self.target(next_state_batch).max(1)[0].detach()
        expected_state_action_values = (next_state_values *
                                        self.gamma) + reward_batch
        loss = self.loss_fn(
            state_action_values,
            expected_state_action_values.unsqueeze(1),
        )
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.dqn.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

        # Upper-bound Q-network trained on the optimistic reward.
        state_action_values = self.dqn_u(state_batch).gather(1, action_batch)
        next_state_values = self.target_u(next_state_batch).max(1)[0].detach()
        expected_state_action_values = (next_state_values *
                                        self.gamma) + reward_u_batch
        loss = self.loss_fn(
            state_action_values,
            expected_state_action_values.unsqueeze(1),
        )
        self.optimizer_u.zero_grad()
        loss.backward()
        for param in self.dqn_u.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer_u.step()

        # Lower-bound Q-network trained on the pessimistic reward.
        state_action_values = self.dqn_l(state_batch).gather(1, action_batch)
        next_state_values = self.target_l(next_state_batch).max(1)[0].detach()
        expected_state_action_values = (next_state_values *
                                        self.gamma) + reward_l_batch
        loss = self.loss_fn(
            state_action_values,
            expected_state_action_values.unsqueeze(1),
        )
        self.optimizer_l.zero_grad()
        loss.backward()
        for param in self.dqn_l.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer_l.step()

        # Periodically refresh the target networks.
        if self.target_usage == 10:
            self.target_usage = 0
            self.target.load_state_dict(self.dqn.state_dict())
            self.target.eval()
            self.target_u.load_state_dict(self.dqn_u.state_dict())
            self.target_u.eval()
            self.target_l.load_state_dict(self.dqn_l.state_dict())
            self.target_l.eval()

    def _move_coordinate(self, state, action):
        movement = get_movement(action)
        return movement + state

    def _get_policy(self, agent, action):
        epsilon = self.epsilons[agent]
        tau_explore = self.tau_explores[agent]
        tau_exploit = self.tau_exploits[agent]
        return self._compute_policy_upperbound(epsilon, tau_explore,
                                               tau_exploit, agent, action)

    def _returnable(self, state):
        for a in Action.SET:
            next_state = self._move_coordinate(state, a)
            reward, std = self.reward_gp.predict(np.array([next_state]),
                                                 return_std=True)
            if reward[0] - self.beta * std[0] >= self.reward_threshold:
                return True
        return False

    def _optimize_parameters(self, agent):
        traj = self.trajs[agent]
        if len(traj) > 20:
            traj = traj[len(traj) - 20:]

        def _compute_log_likelihood(parameters):
            epsilon = parameters[0]
            tau_explore = parameters[1]
            tau_exploit = parameters[2]
            # Negative log-likelihood of the observed trajectory under the
            # parameterized policy upper bound.
            likelihood = 1.0
            for step in range(1, len(traj)):
                prev_state = traj[step - 1]
                cur_state = traj[step]
                movement = np.rint(cur_state - prev_state)
                action = get_action(movement, self.world_size)
                if action == -1:
                    continue
                likelihood *= self._compute_policy_upperbound(
                    epsilon, tau_explore, tau_exploit, agent, action)
            return -np.log(likelihood)

        res = minimize(_compute_log_likelihood,
                       np.array([0.5, 1.0, 1.0]),
                       method='L-BFGS-B',
                       bounds=np.array([(1e-6, 1.0), (0.1, 10.0),
                                        (0.1, 10.0)]))
        if not np.all(np.equal(res.x, np.array([0.5, 1.0, 1.0]))):
            self.epsilons[agent] = res.x[0]
            self.tau_explores[agent] = res.x[1]
            self.tau_exploits[agent] = res.x[2]

    def _compute_policy_upperbound(self, epsilon, tau_explore, tau_exploit,
                                   agent, action):
        q = self.dqn(torch.tensor(self.states[agent],
                                  dtype=torch.double)).detach().numpy()
        q_u = self.dqn_u(torch.tensor(self.states[agent],
                                      dtype=torch.double)).detach().numpy()
        q_l = self.dqn_l(torch.tensor(self.states[agent],
                                      dtype=torch.double)).detach().numpy()
        # Optimistic softmax: the chosen action uses its upper bound while
        # the competing actions use their mean estimates.
        ofu_denom = copy.copy(q)
        ofu_denom[action] = q_u[action]
        # Boltzmann softmax: the chosen action uses its mean estimate while
        # the competing actions use their lower bounds.
        boltz_denom = copy.copy(q_l)
        boltz_denom[action] = q[action]
        explore_mean_q = np.mean(q_u / tau_explore)
        prob_ofu = np.exp(q_u[action] / tau_explore -
                          explore_mean_q) / np.sum(
                              np.exp(ofu_denom / tau_explore - explore_mean_q))
        exploit_mean_q = np.mean(q / tau_exploit)
        prob_boltz = np.exp(q[action] / tau_exploit -
                            exploit_mean_q) / np.sum(
                                np.exp(boltz_denom / tau_exploit -
                                       exploit_mean_q))
        return epsilon * prob_ofu + (1 - epsilon) * prob_boltz
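# The planner above is driven by an external simulation loop. The sketch
# below shows one plausible shape of that loop; `env`, its `reset()`/`step()`
# methods, `world_size`, and the threshold values are hypothetical stand-ins,
# not part of this codebase. Only the agent calls (`reset`, `choose_action`,
# `update_buffer`, `learn_from_buffer`) come from the class above.
def run_multi_agent_planner_sketch(env, num_agents, num_episodes=10,
                                   steps_per_episode=100):
    """Illustrative interaction loop for MultiAgentPlanner (assumed env API)."""
    states = env.reset()  # hypothetical: list of per-agent coordinates
    agent = MultiAgentPlanner(index=0,
                              reward_threshold=-1.0,   # hypothetical values
                              collision_threshold=0.5,
                              world_size=env.world_size,
                              states=states,
                              num_agents=num_agents,
                              collision_distance=1.0)
    for _ in range(num_episodes):
        states = env.reset()
        agent.reset(states)
        for _ in range(steps_per_episode):
            action = agent.choose_action(explore=False)
            # hypothetical env API: reward for this agent plus the next
            # joint state of all agents.
            reward, states = env.step(agent.index, action)
            agent.update_buffer(reward, states)
        # GP fit, Q-network updates, and opponent-parameter estimation
        # happen once per episode in this sketch.
        agent.learn_from_buffer()
    return agent.cum_rewards, agent.num_collisions, agent.num_unsafe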
class QLearningAgent():

    def __init__(self, index, world_size, states, num_agents,
                 collision_distance):
        self.index = index
        self.name = 'q agent'
        self.world_size = world_size
        self.states = states
        self.num_agents = num_agents
        self.rewards = []
        self.buffer = ReplayMemory(10000)
        self.gamma = 0.9
        self.beta = 1
        self.action_traj = []
        self.num_collisions = 0
        self.collision_distance = collision_distance
        self.dimensions = [2, 5, 5, 4]
        self.dqn = MLP(self.dimensions).double()
        self.target = MLP(self.dimensions).double()
        self.target.load_state_dict(self.dqn.state_dict())
        self.target.eval()
        self.loss_fn = torch.nn.MSELoss(reduction='sum')
        self.optimizer = optim.RMSprop(self.dqn.parameters())
        self.lr = 1e-4
        self.eps = 0.1
        self.cum_rewards = 0
        self.target_usage = 0

    def choose_action(self, explore):
        possible_actions = list(copy.copy(Action.SET))
        best_action = np.argmax(
            self.target(
                torch.tensor(self.states[self.index],
                             dtype=torch.double)).tolist())
        # Epsilon-greedy exploration.
        if explore or np.random.binomial(1, self.eps) == 1:
            best_action = possible_actions[np.random.choice(
                len(possible_actions))]
        self.action_traj.append(best_action)
        return best_action

    def update_buffer(self, reward, states):
        self.buffer.push(
            torch.tensor([self.states[self.index]], dtype=torch.double),
            torch.tensor([[self.action_traj[-1]]], dtype=torch.long),
            torch.tensor([reward], dtype=torch.double),
            torch.tensor([states[self.index]], dtype=torch.double))
        self.rewards += [reward]
        self.states = states
        for i, state in enumerate(states):
            if i == self.index:
                continue
            if np.linalg.norm(state -
                              states[self.index]) < self.collision_distance:
                self.num_collisions += 1
                break
        self.cum_rewards += reward

    def learn_from_buffer(self):
        self._value_func_estimate()

    def reset(self, states):
        self.target.load_state_dict(self.dqn.state_dict())
        self.target.eval()
        self.action_traj = []
        self.states = states
        self.rewards = []

    def _value_func_estimate(self):
        if len(self.buffer) < 32:
            return
        self.target_usage += 1
        transitions = self.buffer.sample(32)
        batch = Transition(*zip(*transitions))
        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)
        next_state_batch = torch.cat(batch.next_state)
        state_action_values = self.dqn(state_batch).gather(1, action_batch)
        next_state_values = self.target(next_state_batch).max(1)[0].detach()
        expected_state_action_values = (next_state_values *
                                        self.gamma) + reward_batch
        loss = self.loss_fn(
            state_action_values,
            expected_state_action_values.unsqueeze(1),
        )
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.dqn.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()
        # Periodically refresh the target network.
        if self.target_usage == 10:
            self.target_usage = 0
            self.target.load_state_dict(self.dqn.state_dict())
            self.target.eval()

    def _move_coordinate(self, state, action):
        movement = get_movement(action)
        return movement + state
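# The agents in this listing depend on `Transition`, `ReplayMemory`, and
# `MLP`, whose definitions are not shown. The sketch below is an assumed,
# minimal stand-in modeled on the standard PyTorch DQN pattern: the gridworld
# agents push 4-tuples (state, action, reward, next_state), while
# YahooDQNAgent below pushes a differently shaped transition, so the real
# project versions necessarily differ from this reconstruction.
from collections import namedtuple
import random

import torch.nn as nn

Transition = namedtuple('Transition',
                        ('state', 'action', 'reward', 'next_state'))


class ReplayMemory:
    """Fixed-capacity FIFO buffer of transitions (assumed implementation)."""

    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, *args):
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = Transition(*args)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)


class MLP(nn.Module):
    """Fully connected network defined by a list of layer widths (assumed)."""

    def __init__(self, dims):
        super().__init__()
        layers = []
        for i in range(len(dims) - 1):
            layers.append(nn.Linear(dims[i], dims[i + 1]))
            if i < len(dims) - 2:
                layers.append(nn.ReLU())
        self.net = nn.Sequential(*layers)

    def initialize(self):
        # The feed agents call `initialize()`; Xavier initialization is one
        # plausible choice, assumed here.
        for m in self.net:
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                nn.init.zeros_(m.bias)

    def forward(self, x):
        return self.net(x)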
class YahooDQNAgent():

    def __init__(
        self,
        initial_feed_candidates,
        user_features,
        feed_counts,
        agent_name: str,
        feed_feature_count=6,
        user_feature_count=6,
        model_dims: List[int] = [50, 25],
        lr: float = 1e-3,
        boltzmann: bool = True,
        epsilon: float = 0.05,
        batch_size: int = 128,
    ):
        self.initial_feed_candidates = initial_feed_candidates
        self.current_feed_candidates = initial_feed_candidates
        self.user_features = user_features
        self.feed_counts = feed_counts
        self.agent_name = agent_name
        self.interest_level = 0
        self.cum_rewards: float = 0.
        self.feed_feature_count = feed_feature_count
        self.user_feature_count = user_feature_count
        # History slots for each feed, plus the candidate feed and the user.
        self.num_features = (feed_counts * feed_feature_count +
                             feed_feature_count + user_feature_count)
        self.training_data: ReplayMemory = ReplayMemory(100000)
        self.model_dims: List[int] = [self.num_features] + model_dims + [1]
        self.model = MLP(self.model_dims).double()
        self.model.initialize()
        self.model.to(device)
        self.target_net = MLP(self.model_dims).double().to(device)
        self.target_net.load_state_dict(self.model.state_dict())
        self.target_net.eval()
        self.loss_fn = torch.nn.MSELoss(reduction='sum')
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.boltzmann: bool = boltzmann
        self.epsilon: float = epsilon
        self.batch_size: int = batch_size
        self.gamma = 0.99
        self.running_loss = 0.0
        self.history_actions = []
        self.latest_feature = None
        self.current_feed = 0
        self.cum_reward_history: List[float] = []

    def choose_action(self):
        available_actions = [
            candidate.features for candidate in self.current_feed_candidates
        ]
        features = [-1. for _ in range(self.num_features)]
        for index, action in enumerate(self.history_actions):
            features[index * self.feed_feature_count:(index + 1) *
                     self.feed_feature_count] = action
        features[-self.user_feature_count:] = self.user_features
        candidate_features = []
        for f in available_actions:
            candidate_feature = np.copy(features)
            candidate_feature[self.feed_counts *
                              self.feed_feature_count:(self.feed_counts + 1) *
                              self.feed_feature_count] = f
            candidate_features.append(candidate_feature)
        candidate_features = np.array(candidate_features)
        with torch.no_grad():
            outcomes = self.model(
                torch.tensor(candidate_features,
                             dtype=torch.double).to(device))
        _, best_index = torch.max(outcomes, 0)
        best_index = best_index.item()
        if self.boltzmann:
            # Boltzmann exploration over the candidate Q-values.
            outcomes = outcomes / 0.05
            best_index = np.random.choice(
                len(available_actions),
                p=torch.nn.functional.softmax(outcomes.reshape(
                    (len(available_actions))), dim=0).cpu().numpy())
        elif np.random.rand() < 0.05:
            best_index = np.random.choice(len(available_actions))
        best_action = self.current_feed_candidates[best_index]
        self.latest_feature = candidate_features[best_index]
        self.history_actions.append(best_action.features)
        self.current_feed += 1
        return best_action

    def update_buffer(self, scroll: bool, reward: int, new_batch):
        self.cum_rewards += reward
        self.current_feed_candidates = new_batch
        if not scroll:
            # Terminal transition: the user stopped scrolling.
            self.training_data.push(
                torch.tensor([self.latest_feature],
                             dtype=torch.double).to(device),
                torch.tensor([reward], dtype=torch.double).to(device),
                None,
            )
            return
        available_actions = [
            candidate.features for candidate in self.current_feed_candidates
        ]
        features: List[float] = [-1. for _ in range(self.num_features)]
        for index, action in enumerate(self.history_actions):
            features[index * self.feed_feature_count:(index + 1) *
                     self.feed_feature_count] = action
        features[-self.user_feature_count:] = self.user_features
        candidate_features = []
        for f in available_actions:
            candidate_feature = np.copy(features)
            candidate_feature[self.feed_counts *
                              self.feed_feature_count:(self.feed_counts + 1) *
                              self.feed_feature_count] = f
            candidate_features.append(candidate_feature)
        candidate_features = np.array(candidate_features)
        self.training_data.push(
            torch.tensor([self.latest_feature],
                         dtype=torch.double).to(device),
            torch.tensor([reward], dtype=torch.double).to(device),
            torch.tensor([candidate_features],
                         dtype=torch.double).to(device),
        )

    def learn_from_buffer(self):
        if len(self.training_data) < self.batch_size:
            return
        loss_ensemble = 0.
        for _ in range(10):
            transitions = self.training_data.sample(self.batch_size)
            batch = Transition(*zip(*transitions))
            non_final_mask = torch.tensor(tuple(
                map(lambda s: s is not None, batch.next_state)),
                                          device=device,
                                          dtype=torch.bool)
            state_batch = torch.cat(batch.state)
            reward_batch = torch.cat(batch.reward)
            state_action_values = self.model(state_batch)
            next_state_values = torch.zeros(self.batch_size,
                                            device=device,
                                            dtype=torch.double)
            non_final_next_states = [
                s for s in batch.next_state if s is not None
            ]
            if len(non_final_next_states) > 0:
                non_final_next_states = torch.cat(non_final_next_states)
                next_state_values[non_final_mask] = self.target_net(
                    non_final_next_states).max(1)[0].reshape((-1)).detach()
            expected_state_action_values = (self.gamma * next_state_values +
                                            reward_batch)
            loss = self.loss_fn(state_action_values,
                                expected_state_action_values.unsqueeze(1))
            loss_ensemble += loss.item()
            self.optimizer.zero_grad()
            loss.backward()
            for param in self.model.parameters():
                param.grad.data.clamp_(-1, 1)
            self.optimizer.step()
        self.running_loss = 0.8 * self.running_loss + 0.2 * loss_ensemble
        self.epsilon = 0.999 * self.epsilon

    def reset(self, user_features, initial_feeds, user_embedding):
        # Record the completed episode's return before clearing it.
        self.cum_reward_history.append(self.cum_rewards)
        self.cum_rewards = 0.
        self.interest_level = 0.
        self.latest_feature = None
        self.current_feed_candidates = initial_feeds
        self.target_net.load_state_dict(self.model.state_dict())
        self.target_net.double()
        self.target_net.eval()
        self.target_net.to(device)
        self.history_actions = []
        self.current_feed = 0
        self.user_features = user_features
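# A sketch of how YahooDQNAgent is expected to be driven. The simulator API
# (`sim.reset()`, `sim.step(candidate)`), the candidate objects exposing a
# `.features` vector, and the constants are hypothetical; only the agent
# calls come from the class above.
def run_yahoo_agent_sketch(sim, num_users=100):
    user_features, candidates = sim.reset()   # hypothetical simulator API
    agent = YahooDQNAgent(initial_feed_candidates=candidates,
                          user_features=user_features,
                          feed_counts=10,     # hypothetical feed length
                          agent_name='yahoo dqn')
    for _ in range(num_users):
        scroll = True
        while scroll:
            candidate = agent.choose_action()
            # hypothetical: the simulator returns whether the user keeps
            # scrolling, the engagement reward, and the next candidate batch.
            scroll, reward, next_candidates = sim.step(candidate)
            agent.update_buffer(scroll, reward, next_candidates)
        agent.learn_from_buffer()
        user_features, candidates = sim.reset()
        agent.reset(user_features, candidates, user_embedding=None)
    return agent.cum_reward_history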
class DQNAgent(Agent):

    def __init__(
        self,
        feed_units: List[int],
        agent_name: str,
        model_dims: List[int] = [],
        lr: float = 1e-3,
        boltzmann: bool = False,
        epsilon: float = 0.05,
        batch_size: int = 128,
    ):
        self.feed_units = copy.deepcopy(feed_units)
        self.agent_name = agent_name
        self.interest_level = 0
        self.cum_rewards: float = 0.
        self.num_features: int = len(feed_units)
        self.training_data: ReplayMemory = ReplayMemory(100000)
        self.model_dims: List[int] = [self.num_features] + model_dims + [2]
        self.model = MLP(self.model_dims).double()
        self.model.initialize()
        self.model.to(device)
        self.target_net = MLP(self.model_dims).double().to(device)
        self.target_net.load_state_dict(self.model.state_dict())
        self.target_net.eval()
        self.loss_fn = torch.nn.MSELoss(reduction='sum')
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.boltzmann: bool = boltzmann
        self.epsilon: float = epsilon
        self.batch_size: int = batch_size
        self.gamma = 0.99
        self.running_loss = 0.0
        self.history_unit_indices: List[int] = []
        self.latest_feature = None
        self.latest_action = None
        self.current_feed = 0
        self.cum_reward_history: List[float] = []
        self.current_loc = [0, 0]

    def choose_action(self):
        available_actions = [0, 1]
        # Feature encoding: -1 for unseen units, 0 for units already passed,
        # and 1 for units the agent chose to show.
        features: List[float] = [-1. for _ in range(self.num_features)]
        for index in range(self.current_feed):
            features[index] = 0.
        for index in self.history_unit_indices:
            features[index] = 1.
        with torch.no_grad():
            outcomes = self.model(torch.tensor(features, dtype=torch.double))
        _, best_index = torch.max(outcomes, 0)
        best_index = best_index.item()
        best_action = [available_actions[best_index]]
        # Epsilon-greedy exploration; record the action actually taken so the
        # replay buffer stays consistent with the environment.
        if np.random.rand() < self.epsilon:
            best_action = [np.random.randint(2)]
        self.latest_feature = features
        self.latest_action = best_action
        if best_action[0] == 1:
            self.history_unit_indices.append(self.current_feed)
        self.current_feed += 1
        return best_action[0]

    def update_buffer(self, scroll: bool, reward: int):
        self.cum_rewards += reward
        if not scroll:
            # Terminal transition: the user stopped scrolling.
            self.training_data.push(
                torch.tensor([self.latest_feature], dtype=torch.double),
                torch.tensor([self.latest_action], dtype=torch.long),
                torch.tensor([reward], dtype=torch.double),
                None,
            )
            return
        features: List[float] = [-1. for _ in range(self.num_features)]
        for index in range(self.current_feed):
            features[index] = 0.
        for index in self.history_unit_indices:
            features[index] = 1.
        self.training_data.push(
            torch.tensor([self.latest_feature], dtype=torch.double),
            torch.tensor([self.latest_action], dtype=torch.long),
            torch.tensor([reward], dtype=torch.double),
            torch.tensor([features], dtype=torch.double),
        )

    def learn_from_buffer(self):
        if len(self.training_data) < self.batch_size:
            return
        try:
            loss_ensemble = 0.
            for _ in range(10):
                transitions = self.training_data.sample(self.batch_size)
                batch = Transition(*zip(*transitions))
                non_final_mask = torch.tensor(tuple(
                    map(lambda s: s is not None, batch.next_state)),
                                              device=device,
                                              dtype=torch.bool)
                non_final_next_states = torch.cat(
                    [s for s in batch.next_state if s is not None])
                state_batch = torch.cat(batch.state)
                action_batch = torch.cat(batch.action)
                reward_batch = torch.cat(batch.reward)
                state_action_values = self.model(state_batch).gather(
                    1, action_batch)
                next_state_values = torch.zeros(self.batch_size,
                                                device=device,
                                                dtype=torch.double)
                next_state_values[non_final_mask] = self.target_net(
                    non_final_next_states).max(1)[0].detach()
                expected_state_action_values = (self.gamma *
                                                next_state_values +
                                                reward_batch)
                loss = self.loss_fn(
                    state_action_values,
                    expected_state_action_values.unsqueeze(1))
                loss_ensemble += loss.item()
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
            self.running_loss = 0.8 * self.running_loss + 0.2 * loss_ensemble
            self.epsilon = 0.999 * self.epsilon
        except RuntimeError:
            # torch.cat fails when every sampled transition is terminal.
            print('{}: no non-terminal state'.format(self.agent_name))

    def reset(self):
        # Record the completed episode's return before clearing it.
        self.cum_reward_history.append(self.cum_rewards)
        self.cum_rewards = 0.
        self.interest_level = 0.
        self.latest_feature = None
        self.latest_action = None
        self.target_net.load_state_dict(self.model.state_dict())
        self.target_net.double()
        self.target_net.eval()
        self.target_net.to(device)
        self.history_unit_indices = []
        self.current_loc = [0, 0]
        self.current_feed = 0
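# The corresponding loop for DQNAgent, which decides per feed unit whether to
# show it (action 1) or skip it (action 0). The `env` object, its
# `feed_units` attribute, and its `reset()`/`step()` methods are hypothetical
# stand-ins; the agent interface is the one defined above.
def run_feed_dqn_sketch(env, num_sessions=1000):
    agent = DQNAgent(feed_units=env.feed_units,   # hypothetical env attribute
                     agent_name='feed dqn',
                     model_dims=[50, 25])
    for _ in range(num_sessions):
        env.reset()                               # hypothetical
        scroll = True
        while scroll:
            action = agent.choose_action()
            scroll, reward = env.step(action)     # hypothetical
            agent.update_buffer(scroll, reward)
        agent.learn_from_buffer()
        agent.reset()
    return agent.cum_reward_history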
class NaiveSafeQLearningAgent():

    def __init__(self, index, world_size, states, num_agents,
                 collision_distance, collision_threshold, reward_threshold):
        self.index = index
        self.name = 'naive q agent'
        self.world_size = world_size
        self.states = states
        self.num_agents = num_agents
        self.rewards = []
        self.buffer = ReplayMemory(10000)
        self.gamma = 0.9
        self.beta = 1
        self.action_traj = []
        self.num_collisions = 0
        self.collision_distance = collision_distance
        self.collision_threshold = collision_threshold
        self.reward_threshold = reward_threshold
        self.dimensions = [2, 5, 5, 4]
        self.dqn = MLP(self.dimensions).double()
        self.target = MLP(self.dimensions).double()
        self.target.load_state_dict(self.dqn.state_dict())
        self.target.eval()
        self.loss_fn = torch.nn.MSELoss(reduction='sum')
        self.optimizer = optim.RMSprop(self.dqn.parameters())
        self.lr = 1e-4
        self.num_unsafe = 0
        self.eps = 0.1
        self.cum_rewards = 0
        self.target_usage = 0

    def choose_action(self, explore):
        possible_actions = list(copy.copy(Action.SET))
        no_collision_ls = [1.0 for _ in Action.SET]
        # Assume each other agent picks a colliding move with fixed
        # probability, so every such move multiplies the no-collision
        # probability by 0.75.
        for a in Action.SET:
            next_state = self._move_coordinate(self.states[self.index], a)
            next_state = bound_action(next_state, self.world_size[0],
                                      self.world_size[1])
            for agent in range(self.num_agents):
                if agent == self.index:
                    continue
                cur_agent_state = self.states[agent]
                # Skip agents too far away to collide in one step.
                if np.linalg.norm(cur_agent_state -
                                  next_state) > 1.0 + self.collision_distance:
                    continue
                movement = np.rint(next_state - cur_agent_state)
                action = get_action(movement, self.world_size)
                if action == -1:
                    continue
                no_collision_ls[a] *= 0.75
        for i, l in enumerate(no_collision_ls):
            if l < self.collision_threshold and i in possible_actions:
                possible_actions.remove(i)
        q = self.target(
            torch.tensor(self.states[self.index],
                         dtype=torch.double)).tolist()
        if len(possible_actions) == 0:
            # No action passes the threshold: pick the least risky one.
            best_action = np.argmax(no_collision_ls)
            self.action_traj.append(best_action)
            return best_action
        best_q = -math.inf
        best_action = Action.UP
        for action in possible_actions:
            if q[action] > best_q:
                best_q = q[action]
                best_action = action
        if explore or np.random.binomial(1, self.eps) == 1:
            best_action = possible_actions[np.random.choice(
                len(possible_actions))]
        self.action_traj.append(best_action)
        return best_action

    def update_buffer(self, reward, states):
        self.buffer.push(
            torch.tensor([self.states[self.index]], dtype=torch.double),
            torch.tensor([[self.action_traj[-1]]], dtype=torch.long),
            torch.tensor([reward], dtype=torch.double),
            torch.tensor([states[self.index]], dtype=torch.double))
        self.rewards += [reward]
        self.states = states
        if reward < self.reward_threshold:
            self.num_unsafe += 1
        for i, state in enumerate(states):
            if i == self.index:
                continue
            if np.linalg.norm(state -
                              states[self.index]) < self.collision_distance:
                self.num_collisions += 1
                break
        self.cum_rewards += reward

    def learn_from_buffer(self):
        self._value_func_estimate()

    def reset(self, states):
        self.target.load_state_dict(self.dqn.state_dict())
        self.target.eval()
        self.action_traj = []
        self.states = states
        self.rewards = []

    def _value_func_estimate(self):
        if len(self.buffer) < 32:
            return
        self.target_usage += 1
        transitions = self.buffer.sample(32)
        batch = Transition(*zip(*transitions))
        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)
        next_state_batch = torch.cat(batch.next_state)
        state_action_values = self.dqn(state_batch).gather(1, action_batch)
        next_state_values = self.target(next_state_batch).max(1)[0].detach()
        expected_state_action_values = (next_state_values *
                                        self.gamma) + reward_batch
        loss = self.loss_fn(
            state_action_values,
            expected_state_action_values.unsqueeze(1),
        )
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.dqn.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()
        # Periodically refresh the target network.
        if self.target_usage == 10:
            self.target_usage = 0
            self.target.load_state_dict(self.dqn.state_dict())
            self.target.eval()

    def _move_coordinate(self, state, action):
        movement = get_movement(action)
        return movement + state
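# The gridworld agents above also depend on `Action`, `get_movement`,
# `get_action`, and `bound_action`, none of which are defined in this
# listing. The sketch below is an illustrative 2D, 4-action reconstruction
# (plus a STAY constant) consistent with how those helpers are called; the
# project's real versions differ at least in the action count (the
# multi-agent planner's network has 7 outputs), so treat this only as a
# reading aid.
class Action:
    UP, DOWN, LEFT, RIGHT, STAY = 0, 1, 2, 3, 4
    SET = [UP, DOWN, LEFT, RIGHT]


_MOVEMENTS = {
    Action.UP: np.array([0, 1]),
    Action.DOWN: np.array([0, -1]),
    Action.LEFT: np.array([-1, 0]),
    Action.RIGHT: np.array([1, 0]),
    Action.STAY: np.array([0, 0]),
}


def get_movement(action):
    """Unit displacement associated with an action."""
    return _MOVEMENTS[action]


def get_action(movement, world_size):
    """Inverse of get_movement; returns -1 when the displacement does not
    correspond to a single-step move (e.g. after clipping at a boundary)."""
    for action, delta in _MOVEMENTS.items():
        if np.array_equal(np.rint(movement), delta):
            return action
    return -1


def bound_action(state, width, height):
    """Clip a coordinate to the world boundaries."""
    return np.clip(state, [0, 0], [width - 1, height - 1])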