# Standard-library / third-party imports used by the agents in this module.
# Project-local helpers (QModule, MLP, DQNWithPrior, ReplayMemory, Transition,
# Action, Agent, get_action, get_actions_number, get_movement, bound_action,
# device) are assumed to be defined elsewhere in the code base.
import copy
import math
import random
from typing import List

import numpy as np
import torch
import torch.nn.functional as f
import torch.optim as optim
from scipy.optimize import minimize
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel


class DQNAgent(object):

    def __init__(self, device):
        self.device = device
        self.env = None
        self.model = QModule().to(self.device)
        self.target = copy.deepcopy(self.model)
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=3e-4)
        self.BATCH_SIZE = 10
        self.REPLAY_MEMORY_SIZE = 1_000_000
        self.replay_memory = ReplayMemory(self.device,
                                          maximum_size=self.REPLAY_MEMORY_SIZE)
        self.EPS_MIN = 0.01
        self.EPS_EP = 100
        self.GAMMA = 0.99
        self.TAU = 0.005
        self.explore = False

    def set(self, env):
        self.env = env

    def memorize(self, current_state, action, reward, next_state, done):
        self.replay_memory.push(current_state, action, reward, next_state, done)

    def update_target(self):
        # Polyak (soft) update of the target network.
        with torch.no_grad():
            for model_kernel, target_kernel in zip(self.model.parameters(),
                                                   self.target.parameters()):
                target_kernel.copy_((1 - self.TAU) * target_kernel +
                                    self.TAU * model_kernel)

    def train(self):
        if len(self.replay_memory) < self.BATCH_SIZE:
            return
        samples = self.replay_memory.sample(self.BATCH_SIZE)
        current_states, actions, rewards, next_states, dones = \
            samples["state"], samples["action"], samples["reward"], \
            samples["next_state"], samples["done"]

        # Double-DQN target: the online network selects the next action,
        # the target network evaluates it.
        with torch.no_grad():
            next_action = self.model(next_states).argmax(dim=-1, keepdim=True)
            next_q = self.target(next_states).gather(1, next_action).squeeze()
            y = rewards + (1.0 - dones) * self.GAMMA * next_q

        actions = torch.LongTensor(get_actions_number(actions)).to(self.device)
        current_q = self.model(current_states).gather(
            1, actions.view(self.BATCH_SIZE, 1)).squeeze(1)

        self.optimizer.zero_grad()
        loss = f.mse_loss(current_q, y.detach())
        loss.backward()
        self.optimizer.step()
        self.update_target()

    def act(self, state, episode):
        if self.explore:
            # Epsilon decays linearly from 1.0 to EPS_MIN over EPS_EP episodes.
            exploration = max((self.EPS_MIN - 1) / self.EPS_EP * episode + 1,
                              self.EPS_MIN)
            if random.random() < exploration:
                return get_action(random.randint(0, 6))
        state = np.array([state])
        state = torch.FloatTensor(state).to(self.device)
        with torch.no_grad():
            output = self.model(state).detach().cpu().numpy()
        action = get_action(np.argmax(output))
        return action
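# --- Usage sketch (illustration only, not part of the original module) -----
# A minimal loop showing how DQNAgent's public methods fit together, assuming
# a Gym-style environment (reset()/step()) whose observations match QModule's
# input. `make_env` and the episode budget are hypothetical placeholders.
def _dqn_agent_usage_sketch(make_env, num_episodes=500):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    agent = DQNAgent(device)
    agent.explore = True                      # enable epsilon-greedy exploration
    env = make_env()
    agent.set(env)
    for episode in range(num_episodes):
        state, done = env.reset(), False
        while not done:
            action = agent.act(state, episode)
            next_state, reward, done, _ = env.step(action)
            agent.memorize(state, action, reward, next_state, done)
            agent.train()                     # one gradient step + soft target update
            state = next_state
    return agent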
class MultiAgentPlanner():

    def __init__(self, index, reward_threshold, collision_threshold,
                 world_size, states, num_agents, collision_distance):
        self.index = index
        self.name = 'multi safe q agent'
        self.world_size = world_size
        self.states = states
        self.num_agents = num_agents
        self.rewards = []
        kernel = RBF(length_scale=world_size,
                     length_scale_bounds=[(1e1, 1e5), (1e1, 1e5), (1e1, 1e5)]) \
            + WhiteKernel(noise_level=1)
        self.reward_gp = GaussianProcessRegressor(kernel=kernel)
        self.reward_threshold = reward_threshold
        self.collision_threshold = collision_threshold
        self.collision_distance = collision_distance
        self.trajs = [[] for _ in range(num_agents)]
        self.my_states = []
        self.action_traj = []
        self.buffer = ReplayMemory(10000)
        self.gamma = 0.9
        self.beta = 1
        self.dimensions = [3, 50, 50, 7]
        # Three Q-networks: point estimate plus lower/upper confidence networks.
        self.dqn = MLP(self.dimensions).double()
        self.dqn_l = MLP(self.dimensions).double()
        self.dqn_u = MLP(self.dimensions).double()
        self.optimizer = optim.RMSprop(self.dqn.parameters())
        self.optimizer_l = optim.RMSprop(self.dqn_l.parameters())
        self.optimizer_u = optim.RMSprop(self.dqn_u.parameters())
        self.target = MLP(self.dimensions).double()
        self.target.load_state_dict(self.dqn.state_dict())
        self.target.eval()
        self.target_l = MLP(self.dimensions).double()
        self.target_l.load_state_dict(self.dqn_l.state_dict())
        self.target_l.eval()
        self.target_u = MLP(self.dimensions).double()
        self.target_u.load_state_dict(self.dqn_u.state_dict())
        self.target_u.eval()
        self.loss_fn = torch.nn.MSELoss(reduction='sum')
        self.lr = 1e-3
        self.epsilons = [0. for _ in range(num_agents)]
        self.tau_exploits = [1. for _ in range(num_agents)]
        self.tau_explores = [1. for _ in range(num_agents)]
        self.num_collisions = 0
        self.num_unsafe = 0
        self.eps = 0.1
        self.cum_rewards = 0
        self.target_usage = 0

    def choose_action(self, explore=False):
        possible_actions = copy.copy(Action.SET)
        action_next_states = []
        reward_ls = []
        reward_uncertainty = []
        no_collision_ls = [1.0 for _ in Action.SET]
        best_action = Action.STAY
        # best_action = np.argmax(self.target(torch.tensor(self.states[self.index])).tolist())
        #
        # if explore or np.random.binomial(1, self.eps) == 1:
        #     best_action = possible_actions[np.random.choice(len(possible_actions))]

        # Lower confidence bound on the reward of each candidate next state.
        for a in Action.SET:
            next_state = self._move_coordinate(self.states[self.index], a)
            reward, std = self.reward_gp.predict(np.array([next_state]),
                                                 return_std=True)
            reward = reward[0]
            std = std[0]
            action_next_states += [(a, next_state)]
            reward_ls += [reward - self.beta * std]
            reward_uncertainty += [std]

        # Probability that no other agent moves into the collision radius.
        for action, next_state in action_next_states:
            for agent in range(self.num_agents):
                if agent == self.index:
                    continue
                cur_agent_state = self.states[agent]
                for agent_action in Action.SET:
                    possible_next_agent_state = cur_agent_state + get_movement(
                        agent_action)
                    # Only the other agent's moves that would end inside the
                    # collision radius lower the no-collision probability.
                    if np.linalg.norm(possible_next_agent_state -
                                      next_state) >= self.collision_distance:
                        continue
                    a_prob = self._get_policy(agent, agent_action)
                    no_collision_ls[action] *= (1 - a_prob)

        # for i, l in enumerate(reward_ls):
        #     if l <= self.reward_threshold or not self._returnable(action_next_states[i][1]):
        #         possible_actions.remove(i)
        for i, l in enumerate(no_collision_ls):
            if l < self.collision_threshold and i in possible_actions:
                possible_actions.remove(i)
        possible_actions = list(possible_actions)

        if explore or np.random.binomial(1, self.eps) == 1:
            # most_uncertain_action = Action.STAY
            # largest_uncertainty = -math.inf
            # for action in possible_actions:
            #     if reward_uncertainty[action] > largest_uncertainty:
            #         most_uncertain_action = action
            #         largest_uncertainty = reward_uncertainty[action]
            #
            # best_action = most_uncertain_action
            if len(possible_actions) > 0:
                best_action = possible_actions[np.random.choice(
                    len(possible_actions))]
        else:
            best_q_action = Action.STAY
            best_q = -math.inf
            q_values = self.target(
                torch.tensor(self.states[self.index],
                             dtype=torch.double)).tolist()
            for action in possible_actions:
                if q_values[action] > best_q:
                    best_q_action = action
                    best_q = q_values[action]
            best_action = best_q_action

        if len(possible_actions) == 0:
            # joint_prob = np.array(reward_ls) * np.array(no_collision_ls)
            # best_action = np.argmax(joint_prob)
            best_action = np.argmax(no_collision_ls)

        self.action_traj += [best_action]
        return best_action

    def update_buffer(self, reward, states):
        self.buffer.push(
            torch.tensor([self.states[self.index]], dtype=torch.double),
            torch.tensor([[self.action_traj[-1]]], dtype=torch.long),
            torch.tensor([reward], dtype=torch.double),
            torch.tensor([states[self.index]], dtype=torch.double))
        if len(self.rewards) > 50:
            self.rewards.pop(0)
            self.my_states.pop(0)
        self.rewards += [reward]
        self.states = states
        for i, state in enumerate(states):
            if i == self.index:
                continue
            if np.linalg.norm(state -
                              states[self.index]) < self.collision_distance:
                self.num_collisions += 1
                break
        if reward < self.reward_threshold:
            self.num_unsafe += 1
        for i in range(self.num_agents):
            self.trajs[i] += [states[i]]
            if i == self.index:
                self.my_states += [states[i]]
        self.cum_rewards += reward

    def reset(self, states):
        self.target.load_state_dict(self.dqn.state_dict())
        self.target.eval()
        self.target_u.load_state_dict(self.dqn_u.state_dict())
        self.target_u.eval()
        self.target_l.load_state_dict(self.dqn_l.state_dict())
        self.target_l.eval()
        self.trajs = [[] for _ in range(self.num_agents)]
        self.action_traj = []
        self.states = states
        # self.epsilons = [0. for _ in range(self.num_agents)]
        # self.tau_exploits = [1. for _ in range(self.num_agents)]
        # self.tau_explores = [1. for _ in range(self.num_agents)]
        # self.rewards = []
        # self.cum_rewards = 0

    def learn_from_buffer(self):
        self.reward_gp.fit(self.my_states, self.rewards)
        self._value_func_estimate()
        for agent in range(self.num_agents):
            if agent == self.index:
                continue
            self._optimize_parameters(agent)

    def _value_func_estimate(self):
        if len(self.buffer) < 32:
            return
        self.target_usage += 1
        transitions = self.buffer.sample(32)
        batch = Transition(*zip(*transitions))
        state_batch = torch.cat(batch.state)
        reward_batch = torch.cat(batch.reward)

        # Lower/upper reward bounds from the GP for the confidence Q-networks.
        reward_l_batch = []
        reward_u_batch = []
        for state in state_batch:
            cur_state = state.tolist()
            reward, std = self.reward_gp.predict(np.array([cur_state]),
                                                 return_std=True)
            reward_l_batch.append(
                torch.tensor([reward[0] - self.beta * std[0]],
                             dtype=torch.double))
            reward_u_batch.append(
                torch.tensor([reward[0] + self.beta * std[0]],
                             dtype=torch.double))
        reward_l_batch = torch.cat(reward_l_batch)
        reward_u_batch = torch.cat(reward_u_batch)
        action_batch = torch.cat(batch.action)
        next_state_batch = torch.cat(batch.next_state)

        state_action_values = self.dqn(state_batch).gather(1, action_batch)
        next_state_values = self.target(next_state_batch).max(1)[0].detach()
        expected_state_action_values = (next_state_values *
                                        self.gamma) + reward_batch
        loss = self.loss_fn(
            state_action_values,
            expected_state_action_values.unsqueeze(1),
        )
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.dqn.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

        state_action_values = self.dqn_u(state_batch).gather(1, action_batch)
        next_state_values = self.target_u(next_state_batch).max(1)[0].detach()
        expected_state_action_values = (next_state_values *
                                        self.gamma) + reward_u_batch
        loss = self.loss_fn(
            state_action_values,
            expected_state_action_values.unsqueeze(1),
        )
        self.optimizer_u.zero_grad()
        loss.backward()
        for param in self.dqn_u.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer_u.step()

        state_action_values = self.dqn_l(state_batch).gather(1, action_batch)
        next_state_values = self.target_l(next_state_batch).max(1)[0].detach()
        expected_state_action_values = (next_state_values *
                                        self.gamma) + reward_l_batch
        loss = self.loss_fn(
            state_action_values,
            expected_state_action_values.unsqueeze(1),
        )
        self.optimizer_l.zero_grad()
        loss.backward()
        for param in self.dqn_l.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer_l.step()

        if self.target_usage == 10:
            self.target_usage = 0
            self.target.load_state_dict(self.dqn.state_dict())
            self.target.eval()
            self.target_u.load_state_dict(self.dqn_u.state_dict())
            self.target_u.eval()
            self.target_l.load_state_dict(self.dqn_l.state_dict())
            self.target_l.eval()

    def _move_coordinate(self, state, action):
        movement = get_movement(action)
        return movement + state

    def _get_policy(self, agent, action):
        epsilon = self.epsilons[agent]
        tau_explore = self.tau_explores[agent]
        tau_exploit = self.tau_exploits[agent]
        return self._compute_policy_upperbound(epsilon, tau_explore,
                                               tau_exploit, agent, action)

    def _returnable(self, state):
        for a in Action.SET:
            next_state = self._move_coordinate(state, a)
            reward, std = self.reward_gp.predict(np.array([next_state]),
                                                 return_std=True)
            reward = reward[0]
            std = std[0]
            if reward - self.beta * std >= self.reward_threshold:
                return True
        return False

    def _optimize_parameters(self, agent):
        traj = self.trajs[agent]
        if len(traj) > 20:
            traj = traj[len(traj) - 20:]

        def _compute_log_likelihood(parameters):
            epsilon = parameters[0]
            tau_explore = parameters[1]
            tau_exploit = parameters[2]
            likelihood = 1.0
            for step in range(1, len(traj)):
                prev_state = traj[step - 1]
                cur_state = traj[step]
                movement = np.rint(cur_state - prev_state)
                action = get_action(movement, self.world_size)
                if action == -1:
                    continue
                likelihood *= self._compute_policy_upperbound(
                    epsilon, tau_explore, tau_exploit, agent, action)
            return -np.log(likelihood)

        res = minimize(_compute_log_likelihood,
                       np.array([0.5, 1.0, 1.0]),
                       method='L-BFGS-B',
                       bounds=np.array([(1e-6, 1.0), (0.1, 10.0),
                                        (0.1, 10.0)]))
        if not np.all(np.equal(res.x, np.array([0.5, 1.0, 1.0]))):
            self.epsilons[agent] = res.x[0]
            self.tau_explores[agent] = res.x[1]
            self.tau_exploits[agent] = res.x[2]

    def _compute_policy_upperbound(self, epsilon, tau_explore, tau_exploit,
                                   agent, action):
        q = self.dqn(torch.tensor(self.states[agent],
                                  dtype=torch.double)).detach().numpy()
        q_u = self.dqn_u(torch.tensor(self.states[agent],
                                      dtype=torch.double)).detach().numpy()
        q_l = self.dqn_l(torch.tensor(self.states[agent],
                                      dtype=torch.double)).detach().numpy()
        ofu_denom = copy.copy(q)
        ofu_denom[action] = q_u[action]
        boltz_denom = copy.copy(q_l)
        boltz_denom[action] = q[action]
        explore_mean_q = np.mean(q_u / tau_explore)
        prob_ofu = np.exp(q_u[action] / tau_explore - explore_mean_q) / np.sum(
            np.exp(ofu_denom / tau_explore - explore_mean_q))
        exploit_mean_q = np.mean(q / tau_exploit)
        prob_boltz = np.exp(q[action] / tau_exploit - exploit_mean_q) / np.sum(
            np.exp(boltz_denom / tau_exploit - exploit_mean_q))
        return epsilon * prob_ofu + (1 - epsilon) * prob_boltz
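# --- Usage sketch (illustration only, not part of the original module) -----
# Episode loop for the grid-world agents in this file (MultiAgentPlanner,
# QLearningAgent and NaiveSafeQLearningAgent share the same interface:
# choose_action / update_buffer / learn_from_buffer / reset). The multi-agent
# environment `env`, with reset() -> states and step(actions) ->
# (rewards, next_states), is a hypothetical stand-in.
def _planner_usage_sketch(env, agents, num_episodes=100, horizon=50):
    for episode in range(num_episodes):
        states = env.reset()
        for agent in agents:
            agent.reset(states)
        for _ in range(horizon):
            actions = [
                agent.choose_action(explore=(episode == 0)) for agent in agents
            ]
            rewards, next_states = env.step(actions)
            for agent, reward in zip(agents, rewards):
                agent.update_buffer(reward, next_states)
        for agent in agents:
            agent.learn_from_buffer()   # refit reward GP, DQN update, opponent-model fit
    return [agent.cum_rewards for agent in agents]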
class DeepExpIDSAgent(Agent):

    def __init__(self,
                 feed_units: List[int],
                 agent_name: str,
                 ensemble_size: int = 100,
                 prior_variance: float = 1.0,
                 model_dims: List[int] = [20],
                 lr: float = 1e-3,
                 batch_size: int = 128,
                 noise_variance=0):
        self.feed_units = copy.deepcopy(feed_units)
        # self.available_units = copy.deepcopy(feed_units)
        self.agent_name = agent_name
        self.cum_rewards: float = 0.
        self.interest_level = 0.
        self.num_features: int = len(feed_units) + 1
        self.noise_variance = noise_variance
        self.ensemble_size: int = ensemble_size
        self.training_data = ReplayMemory(100000)
        # self.training_datas = []
        # for i in range(self.ensemble_size):
        #     self.training_datas.append(ReplayMemory(100000))
        self.latest_feature = None
        self.latest_action = None
        self.prior_variance = prior_variance
        self.model_dims: List[int] = [self.num_features] + model_dims + [2]

        # Ensemble of Q-networks with randomized, fixed prior networks.
        priors = []
        for i in range(self.ensemble_size):
            priors.append(MLP(self.model_dims))
            priors[i].initialize()
            priors[i].double()
            priors[i].eval()
            priors[i].to(device)
        self.models: List[DQNWithPrior] = []
        for i in range(self.ensemble_size):
            self.models.append(
                DQNWithPrior(self.model_dims,
                             priors[i],
                             scale=np.sqrt(self.prior_variance)))
            self.models[i].initialize()
            self.models[i].double()
            self.models[i].to(device)
        self.target_nets: List[DQNWithPrior] = []
        for i in range(self.ensemble_size):
            self.target_nets.append(
                DQNWithPrior(self.model_dims,
                             priors[i],
                             scale=np.sqrt(self.prior_variance)))
            self.target_nets[i].load_state_dict(self.models[i].state_dict())
            self.target_nets[i].double()
            self.target_nets[i].eval()
            self.target_nets[i].to(device)

        self.loss_fn = torch.nn.MSELoss(reduction='sum')
        self.optimizers = []
        for i in range(self.ensemble_size):
            self.optimizers.append(
                optim.Adam(self.models[i].parameters(), lr=lr))
        self.cur_net = self.target_nets[np.random.choice(self.ensemble_size)]
        self.batch_size = batch_size
        self.gamma = 0.99
        self.running_loss = 0.0
        self.history_unit_indices: List[int] = []
        self.cum_reward_history: List[float] = []
        self.current_feed = 0

    def choose_action(self):
        available_actions = [0, 1]
        # Feature encoding: -1 = not yet seen, 0 = seen but skipped, 1 = shown.
        features: List[float] = [-1. for _ in range(self.num_features)]
        for index in range(self.current_feed):
            features[index] = 0.
        for index in self.history_unit_indices:
            features[index] = 1.
        with torch.no_grad():
            all_outcomes = [
                self.target_nets[model_index](torch.tensor(features,
                                                           dtype=torch.double))
                for model_index in range(self.ensemble_size)
            ]
        mean_immediate_regret = self.mean_immediate_regret(all_outcomes)
        var_immediate_regret = self.var_immediate_regret(
            all_outcomes, len(available_actions))
        best_index = self.best_ids_action(mean_immediate_regret,
                                          var_immediate_regret)
        best_action = [available_actions[best_index]]
        self.latest_feature = features
        self.latest_action = best_action
        if best_action[0] == 1:
            self.history_unit_indices.append(self.current_feed)
        self.current_feed += 1
        # print('action: {}'.format(best_action[0]))
        return best_action[0]

    def mean_immediate_regret(self, all_outcomes):
        sum_immediate_regret = None
        for model_index in range(self.ensemble_size):
            outcomes = all_outcomes[model_index]
            max_outcome, _ = torch.max(outcomes, 0)
            if sum_immediate_regret is None:
                sum_immediate_regret = max_outcome - outcomes
            else:
                sum_immediate_regret += max_outcome - outcomes
        return sum_immediate_regret / self.ensemble_size

    def var_immediate_regret(self, all_outcomes, num_actions):
        count_best_outcome = [0 for _ in range(num_actions)]
        sum_out_best = {}
        sum_out_all = None
        for model_index in range(self.ensemble_size):
            outcomes = all_outcomes[model_index]
            max_outcome, best_index = torch.max(outcomes, 0)
            # Use a plain int so the dict lookups below key on the action value.
            best_index = best_index.item()
            count_best_outcome[best_index] += 1
            if best_index in sum_out_best:
                sum_out_best[best_index] += outcomes
            else:
                sum_out_best[best_index] = outcomes
            if sum_out_all is None:
                sum_out_all = outcomes
            else:
                sum_out_all += outcomes
        var = torch.tensor([0. for _ in range(num_actions)]).double()
        for a in range(num_actions):
            if a not in sum_out_best:
                sum_out_best[a] = torch.tensor(
                    [0. for _ in range(num_actions)]).double()
            coeff = count_best_outcome[a] / self.ensemble_size
            if coeff == 0:
                continue
            sum_err = (1 / count_best_outcome[a] * sum_out_best[a][a] -
                       1 / num_actions * sum_out_all[a])**2
            var[a] = coeff * sum_err.item()
        return var

    def best_ids_action(self, mean_immediate_regret, var_immediate_regret):
        # Information-directed sampling: minimize squared regret over info gain.
        regret_sq = mean_immediate_regret**2
        info_gain = torch.log(1 + var_immediate_regret) + 1e-5
        return torch.argmin(regret_sq / info_gain)

    def update_buffer(self, scroll: bool, reward: int):
        # print('reward: {}'.format(reward))
        self.cum_rewards += reward
        if not scroll:
            # Terminal transition: the user stopped scrolling.
            self.training_data.push(
                torch.tensor([self.latest_feature], dtype=torch.double),
                torch.tensor([self.latest_action], dtype=torch.long),
                torch.tensor([reward], dtype=torch.double),
                None,
            )
            return
        features: List[float] = [-1. for _ in range(self.num_features)]
        for index in range(self.current_feed):
            features[index] = 0.
        for index in self.history_unit_indices:
            features[index] = 1.
        self.training_data.push(
            torch.tensor([self.latest_feature], dtype=torch.double),
            torch.tensor([self.latest_action], dtype=torch.long),
            torch.tensor([reward], dtype=torch.double),
            torch.tensor([features], dtype=torch.double),
        )

    def learn_from_buffer(self):
        if len(self.training_data) < self.batch_size:
            return
        loss_ensemble = 0.0
        # try:
        for _ in range(10):
            transitions = self.training_data.sample(self.batch_size)
            for i in range(self.ensemble_size):
                batch = Transition(*zip(*transitions))
                non_final_mask = torch.tensor(tuple(
                    map(lambda s: s is not None, batch.next_state)),
                                              device=device,
                                              dtype=torch.bool)
                non_final_next_states = torch.cat(
                    [s for s in batch.next_state if s is not None])
                state_batch = torch.cat(batch.state)
                action_batch = torch.cat(batch.action)
                reward_batch = torch.cat(batch.reward)
                state_action_values = self.models[i](state_batch).gather(
                    1, action_batch)
                next_state_values = torch.zeros(self.batch_size,
                                                device=device,
                                                dtype=torch.double)
                next_state_values[non_final_mask] = self.target_nets[i](
                    non_final_next_states).max(1)[0].detach()
                expected_state_action_values = self.gamma * next_state_values \
                    + reward_batch
                loss = self.loss_fn(state_action_values,
                                    expected_state_action_values.unsqueeze(1))
                loss_ensemble += loss.item()
                self.optimizers[i].zero_grad()
                loss.backward()
                # for param in self.model.parameters():
                #     param.grad.data.clamp_(-1, 1)
                self.optimizers[i].step()
        self.running_loss = 0.8 * self.running_loss + 0.2 * loss_ensemble
        # except:
        #     print('{}: no non-terminal state'.format(self.agent_name))

    def reset(self):
        # Record the episode's cumulative reward before clearing it.
        self.cum_reward_history.append(self.cum_rewards)
        self.available_units = copy.deepcopy(self.feed_units)
        self.cum_rewards = 0.
        self.interest_level = 0.
        self.latest_feature = None
        self.latest_action = None
        # self.history_unit_indices = []
        for i in range(self.ensemble_size):
            self.target_nets[i].load_state_dict(self.models[i].state_dict())
            self.target_nets[i].double()
            self.target_nets[i].eval()
            self.target_nets[i].to(device)
        self.cur_net = self.target_nets[np.random.choice(self.ensemble_size)]
        self.history_unit_indices = []
        self.current_feed = 0
        self.current_loc = [0, 0]
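# --- Usage sketch (illustration only, not part of the original module) -----
# Interaction loop for DeepExpIDSAgent, assuming a hypothetical feed simulator
# with a step(show: int) -> (scroll: bool, reward: float) interface. The agent
# decides per feed unit whether to show it (action 1) or skip it (action 0),
# then learns from the replay buffer at the end of each session.
def _ids_agent_usage_sketch(simulator, feed_units, num_sessions=50):
    agent = DeepExpIDSAgent(feed_units, agent_name='exp-ids', ensemble_size=10)
    for _ in range(num_sessions):
        scroll = True
        while scroll and agent.current_feed < len(feed_units):
            action = agent.choose_action()
            scroll, reward = simulator.step(action)
            agent.update_buffer(scroll, reward)
        agent.learn_from_buffer()
        agent.reset()
    return agent.cum_reward_history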
class QLearningAgent():

    def __init__(self, index, world_size, states, num_agents,
                 collision_distance):
        self.index = index
        self.name = 'q agent'
        self.world_size = world_size
        self.states = states
        self.num_agents = num_agents
        self.rewards = []
        self.buffer = ReplayMemory(10000)
        self.gamma = 0.9
        self.beta = 1
        self.action_traj = []
        self.num_collisions = 0
        self.collision_distance = collision_distance
        self.dimensions = [2, 5, 5, 4]
        self.dqn = MLP(self.dimensions).double()
        self.target = MLP(self.dimensions).double()
        self.target.load_state_dict(self.dqn.state_dict())
        self.target.eval()
        self.loss_fn = torch.nn.MSELoss(reduction='sum')
        self.optimizer = optim.RMSprop(self.dqn.parameters())
        self.lr = 1e-4
        self.eps = 0.1
        self.cum_rewards = 0
        self.target_usage = 0

    def choose_action(self, explore):
        possible_actions = list(copy.copy(Action.SET))
        best_action = np.argmax(
            self.target(
                torch.tensor(self.states[self.index],
                             dtype=torch.double)).tolist())
        if explore or np.random.binomial(1, self.eps) == 1:
            best_action = possible_actions[np.random.choice(
                len(possible_actions))]
        self.action_traj.append(best_action)
        return best_action

    def update_buffer(self, reward, states):
        self.buffer.push(
            torch.tensor([self.states[self.index]], dtype=torch.double),
            torch.tensor([[self.action_traj[-1]]], dtype=torch.long),
            torch.tensor([reward], dtype=torch.double),
            torch.tensor([states[self.index]], dtype=torch.double))
        self.rewards += [reward]
        self.states = states
        for i, state in enumerate(states):
            if i == self.index:
                continue
            if np.linalg.norm(state -
                              states[self.index]) < self.collision_distance:
                self.num_collisions += 1
                break
        self.cum_rewards += reward

    def learn_from_buffer(self):
        self._value_func_estimate()

    def reset(self, states):
        self.target.load_state_dict(self.dqn.state_dict())
        self.target.eval()
        self.action_traj = []
        self.states = states
        self.rewards = []

    def _value_func_estimate(self):
        if len(self.buffer) < 32:
            return
        self.target_usage += 1
        transitions = self.buffer.sample(32)
        batch = Transition(*zip(*transitions))
        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)
        next_state_batch = torch.cat(batch.next_state)
        state_action_values = self.dqn(state_batch).gather(1, action_batch)
        next_state_values = self.target(next_state_batch).max(1)[0].detach()
        expected_state_action_values = (next_state_values *
                                        self.gamma) + reward_batch
        loss = self.loss_fn(
            state_action_values,
            expected_state_action_values.unsqueeze(1),
        )
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.dqn.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()
        if self.target_usage == 10:
            self.target_usage = 0
            self.target.load_state_dict(self.dqn.state_dict())
            self.target.eval()

    def _move_coordinate(self, state, action):
        movement = get_movement(action)
        return movement + state
            print('wrong input:', action[0])

        _, reward = gameEnv.moveSnake(keyPressed)
        done = gameEnv.gameOver
        reward = torch.tensor([reward], device=device)
        if not done:
            next_state = torch.tensor([[gameEnv.mapState]],
                                      device=device,
                                      dtype=torch.float32)
        else:
            next_state = None

        # Store the transition in memory
        memory.push(state, action, next_state, reward)

        # Move to the next state
        state = next_state

        # Perform one step of the optimization (on the target network)
        optimize_model()

        if done:
            episode_durations.append(t + 1)
            episode_scores.append(gameEnv.score)
            print("Episode:", i_episode, "Duration: ", t + 1,
                  "Score: ", gameEnv.score, "\n")
            break

    # Update the target network, copying all weights and biases in DQN
    if i_episode % TARGET_UPDATE == 0:
class NaiveSafeQLearningAgent():

    def __init__(self, index, world_size, states, num_agents,
                 collision_distance, collision_threshold, reward_threshold):
        self.index = index
        self.name = 'naive q agent'
        self.world_size = world_size
        self.states = states
        self.num_agents = num_agents
        self.rewards = []
        self.buffer = ReplayMemory(10000)
        self.gamma = 0.9
        self.beta = 1
        self.action_traj = []
        self.num_collisions = 0
        self.collision_distance = collision_distance
        self.collision_threshold = collision_threshold
        self.reward_threshold = reward_threshold
        self.dimensions = [2, 5, 5, 4]
        self.dqn = MLP(self.dimensions).double()
        self.target = MLP(self.dimensions).double()
        self.target.load_state_dict(self.dqn.state_dict())
        self.target.eval()
        self.loss_fn = torch.nn.MSELoss(reduction='sum')
        self.optimizer = optim.RMSprop(self.dqn.parameters())
        self.lr = 1e-4
        self.num_unsafe = 0
        self.eps = 0.1
        self.cum_rewards = 0
        self.target_usage = 0

    def choose_action(self, explore):
        possible_actions = list(copy.copy(Action.SET))
        no_collision_ls = [1.0 for _ in Action.SET]
        for a in Action.SET:
            next_state = self._move_coordinate(self.states[self.index], a)
            next_state = bound_action(next_state, self.world_size[0],
                                      self.world_size[1])
            for agent in range(self.num_agents):
                if agent == self.index:
                    continue
                cur_agent_state = self.states[agent]
                if np.linalg.norm(cur_agent_state -
                                  next_state) > 1.0 + self.collision_distance:
                    continue
                movement = np.rint(next_state - cur_agent_state)
                action = get_action(movement, self.world_size)
                if action == -1:
                    continue
                no_collision_ls[a] *= 0.75

        for i, l in enumerate(no_collision_ls):
            if l < self.collision_threshold and i in possible_actions:
                possible_actions.remove(i)

        q = self.target(
            torch.tensor(self.states[self.index],
                         dtype=torch.double)).tolist()
        if len(possible_actions) == 0:
            best_action = np.argmax(no_collision_ls)
            self.action_traj.append(best_action)
            return best_action

        best_q = -math.inf
        best_action = Action.UP
        for action in possible_actions:
            if q[action] > best_q:
                best_q = q[action]
                best_action = action
        if explore or np.random.binomial(1, self.eps) == 1:
            best_action = possible_actions[np.random.choice(
                len(possible_actions))]
        self.action_traj.append(best_action)
        return best_action

    def update_buffer(self, reward, states):
        self.buffer.push(
            torch.tensor([self.states[self.index]], dtype=torch.double),
            torch.tensor([[self.action_traj[-1]]], dtype=torch.long),
            torch.tensor([reward], dtype=torch.double),
            torch.tensor([states[self.index]], dtype=torch.double))
        self.rewards += [reward]
        self.states = states
        if reward < self.reward_threshold:
            self.num_unsafe += 1
        for i, state in enumerate(states):
            if i == self.index:
                continue
            if np.linalg.norm(state -
                              states[self.index]) < self.collision_distance:
                self.num_collisions += 1
                break
        self.cum_rewards += reward

    def learn_from_buffer(self):
        self._value_func_estimate()

    def reset(self, states):
        self.target.load_state_dict(self.dqn.state_dict())
        self.target.eval()
        self.action_traj = []
        self.states = states
        self.rewards = []

    def _value_func_estimate(self):
        if len(self.buffer) < 32:
            return
        self.target_usage += 1
        transitions = self.buffer.sample(32)
        batch = Transition(*zip(*transitions))
        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)
        next_state_batch = torch.cat(batch.next_state)
        state_action_values = self.dqn(state_batch).gather(1, action_batch)
        next_state_values = self.target(next_state_batch).max(1)[0].detach()
        expected_state_action_values = (next_state_values *
                                        self.gamma) + reward_batch
        loss = self.loss_fn(
            state_action_values,
            expected_state_action_values.unsqueeze(1),
        )
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.dqn.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()
        if self.target_usage == 10:
            self.target_usage = 0
            self.target.load_state_dict(self.dqn.state_dict())
            self.target.eval()

    def _move_coordinate(self, state, action):
        movement = get_movement(action)
        return movement + state