def pick_action(self, obs, episode_id: int):
    # Get one Q-value tensor per agent's q_net.
    q_vals = []
    obs_tens = obs_to_tens(obs)
    for q_net, agent_obs in zip(self.q_nets, obs_tens):
        q_agent = q_net(agent_obs)
        q_vals.append(q_agent)

    # Sample a random value per agent for the epsilon-greedy decision.
    samples = [random.random() for _ in range(self.n_agents)]
    eps_threshold = get_eps_threshold(episode_id, self.max_episodes)

    # Per agent: either arg-max over Q-values or a uniformly random action.
    actions = []
    for sample, agent_q_vals in zip(samples, q_vals):
        actions.append(
            self._pick_agent_action(sample, eps_threshold, agent_q_vals))
    return actions
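# The two helpers used above, get_eps_threshold and _pick_agent_action, are defined
# elsewhere in this notebook. As a minimal sketch only, assuming a standard
# exponentially decaying epsilon-greedy scheme (EPS_START, EPS_END, and the decay
# rate are assumed hyperparameters, not taken from the original code), they could
# look roughly like this:

import math
import random

import torch

EPS_START = 0.9   # assumed initial exploration rate
EPS_END = 0.05    # assumed final exploration rate

def get_eps_threshold(episode_id: int, max_episodes: int) -> float:
    # Decay epsilon exponentially from EPS_START towards EPS_END over training.
    return EPS_END + (EPS_START - EPS_END) * math.exp(-3.0 * episode_id / max_episodes)

def _pick_agent_action(self, sample, eps_threshold, agent_q_vals):
    # Sketch of a method on the same controller class as pick_action above.
    # Exploit: greedy action over the agent's Q-values.
    if sample > eps_threshold:
        with torch.no_grad():
            return int(agent_q_vals.argmax().item())
    # Explore: uniformly random action.
    return random.randrange(agent_q_vals.shape[-1])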
def forward(self, obs):
    """
    TODO: optimize model and forward.

    `obs` is a list of per-agent observation arrays.
    """
    assert len(obs) == self.n_agents
    obs_t = obs_to_tens(obs, self.device)

    # Per-agent Q-values from each agent's individual network.
    q_values = []
    for agent_obs, q_net in zip(obs_t, self.q_nets):
        q_vals = q_net(agent_obs)
        q_values.append(q_vals)

    # should produce [BS=1, obs_size, n_agents]
    observations = torch.stack(obs_t)
    q_values_stacked = torch.stack(q_values)
    mixed_values = self.mixer(q_values_stacked, observations)  # TODO not tested!!
    return mixed_values
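# The mixer used above is defined elsewhere in this notebook and is not shown
# here. As an illustrative sketch only, assuming a QMIX-style monotonic mixing
# network whose weights come from hypernetworks conditioned on the stacked
# observations (used as the state), it could look roughly like this; the class
# name, embed_dim, and tensor shapes are assumptions, not the original code.

import torch
import torch.nn as nn
import torch.nn.functional as F

class QMixer(nn.Module):
    def __init__(self, n_agents: int, state_dim: int, embed_dim: int = 32):
        super().__init__()
        self.n_agents = n_agents
        self.embed_dim = embed_dim
        # Hypernetworks generate state-dependent mixing weights and biases.
        self.hyper_w1 = nn.Linear(state_dim, n_agents * embed_dim)
        self.hyper_b1 = nn.Linear(state_dim, embed_dim)
        self.hyper_w2 = nn.Linear(state_dim, embed_dim)
        self.hyper_b2 = nn.Sequential(
            nn.Linear(state_dim, embed_dim), nn.ReLU(), nn.Linear(embed_dim, 1))

    def forward(self, agent_qs: torch.Tensor, state: torch.Tensor) -> torch.Tensor:
        # agent_qs: [batch, n_agents] chosen Q-values, state: [batch, state_dim].
        batch = agent_qs.shape[0]
        # Absolute values keep the mixing monotonic in each agent's Q-value.
        w1 = torch.abs(self.hyper_w1(state)).view(batch, self.n_agents, self.embed_dim)
        b1 = self.hyper_b1(state).view(batch, 1, self.embed_dim)
        hidden = F.elu(torch.bmm(agent_qs.view(batch, 1, self.n_agents), w1) + b1)
        w2 = torch.abs(self.hyper_w2(state)).view(batch, self.embed_dim, 1)
        b2 = self.hyper_b2(state).view(batch, 1, 1)
        q_tot = torch.bmm(hidden, w2) + b2
        # Joint Q-value, shape [batch, 1].
        return q_tot.view(batch, 1)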
# Commented out IPython magic to ensure Python compatibility.
# %pdb on

from matplotlib import rc
rc('animation', html='jshtml')

steps_done = 0
all_rewards = []
all_eps_thresholds = []
all_losses = []

vcg_mech = VCG(agents)

for i_episode in tqdm(range(MAX_EPISODES)):
    obs = obs_to_tens(env.reset(), device)
    for agent in agents:
        agent.reset()
    for agent, init_obs in zip(agents, obs):
        # first state
        agent.last_obs = init_obs

    ep_losses = []
    ep_rewards = []
    ep_q_values = []

    for t in count():
        q_values_per_agent = []
        actions = []
        for agent, a_obs in zip(agents, obs):
            q_values = agent.q_values(a_obs)