class ParallelRunner:
    def __init__(self, args, logger):
        self.args = args
        self.logger = logger
        self.batch_size = self.args.batch_size_run

        # Make subprocesses for the envs
        self.parent_conns, self.worker_conns = zip(
            *[Pipe() for _ in range(self.batch_size)])
        env_fn = env_REGISTRY[self.args.env]
        self.ps = [
            Process(target=env_worker,
                    args=(worker_conn,
                          CloudpickleWrapper(
                              partial(env_fn,
                                      env_args=self.args.env_args,
                                      args=self.args))))
            for worker_conn in self.worker_conns
        ]

        for p in self.ps:
            p.daemon = True
            p.start()

        self.parent_conns[0].send(("get_env_info", None))
        self.env_info = self.parent_conns[0].recv()
        self.episode_limit = self.env_info["episode_limit"]

        self.t = 0
        self.t_env = 0

        self.train_returns = []
        self.test_returns = []
        self.train_stats = {}
        self.test_stats = {}

        self.log_train_stats_t = -100000

    def cuda(self):
        if self.args.noise_bandit:
            self.noise_distrib.cuda()

    def setup(self, scheme, groups, preprocess, mac):
        self.new_batch = partial(EpisodeBatch,
                                 scheme,
                                 groups,
                                 self.batch_size,
                                 self.episode_limit + 1,
                                 preprocess=preprocess,
                                 device=self.args.device)
        self.mac = mac
        self.scheme = scheme
        self.groups = groups
        self.preprocess = preprocess

        # Set up the noise distribution sampler
        if self.args.noise_bandit:
            if self.args.bandit_policy:
                self.noise_distrib = enza(self.args, logger=self.logger)
            else:
                self.noise_distrib = RBandit(self.args, logger=self.logger)
        else:
            self.noise_distrib = Uniform(self.args)

        self.noise_returns = {}
        self.noise_test_won = {}
        self.noise_train_won = {}

    def get_env_info(self):
        return self.env_info

    def save_replay(self):
        pass

    def close_env(self):
        for parent_conn in self.parent_conns:
            parent_conn.send(("close", None))

    def reset(self, test_mode=False):
        self.batch = self.new_batch()

        # Reset the envs
        for parent_conn in self.parent_conns:
            parent_conn.send(("reset", None))

        pre_transition_data = {"state": [], "avail_actions": [], "obs": []}
        # Get the obs, state and avail_actions back
        for parent_conn in self.parent_conns:
            data = parent_conn.recv()
            pre_transition_data["state"].append(data["state"])
            pre_transition_data["avail_actions"].append(data["avail_actions"])
            pre_transition_data["obs"].append(data["obs"])

        self.batch.update(pre_transition_data, ts=0)

        # Sample the noise at the beginning of the episode
        self.noise = self.noise_distrib.sample(self.batch['state'][:, 0],
                                               test_mode)
        self.batch.update({"noise": self.noise}, ts=0)

        self.t = 0
        self.env_steps_this_run = 0

        if "map_name" in self.args.env_args and self.args.env_args[
                "map_name"] == "2_corridors":
            if self.t_env > 5 * 1000 * 1000:
                for parent_conn in self.parent_conns:
                    parent_conn.send(("close_corridor", None))
        if "map_name" in self.args.env_args and self.args.env_args[
                "map_name"] == "bunker_vs_6m":
            if self.t_env > 3 * 1000 * 1000:
                for parent_conn in self.parent_conns:
                    parent_conn.send(("avail_bunker", None))

    def run(self, test_mode=False, test_uniform=False):
        self.reset(test_uniform)

        all_terminated = False
        episode_returns = [0 for _ in range(self.batch_size)]
        episode_lengths = [0 for _ in range(self.batch_size)]
        self.mac.init_hidden(batch_size=self.batch_size)
        terminated = [False for _ in range(self.batch_size)]
        envs_not_terminated = [
            b_idx for b_idx, termed in enumerate(terminated) if not termed
        ]
        final_env_infos = []

        while True:
            # Pass the entire batch of experiences up to now to the agents
            # Receive the actions for each agent at this timestep in a batch
            # for each un-terminated env
            actions = self.mac.select_actions(self.batch,
                                              t_ep=self.t,
                                              t_env=self.t_env,
                                              bs=envs_not_terminated,
                                              test_mode=test_mode)
            cpu_actions = actions.to("cpu").numpy()

            # Update the actions taken
            actions_chosen = {"actions": actions.unsqueeze(1)}
            self.batch.update(actions_chosen,
                              bs=envs_not_terminated,
                              ts=self.t,
                              mark_filled=False)

            # Update terminated envs after adding post_transition_data
            envs_not_terminated = [
                b_idx for b_idx, termed in enumerate(terminated) if not termed
            ]
            all_terminated = all(terminated)
            if all_terminated:
                break

            # Send actions to each env
            action_idx = 0
            for idx, parent_conn in enumerate(self.parent_conns):
                if idx in envs_not_terminated:  # We produced actions for this env
                    if not terminated[idx]:  # Only send the actions to the env if it hasn't terminated
                        parent_conn.send(("step", cpu_actions[action_idx]))
                    action_idx += 1  # actions is not a list over every env

            # Post step data we will insert for the current timestep
            post_transition_data = {"reward": [], "terminated": []}
            # Data for the next step we will insert in order to select an action
            pre_transition_data = {"state": [], "avail_actions": [], "obs": []}

            # Receive data back for each unterminated env
            for idx, parent_conn in enumerate(self.parent_conns):
                if not terminated[idx]:
                    data = parent_conn.recv()
                    # Remaining data for this current timestep
                    post_transition_data["reward"].append((data["reward"], ))

                    episode_returns[idx] += data["reward"]
                    episode_lengths[idx] += 1
                    if not test_mode:
                        self.env_steps_this_run += 1

                    env_terminated = False
                    if data["terminated"]:
                        final_env_infos.append(data["info"])
                    if data["terminated"] and not data["info"].get(
                            "episode_limit", False):
                        env_terminated = True
                    terminated[idx] = data["terminated"]
                    post_transition_data["terminated"].append(
                        (env_terminated, ))

                    # Data for the next timestep needed to select an action
                    pre_transition_data["state"].append(data["state"])
                    pre_transition_data["avail_actions"].append(
                        data["avail_actions"])
                    pre_transition_data["obs"].append(data["obs"])

            # Add post_transition data into the batch
            self.batch.update(post_transition_data,
                              bs=envs_not_terminated,
                              ts=self.t,
                              mark_filled=False)

            # Move onto the next timestep
            self.t += 1

            # Add the pre-transition data
            self.batch.update(pre_transition_data,
                              bs=envs_not_terminated,
                              ts=self.t,
                              mark_filled=True)

        if not test_mode:
            self.t_env += self.env_steps_this_run

        # Get stats back for each env
        for parent_conn in self.parent_conns:
            parent_conn.send(("get_stats", None))

        env_stats = []
        for parent_conn in self.parent_conns:
            env_stat = parent_conn.recv()
            env_stats.append(env_stat)

        cur_stats = self.test_stats if test_mode else self.train_stats
        cur_returns = self.test_returns if test_mode else self.train_returns
        log_prefix = "test_" if test_mode else ""
        if test_uniform:
            log_prefix += "uni_"
        infos = [cur_stats] + final_env_infos
        cur_stats.update({
            k: sum(d.get(k, 0) for d in infos)
            for k in set.union(*[set(d) for d in infos])
        })
        cur_stats["n_episodes"] = self.batch_size + cur_stats.get(
            "n_episodes", 0)
        cur_stats["ep_length"] = sum(episode_lengths) + cur_stats.get(
            "ep_length", 0)

        cur_returns.extend(episode_returns)

        self._update_noise_returns(episode_returns, self.noise,
                                   final_env_infos, test_mode)
        self.noise_distrib.update_returns(self.batch['state'][:, 0],
                                          self.noise, episode_returns,
                                          test_mode, self.t_env)

        n_test_runs = max(
            1, self.args.test_nepisode // self.batch_size) * self.batch_size
        if test_mode and (len(self.test_returns) == n_test_runs):
            self._log_noise_returns(test_mode, test_uniform)
            self._log(cur_returns, cur_stats, log_prefix)
        elif self.t_env - self.log_train_stats_t >= self.args.runner_log_interval:
            self._log_noise_returns(test_mode, test_uniform)
            self._log(cur_returns, cur_stats, log_prefix)
            if hasattr(self.mac.action_selector, "epsilon"):
                self.logger.log_stat("epsilon",
                                     self.mac.action_selector.epsilon,
                                     self.t_env)
            self.log_train_stats_t = self.t_env

        return self.batch

    def _log(self, returns, stats, prefix):
        self.logger.log_stat(prefix + "return_mean", np.mean(returns),
                             self.t_env)
        self.logger.log_stat(prefix + "return_std", np.std(returns),
                             self.t_env)
        returns.clear()

        for k, v in stats.items():
            if k != "n_episodes":
                self.logger.log_stat(prefix + k + "_mean",
                                     v / stats["n_episodes"], self.t_env)
        stats.clear()

    def _update_noise_returns(self, returns, noise, stats, test_mode):
        for n, r in zip(noise, returns):
            n = int(np.argmax(n))
            if n in self.noise_returns:
                self.noise_returns[n].append(r)
            else:
                self.noise_returns[n] = [r]
        if test_mode:
            noise_won = self.noise_test_won
        else:
            noise_won = self.noise_train_won
        if stats != [] and "battle_won" in stats[0]:
            for n, info in zip(noise, stats):
                if "battle_won" not in info:
                    continue
                bw = info["battle_won"]
                n = int(np.argmax(n))
                if n in noise_won:
                    noise_won[n].append(bw)
                else:
                    noise_won[n] = [bw]

    def _log_noise_returns(self, test_mode, test_uniform):
        if test_mode:
            max_noise_return = -100000
            for n, rs in self.noise_returns.items():
                n_item = n
                r_mean = float(np.mean(rs))
                max_noise_return = max(r_mean, max_noise_return)
                self.logger.log_stat(
                    "{}_noise_test_ret_u_{:1}".format(n_item, test_uniform),
                    r_mean, self.t_env)
            self.logger.log_stat(
                "max_noise_test_ret_u_{:1}".format(test_uniform),
                max_noise_return, self.t_env)

        noise_won = self.noise_test_won
        prefix = "test"
        if test_uniform:
            prefix += "_uni"
        if not test_mode:
            noise_won = self.noise_train_won
            prefix = "train"
        if len(noise_won.keys()) > 0:
            max_test_won = 0
            for n, rs in noise_won.items():
                n_item = n  # int(np.argmax(n))
                r_mean = float(np.mean(rs))
                max_test_won = max(r_mean, max_test_won)
                self.logger.log_stat("{}_noise_{}_won".format(n_item, prefix),
                                     r_mean, self.t_env)
            self.logger.log_stat("max_noise_{}_won".format(prefix),
                                 max_test_won, self.t_env)
        self.noise_returns = {}
        self.noise_test_won = {}
        self.noise_train_won = {}

    def save_models(self, path):
        if self.args.noise_bandit:
            self.noise_distrib.save_model(path)
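
# NOTE: illustrative sketch only, not part of the original file. Both runners
# in this module drive each env subprocess through a small message protocol
# over a multiprocessing Pipe: ("reset", None), ("step", actions),
# ("get_env_info", None), ("get_stats", None) and ("close", None), plus
# map-specific commands such as ("close_corridor", None). The hypothetical
# worker below shows one way that protocol could be served; the real
# `env_worker` passed to `Process(target=env_worker, ...)` is defined
# elsewhere in the repo, and the env method names (get_state, get_obs, ...)
# follow the usual PyMARL MultiAgentEnv interface, which is an assumption.
def _example_env_worker(remote, env_fn_wrapper):
    # Rebuild the pickled env constructor inside the subprocess.
    # (`x` is assumed to be the attribute CloudpickleWrapper stores it under.)
    env = env_fn_wrapper.x()
    while True:
        cmd, data = remote.recv()
        if cmd == "step":
            reward, terminated, env_info = env.step(data)
            remote.send({
                # Data the runner needs to select the *next* actions
                "state": env.get_state(),
                "avail_actions": env.get_avail_actions(),
                "obs": env.get_obs(),
                # Data the runner stores for the *current* transition
                "reward": reward,
                "terminated": terminated,
                "info": env_info,
            })
        elif cmd == "reset":
            env.reset()
            remote.send({
                "state": env.get_state(),
                "avail_actions": env.get_avail_actions(),
                "obs": env.get_obs(),
            })
        elif cmd == "get_env_info":
            remote.send(env.get_env_info())
        elif cmd == "get_stats":
            remote.send(env.get_stats())
        elif cmd == "close":
            env.close()
            remote.close()
            break
        # Map-specific commands ("close_corridor", "avail_bunker") omitted.
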
class MetaNoiseRunner:
    def __init__(self, args, logger):
        self.args = args
        self.logger = logger
        self.batch_size = self.args.batch_size_run

        # Make subprocesses for the envs
        self.parent_conns, self.worker_conns = zip(
            *[Pipe() for _ in range(self.batch_size)])
        env_fn = env_REGISTRY[self.args.env]
        self.ps = [
            Process(target=env_worker,
                    args=(worker_conn,
                          CloudpickleWrapper(
                              partial(env_fn, **self.args.env_args))))
            for worker_conn in self.worker_conns
        ]

        for p in self.ps:
            p.daemon = True
            p.start()

        self.parent_conns[0].send(("get_env_info", None))
        self.env_info = self.parent_conns[0].recv()
        self.episode_limit = self.env_info["episode_limit"]

        self.t = 0
        self.t_env = 0

        self.train_returns = []
        self.test_returns = []
        self.train_stats = {}
        self.test_stats = {}

        self.log_train_stats_t = -100000

    def setup(self, scheme, groups, preprocess, mac):
        self.new_batch = partial(EpisodeBatch,
                                 scheme,
                                 groups,
                                 self.batch_size,
                                 self.episode_limit + 1,
                                 preprocess=preprocess,
                                 device=self.args.device)
        self.new_batch_single = partial(EpisodeBatch,
                                        scheme,
                                        groups,
                                        1,
                                        self.episode_limit + 1,
                                        preprocess=preprocess,
                                        device=self.args.device)
        self.mac = mac
        self.scheme = scheme
        self.groups = groups
        self.preprocess = preprocess

        if self.args.noise_bandit:
            if self.args.bandit_policy:
                self.noise_distrib = enza(self.args, logger=self.logger)
            else:
                self.noise_distrib = RBandit(self.args, logger=self.logger)
        else:
            self.noise_distrib = Uniform(self.args)

        self.noise_returns = {}
        self.noise_test_won = {}
        self.noise_train_won = {}

    def get_env_info(self):
        return self.env_info

    def save_replay(self):
        pass

    def close_env(self):
        for parent_conn in self.parent_conns:
            parent_conn.send(("close", None))

    def reset(self, test_mode=False):
        self.batch = self.new_batch()

        # Reset the envs
        for parent_conn in self.parent_conns:
            parent_conn.send(("reset", None))

        pre_transition_data = {"state": [], "avail_actions": [], "obs": []}
        # Get the obs, state and avail_actions back
        for parent_conn in self.parent_conns:
            data = parent_conn.recv()
            pre_transition_data["state"].append(data["state"])
            pre_transition_data["avail_actions"].append(data["avail_actions"])
            pre_transition_data["obs"].append(data["obs"])

        self.batch.update(pre_transition_data, ts=0)

        self.noise = self.noise_distrib.sample(self.batch['state'][:, 0],
                                               test_mode)
        self.batch.update({"noise": self.noise}, ts=0)

        self.t = 0
        self.env_steps_this_run = 0

    def reset_first(self, test_mode=False):
        self.batch = self.new_batch_single()
        self.parent_conns[0].send(("reset", None))

        pre_transition_data = {"state": [], "avail_actions": [], "obs": []}
        data = self.parent_conns[0].recv()
        pre_transition_data["state"].append(data["state"])
        pre_transition_data["avail_actions"].append(data["avail_actions"])
        pre_transition_data["obs"].append(data["obs"])

        self.batch.update(pre_transition_data, ts=0)

        if test_mode or not self.args.noise_bandit:
            self.noise = self.noise_distrib.sample(self.batch['state'][:, 0],
                                                   test_mode)[0:1, :]
        else:
            self.noise = self.noise_distrib.sample(self.batch['state'][:, 0],
                                                   test_mode)
        self.batch.update({"noise": self.noise}, ts=0)

        self.t = 0

    def run(self,
            test_mode=False,
            meta_mode=False,
            test_uniform=False,
            use_rode=False):
        self.reset_first(test_uniform)

        episode_return = 0.0
        episode_length = 0
        if self.args.q_net_ensemble:
            chosen_index = random.randint(0, self.args.ensemble_num - 1)
            chosen_mac = self.mac[chosen_index]
        else:
            chosen_mac = self.mac
        chosen_mac.init_hidden(batch_size=1)
        if (self.args.mac == "separate_mac"
                or self.args.mac == "hierarchical_mac" or self.args.use_roma):
            chosen_mac.init_latent(batch_size=1)
        terminated = False
        # may store extra stats like battle won. this is filled in ORDER OF TERMINATION
        final_env_infos = []
        if meta_mode:
            log_ps = []

        while True:
            if meta_mode:
                action, logp = chosen_mac.select_actions(self.batch,
                                                         t_ep=self.t,
                                                         t_env=self.t_env,
                                                         test_mode=test_mode,
                                                         need_log_p=meta_mode)
                log_ps.append(logp)
            else:
                action = chosen_mac.select_actions(self.batch,
                                                   t_ep=self.t,
                                                   t_env=self.t_env,
                                                   test_mode=test_mode,
                                                   need_log_p=meta_mode)
            cpu_action = action.to("cpu").numpy()

            # Update the actions taken
            action_chosen = {
                "actions": action.unsqueeze(1),
            }
            self.batch.update(action_chosen, ts=self.t, mark_filled=False)

            # Send actions to each env
            if terminated:
                break
            self.parent_conns[0].send(("step", cpu_action[0]))

            post_transition_data = {"reward": [], "terminated": []}
            pre_transition_data = {"state": [], "avail_actions": [], "obs": []}
            data = self.parent_conns[0].recv()
            post_transition_data["reward"].append((data["reward"], ))

            episode_return += data["reward"]
            episode_length += 1

            env_terminated = False
            if data["terminated"] and not data["info"].get(
                    "episode_limit", False):
                env_terminated = True
            terminated = data["terminated"]
            post_transition_data["terminated"].append((env_terminated, ))

            pre_transition_data["state"].append(data["state"])
            pre_transition_data["avail_actions"].append(data["avail_actions"])
            pre_transition_data["obs"].append(data["obs"])

            self.batch.update(post_transition_data,
                              ts=self.t,
                              mark_filled=False)

            self.t += 1

            self.batch.update(pre_transition_data, ts=self.t, mark_filled=True)

        # collect log p for meta policy gradient
        if meta_mode:
            all_log_p = th.cat([it.unsqueeze(1) for it in log_ps[:-1]],
                               dim=1)  # [8*max_ep_len*3]
            batch_log_p = th.sum(all_log_p, [1, 2]) / all_log_p.size(1)

        if not test_mode:
            self.t_env += self.t

        cur_stats = self.test_stats if test_mode else self.train_stats
        cur_returns = self.test_returns if test_mode else self.train_returns
        log_prefix = "test_" if test_mode else ""
        infos = [cur_stats] + final_env_infos
        cur_stats.update({
            k: sum(d.get(k, 0) for d in infos)
            for k in set.union(*[set(d) for d in infos])
        })
        cur_stats["n_episodes"] = 1 + cur_stats.get("n_episodes", 0)
        cur_stats["ep_length"] = episode_length + cur_stats.get("ep_length", 0)

        cur_returns.append(episode_return)

        self._update_noise_returns([episode_return], self.noise,
                                   final_env_infos, test_mode)
        self.noise_distrib.update_returns(self.batch['state'][:, 0],
                                          self.noise, [episode_return],
                                          test_mode, self.t_env)

        n_test_runs = max(1, self.args.test_nepisode)
        if test_mode and (len(self.test_returns) == n_test_runs):
            self._log(cur_returns, cur_stats, log_prefix)
        elif not test_mode and self.t_env - self.log_train_stats_t >= self.args.runner_log_interval:
            self._log(cur_returns, cur_stats, log_prefix)
            if hasattr(chosen_mac.action_selector, "epsilon"):
                self.logger.log_stat("epsilon",
                                     chosen_mac.action_selector.epsilon,
                                     self.t_env)
            self.log_train_stats_t = self.t_env

        final_reward = [episode_return / episode_length
                        ] if self.args.use_step_reward else episode_return

        if meta_mode:
            return self.batch, batch_log_p, final_reward
        else:
            return self.batch, final_reward

    def run_meta(self,
                 test_mode=False,
                 meta_mode=False,
                 test_uniform=False,
                 use_rode=False):
        self.reset(test_uniform)

        all_terminated = False
        episode_returns = [0 for _ in range(self.batch_size)]
        episode_lengths = [0 for _ in range(self.batch_size)]
        if self.args.q_net_ensemble:
            chosen_index = random.randint(0, self.args.ensemble_num - 1)
            chosen_mac = self.mac[chosen_index]
        else:
            chosen_mac = self.mac
        chosen_mac.init_hidden(batch_size=self.batch_size)
        if (self.args.mac == "separate_mac"
                or self.args.mac == "hierarchical_mac" or self.args.use_roma):
            chosen_mac.init_latent(batch_size=self.batch_size)
        terminated = [False for _ in range(self.batch_size)]
        envs_not_terminated = [
            b_idx for b_idx, termed in enumerate(terminated) if not termed
        ]
        # may store extra stats like battle won. this is filled in ORDER OF TERMINATION
        final_env_infos = []
        if meta_mode:
            log_ps = []

        while True:
            if meta_mode:
                actions, logp = chosen_mac.select_actions(
                    self.batch,
                    t_ep=self.t,
                    t_env=self.t_env,
                    bs=envs_not_terminated,
                    test_mode=test_mode,
                    need_log_p=meta_mode)
                log_ps.append(logp)
            else:
                actions = chosen_mac.select_actions(self.batch,
                                                    t_ep=self.t,
                                                    t_env=self.t_env,
                                                    bs=envs_not_terminated,
                                                    test_mode=test_mode,
                                                    need_log_p=meta_mode)
            # Pass the entire batch of experiences up to now to the agents
            # Receive the actions for each agent at this timestep in a batch
            # for each un-terminated env
            cpu_actions = actions.to("cpu").numpy()

            # Update the actions taken
            actions_chosen = {"actions": actions.unsqueeze(1)}
            self.batch.update(actions_chosen,
                              bs=envs_not_terminated,
                              ts=self.t,
                              mark_filled=False)

            # Send actions to each env
            action_idx = 0
            for idx, parent_conn in enumerate(self.parent_conns):
                if idx in envs_not_terminated:  # We produced actions for this env
                    if not terminated[idx]:  # Only send the actions to the env if it hasn't terminated
                        parent_conn.send(("step", cpu_actions[action_idx]))
                    action_idx += 1  # actions is not a list over every env

            # Update envs_not_terminated
            envs_not_terminated = [
                b_idx for b_idx, termed in enumerate(terminated) if not termed
            ]
            all_terminated = all(terminated)
            if all_terminated:
                break

            # Post step data we will insert for the current timestep
            post_transition_data = {"reward": [], "terminated": []}
            # Data for the next step we will insert in order to select an action
            pre_transition_data = {"state": [], "avail_actions": [], "obs": []}

            # Receive data back for each unterminated env
            for idx, parent_conn in enumerate(self.parent_conns):
                if not terminated[idx]:
                    data = parent_conn.recv()
                    # Remaining data for this current timestep
                    post_transition_data["reward"].append((data["reward"], ))

                    episode_returns[idx] += data["reward"]
                    episode_lengths[idx] += 1
                    if not test_mode:
                        self.env_steps_this_run += 1

                    env_terminated = False
                    if data["terminated"]:
                        final_env_infos.append(data["info"])
                    if data["terminated"] and not data["info"].get(
                            "episode_limit", False):
                        env_terminated = True
                    terminated[idx] = data["terminated"]
                    post_transition_data["terminated"].append(
                        (env_terminated, ))

                    # Data for the next timestep needed to select an action
                    pre_transition_data["state"].append(data["state"])
                    pre_transition_data["avail_actions"].append(
                        data["avail_actions"])
                    pre_transition_data["obs"].append(data["obs"])

            # Add post_transition data into the batch
            self.batch.update(post_transition_data,
                              bs=envs_not_terminated,
                              ts=self.t,
                              mark_filled=False)

            # Move onto the next timestep
            self.t += 1

            # Add the pre-transition data
            self.batch.update(pre_transition_data,
                              bs=envs_not_terminated,
                              ts=self.t,
                              mark_filled=True)

        # collect log p for meta policy gradient
        if meta_mode:
            all_log_p = th.cat([it.unsqueeze(1) for it in log_ps[:-1]],
                               dim=1)  # [8*max_ep_len*3]
            ind = th.zeros([self.batch_size, max(episode_lengths)],
                           device=self.batch.device)
            for i in range(self.batch_size):
                ind[i, :episode_lengths[i]] = 1.0
            batch_log_p = th.sum(all_log_p * ind.unsqueeze(2),
                                 [1, 2]) / th.sum(ind, 1)

        if not test_mode:
            self.t_env += self.t

        # Get stats back for each env
        for parent_conn in self.parent_conns:
            parent_conn.send(("get_stats", None))

        env_stats = []
        for parent_conn in self.parent_conns:
            env_stat = parent_conn.recv()
            env_stats.append(env_stat)

        cur_stats = self.test_stats if test_mode else self.train_stats
        cur_returns = self.test_returns if test_mode else self.train_returns
        log_prefix = "test_" if test_mode else ""
        if test_uniform:
            log_prefix += "uni_"
        infos = [cur_stats] + final_env_infos
        cur_stats.update({
            k: sum(d.get(k, 0) for d in infos)
            for k in set.union(*[set(d) for d in infos])
        })
        cur_stats["n_episodes"] = self.batch_size + cur_stats.get(
            "n_episodes", 0)
        cur_stats["ep_length"] = sum(episode_lengths) + cur_stats.get(
            "ep_length", 0)

        cur_returns.extend(episode_returns)

        self._update_noise_returns(episode_returns, self.noise,
                                   final_env_infos, test_mode)
        self.noise_distrib.update_returns(self.batch['state'][:, 0],
                                          self.noise, episode_returns,
                                          test_mode, self.t_env)

        n_test_runs = max(
            1, self.args.test_nepisode // self.batch_size) * self.batch_size
        if test_mode and (len(self.test_returns) == n_test_runs):
            self._log(cur_returns, cur_stats, log_prefix)
        elif not test_mode and self.t_env - self.log_train_stats_t >= self.args.runner_log_interval:
            self._log(cur_returns, cur_stats, log_prefix)
            if hasattr(chosen_mac.action_selector, "epsilon"):
                self.logger.log_stat("epsilon",
                                     chosen_mac.action_selector.epsilon,
                                     self.t_env)
            self.log_train_stats_t = self.t_env

        final_reward = [
            i / j for i, j in zip(episode_returns, episode_lengths)
        ] if self.args.use_step_reward else episode_returns

        if meta_mode:
            return self.batch, batch_log_p, final_reward
        else:
            return self.batch, final_reward

    def get_log_p(self, buffer):
        if self.args.q_net_ensemble:
            chosen_index = random.randint(0, self.args.ensemble_num - 1)
            chosen_mac = self.mac[chosen_index]
        else:
            chosen_mac = self.mac
        chosen_mac.init_hidden(batch_size=buffer.batch_size)
        if self.args.use_roma:
            chosen_mac.init_latent(buffer.batch_size)
        buffer.to(self.batch.device)

        log_ps = []
        terminated = th.zeros(buffer.batch_size, device=self.batch.device)
        ind = th.zeros([buffer.batch_size, buffer.max_seq_length],
                       device=self.batch.device)
        max_ep_len = 0
        for i in range(buffer.max_seq_length):
            envs_not_terminated = [
                b_idx for b_idx, termed in enumerate(terminated)
                if termed < 0.01
            ]
            ra = chosen_mac.select_actions(buffer,
                                           t_ep=i,
                                           t_env=self.t_env,
                                           bs=envs_not_terminated,
                                           test_mode=False,
                                           need_log_p=True)
            log_p = ra[-1]
            if type(log_p) == tuple:
                log_p = log_p[0] + log_p[1]
            log_ps.append(log_p)
            ind[~(terminated.round().to(th.bool)), i] = 1.0
            terminated += buffer["terminated"][:, i, 0]
            if sum(terminated).round().item() == buffer.batch_size:
                max_ep_len = i + 1
                break
        if max_ep_len == 0:
            raise Exception("Some episodes have no 'terminated' mark.")
        ind = ind[:, :max_ep_len]

        all_log_p = th.cat([it.unsqueeze(1) for it in log_ps],
                           dim=1)  # [32*max_ep_len*3]
        # for i in range(self.batch_size):
        #     ind[i, :episode_lengths[i]] = 1.0
        batch_log_p = th.sum(all_log_p * ind.unsqueeze(2), [1, 2]) / th.sum(
            ind, 1)
        return batch_log_p

    def _update_noise_returns(self, returns, noise, stats, test_mode):
        for n, r in zip(noise, returns):
            n = int(np.argmax(n))
            if n in self.noise_returns:
                self.noise_returns[n].append(r)
            else:
                self.noise_returns[n] = [r]
        if test_mode:
            noise_won = self.noise_test_won
        else:
            noise_won = self.noise_train_won
        if stats != [] and "battle_won" in stats[0]:
            for n, info in zip(noise, stats):
                if "battle_won" not in info:
                    continue
                bw = info["battle_won"]
                n = int(np.argmax(n))
                if n in noise_won:
                    noise_won[n].append(bw)
                else:
                    noise_won[n] = [bw]

    def _log_noise_returns(self, test_mode, test_uniform):
        if test_mode:
            max_noise_return = -100000
            for n, rs in self.noise_returns.items():
                n_item = n
                r_mean = float(np.mean(rs))
                max_noise_return = max(r_mean, max_noise_return)
                self.logger.log_stat(
                    "{}_noise_test_ret_u_{:1}".format(n_item, test_uniform),
                    r_mean, self.t_env)
            self.logger.log_stat(
                "max_noise_test_ret_u_{:1}".format(test_uniform),
                max_noise_return, self.t_env)

        noise_won = self.noise_test_won
        prefix = "test"
        if test_uniform:
            prefix += "_uni"
        if not test_mode:
            noise_won = self.noise_train_won
            prefix = "train"
        if len(noise_won.keys()) > 0:
            max_test_won = 0
            for n, rs in noise_won.items():
                n_item = n  # int(np.argmax(n))
                r_mean = float(np.mean(rs))
                max_test_won = max(r_mean, max_test_won)
                self.logger.log_stat("{}_noise_{}_won".format(n_item, prefix),
                                     r_mean, self.t_env)
            self.logger.log_stat("max_noise_{}_won".format(prefix),
                                 max_test_won, self.t_env)
        self.noise_returns = {}
        self.noise_test_won = {}
        self.noise_train_won = {}

    def _log(self, returns, stats, prefix):
        self.logger.log_stat(prefix + "return_mean", np.mean(returns),
                             self.t_env)
        self.logger.log_stat(prefix + "return_std", np.std(returns),
                             self.t_env)
        returns.clear()

        for k, v in stats.items():
            if k != "n_episodes":
                self.logger.log_stat(prefix + k + "_mean",
                                     v / stats["n_episodes"], self.t_env)
        stats.clear()

    def cuda(self):
        if self.args.noise_bandit:
            self.noise_distrib.cuda()
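
# NOTE: illustrative sketch only, not part of the original file. Both runners
# rely on a small sampler interface from the noise distributions (`Uniform`,
# `RBandit`, `enza`): sample(states, test_mode), update_returns(states, noise,
# returns, test_mode, t_env), cuda() and, for the bandits, save_model(path).
# The minimal uniform one-hot sampler below is a hypothetical example of that
# interface; the attribute name `noise_dim` is an assumption, and the real
# implementations live in the imported bandit modules.
class _ExampleUniformNoise:
    def __init__(self, args):
        self.noise_dim = args.noise_dim  # assumed config attribute

    def sample(self, states, test_mode):
        # One one-hot noise vector per episode in the batch; `states` and
        # `test_mode` are ignored by a uniform sampler.
        batch_size = states.shape[0]
        idx = np.random.randint(self.noise_dim, size=batch_size)
        noise = np.zeros((batch_size, self.noise_dim), dtype=np.float32)
        noise[np.arange(batch_size), idx] = 1.0
        return noise

    def update_returns(self, states, noise, returns, test_mode, t_env):
        # Nothing to fit for a uniform sampler; the bandit variants learn here.
        pass

    def cuda(self):
        pass

    def save_model(self, path):
        pass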