Example #1
    def setup(self, scheme, groups, preprocess, mac):
        self.new_batch = partial(EpisodeBatch,
                                 scheme,
                                 groups,
                                 self.batch_size,
                                 self.episode_limit + 1,
                                 preprocess=preprocess,
                                 device=self.args.device)
        self.mac = mac
        self.scheme = scheme
        self.groups = groups
        self.preprocess = preprocess

        # Set up the noise distribution sampler
        if self.args.noise_bandit:
            if self.args.bandit_policy:
                self.noise_distrib = enza(self.args, logger=self.logger)
            else:
                self.noise_distrib = RBandit(self.args, logger=self.logger)
        else:
            self.noise_distrib = Uniform(self.args)

        self.noise_returns = {}
        self.noise_test_won = {}
        self.noise_train_won = {}
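
This `setup` fragment assumes the surrounding runner class plus `functools.partial`, the repo's `EpisodeBatch`, and the three noise samplers (`enza`, `RBandit`, `Uniform`). The core pattern is pre-binding `EpisodeBatch`'s constructor arguments so every episode can create a fresh buffer with a bare `self.new_batch()` call. A minimal runnable sketch of that pattern, with `ToyBatch` as an illustrative stand-in (not the repo's actual `EpisodeBatch` signature):

from functools import partial

class ToyBatch:
    # Illustrative stand-in for EpisodeBatch: records the arguments that
    # the runner pre-binds once per run.
    def __init__(self, scheme, groups, batch_size, max_seq_length,
                 preprocess=None, device="cpu"):
        self.scheme, self.groups = scheme, groups
        self.batch_size, self.max_seq_length = batch_size, max_seq_length
        self.preprocess, self.device = preprocess, device

# Bind everything that stays constant for the whole run...
new_batch = partial(ToyBatch, {"state": {"vshape": 4}}, {"agents": 3},
                    8, 150 + 1, preprocess=None, device="cpu")

# ...so each episode just calls new_batch() for a fresh buffer.
batch = new_batch()
assert batch.batch_size == 8 and batch.max_seq_length == 151
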
Example #2
# Imports assumed by this example (module paths follow the PyMARL/MAVEN
# layout and may differ in your repo; env_worker and CloudpickleWrapper are
# defined alongside this runner):
from functools import partial
from multiprocessing import Pipe, Process

import numpy as np

from components.episode_buffer import EpisodeBatch
from envs import REGISTRY as env_REGISTRY
from modules.bandits.uniform import Uniform
from modules.bandits.reinforce_hierarchial import EZ_agent as enza
from modules.bandits.returns_bandit import ReturnsBandit as RBandit


class ParallelRunner:
    def __init__(self, args, logger):
        self.args = args
        self.logger = logger
        self.batch_size = self.args.batch_size_run

        # Make subprocesses for the envs
        self.parent_conns, self.worker_conns = zip(
            *[Pipe() for _ in range(self.batch_size)])
        env_fn = env_REGISTRY[self.args.env]
        self.ps = [
            Process(target=env_worker,
                    args=(worker_conn,
                          CloudpickleWrapper(
                              partial(env_fn,
                                      env_args=self.args.env_args,
                                      args=self.args))))
            for worker_conn in self.worker_conns
        ]

        for p in self.ps:
            p.daemon = True
            p.start()

        self.parent_conns[0].send(("get_env_info", None))
        self.env_info = self.parent_conns[0].recv()
        self.episode_limit = self.env_info["episode_limit"]

        self.t = 0

        self.t_env = 0

        self.train_returns = []
        self.test_returns = []
        self.train_stats = {}
        self.test_stats = {}

        self.log_train_stats_t = -100000

    def cuda(self):
        if self.args.noise_bandit:
            self.noise_distrib.cuda()

    def setup(self, scheme, groups, preprocess, mac):
        self.new_batch = partial(EpisodeBatch,
                                 scheme,
                                 groups,
                                 self.batch_size,
                                 self.episode_limit + 1,
                                 preprocess=preprocess,
                                 device=self.args.device)
        self.mac = mac
        self.scheme = scheme
        self.groups = groups
        self.preprocess = preprocess

        # Set up the noise distribution sampler
        if self.args.noise_bandit:
            if self.args.bandit_policy:
                self.noise_distrib = enza(self.args, logger=self.logger)
            else:
                self.noise_distrib = RBandit(self.args, logger=self.logger)
        else:
            self.noise_distrib = Uniform(self.args)

        self.noise_returns = {}
        self.noise_test_won = {}
        self.noise_train_won = {}

    def get_env_info(self):
        return self.env_info

    def save_replay(self):
        pass

    def close_env(self):
        for parent_conn in self.parent_conns:
            parent_conn.send(("close", None))

    def reset(self, test_mode=False):
        self.batch = self.new_batch()

        # Reset the envs
        for parent_conn in self.parent_conns:
            parent_conn.send(("reset", None))

        pre_transition_data = {"state": [], "avail_actions": [], "obs": []}
        # Get the obs, state and avail_actions back
        for parent_conn in self.parent_conns:
            data = parent_conn.recv()
            pre_transition_data["state"].append(data["state"])
            pre_transition_data["avail_actions"].append(data["avail_actions"])
            pre_transition_data["obs"].append(data["obs"])

        self.batch.update(pre_transition_data, ts=0)

        # Sample the noise at the beginning of the episode
        self.noise = self.noise_distrib.sample(self.batch['state'][:, 0],
                                               test_mode)

        self.batch.update({"noise": self.noise}, ts=0)

        self.t = 0
        self.env_steps_this_run = 0

        if "map_name" in self.args.env_args and self.args.env_args[
                "map_name"] == "2_corridors":
            if self.t_env > 5 * 1000 * 1000:
                for parent_conn in self.parent_conns:
                    parent_conn.send(("close_corridor", None))

        if "map_name" in self.args.env_args and self.args.env_args[
                "map_name"] == "bunker_vs_6m":
            if self.t_env > 3 * 1000 * 1000:
                for parent_conn in self.parent_conns:
                    parent_conn.send(("avail_bunker", None))

    def run(self, test_mode=False, test_uniform=False):
        # reset()'s flag is only used for noise sampling, so pass test_uniform
        self.reset(test_uniform)

        all_terminated = False
        episode_returns = [0 for _ in range(self.batch_size)]
        episode_lengths = [0 for _ in range(self.batch_size)]
        self.mac.init_hidden(batch_size=self.batch_size)
        terminated = [False for _ in range(self.batch_size)]
        envs_not_terminated = [
            b_idx for b_idx, termed in enumerate(terminated) if not termed
        ]
        final_env_infos = []

        while True:

            # Pass the entire batch of experiences up till now to the agents
            # Receive the actions for each agent at this timestep in a batch for each un-terminated env
            actions = self.mac.select_actions(self.batch,
                                              t_ep=self.t,
                                              t_env=self.t_env,
                                              bs=envs_not_terminated,
                                              test_mode=test_mode)
            cpu_actions = actions.to("cpu").numpy()

            # Update the actions taken
            actions_chosen = {"actions": actions.unsqueeze(1)}
            self.batch.update(actions_chosen,
                              bs=envs_not_terminated,
                              ts=self.t,
                              mark_filled=False)

            # Send actions to each env. cpu_actions is aligned with the list
            # that select_actions was called with, so dispatch before
            # refreshing envs_not_terminated (matching run_meta in Example #3)
            action_idx = 0
            for idx, parent_conn in enumerate(self.parent_conns):
                if idx in envs_not_terminated:  # We produced actions for this env
                    if not terminated[idx]:  # Only send if it hasn't terminated
                        parent_conn.send(("step", cpu_actions[action_idx]))
                    action_idx += 1  # cpu_actions only covers envs in the list

            # Update envs_not_terminated
            envs_not_terminated = [
                b_idx for b_idx, termed in enumerate(terminated) if not termed
            ]
            all_terminated = all(terminated)
            if all_terminated:
                break

            # Post step data we will insert for the current timestep
            post_transition_data = {"reward": [], "terminated": []}
            # Data for the next step we will insert in order to select an action
            pre_transition_data = {"state": [], "avail_actions": [], "obs": []}

            # Receive data back for each unterminated env
            for idx, parent_conn in enumerate(self.parent_conns):
                if not terminated[idx]:
                    data = parent_conn.recv()
                    # Remaining data for this current timestep
                    post_transition_data["reward"].append((data["reward"], ))

                    episode_returns[idx] += data["reward"]
                    episode_lengths[idx] += 1
                    if not test_mode:
                        self.env_steps_this_run += 1

                    env_terminated = False
                    if data["terminated"]:
                        final_env_infos.append(data["info"])
                    if data["terminated"] and not data["info"].get(
                            "episode_limit", False):
                        env_terminated = True
                    terminated[idx] = data["terminated"]
                    post_transition_data["terminated"].append(
                        (env_terminated, ))

                    # Data for the next timestep needed to select an action
                    pre_transition_data["state"].append(data["state"])
                    pre_transition_data["avail_actions"].append(
                        data["avail_actions"])
                    pre_transition_data["obs"].append(data["obs"])

            # Add post_transition data into the batch
            self.batch.update(post_transition_data,
                              bs=envs_not_terminated,
                              ts=self.t,
                              mark_filled=False)

            # Move onto the next timestep
            self.t += 1

            # Add the pre-transition data
            self.batch.update(pre_transition_data,
                              bs=envs_not_terminated,
                              ts=self.t,
                              mark_filled=True)

        if not test_mode:
            self.t_env += self.env_steps_this_run

        # Get stats back for each env
        for parent_conn in self.parent_conns:
            parent_conn.send(("get_stats", None))

        env_stats = []
        for parent_conn in self.parent_conns:
            env_stat = parent_conn.recv()
            env_stats.append(env_stat)

        cur_stats = self.test_stats if test_mode else self.train_stats
        cur_returns = self.test_returns if test_mode else self.train_returns
        log_prefix = "test_" if test_mode else ""
        if test_uniform:
            log_prefix += "uni_"
        # Merge episode infos into the running stats: for every key seen in
        # any info dict, sum its values across this batch and the running
        # totals so far
        infos = [cur_stats] + final_env_infos
        cur_stats.update({
            k: sum(d.get(k, 0) for d in infos)
            for k in set.union(*[set(d) for d in infos])
        })
        cur_stats["n_episodes"] = self.batch_size + cur_stats.get(
            "n_episodes", 0)
        cur_stats["ep_length"] = sum(episode_lengths) + cur_stats.get(
            "ep_length", 0)

        cur_returns.extend(episode_returns)

        self._update_noise_returns(episode_returns, self.noise,
                                   final_env_infos, test_mode)
        self.noise_distrib.update_returns(self.batch['state'][:, 0],
                                          self.noise, episode_returns,
                                          test_mode, self.t_env)

        n_test_runs = max(
            1, self.args.test_nepisode // self.batch_size) * self.batch_size
        if test_mode and (len(self.test_returns) == n_test_runs):
            self._log_noise_returns(test_mode, test_uniform)
            self._log(cur_returns, cur_stats, log_prefix)
        elif self.t_env - self.log_train_stats_t >= self.args.runner_log_interval:
            self._log_noise_returns(test_mode, test_uniform)
            self._log(cur_returns, cur_stats, log_prefix)
            if hasattr(self.mac.action_selector, "epsilon"):
                self.logger.log_stat("epsilon",
                                     self.mac.action_selector.epsilon,
                                     self.t_env)
            self.log_train_stats_t = self.t_env

        return self.batch

    def _log(self, returns, stats, prefix):
        self.logger.log_stat(prefix + "return_mean", np.mean(returns),
                             self.t_env)
        self.logger.log_stat(prefix + "return_std", np.std(returns),
                             self.t_env)
        returns.clear()

        for k, v in stats.items():
            if k != "n_episodes":
                self.logger.log_stat(prefix + k + "_mean",
                                     v / stats["n_episodes"], self.t_env)
        stats.clear()

    def _update_noise_returns(self, returns, noise, stats, test_mode):
        for n, r in zip(noise, returns):
            n = int(np.argmax(n))
            if n in self.noise_returns:
                self.noise_returns[n].append(r)
            else:
                self.noise_returns[n] = [r]
        if test_mode:
            noise_won = self.noise_test_won
        else:
            noise_won = self.noise_train_won

        if stats and "battle_won" in stats[0]:
            for n, info in zip(noise, stats):
                if "battle_won" not in info:
                    continue
                bw = info["battle_won"]
                n = int(np.argmax(n))
                if n in noise_won:
                    noise_won[n].append(bw)
                else:
                    noise_won[n] = [bw]

    def _log_noise_returns(self, test_mode, test_uniform):
        if test_mode:
            max_noise_return = -100000
            for n, rs in self.noise_returns.items():
                n_item = n
                r_mean = float(np.mean(rs))
                max_noise_return = max(r_mean, max_noise_return)
                self.logger.log_stat(
                    "{}_noise_test_ret_u_{:1}".format(n_item, test_uniform),
                    r_mean, self.t_env)
            self.logger.log_stat(
                "max_noise_test_ret_u_{:1}".format(test_uniform),
                max_noise_return, self.t_env)

        noise_won = self.noise_test_won
        prefix = "test"
        if test_uniform:
            prefix += "_uni"
        if not test_mode:
            noise_won = self.noise_train_won
            prefix = "train"
        if len(noise_won.keys()) > 0:
            max_test_won = 0
            for n, rs in noise_won.items():
                n_item = n
                r_mean = float(np.mean(rs))
                max_test_won = max(r_mean, max_test_won)
                self.logger.log_stat("{}_noise_{}_won".format(n_item, prefix),
                                     r_mean, self.t_env)
            self.logger.log_stat("max_noise_{}_won".format(prefix),
                                 max_test_won, self.t_env)
        self.noise_returns = {}
        self.noise_test_won = {}
        self.noise_train_won = {}

    def save_models(self, path):
        if self.args.noise_bandit:
            self.noise_distrib.save_model(path)
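
`ParallelRunner` never steps an environment directly: each env lives in a subprocess, and the runner exchanges `("command", payload)` tuples with it over a `multiprocessing.Pipe`, with `CloudpickleWrapper` keeping the env-constructing closure picklable on the way into the child. Below is a minimal, runnable sketch of the worker side this protocol assumes; `ToyEnv` and `toy_env_worker` are illustrative stand-ins for the SMAC-style `MultiAgentEnv` and the repo's real `env_worker`, not their actual code:

import numpy as np
from multiprocessing import Pipe, Process

class ToyEnv:
    # Illustrative stand-in for the SMAC-style MultiAgentEnv interface.
    def __init__(self, episode_limit=5, n_agents=2):
        self.episode_limit, self.n_agents, self.t = episode_limit, n_agents, 0
    def reset(self):
        self.t = 0
    def step(self, actions):
        self.t += 1
        done = self.t >= self.episode_limit
        return 1.0, done, {"episode_limit": done}  # reward, terminated, info
    def get_state(self):
        return np.zeros(4, dtype=np.float32)
    def get_obs(self):
        return [np.zeros(3, dtype=np.float32)] * self.n_agents
    def get_avail_actions(self):
        return [[1, 1]] * self.n_agents
    def get_env_info(self):
        return {"episode_limit": self.episode_limit, "n_agents": self.n_agents}
    def get_stats(self):
        return {}
    def close(self):
        pass

def toy_env_worker(remote, make_env):
    # Each reply mirrors what the runner's reset()/run() loops expect to recv.
    env = make_env()
    while True:
        cmd, data = remote.recv()
        if cmd == "reset":
            env.reset()
            remote.send({"state": env.get_state(),
                         "avail_actions": env.get_avail_actions(),
                         "obs": env.get_obs()})
        elif cmd == "step":
            reward, terminated, info = env.step(data)
            remote.send({"state": env.get_state(),
                         "avail_actions": env.get_avail_actions(),
                         "obs": env.get_obs(), "reward": reward,
                         "terminated": terminated, "info": info})
        elif cmd == "get_env_info":
            remote.send(env.get_env_info())
        elif cmd == "get_stats":
            remote.send(env.get_stats())
        elif cmd == "close":
            env.close()
            remote.close()
            break

if __name__ == "__main__":
    parent_conn, worker_conn = Pipe()
    p = Process(target=toy_env_worker, args=(worker_conn, ToyEnv))
    p.daemon = True
    p.start()
    parent_conn.send(("get_env_info", None))
    print(parent_conn.recv())  # {'episode_limit': 5, 'n_agents': 2}
    parent_conn.send(("close", None))
    p.join()
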
Example #3
# Imports assumed: the same set as the ParallelRunner example above, plus:
import random
import torch as th


class MetaNoiseRunner:
    def __init__(self, args, logger):
        self.args = args
        self.logger = logger
        self.batch_size = self.args.batch_size_run

        # Make subprocesses for the envs
        self.parent_conns, self.worker_conns = zip(
            *[Pipe() for _ in range(self.batch_size)])
        env_fn = env_REGISTRY[self.args.env]
        self.ps = [
            Process(target=env_worker,
                    args=(worker_conn,
                          CloudpickleWrapper(
                              partial(env_fn, **self.args.env_args))))
            for worker_conn in self.worker_conns
        ]

        for p in self.ps:
            p.daemon = True
            p.start()

        self.parent_conns[0].send(("get_env_info", None))
        self.env_info = self.parent_conns[0].recv()
        self.episode_limit = self.env_info["episode_limit"]

        self.t = 0

        self.t_env = 0

        self.train_returns = []
        self.test_returns = []
        self.train_stats = {}
        self.test_stats = {}

        self.log_train_stats_t = -100000

    def setup(self, scheme, groups, preprocess, mac):
        self.new_batch = partial(EpisodeBatch,
                                 scheme,
                                 groups,
                                 self.batch_size,
                                 self.episode_limit + 1,
                                 preprocess=preprocess,
                                 device=self.args.device)
        self.new_batch_single = partial(EpisodeBatch,
                                        scheme,
                                        groups,
                                        1,
                                        self.episode_limit + 1,
                                        preprocess=preprocess,
                                        device=self.args.device)
        self.mac = mac
        self.scheme = scheme
        self.groups = groups
        self.preprocess = preprocess
        if self.args.noise_bandit:
            if self.args.bandit_policy:
                self.noise_distrib = enza(self.args, logger=self.logger)
            else:
                self.noise_distrib = RBandit(self.args, logger=self.logger)
        else:
            self.noise_distrib = Uniform(self.args)
        self.noise_returns = {}
        self.noise_test_won = {}
        self.noise_train_won = {}

    def get_env_info(self):
        return self.env_info

    def save_replay(self):
        pass

    def close_env(self):
        for parent_conn in self.parent_conns:
            parent_conn.send(("close", None))

    def reset(self, test_mode=False):
        self.batch = self.new_batch()

        # Reset the envs
        for parent_conn in self.parent_conns:
            parent_conn.send(("reset", None))

        pre_transition_data = {"state": [], "avail_actions": [], "obs": []}
        # Get the obs, state and avail_actions back
        for parent_conn in self.parent_conns:
            data = parent_conn.recv()
            pre_transition_data["state"].append(data["state"])
            pre_transition_data["avail_actions"].append(data["avail_actions"])
            pre_transition_data["obs"].append(data["obs"])

        self.batch.update(pre_transition_data, ts=0)
        self.noise = self.noise_distrib.sample(self.batch['state'][:, 0],
                                               test_mode)
        self.batch.update({"noise": self.noise}, ts=0)
        self.t = 0
        self.env_steps_this_run = 0

    def reset_first(self, test_mode=False):
        self.batch = self.new_batch_single()
        self.parent_conns[0].send(("reset", None))
        pre_transition_data = {"state": [], "avail_actions": [], "obs": []}
        data = self.parent_conns[0].recv()
        pre_transition_data["state"].append(data["state"])
        pre_transition_data["avail_actions"].append(data["avail_actions"])
        pre_transition_data["obs"].append(data["obs"])
        self.batch.update(pre_transition_data, ts=0)
        if test_mode or not self.args.noise_bandit:
            self.noise = self.noise_distrib.sample(self.batch['state'][:, 0],
                                                   test_mode)[0:1, :]
        else:
            self.noise = self.noise_distrib.sample(self.batch['state'][:, 0],
                                                   test_mode)
        self.batch.update({"noise": self.noise}, ts=0)
        self.t = 0

    def run(self,
            test_mode=False,
            meta_mode=False,
            test_uniform=False,
            use_rode=False):
        self.reset_first(test_uniform)
        episode_return = 0.0
        episode_length = 0
        if self.args.q_net_ensemble:
            chosen_index = random.randint(0, self.args.ensemble_num - 1)
            chosen_mac = self.mac[chosen_index]
        else:
            chosen_mac = self.mac
        chosen_mac.init_hidden(batch_size=1)
        if self.args.mac in ("separate_mac", "hierarchical_mac") or self.args.use_roma:
            chosen_mac.init_latent(batch_size=1)
        terminated = False
        final_env_infos = []  # may store extra stats like battle_won; filled in order of termination
        if meta_mode:
            log_ps = []
        while True:
            if meta_mode:
                action, logp = chosen_mac.select_actions(self.batch,
                                                         t_ep=self.t,
                                                         t_env=self.t_env,
                                                         test_mode=test_mode,
                                                         need_log_p=meta_mode)
                log_ps.append(logp)
            else:
                action = chosen_mac.select_actions(self.batch,
                                                   t_ep=self.t,
                                                   t_env=self.t_env,
                                                   test_mode=test_mode,
                                                   need_log_p=meta_mode)

            cpu_action = action.to("cpu").numpy()

            # Update the actions taken
            action_chosen = {
                "actions": action.unsqueeze(1),
            }
            self.batch.update(action_chosen, ts=self.t, mark_filled=False)

            # Step the single env (unless the episode has already terminated)
            if terminated:
                break
            self.parent_conns[0].send(("step", cpu_action[0]))
            post_transition_data = {"reward": [], "terminated": []}
            pre_transition_data = {"state": [], "avail_actions": [], "obs": []}

            data = self.parent_conns[0].recv()
            post_transition_data["reward"].append((data["reward"], ))
            episode_return += data["reward"]
            episode_length += 1
            env_terminated = False
            if data["terminated"] and not data["info"].get(
                    "episode_limit", False):
                env_terminated = True
            terminated = data["terminated"]
            post_transition_data["terminated"].append((env_terminated, ))
            pre_transition_data["state"].append(data["state"])
            pre_transition_data["avail_actions"].append(data["avail_actions"])
            pre_transition_data["obs"].append(data["obs"])
            self.batch.update(post_transition_data,
                              ts=self.t,
                              mark_filled=False)
            self.t += 1
            self.batch.update(pre_transition_data, ts=self.t, mark_filled=True)

        # collect log p for meta policy gradient
        if meta_mode:
            all_log_p = th.cat([it.unsqueeze(1) for it in log_ps[:-1]],
                               dim=1)  # shape: [1, ep_len, n_agents]
            batch_log_p = th.sum(all_log_p, [1, 2]) / all_log_p.size(1)
        if not test_mode:
            self.t_env += self.t

        cur_stats = self.test_stats if test_mode else self.train_stats
        cur_returns = self.test_returns if test_mode else self.train_returns
        log_prefix = "test_" if test_mode else ""
        infos = [cur_stats] + final_env_infos
        cur_stats.update({
            k: sum(d.get(k, 0) for d in infos)
            for k in set.union(*[set(d) for d in infos])
        })
        cur_stats["n_episodes"] = 1 + cur_stats.get("n_episodes", 0)
        cur_stats["ep_length"] = episode_length + cur_stats.get("ep_length", 0)

        cur_returns.append(episode_return)
        self._update_noise_returns([episode_return], self.noise,
                                   final_env_infos, test_mode)
        self.noise_distrib.update_returns(self.batch['state'][:, 0],
                                          self.noise, [episode_return],
                                          test_mode, self.t_env)

        n_test_runs = max(1, self.args.test_nepisode)
        if test_mode and (len(self.test_returns) == n_test_runs):
            self._log(cur_returns, cur_stats, log_prefix)
        elif not test_mode and self.t_env - self.log_train_stats_t >= self.args.runner_log_interval:
            self._log(cur_returns, cur_stats, log_prefix)
            if hasattr(chosen_mac.action_selector, "epsilon"):
                self.logger.log_stat("epsilon",
                                     chosen_mac.action_selector.epsilon,
                                     self.t_env)
            self.log_train_stats_t = self.t_env
        final_reward = ([episode_return / episode_length]
                        if self.args.use_step_reward else episode_return)
        if meta_mode:
            return self.batch, batch_log_p, final_reward
        else:
            return self.batch, final_reward

    def run_meta(self,
                 test_mode=False,
                 meta_mode=False,
                 test_uniform=False,
                 use_rode=False):
        self.reset(test_uniform)

        all_terminated = False
        episode_returns = [0 for _ in range(self.batch_size)]
        episode_lengths = [0 for _ in range(self.batch_size)]
        if self.args.q_net_ensemble:
            chosen_index = random.randint(0, self.args.ensemble_num - 1)
            chosen_mac = self.mac[chosen_index]
        else:
            chosen_mac = self.mac
        chosen_mac.init_hidden(batch_size=self.batch_size)
        if self.args.mac in ("separate_mac", "hierarchical_mac") or self.args.use_roma:
            chosen_mac.init_latent(batch_size=self.batch_size)
        terminated = [False for _ in range(self.batch_size)]
        envs_not_terminated = [
            b_idx for b_idx, termed in enumerate(terminated) if not termed
        ]
        final_env_infos = []  # may store extra stats like battle_won; filled in order of termination
        if meta_mode:
            log_ps = []
        while True:
            # Pass the entire batch of experiences up till now to the agents
            # and receive actions for each agent at this timestep, for each
            # un-terminated env
            if meta_mode:
                actions, logp = chosen_mac.select_actions(
                    self.batch,
                    t_ep=self.t,
                    t_env=self.t_env,
                    bs=envs_not_terminated,
                    test_mode=test_mode,
                    need_log_p=meta_mode)
                log_ps.append(logp)
            else:
                actions = chosen_mac.select_actions(self.batch,
                                                    t_ep=self.t,
                                                    t_env=self.t_env,
                                                    bs=envs_not_terminated,
                                                    test_mode=test_mode,
                                                    need_log_p=meta_mode)
            cpu_actions = actions.to("cpu").numpy()

            # Update the actions taken

            actions_chosen = {"actions": actions.unsqueeze(1)}
            self.batch.update(actions_chosen,
                              bs=envs_not_terminated,
                              ts=self.t,
                              mark_filled=False)

            # Send actions to each env
            action_idx = 0
            for idx, parent_conn in enumerate(self.parent_conns):
                if idx in envs_not_terminated:  # We produced actions for this env
                    if not terminated[idx]:  # Only send if it hasn't terminated
                        parent_conn.send(("step", cpu_actions[action_idx]))
                    action_idx += 1  # cpu_actions only covers envs in the list

            # Update envs_not_terminated
            envs_not_terminated = [
                b_idx for b_idx, termed in enumerate(terminated) if not termed
            ]
            all_terminated = all(terminated)
            if all_terminated:
                break

            # Post step data we will insert for the current timestep
            post_transition_data = {"reward": [], "terminated": []}
            # Data for the next step we will insert in order to select an action
            pre_transition_data = {"state": [], "avail_actions": [], "obs": []}

            # Receive data back for each unterminated env
            for idx, parent_conn in enumerate(self.parent_conns):
                if not terminated[idx]:
                    data = parent_conn.recv()
                    # Remaining data for this current timestep
                    post_transition_data["reward"].append((data["reward"], ))

                    episode_returns[idx] += data["reward"]
                    episode_lengths[idx] += 1
                    if not test_mode:
                        self.env_steps_this_run += 1

                    env_terminated = False
                    if data["terminated"]:
                        final_env_infos.append(data["info"])
                    if data["terminated"] and not data["info"].get(
                            "episode_limit", False):
                        env_terminated = True
                    terminated[idx] = data["terminated"]
                    post_transition_data["terminated"].append(
                        (env_terminated, ))

                    # Data for the next timestep needed to select an action
                    pre_transition_data["state"].append(data["state"])
                    pre_transition_data["avail_actions"].append(
                        data["avail_actions"])
                    pre_transition_data["obs"].append(data["obs"])

            # Add post_transition data into the batch
            self.batch.update(post_transition_data,
                              bs=envs_not_terminated,
                              ts=self.t,
                              mark_filled=False)

            # Move onto the next timestep
            self.t += 1

            # Add the pre-transition data
            self.batch.update(pre_transition_data,
                              bs=envs_not_terminated,
                              ts=self.t,
                              mark_filled=True)

        # collect log p for meta policy gradient
        if meta_mode:
            all_log_p = th.cat([it.unsqueeze(1) for it in log_ps[:-1]],
                               dim=1)  # shape: [batch_size, ep_len, n_agents]
            ind = th.zeros(
                [self.batch_size, max(episode_lengths)],
                device=self.batch.device)
            for i in range(self.batch_size):
                ind[i, :episode_lengths[i]] = 1.0
            batch_log_p = th.sum(all_log_p * ind.unsqueeze(2),
                                 [1, 2]) / th.sum(ind, 1)

        if not test_mode:
            self.t_env += self.t

        # Get stats back for each env
        for parent_conn in self.parent_conns:
            parent_conn.send(("get_stats", None))

        env_stats = []
        for parent_conn in self.parent_conns:
            env_stat = parent_conn.recv()
            env_stats.append(env_stat)

        cur_stats = self.test_stats if test_mode else self.train_stats
        cur_returns = self.test_returns if test_mode else self.train_returns
        log_prefix = "test_" if test_mode else ""
        if test_uniform:
            log_prefix += "uni_"
        infos = [cur_stats] + final_env_infos
        cur_stats.update({
            k: sum(d.get(k, 0) for d in infos)
            for k in set.union(*[set(d) for d in infos])
        })
        cur_stats["n_episodes"] = self.batch_size + cur_stats.get(
            "n_episodes", 0)
        cur_stats["ep_length"] = sum(episode_lengths) + cur_stats.get(
            "ep_length", 0)

        cur_returns.extend(episode_returns)

        self._update_noise_returns(episode_returns, self.noise,
                                   final_env_infos, test_mode)
        self.noise_distrib.update_returns(self.batch['state'][:, 0],
                                          self.noise, episode_returns,
                                          test_mode, self.t_env)

        n_test_runs = max(
            1, self.args.test_nepisode // self.batch_size) * self.batch_size
        if test_mode and (len(self.test_returns) == n_test_runs):
            self._log(cur_returns, cur_stats, log_prefix)
        elif not test_mode and self.t_env - self.log_train_stats_t >= self.args.runner_log_interval:
            self._log(cur_returns, cur_stats, log_prefix)
            if hasattr(chosen_mac.action_selector, "epsilon"):
                self.logger.log_stat("epsilon",
                                     chosen_mac.action_selector.epsilon,
                                     self.t_env)
            self.log_train_stats_t = self.t_env
        final_reward = ([i / j for i, j in zip(episode_returns, episode_lengths)]
                        if self.args.use_step_reward else episode_returns)
        if meta_mode:
            return self.batch, batch_log_p, final_reward
        else:
            return self.batch, final_reward

    def get_log_p(self, buffer):
        if self.args.q_net_ensemble:
            chosen_index = random.randint(0, self.args.ensemble_num - 1)
            chosen_mac = self.mac[chosen_index]
        else:
            chosen_mac = self.mac
        chosen_mac.init_hidden(batch_size=buffer.batch_size)
        if self.args.use_roma:
            chosen_mac.init_latent(buffer.batch_size)
        buffer.to(self.batch.device)
        log_ps = []
        # Float accumulator counting each episode's 'terminated' flags so far
        terminated = th.zeros(buffer.batch_size, device=self.batch.device)
        # Mask over (episode, timestep): 1.0 while the episode is still running
        ind = th.zeros([buffer.batch_size, buffer.max_seq_length],
                       device=self.batch.device)
        max_ep_len = 0
        for i in range(buffer.max_seq_length):
            envs_not_terminated = [
                b_idx for b_idx, termed in enumerate(terminated)
                if termed < 0.01
            ]
            ra = chosen_mac.select_actions(buffer,
                                           t_ep=i,
                                           t_env=self.t_env,
                                           bs=envs_not_terminated,
                                           test_mode=False,
                                           need_log_p=True)
            log_p = ra[-1]
            if isinstance(log_p, tuple):
                log_p = log_p[0] + log_p[1]
            log_ps.append(log_p)
            ind[~(terminated.round().to(th.bool)), i] = 1.0
            terminated += buffer["terminated"][:, i, 0]
            if sum(terminated).round().item() == buffer.batch_size:
                max_ep_len = i + 1
                break
        if max_ep_len == 0:
            raise Exception("Some episodes have no 'terminated' mark.")
        ind = ind[:, :max_ep_len]
        all_log_p = th.cat([it.unsqueeze(1) for it in log_ps],
                           dim=1)  # shape: [batch_size, max_ep_len, n_agents]
        batch_log_p = th.sum(all_log_p * ind.unsqueeze(2), [1, 2]) / th.sum(
            ind, 1)
        return batch_log_p

    def _update_noise_returns(self, returns, noise, stats, test_mode):
        for n, r in zip(noise, returns):
            n = int(np.argmax(n))
            if n in self.noise_returns:
                self.noise_returns[n].append(r)
            else:
                self.noise_returns[n] = [r]
        if test_mode:
            noise_won = self.noise_test_won
        else:
            noise_won = self.noise_train_won

        if stats and "battle_won" in stats[0]:
            for n, info in zip(noise, stats):
                if "battle_won" not in info:
                    continue
                bw = info["battle_won"]
                n = int(np.argmax(n))
                if n in noise_won:
                    noise_won[n].append(bw)
                else:
                    noise_won[n] = [bw]

    def _log_noise_returns(self, test_mode, test_uniform):
        if test_mode:
            max_noise_return = -100000
            for n, rs in self.noise_returns.items():
                n_item = n
                r_mean = float(np.mean(rs))
                max_noise_return = max(r_mean, max_noise_return)
                self.logger.log_stat(
                    "{}_noise_test_ret_u_{:1}".format(n_item, test_uniform),
                    r_mean, self.t_env)
            self.logger.log_stat(
                "max_noise_test_ret_u_{:1}".format(test_uniform),
                max_noise_return, self.t_env)

        noise_won = self.noise_test_won
        prefix = "test"
        if test_uniform:
            prefix += "_uni"
        if not test_mode:
            noise_won = self.noise_train_won
            prefix = "train"
        if len(noise_won.keys()) > 0:
            max_test_won = 0
            for n, rs in noise_won.items():
                n_item = n
                r_mean = float(np.mean(rs))
                max_test_won = max(r_mean, max_test_won)
                self.logger.log_stat("{}_noise_{}_won".format(n_item, prefix),
                                     r_mean, self.t_env)
            self.logger.log_stat("max_noise_{}_won".format(prefix),
                                 max_test_won, self.t_env)
        self.noise_returns = {}
        self.noise_test_won = {}
        self.noise_train_won = {}

    def _log(self, returns, stats, prefix):
        self.logger.log_stat(prefix + "return_mean", np.mean(returns),
                             self.t_env)
        self.logger.log_stat(prefix + "return_std", np.std(returns),
                             self.t_env)
        returns.clear()

        for k, v in stats.items():
            if k != "n_episodes":
                self.logger.log_stat(prefix + k + "_mean",
                                     v / stats["n_episodes"], self.t_env)
        stats.clear()

    def cuda(self):
        if self.args.noise_bandit:
            self.noise_distrib.cuda()
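
A note on the meta-gradient reduction used in `run_meta` and `get_log_p` above: per-step, per-agent log-probabilities are masked so that timesteps after an episode's termination contribute nothing, summed over time and agents, then divided by the episode's true length. A self-contained sketch of that reduction, with illustrative shapes:

import torch as th

batch_size, max_ep_len, n_agents = 4, 6, 3
episode_lengths = [6, 3, 5, 2]
# Stand-in log-probs with the layout built in run_meta: [batch, time, agents]
all_log_p = th.randn(batch_size, max_ep_len, n_agents)

# ind[i, t] = 1.0 while episode i is still running, exactly as in run_meta
ind = th.zeros(batch_size, max_ep_len)
for i, ep_len in enumerate(episode_lengths):
    ind[i, :ep_len] = 1.0

# Masked sum over (time, agents), normalised by each episode's length;
# this is the same expression as run_meta's batch_log_p
batch_log_p = th.sum(all_log_p * ind.unsqueeze(2), [1, 2]) / th.sum(ind, 1)
print(batch_log_p.shape)  # torch.Size([4]) -- one scalar per episode
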