Example #1
    def __init__(self, args):
        if args.seed is not None:
            torch.manual_seed(args.seed)
            np.random.seed(args.seed)
            random.seed(args.seed)

        self.env_name = args.environment
        self.env_setting = get_env_setting(self.env_name)
        self.solved_reward = self.env_setting["solved_reward"]
        self.update_timestep = self.env_setting["update_timestep"]
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        self.env = gym.make(args.environment)
        self.alg = PPO(args, self.device)
        self.log_interval = 5  # print avg reward in the interval
        self.max_episodes = 100000
        self.render = False
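The heart of Example #1 is its reproducibility setup: the same args.seed is fed to Python's random module, NumPy, and PyTorch before any environment or network is built. A minimal, self-contained sketch of that pattern follows; the seed_everything helper is illustrative and is not part of the example above, and the CUDA branch simply mirrors what Example #2 does with torch.cuda.manual_seed_all.

import random

import numpy as np
import torch


def seed_everything(seed: int) -> None:
    # Seed every RNG the training loop relies on (illustrative helper).
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        # Also seed all visible GPUs so CUDA kernels start from the same state.
        torch.cuda.manual_seed_all(seed)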
Example #2
def main():
    args = get_config()

    # seed
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    np.random.seed(args.seed)

    # cuda
    if args.cuda and torch.cuda.is_available():
        device = torch.device("cuda:0")
        torch.set_num_threads(args.n_training_threads)
        if args.cuda_deterministic:
            torch.backends.cudnn.benchmark = False
            torch.backends.cudnn.deterministic = True
    else:
        device = torch.device("cpu")
        torch.set_num_threads(args.n_training_threads)

    # path
    model_dir = Path('./results') / args.env_name / args.algorithm_name
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)

    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    save_dir = run_dir / 'models'
    os.makedirs(str(log_dir))
    os.makedirs(str(save_dir))
    logger = SummaryWriter(str(log_dir))

    # env
    envs = make_parallel_env(args)
    # Policy network
    actor_critic = []
    if args.share_policy:
        ac = Policy(envs.observation_space[0],
                    envs.action_space[0],
                    num_agents=args.num_agents,
                    base_kwargs={
                        'lstm': args.lstm,
                        'naive_recurrent': args.naive_recurrent_policy,
                        'recurrent': args.recurrent_policy,
                        'hidden_size': args.hidden_size
                    })
        ac.to(device)
        for agent_id in range(args.num_agents):
            actor_critic.append(ac)
    else:
        for agent_id in range(args.num_agents):
            ac = Policy(envs.observation_space[0],
                        envs.action_space[0],
                        num_agents=args.num_agents,
                        base_kwargs={
                            'naive_recurrent': args.naive_recurrent_policy,
                            'recurrent': args.recurrent_policy,
                            'hidden_size': args.hidden_size
                        })
            ac.to(device)
            actor_critic.append(ac)

    agents = []
    rollouts = []
    for agent_id in range(args.num_agents):
        # algorithm
        agent = PPO(actor_critic[agent_id],
                    agent_id,
                    args.clip_param,
                    args.ppo_epoch,
                    args.num_mini_batch,
                    args.data_chunk_length,
                    args.value_loss_coef,
                    args.entropy_coef,
                    logger,
                    lr=args.lr,
                    eps=args.eps,
                    max_grad_norm=args.max_grad_norm,
                    use_clipped_value_loss=args.use_clipped_value_loss)
        # replay buffer
        ro = RolloutStorage(args.num_agents, agent_id, args.episode_length,
                            args.n_rollout_threads,
                            envs.observation_space[agent_id],
                            envs.action_space[agent_id],
                            actor_critic[agent_id].recurrent_hidden_state_size)
        agents.append(agent)
        rollouts.append(ro)

    # reset env
    obs = envs.reset()
    # rollout
    for i in range(args.num_agents):
        rollouts[i].share_obs[0].copy_(
            torch.tensor(obs.reshape(args.n_rollout_threads, -1)))
        rollouts[i].obs[0].copy_(torch.tensor(obs[:, i, :]))
        rollouts[i].recurrent_hidden_states.zero_()
        rollouts[i].recurrent_hidden_states_critic.zero_()
        rollouts[i].recurrent_c_states.zero_()
        rollouts[i].recurrent_c_states_critic.zero_()
        rollouts[i].to(device)

    # run
    coop_num = []
    defect_num = []
    coopdefect_num = []
    defectcoop_num = []
    gore1_num = []
    gore2_num = []
    gore3_num = []
    hare1_num = []
    hare2_num = []
    hare3_num = []
    collective_return = []
    apple_consumption = []
    waste_cleared = []
    sustainability = []
    fire = []

    start = time.time()
    episodes = int(
        args.num_env_steps) // args.episode_length // args.n_rollout_threads
    all_episode = 0

    for episode in range(episodes):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            for i in range(args.num_agents):
                update_linear_schedule(agents[i].optimizer, episode, episodes,
                                       args.lr)

        for step in range(args.episode_length):
            # Sample actions
            values = []
            actions = []
            action_log_probs = []
            recurrent_hidden_statess = []
            recurrent_hidden_statess_critic = []
            recurrent_c_statess = []
            recurrent_c_statess_critic = []

            with torch.no_grad():
                for i in range(args.num_agents):
                    value, action, action_log_prob, recurrent_hidden_states, recurrent_hidden_states_critic,\
                        recurrent_c_states, recurrent_c_states_critic =\
                            actor_critic[i].act(rollouts[i].share_obs[step],
                                                        rollouts[i].obs[step],
                                                        rollouts[i].recurrent_hidden_states[step],
                                                        rollouts[i].recurrent_hidden_states_critic[step],
                                                        rollouts[i].recurrent_c_states[step],
                                                        rollouts[i].recurrent_c_states_critic[step],
                                                        rollouts[i].masks[step])
                    values.append(value)
                    actions.append(action)
                    action_log_probs.append(action_log_prob)
                    recurrent_hidden_statess.append(recurrent_hidden_states)
                    recurrent_hidden_statess_critic.append(
                        recurrent_hidden_states_critic)
                    recurrent_c_statess.append(recurrent_c_states)
                    recurrent_c_statess_critic.append(
                        recurrent_c_states_critic)

            # rearrange action
            actions_env = []
            for i in range(args.n_rollout_threads):
                one_hot_action_env = []
                for k in range(args.num_agents):
                    one_hot_action = np.zeros(envs.action_space[0].n)
                    one_hot_action[actions[k][i]] = 1
                    one_hot_action_env.append(one_hot_action)
                actions_env.append(one_hot_action_env)

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(actions_env)

            # If done then clean the history of observations.
            # insert data in buffer
            masks = []
            bad_masks = []
            for i in range(args.num_agents):
                mask = []
                bad_mask = []
                for done_ in done:
                    if done_[i]:
                        mask.append([0.0])
                        bad_mask.append([1.0])
                    else:
                        mask.append([1.0])
                        bad_mask.append([1.0])
                masks.append(torch.FloatTensor(mask))
                bad_masks.append(torch.FloatTensor(bad_mask))

            for i in range(args.num_agents):
                rollouts[i].insert(
                    torch.tensor(obs.reshape(args.n_rollout_threads, -1)),
                    torch.tensor(obs[:, i, :]), recurrent_hidden_statess[i],
                    recurrent_hidden_statess_critic[i], recurrent_c_statess[i],
                    recurrent_c_statess_critic[i], actions[i],
                    action_log_probs[i], values[i],
                    torch.tensor(reward[:, i].reshape(-1, 1)),
                    masks[i], bad_masks[i])

        with torch.no_grad():
            next_values = []
            for i in range(args.num_agents):
                next_value = actor_critic[i].get_value(
                    rollouts[i].share_obs[-1], rollouts[i].obs[-1],
                    rollouts[i].recurrent_hidden_states[-1],
                    rollouts[i].recurrent_hidden_states_critic[-1],
                    rollouts[i].recurrent_c_states[-1],
                    rollouts[i].recurrent_c_states_critic[-1],
                    rollouts[i].masks[-1]).detach()
                next_values.append(next_value)

        for i in range(args.num_agents):
            rollouts[i].compute_returns(next_values[i], args.use_gae,
                                        args.gamma, args.gae_lambda,
                                        args.use_proper_time_limits)

        # update the network
        value_losses = []
        action_losses = []
        dist_entropies = []
        for i in range(args.num_agents):
            value_loss, action_loss, dist_entropy = agents[i].update(
                rollouts[i])
            value_losses.append(value_loss)
            action_losses.append(action_loss)
            dist_entropies.append(dist_entropy)

        if args.env_name == "StagHunt":
            for info in infos:
                if 'coop&coop_num' in info.keys():
                    coop_num.append(info['coop&coop_num'])
                if 'defect&defect_num' in info.keys():
                    defect_num.append(info['defect&defect_num'])
                if 'coop&defect_num' in info.keys():
                    coopdefect_num.append(info['coop&defect_num'])
                if 'defect&coop_num' in info.keys():
                    defectcoop_num.append(info['defect&coop_num'])

            for i in range(args.n_rollout_threads):
                logger.add_scalars(
                    'coop&coop_num_per_episode',
                    {'coop&coop_num_per_episode': coop_num[all_episode]},
                    all_episode)
                logger.add_scalars(
                    'defect&defect_num_per_episode',
                    {'defect&defect_num_per_episode': defect_num[all_episode]},
                    all_episode)
                logger.add_scalars('coop&defect_num_per_episode', {
                    'coop&defect_num_per_episode':
                    coopdefect_num[all_episode]
                }, all_episode)
                logger.add_scalars('defect&coop_num_per_episode', {
                    'defect&coop_num_per_episode':
                    defectcoop_num[all_episode]
                }, all_episode)
                all_episode += 1
        elif args.env_name == "StagHuntGW":
            for info in infos:
                if 'collective_return' in info.keys():
                    collective_return.append(info['collective_return'])
                if 'coop&coop_num' in info.keys():
                    coop_num.append(info['coop&coop_num'])
                if 'gore1_num' in info.keys():
                    gore1_num.append(info['gore1_num'])
                if 'gore2_num' in info.keys():
                    gore2_num.append(info['gore2_num'])
                if 'hare1_num' in info.keys():
                    hare1_num.append(info['hare1_num'])
                if 'hare2_num' in info.keys():
                    hare2_num.append(info['hare2_num'])
            for i in range(args.n_rollout_threads):
                logger.add_scalars(
                    'collective_return',
                    {'collective_return': collective_return[all_episode]},
                    all_episode)
                logger.add_scalars(
                    'coop&coop_num_per_episode',
                    {'coop&coop_num_per_episode': coop_num[all_episode]},
                    all_episode)
                logger.add_scalars(
                    'gore1_num_per_episode',
                    {'gore1_num_per_episode': gore1_num[all_episode]},
                    all_episode)
                logger.add_scalars(
                    'gore2_num_per_episode',
                    {'gore2_num_per_episode': gore2_num[all_episode]},
                    all_episode)
                logger.add_scalars(
                    'hare1_num_per_episode',
                    {'hare1_num_per_episode': hare1_num[all_episode]},
                    all_episode)
                logger.add_scalars(
                    'hare2_num_per_episode',
                    {'hare2_num_per_episode': hare2_num[all_episode]},
                    all_episode)
                all_episode += 1
        elif args.env_name == "EscalationGW":
            for info in infos:
                if 'collective_return' in info.keys():
                    collective_return.append(info['collective_return'])
                if 'coop&coop_num' in info.keys():
                    coop_num.append(info['coop&coop_num'])
            for i in range(args.n_rollout_threads):
                logger.add_scalars(
                    'collective_return',
                    {'collective_return': collective_return[all_episode]},
                    all_episode)
                logger.add_scalars(
                    'coop&coop_num_per_episode',
                    {'coop&coop_num_per_episode': coop_num[all_episode]},
                    all_episode)
                all_episode += 1
        elif args.env_name == "multi_StagHuntGW":
            for info in infos:
                if 'collective_return' in info.keys():
                    collective_return.append(info['collective_return'])
                if 'coop&coop_num' in info.keys():
                    coop_num.append(info['coop&coop_num'])
                if 'gore0_num' in info.keys():
                    gore1_num.append(info['gore0_num'])
                if 'gore1_num' in info.keys():
                    gore2_num.append(info['gore1_num'])
                if 'gore2_num' in info.keys():
                    gore3_num.append(info['gore2_num'])
                if 'hare0_num' in info.keys():
                    hare1_num.append(info['hare0_num'])
                if 'hare1_num' in info.keys():
                    hare2_num.append(info['hare1_num'])
                if 'hare2_num' in info.keys():
                    hare3_num.append(info['hare2_num'])
            for i in range(args.n_rollout_threads):
                logger.add_scalars(
                    'collective_return',
                    {'collective_return': collective_return[all_episode]},
                    all_episode)
                logger.add_scalars(
                    'coop&coop_num_per_episode',
                    {'coop&coop_num_per_episode': coop_num[all_episode]},
                    all_episode)
                logger.add_scalars(
                    'gore1_num_per_episode',
                    {'gore1_num_per_episode': gore1_num[all_episode]},
                    all_episode)
                logger.add_scalars(
                    'gore2_num_per_episode',
                    {'gore2_num_per_episode': gore2_num[all_episode]},
                    all_episode)
                logger.add_scalars(
                    'gore3_num_per_episode',
                    {'gore3_num_per_episode': gore3_num[all_episode]},
                    all_episode)
                logger.add_scalars(
                    'hare1_num_per_episode',
                    {'hare1_num_per_episode': hare1_num[all_episode]},
                    all_episode)
                logger.add_scalars(
                    'hare2_num_per_episode',
                    {'hare2_num_per_episode': hare2_num[all_episode]},
                    all_episode)
                logger.add_scalars(
                    'hare3_num_per_episode',
                    {'hare3_num_per_episode': hare3_num[all_episode]},
                    all_episode)
                all_episode += 1

        # clean the buffer and reset
        obs = envs.reset()
        for i in range(args.num_agents):
            rollouts[i].share_obs[0].copy_(
                torch.tensor(obs.reshape(args.n_rollout_threads, -1)))
            rollouts[i].obs[0].copy_(torch.tensor(obs[:, i, :]))
            rollouts[i].recurrent_hidden_states.zero_()
            rollouts[i].recurrent_hidden_states_critic.zero_()
            rollouts[i].recurrent_c_states.zero_()
            rollouts[i].recurrent_c_states_critic.zero_()
            rollouts[i].masks[0].copy_(torch.ones(args.n_rollout_threads, 1))
            rollouts[i].bad_masks[0].copy_(
                torch.ones(args.n_rollout_threads, 1))
            rollouts[i].to(device)

        for i in range(args.num_agents):
            # save for every interval-th episode or for the last episode
            if (episode % args.save_interval == 0 or episode == episodes - 1):
                torch.save({'model': actor_critic[i]},
                           str(save_dir) + "/agent%i_model" % i + ".pt")

        # log information
        if episode % args.log_interval == 0:
            total_num_steps = (
                episode + 1) * args.episode_length * args.n_rollout_threads
            end = time.time()
            print(
                "\n Updates {}/{} episodes, total num timesteps {}/{}, FPS {}.\n"
                .format(episode, episodes, total_num_steps, args.num_env_steps,
                        int(total_num_steps / (end - start))))
            for i in range(args.num_agents):
                print("value loss of agent%i: " % i + str(value_losses[i]))
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()

    ###----------------------------------------------------------###
    ###----------------------------------------------------------###
    ###----------------------------------------------------------###
    if args.eval:
        eval_dir = run_dir / 'eval'
        log_dir = eval_dir / 'logs'
        os.makedirs(str(log_dir))
        logger = SummaryWriter(str(log_dir))

        # eval best policy
        eval_rewards = []
        # env
        if args.env_name == "StagHunt":
            assert args.num_agents == 2, (
                "only 2 agents are supported; check config.py.")
            env = MGEnv(args)
        elif args.env_name == "StagHuntGW" or args.env_name == "EscalationGW":
            assert args.num_agents == 2, (
                "only 2 agents are supported in single navigation; check config.py."
            )
            env = GridWorldEnv(args)
        elif args.env_name == "multi_StagHuntGW":
            env = multi_GridWorldEnv(args)
        else:
            print("Can not support the " + args.env_name + "environment.")
            raise NotImplementedError

        # episode statistics
        coop_num = []
        defect_num = []
        coopdefect_num = []
        defectcoop_num = []
        gore1_num = []
        gore2_num = []
        gore3_num = []
        hare1_num = []
        hare2_num = []
        hare3_num = []
        collective_return = []
        apple_consumption = []
        waste_cleared = []
        sustainability = []
        fire = []

        for episode in range(args.eval_episodes):
            print("Episode %i of %i" % (episode, args.eval_episodes))
            state = env.reset()
            state = np.array([state])

            share_obs = []
            obs = []
            recurrent_hidden_statess = []
            recurrent_hidden_statess_critic = []
            recurrent_c_statess = []
            recurrent_c_statess_critic = []
            masks = []
            policy_reward = 0

            # rollout
            for i in range(args.num_agents):
                share_obs.append(
                    (torch.tensor(state.reshape(1, -1),
                                  dtype=torch.float32)).to(device))
                obs.append((torch.tensor(state[:, i, :],
                                         dtype=torch.float32)).to(device))
                recurrent_hidden_statess.append(
                    torch.zeros(
                        1, actor_critic[i].recurrent_hidden_state_size).to(
                            device))
                recurrent_hidden_statess_critic.append(
                    torch.zeros(
                        1, actor_critic[i].recurrent_hidden_state_size).to(
                            device))
                recurrent_c_statess.append(
                    torch.zeros(
                        1, actor_critic[i].recurrent_hidden_state_size).to(
                            device))
                recurrent_c_statess_critic.append(
                    torch.zeros(
                        1, actor_critic[i].recurrent_hidden_state_size).to(
                            device))
                masks.append(torch.ones(1, 1).to(device))

            for step in range(args.episode_length):
                print("step %i of %i" % (step, args.episode_length))
                # Sample actions
                one_hot_actions = []
                for i in range(args.num_agents):
                    one_hot_action = np.zeros(env.action_space[0].n)
                    with torch.no_grad():
                        value, action, action_log_prob, recurrent_hidden_states, recurrent_hidden_states_critic, recurrent_c_states, recurrent_c_states_critic = actor_critic[
                            i].act(share_obs[i], obs[i],
                                   recurrent_hidden_statess[i],
                                   recurrent_hidden_statess_critic[i],
                                   recurrent_c_statess[i],
                                   recurrent_c_statess_critic[i], masks[i])
                    recurrent_hidden_statess[i].copy_(recurrent_hidden_states)
                    recurrent_hidden_statess_critic[i].copy_(
                        recurrent_hidden_states_critic)
                    recurrent_c_statess[i].copy_(recurrent_c_states)
                    recurrent_c_statess_critic[i].copy_(
                        recurrent_c_states_critic)
                    one_hot_action[action] = 1
                    one_hot_actions.append(one_hot_action)

                # Observe reward and next obs
                state, reward, done, infos = env.step(one_hot_actions)

                for i in range(args.num_agents):
                    print("Reward of agent%i: " % i + str(reward[i]))
                    policy_reward += reward[i]

                if all(done):
                    break

                state = np.array([state])

                for i in range(args.num_agents):
                    if len(env.observation_space[0]) == 1:
                        share_obs[i].copy_(
                            torch.tensor(state.reshape(1, -1),
                                         dtype=torch.float32))
                        obs[i].copy_(
                            torch.tensor(state[:, i, :], dtype=torch.float32))
                    elif len(env.observation_space[0]) == 3:
                        share_obs[i].copy_(
                            torch.tensor(state.reshape(
                                1, -1, env.observation_space[0][1],
                                env.observation_space[0][2]),
                                         dtype=torch.float32))
                        obs[i].copy_(
                            torch.tensor(state[:, i, :, :, :],
                                         dtype=torch.float32))

            eval_rewards.append(policy_reward)

            if args.env_name == "StagHunt":
                if 'coop&coop_num' in infos.keys():
                    coop_num.append(infos['coop&coop_num'])
                if 'defect&defect_num' in infos.keys():
                    defect_num.append(infos['defect&defect_num'])
                if 'coop&defect_num' in infos.keys():
                    coopdefect_num.append(infos['coop&defect_num'])
                if 'defect&coop_num' in infos.keys():
                    defectcoop_num.append(infos['defect&coop_num'])

                logger.add_scalars(
                    'coop&coop_num_per_episode',
                    {'coop&coop_num_per_episode': coop_num[episode]}, episode)
                logger.add_scalars(
                    'defect&defect_num_per_episode',
                    {'defect&defect_num_per_episode': defect_num[episode]},
                    episode)
                logger.add_scalars(
                    'coop&defect_num_per_episode',
                    {'coop&defect_num_per_episode': coopdefect_num[episode]},
                    episode)
                logger.add_scalars(
                    'defect&coop_num_per_episode',
                    {'defect&coop_num_per_episode': defectcoop_num[episode]},
                    episode)

            elif args.env_name == "StagHuntGW":
                if 'collective_return' in infos.keys():
                    collective_return.append(infos['collective_return'])
                    logger.add_scalars(
                        'collective_return',
                        {'collective_return': collective_return[episode]},
                        episode)
                if 'coop&coop_num' in infos.keys():
                    coop_num.append(infos['coop&coop_num'])
                    logger.add_scalars(
                        'coop&coop_num_per_episode',
                        {'coop&coop_num_per_episode': coop_num[episode]},
                        episode)
                if 'gore1_num' in infos.keys():
                    gore1_num.append(infos['gore1_num'])
                    logger.add_scalars(
                        'gore1_num_per_episode',
                        {'gore1_num_per_episode': gore1_num[episode]}, episode)
                if 'gore2_num' in infos.keys():
                    gore2_num.append(infos['gore2_num'])
                    logger.add_scalars(
                        'gore2_num_per_episode',
                        {'gore2_num_per_episode': gore2_num[episode]}, episode)
                if 'hare1_num' in infos.keys():
                    hare1_num.append(infos['hare1_num'])
                    logger.add_scalars(
                        'hare1_num_per_episode',
                        {'hare1_num_per_episode': hare1_num[episode]}, episode)
                if 'hare2_num' in infos.keys():
                    hare2_num.append(infos['hare2_num'])
                    logger.add_scalars(
                        'hare2_num_per_episode',
                        {'hare2_num_per_episode': hare2_num[episode]}, episode)
            elif args.env_name == "EscalationGW":
                if 'collective_return' in infos.keys():
                    collective_return.append(infos['collective_return'])
                    logger.add_scalars(
                        'collective_return',
                        {'collective_return': collective_return[episode]},
                        episode)
                if 'coop&coop_num' in infos.keys():
                    coop_num.append(infos['coop&coop_num'])
                    logger.add_scalars(
                        'coop&coop_num_per_episode',
                        {'coop&coop_num_per_episode': coop_num[episode]},
                        episode)
            elif args.env_name == "multi_StagHuntGW":
                if 'collective_return' in infos.keys():
                    collective_return.append(infos['collective_return'])
                    logger.add_scalars(
                        'collective_return',
                        {'collective_return': collective_return[episode]},
                        episode)
                if 'coop&coop_num' in infos.keys():
                    coop_num.append(infos['coop&coop_num'])
                    logger.add_scalars(
                        'coop&coop_num_per_episode',
                        {'coop&coop_num_per_episode': coop_num[episode]},
                        episode)
                if 'gore0_num' in infos.keys():
                    gore1_num.append(infos['gore0_num'])
                    logger.add_scalars(
                        'gore1_num_per_episode',
                        {'gore1_num_per_episode': gore1_num[episode]}, episode)
                if 'gore1_num' in infos.keys():
                    gore2_num.append(infos['gore1_num'])
                    logger.add_scalars(
                        'gore2_num_per_episode',
                        {'gore2_num_per_episode': gore2_num[episode]}, episode)
                if 'gore2_num' in infos.keys():
                    gore3_num.append(infos['gore2_num'])
                    logger.add_scalars(
                        'gore3_num_per_episode',
                        {'gore3_num_per_episode': gore3_num[episode]}, episode)
                if 'hare0_num' in infos.keys():
                    hare1_num.append(infos['hare0_num'])
                    logger.add_scalars(
                        'hare1_num_per_episode',
                        {'hare1_num_per_episode': hare1_num[episode]}, episode)
                if 'hare1_num' in infos.keys():
                    hare2_num.append(infos['hare1_num'])
                    logger.add_scalars(
                        'hare2_num_per_episode',
                        {'hare2_num_per_episode': hare2_num[episode]}, episode)
                if 'hare2_num' in infos.keys():
                    hare3_num.append(infos['hare2_num'])
                    logger.add_scalars(
                        'hare3_num_per_episode',
                        {'hare3_num_per_episode': hare3_num[episode]}, episode)
        logger.export_scalars_to_json(str(log_dir / 'summary.json'))
        logger.close()
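Example #2 decays the learning rate with update_linear_schedule, a helper the snippet itself does not define. A typical implementation, assuming the common pytorch-a2c-ppo-acktr-style variant rather than this repository's exact code, is sketched below.

def update_linear_schedule(optimizer, epoch, total_num_epochs, initial_lr):
    # Linearly decay the learning rate from initial_lr toward zero (sketch).
    lr = initial_lr - (initial_lr * (epoch / float(total_num_epochs)))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr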
Example #3
def main():
    args = get_config()

    # seed
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    np.random.seed(args.seed)

    # cuda
    if args.cuda and torch.cuda.is_available():
        device = torch.device("cuda:0")
        torch.set_num_threads(args.n_training_threads)
        if args.cuda_deterministic:
            torch.backends.cudnn.benchmark = False
            torch.backends.cudnn.deterministic = True
    else:
        device = torch.device("cpu")
        torch.set_num_threads(args.n_training_threads)

    # path
    model_dir = Path(
        './results') / args.env_name / args.scenario_name / args.algorithm_name
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)

    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    save_dir = run_dir / 'models'
    os.makedirs(str(log_dir))
    os.makedirs(str(save_dir))
    logger = SummaryWriter(str(log_dir))

    # env
    envs = make_parallel_env(args)
    if args.eval:
        eval_env = make_eval_env(args)

    num_agents = args.num_agents
    all_action_space = []
    all_obs_space = []
    action_movement_dim = []
    if args.env_name == "BlueprintConstruction":
        order_obs = [
            'agent_qpos_qvel', 'box_obs', 'ramp_obs', 'construction_site_obs',
            'observation_self'
        ]
        mask_order_obs = [None, None, None, None, None]
    elif args.env_name == "BoxLocking":
        order_obs = [
            'agent_qpos_qvel', 'box_obs', 'ramp_obs', 'observation_self'
        ]
        mask_order_obs = ['mask_aa_obs', 'mask_ab_obs', 'mask_ar_obs', None]
    else:
        print("Can not support the " + args.env_name + "environment.")
        raise NotImplementedError

    for agent_id in range(num_agents):
        # deal with dict action space
        action_movement = envs.action_space['action_movement'][agent_id].nvec
        action_movement_dim.append(len(action_movement))
        action_glueall = envs.action_space['action_glueall'][agent_id].n
        action_vec = np.append(action_movement, action_glueall)
        if 'action_pull' in envs.action_space.spaces.keys():
            action_pull = envs.action_space['action_pull'][agent_id].n
            action_vec = np.append(action_vec, action_pull)
        action_space = MultiDiscrete([[0, vec - 1] for vec in action_vec])
        all_action_space.append(action_space)
        # deal with dict obs space
        obs_space = []
        obs_dim = 0
        for key in order_obs:
            if key in envs.observation_space.spaces.keys():
                space = list(envs.observation_space[key].shape)
                if len(space) < 2:
                    space.insert(0, 1)
                obs_space.append(space)
                obs_dim += reduce(lambda x, y: x * y, space)
        obs_space.insert(0, obs_dim)
        all_obs_space.append(obs_space)

    if args.share_policy:
        actor_critic = Policy(all_obs_space[0],
                              all_action_space[0],
                              num_agents=num_agents,
                              gain=args.gain,
                              base_kwargs={
                                  'naive_recurrent':
                                  args.naive_recurrent_policy,
                                  'recurrent': args.recurrent_policy,
                                  'hidden_size': args.hidden_size,
                                  'recurrent_N': args.recurrent_N,
                                  'attn': args.attn,
                                  'attn_only_critic': args.attn_only_critic,
                                  'attn_size': args.attn_size,
                                  'attn_N': args.attn_N,
                                  'attn_heads': args.attn_heads,
                                  'dropout': args.dropout,
                                  'use_average_pool': args.use_average_pool,
                                  'use_common_layer': args.use_common_layer,
                                  'use_feature_normlization':
                                  args.use_feature_normlization,
                                  'use_feature_popart':
                                  args.use_feature_popart,
                                  'use_orthogonal': args.use_orthogonal,
                                  'layer_N': args.layer_N,
                                  'use_ReLU': args.use_ReLU,
                                  'use_same_dim': True
                              },
                              device=device)
        actor_critic.to(device)
        # algorithm
        agents = PPO(actor_critic,
                     args.clip_param,
                     args.ppo_epoch,
                     args.num_mini_batch,
                     args.data_chunk_length,
                     args.value_loss_coef,
                     args.entropy_coef,
                     logger,
                     lr=args.lr,
                     eps=args.eps,
                     weight_decay=args.weight_decay,
                     max_grad_norm=args.max_grad_norm,
                     use_max_grad_norm=args.use_max_grad_norm,
                     use_clipped_value_loss=args.use_clipped_value_loss,
                     use_common_layer=args.use_common_layer,
                     use_huber_loss=args.use_huber_loss,
                     huber_delta=args.huber_delta,
                     use_popart=args.use_popart,
                     use_value_high_masks=args.use_value_high_masks,
                     device=device)

        # replay buffer
        rollouts = RolloutStorage(num_agents,
                                  args.episode_length,
                                  args.n_rollout_threads,
                                  all_obs_space[0],
                                  all_action_space[0],
                                  args.hidden_size,
                                  use_same_dim=True)
    else:
        actor_critic = []
        agents = []
        for agent_id in range(num_agents):
            ac = Policy(all_obs_space[0],
                        all_action_space[0],
                        num_agents=num_agents,
                        gain=args.gain,
                        base_kwargs={
                            'naive_recurrent': args.naive_recurrent_policy,
                            'recurrent': args.recurrent_policy,
                            'hidden_size': args.hidden_size,
                            'recurrent_N': args.recurrent_N,
                            'attn': args.attn,
                            'attn_only_critic': args.attn_only_critic,
                            'attn_size': args.attn_size,
                            'attn_N': args.attn_N,
                            'attn_heads': args.attn_heads,
                            'dropout': args.dropout,
                            'use_average_pool': args.use_average_pool,
                            'use_common_layer': args.use_common_layer,
                            'use_feature_normlization':
                            args.use_feature_normlization,
                            'use_feature_popart': args.use_feature_popart,
                            'use_orthogonal': args.use_orthogonal,
                            'layer_N': args.layer_N,
                            'use_ReLU': args.use_ReLU,
                            'use_same_dim': True
                        },
                        device=device)
            ac.to(device)
            # algorithm
            agent = PPO(ac,
                        args.clip_param,
                        args.ppo_epoch,
                        args.num_mini_batch,
                        args.data_chunk_length,
                        args.value_loss_coef,
                        args.entropy_coef,
                        logger,
                        lr=args.lr,
                        eps=args.eps,
                        weight_decay=args.weight_decay,
                        max_grad_norm=args.max_grad_norm,
                        use_max_grad_norm=args.use_max_grad_norm,
                        use_clipped_value_loss=args.use_clipped_value_loss,
                        use_common_layer=args.use_common_layer,
                        use_huber_loss=args.use_huber_loss,
                        huber_delta=args.huber_delta,
                        use_popart=args.use_popart,
                        use_value_high_masks=args.use_value_high_masks,
                        device=device)

            actor_critic.append(ac)
            agents.append(agent)

        # replay buffer
        rollouts = RolloutStorage(num_agents,
                                  args.episode_length,
                                  args.n_rollout_threads,
                                  all_obs_space[0],
                                  all_action_space[0],
                                  args.hidden_size,
                                  use_same_dim=True)

    # reset env
    dict_obs = envs.reset()
    obs = []
    share_obs = []
    for d_o in dict_obs:
        for i, key in enumerate(order_obs):
            if key in envs.observation_space.spaces.keys():
                if mask_order_obs[i] is None:
                    temp_share_obs = d_o[key].reshape(num_agents, -1).copy()
                    temp_obs = temp_share_obs.copy()
                else:
                    temp_share_obs = d_o[key].reshape(num_agents, -1).copy()
                    temp_mask = d_o[mask_order_obs[i]].copy()
                    temp_obs = d_o[key].copy()
                    mins_temp_mask = ~temp_mask
                    temp_obs[mins_temp_mask] = np.zeros(
                        (mins_temp_mask.sum(), temp_obs.shape[2]))
                    temp_obs = temp_obs.reshape(num_agents, -1)
                if i == 0:
                    reshape_obs = temp_obs.copy()
                    reshape_share_obs = temp_share_obs.copy()
                else:
                    reshape_obs = np.concatenate((reshape_obs, temp_obs),
                                                 axis=1)
                    reshape_share_obs = np.concatenate(
                        (reshape_share_obs, temp_share_obs), axis=1)
        obs.append(reshape_obs)
        share_obs.append(reshape_share_obs)
    obs = np.array(obs)
    share_obs = np.array(share_obs)

    # replay buffer
    rollouts.share_obs[0] = share_obs.copy()
    rollouts.obs[0] = obs.copy()
    rollouts.recurrent_hidden_states = np.zeros(
        rollouts.recurrent_hidden_states.shape).astype(np.float32)
    rollouts.recurrent_hidden_states_critic = np.zeros(
        rollouts.recurrent_hidden_states_critic.shape).astype(np.float32)

    # run
    start = time.time()
    episodes = int(
        args.num_env_steps) // args.episode_length // args.n_rollout_threads
    timesteps = 0

    for episode in range(episodes):
        if args.use_linear_lr_decay:  # decrease learning rate linearly
            if args.share_policy:
                update_linear_schedule(agents.optimizer, episode, episodes,
                                       args.lr)
            else:
                for agent_id in range(num_agents):
                    update_linear_schedule(agents[agent_id].optimizer, episode,
                                           episodes, args.lr)
        # info list
        discard_episode = 0
        success = 0
        trials = 0

        for step in range(args.episode_length):
            # Sample actions
            values = []
            actions = []
            action_log_probs = []
            recurrent_hidden_statess = []
            recurrent_hidden_statess_critic = []
            with torch.no_grad():
                for agent_id in range(num_agents):
                    if args.share_policy:
                        actor_critic.eval()
                        value, action, action_log_prob, recurrent_hidden_states, recurrent_hidden_states_critic = actor_critic.act(
                            agent_id,
                            torch.tensor(rollouts.share_obs[step, :,
                                                            agent_id]),
                            torch.tensor(rollouts.obs[step, :, agent_id]),
                            torch.tensor(
                                rollouts.recurrent_hidden_states[step, :,
                                                                 agent_id]),
                            torch.tensor(
                                rollouts.recurrent_hidden_states_critic[
                                    step, :, agent_id]),
                            torch.tensor(rollouts.masks[step, :, agent_id]))
                    else:
                        actor_critic[agent_id].eval()
                        value, action, action_log_prob, recurrent_hidden_states, recurrent_hidden_states_critic = actor_critic[
                            agent_id].act(
                                agent_id,
                                torch.tensor(rollouts.share_obs[step, :,
                                                                agent_id]),
                                torch.tensor(rollouts.obs[step, :, agent_id]),
                                torch.tensor(rollouts.recurrent_hidden_states[
                                    step, :, agent_id]),
                                torch.tensor(
                                    rollouts.recurrent_hidden_states_critic[
                                        step, :, agent_id]),
                                torch.tensor(rollouts.masks[step, :,
                                                            agent_id]))

                    values.append(value.detach().cpu().numpy())
                    actions.append(action.detach().cpu().numpy())
                    action_log_probs.append(
                        action_log_prob.detach().cpu().numpy())
                    recurrent_hidden_statess.append(
                        recurrent_hidden_states.detach().cpu().numpy())
                    recurrent_hidden_statess_critic.append(
                        recurrent_hidden_states_critic.detach().cpu().numpy())

            # rearrange action
            actions_env = []
            for n_rollout_thread in range(args.n_rollout_threads):
                action_movement = []
                action_pull = []
                action_glueall = []
                for agent_id in range(num_agents):
                    action_movement.append(actions[agent_id][n_rollout_thread]
                                           [:action_movement_dim[agent_id]])
                    action_glueall.append(
                        int(actions[agent_id][n_rollout_thread][
                            action_movement_dim[agent_id]]))
                    if 'action_pull' in envs.action_space.spaces.keys():
                        action_pull.append(
                            int(actions[agent_id][n_rollout_thread][-1]))
                action_movement = np.stack(action_movement, axis=0)
                action_glueall = np.stack(action_glueall, axis=0)
                if 'action_pull' in envs.action_space.spaces.keys():
                    action_pull = np.stack(action_pull, axis=0)
                one_env_action = {
                    'action_movement': action_movement,
                    'action_pull': action_pull,
                    'action_glueall': action_glueall
                }
                actions_env.append(one_env_action)

            # Observe reward and next obs
            dict_obs, rewards, dones, infos = envs.step(actions_env)
            if len(rewards.shape) < 3:
                rewards = rewards[:, :, np.newaxis]

            # If done then clean the history of observations.
            # insert data in buffer
            masks = []
            for i, done in enumerate(dones):
                if done:
                    if "discard_episode" in infos[i].keys():
                        if infos[i]['discard_episode']:
                            discard_episode += 1
                        else:
                            trials += 1
                    else:
                        trials += 1
                    if "success" in infos[i].keys():
                        if infos[i]['success']:
                            success += 1
                mask = []
                for agent_id in range(num_agents):
                    if done:
                        recurrent_hidden_statess[agent_id][i] = np.zeros(
                            args.hidden_size).astype(np.float32)
                        recurrent_hidden_statess_critic[agent_id][
                            i] = np.zeros(args.hidden_size).astype(np.float32)
                        mask.append([0.0])
                    else:
                        mask.append([1.0])
                masks.append(mask)

            obs = []
            share_obs = []
            for d_o in dict_obs:
                for i, key in enumerate(order_obs):
                    if key in envs.observation_space.spaces.keys():
                        if mask_order_obs[i] is None:
                            temp_share_obs = d_o[key].reshape(num_agents,
                                                              -1).copy()
                            temp_obs = temp_share_obs.copy()
                        else:
                            temp_share_obs = d_o[key].reshape(num_agents,
                                                              -1).copy()
                            temp_mask = d_o[mask_order_obs[i]].copy()
                            temp_obs = d_o[key].copy()
                            mins_temp_mask = ~temp_mask
                            temp_obs[mins_temp_mask] = np.zeros(
                                (mins_temp_mask.sum(), temp_obs.shape[2]))
                            temp_obs = temp_obs.reshape(num_agents, -1)
                        if i == 0:
                            reshape_obs = temp_obs.copy()
                            reshape_share_obs = temp_share_obs.copy()
                        else:
                            reshape_obs = np.concatenate(
                                (reshape_obs, temp_obs), axis=1)
                            reshape_share_obs = np.concatenate(
                                (reshape_share_obs, temp_share_obs), axis=1)
                obs.append(reshape_obs)
                share_obs.append(reshape_share_obs)
            obs = np.array(obs)
            share_obs = np.array(share_obs)

            rollouts.insert(
                share_obs, obs,
                np.array(recurrent_hidden_statess).transpose(1, 0, 2),
                np.array(recurrent_hidden_statess_critic).transpose(1, 0, 2),
                np.array(actions).transpose(1, 0, 2),
                np.array(action_log_probs).transpose(1, 0, 2),
                np.array(values).transpose(1, 0, 2), rewards, masks)

        with torch.no_grad():
            for agent_id in range(num_agents):
                if args.share_policy:
                    actor_critic.eval()
                    next_value, _, _ = actor_critic.get_value(
                        agent_id,
                        torch.tensor(rollouts.share_obs[-1, :, agent_id]),
                        torch.tensor(rollouts.obs[-1, :, agent_id]),
                        torch.tensor(
                            rollouts.recurrent_hidden_states[-1, :, agent_id]),
                        torch.tensor(
                            rollouts.recurrent_hidden_states_critic[-1, :,
                                                                    agent_id]),
                        torch.tensor(rollouts.masks[-1, :, agent_id]))
                    next_value = next_value.detach().cpu().numpy()
                    rollouts.compute_returns(agent_id, next_value,
                                             args.use_gae, args.gamma,
                                             args.gae_lambda,
                                             args.use_proper_time_limits,
                                             args.use_popart,
                                             agents.value_normalizer)
                else:
                    actor_critic[agent_id].eval()
                    next_value, _, _ = actor_critic[agent_id].get_value(
                        agent_id,
                        torch.tensor(rollouts.share_obs[-1, :, agent_id]),
                        torch.tensor(rollouts.obs[-1, :, agent_id]),
                        torch.tensor(
                            rollouts.recurrent_hidden_states[-1, :, agent_id]),
                        torch.tensor(
                            rollouts.recurrent_hidden_states_critic[-1, :,
                                                                    agent_id]),
                        torch.tensor(rollouts.masks[-1, :, agent_id]))
                    next_value = next_value.detach().cpu().numpy()
                    rollouts.compute_returns(agent_id, next_value,
                                             args.use_gae, args.gamma,
                                             args.gae_lambda,
                                             args.use_proper_time_limits,
                                             args.use_popart,
                                             agents[agent_id].value_normalizer)

        # update the network
        if args.share_policy:
            actor_critic.train()
            value_loss, action_loss, dist_entropy = agents.update_share(
                num_agents, rollouts)

            logger.add_scalars('reward', {'reward': np.mean(rollouts.rewards)},
                               (episode + 1) * args.episode_length *
                               args.n_rollout_threads)
        else:
            value_losses = []
            action_losses = []
            dist_entropies = []

            for agent_id in range(num_agents):
                actor_critic[agent_id].train()
                value_loss, action_loss, dist_entropy = agents[
                    agent_id].update(agent_id, rollouts)
                value_losses.append(value_loss)
                action_losses.append(action_loss)
                dist_entropies.append(dist_entropy)

                logger.add_scalars(
                    'agent%i/reward' % agent_id,
                    {'reward': np.mean(rollouts.rewards[:, :, agent_id])},
                    (episode + 1) * args.episode_length *
                    args.n_rollout_threads)

        # clean the buffer and reset
        rollouts.after_update()

        total_num_steps = (episode +
                           1) * args.episode_length * args.n_rollout_threads

        if (episode % args.save_interval == 0 or episode == episodes -
                1):  # save for every interval-th episode or for the last episode
            if args.share_policy:
                torch.save({'model': actor_critic},
                           str(save_dir) + "/agent_model.pt")
            else:
                for agent_id in range(num_agents):
                    torch.save({'model': actor_critic[agent_id]},
                               str(save_dir) + "/agent%i_model" % agent_id +
                               ".pt")

        # log information
        if episode % args.log_interval == 0:
            end = time.time()
            print(
                "\n Scenario {} Algo {} updates {}/{} episodes, total num timesteps {}/{}, FPS {}.\n"
                .format(args.scenario_name, args.algorithm_name, episode,
                        episodes, total_num_steps, args.num_env_steps,
                        int(total_num_steps / (end - start))))
            if args.share_policy:
                print("value loss of agent: " + str(value_loss))
            else:
                for agent_id in range(num_agents):
                    print("value loss of agent%i: " % agent_id +
                          str(value_losses[agent_id]))

            logger.add_scalars('discard_episode',
                               {'discard_episode': discard_episode},
                               total_num_steps)
            if trials > 0:
                logger.add_scalars('success_rate',
                                   {'success_rate': success / trials},
                                   total_num_steps)
            else:
                logger.add_scalars('success_rate', {'success_rate': 0.0},
                                   total_num_steps)
        # eval
        if episode % args.eval_interval == 0 and args.eval:
            eval_episode = 0
            eval_success = 0
            eval_dict_obs = eval_env.reset()

            eval_obs = []
            eval_share_obs = []
            for eval_d_o in eval_dict_obs:
                for i, key in enumerate(order_obs):
                    if key in eval_env.observation_space.spaces.keys():
                        if mask_order_obs[i] is None:
                            temp_share_obs = eval_d_o[key].reshape(
                                num_agents, -1).copy()
                            temp_obs = temp_share_obs.copy()
                        else:
                            temp_share_obs = eval_d_o[key].reshape(
                                num_agents, -1).copy()
                            temp_mask = eval_d_o[mask_order_obs[i]].copy()
                            temp_obs = eval_d_o[key].copy()
                            mins_temp_mask = ~temp_mask
                            temp_obs[mins_temp_mask] = np.zeros(
                                (mins_temp_mask.sum(), temp_obs.shape[2]))
                            temp_obs = temp_obs.reshape(num_agents, -1)
                        if i == 0:
                            reshape_obs = temp_obs.copy()
                            reshape_share_obs = temp_share_obs.copy()
                        else:
                            reshape_obs = np.concatenate(
                                (reshape_obs, temp_obs), axis=1)
                            reshape_share_obs = np.concatenate(
                                (reshape_share_obs, temp_share_obs), axis=1)
                eval_obs.append(reshape_obs)
                eval_share_obs.append(reshape_share_obs)
            eval_obs = np.array(eval_obs)
            eval_share_obs = np.array(eval_share_obs)

            eval_recurrent_hidden_states = np.zeros(
                (1, num_agents, args.hidden_size)).astype(np.float32)
            eval_recurrent_hidden_states_critic = np.zeros(
                (1, num_agents, args.hidden_size)).astype(np.float32)
            eval_masks = np.ones((1, num_agents, 1)).astype(np.float32)

            while True:
                eval_actions = []
                actor_critic.eval()
                for agent_id in range(num_agents):
                    _, action, _, recurrent_hidden_states, recurrent_hidden_states_critic = actor_critic.act(
                        agent_id,
                        torch.FloatTensor(eval_share_obs[:, agent_id]),
                        torch.FloatTensor(eval_obs[:, agent_id]),
                        torch.FloatTensor(
                            eval_recurrent_hidden_states[:, agent_id]),
                        torch.FloatTensor(
                            eval_recurrent_hidden_states_critic[:, agent_id]),
                        torch.FloatTensor(eval_masks[:, agent_id]),
                        None,
                        deterministic=True)

                    eval_actions.append(action.detach().cpu().numpy())
                    eval_recurrent_hidden_states[:, agent_id] = \
                        recurrent_hidden_states.detach().cpu().numpy()
                    eval_recurrent_hidden_states_critic[:, agent_id] = \
                        recurrent_hidden_states_critic.detach().cpu().numpy()

                # rearrange action
                eval_actions_env = []
                for n_rollout_thread in range(1):
                    action_movement = []
                    action_pull = []
                    action_glueall = []
                    for agent_id in range(num_agents):
                        action_movement.append(
                            eval_actions[agent_id][n_rollout_thread]
                            [:action_movement_dim[agent_id]])
                        action_glueall.append(
                            int(eval_actions[agent_id][n_rollout_thread][
                                action_movement_dim[agent_id]]))
                        if 'action_pull' in envs.action_space.spaces.keys():
                            action_pull.append(
                                int(eval_actions[agent_id][n_rollout_thread]
                                    [-1]))
                    action_movement = np.stack(action_movement, axis=0)
                    action_glueall = np.stack(action_glueall, axis=0)
                    if 'action_pull' in envs.action_space.spaces.keys():
                        action_pull = np.stack(action_pull, axis=0)
                    one_env_action = {
                        'action_movement': action_movement,
                        'action_pull': action_pull,
                        'action_glueall': action_glueall
                    }
                    eval_actions_env.append(one_env_action)

                # Observe reward and next obs
                eval_dict_obs, eval_rewards, eval_dones, eval_infos = eval_env.step(
                    eval_actions_env)

                eval_obs = []
                eval_share_obs = []
                for eval_d_o in eval_dict_obs:
                    for i, key in enumerate(order_obs):
                        if key in eval_env.observation_space.spaces.keys():
                            if mask_order_obs[i] is None:
                                temp_share_obs = eval_d_o[key].reshape(
                                    num_agents, -1).copy()
                                temp_obs = temp_share_obs.copy()
                            else:
                                temp_share_obs = eval_d_o[key].reshape(
                                    num_agents, -1).copy()
                                temp_mask = eval_d_o[mask_order_obs[i]].copy()
                                temp_obs = eval_d_o[key].copy()
                                mins_temp_mask = ~temp_mask
                                temp_obs[mins_temp_mask] = np.zeros(
                                    (mins_temp_mask.sum(), temp_obs.shape[2]))
                                temp_obs = temp_obs.reshape(num_agents, -1)
                            if i == 0:
                                reshape_obs = temp_obs.copy()
                                reshape_share_obs = temp_share_obs.copy()
                            else:
                                reshape_obs = np.concatenate(
                                    (reshape_obs, temp_obs), axis=1)
                                reshape_share_obs = np.concatenate(
                                    (reshape_share_obs, temp_share_obs),
                                    axis=1)
                    eval_obs.append(reshape_obs)
                    eval_share_obs.append(reshape_share_obs)
                eval_obs = np.array(eval_obs)
                eval_share_obs = np.array(eval_share_obs)

                eval_recurrent_hidden_states = np.zeros(
                    (1, num_agents, args.hidden_size)).astype(np.float32)
                eval_recurrent_hidden_states_critic = np.zeros(
                    (1, num_agents, args.hidden_size)).astype(np.float32)
                eval_masks = np.ones((1, num_agents, 1)).astype(np.float32)

                if eval_dones[0]:
                    eval_episode += 1
                    if "success" in eval_infos[0].keys():
                        if eval_infos[0]['success']:
                            eval_success += 1
                    for agent_id in range(num_agents):
                        eval_recurrent_hidden_states[0][agent_id] = np.zeros(
                            args.hidden_size).astype(np.float32)
                        eval_recurrent_hidden_states_critic[0][
                            agent_id] = np.zeros(args.hidden_size).astype(
                                np.float32)
                        eval_masks[0][agent_id] = 0.0
                else:
                    for agent_id in range(num_agents):
                        eval_masks[0][agent_id] = 1.0

                if eval_episode >= args.eval_episodes:
                    logger.add_scalars('eval_success_rate', {
                        'eval_success_rate':
                        eval_success / args.eval_episodes
                    }, total_num_steps)
                    print("eval_success_rate is " +
                          str(eval_success / args.eval_episodes))
                    break

    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
    envs.close()
    if args.eval:
        eval_env.close()
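
The eval block above rebuilds per-agent observations by zeroing out masked entries of each observation key before flattening, while the shared (critic) view keeps everything. A minimal, self-contained sketch of just that masking-and-flattening step, with made-up shapes standing in for the real observation keys:

import numpy as np

# Illustrative shapes only: 2 agents, one observation key of shape (num_agents, 3, 4)
# plus a boolean visibility mask of shape (num_agents, 3).
num_agents = 2
obs_key = np.arange(num_agents * 3 * 4, dtype=np.float32).reshape(num_agents, 3, 4)
mask_key = np.array([[True, False, True],
                     [False, True, True]])

share_part = obs_key.reshape(num_agents, -1).copy()  # critic keeps the full view
obs_part = obs_key.copy()
hidden = ~mask_key                                    # entries an agent cannot see
obs_part[hidden] = np.zeros((hidden.sum(), obs_part.shape[2]))
obs_part = obs_part.reshape(num_agents, -1)           # actor input, flattened per agent

# Different observation keys would then be concatenated along axis=1, as in the loop above.
print(obs_part.shape, share_part.shape)               # (2, 12) (2, 12)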
Exemple #4
0
def main():
    args = get_config()

    assert not (
        args.share_policy
        and args.scenario_name == 'simple_speaker_listener'
    ), (
        "The simple_speaker_listener scenario cannot use a shared policy. Please check config.py."
    )

    # seed
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # cuda
    if args.cuda and torch.cuda.is_available():
        device = torch.device("cuda:0")
        torch.set_num_threads(1)
        if args.cuda_deterministic:
            torch.backends.cudnn.benchmark = False
            torch.backends.cudnn.deterministic = True
    else:
        device = torch.device("cpu")
        torch.set_num_threads(args.n_training_threads)

    # path
    model_dir = Path(
        './results') / args.env_name / args.scenario_name / args.algorithm_name
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)

    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    save_dir = run_dir / 'models'
    os.makedirs(str(log_dir))
    os.makedirs(str(save_dir))
    logger = SummaryWriter(str(log_dir))

    # env
    envs = make_parallel_env(args)
    num_agents = args.num_agents
    #Policy network
    if args.share_policy:
        share_base = ATTBase_add(envs.observation_space[0].shape[0],
                                 num_agents)
        actor_critic = Policy(envs.observation_space[0],
                              envs.action_space[0],
                              num_agents=num_agents,
                              base=share_base,
                              base_kwargs={
                                  'naive_recurrent':
                                  args.naive_recurrent_policy,
                                  'recurrent': args.recurrent_policy,
                                  'hidden_size': args.hidden_size,
                                  'attn': args.attn,
                                  'attn_size': args.attn_size,
                                  'attn_N': args.attn_N,
                                  'attn_heads': args.attn_heads,
                                  'dropout': args.dropout,
                                  'use_average_pool': args.use_average_pool,
                                  'use_common_layer': args.use_common_layer,
                                  'use_feature_normlization':
                                  args.use_feature_normlization,
                                  'use_feature_popart':
                                  args.use_feature_popart,
                                  'use_orthogonal': args.use_orthogonal,
                                  'layer_N': args.layer_N,
                                  'use_ReLU': args.use_ReLU
                              },
                              device=device)
        actor_critic.to(device)
        # algorithm
        agents = PPO(actor_critic,
                     args.clip_param,
                     args.ppo_epoch,
                     args.num_mini_batch,
                     args.data_chunk_length,
                     args.value_loss_coef,
                     args.entropy_coef,
                     logger,
                     lr=args.lr,
                     eps=args.eps,
                     weight_decay=args.weight_decay,
                     max_grad_norm=args.max_grad_norm,
                     use_max_grad_norm=args.use_max_grad_norm,
                     use_clipped_value_loss=args.use_clipped_value_loss,
                     use_common_layer=args.use_common_layer,
                     use_huber_loss=args.use_huber_loss,
                     huber_delta=args.huber_delta,
                     use_popart=args.use_popart,
                     device=device)

        #replay buffer
        rollouts = RolloutStorage_share(num_agents, args.episode_length,
                                        args.n_rollout_threads,
                                        envs.observation_space[0],
                                        envs.action_space[0], args.hidden_size)
    else:
        actor_critic = []
        agents = []
        rollouts = []
        for agent_id in range(num_agents):
            ac = Policy(
                envs.observation_space,
                envs.action_space[agent_id],
                num_agents=agent_id,  # note: agent_id is passed intentionally here (special case)
                base_kwargs={
                    'naive_recurrent': args.naive_recurrent_policy,
                    'recurrent': args.recurrent_policy,
                    'hidden_size': args.hidden_size,
                    'attn': args.attn,
                    'attn_size': args.attn_size,
                    'attn_N': args.attn_N,
                    'attn_heads': args.attn_heads,
                    'dropout': args.dropout,
                    'use_average_pool': args.use_average_pool,
                    'use_common_layer': args.use_common_layer,
                    'use_feature_normlization': args.use_feature_normlization,
                    'use_feature_popart': args.use_feature_popart,
                    'use_orthogonal': args.use_orthogonal,
                    'layer_N': args.layer_N,
                    'use_ReLU': args.use_ReLU
                },
                device=device)
            ac.to(device)
            # algorithm
            agent = PPO(ac,
                        args.clip_param,
                        args.ppo_epoch,
                        args.num_mini_batch,
                        args.data_chunk_length,
                        args.value_loss_coef,
                        args.entropy_coef,
                        logger,
                        lr=args.lr,
                        eps=args.eps,
                        weight_decay=args.weight_decay,
                        max_grad_norm=args.max_grad_norm,
                        use_max_grad_norm=args.use_max_grad_norm,
                        use_clipped_value_loss=args.use_clipped_value_loss,
                        use_common_layer=args.use_common_layer,
                        use_huber_loss=args.use_huber_loss,
                        huber_delta=args.huber_delta,
                        use_popart=args.use_popart,
                        device=device)

            actor_critic.append(ac)
            agents.append(agent)

            #replay buffer
            ro = SingleRolloutStorage(agent_id, args.episode_length,
                                      args.n_rollout_threads,
                                      envs.observation_space,
                                      envs.action_space, args.hidden_size)
            rollouts.append(ro)

    # reset env
    obs, _ = envs.reset(num_agents)

    # replay buffer
    if args.share_policy:
        share_obs = obs.reshape(args.n_rollout_threads, -1)
        # share_obs = np.expand_dims(share_obs,1).repeat(num_agents,axis=1)
        rollouts.share_obs[0] = share_obs.copy()
        rollouts.obs[0] = obs.copy()
        rollouts.recurrent_hidden_states = np.zeros(
            rollouts.recurrent_hidden_states.shape).astype(np.float32)
        rollouts.recurrent_hidden_states_critic = np.zeros(
            rollouts.recurrent_hidden_states_critic.shape).astype(np.float32)
    else:

        share_obs = []
        for o in obs:
            share_obs.append(list(itertools.chain(*o)))
        share_obs = np.array(share_obs)
        for agent_id in range(num_agents):
            rollouts[agent_id].share_obs[0] = share_obs.copy()
            rollouts[agent_id].obs[0] = np.array(list(obs[:, agent_id])).copy()
            rollouts[agent_id].recurrent_hidden_states = np.zeros(
                rollouts[agent_id].recurrent_hidden_states.shape).astype(
                    np.float32)
            rollouts[agent_id].recurrent_hidden_states_critic = np.zeros(
                rollouts[agent_id].recurrent_hidden_states_critic.shape
            ).astype(np.float32)

    # run
    start = time.time()
    episodes = int(
        args.num_env_steps) // args.episode_length // args.n_rollout_threads
    timesteps = 0

    for episode in range(episodes):
        if args.use_linear_lr_decay:  # decrease learning rate linearly
            if args.share_policy:
                update_linear_schedule(agents.optimizer, episode, episodes,
                                       args.lr)
            else:
                for agent_id in range(num_agents):
                    update_linear_schedule(agents[agent_id].optimizer, episode,
                                           episodes, args.lr)

        step_cover_rate = np.zeros(shape=(args.n_rollout_threads,
                                          args.episode_length))
        for step in range(args.episode_length):
            # Sample actions
            values = []
            actions = []
            action_log_probs = []
            recurrent_hidden_statess = []
            recurrent_hidden_statess_critic = []

            with torch.no_grad():
                for agent_id in range(num_agents):
                    if args.share_policy:
                        actor_critic.eval()
                        value, action, action_log_prob, recurrent_hidden_states, recurrent_hidden_states_critic = actor_critic.act(
                            agent_id,
                            # torch.FloatTensor(rollouts.share_obs[step,:,agent_id]),
                            torch.FloatTensor(rollouts.share_obs[step]),
                            torch.FloatTensor(rollouts.obs[step, :, agent_id]),
                            torch.FloatTensor(
                                rollouts.recurrent_hidden_states[step, :,
                                                                 agent_id]),
                            torch.FloatTensor(
                                rollouts.recurrent_hidden_states_critic[
                                    step, :, agent_id]),
                            torch.FloatTensor(rollouts.masks[step, :,
                                                             agent_id]))
                    else:
                        actor_critic[agent_id].eval()
                        value, action, action_log_prob, recurrent_hidden_states, recurrent_hidden_states_critic = actor_critic[
                            agent_id].act(
                                agent_id,
                                torch.FloatTensor(
                                    rollouts[agent_id].share_obs[step, :]),
                                torch.FloatTensor(
                                    rollouts[agent_id].obs[step, :]),
                                torch.FloatTensor(
                                    rollouts[agent_id].recurrent_hidden_states[
                                        step, :]),
                                torch.FloatTensor(
                                    rollouts[agent_id].
                                    recurrent_hidden_states_critic[step, :]),
                                torch.FloatTensor(
                                    rollouts[agent_id].masks[step, :]))

                    values.append(value.detach().cpu().numpy())
                    actions.append(action.detach().cpu().numpy())
                    action_log_probs.append(
                        action_log_prob.detach().cpu().numpy())
                    recurrent_hidden_statess.append(
                        recurrent_hidden_states.detach().cpu().numpy())
                    recurrent_hidden_statess_critic.append(
                        recurrent_hidden_states_critic.detach().cpu().numpy())

            # rearrange action
            actions_env = []
            for i in range(args.n_rollout_threads):
                one_hot_action_env = []
                for agent_id in range(num_agents):
                    if envs.action_space[
                            agent_id].__class__.__name__ == 'MultiDiscrete':
                        uc_action = []
                        for j in range(envs.action_space[agent_id].shape):
                            uc_one_hot_action = np.zeros(
                                envs.action_space[agent_id].high[j] + 1)
                            uc_one_hot_action[actions[agent_id][i][j]] = 1
                            uc_action.append(uc_one_hot_action)
                        uc_action = np.concatenate(uc_action)
                        one_hot_action_env.append(uc_action)

                    elif envs.action_space[
                            agent_id].__class__.__name__ == 'Discrete':
                        one_hot_action = np.zeros(
                            envs.action_space[agent_id].n)
                        one_hot_action[actions[agent_id][i]] = 1
                        one_hot_action_env.append(one_hot_action)
                    else:
                        raise NotImplementedError
                actions_env.append(one_hot_action_env)

            # Observe reward and next obs
            obs, rewards, dones, infos, _ = envs.step(actions_env,
                                                      args.n_rollout_threads,
                                                      num_agents)
            step_cover_rate[:, step] = np.array(infos)[:, 0]

            # If done then clean the history of observations.
            # insert data in buffer
            masks = []
            for i, done in enumerate(dones):
                mask = []
                for agent_id in range(num_agents):
                    if done[agent_id]:
                        recurrent_hidden_statess[agent_id][i] = np.zeros(
                            args.hidden_size).astype(np.float32)
                        recurrent_hidden_statess_critic[agent_id][
                            i] = np.zeros(args.hidden_size).astype(np.float32)
                        mask.append([0.0])
                    else:
                        mask.append([1.0])
                masks.append(mask)

            if args.share_policy:
                share_obs = obs.reshape(args.n_rollout_threads, -1)
                # share_obs = np.expand_dims(share_obs,1).repeat(num_agents,axis=1)

                rollouts.insert(
                    share_obs, obs,
                    np.array(recurrent_hidden_statess).transpose(1, 0, 2),
                    np.array(recurrent_hidden_statess_critic).transpose(
                        1, 0, 2),
                    np.array(actions).transpose(1, 0, 2),
                    np.array(action_log_probs).transpose(1, 0, 2),
                    np.array(values).transpose(1, 0, 2), rewards, masks)
            else:
                share_obs = []
                for o in obs:
                    share_obs.append(list(itertools.chain(*o)))
                share_obs = np.array(share_obs)
                for agent_id in range(num_agents):
                    rollouts[agent_id].insert(
                        share_obs, np.array(list(obs[:, agent_id])),
                        np.array(recurrent_hidden_statess[agent_id]),
                        np.array(recurrent_hidden_statess_critic[agent_id]),
                        np.array(actions[agent_id]),
                        np.array(action_log_probs[agent_id]),
                        np.array(values[agent_id]), rewards[:, agent_id],
                        np.array(masks)[:, agent_id])
        # import pdb;pdb.set_trace()
        logger.add_scalars(
            'agent/cover_rate_1step',
            {'cover_rate_1step': np.mean(step_cover_rate[:, -1])},
            (episode + 1) * args.episode_length * args.n_rollout_threads)
        logger.add_scalars('agent/cover_rate_5step', {
            'cover_rate_5step':
            np.mean(np.mean(step_cover_rate[:, -5:], axis=1))
        }, (episode + 1) * args.episode_length * args.n_rollout_threads)

        with torch.no_grad():
            for agent_id in range(num_agents):
                if args.share_policy:
                    actor_critic.eval()
                    next_value, _, _ = actor_critic.get_value(
                        agent_id,
                        #    torch.FloatTensor(rollouts.share_obs[-1,:,agent_id]),
                        torch.FloatTensor(rollouts.share_obs[-1]),
                        torch.FloatTensor(rollouts.obs[-1, :, agent_id]),
                        torch.FloatTensor(
                            rollouts.recurrent_hidden_states[-1, :, agent_id]),
                        torch.FloatTensor(
                            rollouts.recurrent_hidden_states_critic[-1, :,
                                                                    agent_id]),
                        torch.FloatTensor(rollouts.masks[-1, :, agent_id]))
                    next_value = next_value.detach().cpu().numpy()
                    rollouts.compute_returns(agent_id, next_value,
                                             args.use_gae, args.gamma,
                                             args.gae_lambda,
                                             args.use_proper_time_limits,
                                             args.use_popart,
                                             agents.value_normalizer)
                else:
                    actor_critic[agent_id].eval()
                    next_value, _, _ = actor_critic[agent_id].get_value(
                        agent_id,
                        torch.FloatTensor(rollouts[agent_id].share_obs[-1, :]),
                        torch.FloatTensor(rollouts[agent_id].obs[-1, :]),
                        torch.FloatTensor(
                            rollouts[agent_id].recurrent_hidden_states[-1, :]),
                        torch.FloatTensor(
                            rollouts[agent_id].recurrent_hidden_states_critic[
                                -1, :]),
                        torch.FloatTensor(rollouts[agent_id].masks[-1, :]))
                    next_value = next_value.detach().cpu().numpy()
                    rollouts[agent_id].compute_returns(
                        next_value, args.use_gae, args.gamma, args.gae_lambda,
                        args.use_proper_time_limits, args.use_popart,
                        agents[agent_id].value_normalizer)

        # update the network
        if args.share_policy:
            actor_critic.train()
            value_loss, action_loss, dist_entropy = agents.update_share(
                num_agents, rollouts)

            rew = []
            for i in range(rollouts.rewards.shape[1]):
                rew.append(np.sum(rollouts.rewards[:, i]))
            logger.add_scalars('average_episode_reward',
                               {'average_episode_reward': np.mean(rew)},
                               (episode + 1) * args.episode_length *
                               args.n_rollout_threads)
            # clean the buffer and reset
            rollouts.after_update()
        else:
            value_losses = []
            action_losses = []
            dist_entropies = []

            for agent_id in range(num_agents):
                actor_critic[agent_id].train()
                value_loss, action_loss, dist_entropy = agents[
                    agent_id].update_single(agent_id, rollouts[agent_id])
                value_losses.append(value_loss)
                action_losses.append(action_loss)
                dist_entropies.append(dist_entropy)

                rew = []
                for i in range(rollouts[agent_id].rewards.shape[1]):
                    rew.append(np.sum(rollouts[agent_id].rewards[:, i]))
                logger.add_scalars('agent%i/average_episode_reward' % agent_id,
                                   {'average_episode_reward': np.mean(rew)},
                                   (episode + 1) * args.episode_length *
                                   args.n_rollout_threads)

                rollouts[agent_id].after_update()

        obs, _ = envs.reset(num_agents)
        # replay buffer
        if args.share_policy:
            share_obs = obs.reshape(args.n_rollout_threads, -1)
            # share_obs = np.expand_dims(share_obs,1).repeat(num_agents,axis=1)
            rollouts.share_obs[0] = share_obs.copy()
            rollouts.obs[0] = obs.copy()
            rollouts.recurrent_hidden_states = np.zeros(
                rollouts.recurrent_hidden_states.shape).astype(np.float32)
            rollouts.recurrent_hidden_states_critic = np.zeros(
                rollouts.recurrent_hidden_states_critic.shape).astype(
                    np.float32)
        else:

            share_obs = []
            for o in obs:
                share_obs.append(list(itertools.chain(*o)))
            share_obs = np.array(share_obs)
            for agent_id in range(num_agents):
                rollouts[agent_id].share_obs[0] = share_obs.copy()
                rollouts[agent_id].obs[0] = np.array(list(
                    obs[:, agent_id])).copy()
                rollouts[agent_id].recurrent_hidden_states = np.zeros(
                    rollouts[agent_id].recurrent_hidden_states.shape).astype(
                        np.float32)
                rollouts[agent_id].recurrent_hidden_states_critic = np.zeros(
                    rollouts[agent_id].recurrent_hidden_states_critic.shape
                ).astype(np.float32)

        total_num_steps = (episode +
                           1) * args.episode_length * args.n_rollout_threads

        if (episode % args.save_interval == 0 or episode == episodes -
                1):  # save every save_interval episodes and after the last episode
            if args.share_policy:
                torch.save({'model': actor_critic},
                           str(save_dir) + "/agent_model.pt")
            else:
                for agent_id in range(num_agents):
                    torch.save({'model': actor_critic[agent_id]},
                               str(save_dir) + "/agent%i_model" % agent_id +
                               ".pt")

        # log information
        if episode % args.log_interval == 0:
            end = time.time()
            print(
                "\n Scenario {} Algo {} updates {}/{} episodes, total num timesteps {}/{}, FPS {}.\n"
                .format(args.scenario_name, args.algorithm_name, episode,
                        episodes, total_num_steps, args.num_env_steps,
                        int(total_num_steps / (end - start))))
            if args.share_policy:
                print("value loss of agent: " + str(value_loss))
            else:
                for agent_id in range(num_agents):
                    print("value loss of agent%i: " % agent_id +
                          str(value_losses[agent_id]))

            # if args.env_name == "MPE":
            #     for agent_id in range(num_agents):
            #         show_rewards = []
            #         for info in infos:
            #             if 'individual_reward' in info[agent_id].keys():
            #                 show_rewards.append(info[agent_id]['individual_reward'])
            #         logger.add_scalars('agent%i/individual_reward' % agent_id, {'individual_reward': np.mean(show_rewards)}, total_num_steps)

    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
    envs.close()
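
The action rearrangement loop above turns sampled indices into the one-hot vectors the MPE environments expect, handling both Discrete and MultiDiscrete action heads. A minimal sketch of that conversion in pure NumPy, with the space sizes passed in explicitly instead of being read from envs.action_space:

import numpy as np

def one_hot(index, size):
    # single categorical index -> one-hot vector of length `size`
    v = np.zeros(size)
    v[index] = 1.0
    return v

def multi_discrete_one_hot(indices, sizes):
    # one categorical head per dimension (e.g. movement actions),
    # concatenated into one flat vector, as in the MultiDiscrete branch above
    return np.concatenate([one_hot(i, n) for i, n in zip(indices, sizes)])

print(one_hot(2, 5))                           # -> [0. 0. 1. 0. 0.]
print(multi_discrete_one_hot([1, 0], [3, 2]))  # -> [0. 1. 0. 1. 0.]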
Exemple #5
0
class Trainer:
    def __init__(self, args):
        if args.seed is not None:
            torch.manual_seed(args.seed)
            np.random.seed(args.seed)
            random.seed(args.seed)

        self.env_name = args.environment
        self.env_setting = get_env_setting(self.env_name)
        self.solved_reward = self.env_setting["solved_reward"]
        self.update_timestep = self.env_setting["update_timestep"]
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        self.env = gym.make(args.environment)
        self.alg = PPO(args, self.device)
        self.log_interval = 5  # print avg reward in the interval
        self.max_episodes = 100000
        self.render = False

    def train(self):
        # logging variables
        running_reward = 0
        avg_length = 0
        time_step = 0
        memory = Memory()
        # self.alg.load_dict("./", self.env_name, self.alg_name, self.net_name)

        # training loop
        time_step = 0
        for i_episode in range(1, self.max_episodes + 1):
            self.alg.reset_memory()
            obs = self.env.reset(render_mode="logic")
            t = 0
            while True:
                t += 1
                # Running policy_old:
                action = self.alg.take_action(obs, memory)
                self.env.render()
                obs, reward, done, _ = self.env.step(action,
                                                     observation_mode="logic")

                # Saving reward and is_terminal:
                memory.rewards.append(reward)
                memory.is_terminals.append(done)

                running_reward += reward
                if self.render:
                    self.env.render()
                if done:
                    break
            time_step += t

            # update if its time
            if time_step >= self.update_timestep and done:
                self.alg.update(memory)
                memory.clear_memory()
                time_step = 0

            avg_length += t

            # save every 500 episodes
            if i_episode % 500 == 0:
                directory = "./epoch_performance"
                if not os.path.exists(directory):
                    os.makedirs(directory)
                self.alg.save_dict(directory, f'{self.env_name}_{i_episode}')

            # logging
            if i_episode % self.log_interval == 0:
                avg_length = int(avg_length / self.log_interval)
                running_reward = int((running_reward / self.log_interval))
                print('Episode {} \t avg length: {} \t reward: {}'.format(
                    i_episode, avg_length, running_reward))

                # stop training if avg_reward > solved_reward or the training episode limit is reached
                if running_reward > (self.log_interval * self.solved_reward):
                    print("########## Solved! ##########")
                    directory = "./success/"
                    if not os.path.exists(directory):
                        os.makedirs(directory)
                    self.alg.save_dict(directory,
                                       f'{self.env_name}_{self.log_interval}')
                    break

                running_reward = 0
                avg_length = 0
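
A minimal, hypothetical driver for the Trainer class above; get_config and the exact fields of args (seed, environment, PPO hyperparameters) are assumptions borrowed from the surrounding examples, not part of the original snippet:

# Hypothetical usage sketch: assumes Trainer and get_config are importable from
# the project this snippet was taken from.
if __name__ == "__main__":
    args = get_config()
    trainer = Trainer(args)
    trainer.train()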
Exemple #6
0
def main():
    args = get_config()

    # cuda
    if args.cuda and torch.cuda.is_available():
        device = torch.device("cuda:0")
        torch.set_num_threads(args.n_training_threads)
        if args.cuda_deterministic:
            torch.backends.cudnn.benchmark = False
            torch.backends.cudnn.deterministic = True
    else:
        device = torch.device("cpu")
        torch.set_num_threads(args.n_training_threads)

    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # path
    model_dir = Path('./results') / args.env_name / args.algorithm_name / (
        "run" + str(args.seed))
    if args.critic_full_obs:
        run_dir = model_dir / 'adaptive'
    else:
        run_dir = model_dir / 'adaptive_only'
    log_dir = run_dir / 'logs'
    save_dir = run_dir / 'models'
    os.makedirs(str(log_dir))
    os.makedirs(str(save_dir))
    logger = SummaryWriter(str(log_dir))

    print(
        "\n Now we have %i fixed policy! Train Single Adaptive Policy... \n" %
        args.num_policy_candidates)
    args.env_name = args.env_name + "Adaptive"
    policy_candidates = []
    for i in range(args.num_policy_candidates):
        ac = torch.load(
            str(model_dir) + ("/models/Policy%i" % (i + 1)) +
            "-agent0_model.pt")['model'].cpu()
        policy_candidates.append(ac)

    # env
    envs = make_parallel_env(args, policy_candidates)

    #Policy network
    # agent 0
    actor_critic = Policy(envs.observation_space[0],
                          envs.action_space[0],
                          num_agents=args.num_agents,
                          base_kwargs={
                              'lstm': args.lstm,
                              'naive_recurrent': args.naive_recurrent_policy,
                              'recurrent': args.recurrent_policy,
                              'hidden_size': args.hidden_size
                          })

    actor_critic.to(device)
    agent0 = PPO(actor_critic,
                 0,
                 args.clip_param,
                 args.ppo_epoch,
                 args.num_mini_batch,
                 args.data_chunk_length,
                 args.value_loss_coef,
                 args.entropy_coef,
                 logger,
                 lr=args.lr,
                 eps=args.eps,
                 max_grad_norm=args.max_grad_norm,
                 use_clipped_value_loss=args.use_clipped_value_loss)
    #replay buffer
    rollout = RolloutStorage(args.num_agents, 0, args.episode_length,
                             args.n_rollout_threads, envs.observation_space[0],
                             envs.action_space[0],
                             actor_critic.recurrent_hidden_state_size)

    # reset
    if args.critic_full_obs:
        obs, obs_critic, select_opponent = envs.reset()
    else:
        obs, select_opponent = envs.reset()

    # rollout
    if len(envs.observation_space[0]) == 1:
        if args.critic_full_obs:
            rollout.share_obs[0].copy_(
                torch.tensor(obs_critic.reshape(args.n_rollout_threads, -1)))
        else:
            rollout.share_obs[0].copy_(
                torch.tensor(obs.reshape(args.n_rollout_threads, -1)))
        rollout.obs[0].copy_(torch.tensor(obs[:, 0, :]))
        rollout.recurrent_hidden_states.zero_()
        rollout.recurrent_hidden_states_critic.zero_()
        rollout.recurrent_c_states.zero_()
        rollout.recurrent_c_states_critic.zero_()
    else:
        raise NotImplementedError
    rollout.to(device)

    # run
    collective_return = []
    apple_consumption = []
    waste_cleared = []
    sustainability = []
    fire = []

    start = time.time()
    episodes = int(
        args.num_env_steps) // args.episode_length // args.n_rollout_threads
    all_episode = 0
    all_episode_adaptive = np.zeros(args.num_policy_candidates)

    for episode in range(episodes):
        if args.use_linear_lr_decay:
            update_linear_schedule(agent0.optimizer, episode, episodes,
                                   args.lr)

        for step in range(args.episode_length):
            with torch.no_grad():
                value, action0, action_log_prob, recurrent_hidden_states, recurrent_hidden_states_critic, recurrent_c_states, recurrent_c_states_critic = actor_critic.act(
                    rollout.share_obs[step], rollout.obs[step],
                    rollout.recurrent_hidden_states[step],
                    rollout.recurrent_hidden_states_critic[step],
                    rollout.recurrent_c_states[step],
                    rollout.recurrent_c_states_critic[step],
                    rollout.masks[step])

            # rearrange action
            actions_env = []
            for i in range(args.n_rollout_threads):
                one_hot_action = np.zeros((1, envs.action_space[0].n))
                one_hot_action[0][action0[i]] = 1
                actions_env.append(one_hot_action)

            # Observe reward and next obs
            if args.critic_full_obs:
                obs, obs_critic, select_opponent, reward, done, infos = envs.step(
                    actions_env)
            else:
                obs, select_opponent, reward, done, infos = envs.step(
                    actions_env)

            # If done then clean the history of observations.
            # insert data in buffer
            masks = []
            bad_masks = []
            for i in range(args.num_agents):
                mask = []
                bad_mask = []
                for done_ in done:
                    if done_[i]:
                        mask.append([0.0])
                        bad_mask.append([1.0])
                    else:
                        mask.append([1.0])
                        bad_mask.append([1.0])
                masks.append(torch.FloatTensor(mask))
                bad_masks.append(torch.FloatTensor(bad_mask))

            if len(envs.observation_space[0]) == 1:
                if args.critic_full_obs:
                    rollout.insert(
                        torch.tensor(
                            obs_critic.reshape(args.n_rollout_threads, -1)),
                        torch.tensor(obs[:, 0, :]), recurrent_hidden_states,
                        recurrent_hidden_states_critic, recurrent_c_states,
                        recurrent_c_states_critic, action0,
                        action_log_prob, value,
                        torch.tensor(reward[:, 0].reshape(-1, 1)), masks[0],
                        bad_masks[0])
                else:
                    rollout.insert(
                        torch.tensor(obs.reshape(args.n_rollout_threads, -1)),
                        torch.tensor(obs[:, 0, :]), recurrent_hidden_states,
                        recurrent_hidden_states_critic, recurrent_c_states,
                        recurrent_c_states_critic, action0,
                        action_log_prob, value,
                        torch.tensor(reward[:, 0].reshape(-1, 1)), masks[0],
                        bad_masks[0])
            else:
                raise NotImplementedError

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollout.share_obs[-1], rollout.obs[-1],
                rollout.recurrent_hidden_states[-1],
                rollout.recurrent_hidden_states_critic[-1],
                rollout.recurrent_c_states[-1],
                rollout.recurrent_c_states_critic[-1],
                rollout.masks[-1]).detach()

        rollout.compute_returns(next_value, args.use_gae, args.gamma,
                                args.gae_lambda, args.use_proper_time_limits)

        # update the network
        value_loss, action_loss, dist_entropy = agent0.update(rollout)

        if args.env_name == "StagHuntAdaptive":
            coop_num = []
            defect_num = []
            coopdefect_num = []
            defectcoop_num = []
            rew = []
            for info in infos:
                if 'coop&coop_num' in info.keys():
                    coop_num.append(info['coop&coop_num'])
                if 'defect&defect_num' in info.keys():
                    defect_num.append(info['defect&defect_num'])
                if 'coop&defect_num' in info.keys():
                    coopdefect_num.append(info['coop&defect_num'])
                if 'defect&coop_num' in info.keys():
                    defectcoop_num.append(info['defect&coop_num'])
            for i in range(args.n_rollout_threads):
                rew.append(rollout.rewards[:, i, :].sum().cpu().numpy())

            for i in range(args.n_rollout_threads):
                logger.add_scalars(
                    'Policy-' + str(select_opponent[i] + 1) +
                    '/coop&coop_num_per_episode',
                    {'coop&coop_num_per_episode': coop_num[i]},
                    all_episode_adaptive[select_opponent[i]])
                logger.add_scalars(
                    'Policy-' + str(select_opponent[i] + 1) +
                    '/defect&defect_num_per_episode',
                    {'defect&defect_num_per_episode': defect_num[i]},
                    all_episode_adaptive[select_opponent[i]])
                logger.add_scalars(
                    'Policy-' + str(select_opponent[i] + 1) +
                    '/coop&defect_num_per_episode',
                    {'coop&defect_num_per_episode': coopdefect_num[i]},
                    all_episode_adaptive[select_opponent[i]])
                logger.add_scalars(
                    'Policy-' + str(select_opponent[i] + 1) +
                    '/defect&coop_num_per_episode',
                    {'defect&coop_num_per_episode': defectcoop_num[i]},
                    all_episode_adaptive[select_opponent[i]])
                logger.add_scalars(
                    'Policy-' + str(select_opponent[i] + 1) + '/reward',
                    {'reward': np.mean(np.array(rew[i]))},
                    all_episode_adaptive[select_opponent[i]])
                all_episode_adaptive[select_opponent[i]] += 1
        elif args.env_name == "StagHuntGWAdaptive":
            collective_return = []
            coop_num = []
            gore1_num = []
            gore2_num = []
            hare1_num = []
            hare2_num = []
            for info in infos:
                if 'collective_return' in info.keys():
                    collective_return.append(info['collective_return'])
                if 'coop&coop_num' in info.keys():
                    coop_num.append(info['coop&coop_num'])
                if 'gore1_num' in info.keys():
                    gore1_num.append(info['gore1_num'])
                if 'gore2_num' in info.keys():
                    gore2_num.append(info['gore2_num'])
                if 'hare1_num' in info.keys():
                    hare1_num.append(info['hare1_num'])
                if 'hare2_num' in info.keys():
                    hare2_num.append(info['hare2_num'])

            for i in range(args.n_rollout_threads):
                logger.add_scalars(
                    'Policy-' + str(select_opponent[i] + 1) +
                    '/collective_return',
                    {'collective_return': collective_return[i]},
                    all_episode_adaptive[select_opponent[i]])
                logger.add_scalars(
                    'Policy-' + str(select_opponent[i] + 1) +
                    '/coop&coop_num_per_episode',
                    {'coop&coop_num_per_episode': coop_num[i]},
                    all_episode_adaptive[select_opponent[i]])
                logger.add_scalars(
                    'Policy-' + str(select_opponent[i] + 1) +
                    '/gore1_num_per_episode',
                    {'gore1_num_per_episode': gore1_num[i]},
                    all_episode_adaptive[select_opponent[i]])
                logger.add_scalars(
                    'Policy-' + str(select_opponent[i] + 1) +
                    '/gore2_num_per_episode',
                    {'gore2_num_per_episode': gore2_num[i]},
                    all_episode_adaptive[select_opponent[i]])
                logger.add_scalars(
                    'Policy-' + str(select_opponent[i] + 1) +
                    '/hare1_num_per_episode',
                    {'hare1_num_per_episode': hare1_num[i]},
                    all_episode_adaptive[select_opponent[i]])
                logger.add_scalars(
                    'Policy-' + str(select_opponent[i] + 1) +
                    '/hare2_num_per_episode',
                    {'hare2_num_per_episode': hare2_num[i]},
                    all_episode_adaptive[select_opponent[i]])
                all_episode_adaptive[select_opponent[i]] += 1

        if args.critic_full_obs:
            obs, obs_critic, select_opponent = envs.reset()
        else:
            obs, select_opponent = envs.reset()

        if len(envs.observation_space[0]) == 1:
            if args.critic_full_obs:
                rollout.share_obs[0].copy_(
                    torch.tensor(obs_critic.reshape(args.n_rollout_threads,
                                                    -1)))
            else:
                rollout.share_obs[0].copy_(
                    torch.tensor(obs.reshape(args.n_rollout_threads, -1)))
            rollout.obs[0].copy_(torch.tensor(obs[:, 0, :]))
            rollout.recurrent_hidden_states.zero_()
            rollout.recurrent_hidden_states_critic.zero_()
            rollout.recurrent_c_states.zero_()
            rollout.recurrent_c_states_critic.zero_()
            rollout.masks[0].copy_(torch.ones(args.n_rollout_threads, 1))
            rollout.bad_masks[0].copy_(torch.ones(args.n_rollout_threads, 1))
        else:
            raise NotImplementedError
        rollout.to(device)

        if (episode % args.save_interval == 0 or episode == episodes - 1):
            torch.save({'model': actor_critic},
                       str(save_dir) + "/agent0_model.pt")

        # log information
        if episode % args.log_interval == 0:
            total_num_steps = (
                episode + 1) * args.episode_length * args.n_rollout_threads
            end = time.time()
            print(
                "\n Updates {}/{} episodes, total num timesteps {}/{}, FPS {}.\n"
                .format(episode, episodes, total_num_steps, args.num_env_steps,
                        int(total_num_steps / (end - start))))
            print("value loss: agent0--" + str(value_loss))
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
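
rollout.compute_returns above folds the bootstrapped next_value into the returns PPO trains against. A minimal NumPy sketch of the GAE recursion under the usual conventions (gamma and gae_lambda as passed above; PopArt normalization and proper time limits omitted):

import numpy as np

def gae_returns(rewards, values, masks, next_value, gamma=0.99, gae_lambda=0.95):
    # rewards, values, masks: shape (T,); masks[t] = 0.0 if the episode ended at
    # step t (so the bootstrap is cut at the boundary), else 1.0.
    values = np.append(values, next_value)  # V(s_0), ..., V(s_T)
    returns = np.zeros_like(rewards, dtype=np.float64)
    gae = 0.0
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * values[t + 1] * masks[t] - values[t]
        gae = delta + gamma * gae_lambda * masks[t] * gae
        returns[t] = gae + values[t]
    return returns

# Toy check: a two-step episode with zero value estimates gives the lambda-return.
print(gae_returns(np.array([1.0, 1.0]), np.zeros(2), np.array([1.0, 0.0]), next_value=0.0))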
Exemple #7
0
def main():
    args = get_config()
    run = wandb.init(project='curriculum', name=str(args.algorithm_name) + "_seed" + str(args.seed))
    # run = wandb.init(project='check', name='separate_reward')

    assert not (args.share_policy and args.scenario_name == 'simple_speaker_listener'), ("The simple_speaker_listener scenario cannot use a shared policy. Please check config.py.")

    # seed
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    
    # cuda
    if args.cuda and torch.cuda.is_available():
        device = torch.device("cuda:0")
        torch.set_num_threads(1)
        if args.cuda_deterministic:
            torch.backends.cudnn.benchmark = False
            torch.backends.cudnn.deterministic = True
    else:
        device = torch.device("cpu")
        torch.set_num_threads(args.n_training_threads)
    
    # path
    model_dir = Path('./results') / args.env_name / args.scenario_name / args.algorithm_name
    node_dir = Path('./node') / args.env_name / args.scenario_name / args.algorithm_name
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in model_dir.iterdir() if str(folder.name).startswith('run')]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)
    if not node_dir.exists():
        node_curr_run = 'run1'
    else:
        exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in node_dir.iterdir() if str(folder.name).startswith('run')]
        if len(exst_run_nums) == 0:
            node_curr_run = 'run1'
        else:
            node_curr_run = 'run%i' % (max(exst_run_nums) + 1)

    run_dir = model_dir / curr_run
    save_node_dir = node_dir / node_curr_run
    log_dir = run_dir / 'logs'
    save_dir = run_dir / 'models'
    os.makedirs(str(log_dir))
    os.makedirs(str(save_dir))
    logger = SummaryWriter(str(log_dir)) 

    # env
    envs = make_parallel_env(args)
    num_agents = args.num_agents
    #Policy network
    if args.share_policy:
        actor_base = ATTBase_actor_dist_add(envs.observation_space[0].shape[0], envs.action_space[0], num_agents)
        critic_base = ATTBase_critic_add(envs.observation_space[0].shape[0], num_agents)
        actor_critic = Policy3(envs.observation_space[0], 
                    envs.action_space[0],
                    num_agents = num_agents,
                    base=None,
                    actor_base=actor_base,
                    critic_base=critic_base,
                    base_kwargs={'naive_recurrent': args.naive_recurrent_policy,
                                 'recurrent': args.recurrent_policy,
                                 'hidden_size': args.hidden_size,
                                 'attn': args.attn,                                 
                                 'attn_size': args.attn_size,
                                 'attn_N': args.attn_N,
                                 'attn_heads': args.attn_heads,
                                 'dropout': args.dropout,
                                 'use_average_pool': args.use_average_pool,
                                 'use_common_layer':args.use_common_layer,
                                 'use_feature_normlization':args.use_feature_normlization,
                                 'use_feature_popart':args.use_feature_popart,
                                 'use_orthogonal':args.use_orthogonal,
                                 'layer_N':args.layer_N,
                                 'use_ReLU':args.use_ReLU
                                 },
                    device = device)
        actor_critic.to(device)
        # algorithm
        agents = PPO3(actor_critic,
                   args.clip_param,
                   args.ppo_epoch,
                   args.num_mini_batch,
                   args.data_chunk_length,
                   args.value_loss_coef,
                   args.entropy_coef,
                   logger,
                   lr=args.lr,
                   eps=args.eps,
                   weight_decay=args.weight_decay,
                   max_grad_norm=args.max_grad_norm,
                   use_max_grad_norm=args.use_max_grad_norm,
                   use_clipped_value_loss= args.use_clipped_value_loss,
                   use_common_layer=args.use_common_layer,
                   use_huber_loss=args.use_huber_loss,
                   huber_delta=args.huber_delta,
                   use_popart=args.use_popart,
                   device=device)
                   
        # #replay buffer
        # rollouts = RolloutStorage(num_agents,
        #             args.episode_length, 
        #             args.n_rollout_threads,
        #             envs.observation_space[0], 
        #             envs.action_space[0],
        #             args.hidden_size)        
    else:
        actor_critic = []
        agents = []
        rollouts = []
        for agent_id in range(num_agents):
            ac = Policy(envs.observation_space, 
                      envs.action_space[agent_id],
                      num_agents = agent_id, # note: agent_id is passed intentionally here (special case)
                      base_kwargs={'naive_recurrent': args.naive_recurrent_policy,
                                 'recurrent': args.recurrent_policy,
                                 'hidden_size': args.hidden_size,
                                 'attn': args.attn,                                 
                                 'attn_size': args.attn_size,
                                 'attn_N': args.attn_N,
                                 'attn_heads': args.attn_heads,
                                 'dropout': args.dropout,
                                 'use_average_pool': args.use_average_pool,
                                 'use_common_layer':args.use_common_layer,
                                 'use_feature_normlization':args.use_feature_normlization,
                                 'use_feature_popart':args.use_feature_popart,
                                 'use_orthogonal':args.use_orthogonal,
                                 'layer_N':args.layer_N,
                                 'use_ReLU':args.use_ReLU
                                 },
                      device = device)
            ac.to(device)
            # algorithm
            agent = PPO(ac,
                   args.clip_param,
                   args.ppo_epoch,
                   args.num_mini_batch,
                   args.data_chunk_length,
                   args.value_loss_coef,
                   args.entropy_coef,
                   logger,
                   lr=args.lr,
                   eps=args.eps,
                   weight_decay=args.weight_decay,
                   max_grad_norm=args.max_grad_norm,
                   use_max_grad_norm=args.use_max_grad_norm,
                   use_clipped_value_loss= args.use_clipped_value_loss,
                   use_common_layer=args.use_common_layer,
                   use_huber_loss=args.use_huber_loss,
                   huber_delta=args.huber_delta,
                   use_popart=args.use_popart,
                   device=device)
                               
            actor_critic.append(ac)
            agents.append(agent) 
              
            #replay buffer
            ro = SingleRolloutStorage(agent_id,
                    args.episode_length, 
                    args.n_rollout_threads,
                    envs.observation_space, 
                    envs.action_space,
                    args.hidden_size)
            rollouts.append(ro)
    
   
    boundary = 3
    start_boundary = [-0.3,0.3,-0.3,0.3] # the x range and the y range, respectively
    # start_boundary = [2.4,3.0,2.4,3.0]
    max_step = 0.6
    N_easy = 0
    test_flag = 0
    reproduce_flag = 0
    target_num = 4
    last_agent_num = 4
    now_agent_num = num_agents
    mean_cover_rate = 0
    eval_frequency = 2 # how many episodes to keep the sampled goals fixed
    check_frequency = 1
    save_node_frequency = 5
    save_node_flag = True
    save_90_flag = True
    historical_length = 5
    random.seed(args.seed)
    np.random.seed(args.seed)


    # init the Gan
    gan_configs['goal_range'] = boundary
    gan_configs['goal_center'] = np.zeros((num_agents + num_agents)* 2, dtype=float)
    gan_configs['goal_size'] = (num_agents + num_agents)*2
    gan = StateGAN(gan_configs = gan_configs, state_range=gan_configs['goal_range'])
    feasible_goals = generate_initial_goals(num_case = 10000, start_boundary = start_boundary, agent_num = args.num_agents)                            
    dis_loss, gen_loss = gan.pretrain(states=feasible_goals, outer_iters=gan_configs['gan_outer_iters'])
    print('discriminator_loss:',str(dis_loss.cpu()), 'generator_loss:',str(gen_loss.cpu()))
    
    # init the StateCollection
    all_goals = StateCollection(distance_threshold=goal_configs['coll_eps'])

    # run
    begin = time.time()
    episodes = int(args.num_env_steps) // args.episode_length // args.n_rollout_threads // eval_frequency
    curriculum_episode = 0
    current_timestep = 0
    one_length = args.n_rollout_threads
    starts_length = args.n_rollout_threads
    num_envs = args.n_rollout_threads

    for episode in range(episodes):
        if args.use_linear_lr_decay:# decrease learning rate linearly
            if args.share_policy:   
                update_linear_schedule(agents.optimizer, episode, episodes, args.lr)  
            else:     
                for agent_id in range(num_agents):
                    update_linear_schedule(agents[agent_id].optimizer, episode, episodes, args.lr)           
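        # update_linear_schedule is defined elsewhere in this repo; it presumably
        # anneals each optimizer param group's learning rate linearly, roughly:
        #   lr_now = args.lr * (1 - episode / float(episodes))
        #   for param_group in optimizer.param_groups:
        #       param_group['lr'] = lr_now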



        raw_goals, _ = gan.sample_states_with_noise(goal_configs['num_new_goals'])
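        # Mix the freshly generated goals with replayed old goals and pad the batch
        # below so that every rollout thread gets one start configuration
        # (num_agents*2 points in 2D, presumably agents plus landmarks).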
        # replay buffer
        if all_goals.size > 0:
            old_goals = all_goals.sample(goal_configs['num_old_goals'])
            goals = np.vstack([raw_goals, old_goals])
        else:
            goals = raw_goals   
        if goals.shape[0] < num_envs:
            add_num = num_envs - goals.shape[0]
            goals = np.vstack([goals, goals[:add_num]])  # pad up to num_new_goals + num_old_goals
        # generate the starts
        starts = numpy_to_list(goals, list_length=num_envs, shape=(num_agents*2,2))

        for times in range(eval_frequency):
            obs = envs.new_starts_obs(starts, now_agent_num, starts_length)
            #replay buffer
            rollouts = RolloutStorage(num_agents,
                        args.episode_length, 
                        starts_length,
                        envs.observation_space[0], 
                        envs.action_space[0],
                        args.hidden_size) 
            # replay buffer init
            if args.share_policy: 
                share_obs = obs.reshape(starts_length, -1)        
                share_obs = np.expand_dims(share_obs,1).repeat(num_agents,axis=1)    
                rollouts.share_obs[0] = share_obs.copy() 
                rollouts.obs[0] = obs.copy()               
                rollouts.recurrent_hidden_states = np.zeros(rollouts.recurrent_hidden_states.shape).astype(np.float32)
                rollouts.recurrent_hidden_states_critic = np.zeros(rollouts.recurrent_hidden_states_critic.shape).astype(np.float32)
            else:
                share_obs = []
                for o in obs:
                    share_obs.append(list(itertools.chain(*o)))
                share_obs = np.array(share_obs)
                for agent_id in range(num_agents):    
                    rollouts[agent_id].share_obs[0] = share_obs.copy()
                    rollouts[agent_id].obs[0] = np.array(list(obs[:,agent_id])).copy()               
                    rollouts[agent_id].recurrent_hidden_states = np.zeros(rollouts[agent_id].recurrent_hidden_states.shape).astype(np.float32)
                    rollouts[agent_id].recurrent_hidden_states_critic = np.zeros(rollouts[agent_id].recurrent_hidden_states_critic.shape).astype(np.float32)
            step_cover_rate = np.zeros(shape=(one_length,args.episode_length))
            for step in range(args.episode_length):
                # Sample actions
                values = []
                actions= []
                action_log_probs = []
                recurrent_hidden_statess = []
                recurrent_hidden_statess_critic = []
                
                with torch.no_grad():                
                    for agent_id in range(num_agents):
                        if args.share_policy:
                            actor_critic.eval()
                            value, action, action_log_prob, recurrent_hidden_states, recurrent_hidden_states_critic = actor_critic.act(agent_id,
                                torch.FloatTensor(rollouts.share_obs[step,:,agent_id]), 
                                torch.FloatTensor(rollouts.obs[step,:,agent_id]), 
                                torch.FloatTensor(rollouts.recurrent_hidden_states[step,:,agent_id]), 
                                torch.FloatTensor(rollouts.recurrent_hidden_states_critic[step,:,agent_id]),
                                torch.FloatTensor(rollouts.masks[step,:,agent_id]))
                        else:
                            actor_critic[agent_id].eval()
                            value, action, action_log_prob, recurrent_hidden_states, recurrent_hidden_states_critic = actor_critic[agent_id].act(agent_id,
                                torch.FloatTensor(rollouts[agent_id].share_obs[step,:]), 
                                torch.FloatTensor(rollouts[agent_id].obs[step,:]), 
                                torch.FloatTensor(rollouts[agent_id].recurrent_hidden_states[step,:]), 
                                torch.FloatTensor(rollouts[agent_id].recurrent_hidden_states_critic[step,:]),
                                torch.FloatTensor(rollouts[agent_id].masks[step,:]))
                            
                        values.append(value.detach().cpu().numpy())
                        actions.append(action.detach().cpu().numpy())
                        action_log_probs.append(action_log_prob.detach().cpu().numpy())
                        recurrent_hidden_statess.append(recurrent_hidden_states.detach().cpu().numpy())
                        recurrent_hidden_statess_critic.append(recurrent_hidden_states_critic.detach().cpu().numpy())
                # rearrange action
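                # Convert each agent's integer action into the one-hot (or, for
                # MultiDiscrete spaces, concatenated multi-one-hot) vector the
                # environment expects, e.g. Discrete(5) with action 2 -> [0,0,1,0,0].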
                actions_env = []
                for i in range(starts_length):
                    one_hot_action_env = []
                    for agent_id in range(num_agents):
                        if envs.action_space[agent_id].__class__.__name__ == 'MultiDiscrete':
                            uc_action = []
                            for j in range(envs.action_space[agent_id].shape):
                                uc_one_hot_action = np.zeros(envs.action_space[agent_id].high[j]+1)
                                uc_one_hot_action[actions[agent_id][i][j]] = 1
                                uc_action.append(uc_one_hot_action)
                            uc_action = np.concatenate(uc_action)
                            one_hot_action_env.append(uc_action)
                                
                        elif envs.action_space[agent_id].__class__.__name__ == 'Discrete':    
                            one_hot_action = np.zeros(envs.action_space[agent_id].n)
                            one_hot_action[actions[agent_id][i]] = 1
                            one_hot_action_env.append(one_hot_action)
                        else:
                            raise NotImplementedError
                    actions_env.append(one_hot_action_env)
                
                # Observe reward and next obs
                obs, rewards, dones, infos, _ = envs.step(actions_env, starts_length, num_agents)
                cover_rate_list = []
                for env_id in range(one_length):
                    cover_rate_list.append(infos[env_id][0]['cover_rate'])
                step_cover_rate[:,step] = np.array(cover_rate_list)
                # step_cover_rate[:,step] = np.array(infos)[0:one_length,0]

                # If done then clean the history of observations.
                # insert data in buffer
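                # masks[i][agent_id] = 0.0 marks a finished agent: its recurrent
                # states are zeroed here and the zero mask later prevents value
                # bootstrapping across the episode boundary.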
                masks = []
                for i, done in enumerate(dones): 
                    mask = []               
                    for agent_id in range(num_agents): 
                        if done[agent_id]:    
                            recurrent_hidden_statess[agent_id][i] = np.zeros(args.hidden_size).astype(np.float32)
                            recurrent_hidden_statess_critic[agent_id][i] = np.zeros(args.hidden_size).astype(np.float32)    
                            mask.append([0.0])
                        else:
                            mask.append([1.0])
                    masks.append(mask)
                                
                if args.share_policy: 
                    share_obs = obs.reshape(starts_length, -1)        
                    share_obs = np.expand_dims(share_obs,1).repeat(num_agents,axis=1)    
                    rollouts.insert(share_obs, 
                                obs, 
                                np.array(recurrent_hidden_statess).transpose(1,0,2), 
                                np.array(recurrent_hidden_statess_critic).transpose(1,0,2), 
                                np.array(actions).transpose(1,0,2),
                                np.array(action_log_probs).transpose(1,0,2), 
                                np.array(values).transpose(1,0,2),
                                rewards, 
                                masks)
                else:
                    share_obs = []
                    for o in obs:
                        share_obs.append(list(itertools.chain(*o)))
                    share_obs = np.array(share_obs)
                    for agent_id in range(num_agents):
                        rollouts[agent_id].insert(share_obs, 
                                np.array(list(obs[:,agent_id])), 
                                np.array(recurrent_hidden_statess[agent_id]), 
                                np.array(recurrent_hidden_statess_critic[agent_id]), 
                                np.array(actions[agent_id]),
                                np.array(action_log_probs[agent_id]), 
                                np.array(values[agent_id]),
                                rewards[:,agent_id], 
                                np.array(masks)[:,agent_id])
            # logger.add_scalars('agent/training_cover_rate',{'training_cover_rate': np.mean(np.mean(step_cover_rate[:,-historical_length:],axis=1))}, current_timestep)
            wandb.log({'training_cover_rate': np.mean(np.mean(step_cover_rate[:,-historical_length:],axis=1))}, current_timestep)
            print('training_cover_rate: ', np.mean(np.mean(step_cover_rate[:,-historical_length:],axis=1)))
            current_timestep += args.episode_length * starts_length
            curriculum_episode += 1
            
            #region train the gan
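            # Goal labelling: a goal is marked as "intermediate difficulty" when the
            # policy's recent coverage on it lies strictly between R_min and R_max;
            # those goals are used as positives to train the GAN and appended to the
            # goal replay buffer (all_goals) for later reuse.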

            
            if times == 1:
                start_time = time.time()
                filtered_raw_goals = []
                labels = np.zeros((num_envs, 1), dtype = int)
                for i in range(num_envs):
                    R_i = np.mean(step_cover_rate[i, -goal_configs['historical_length']:])
                    if R_i < goal_configs['R_max'] and R_i > goal_configs['R_min']:
                        labels[i] = 1
                        filtered_raw_goals.append(goals[i])
                gan.train(goals, labels)
                all_goals.append(filtered_raw_goals)
                end_time = time.time()
                print("Gan training time: %.2f"%(end_time-start_time))

            with torch.no_grad():  # get value and compute returns
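                # Bootstrap the value of the final state and compute discounted
                # returns / GAE advantages for each agent's rollout.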
                for agent_id in range(num_agents):         
                    if args.share_policy: 
                        actor_critic.eval()                
                        next_value,_,_ = actor_critic.get_value(agent_id,
                                                    torch.FloatTensor(rollouts.share_obs[-1,:,agent_id]), 
                                                    torch.FloatTensor(rollouts.obs[-1,:,agent_id]), 
                                                    torch.FloatTensor(rollouts.recurrent_hidden_states[-1,:,agent_id]),
                                                    torch.FloatTensor(rollouts.recurrent_hidden_states_critic[-1,:,agent_id]),
                                                    torch.FloatTensor(rollouts.masks[-1,:,agent_id]))
                        next_value = next_value.detach().cpu().numpy()
                        rollouts.compute_returns(agent_id,
                                        next_value, 
                                        args.use_gae, 
                                        args.gamma,
                                        args.gae_lambda, 
                                        args.use_proper_time_limits,
                                        args.use_popart,
                                        agents.value_normalizer)
                    else:
                        actor_critic[agent_id].eval()
                        next_value,_,_ = actor_critic[agent_id].get_value(agent_id,
                                                                torch.FloatTensor(rollouts[agent_id].share_obs[-1,:]), 
                                                                torch.FloatTensor(rollouts[agent_id].obs[-1,:]), 
                                                                torch.FloatTensor(rollouts[agent_id].recurrent_hidden_states[-1,:]),
                                                                torch.FloatTensor(rollouts[agent_id].recurrent_hidden_states_critic[-1,:]),
                                                                torch.FloatTensor(rollouts[agent_id].masks[-1,:]))
                        next_value = next_value.detach().cpu().numpy()
                        rollouts[agent_id].compute_returns(next_value, 
                                                args.use_gae, 
                                                args.gamma,
                                                args.gae_lambda, 
                                                args.use_proper_time_limits,
                                                args.use_popart,
                                                agents[agent_id].value_normalizer)

            # update the network
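            # PPO update with the clipped surrogate objective, roughly:
            #   ratio = exp(log_prob_new - log_prob_old)
            #   policy_loss = -min(ratio * adv, clip(ratio, 1-eps, 1+eps) * adv)
            # plus a value loss and an entropy bonus; huber loss, PopArt and value
            # clipping are switched by the flags passed to PPO above.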
            if args.share_policy:
                actor_critic.train()
                value_loss, action_loss, dist_entropy = agents.update_share_asynchronous(last_agent_num, rollouts, False, initial_optimizer=False) 
                print('value_loss: ', value_loss)
                wandb.log(
                    {'value_loss': value_loss},
                    current_timestep)
                rew = []
                for i in range(rollouts.rewards.shape[1]):
                    rew.append(np.sum(rollouts.rewards[:,i]))
                wandb.log(
                    {'average_episode_reward': np.mean(rew)},
                    current_timestep)
                # clean the buffer and reset
                rollouts.after_update()
            else:
                value_losses = []
                action_losses = []
                dist_entropies = [] 
                
                for agent_id in range(num_agents):
                    actor_critic[agent_id].train()
                    value_loss, action_loss, dist_entropy = agents[agent_id].update_single(agent_id, rollouts[agent_id])
                    value_losses.append(value_loss)
                    action_losses.append(action_loss)
                    dist_entropies.append(dist_entropy)
                        
                    rew = []
                    for i in range(rollouts[agent_id].rewards.shape[1]):
                        rew.append(np.sum(rollouts[agent_id].rewards[:,i]))

                    logger.add_scalars('agent%i/average_episode_reward'%agent_id,
                        {'average_episode_reward': np.mean(rew)},
                        (episode+1) * args.episode_length * one_length*eval_frequency)
                    
                    rollouts[agent_id].after_update()


        # test
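        # Periodic evaluation: reset the environment to its default starts and roll
        # the current policy out deterministically for a 70-step episode while
        # tracking the coverage rate; reaching a mean coverage of 0.9 (for the
        # 'ours' algorithm) triggers a one-off checkpoint.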
        if episode % check_frequency==0:
            obs, _ = envs.reset(num_agents)
            episode_length = 70
            #replay buffer
            rollouts = RolloutStorage(num_agents,
                        episode_length, 
                        args.n_rollout_threads,
                        envs.observation_space[0], 
                        envs.action_space[0],
                        args.hidden_size) 
            # replay buffer init
            if args.share_policy: 
                share_obs = obs.reshape(args.n_rollout_threads, -1)        
                share_obs = np.expand_dims(share_obs,1).repeat(num_agents,axis=1)    
                rollouts.share_obs[0] = share_obs.copy() 
                rollouts.obs[0] = obs.copy()               
                rollouts.recurrent_hidden_states = np.zeros(rollouts.recurrent_hidden_states.shape).astype(np.float32)
                rollouts.recurrent_hidden_states_critic = np.zeros(rollouts.recurrent_hidden_states_critic.shape).astype(np.float32)
            else:
                share_obs = []
                for o in obs:
                    share_obs.append(list(itertools.chain(*o)))
                share_obs = np.array(share_obs)
                for agent_id in range(num_agents):    
                    rollouts[agent_id].share_obs[0] = share_obs.copy()
                    rollouts[agent_id].obs[0] = np.array(list(obs[:,agent_id])).copy()               
                    rollouts[agent_id].recurrent_hidden_states = np.zeros(rollouts[agent_id].recurrent_hidden_states.shape).astype(np.float32)
                    rollouts[agent_id].recurrent_hidden_states_critic = np.zeros(rollouts[agent_id].recurrent_hidden_states_critic.shape).astype(np.float32)
            test_cover_rate = np.zeros(shape=(args.n_rollout_threads,episode_length))
            for step in range(episode_length):
                # Sample actions
                values = []
                actions= []
                action_log_probs = []
                recurrent_hidden_statess = []
                recurrent_hidden_statess_critic = []
                
                with torch.no_grad():                
                    for agent_id in range(num_agents):
                        if args.share_policy:
                            actor_critic.eval()
                            value, action, action_log_prob, recurrent_hidden_states, recurrent_hidden_states_critic = actor_critic.act(agent_id,
                                torch.FloatTensor(rollouts.share_obs[step,:,agent_id]), 
                                torch.FloatTensor(rollouts.obs[step,:,agent_id]), 
                                torch.FloatTensor(rollouts.recurrent_hidden_states[step,:,agent_id]), 
                                torch.FloatTensor(rollouts.recurrent_hidden_states_critic[step,:,agent_id]),
                                torch.FloatTensor(rollouts.masks[step,:,agent_id]),deterministic=True)
                        else:
                            actor_critic[agent_id].eval()
                            value, action, action_log_prob, recurrent_hidden_states, recurrent_hidden_states_critic = actor_critic[agent_id].act(agent_id,
                                torch.FloatTensor(rollouts[agent_id].share_obs[step,:]), 
                                torch.FloatTensor(rollouts[agent_id].obs[step,:]), 
                                torch.FloatTensor(rollouts[agent_id].recurrent_hidden_states[step,:]), 
                                torch.FloatTensor(rollouts[agent_id].recurrent_hidden_states_critic[step,:]),
                                torch.FloatTensor(rollouts[agent_id].masks[step,:]),deterministic=True)
                            
                        values.append(value.detach().cpu().numpy())
                        actions.append(action.detach().cpu().numpy())
                        action_log_probs.append(action_log_prob.detach().cpu().numpy())
                        recurrent_hidden_statess.append(recurrent_hidden_states.detach().cpu().numpy())
                        recurrent_hidden_statess_critic.append(recurrent_hidden_states_critic.detach().cpu().numpy())
                
                # rearrange action
                actions_env = []
                for i in range(args.n_rollout_threads):
                    one_hot_action_env = []
                    for agent_id in range(num_agents):
                        if envs.action_space[agent_id].__class__.__name__ == 'MultiDiscrete':
                            uc_action = []
                            for j in range(envs.action_space[agent_id].shape):
                                uc_one_hot_action = np.zeros(envs.action_space[agent_id].high[j]+1)
                                uc_one_hot_action[actions[agent_id][i][j]] = 1
                                uc_action.append(uc_one_hot_action)
                            uc_action = np.concatenate(uc_action)
                            one_hot_action_env.append(uc_action)
                                
                        elif envs.action_space[agent_id].__class__.__name__ == 'Discrete':    
                            one_hot_action = np.zeros(envs.action_space[agent_id].n)
                            one_hot_action[actions[agent_id][i]] = 1
                            one_hot_action_env.append(one_hot_action)
                        else:
                            raise NotImplementedError
                    actions_env.append(one_hot_action_env)
                
                # Observe reward and next obs
                obs, rewards, dones, infos, _ = envs.step(actions_env, args.n_rollout_threads, num_agents)
                cover_rate_list = []
                for env_id in range(args.n_rollout_threads):
                    cover_rate_list.append(infos[env_id][0]['cover_rate'])
                test_cover_rate[:,step] = np.array(cover_rate_list)
                # test_cover_rate[:,step] = np.array(infos)[:,0]

                # If done then clean the history of observations.
                # insert data in buffer
                masks = []
                for i, done in enumerate(dones): 
                    mask = []               
                    for agent_id in range(num_agents): 
                        if done[agent_id]:    
                            recurrent_hidden_statess[agent_id][i] = np.zeros(args.hidden_size).astype(np.float32)
                            recurrent_hidden_statess_critic[agent_id][i] = np.zeros(args.hidden_size).astype(np.float32)    
                            mask.append([0.0])
                        else:
                            mask.append([1.0])
                    masks.append(mask)
                                
                if args.share_policy: 
                    share_obs = obs.reshape(args.n_rollout_threads, -1)        
                    share_obs = np.expand_dims(share_obs,1).repeat(num_agents,axis=1)    
                    
                    rollouts.insert(share_obs, 
                                obs, 
                                np.array(recurrent_hidden_statess).transpose(1,0,2), 
                                np.array(recurrent_hidden_statess_critic).transpose(1,0,2), 
                                np.array(actions).transpose(1,0,2),
                                np.array(action_log_probs).transpose(1,0,2), 
                                np.array(values).transpose(1,0,2),
                                rewards, 
                                masks)
                else:
                    share_obs = []
                    for o in obs:
                        share_obs.append(list(itertools.chain(*o)))
                    share_obs = np.array(share_obs)
                    for agent_id in range(num_agents):
                        rollouts[agent_id].insert(share_obs, 
                                np.array(list(obs[:,agent_id])), 
                                np.array(recurrent_hidden_statess[agent_id]), 
                                np.array(recurrent_hidden_statess_critic[agent_id]), 
                                np.array(actions[agent_id]),
                                np.array(action_log_probs[agent_id]), 
                                np.array(values[agent_id]),
                                rewards[:,agent_id], 
                                np.array(masks)[:,agent_id])

            # logger.add_scalars('agent/cover_rate_1step',{'cover_rate_1step': np.mean(test_cover_rate[:,-1])},current_timestep)
            # logger.add_scalars('agent/cover_rate_5step',{'cover_rate_5step': np.mean(np.mean(test_cover_rate[:,-historical_length:],axis=1))}, current_timestep)
            rew = []
            for i in range(rollouts.rewards.shape[1]):
                rew.append(np.sum(rollouts.rewards[:,i]))
            wandb.log(
                {'eval_episode_reward': np.mean(rew)},
                current_timestep)
            wandb.log({'cover_rate_1step': np.mean(test_cover_rate[:,-1])},current_timestep)
            wandb.log({'cover_rate_5step': np.mean(np.mean(test_cover_rate[:,-historical_length:],axis=1))}, current_timestep)
            mean_cover_rate = np.mean(np.mean(test_cover_rate[:,-historical_length:],axis=1))
            if mean_cover_rate >= 0.9 and args.algorithm_name=='ours' and save_90_flag:
                torch.save({'model': actor_critic}, str(save_dir) + "/cover09_agent_model.pt")
                save_90_flag = False
            print('test_agent_num: ', last_agent_num)
            print('test_mean_cover_rate: ', mean_cover_rate)

        total_num_steps = current_timestep

        if (episode % args.save_interval == 0 or episode == episodes - 1):# save for every interval-th episode or for the last epoch
            if args.share_policy:
                torch.save({
                            'model': actor_critic
                            }, 
                            str(save_dir) + "/agent_model.pt")
            else:
                for agent_id in range(num_agents):                                                  
                    torch.save({
                                'model': actor_critic[agent_id]
                                }, 
                                str(save_dir) + "/agent%i_model" % agent_id + ".pt")

        # log information
        if episode % args.log_interval == 0:
            end = time.time()
            print("\n Scenario {} Algo {} updates {}/{} episodes, total num timesteps {}/{}, FPS {}.\n"
                .format(args.scenario_name,
                        args.algorithm_name,
                        episode, 
                        episodes,
                        total_num_steps,
                        args.num_env_steps,
                        int(total_num_steps / (end - begin))))
            if args.share_policy:
                print("value loss of agent: " + str(value_loss))
            else:
                for agent_id in range(num_agents):
                    print("value loss of agent%i: " % agent_id + str(value_losses[agent_id]))
                
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
    envs.close()
def main():
    args = get_config()

    # seed
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    np.random.seed(args.seed)

    # cuda
    if args.cuda and torch.cuda.is_available():
        device = torch.device("cuda:0")
        torch.set_num_threads(args.n_training_threads)
        if args.cuda_deterministic:
            torch.backends.cudnn.benchmark = False
            torch.backends.cudnn.deterministic = True
    else:
        device = torch.device("cpu")
        torch.set_num_threads(args.n_training_threads)

    # path
    model_dir = Path(
        './results') / args.env_name / args.map_name / args.algorithm_name
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)

    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    save_dir = run_dir / 'models'
    os.makedirs(str(log_dir))
    os.makedirs(str(save_dir))
    logger = SummaryWriter(str(log_dir))

    # env
    envs = make_parallel_env(args)
    if args.eval:
        eval_env = make_eval_env(args)
    num_agents = get_map_params(args.map_name)["n_agents"]
    #Policy network

    if args.share_policy:
        actor_critic = Policy(envs.observation_space[0],
                              envs.action_space[0],
                              num_agents=num_agents,
                              gain=args.gain,
                              base_kwargs={
                                  'naive_recurrent':
                                  args.naive_recurrent_policy,
                                  'recurrent': args.recurrent_policy,
                                  'hidden_size': args.hidden_size,
                                  'recurrent_N': args.recurrent_N,
                                  'attn': args.attn,
                                  'attn_only_critic': args.attn_only_critic,
                                  'attn_size': args.attn_size,
                                  'attn_N': args.attn_N,
                                  'attn_heads': args.attn_heads,
                                  'dropout': args.dropout,
                                  'use_average_pool': args.use_average_pool,
                                  'use_common_layer': args.use_common_layer,
                                  'use_feature_normlization':
                                  args.use_feature_normlization,
                                  'use_feature_popart':
                                  args.use_feature_popart,
                                  'use_orthogonal': args.use_orthogonal,
                                  'layer_N': args.layer_N,
                                  'use_ReLU': args.use_ReLU,
                                  'use_same_dim': args.use_same_dim
                              },
                              device=device)
        actor_critic.to(device)
        # algorithm
        agents = PPO(actor_critic,
                     args.clip_param,
                     args.ppo_epoch,
                     args.num_mini_batch,
                     args.data_chunk_length,
                     args.value_loss_coef,
                     args.entropy_coef,
                     logger,
                     lr=args.lr,
                     eps=args.eps,
                     weight_decay=args.weight_decay,
                     max_grad_norm=args.max_grad_norm,
                     use_max_grad_norm=args.use_max_grad_norm,
                     use_clipped_value_loss=args.use_clipped_value_loss,
                     use_common_layer=args.use_common_layer,
                     use_huber_loss=args.use_huber_loss,
                     huber_delta=args.huber_delta,
                     use_popart=args.use_popart,
                     use_value_high_masks=args.use_value_high_masks,
                     device=device)

        #replay buffer
        rollouts = RolloutStorage(num_agents, args.episode_length,
                                  args.n_rollout_threads,
                                  envs.observation_space[0],
                                  envs.action_space[0], args.hidden_size)
    else:
        actor_critic = []
        agents = []
        for agent_id in range(num_agents):
            ac = Policy(envs.observation_space[0],
                        envs.action_space[0],
                        num_agents=num_agents,
                        gain=args.gain,
                        base_kwargs={
                            'naive_recurrent': args.naive_recurrent_policy,
                            'recurrent': args.recurrent_policy,
                            'hidden_size': args.hidden_size,
                            'recurrent_N': args.recurrent_N,
                            'attn': args.attn,
                            'attn_only_critic': args.attn_only_critic,
                            'attn_size': args.attn_size,
                            'attn_N': args.attn_N,
                            'attn_heads': args.attn_heads,
                            'dropout': args.dropout,
                            'use_average_pool': args.use_average_pool,
                            'use_common_layer': args.use_common_layer,
                            'use_feature_normlization':
                            args.use_feature_normlization,
                            'use_feature_popart': args.use_feature_popart,
                            'use_orthogonal': args.use_orthogonal,
                            'layer_N': args.layer_N,
                            'use_ReLU': args.use_ReLU,
                            'use_same_dim': args.use_same_dim
                        },
                        device=device)
            ac.to(device)
            # algorithm
            agent = PPO(ac,
                        args.clip_param,
                        args.ppo_epoch,
                        args.num_mini_batch,
                        args.data_chunk_length,
                        args.value_loss_coef,
                        args.entropy_coef,
                        logger,
                        lr=args.lr,
                        eps=args.eps,
                        weight_decay=args.weight_decay,
                        max_grad_norm=args.max_grad_norm,
                        use_max_grad_norm=args.use_max_grad_norm,
                        use_clipped_value_loss=args.use_clipped_value_loss,
                        use_common_layer=args.use_common_layer,
                        use_huber_loss=args.use_huber_loss,
                        huber_delta=args.huber_delta,
                        use_popart=args.use_popart,
                        use_value_high_masks=args.use_value_high_masks,
                        device=device)

            actor_critic.append(ac)
            agents.append(agent)

        #replay buffer
        rollouts = RolloutStorage(num_agents, args.episode_length,
                                  args.n_rollout_threads,
                                  envs.observation_space[0],
                                  envs.action_space[0], args.hidden_size)

    # reset env
    obs, available_actions = envs.reset()

    # replay buffer
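    # Centralized-critic input: every agent's observation is flattened into one
    # shared observation and repeated per agent, so the critic can condition on the
    # full joint state while each actor conditions only on its own observation.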
    if len(envs.observation_space[0]) == 3:
        share_obs = obs.reshape(args.n_rollout_threads, -1,
                                envs.observation_space[0][1],
                                envs.observation_space[0][2])
    else:
        share_obs = obs.reshape(args.n_rollout_threads, -1)

    share_obs = np.expand_dims(share_obs, 1).repeat(num_agents, axis=1)
    rollouts.share_obs[0] = share_obs.copy()
    rollouts.obs[0] = obs.copy()
    rollouts.available_actions[0] = available_actions.copy()
    rollouts.recurrent_hidden_states = np.zeros(
        rollouts.recurrent_hidden_states.shape).astype(np.float32)
    rollouts.recurrent_hidden_states_critic = np.zeros(
        rollouts.recurrent_hidden_states_critic.shape).astype(np.float32)

    # run
    start = time.time()
    episodes = int(
        args.num_env_steps) // args.episode_length // args.n_rollout_threads
    timesteps = 0
    last_battles_game = np.zeros(args.n_rollout_threads)
    last_battles_won = np.zeros(args.n_rollout_threads)

    for episode in range(episodes):
        if args.use_linear_lr_decay:  # decrease learning rate linearly
            if args.share_policy:
                update_linear_schedule(agents.optimizer, episode, episodes,
                                       args.lr)
            else:
                for agent_id in range(num_agents):
                    update_linear_schedule(agents[agent_id].optimizer, episode,
                                           episodes, args.lr)

        for step in range(args.episode_length):
            # Sample actions
            values = []
            actions = []
            action_log_probs = []
            recurrent_hidden_statess = []
            recurrent_hidden_statess_critic = []

            with torch.no_grad():
                for agent_id in range(num_agents):
                    if args.share_policy:
                        actor_critic.eval()
                        value, action, action_log_prob, recurrent_hidden_states, recurrent_hidden_states_critic = actor_critic.act(
                            agent_id,
                            torch.tensor(rollouts.share_obs[step, :,
                                                            agent_id]),
                            torch.tensor(rollouts.obs[step, :, agent_id]),
                            torch.tensor(
                                rollouts.recurrent_hidden_states[step, :,
                                                                 agent_id]),
                            torch.tensor(
                                rollouts.recurrent_hidden_states_critic[
                                    step, :, agent_id]),
                            torch.tensor(rollouts.masks[step, :, agent_id]),
                            torch.tensor(rollouts.available_actions[step, :,
                                                                    agent_id]))
                    else:
                        actor_critic[agent_id].eval()
                        value, action, action_log_prob, recurrent_hidden_states, recurrent_hidden_states_critic = actor_critic[
                            agent_id].act(
                                agent_id,
                                torch.tensor(rollouts.share_obs[step, :,
                                                                agent_id]),
                                torch.tensor(rollouts.obs[step, :, agent_id]),
                                torch.tensor(rollouts.recurrent_hidden_states[
                                    step, :, agent_id]),
                                torch.tensor(
                                    rollouts.recurrent_hidden_states_critic[
                                        step, :, agent_id]),
                                torch.tensor(rollouts.masks[step, :,
                                                            agent_id]),
                                torch.tensor(
                                    rollouts.available_actions[step, :,
                                                               agent_id]))

                    values.append(value.detach().cpu().numpy())
                    actions.append(action.detach().cpu().numpy())
                    action_log_probs.append(
                        action_log_prob.detach().cpu().numpy())
                    recurrent_hidden_statess.append(
                        recurrent_hidden_states.detach().cpu().numpy())
                    recurrent_hidden_statess_critic.append(
                        recurrent_hidden_states_critic.detach().cpu().numpy())

            # rearrange action
            actions_env = []
            for i in range(args.n_rollout_threads):
                one_hot_action_env = []
                for agent_id in range(num_agents):
                    one_hot_action = np.zeros(envs.action_space[agent_id].n)
                    one_hot_action[actions[agent_id][i]] = 1
                    one_hot_action_env.append(one_hot_action)
                actions_env.append(one_hot_action_env)

            # Observe reward and next obs
            obs, reward, dones, infos, available_actions = envs.step(
                actions_env)

            # If done then clean the history of observations.
            # insert data in buffer
            masks = []
            for i, done in enumerate(dones):
                mask = []
                for agent_id in range(num_agents):
                    if done:
                        recurrent_hidden_statess[agent_id][i] = np.zeros(
                            args.hidden_size).astype(np.float32)
                        recurrent_hidden_statess_critic[agent_id][
                            i] = np.zeros(args.hidden_size).astype(np.float32)
                        mask.append([0.0])
                    else:
                        mask.append([1.0])
                masks.append(mask)
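            # bad_masks flag time-limit truncations ('bad_transition'), so returns
            # are not bootstrapped through an artificial episode cut; high_masks
            # presumably gate the value loss for transitions that should not count
            # (see use_value_high_masks passed to PPO above).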

            bad_masks = []
            high_masks = []
            for info in infos:
                bad_mask = []
                high_mask = []
                for agent_id in range(num_agents):
                    if info[agent_id]['bad_transition']:
                        bad_mask.append([0.0])
                    else:
                        bad_mask.append([1.0])

                    if info[agent_id]['high_masks']:
                        high_mask.append([1.0])
                    else:
                        high_mask.append([0.0])
                bad_masks.append(bad_mask)
                high_masks.append(high_mask)

            if len(envs.observation_space[0]) == 3:
                share_obs = obs.reshape(args.n_rollout_threads, -1,
                                        envs.observation_space[0][1],
                                        envs.observation_space[0][2])
                share_obs = np.expand_dims(share_obs, 1).repeat(num_agents,
                                                                axis=1)

                rollouts.insert(
                    share_obs, obs,
                    np.array(recurrent_hidden_statess).transpose(1, 0, 2),
                    np.array(recurrent_hidden_statess_critic).transpose(
                        1, 0, 2),
                    np.array(actions).transpose(1, 0, 2),
                    np.array(action_log_probs).transpose(1, 0, 2),
                    np.array(values).transpose(1, 0, 2), reward, masks,
                    bad_masks, high_masks, available_actions)
            else:
                share_obs = obs.reshape(args.n_rollout_threads, -1)
                share_obs = np.expand_dims(share_obs, 1).repeat(num_agents,
                                                                axis=1)

                rollouts.insert(
                    share_obs, obs,
                    np.array(recurrent_hidden_statess).transpose(1, 0, 2),
                    np.array(recurrent_hidden_statess_critic).transpose(
                        1, 0, 2),
                    np.array(actions).transpose(1, 0, 2),
                    np.array(action_log_probs).transpose(1, 0, 2),
                    np.array(values).transpose(1, 0, 2), reward, masks,
                    bad_masks, high_masks, available_actions)

        with torch.no_grad():
            for agent_id in range(num_agents):
                if args.share_policy:
                    actor_critic.eval()
                    next_value, _, _ = actor_critic.get_value(
                        agent_id,
                        torch.tensor(rollouts.share_obs[-1, :, agent_id]),
                        torch.tensor(rollouts.obs[-1, :, agent_id]),
                        torch.tensor(
                            rollouts.recurrent_hidden_states[-1, :, agent_id]),
                        torch.tensor(
                            rollouts.recurrent_hidden_states_critic[-1, :,
                                                                    agent_id]),
                        torch.tensor(rollouts.masks[-1, :, agent_id]))
                    next_value = next_value.detach().cpu().numpy()
                    rollouts.compute_returns(agent_id, next_value,
                                             args.use_gae, args.gamma,
                                             args.gae_lambda,
                                             args.use_proper_time_limits,
                                             args.use_popart,
                                             agents.value_normalizer)
                else:
                    actor_critic[agent_id].eval()
                    next_value, _, _ = actor_critic[agent_id].get_value(
                        agent_id,
                        torch.tensor(rollouts.share_obs[-1, :, agent_id]),
                        torch.tensor(rollouts.obs[-1, :, agent_id]),
                        torch.tensor(
                            rollouts.recurrent_hidden_states[-1, :, agent_id]),
                        torch.tensor(
                            rollouts.recurrent_hidden_states_critic[-1, :,
                                                                    agent_id]),
                        torch.tensor(rollouts.masks[-1, :, agent_id]))
                    next_value = next_value.detach().cpu().numpy()
                    rollouts.compute_returns(agent_id, next_value,
                                             args.use_gae, args.gamma,
                                             args.gae_lambda,
                                             args.use_proper_time_limits,
                                             args.use_popart,
                                             agents[agent_id].value_normalizer)

        # update the network
        if args.share_policy:
            actor_critic.train()
            value_loss, action_loss, dist_entropy = agents.update_share(
                num_agents, rollouts)

            logger.add_scalars('reward', {'reward': np.mean(rollouts.rewards)},
                               (episode + 1) * args.episode_length *
                               args.n_rollout_threads)
        else:
            value_losses = []
            action_losses = []
            dist_entropies = []

            for agent_id in range(num_agents):
                actor_critic[agent_id].train()
                value_loss, action_loss, dist_entropy = agents[
                    agent_id].update(agent_id, rollouts)
                value_losses.append(value_loss)
                action_losses.append(action_loss)
                dist_entropies.append(dist_entropy)

                logger.add_scalars(
                    'agent%i/reward' % agent_id,
                    {'reward': np.mean(rollouts.rewards[:, :, agent_id])},
                    (episode + 1) * args.episode_length *
                    args.n_rollout_threads)

        # clean the buffer and reset
        rollouts.after_update()

        total_num_steps = (episode +
                           1) * args.episode_length * args.n_rollout_threads

        if (episode % args.save_interval == 0 or episode == episodes -
                1):  # save for every interval-th episode or for the last epoch
            if args.share_policy:
                torch.save({'model': actor_critic},
                           str(save_dir) + "/agent_model.pt")
            else:
                for agent_id in range(num_agents):
                    torch.save({'model': actor_critic[agent_id]},
                               str(save_dir) + "/agent%i_model" % agent_id +
                               ".pt")

        # log information
        if episode % args.log_interval == 0:
            end = time.time()
            print(
                "\n Map {} Algo {} updates {}/{} episodes, total num timesteps {}/{}, FPS {}.\n"
                .format(args.map_name, args.algorithm_name, episode, episodes,
                        total_num_steps, args.num_env_steps,
                        int(total_num_steps / (end - start))))
            if args.share_policy:
                print("value loss of agent: " + str(value_loss))
            else:
                for agent_id in range(num_agents):
                    print("value loss of agent%i: " % agent_id +
                          str(value_losses[agent_id]))

            if args.env_name == "StarCraft2":
                battles_won = []
                battles_game = []
                incre_battles_won = []
                incre_battles_game = []

                for i, info in enumerate(infos):
                    if 'battles_won' in info[0].keys():
                        battles_won.append(info[0]['battles_won'])
                        incre_battles_won.append(info[0]['battles_won'] -
                                                 last_battles_won[i])
                    if 'battles_game' in info[0].keys():
                        battles_game.append(info[0]['battles_game'])
                        incre_battles_game.append(info[0]['battles_game'] -
                                                  last_battles_game[i])

                if np.sum(incre_battles_game) > 0:
                    logger.add_scalars(
                        'incre_win_rate', {
                            'incre_win_rate':
                            np.sum(incre_battles_won) /
                            np.sum(incre_battles_game)
                        }, total_num_steps)
                else:
                    logger.add_scalars('incre_win_rate', {'incre_win_rate': 0},
                                       total_num_steps)
                last_battles_game = battles_game
                last_battles_won = battles_won

        if episode % args.eval_interval == 0 and args.eval:
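            # Greedy evaluation on a separate single environment: run
            # args.eval_episodes episodes with deterministic actions and log the
            # resulting win rate against total_num_steps.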
            eval_battles_won = 0
            eval_episode = 0
            eval_obs, eval_available_actions = eval_env.reset()
            eval_share_obs = eval_obs.reshape(1, -1)
            eval_recurrent_hidden_states = np.zeros(
                (1, num_agents, args.hidden_size)).astype(np.float32)
            eval_recurrent_hidden_states_critic = np.zeros(
                (1, num_agents, args.hidden_size)).astype(np.float32)
            eval_masks = np.ones((1, num_agents, 1)).astype(np.float32)

            while True:
                eval_actions = []
                for agent_id in range(num_agents):
                    if args.share_policy:
                        actor_critic.eval()
                        _, action, _, recurrent_hidden_states, recurrent_hidden_states_critic = actor_critic.act(
                            agent_id,
                            torch.tensor(eval_share_obs),
                            torch.tensor(eval_obs[:, agent_id]),
                            torch.tensor(
                                eval_recurrent_hidden_states[:, agent_id]),
                            torch.tensor(
                                eval_recurrent_hidden_states_critic[:,
                                                                    agent_id]),
                            torch.tensor(eval_masks[:, agent_id]),
                            torch.tensor(eval_available_actions[:,
                                                                agent_id, :]),
                            deterministic=True)
                    else:
                        actor_critic[agent_id].eval()
                        _, action, _, recurrent_hidden_states, recurrent_hidden_states_critic = actor_critic[
                            agent_id].act(
                                agent_id,
                                torch.tensor(eval_share_obs),
                                torch.tensor(eval_obs[:, agent_id]),
                                torch.tensor(
                                    eval_recurrent_hidden_states[:, agent_id]),
                                torch.tensor(
                                    eval_recurrent_hidden_states_critic[:,
                                                                        agent_id]
                                ),
                                torch.tensor(eval_masks[:, agent_id]),
                                torch.tensor(
                                    eval_available_actions[:, agent_id, :]),
                                deterministic=True)

                    eval_actions.append(action.detach().cpu().numpy())
                    eval_recurrent_hidden_states[:,
                                                 agent_id] = recurrent_hidden_states.detach(
                                                 ).cpu().numpy()
                    eval_recurrent_hidden_states_critic[:,
                                                        agent_id] = recurrent_hidden_states_critic.detach(
                                                        ).cpu().numpy()

                # rearrange action
                eval_actions_env = []
                for agent_id in range(num_agents):
                    one_hot_action = np.zeros(
                        eval_env.action_space[agent_id].n)
                    one_hot_action[eval_actions[agent_id][0]] = 1
                    eval_actions_env.append(one_hot_action)

                # Observe reward and next obs
                eval_obs, eval_rewards, eval_dones, eval_infos, eval_available_actions = eval_env.step(
                    [eval_actions_env])
                eval_share_obs = eval_obs.reshape(1, -1)

                if eval_dones[0]:
                    eval_episode += 1
                    if eval_infos[0][0]['won']:
                        eval_battles_won += 1
                    for agent_id in range(num_agents):
                        eval_recurrent_hidden_states[0][agent_id] = np.zeros(
                            args.hidden_size).astype(np.float32)
                        eval_recurrent_hidden_states_critic[0][
                            agent_id] = np.zeros(args.hidden_size).astype(
                                np.float32)
                        eval_masks[0][agent_id] = 0.0
                else:
                    for agent_id in range(num_agents):
                        eval_masks[0][agent_id] = 1.0

                if eval_episode >= args.eval_episodes:
                    logger.add_scalars(
                        'eval_win_rate',
                        {'eval_win_rate': eval_battles_won / eval_episode},
                        total_num_steps)
                    break

    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
    envs.close()
    if args.eval:
        eval_env.close()
Exemple #9
0
def train(env, save_dir: str, epochs: int, update_epochs: int, agents: int,
          trajectory_steps: int, render: bool) -> object:
    reset_dir(save_dir)

    policy = MLPGaussianPolicy(
        env.observation_space.shape[0],
        env.action_space.shape[0],
        hidden_layers=[3, 2],
        action_high=env.action_space.high,
        action_low=env.action_space.low,
    )
    value_fn = MLPValueFn(env.observation_space.shape[0], hidden_layers=[3, 2])
    rl = PPO(policy, value_fn, update_epochs=update_epochs)

    reward_log = DataCSVSaver(
        os.path.join(save_dir, "distance.txt"),
        ("epoch", "averaged reward"))  # log epoch development of reward
    loss_log = DataCSVSaver(
        os.path.join(save_dir, "loss.txt"), ("epoch", "iter", "loss")
    )  # log loss transition to check whether network update is carried out properly

    e = 0
    # training
    while e < epochs:
        try:
            print("Epoch {} ......".format(e))
            rl.reset_sample_buffers()

            average_rewards = []

            # sampling
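            # Each of the `agents` workers rolls out one on-policy trajectory of at
            # most trajectory_steps steps; samples are fed into the PPO buffers and
            # per-trajectory mean rewards are kept for logging.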
            print("  sampling...")
            for n in range(agents):
                # buffer to save samples from trajectory
                observations_list = []
                actions_list = []
                rewards_list = []

                # init
                obs = env.reset()
                observations_list.append(obs)

                # run a trajectory
                for t in range(trajectory_steps):
                    action = rl.act(obs)
                    obs, r, done, _ = env.step(action)
                    # save a sample
                    actions_list.append(action)
                    observations_list.append(obs)
                    rewards_list.append(r)
                    if done:
                        break

                rl.feed_trajectory_samples(observations_list, actions_list,
                                           rewards_list, done)

                print(
                    "    agent {}: run for {} steps, average reward {}".format(
                        n, t, np.mean(rewards_list)))
                average_rewards.append(np.mean(rewards_list))

            # update parameters of policy and state value function
            print("  updating...")
            update_epoch_losses = rl.update()

            # logging
            reward_log.append_data(
                e, np.mean(average_rewards))  # save averaged reward
            for i, loss in enumerate(update_epoch_losses):
                loss_log.append_data(e, i,
                                     loss)  # save loss of each update epoch

            print("  average reward {}".format(np.mean(average_rewards)))
            e += 1

        except KeyboardInterrupt:
            command = input("\nSample? Finish? : ")
            if command in ["sample", "Sample"]:
                # run for X steps
                sample_steps = input("How many steps for this sample?: ")
                if sample_steps == "":
                    sample_steps = trajectory_steps
                    print("default steps {}".format(sample_steps))
                else:
                    sample_steps = int(sample_steps)

                obs = env.reset()
                acc_r = 0
                for t in range(sample_steps):
                    if render:
                        env.render()
                    action = rl.act(obs)
                    obs, r, done, _ = env.step(action)
                    acc_r += r
                    if done:
                        break
                print("  sampled {} steps, accumulated reward {}".format(
                    t + 1, acc_r))

                continue
            if command in ["finish", "Finish"]:
                print("Ending training ...")
                break

    print("Finish training. Saving the policy and value_fn in {}".format(
        save_dir))
    rl.save(save_dir)
    return rl.policy
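
A minimal usage sketch for the `train` function above (not part of the original example), assuming a classic Gym-style continuous-control environment with the old `reset`/`step` API used in the code; the environment id, directory, and hyperparameter values are placeholders, and `MLPGaussianPolicy`, `MLPValueFn`, `PPO`, and `DataCSVSaver` are assumed to be provided by the surrounding project:

import gym

if __name__ == "__main__":
    # any continuous-action environment works; "Pendulum-v0" is only an example
    env = gym.make("Pendulum-v0")
    trained_policy = train(
        env,
        save_dir="./ppo_run",   # placeholder output directory
        epochs=100,             # placeholder hyperparameters
        update_epochs=10,
        agents=4,
        trajectory_steps=200,
        render=False,
    )
    env.close()
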
Exemple #10
0
def main():
    args = get_config()

    # cuda
    if args.cuda and torch.cuda.is_available():
        device = torch.device("cuda:0")
        torch.set_num_threads(args.n_training_threads)
        if args.cuda_deterministic:
            torch.backends.cudnn.benchmark = False
            torch.backends.cudnn.deterministic = True
    else:
        device = torch.device("cpu")
        torch.set_num_threads(args.n_training_threads)

    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    args.reward_randomization = False

    # path
    model_dir = Path('./results') / args.env_name / args.algorithm_name / (
        "run" + str(args.seed))

    run_dir = model_dir / 'finetune'
    log_dir = run_dir / 'logs'
    save_dir = run_dir / 'models'
    os.makedirs(str(log_dir))
    os.makedirs(str(save_dir))
    logger = SummaryWriter(str(log_dir))

    # env
    envs = make_parallel_env(args)
    # Policy network: load pretrained actor-critic models for finetuning
    actor_critic = []
    if args.share_policy:
        ac = torch.load(str(model_dir / 'models') +
                        "/agent0_model.pt")['model'].to(device)
        for i in range(args.num_agents):
            actor_critic.append(ac)
    else:
        for i in range(args.num_agents):
            ac = torch.load(
                str(model_dir / 'models') + "/agent%i_model" % i +
                ".pt")['model'].to(device)
            actor_critic.append(ac)
    agents = []
    rollouts = []
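    # one PPO trainer and one rollout buffer per agent; with share_policy the
    # same loaded network object is reused for every agent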
    for agent_id in range(args.num_agents):
        # algorithm
        agent = PPO(actor_critic[agent_id],
                    agent_id,
                    args.clip_param,
                    args.ppo_epoch,
                    args.num_mini_batch,
                    args.data_chunk_length,
                    args.value_loss_coef,
                    args.entropy_coef,
                    logger,
                    lr=args.lr,
                    eps=args.eps,
                    max_grad_norm=args.max_grad_norm,
                    use_clipped_value_loss=args.use_clipped_value_loss)

        #replay buffer
        ro = RolloutStorage(args.num_agents, agent_id, args.episode_length,
                            args.n_rollout_threads,
                            envs.observation_space[agent_id],
                            envs.action_space[agent_id],
                            actor_critic[agent_id].recurrent_hidden_state_size)

        agents.append(agent)
        rollouts.append(ro)

    # reset env
    obs = envs.reset()
    # initialize each agent's rollout buffer with the first observation and zeroed recurrent states
    for i in range(args.num_agents):
        if len(envs.observation_space[0]) == 1:
            rollouts[i].share_obs[0].copy_(
                torch.tensor(obs.reshape(args.n_rollout_threads, -1)))
            rollouts[i].obs[0].copy_(torch.tensor(obs[:, i, :]))
            rollouts[i].recurrent_hidden_states.zero_()
            rollouts[i].recurrent_hidden_states_critic.zero_()
            rollouts[i].recurrent_c_states.zero_()
            rollouts[i].recurrent_c_states_critic.zero_()
        else:
            raise NotImplementedError
        rollouts[i].to(device)

    # run
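    # statistics buffers; the ones relevant to the chosen environment are
    # filled from `infos` and written to TensorBoard in the logging branches
    # of the episode loop below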
    coop_num = []
    defect_num = []
    coopdefect_num = []
    defectcoop_num = []
    gore1_num = []
    gore2_num = []
    collective_return = []
    apple_consumption = []
    waste_cleared = []
    sustainability = []
    fire = []

    start = time.time()
    episodes = int(
        args.num_env_steps) // args.episode_length // args.n_rollout_threads
    all_episode = 0
    turn_on = False

    for episode in range(episodes):
        if (episode > episodes / 2) and not turn_on:
            print("\n Turn off fixed actor...")
            # after the first half of training the actor is no longer kept
            # fixed; the flag is passed to each agent's update() below
            turn_on = True

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            for i in range(args.num_agents):
                update_linear_schedule(agents[i].optimizer, episode, episodes,
                                       args.lr)

        for step in range(args.episode_length):
            # Sample actions
            values = []
            actions = []
            action_log_probs = []
            recurrent_hidden_statess = []
            recurrent_hidden_statess_critic = []
            recurrent_c_statess = []
            recurrent_c_statess_critic = []

            with torch.no_grad():
                for i in range(args.num_agents):
                    (value, action, action_log_prob, recurrent_hidden_states,
                     recurrent_hidden_states_critic, recurrent_c_states,
                     recurrent_c_states_critic) = actor_critic[i].act(
                         rollouts[i].share_obs[step], rollouts[i].obs[step],
                         rollouts[i].recurrent_hidden_states[step],
                         rollouts[i].recurrent_hidden_states_critic[step],
                         rollouts[i].recurrent_c_states[step],
                         rollouts[i].recurrent_c_states_critic[step],
                         rollouts[i].masks[step])
                    values.append(value)
                    actions.append(action)
                    action_log_probs.append(action_log_prob)
                    recurrent_hidden_statess.append(recurrent_hidden_states)
                    recurrent_hidden_statess_critic.append(
                        recurrent_hidden_states_critic)
                    recurrent_c_statess.append(recurrent_c_states)
                    recurrent_c_statess_critic.append(
                        recurrent_c_states_critic)

            # rearrange actions: convert each agent's discrete action index
            # into a one-hot vector per rollout thread before stepping the env
            actions_env = []
            for i in range(args.n_rollout_threads):
                one_hot_action_env = []
                for k in range(args.num_agents):
                    one_hot_action = np.zeros(envs.action_space[0].n)
                    one_hot_action[actions[k][i]] = 1
                    one_hot_action_env.append(one_hot_action)
                actions_env.append(one_hot_action_env)

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(actions_env)

            # Build masks before inserting into the buffers: 0.0 marks the end
            # of an agent's episode, 1.0 otherwise; bad_masks stays at 1.0
            # because time-limit terminations are not distinguished here.
            masks = []
            bad_masks = []
            for i in range(args.num_agents):
                mask = []
                bad_mask = []
                for done_ in done:
                    if done_[i]:
                        mask.append([0.0])
                        bad_mask.append([1.0])
                    else:
                        mask.append([1.0])
                        bad_mask.append([1.0])
                masks.append(torch.FloatTensor(mask))
                bad_masks.append(torch.FloatTensor(bad_mask))

            for i in range(args.num_agents):
                if len(envs.observation_space[0]) == 1:
                    rollouts[i].insert(
                        torch.tensor(obs.reshape(args.n_rollout_threads, -1)),
                        torch.tensor(obs[:, i, :]),
                        recurrent_hidden_statess[i],
                        recurrent_hidden_statess_critic[i],
                        recurrent_c_statess[i],
                        recurrent_c_statess_critic[i],
                        actions[i], action_log_probs[i], values[i],
                        torch.tensor(reward[:, i].reshape(-1, 1)),
                        masks[i], bad_masks[i])
                else:
                    raise NotImplementedError

        with torch.no_grad():
            next_values = []
            for i in range(args.num_agents):
                next_value = actor_critic[i].get_value(
                    rollouts[i].share_obs[-1], rollouts[i].obs[-1],
                    rollouts[i].recurrent_hidden_states[-1],
                    rollouts[i].recurrent_hidden_states_critic[-1],
                    rollouts[i].recurrent_c_states[-1],
                    rollouts[i].recurrent_c_states_critic[-1],
                    rollouts[i].masks[-1]).detach()
                next_values.append(next_value)

        for i in range(args.num_agents):
            rollouts[i].compute_returns(next_values[i], args.use_gae,
                                        args.gamma, args.gae_lambda,
                                        args.use_proper_time_limits)

        # update the network
        value_losses = []
        action_losses = []
        dist_entropies = []
        for i in range(args.num_agents):
            value_loss, action_loss, dist_entropy = agents[i].update(
                rollouts[i], turn_on)
            value_losses.append(value_loss)
            action_losses.append(action_loss)
            dist_entropies.append(dist_entropy)

        if args.env_name == "StagHunt":
            for info in infos:
                if 'coop&coop_num' in info.keys():
                    coop_num.append(info['coop&coop_num'])
                if 'defect&defect_num' in info.keys():
                    defect_num.append(info['defect&defect_num'])
                if 'coop&defect_num' in info.keys():
                    coopdefect_num.append(info['coop&defect_num'])
                if 'defect&coop_num' in info.keys():
                    defectcoop_num.append(info['defect&coop_num'])

            for i in range(args.n_rollout_threads):
                logger.add_scalars(
                    'coop&coop_num_per_episode',
                    {'coop&coop_num_per_episode': coop_num[all_episode]},
                    all_episode)
                logger.add_scalars(
                    'defect&defect_num_per_episode',
                    {'defect&defect_num_per_episode': defect_num[all_episode]},
                    all_episode)
                logger.add_scalars('coop&defect_num_per_episode', {
                    'coop&defect_num_per_episode':
                    coopdefect_num[all_episode]
                }, all_episode)
                logger.add_scalars('defect&coop_num_per_episode', {
                    'defect&coop_num_per_episode':
                    defectcoop_num[all_episode]
                }, all_episode)
                all_episode += 1
        elif args.env_name == "StagHuntGW":
            for info in infos:
                if 'collective_return' in info.keys():
                    collective_return.append(info['collective_return'])
                if 'coop&coop_num' in info.keys():
                    coop_num.append(info['coop&coop_num'])
                if 'gore1_num' in info.keys():
                    gore1_num.append(info['gore1_num'])
                if 'gore2_num' in info.keys():
                    gore2_num.append(info['gore2_num'])

            for i in range(args.n_rollout_threads):
                logger.add_scalars(
                    'collective_return',
                    {'collective_return': collective_return[all_episode]},
                    all_episode)
                logger.add_scalars(
                    'coop&coop_num_per_episode',
                    {'coop&coop_num_per_episode': coop_num[all_episode]},
                    all_episode)
                logger.add_scalars(
                    'gore1_num_per_episode',
                    {'gore1_num_per_episode': gore1_num[all_episode]},
                    all_episode)
                logger.add_scalars(
                    'gore2_num_per_episode',
                    {'gore2_num_per_episode': gore2_num[all_episode]},
                    all_episode)

                all_episode += 1

        # clean the buffer and reset
        obs = envs.reset()
        for i in range(args.num_agents):
            if len(envs.observation_space[0]) == 1:
                rollouts[i].share_obs[0].copy_(
                    torch.tensor(obs.reshape(args.n_rollout_threads, -1)))
                rollouts[i].obs[0].copy_(torch.tensor(obs[:, i, :]))
                rollouts[i].recurrent_hidden_states.zero_()
                rollouts[i].recurrent_hidden_states_critic.zero_()
                rollouts[i].recurrent_c_states.zero_()
                rollouts[i].recurrent_c_states_critic.zero_()
            else:
                raise NotImplementedError
            rollouts[i].to(device)

        for i in range(args.num_agents):
            # save for every interval-th episode and for the last episode
            if (episode % args.save_interval == 0 or episode == episodes - 1):
                torch.save({'model': actor_critic[i]},
                           str(save_dir) + "/agent%i_model" % i + ".pt")

        # log information
        if episode % args.log_interval == 0:
            total_num_steps = (
                episode + 1) * args.episode_length * args.n_rollout_threads
            end = time.time()
            print(
                "\n updates {}/{} episodes, total num timesteps {}/{}, FPS {}.\n"
                .format(episode, episodes, total_num_steps, args.num_env_steps,
                        int(total_num_steps / (end - start))))
            for i in range(args.num_agents):
                print("value loss of agent%i: " % i + str(value_losses[i]))
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()