Example #1
def run(config):
    model_path = (Path('./models') / config.env_id / config.model_name /
                  ('run%i' % config.run_num))
    if config.incremental is not None:
        model_path = model_path / 'incremental' / ('model_ep%i.pt' %
                                                   config.incremental)
    else:
        model_path = model_path / 'model.pt'

    if config.save_gifs:
        gif_path = model_path.parent / 'gifs'
        gif_path.mkdir(exist_ok=True)

    model = AttentionSAC.init_from_save(model_path)
    env = make_env(config.env_id, discrete_action=True)
    model.prep_rollouts(device='cpu')
    ifi = 1 / config.fps  # inter-frame interval

    for ep_i in range(config.n_episodes):
        print("Episode %i of %i" % (ep_i + 1, config.n_episodes))
        obs = env.reset()
        if config.save_gifs:
            frames = []
            frames.append(env.render('rgb_array')[0])
        env.render('human')
        for t_i in range(config.episode_length):
            calc_start = time.time()
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(obs[i]).view(1, -1), requires_grad=False)
                for i in range(model.nagents)
            ]
            # get actions as torch Variables
            torch_actions = model.step(torch_obs, explore=False)
            # convert actions to numpy arrays
            actions = [ac.data.numpy().flatten() for ac in torch_actions]
            obs, rewards, dones, infos = env.step(actions)
            if config.save_gifs:
                frames.append(env.render('rgb_array')[0])
            calc_end = time.time()
            elapsed = calc_end - calc_start
            if elapsed < ifi:
                time.sleep(ifi - elapsed)
            env.render('human')
        if config.save_gifs:
            gif_num = 0
            while (gif_path / ('%i_%i.gif' % (gif_num, ep_i))).exists():
                gif_num += 1
            imageio.mimsave(str(gif_path / ('%i_%i.gif' % (gif_num, ep_i))),
                            frames,
                            duration=ifi)

    env.close()
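
The config object passed to run() above is typically built from command-line arguments. Below is a minimal sketch of such a config; the attribute names are exactly those read by run(), while the concrete values (scenario, fps, episode counts) are illustrative assumptions rather than the project's defaults.

from types import SimpleNamespace

# Hypothetical config for Example #1; every attribute name appears in run() above,
# the values are placeholders only.
config = SimpleNamespace(
    env_id='simple_spread',   # assumed MPE scenario name
    model_name='MAAC',        # assumed model directory name
    run_num=1,
    incremental=None,         # or an int N to load incremental/model_epN.pt
    save_gifs=False,
    fps=30,
    n_episodes=10,
    episode_length=25,
)
run(config)
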
Example #2
File: simulate.py  Project: Makiah/MAAC
def run(model_name: str):
    model_path, run_num, run_dir, log_dir = run_setup(model_name, get_latest_model=True)

    if model_path is None:
        print("Couldn't find model!")
        return

    model = AttentionSAC.init_from_save(model_path)

    model.prep_rollouts(device='cpu')

    run_env: HaliteRunHelper = HaliteRunHelper()

    run_env.simulate(lambda o: model.step(o, explore=True), agent_count=2)
Example #3
def run(config):
    env = football_env.create_environment(
        env_name=config["academy_scenario"],
        rewards=config["scoring"],
        render=config["render_mode"],
        number_of_left_players_agent_controls=config["num_to_control"],
        representation='raw')

    model = AttentionSAC.init_from_save(
        "./models/football/MAAC3/run2/model.pt", True)
    # (** EDITED **) Set Replay Buffer
    # set up the replay buffer by iterating over the shapes of env.action_space and env.observation_space

    for ep_i in range(0, config["n_episodes"], config["n_rollout_threads"]):
        obs = env.reset()
        obs = make_state(obs)
        model.prep_rollouts(device='cpu')

        for et_i in range(config["episode_length"]):
            print("episode : {} | step : {}".format(ep_i, et_i), end='\r')
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(np.vstack(obs[:, i])),
                         requires_grad=False) for i in range(model.nagents)
            ]
            # get actions as torch Variables
            torch_agent_actions = model.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config["n_rollout_threads"])]

            # Reformat the action list to fit the Football env:
            # Google Football expects integer action indices, not one-hot encoded action lists
            actions_list = [[np.argmax(b) for b in a] for a in actions]

            # Step
            next_obs, rewards, dones, infos = env.step(actions_list)
            next_obs = make_state(next_obs)

            # Prevent divergence: without this small offset, training diverges (NaN)
            rewards = rewards - 0.000001

            # Reformat the done-flag list so it fits the replay buffer
            obs = next_obs

    env.close()
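
Example #3 reads its settings from a plain dict. A minimal sketch of that dict is shown below; the keys are exactly the ones looked up in run() above, while the values (scenario name, reward definition, player count) are illustrative assumptions.

# Hypothetical config dict for Example #3; only the keys are grounded in the code above.
config = {
    "academy_scenario": "academy_empty_goal_close",  # assumed gfootball scenario
    "scoring": "scoring",                            # assumed reward setting
    "render_mode": False,
    "num_to_control": 3,                             # assumed number of controlled left players
    "n_episodes": 100,
    "n_rollout_threads": 1,
    "episode_length": 300,
}
run(config)
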
Example #4
def run(config):
    model_path = (Path('./models') / config.env_id / config.model_name /
                  ('run%i' % config.run_num))
    if config.incremental is not None:
        model_path = model_path / 'incremental' / ('model_ep%i.pt' %
                                                   config.incremental)
    else:
        model_path = model_path / 'model.pt'

    maac = AttentionSAC.init_from_save(model_path)
    env = MultiAgentEnv(config.env_id, config.n_controlled_lagents,
                        config.n_controlled_ragents, config.reward_type,
                        config.render)
    maac.prep_rollouts(device='cpu')

    goal_diff = 0

    for ep_i in range(config.n_episodes):
        print("Episode %i of %i" % (ep_i + 1, config.n_episodes))
        obs = env.reset()
        for t_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(obs[i]).view(1, -1), requires_grad=False)
                for i in range(maac.nagents)
            ]
            # get actions as torch Variables
            torch_actions = maac.step(torch_obs, explore=False)
            # convert actions to numpy arrays
            actions = [ac.data.numpy().flatten() for ac in torch_actions]
            obs, rewards, dones, infos = env.step(actions)
            if all(dones):
                goal_diff += np.sum(rewards) / (config.n_controlled_lagents +
                                                config.n_controlled_ragents)
                break
    goal_diff /= config.n_episodes
    print(goal_diff)
    env.close()
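
As in Example #1, the config for Example #4 is an attribute-style object. A minimal sketch, with attribute names taken from the code above and values that are only assumptions:

from types import SimpleNamespace

# Hypothetical config for Example #4; only the attribute names are grounded in the code above.
config = SimpleNamespace(
    env_id='football',          # assumed environment id
    model_name='MAAC',
    run_num=1,
    incremental=None,
    n_controlled_lagents=3,     # assumed number of controlled left agents
    n_controlled_ragents=0,     # assumed number of controlled right agents
    reward_type='scoring',      # assumed reward type
    render=False,
    n_episodes=10,
    episode_length=300,
)
run(config)
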
Example #5
def run(config):
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        run_num = 1
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        if len(exst_run_nums) == 0:
            run_num = 1
        else:
            run_num = max(exst_run_nums) + 1
    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))

    torch.manual_seed(1804)
    np.random.seed(1804)
    # initialize E parallel environments with N agents
    env = make_parallel_env(config.env_id, config.n_rollout_threads, 1804)
    model = AttentionSAC.init_from_save('model.pt')
    # model = AttentionSAC.init_from_env(env,
    #                                    tau=config.tau,
    #                                    pi_lr=config.pi_lr,
    #                                    q_lr=config.q_lr,
    #                                    gamma=config.gamma,
    #                                    pol_hidden_dim=config.pol_hidden_dim,
    #                                    critic_hidden_dim=config.critic_hidden_dim,
    #                                    attend_heads=config.attend_heads,
    #                                    reward_scale=config.reward_scale)
    # initialize replay buffer D
    replay_buffer = ReplayBuffer(
        config.buffer_length, model.nagents,
        [obsp.shape[0] for obsp in env.observation_space], [
            acsp.shape[0] if isinstance(acsp, Box) else acsp.n
            for acsp in env.action_space
        ])

    # T_update
    t = 0
    max_step = 0
    max_time = 0
    total_step = np.zeros(model.nagents)
    total_time = np.zeros(model.nagents)
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print(
            "Episodes %i-%i of %i" %
            (ep_i + 1, ep_i + 1 + config.n_rollout_threads, config.n_episodes))
        obs = env.reset()
        model.prep_rollouts(device='cpu')

        success = np.zeros((config.n_rollout_threads, model.nagents),
                           dtype=bool)
        steps = np.zeros((config.n_rollout_threads, model.nagents))
        time_cost = np.zeros((config.n_rollout_threads, model.nagents))
        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(np.vstack(obs[:, i])),
                         requires_grad=False) for i in range(model.nagents)
            ]

            start = time.perf_counter()  # time.clock() was removed in Python 3.8
            # get actions as torch Variables
            torch_agent_actions = model.step(torch_obs, explore=False)
            end = time.perf_counter()
            per_time_cost = end - start

            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)

            # calculate steps
            success = np.logical_or(success, dones)
            # steps += dones
            steps += np.logical_not(dones)
            time_cost += np.logical_not(dones) * per_time_cost

            # store transitions for all env in replay buffer
            # replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs

            # T_update = T_update + E
            t += config.n_rollout_threads

            # if (len(replay_buffer) >= max(config.pi_batch_size, config.q_batch_size) and
            #     (t % config.steps_per_update) < config.n_rollout_threads):
            #     if config.use_gpu:
            #         model.prep_training(device='gpu')
            #     else:
            #         model.prep_training(device='cpu')
            #     for u_i in range(config.num_critic_updates):
            #         sample = replay_buffer.sample(config.q_batch_size,
            #                                       to_gpu=config.use_gpu)
            #         model.update_critic(sample, logger=logger)
            #     for u_i in range(config.num_pol_updates):
            #         sample = replay_buffer.sample(config.pi_batch_size,
            #                                       to_gpu=config.use_gpu)
            #         model.update_policies(sample, logger=logger)
            #     model.update_all_targets()
            #     # for u_i in range(config.num_updates):
            #     #     sample = replay_buffer.sample(config.batch_size,
            #     #                                   to_gpu=config.use_gpu)
            #     #     model.update_critic(sample, logger=logger)
            #     #     model.update_policies(sample, logger=logger)
            #     #     model.update_all_targets()
            model.prep_rollouts(device='cpu')

        # ep_dones = np.mean(success, axis=0)
        # ep_steps = 1 - np.mean(steps / config.episode_length, axis=0)
        # ep_mean_step

        # ep_rews = replay_buffer.get_average_rewards(
        #     config.episode_length * config.n_rollout_threads)
        # for a_i, a_ep_rew in enumerate(ep_rews):
        #     logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew, ep_i)
        # for a_i, a_ep_done in enumerate(ep_dones):
        # logger.add_scalar('agent%i/mean_episode_dones' % a_i, a_ep_done, ep_i)
        # for a_i, a_ep_step in enumerate(ep_steps):
        # logger.add_scalar('agent%i/mean_episode_steps' % a_i, a_ep_step, ep_i)

        total_step += np.mean(steps, axis=0)
        total_time += np.mean(time_cost, axis=0)

        max_step += np.max(steps)
        max_time += np.max(time_cost)

        if ep_i % config.save_interval < config.n_rollout_threads:
            model.prep_rollouts(device='cpu')
            # os.makedirs(run_dir / 'incremental', exist_ok=True)
            # model.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
            # model.save(run_dir / 'model.pt')

    mean_step = total_step / (100 / config.n_rollout_threads)
    mean_time = total_time / (100 / config.n_rollout_threads)
    max_time /= 100 / config.n_rollout_threads
    max_step /= 100 / config.n_rollout_threads

    print('; '.join([
        f'{chr(65 + i)} Mean Step:{mean_step[i]}, Mean Time:{mean_time[i]}'
        for i in range(model.nagents)
    ]))
    print('Mean Max Step:{}, Mean Max Time Cost:{}'.format(max_step, max_time))
    # model.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
Example #6
def run(config):
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        run_num = 1
    else:
        exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in
                         model_dir.iterdir() if
                         str(folder.name).startswith('run')]
        if len(exst_run_nums) == 0:
            run_num = 1
        else:
            run_num = max(exst_run_nums) + 1
    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))

    torch.manual_seed(run_num)
    np.random.seed(run_num)
    env = make_parallel_env(config.env_id, config.n_rollout_threads, run_num)
    # model = AttentionSAC.init_from_env(env,
    #                                    tau=config.tau,
    #                                    pi_lr=config.pi_lr,
    #                                    q_lr=config.q_lr,
    #                                    gamma=config.gamma,
    #                                    pol_hidden_dim=config.pol_hidden_dim,
    #                                    critic_hidden_dim=config.critic_hidden_dim,
    #                                    attend_heads=config.attend_heads,
    #                                    reward_scale=config.reward_scale)

    # Model used to test with adversarial agent 
    # model= AttentionSAC.init_from_save ("C:\\Users\\HP\\Desktop\\NTU\\FYP\\FYP Code\\MAAC\\Output\\run140\\model.pt")
    # print("Model instantiated")

    # Model used to test without adversarial agent 
    model = AttentionSAC.init_from_save("C:\\Users\\HP\\Desktop\\NTU\\FYP\\FYP Code\\MAAC\\Output\\run148\\model.pt")
    print("Model instantiated")

    replay_buffer = ReplayBuffer(config.buffer_length, model.nagents,
                                 [obsp.shape[0] for obsp in env.observation_space],
                                 [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
                                  for acsp in env.action_space])
    t = 0

    row_list = []

    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print("Episodes %i-%i of %i" % (ep_i + 1,
                                        ep_i + 1 + config.n_rollout_threads,
                                        config.n_episodes))
        obs = env.reset()
        model.prep_rollouts(device='cpu')

        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, i])),
                                  requires_grad=False)
                         for i in range(model.nagents)]
            # get actions as torch Variables
            torch_agent_actions = model.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions] for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)
            # print (rewards)
            # print (dones[0])
            # env.render('human')
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                (t % config.steps_per_update) < config.n_rollout_threads):
                if config.use_gpu:
                    model.prep_training(device='gpu')
                else:
                    model.prep_training(device='cpu')
                for u_i in range(config.num_updates):
                    sample = replay_buffer.sample(config.batch_size,
                                                  to_gpu=config.use_gpu)
                    #print(sample)
                    model.update_critic(sample, logger=logger)
                    model.update_policies(sample, logger=logger)
                    model.update_all_targets()
                model.prep_rollouts(device='cpu')

            if dones[0][0]:
                print("Breaking the episode at timestep", et_i)
                break

        et_i += 1

        row_list.append((ep_i + 1, et_i))

        ep_rews = replay_buffer.get_average_rewards(
            et_i * config.n_rollout_threads)
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i,
                              a_ep_rew * et_i, ep_i)

        if ep_i % config.save_interval < config.n_rollout_threads:
            model.prep_rollouts(device='cpu')
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            model.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
            model.save(run_dir / 'model.pt')

    with open('Timesteps_vs_Episodes.csv', 'w', newline='') as file:
         writer = csv.writer(file)
         writer.writerow(["Ep No", "Number of Timesteps"])
         for row in row_list:
            writer.writerow(row)

    model.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
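
The training trigger in Example #6, (t % config.steps_per_update) < config.n_rollout_threads, fires on roughly one rollout iteration per steps_per_update environment steps, because t only advances in increments of n_rollout_threads. A minimal sketch of that schedule, with assumed parameter values:

# Sketch of the update schedule used in Example #6 (parameter values are assumptions).
n_rollout_threads, steps_per_update = 4, 100
t = 0
update_points = []
for _ in range(100):                 # 100 rollout iterations = 400 environment steps
    t += n_rollout_threads
    if (t % steps_per_update) < n_rollout_threads:
        update_points.append(t)
print(update_points)                 # [100, 200, 300, 400]
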
Example #7
def run(config):
    model_dir = Path('./models') / config.env_id / config.model_name
    run_num = 1

    numWolves = 3
    numSheep = 1
    numBlocks = 2
    numAgents = numWolves + numSheep
    numEntities = numAgents + numBlocks
    wolvesID = list(range(numWolves))
    sheepsID = list(range(numWolves, numAgents))
    blocksID = list(range(numAgents, numEntities))

    wolfSize = 0.075
    sheepSize = 0.05
    blockSize = 0.2
    entitiesSizeList = [wolfSize] * numWolves + [sheepSize] * numSheep + [
        blockSize
    ] * numBlocks

    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run

    torch.manual_seed(run_num)
    np.random.seed(run_num)
    env = make_parallel_env(config.env_id, config.n_rollout_threads, run_num)
    model = AttentionSAC.init_from_save(filename=run_dir / 'model.pt')

    biteList = []
    trajListToRender = []
    for ep_i in range(0, config.n_episodes):
        obs = env.reset()
        model.prep_rollouts(device='cpu')
        trajectory = []

        for et_i in range(config.episode_length):  #25
            torch_obs = [
                Variable(torch.Tensor(np.vstack(obs[:, i])),
                         requires_grad=False) for i in range(model.nagents)
            ]
            torch_agent_actions = model.step(torch_obs, explore=False)
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)
            state = [
                np.append(agent.state.p_pos, agent.state.p_vel)
                for agent in env.agents
            ] + [
                np.append(landmark.state.p_pos, landmark.state.p_vel)
                for landmark in env.world.landmarks
            ]

            state = obs[0]
            action = actions[0]
            reward = rewards[0]
            nextState = next_obs[0]
            trajectory.append((state, action, reward, nextState))

            obs = next_obs

        biteNum = calcWolfTrajBiteAmount(trajectory, wolvesID, singleReward=10)
        biteList.append(biteNum)
        trajListToRender = trajListToRender + trajectory

        print(biteNum)

    meanTrajBite = np.mean(biteList)
    seTrajBite = np.std(biteList) / np.sqrt(len(biteList) - 1)
    print('meanTrajBite', meanTrajBite, 'seTrajBite ', seTrajBite)

    wolfColor = np.array([0.85, 0.35, 0.35])
    sheepColor = np.array([0.35, 0.85, 0.35])
    blockColor = np.array([0.25, 0.25, 0.25])
    entitiesColorList = [wolfColor] * numWolves + [sheepColor] * numSheep + [
        blockColor
    ] * numBlocks
    render = Render(entitiesSizeList, entitiesColorList, numAgents,
                    getPosFromAgentState)
    trajToRender = np.concatenate(trajListToRender)
    render(trajToRender)

    env.close()
Example #8
File: train.py  Project: Makiah/MAAC
def run(halite_env: BaseEnv, load_latest: bool=False):
    config = halite_env.config

    model_path, run_num, run_dir, log_dir = run_setup(config.model_name, get_latest_model=load_latest)

    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))

    torch.manual_seed(run_num)
    np.random.seed(run_num)

    # Build MAAC model
    if model_path is None:
        model = AttentionSAC(halite_env.agent_type_topologies,
                             tau=config.tau,
                             pi_lr=config.pi_lr,
                             q_lr=config.q_lr,
                             gamma=config.gamma,
                             pol_hidden_dim=config.pol_hidden_dim,
                             critic_hidden_dim=config.critic_hidden_dim,
                             attend_heads=config.attend_heads,
                             reward_scale=config.reward_scale)
    else:
        model = AttentionSAC.init_from_save(model_path, load_critic=True)

    # Build replay buffer
    replay_buffer = ReplayBuffer(config.buffer_length)

    prev_time = time.perf_counter()

    t = 0
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        curr_time = time.perf_counter()
        print("Episodes %i-%i of %i (%is)" % (ep_i + 1,
                                              ep_i + 1 + config.n_rollout_threads,
                                              config.n_episodes,
                                              (curr_time - prev_time)))
        model.prep_rollouts(device='cpu')

        game_reward = halite_env.simulate(lambda o: model.step(o, explore=True), replay_buffer)

        t += config.n_rollout_threads
        if (replay_buffer.length() >= config.batch_size and
            (t % config.games_per_update) < config.n_rollout_threads):
            print("Training")
            if config.use_gpu:
                model.prep_training(device='gpu')
            else:
                model.prep_training(device='cpu')
            for u_i in range(config.num_updates):
                sample: List[Dict[AgentKey, AgentReplayFrame]] = replay_buffer.sample(config.batch_size)
                # print("Original sample size", len(sample))
                # print("Preprocessing to batch structure")
                sample: Dict[AgentKey, BatchedAgentReplayFrame] = preprocess_to_batch(sample, to_gpu=config.use_gpu)
                # print("Filtered sample size", len(sample))
                # if len(sample) < 5:
                #     print("Sample size keys:", sample.keys())
                # print("Updating model critic")
                model.update_critic(sample, logger=logger)
                # print("Updating model policies")
                model.update_policies(sample, logger=logger)
                model.update_all_targets()
            model.prep_rollouts(device='cpu')

        ep_rews = replay_buffer.get_average_rewards(config.episode_length * config.n_rollout_threads)
        for k, v in ep_rews.items():
            logger.add_scalar('agent%s/mean_episode_rewards' % str(k), v, ep_i)

        logger.add_scalar("global_env_rewards", game_reward, ep_i)

        if ep_i % config.save_interval < config.n_rollout_threads:
            print("Saving")
            model.prep_rollouts(device='cpu')
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            model.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
            model.save(run_dir / 'model.pt')
            print("run_dir", run_dir)

        prev_time = curr_time

    model.save(run_dir / 'model.pt')
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
Example #9
def test(config):
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        run_num = 1
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        # runs the newest
        run_num = max(exst_run_nums)

    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run

    # Initialization of evaluation metrics
    collisions = [0]
    success_nums = [0]
    ccr_activates = [0]
    final_ep_rewards = []  # sum of rewards for training curve
    final_ep_collisions = []
    final_ep_activates = []
    final_ep_success_nums = []

    torch.manual_seed(run_num)
    np.random.seed(run_num)

    env = make_env(config.env_id, discrete_action=True)
    env.seed(run_num)
    np.random.seed(run_num)
    model = AttentionSAC.init_from_save(run_dir / 'model.pt', True)

    replay_buffer = ReplayBuffer(
        config.buffer_length, model.nagents,
        [obsp.shape[0] for obsp in env.observation_space], [
            acsp.shape[0] if isinstance(acsp, Box) else acsp.n
            for acsp in env.action_space
        ])
    t = 0

    #### remove all tensorboard methods, replace with print and pickle

    for ep_i in range(0, config.n_episodes):

        obs = np.expand_dims(np.array(env.reset()), 0)
        model.prep_rollouts(device='cpu')

        t_start = time.time()

        prev_obs = None
        act_n_t_minus_1 = None

        for et_i in range(config.episode_length):
            if config.CCR:
                if act_n_t_minus_1:
                    target_obs_n, _, _, _ = env.oracle_step(act_n_t_minus_1[0])

                    target_obs_n = np.expand_dims(np.array(target_obs_n), 0)

                    diff_state = obs[:, :, :4] - target_obs_n[:, :, :4]  # 1x4x4

                    if config.env_id == 'wall':
                        diff_obs = obs[:, :, -(model.nagents + 8 + 1)]
                    elif config.env_id == 'turbulence':
                        diff_obs = obs[:, :, -(model.nagents + 2 + 1)]
                    else:
                        assert (False)

                    emerg_n = np.sum(diff_state**2, axis=-1) + diff_obs  # 1x4

                    env.oracle_update()

                    # obs: 1x4x20
                    # emerg_n: 1x4
                    for agent_i in range(model.nagents):
                        for agent_j in range(model.nagents):
                            obs[:, agent_i, -agent_j] = emerg_n[:, agent_j]

            # collect experience
            if prev_obs is not None:
                replay_buffer.push(prev_obs, agent_actions, rewards, obs,
                                   dones)

            #print(obs)
            # convert observation to torch Variable
            torch_obs = []
            for i in range(model.nagents):
                torch_obs.append(
                    Variable(torch.Tensor(obs[:, i]), requires_grad=False))
            # print(torch_obs)
            # get actions as torch Variables
            torch_agent_actions = model.step(torch_obs, explore=False)

            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]

            # rearrange actions to be per environment (single env here)
            actions = [[ac[0] for ac in agent_actions]]
            # actions = [[ac[i] for ac in agent_actions] for i in range(config.n_rollout_threads)]

            next_obs, rewards, dones, infos = env.step(actions[0])

            next_obs = np.expand_dims(np.array(next_obs), 0)
            rewards = np.expand_dims(np.array(rewards), 0)
            dones = np.expand_dims(np.array(dones), 0)
            infos = np.expand_dims(np.array(infos), 0)

            if config.CCR:
                act_n_t_minus_1 = actions

            prev_obs = obs

            obs = next_obs

            t += 1

            # for displaying learned policies
            if config.display:
                time.sleep(0.1)
                env.render()
                continue

    env.close()
Example #10
def run(config):
    model_dir = Path('./models') / config.env_id / config.model_name
    run_num = 1

    numWolves = 3
    numSheep = 1
    numBlocks = 2
    numAgents = numWolves + numSheep
    numEntities = numAgents + numBlocks
    wolvesID = list(range(numWolves))
    sheepsID = list(range(numWolves, numAgents))
    blocksID = list(range(numAgents, numEntities))

    wolfSize = 0.075
    sheepSize = 0.05
    blockSize = 0.2
    entitiesSizeList = [wolfSize] * numWolves + [sheepSize] * numSheep + [
        blockSize
    ] * numBlocks
    sheepMaxSpeed = 1.3
    wolfMaxSpeed = 1.0
    blockMaxSpeed = None
    entityMaxSpeedList = [wolfMaxSpeed] * numWolves + [
        sheepMaxSpeed
    ] * numSheep + [blockMaxSpeed] * numBlocks
    entitiesMovableList = [True] * numAgents + [False] * numBlocks
    massList = [1.0] * numEntities

    collisionReward = 10
    isCollision = IsCollision(getPosFromAgentState)
    punishForOutOfBound = PunishForOutOfBound()
    rewardSheep = RewardSheep(wolvesID,
                              sheepsID,
                              entitiesSizeList,
                              getPosFromAgentState,
                              isCollision,
                              punishForOutOfBound,
                              collisionPunishment=collisionReward)

    individualRewardWolf = 0
    rewardWolf = RewardWolf(wolvesID, sheepsID, entitiesSizeList, isCollision,
                            collisionReward, individualRewardWolf)
    reshapeAction = ReshapeAction()
    costActionRatio = 0
    getActionCost = GetActionCost(costActionRatio,
                                  reshapeAction,
                                  individualCost=True)
    getWolvesAction = lambda action: [action[wolfID] for wolfID in wolvesID]
    rewardWolfWithActionCost = lambda state, action, nextState: np.array(
        rewardWolf(state, action, nextState)) - np.array(
            getActionCost(getWolvesAction(action)))

    rewardFunc = lambda state, action, nextState: \
        list(rewardWolfWithActionCost(state, action, nextState)) + list(rewardSheep(state, action, nextState))

    reset = ResetMultiAgentChasing(numAgents, numBlocks)
    observeOneAgent = lambda agentID: Observe(agentID, wolvesID, sheepsID,
                                              blocksID, getPosFromAgentState,
                                              getVelFromAgentState)
    observe = lambda state: [
        observeOneAgent(agentID)(state) for agentID in range(numAgents)
    ]

    reshapeAction = ReshapeAction()
    getCollisionForce = GetCollisionForce()
    applyActionForce = ApplyActionForce(wolvesID, sheepsID,
                                        entitiesMovableList)
    applyEnvironForce = ApplyEnvironForce(numEntities, entitiesMovableList,
                                          entitiesSizeList, getCollisionForce,
                                          getPosFromAgentState)
    integrateState = IntegrateState(numEntities, entitiesMovableList, massList,
                                    entityMaxSpeedList, getVelFromAgentState,
                                    getPosFromAgentState)
    transit = TransitMultiAgentChasing(numEntities, reshapeAction,
                                       applyActionForce, applyEnvironForce,
                                       integrateState)

    isTerminal = lambda state: [False] * numAgents

    initObsForParams = observe(reset())
    obsShape = [
        initObsForParams[obsID].shape for obsID in range(len(initObsForParams))
    ]
    worldDim = 2
    actionSpace = [
        spaces.Discrete(worldDim * 2 + 1) for agentID in range(numAgents)
    ]

    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run

    torch.manual_seed(run_num)
    np.random.seed(run_num)
    model = AttentionSAC.init_from_save(filename=run_dir / 'model.pt')

    biteList = []
    trajListToRender = []

    for ep_i in range(0, config.n_episodes):
        state = reset()
        model.prep_rollouts(device='cpu')

        trajectory = []

        for et_i in range(config.episode_length):
            obs = observe(state)
            obs = np.array([obs])
            torch_obs = [
                Variable(torch.Tensor(np.vstack(obs[:, i])),
                         requires_grad=False) for i in range(model.nagents)
            ]
            torch_agent_actions = model.step(torch_obs, explore=False)
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]
            action = actions[0]

            nextState = transit(state, action)
            next_obs = observe(nextState)
            rewards = rewardFunc(state, action, nextState)
            done_n = isTerminal(nextState)
            done = all(done_n)
            trajectory.append((state, action, rewards, nextState))

            state = nextState

        biteNum = calcWolfTrajBiteAmount(trajectory, wolvesID, singleReward=10)
        biteList.append(biteNum)
        trajListToRender.append(list(trajectory))

        print(biteNum)

    meanTrajBite = np.mean(biteList)
    seTrajBite = np.std(biteList) / np.sqrt(len(biteList) - 1)
    print('meanTrajBite', meanTrajBite, 'seTrajBite ', seTrajBite)

    wolfColor = np.array([0.85, 0.35, 0.35])
    sheepColor = np.array([0.35, 0.85, 0.35])
    blockColor = np.array([0.25, 0.25, 0.25])
    entitiesColorList = [wolfColor] * numWolves + [sheepColor] * numSheep + [
        blockColor
    ] * numBlocks
    render = Render(entitiesSizeList, entitiesColorList, numAgents,
                    getPosFromAgentState)
    trajToRender = np.concatenate(trajListToRender)
    render(trajToRender)