Code Example #1
        def run_trial(episode_num):
            # TODO: agent_num cannot be pickled ?
            env = BipedalMultiCarrier(agent_num=c.agent_num)

            # render configuration
            if episode_num % c.profile_int == 0:
                render = True
            else:
                render = False
            frames = []

            # batch size = 1
            total_reward = t.zeros([c.agent_num, 1], device=c.device)
            state = t.tensor(env.reset(), dtype=t.float32,
                             device=c.device).view(c.agent_num, -1)

            tmp_observe = [[] for _ in range(c.agent_num)]
            local_step = Counter()
            episode_finished = False

            while not episode_finished and local_step.get() <= c.max_steps:
                local_step.count()
                timer.begin()
                with t.no_grad():
                    old_state = state

                    # agent model inference
                    actions, prob, *_ = ppo.act({"state": state})

                    state, reward, episode_finished, _ = env.step(
                        actions.flatten().to("cpu"))

                    if render:
                        frames.append(env.render(mode="rgb_array"))

                    state = t.tensor(state, dtype=t.float32,
                                     device=c.device).view(c.agent_num, -1)
                    reward = t.tensor(reward, dtype=t.float32,
                                      device=c.device).view(c.agent_num, -1)

                    total_reward += reward

                    for ag in range(c.agent_num):
                        tmp_observe[ag].append({
                            "state": {
                                "state": old_state[ag, :].unsqueeze(0).clone()
                            },
                            "action": {
                                "action": actions[ag, :].unsqueeze(0).clone()
                            },
                            "next_state": {
                                "state": state[ag, :].unsqueeze(0).clone()
                            },
                            "reward":
                            float(reward[ag]),
                            "terminal":
                            episode_finished
                            or local_step.get() == c.max_steps,
                            "action_log_prob":
                            float(prob[ag])
                        })

            # ordinary sampling, calculate value for each observation
            for ag in range(c.agent_num):
                tmp_observe[ag][-1]["value"] = tmp_observe[ag][-1]["reward"]
                for i in reversed(range(1, len(tmp_observe[ag]))):
                    tmp_observe[ag][i - 1]["value"] = \
                        tmp_observe[ag][i]["value"] * c.discount + tmp_observe[ag][i - 1]["reward"]

            return it.chain(
                *tmp_observe), total_reward.mean(), local_step.get(), frames
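The same backward "value" pass appears in examples #1, #5 and #7: each observation's value is the discounted sum of rewards from that step to the end of the trial, computed in a single reverse sweep. A minimal standalone sketch (the helper name backfill_values is illustrative, not part of the source):

def backfill_values(observations, discount=0.99):
    # observations: list of dicts that each carry at least a "reward" key,
    # ordered from the first to the last step of the trial
    observations[-1]["value"] = observations[-1]["reward"]
    for i in reversed(range(1, len(observations))):
        observations[i - 1]["value"] = (
            observations[i]["value"] * discount + observations[i - 1]["reward"])
    return observations

# e.g. rewards 1, 0, 2 with discount 0.5 yield values 1.5, 1.0, 2.0
print(backfill_values([{"reward": 1.0}, {"reward": 0.0}, {"reward": 2.0}], 0.5))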
Code Example #2
File: magent_tcdn.py  Project: mrshenli/machin
                nn.MSELoss(reduction='sum'),
                device,
                lr_scheduler=LambdaLR,
                lr_scheduler_params=[[actor_lr_func], [critic_lr_func]],
                replay_size=replay_size,
                batch_num=1)

    if not restart:
        ddpg.load(root_dir + "/model", save_map)
    logger.info("DDPG framework initialized")

    # training

    # begin training
    # epoch > episode
    epoch = Counter()
    episode = Counter()
    episode_finished = False
    global_step = Counter()
    local_step = Counter()
    while epoch < max_epochs:
        epoch.count()
        logger.info("Begin epoch {}".format(epoch))
        while episode < max_episodes:
            episode.count()
            logger.info("Begin episode {}, epoch={}".format(episode, epoch))

            # environment initialization
            env.reset()
            generate_combat_map(env, map_size, agent_ratio, group1_handle,
                                group2_handle)
Code Example #3
              update_times=c.ppo_update_times,
              batch_size=c.ppo_update_batch_size,
              learning_rate=c.learning_rate)

    if c.restart_from_trial is not None:
        ppo.load(save_env.get_trial_model_dir())
    logger.info("PPO framework initialized")

    # training
    # preparations
    ctx = get_context("spawn")
    pool = Pool(processes=c.workers, context=ctx)
    pool.enable_global_find(True)

    # begin training
    episode = Counter(step=c.ppo_update_int)
    timer = Timer()

    while episode < c.max_episodes:
        first_episode = episode.get()
        episode.count()
        last_episode = episode.get() - 1
        logger.info("Begin episode {}-{} at {}".format(
            first_episode, last_episode,
            dt.now().strftime("%m/%d-%H:%M:%S")))

        # begin trials
        def run_trial(episode_num):
            # TODO: agent_num cannot be pickled ?
            env = BipedalMultiCarrier(agent_num=c.agent_num)
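Example #3 is cut off right after the pool setup; run_trial itself is shown in full in example #1. A sketch of how the pooled trials might be dispatched and collected, assuming the pool exposes a multiprocessing-style map() (an assumption, since the snippet does not show the dispatch):

results = pool.map(run_trial, range(first_episode, last_episode + 1))
all_observations = []
for observations, mean_reward, steps, frames in results:
    # each trial returns its observation chain, mean total reward,
    # step count and (possibly empty) list of rendered frames
    all_observations.extend(observations)
    logger.info("Trial finished: mean_reward={}, steps={}".format(
        mean_reward, steps))
# the observations would then be fed to ppo and an update triggered; the
# exact store/update method names depend on the framework version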
Code Example #4
    operators = [(framework1, run_agents1, load_framework1),
                 (framework2, run_agents2, load_framework2)]

    # testing
    # preparations
    config = generate_combat_config(map_size)
    env = magent.GridWorld(config, map_size=map_size)
    env.reset()

    global_board.init(test_root_dir)
    writer = global_board.writer
    logger.info("Directories prepared.")

    # begin training
    episode = Counter()
    episode_finished = False
    wins = [0, 0]

    while episode < max_episodes:
        episode.count()
        logger.info("Begin episode {} at {}".format(episode, dt.now().strftime("%m/%d-%H:%M:%S")))

        # environment initialization
        env.reset()
        env.set_render_dir(test_root_dir)

        group_handles = env.get_handles()
        generate_combat_map(env, map_size, agent_ratio, group_handles[0], group_handles[1])

        # batch size = 1
Code Example #5
        def run_trial(episode_num):
            config = generate_combat_config(c.map_size)
            env = magent.GridWorld(config, map_size=c.map_size)
            env.reset()

            group_handles = env.get_handles()
            generate_combat_map(env, c.map_size, c.agent_ratio, group_handles[0], group_handles[1])

            # render configuration
            if episode_num % c.profile_int == 0:
                path = save_env.get_trial_image_dir() + "/{}".format(episode)
                save_env.create_dirs([path])
                env.set_render_dir(path)
                render = True
            else:
                render = False

            # batch size = 1
            total_reward = [0, 0]
            agent_alive_ids = [[ag for ag in range(agent_num)] for _ in (0, 1)]
            agent_dead_ids = [[] for _ in (0, 1)]
            agent_alive_history = [[] for _ in (0, 1)]
            agent_real_nums = [None, None]
            tmp_observes = [[[] for _ in range(agent_num)] for __ in (0, 1)]

            local_step = Counter()
            episode_finished = False

            while not episode_finished and local_step.get() <= c.max_steps:
                local_step.count()
                timer.begin()

                with t.no_grad():
                    agent_status = [Object(), Object()]
                    for g in (0, 1):
                        agent_real_nums[g], agent_status[g].actions, agent_status[g].probs, \
                        agent_status[g].views, agent_status[g].features = \
                            run_agents(env, ppo, group_handles[g])

                    episode_finished = env.step()
                    # reward and is_alive must be fetched before clear_dead()!
                    reward = [env.get_reward(h) for h in group_handles]
                    is_alive = [env.get_alive(h) for h in group_handles]

                    for g in (0, 1):
                        # record newly dead ids first, then shrink the alive list
                        # (dead ids must be taken before the alive list is overwritten)
                        agent_dead_ids[g] += [id for id, alive in
                                              zip(agent_alive_ids[g], is_alive[g])
                                              if not alive]
                        agent_alive_ids[g] = [id for id, alive in
                                              zip(agent_alive_ids[g], is_alive[g])
                                              if alive]

                    agent_alive_history[0].append(np.sum(is_alive[0]))
                    agent_alive_history[1].append(np.sum(is_alive[1]))

                    total_reward[0] += np.mean(reward[0])
                    total_reward[1] += np.mean(reward[1])

                    if render:
                        env.render()

                    if local_step.get() > 1:
                        for g in (0, 1):
                            for aid, idx in zip(agent_alive_ids[g], range(agent_real_nums[g])):
                                status = agent_status[g]
                                tmp_observes[g][aid].append(
                                    {"state": {"view": status.views[idx].unsqueeze(0).clone(),
                                               "feature": status.features[idx].unsqueeze(0).clone()},
                                     "action": {"action": status.actions[idx].unsqueeze(0).clone()},
                                     "next_state": {},
                                     "reward": float(reward[g][idx]),
                                     "terminal": episode_finished or local_step.get() == c.max_steps,
                                     "action_log_prob": float(status.probs[idx])
                                     }
                                )
                            for aid in agent_dead_ids[g]:
                                # guard against agents that died before recording any observation
                                if tmp_observes[g][aid]:
                                    tmp_observes[g][aid][-1]["terminal"] = True

                    env.clear_dead()


            # ordinary sampling, calculate value for each observation
            for g in (0, 1):
                for ag in range(agent_num):
                    tmp_observe = tmp_observes[g][ag]
                    if not tmp_observe:
                        # skip agents that recorded no transitions (e.g. died on the first step)
                        continue
                    tmp_observe[-1]["value"] = tmp_observe[-1]["reward"]
                    for i in reversed(range(1, len(tmp_observe))):
                        tmp_observe[i - 1]["value"] = \
                            tmp_observe[i]["value"] * c.discount + tmp_observe[i - 1]["reward"]

            tmp_observes = [tmp_observes[g][ag] for g in (0, 1) for ag in range(agent_num)]

            return list(it.chain(*tmp_observes))[:int(c.replay_size / c.ppo_update_int)], \
                   total_reward, local_step.get(), agent_alive_history
Code Example #6
File: mcarrier_tcdn_test.py  Project: mrshenli/machin
                   negotiator,
                   len(neighbors),
                   action_dim,
                   observe_dim,
                   history_depth,
                   mean_anneal=nego_mean_anneal,
                   theta_anneal=nego_theta_anneal,
                   batch_size=1,
                   contiguous=True,
                   device=device) for i in range(agent_num)
    ]

    # begin evaluation
    # epoch > episode
    episode_finished = False
    local_step = Counter()

    #check_model(writer, critic, global_step, name="critic")
    #check_model(writer, base_actor, global_step, name="actor")

    logger.info("Begin testing")

    for agent in agents:
        agent.reset()

    ### currently, agents have fixed communication topology
    for i in range(agent_num):
        agent_neighbors = []
        for j in neighbors:
            index = i + j
            if agent_num > index >= 0:
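The loop above is truncated by the listing; it appears to build a fixed neighbour table from relative index offsets. A standalone sketch of that pattern (the appended index and the helper name are assumptions, since the source stops here):

def build_neighbor_table(agent_num, neighbors):
    # neighbors holds relative offsets, e.g. [-1, 1] for the two adjacent agents
    table = []
    for i in range(agent_num):
        agent_neighbors = []
        for j in neighbors:
            index = i + j
            if agent_num > index >= 0:
                agent_neighbors.append(index)
        table.append(agent_neighbors)
    return table

# e.g. build_neighbor_table(4, [-1, 1]) -> [[1], [0, 2], [1, 3], [2]]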
Code Example #7
        def run_trial(episode_num):
            env = BipedalWalker()

            # render configuration
            if episode_num % c.profile_int == 0:
                render = True
            else:
                render = False
            frames = []

            # batch size = 1
            total_reward = 0
            state, reward = t.tensor(env.reset(),
                                     dtype=t.float32,
                                     device=c.device), 0

            tmp_observe = []
            local_step = Counter()
            episode_finished = False

            while not episode_finished and local_step.get() <= c.max_steps:
                local_step.count()
                timer.begin()
                with t.no_grad():
                    old_state = state

                    # agent model inference
                    action, prob, *_ = ppo.act({"state": state.unsqueeze(0)})

                    state, reward, episode_finished, _ = env.step(
                        action[0].to("cpu"))

                    if render:
                        frames.append(env.render(mode="rgb_array"))

                    state = t.tensor(state, dtype=t.float32, device=c.device)

                    total_reward += reward

                    tmp_observe.append({
                        "state": {
                            "state": old_state.unsqueeze(0).clone()
                        },
                        "action": {
                            "action": action.clone()
                        },
                        "next_state": {
                            "state": state.unsqueeze(0).clone()
                        },
                        "reward":
                        float(reward),
                        "terminal":
                        episode_finished or local_step.get() == c.max_steps,
                        "action_log_prob":
                        float(prob)
                    })

            # ordinary sampling, calculate value for each observation
            tmp_observe[-1]["value"] = tmp_observe[-1]["reward"]
            for i in reversed(range(1, len(tmp_observe))):
                tmp_observe[i - 1]["value"] = \
                    tmp_observe[i]["value"] * c.discount + tmp_observe[i - 1]["reward"]

            return tmp_observe, total_reward, local_step.get(), frames
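The frames collected in the profiled trials are raw RGB arrays from env.render(mode="rgb_array"). One possible way to persist them, assuming the imageio package (v2-style API) is available, which the source snippets do not show:

import imageio

def save_frames(frames, path, fps=30):
    # frames: list of HxWx3 uint8 arrays as returned by env.render(mode="rgb_array")
    imageio.mimsave(path, frames, fps=fps)

# e.g. save_frames(frames, "trial_{}.gif".format(episode_num))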