def run_trial(episode_num):
    # TODO: agent_num cannot be pickled ?
    env = BipedalMultiCarrier(agent_num=c.agent_num)

    # render configuration
    if episode_num % c.profile_int == 0:
        render = True
    else:
        render = False
    frames = []

    # batch size = 1
    total_reward = t.zeros([c.agent_num, 1], device=c.device)
    state = t.tensor(env.reset(), dtype=t.float32,
                     device=c.device).view(c.agent_num, -1)

    tmp_observe = [[] for _ in range(c.agent_num)]
    local_step = Counter()
    episode_finished = False

    while not episode_finished and local_step.get() <= c.max_steps:
        local_step.count()

        timer.begin()
        with t.no_grad():
            old_state = state

            # agent model inference
            actions, prob, *_ = ppo.act({"state": state})
            state, reward, episode_finished, _ = env.step(
                actions.flatten().to("cpu"))

            if render:
                frames.append(env.render(mode="rgb_array"))

            state = t.tensor(state, dtype=t.float32,
                             device=c.device).view(c.agent_num, -1)
            reward = t.tensor(reward, dtype=t.float32,
                              device=c.device).view(c.agent_num, -1)

            total_reward += reward

            for ag in range(c.agent_num):
                tmp_observe[ag].append({
                    "state": {"state": old_state[ag, :].unsqueeze(0).clone()},
                    "action": {"action": actions[ag, :].unsqueeze(0).clone()},
                    "next_state": {"state": state[ag, :].unsqueeze(0).clone()},
                    "reward": float(reward[ag]),
                    "terminal": episode_finished
                                or local_step.get() == c.max_steps,
                    "action_log_prob": float(prob[ag])
                })

    # ordinary sampling, calculate value for each observation
    for ag in range(c.agent_num):
        tmp_observe[ag][-1]["value"] = tmp_observe[ag][-1]["reward"]
        for i in reversed(range(1, len(tmp_observe[ag]))):
            tmp_observe[ag][i - 1]["value"] = \
                tmp_observe[ag][i]["value"] * c.discount \
                + tmp_observe[ag][i - 1]["reward"]

    return it.chain(*tmp_observe), total_reward.mean(), \
        local_step.get(), frames
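
# A minimal, standalone sketch of the "ordinary sampling" value computation
# used above: each observation's "value" is the discounted return, i.e. the
# reward at that step plus the discounted value of the next step. The helper
# name `discounted_values` and the 0.99 discount in the usage example are
# illustrative assumptions, not part of the training code above.
import math

def discounted_values(rewards, discount):
    # walk backwards: value[t] = reward[t] + discount * value[t + 1]
    values = [0.0] * len(rewards)
    running = 0.0
    for i in reversed(range(len(rewards))):
        running = rewards[i] + discount * running
        values[i] = running
    return values

# usage: three steps with reward 1 each and discount 0.99
vals = discounted_values([1.0, 1.0, 1.0], 0.99)
assert all(math.isclose(a, b) for a, b in zip(vals, [2.9701, 1.99, 1.0]))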
               nn.MSELoss(reduction='sum'), device,
               lr_scheduler=LambdaLR,
               lr_scheduler_params=[[actor_lr_func], [critic_lr_func]],
               replay_size=replay_size, batch_num=1)

if not restart:
    ddpg.load(root_dir + "/model", save_map)
logger.info("DDPG framework initialized")

# training
# begin training
# epoch > episode
epoch = Counter()
episode = Counter()
episode_finished = False
global_step = Counter()
local_step = Counter()

while epoch < max_epochs:
    epoch.count()
    logger.info("Begin epoch {}".format(epoch))

    while episode < max_episodes:
        episode.count()
        logger.info("Begin episode {}, epoch={}".format(episode, epoch))

        # environment initialization
        env.reset()
        generate_combat_map(env, map_size, agent_ratio,
                            group1_handle, group2_handle)
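
# A minimal sketch of the multiplier functions that LambdaLR expects for the
# actor_lr_func / critic_lr_func passed to the DDPG framework above.
# torch.optim.lr_scheduler.LambdaLR calls each function with the current epoch
# index and scales the base learning rate by its return value. The decay
# shapes and constants below are illustrative assumptions, not the values used
# by this project.
from torch.optim.lr_scheduler import LambdaLR

def actor_lr_func(epoch):
    # keep the base learning rate for the first 100 epochs, then decay as 1/epoch
    return 1.0 if epoch < 100 else 100.0 / epoch

def critic_lr_func(epoch):
    # halve the critic learning rate every 200 epochs
    return 0.5 ** (epoch // 200)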
          update_times=c.ppo_update_times,
          batch_size=c.ppo_update_batch_size,
          learning_rate=c.learning_rate)

if c.restart_from_trial is not None:
    ppo.load(save_env.get_trial_model_dir())
logger.info("PPO framework initialized")

# training
# preparations
ctx = get_context("spawn")
pool = Pool(processes=c.workers, context=ctx)
pool.enable_global_find(True)

# begin training
episode = Counter(step=c.ppo_update_int)
timer = Timer()

while episode < c.max_episodes:
    first_episode = episode.get()
    episode.count()
    last_episode = episode.get() - 1
    logger.info("Begin episode {}-{} at {}".format(
        first_episode, last_episode,
        dt.now().strftime("%m/%d-%H:%M:%S")))

    # begin trials
    def run_trial(episode_num):
        # TODO: agent_num cannot be pickled ?
        env = BipedalMultiCarrier(agent_num=c.agent_num)
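        # ... (rest of the run_trial body as shown earlier)

    # A hedged sketch of how the trials defined above might be fanned out to
    # the worker pool created in the preparations step. It assumes the
    # project's Pool wrapper keeps the standard multiprocessing `map`
    # interface; the unpacking mirrors what run_trial returns, and the
    # aggregation below is illustrative, not the project's actual bookkeeping.
    results = pool.map(run_trial, range(first_episode, last_episode + 1))
    for observations, mean_reward, steps, frames in results:
        logger.info("Trial finished: steps={}, mean_reward={}"
                    .format(steps, mean_reward))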
operators = [(framework1, run_agents1, load_framework1),
             (framework2, run_agents2, load_framework2)]

# testing
# preparations
config = generate_combat_config(map_size)
env = magent.GridWorld(config, map_size=map_size)
env.reset()

global_board.init(test_root_dir)
writer = global_board.writer

logger.info("Directories prepared.")

# begin testing
episode = Counter()
episode_finished = False
wins = [0, 0]

while episode < max_episodes:
    episode.count()
    logger.info("Begin episode {} at {}".format(
        episode, dt.now().strftime("%m/%d-%H:%M:%S")))

    # environment initialization
    env.reset()
    env.set_render_dir(test_root_dir)
    group_handles = env.get_handles()
    generate_combat_map(env, map_size, agent_ratio,
                        group_handles[0], group_handles[1])

    # batch size = 1
def run_trial(episode_num):
    config = generate_combat_config(c.map_size)
    env = magent.GridWorld(config, map_size=c.map_size)
    env.reset()

    group_handles = env.get_handles()
    generate_combat_map(env, c.map_size, c.agent_ratio,
                        group_handles[0], group_handles[1])

    # render configuration
    if episode_num % c.profile_int == 0:
        path = save_env.get_trial_image_dir() + "/{}".format(episode)
        save_env.create_dirs([path])
        env.set_render_dir(path)
        render = True
    else:
        render = False

    # batch size = 1
    total_reward = [0, 0]
    agent_alive_ids = [[ag for ag in range(agent_num)] for _ in (0, 1)]
    agent_dead_ids = [[] for _ in (0, 1)]
    agent_alive_history = [[] for _ in (0, 1)]
    agent_real_nums = [None, None]
    tmp_observes = [[[] for _ in range(agent_num)] for __ in (0, 1)]
    local_step = Counter()
    episode_finished = False

    while not episode_finished and local_step.get() <= c.max_steps:
        local_step.count()

        timer.begin()
        with t.no_grad():
            agent_status = [Object(), Object()]
            for g in (0, 1):
                agent_real_nums[g], agent_status[g].actions, \
                    agent_status[g].probs, agent_status[g].views, \
                    agent_status[g].features = \
                    run_agents(env, ppo, group_handles[g])

            episode_finished = env.step()

            # reward and is_alive must be fetched before clear_dead()!
            reward = [env.get_reward(h) for h in group_handles]
            is_alive = [env.get_alive(h) for h in group_handles]

            for g in (0, 1):
                # split the previously alive ids into survivors and newly dead;
                # both lists are derived from the same snapshot so the dead ids
                # are not lost when the alive list is overwritten
                prev_ids = agent_alive_ids[g]
                agent_alive_ids[g] = [aid for aid, alive
                                      in zip(prev_ids, is_alive[g]) if alive]
                agent_dead_ids[g] += [aid for aid, alive
                                      in zip(prev_ids, is_alive[g]) if not alive]

            agent_alive_history[0].append(np.sum(is_alive[0]))
            agent_alive_history[1].append(np.sum(is_alive[1]))

            total_reward[0] += np.mean(reward[0])
            total_reward[1] += np.mean(reward[1])

            if render:
                env.render()

            if local_step.get() > 1:
                for g in (0, 1):
                    for aid, idx in zip(agent_alive_ids[g],
                                        range(agent_real_nums[g])):
                        status = agent_status[g]
                        tmp_observes[g][aid].append(
                            {"state": {"view": status.views[idx].unsqueeze(0).clone(),
                                       "feature": status.features[idx].unsqueeze(0).clone()},
                             "action": {"action": status.actions[idx].unsqueeze(0).clone()},
                             "next_state": {},
                             "reward": float(reward[g][idx]),
                             "terminal": episode_finished
                                         or local_step.get() == c.max_steps,
                             "action_log_prob": float(status.probs[idx])}
                        )
                    for aid in agent_dead_ids[g]:
                        # an agent that died early may not have produced any observation
                        if tmp_observes[g][aid]:
                            tmp_observes[g][aid][-1]["terminal"] = True
            env.clear_dead()

    # ordinary sampling, calculate value for each observation
    for g in (0, 1):
        for ag in range(agent_num):
            tmp_observe = tmp_observes[g][ag]
            if not tmp_observe:
                continue
            tmp_observe[-1]["value"] = tmp_observe[-1]["reward"]
            for i in reversed(range(1, len(tmp_observe))):
                tmp_observe[i - 1]["value"] = \
                    tmp_observe[i]["value"] * c.discount \
                    + tmp_observe[i - 1]["reward"]

    tmp_observes = [tmp_observes[g][ag] for g in (0, 1)
                    for ag in range(agent_num)]
    return list(it.chain(*tmp_observes))[:int(c.replay_size / c.ppo_update_int)], \
        total_reward, local_step.get(), agent_alive_history
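
# A small standalone sketch of the alive/dead bookkeeping used above: given
# the ids that were alive before env.step() and the alive flags reported
# afterwards, split them into survivors and newly dead ids. The helper name
# `split_alive_dead` is an illustrative assumption, not part of the code above.
def split_alive_dead(previous_ids, alive_flags):
    survived = [aid for aid, alive in zip(previous_ids, alive_flags) if alive]
    died = [aid for aid, alive in zip(previous_ids, alive_flags) if not alive]
    return survived, died

# usage: agents 0 and 3 die on this step
assert split_alive_dead([0, 1, 2, 3],
                        [False, True, True, False]) == ([1, 2], [0, 3])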
                  negotiator, len(neighbors), action_dim, observe_dim,
                  history_depth,
                  mean_anneal=nego_mean_anneal,
                  theta_anneal=nego_theta_anneal,
                  batch_size=1, contiguous=True, device=device)
    for i in range(agent_num)
]

# begin evaluation
# epoch > episode
episode_finished = False
local_step = Counter()

# check_model(writer, critic, global_step, name="critic")
# check_model(writer, base_actor, global_step, name="actor")

logger.info("Begin testing")

for agent in agents:
    agent.reset()

### currently, agents have fixed communication topology
for i in range(agent_num):
    agent_neighbors = []
    for j in neighbors:
        index = i + j
        if agent_num > index >= 0:
def run_trial(episode_num):
    env = BipedalWalker()

    # render configuration
    if episode_num % c.profile_int == 0:
        render = True
    else:
        render = False
    frames = []

    # batch size = 1
    total_reward = 0
    state, reward = t.tensor(env.reset(), dtype=t.float32,
                             device=c.device), 0
    tmp_observe = []
    local_step = Counter()
    episode_finished = False

    while not episode_finished and local_step.get() <= c.max_steps:
        local_step.count()

        timer.begin()
        with t.no_grad():
            old_state = state

            # agent model inference
            action, prob, *_ = ppo.act({"state": state.unsqueeze(0)})
            state, reward, episode_finished, _ = env.step(
                action[0].to("cpu"))

            if render:
                frames.append(env.render(mode="rgb_array"))

            state = t.tensor(state, dtype=t.float32, device=c.device)
            total_reward += reward

            tmp_observe.append({
                "state": {"state": old_state.unsqueeze(0).clone()},
                "action": {"action": action.clone()},
                "next_state": {"state": state.unsqueeze(0).clone()},
                "reward": float(reward),
                "terminal": episode_finished
                            or local_step.get() == c.max_steps,
                "action_log_prob": float(prob)
            })

    # ordinary sampling, calculate value for each observation
    tmp_observe[-1]["value"] = tmp_observe[-1]["reward"]
    for i in reversed(range(1, len(tmp_observe))):
        tmp_observe[i - 1]["value"] = \
            tmp_observe[i]["value"] * c.discount \
            + tmp_observe[i - 1]["reward"]

    return tmp_observe, total_reward, local_step.get(), frames