Example #1
 def __init__(self,
              env_config,
              models: Dict[Tuple, BaseNeuralNet],
              lr=0.0003,
              gamma=0.998,
              clip_param=0.2,
              entropy_coef=0.025,
              value_loss_coef=0.05,
              device='cuda'):
     self.memory = dict()
     self.device = device
     self.env_config = env_config
     env = foundation.make_env_instance(**env_config)
     self.models = dict()
     obs = env.reset()
     for key, value in models.items():
         self.models[key] = value(device=self.device)
         self.models[key].build_models(ObservationBatch(obs, key))
         self.memory[key] = Memory()
     self.lr = lr
     self.gamma = gamma
     self.clip_param = clip_param
     self.entropy_coef = entropy_coef
     self.value_loss_coef = value_loss_coef
     self.optimizers = dict()
     for key, value in self.models.items():
         self.optimizers[key] = torch.optim.Adam(value.parameters(),
                                                 lr=self.lr)
Example #2
 def __init__(self, env_config, force_dense_logging: bool = False):
     self.env: BaseEnvironment = foundation.make_env_instance(**{
         **ENV_CONF_DEFAULT,
         **env_config,
     })
     self.observation_space = OBS_SPACE_AGENT
     self.action_space = ACT_SPACE_AGENT
     self.force_dense_logging = force_dense_logging
Example #3
    def __init__(self, env_config, verbose=False):
        self.env_config_dict = env_config["env_config_dict"]

        # Adding env id in the case of multiple environments
        if hasattr(env_config, "worker_index"):
            self.env_id = (
                env_config["num_envs_per_worker"] * (env_config.worker_index - 1)
            ) + env_config.vector_index
        else:
            self.env_id = None

        self.env = foundation.make_env_instance(**self.env_config_dict)
        self.verbose = verbose
        self.sample_agent_idx = str(self.env.all_agents[0].idx)

        obs = self.env.reset()

        self.observation_space = self._dict_to_spaces_dict(obs["0"])
        self.observation_space_pl = self._dict_to_spaces_dict(obs["p"])

        if self.env.world.agents[0].multi_action_mode:
            self.action_space = spaces.MultiDiscrete(
                self.env.get_agent(self.sample_agent_idx).action_spaces
            )
            self.action_space.dtype = np.int64
            self.action_space.nvec = self.action_space.nvec.astype(np.int64)

        else:
            self.action_space = spaces.Discrete(
                self.env.get_agent(self.sample_agent_idx).action_spaces
            )
            self.action_space.dtype = np.int64

        if self.env.world.planner.multi_action_mode:
            self.action_space_pl = spaces.MultiDiscrete(
                self.env.get_agent("p").action_spaces
            )
            self.action_space_pl.dtype = np.int64
            self.action_space_pl.nvec = self.action_space_pl.nvec.astype(np.int64)

        else:
            self.action_space_pl = spaces.Discrete(
                self.env.get_agent("p").action_spaces
            )
            self.action_space_pl.dtype = np.int64

        self._seed = None
        if self.verbose:
            print("[EnvWrapper] Spaces")
            print("[EnvWrapper] Obs (a)   ")
            pretty_print(self.observation_space)
            print("[EnvWrapper] Obs (p)   ")
            pretty_print(self.observation_space_pl)
            print("[EnvWrapper] Action (a)", self.action_space)
            print("[EnvWrapper] Action (p)", self.action_space_pl)
Example #4
    def rollout(self, key, n_rollouts, num_steps):
        env = foundation.make_env_instance(**self.env_config)
        t = trange(n_rollouts, desc='Rollout')
        for rollout in t:
            obs = env.reset()
            states, actions, logprobs, rewards, values, done, hcs = [], [], [], [], [], [], []
            hc = (torch.zeros(1,
                              len(key),
                              self.models[key].lstm_size,
                              device=self.device),
                  torch.zeros(1,
                              len(key),
                              self.models[key].lstm_size,
                              device=self.device))
            for step in range(num_steps):
                obs_batch = ObservationBatch(obs, key)
                hcs.append(hc)
                dist, value, hc = self.models[key](obs_batch, hc)
                hc = (hc[0].detach(), hc[1].detach())
                a = dist.sample().detach()
                value = value.squeeze()
                logprob = dist.log_prob(a).detach()
                action_dict = dict(
                    (i, a.detach().cpu().numpy())
                    for i, a in zip(obs_batch.order, a.argmax(-1)))
                next_obs, rew, is_done, info = env.step(action_dict)
                states.append(obs_batch)
                actions.append(a.argmax(-1).detach())
                logprobs.append(logprob)
                rewards.append(np.array([rew[k] for k in key]))
                values.append(value)
                done.append(is_done['__all__'])

                obs = next_obs

            obs_batch = ObservationBatch(obs, key)
            _, next_value, hc = self.models[key](obs_batch, hc)
            next_value = next_value.detach().cpu().numpy()
            values = torch.stack(values).detach().cpu().numpy()
            discounted_rewards = self.compute_gae(next_value, rewards, done,
                                                  values)
            advantage = (discounted_rewards - values).tolist()
            discounted_rewards = discounted_rewards.tolist()
            self.memory[key].add_trace(states, actions, logprobs,
                                       discounted_rewards, advantage, hcs)
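compute_gae is not included in this excerpt. The standalone sketch below shows one way the discounted returns used above could be computed with the standard GAE(lambda) recursion, matching the argument order (next_value, rewards, done, values); the lam parameter and the array handling are assumptions, not the original implementation.

import numpy as np

def compute_gae(next_value, rewards, dones, values, gamma=0.998, lam=0.95):
    # Sketch (assumption): standard GAE(lambda). `values` has shape
    # (num_steps, n_agents); `next_value` is the bootstrap value estimate.
    values = np.concatenate([values, np.reshape(next_value, (1, -1))], axis=0)
    gae = 0.0
    returns = []
    for t in reversed(range(len(rewards))):
        mask = 1.0 - float(dones[t])
        delta = rewards[t] + gamma * values[t + 1] * mask - values[t]
        gae = delta + gamma * lam * mask * gae
        returns.insert(0, gae + values[t])
    return np.asarray(returns)
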
Example #5
    def set_env_config(self):
        """Set up a sample environment config"""
        self.env_config = {
            # ===== STANDARD ARGUMENTS ======
            "n_agents": 4,  # Number of non-planner agents
            "world_size": [15, 15],  # [Height, Width] of the env world
            "episode_length": 1000,  # Number of time-steps per episode
            # In multi-action-mode, the policy selects an action for each action
            # subspace (defined in component code)
            # Otherwise, the policy selects only 1 action
            "multi_action_mode_agents": False,
            "multi_action_mode_planner": True,
            # When flattening observations, concatenate scalar & vector observations
            # before output
            # Otherwise, return observations with minimal processing
            "flatten_observations": False,
            # When Flattening masks, concatenate each action subspace mask
            # into a single array
            # Note: flatten_masks = True is recommended for masking action logits
            "flatten_masks": True,
            # ===== COMPONENTS =====
            # Which components to use
            "components": [
                # (1) Building houses
                {"Build": {}},
                # (2) Trading collectible resources
                {"ContinuousDoubleAuction": {"max_num_orders": 5}},
                # (3) Movement and resource collection
                {"Gather": {}},
            ],
            # ===== SCENARIO =====
            # Which scenario class to use
            "scenario_name": "uniform/simple_wood_and_stone",
            # (optional) kwargs of the chosen scenario class
            "starting_agent_coin": 10,
            "starting_stone_coverage": 0.10,
            "starting_wood_coverage": 0.10,
        }

        # Create an environment instance from the config
        self.env = foundation.make_env_instance(**self.env_config)
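Given an environment built from a config like the one above, each step needs one action per agent that respects the action mask returned in the observations. The following is a minimal sketch following the pattern used in the Foundation tutorials; the helper name and the masking details are assumptions, and env_config is assumed to hold a dict like the one built above.

import numpy as np

def sample_random_action(agent, mask):
    # Sketch (assumption): sample uniformly among the actions the mask allows.
    if agent.multi_action_mode:
        # One action per action subspace; the flat mask is split per subspace.
        split_masks = np.split(mask, np.cumsum(agent.action_spaces)[:-1])
        return [np.random.choice(len(m), p=m / m.sum()) for m in split_masks]
    return np.random.choice(agent.action_spaces, p=mask / mask.sum())

env = foundation.make_env_instance(**env_config)
obs = env.reset()
actions = {
    agent_idx: sample_random_action(env.get_agent(agent_idx), agent_obs["action_mask"])
    for agent_idx, agent_obs in obs.items()
}
obs, rewards, done, info = env.step(actions)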
Example #6
        {'Gather': {}},
    ],
    
    # ===== SCENARIO =====
    # Which scenario class to use (specified by the class's name in the Scenario Registry)
    'scenario_name': 'uniform/simple_wood_and_stone',
    
    # (optional) kwargs of the chosen scenario class
    'starting_agent_coin': 10,
    'starting_stone_coverage': 0.10,
    'starting_wood_coverage':  0.10,
}



env = foundation.make_env_instance(**env_config)
obs = env.reset()

uniform_cls = scenario_registry.get(env_config['scenario_name'])
isinstance(env, uniform_cls)      # True: env was built from this scenario class
isinstance(env, BaseEnvironment)  # True: every scenario subclasses BaseEnvironment

@scenario_registry.add
class EmptyScenario(BaseEnvironment):
    name = "Empty"
    required_entities = []

    def reset_layout(self):
        """Resets the state of the world object (self.world)."""
        pass
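Once decorated with @scenario_registry.add, the new scenario can be looked up by its name attribute just like the built-in one at the top of this example. A complete scenario would likely also have to implement BaseEnvironment's other abstract hooks before make_env_instance can build it; only reset_layout is shown here.

# Registry lookup mirrors the one used for the built-in scenario above.
empty_cls = scenario_registry.get("Empty")
issubclass(empty_cls, BaseEnvironment)   # True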
Example #7
def main():
    log_dir = Path(__file__).parent / f"exp{int(time.time())}"
    log_dir.mkdir()
    env = foundation.make_env_instance(**env_config)
    state = env.reset()

    if random_seed:
        torch.manual_seed(random_seed)
        env.seed(random_seed)

    memory = [Memory() for _ in range(env.n_agents)]

    action_dim = state['0'][
        "action_mask"].size  # TODO: the action mask marks actions that cannot be taken
    state_dim = state['0']["flat"].size

    ppo = PPO(state_dim, action_dim, n_latent_var, lr, betas, gamma, K_epochs,
              eps_clip, device)

    # logging variables
    running_reward = 0
    time_step = 0

    # training loop
    for i_episode in range(1, max_episodes + 1):
        if i_episode % graphical_episode_log_frequency == 0:
            obs = env.reset(force_dense_logging=True)
        else:
            obs = env.reset()
        # Keep `state` in sync with the freshly reset observations so the
        # policies below do not act on observations from the previous episode.
        state = obs

        for t in range(env.episode_length):
            time_step += 1
            actions = sample_random_actions(
                env, obs
            )  # Initialize dict with random actions then fill with selected values

            for agent_id in range(env.n_agents):
                agent_id_str = str(agent_id)
                memory_agent = memory[agent_id]
                action_mask = torch.tensor(state[agent_id_str]["action_mask"],
                                           device=device)
                agent_state = state[agent_id_str]["flat"]
                action = ppo.policy_old.act(agent_state, memory_agent,
                                            action_mask)
                actions[agent_id_str] = action

            state, reward, done, info = env.step(actions)
            # Saving reward and is_terminals:
            for agent_id in range(env.n_agents):
                agent_id_str = str(agent_id)
                memory_agent = memory[agent_id]
                agent_reward = -reward[agent_id_str]
                memory_agent.rewards.append(agent_reward)
                memory_agent.is_terminals.append(done)
                running_reward += agent_reward

                # update if its time
                if time_step % update_timestep == 0:
                    ppo.update(memory_agent)
                    memory_agent.clear_memory()
                    time_step = 0
            if done['__all__']:
                break

        # save a checkpoint every 100 episodes
        if i_episode % 100 == 0:
            torch.save(ppo.policy.state_dict(),
                       log_dir / f'ckpt-{i_episode}.pth')

        # logging
        if i_episode % log_interval == 0:
            running_reward = float(running_reward / log_interval)

            print(f'Episode {i_episode} \t Avg reward: {running_reward}')
            running_reward = 0

        if i_episode % graphical_episode_log_frequency == 0:
            dense_log = env.previous_episode_dense_log
            (fig0, fig1,
             fig2), incomes, endows, c_trades, all_builds = plotting.breakdown(
                 dense_log)
            print(f"{incomes=}, {endows=}, {c_trades=}, {all_builds=}")
            # fig0.savefig(log_dir / f"fig0-{i_episode}.png", dpi=fig0.dpi)
            fig1.savefig(log_dir / f"fig1-{i_episode:04d}.png", dpi=fig1.dpi)
            fig2.savefig(log_dir / f"fig2-{i_episode:04d}.png", dpi=fig2.dpi)
            plt.close(fig0)
            plt.close(fig1)
            plt.close(fig2)
            with open(log_dir / f'logs-{i_episode:04d}.pickle',
                      'wb') as handle:
                pickle.dump(dense_log,
                            handle,
                            protocol=pickle.HIGHEST_PROTOCOL)
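
The Memory buffer used throughout the loop is defined elsewhere in the original script. Below is a minimal sketch that reproduces only the attributes the loop above touches (rewards, is_terminals, clear_memory); the remaining fields are assumptions based on the usual PPO rollout buffer.

class Memory:
    # Sketch (assumption): a plain PPO rollout buffer. Only `rewards`,
    # `is_terminals` and `clear_memory()` are used explicitly above; the other
    # lists are presumably filled inside ppo.policy_old.act().
    def __init__(self):
        self.states = []
        self.actions = []
        self.logprobs = []
        self.rewards = []
        self.is_terminals = []

    def clear_memory(self):
        del self.states[:]
        del self.actions[:]
        del self.logprobs[:]
        del self.rewards[:]
        del self.is_terminals[:]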