def __init__(self, env_name='harvest', num_agents=1):
    self.env_name = env_name
    if env_name == 'harvest':
        print('Initializing Harvest environment')
        self.env = HarvestEnv(ascii_map=HARVEST_MAP_CPR, num_agents=num_agents, render=True)
    elif env_name == 'cleanup':
        print('Initializing Cleanup environment')
        self.env = CleanupEnv(num_agents=num_agents, render=True)
    else:
        print('Error! Not a valid environment type')
        return

    self.num_agents = num_agents
    self.agent_policies = []
    self.agents = list(self.env.agents.values())
    # print(agents[0].action_space)
    self.action_dim = self.agents[0].action_space.n
    for _ in range(num_agents):
        # TODO: only a single frame is used right now; later, stack the last x
        # (e.g. 4) frames or switch to an RNN/LSTM.
        neural_net = ConvFC(
            conv_in_channels=3,  # Harvest-specific: input is 15x15x3 (HARVEST_VIEW_SIZE = 7)
            conv_out_channels=3,
            input_size=15,
            hidden_size=64,
            output_size=self.action_dim)
        self.agent_policies.append(
            DQNAgent(0, self.action_dim - 1, neural_net))
    self.env.reset()
def test_harvest_map():
    env = HarvestEnv(ascii_map=MINI_HARVEST_MAP, num_agents=1)
    env.reset()
    agents = list(env.agents.values())
    action_dim = agents[0].action_space.n
    for i in range(action_dim):
        env.step({'agent-0': i})
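# A minimal random-rollout sketch building on the test above; it is an
# illustration under assumptions, not code from this repo. It assumes the
# usual multi-agent convention that env.step takes a {agent_id: action} dict
# and returns per-agent observation/reward/done/info dicts, and the import
# paths below are guesses to be adjusted to wherever HarvestEnv and
# MINI_HARVEST_MAP actually live.
from social_dilemmas.envs.harvest import HarvestEnv  # assumed path
from social_dilemmas.constants import MINI_HARVEST_MAP  # assumed path


def random_rollout(num_steps=100):
    env = HarvestEnv(ascii_map=MINI_HARVEST_MAP, num_agents=1)
    env.reset()
    agent = list(env.agents.values())[0]
    total_reward = 0.0
    for _ in range(num_steps):
        action = agent.action_space.sample()
        obs, rewards, dones, infos = env.step({'agent-0': action})
        total_reward += rewards.get('agent-0', 0.0)
        if dones.get('__all__', False):
            env.reset()
    return total_reward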
def __init__(self, env_name='cleanup'):
    self.env_name = env_name
    if env_name == 'harvest':
        print('Initializing Harvest environment')
        self.env = HarvestEnv(num_agents=5, render=True)
    elif env_name == 'cleanup':
        print('Initializing Cleanup environment')
        self.env = CleanupEnv(num_agents=5, render=True)
    else:
        print('Error! Not a valid environment type')
        return
    self.env.reset()
def __init__(self, args):
    self.env_name = args.env
    if self.env_name == "harvest":
        print("Initializing Harvest environment")
        self.env = HarvestEnv(num_agents=5)
    elif self.env_name == "cleanup":
        print("Initializing Cleanup environment")
        self.env = CleanupEnv(num_agents=5)
    elif self.env_name == "switch":
        print("Initializing Switch environment")
        self.env = SwitchEnv(args, num_agents=1)
    else:
        print("Error! Not a valid environment type")
        return
    self.env.reset()
def env_creator(_):
    ascii_map = HARVEST_MAP
    if harvest_map == 'tiny':
        ascii_map = HARVEST_MAP_TINY
    elif harvest_map == 'toy':
        ascii_map = HARVEST_MAP_TOY
    elif harvest_map == 'cpr':  # note: only single-agent at present
        ascii_map = HARVEST_MAP_CPR
    elif harvest_map == 'big':
        ascii_map = HARVEST_MAP_BIG
    created_env = HarvestEnv(ascii_map=ascii_map,
                             num_agents=num_agents,
                             ir_param_list=ir_param_list,
                             hit_penalty=hit_penalty,
                             fire_cost=fire_cost)
    return created_env
def env_creator(env_config=None):
    # Assumes env_config is a dict providing "num_agents"; the default of None
    # will raise if the creator is called without a config.
    num_agents = env_config["num_agents"]
    return HarvestEnv(num_agents=num_agents)
def env_creator(env_config):
    return HarvestEnv(env_config)
def setup(env, hparams, algorithm, train_batch_size, num_cpus, num_gpus,
          num_agents, num_symbols, grid_search, use_gpus_for_workers=False,
          use_gpu_for_driver=False, num_workers_per_device=1):
    obs_space = None
    act_space = None
    if env == 'harvest':
        obs_space = HarvestEnv.observation_space(num_agents, num_symbols)
        act_space = HarvestEnv.action_space(num_agents, num_symbols)

        def env_creator(env_config):
            return HarvestEnv(env_config)
    else:
        obs_space = CleanupEnv.observation_space(num_agents, num_symbols)
        act_space = CleanupEnv.action_space(num_agents, num_symbols)

        def env_creator(env_config):
            return CleanupEnv(env_config)

    env_name = env + "_env"
    register_env(env_name, env_creator)

    # register the custom model
    ModelCatalog.register_custom_model(MODEL_NAME, ObedienceLSTM)

    # Each policy can have a different configuration (including custom model)
    def gen_policy():
        return None, obs_space, act_space, {'custom_model': MODEL_NAME}

    # Set up an ensemble of `num_agents` different policy graphs
    policy_graphs = {}
    for i in range(num_agents):
        policy_graphs['agent-' + str(i)] = gen_policy()

    def policy_mapping_fn(agent_id):
        return agent_id

    # gets the A3C trainer and its default config
    # source at https://github.com/ray-project/ray/blob/d537e9f0d8b84414a2aba7a7d0a68d59241f1490/rllib/agents/a3c/a3c.py
    agent_cls = get_agent_class(algorithm)
    config = agent_cls._default_config.copy()

    # information for replay
    config['env_config']['func_create'] = env_creator
    config['env_config']['env_name'] = env_name
    # config['env_config']['run'] = algorithm
    config['callbacks']['on_postprocess_traj'] = on_postprocess_traj

    # Calculate device configurations
    gpus_for_driver = int(use_gpu_for_driver)
    cpus_for_driver = 1 - gpus_for_driver
    if use_gpus_for_workers:
        spare_gpus = (num_gpus - gpus_for_driver)
        num_workers = int(spare_gpus * num_workers_per_device)
        num_gpus_per_worker = spare_gpus / num_workers
        num_cpus_per_worker = 0
    else:
        spare_cpus = (num_cpus - cpus_for_driver)
        num_workers = int(spare_cpus * num_workers_per_device)
        num_gpus_per_worker = 0
        num_cpus_per_worker = spare_cpus / num_workers

    # hyperparams
    # NOTE: the "env_config" entry below replaces the env_config keys
    # (func_create / env_name) set above.
    config.update({
        "train_batch_size": train_batch_size,
        "sample_batch_size": 50,
        # "batch_mode": "complete_episodes",
        # "metrics_smoothing_episodes": 1,
        "vf_loss_coeff": 0.1,
        "horizon": 1000,
        "gamma": 0.99,
        "lr_schedule": [[0, hparams['lr_init']],
                        [20000000, hparams['lr_final']]],
        "num_workers": num_workers,
        "num_gpus": num_gpus,  # The number of GPUs for the driver
        "num_cpus_for_driver": cpus_for_driver,
        "num_gpus_per_worker": num_gpus_per_worker,  # Can be a fraction
        "num_cpus_per_worker": num_cpus_per_worker,  # Can be a fraction
        "entropy_coeff": hparams['entropy_coeff'],
        "multiagent": {
            "policies": policy_graphs,
            "policy_mapping_fn": policy_mapping_fn,
        },
        "model": {
            "custom_model": MODEL_NAME,
            # "custom_preprocessor": "nothing",
            "use_lstm": False,
            "custom_options": {
                "num_agents": num_agents,
                "num_symbols": num_symbols,
                "fcnet_hiddens": [32, 32],
                "cell_size": 128,
            },
            "conv_filters": [[6, [3, 3], 1]],
            # "lstm_cell_size": 128
            # conv filters??
        },
        "env_config": {
            "num_agents": num_agents,
            "num_symbols": num_symbols,
            "obedience_weight": .001,
            "leadership_weight": .001,
        },
    })
    if args.algorithm == "PPO":
        config.update({
            "num_sgd_iter": 10,
            "sgd_minibatch_size": 500,
            "vf_loss_coeff": 1e-4
        })
    if args.grid_search:
        pass
    return algorithm, env_name, config
def env_creator(_):
    return HarvestEnv(num_agents=num_agents)
def setup(env, hparams, algorithm, train_batch_size, num_cpus, num_gpus,
          num_agents, use_gpus_for_workers=False, use_gpu_for_driver=False,
          num_workers_per_device=1):
    if env == 'harvest':
        def env_creator(_):
            return HarvestEnv(num_agents=num_agents)
        single_env = HarvestEnv()
    elif env == "harvest_comm":
        def env_creator(_):
            return HarvestCommEnv(num_agents=num_agents)
        single_env = HarvestCommEnv()
    else:
        def env_creator(_):
            return CleanupEnv(num_agents=num_agents)
        single_env = CleanupEnv()

    env_name = env + "_env"
    register_env(env_name, env_creator)

    obs_space = single_env.observation_space
    act_space = single_env.action_space

    # Each policy can have a different configuration (including custom model)
    def gen_policy():
        return (PPOPolicyGraph, obs_space, act_space, {})

    # Set up PPO with an ensemble of `num_agents` different policy graphs
    policy_graphs = {}
    for i in range(num_agents):
        policy_graphs['agent-' + str(i)] = gen_policy()

    def policy_mapping_fn(agent_id):
        return agent_id

    # register the custom model
    model_name = "conv_to_fc_net"
    ModelCatalog.register_custom_model(model_name, ConvToFCNet)

    agent_cls = get_agent_class(algorithm)
    config = agent_cls._default_config.copy()

    # information for replay
    config['env_config']['func_create'] = tune.function(env_creator)
    config['env_config']['env_name'] = env_name
    config['env_config']['run'] = algorithm

    # Calculate device configurations
    gpus_for_driver = int(use_gpu_for_driver)
    cpus_for_driver = 1 - gpus_for_driver
    if use_gpus_for_workers:
        spare_gpus = (num_gpus - gpus_for_driver)
        num_workers = int(spare_gpus * num_workers_per_device)
        num_gpus_per_worker = spare_gpus / num_workers
        num_cpus_per_worker = 0
    else:
        spare_cpus = (num_cpus - cpus_for_driver)
        num_workers = int(spare_cpus * num_workers_per_device)
        num_gpus_per_worker = 0
        num_cpus_per_worker = spare_cpus / num_workers

    # hyperparams
    config.update({
        "train_batch_size": train_batch_size,
        "horizon": 1000,
        "lr_schedule": [[0, hparams['lr_init']],
                        [20000000, hparams['lr_final']]],
        "num_workers": num_workers,
        "num_gpus": gpus_for_driver,  # The number of GPUs for the driver
        "num_cpus_for_driver": cpus_for_driver,
        "num_gpus_per_worker": num_gpus_per_worker,  # Can be a fraction
        "num_cpus_per_worker": num_cpus_per_worker,  # Can be a fraction
        "entropy_coeff": hparams['entropy_coeff'],
        "multiagent": {
            "policy_graphs": policy_graphs,
            "policy_mapping_fn": tune.function(policy_mapping_fn),
        },
        "model": {
            "custom_model": "conv_to_fc_net",
            "use_lstm": True,
            "lstm_cell_size": 128
        }
    })
    return algorithm, env_name, config
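# A hedged sketch of how the (algorithm, env_name, config) triple returned by
# setup() above might be launched with Ray Tune; the experiment name, stop
# criterion, and hparams values are placeholders rather than settings taken
# from this repo.
import ray
from ray import tune

if __name__ == '__main__':
    ray.init(num_cpus=2)
    hparams = {'lr_init': 5e-4, 'lr_final': 5e-5, 'entropy_coeff': 1e-3}  # placeholder values
    alg_run, env_name, config = setup('harvest', hparams, 'PPO',
                                      train_batch_size=1000, num_cpus=2,
                                      num_gpus=0, num_agents=5)
    config['env'] = env_name  # point RLlib at the registered environment
    tune.run_experiments({
        'harvest_ppo_test': {  # placeholder experiment name
            'run': alg_run,
            'config': config,
            'stop': {'training_iteration': 10},
            'checkpoint_freq': 5,
        }
    })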
def env_creator(_):
    return HarvestEnv(
        num_agents=num_agents,
        return_agent_actions=True,
        use_collective_reward=args.use_collective_reward,
    )
def __call__(self):
    return HarvestEnv(ascii_map=MINI_HARVEST_MAP, num_agents=self._num_agents)