def _setup_runner(self):
    self.status = Trial.RUNNING
    trainable_cls = get_registry().get(
        TRAINABLE_CLASS, self.trainable_name)
    cls = ray.remote(
        num_cpus=self.resources.driver_cpu_limit,
        num_gpus=self.resources.driver_gpu_limit)(trainable_cls)
    if not self.result_logger:
        if not os.path.exists(self.local_dir):
            os.makedirs(self.local_dir)
        self.logdir = tempfile.mkdtemp(
            prefix="{}_{}".format(
                str(self)[:MAX_LEN_IDENTIFIER], date_str()),
            dir=self.local_dir)
        self.result_logger = UnifiedLogger(
            self.config, self.logdir, self.upload_dir)
    remote_logdir = self.logdir

    def logger_creator(config):
        # Set the working dir in the remote process, for user file writes
        if not os.path.exists(remote_logdir):
            os.makedirs(remote_logdir)
        os.chdir(remote_logdir)
        return NoopLogger(config, remote_logdir)

    # Logging for trials is handled centrally by TrialRunner, so
    # configure the remote runner to use a noop-logger.
    self.runner = cls.remote(
        config=self.config, registry=get_registry(),
        logger_creator=logger_creator)
def testGymPreprocessors(self):
    p1 = ModelCatalog.get_preprocessor(
        get_registry(), gym.make("CartPole-v0"))
    assert type(p1) == NoPreprocessor

    p2 = ModelCatalog.get_preprocessor(
        get_registry(), gym.make("FrozenLake-v0"))
    assert type(p2) == OneHotPreprocessor
def testGymPreprocessors(self):
    p1 = ModelCatalog.get_preprocessor(
        get_registry(), gym.make("CartPole-v0"))
    self.assertEqual(type(p1), NoPreprocessor)

    p2 = ModelCatalog.get_preprocessor(
        get_registry(), gym.make("FrozenLake-v0"))
    self.assertEqual(type(p2), OneHotPreprocessor)
def testDefaultModels(self):
    ray.init()

    with tf.variable_scope("test1"):
        p1 = ModelCatalog.get_model(
            get_registry(), np.zeros((10, 3), dtype=np.float32), 5)
        self.assertEqual(type(p1), FullyConnectedNetwork)

    with tf.variable_scope("test2"):
        p2 = ModelCatalog.get_model(
            get_registry(), np.zeros((10, 80, 80, 3), dtype=np.float32), 5)
        self.assertEqual(type(p2), VisionNetwork)
def testDefaultModels(self):
    ray.init()

    with tf.variable_scope("test1"):
        p1 = ModelCatalog.get_model(
            get_registry(), np.zeros((10, 3), dtype=np.float32), 5)
        assert type(p1) == FullyConnectedNetwork

    with tf.variable_scope("test2"):
        p2 = ModelCatalog.get_model(
            get_registry(), np.zeros((10, 80, 80, 3), dtype=np.float32), 5)
        assert type(p2) == VisionNetwork
def testCustomPreprocessor(self):
    ray.init()
    ModelCatalog.register_custom_preprocessor("foo", CustomPreprocessor)
    ModelCatalog.register_custom_preprocessor("bar", CustomPreprocessor2)
    env = gym.make("CartPole-v0")
    p1 = ModelCatalog.get_preprocessor(
        get_registry(), env, {"custom_preprocessor": "foo"})
    self.assertEqual(str(type(p1)), str(CustomPreprocessor))
    p2 = ModelCatalog.get_preprocessor(
        get_registry(), env, {"custom_preprocessor": "bar"})
    self.assertEqual(str(type(p2)), str(CustomPreprocessor2))
    p3 = ModelCatalog.get_preprocessor(get_registry(), env)
    self.assertEqual(type(p3), NoPreprocessor)
def testCustomPreprocessor(self):
    ray.init()
    ModelCatalog.register_custom_preprocessor("foo", CustomPreprocessor)
    ModelCatalog.register_custom_preprocessor("bar", CustomPreprocessor2)
    env = gym.make("CartPole-v0")
    p1 = ModelCatalog.get_preprocessor(
        get_registry(), env, {"custom_preprocessor": "foo"})
    assert type(p1) == CustomPreprocessor
    p2 = ModelCatalog.get_preprocessor(
        get_registry(), env, {"custom_preprocessor": "bar"})
    assert type(p2) == CustomPreprocessor2
    p3 = ModelCatalog.get_preprocessor(get_registry(), env)
    assert type(p3) == NoPreprocessor
def __init__(self, config={}, env=None, registry=get_registry(),
             logger_creator=None):
    """Initialize an RLLib agent.

    Args:
        config (dict): Algorithm-specific configuration data.
        env (str): Name of the environment to use. Note that this can also
            be specified as the `env` key in config.
        registry (obj): Object registry for user-defined envs, models, etc.
            If unspecified, it will be assumed empty.
        logger_creator (func): Function that creates a ray.tune.Logger
            object. If unspecified, a default logger is created.
    """
    self._initialize_ok = False
    self._experiment_id = uuid.uuid4().hex
    env = env or config.get("env")
    if env:
        config["env"] = env
        if registry and registry.contains(ENV_CREATOR, env):
            self.env_creator = registry.get(ENV_CREATOR, env)
        else:
            import gym  # soft dependency
            self.env_creator = lambda env_config: gym.make(env)
    else:
        self.env_creator = lambda env_config: None
    self.config = self._default_config.copy()
    self.registry = registry
    self.config = _deep_update(self.config, config,
                               self._allow_unknown_configs,
                               self._allow_unknown_subkeys)

    if logger_creator:
        self._result_logger = logger_creator(self.config)
        self.logdir = self._result_logger.logdir
    else:
        logdir_suffix = "{}_{}_{}".format(
            env, self._agent_name,
            datetime.today().strftime("%Y-%m-%d_%H-%M-%S"))
        if not os.path.exists(DEFAULT_RESULTS_DIR):
            os.makedirs(DEFAULT_RESULTS_DIR)
        self.logdir = tempfile.mkdtemp(prefix=logdir_suffix,
                                       dir=DEFAULT_RESULTS_DIR)
        self._result_logger = UnifiedLogger(self.config, self.logdir, None)

    self._iteration = 0
    self._time_total = 0.0
    self._timesteps_total = 0

    with tf.Graph().as_default():
        self._init()

    self._initialize_ok = True
def setUp(self):
    ray.init(num_cpus=1)
    config = DEFAULT_CONFIG.copy()
    config["num_workers"] = 1
    config["observation_filter"] = "ConcurrentMeanStdFilter"
    config["reward_filter"] = "MeanStdFilter"
    config["batch_size"] = 2
    self._temp_dir = tempfile.mkdtemp("a3c_evaluator_test")
    self.e = A3CEvaluator(get_registry(),
                          lambda config: gym.make("CartPole-v0"),
                          config,
                          logdir=self._temp_dir)
def _setup_runner(self):
    self.status = Trial.RUNNING
    trainable_cls = get_registry().get(TRAINABLE_CLASS, self.trainable_name)
    cls = ray.remote(
        num_cpus=self.resources.driver_cpu_limit,
        num_gpus=self.resources.driver_gpu_limit)(trainable_cls)
    if not self.result_logger:
        if not os.path.exists(self.local_dir):
            os.makedirs(self.local_dir)
        self.logdir = tempfile.mkdtemp(prefix=str(self), dir=self.local_dir)
        self.result_logger = UnifiedLogger(self.config, self.logdir,
                                           self.upload_dir)
    remote_logdir = self.logdir
    # Logging for trials is handled centrally by TrialRunner, so
    # configure the remote runner to use a noop-logger.
    self.runner = cls.remote(
        config=self.config, registry=get_registry(),
        logger_creator=lambda config: NoopLogger(config, remote_logdir))
def testTuplePreprocessor(self):
    ray.init()

    class TupleEnv(object):
        def __init__(self):
            self.observation_space = Tuple(
                [Discrete(5), Box(0, 1, shape=(3,), dtype=np.float32)])

    p1 = ModelCatalog.get_preprocessor(
        get_registry(), TupleEnv())
    self.assertEqual(p1.shape, (8,))
    self.assertEqual(
        list(p1.transform((0, [1, 2, 3]))),
        [float(x) for x in [1, 0, 0, 0, 0, 1, 2, 3]])
def setUp(self):
    ray.init(num_cpus=1)
    config = DEFAULT_CONFIG.copy()
    config["num_workers"] = 1
    config["observation_filter"] = "ConcurrentMeanStdFilter"
    config["reward_filter"] = "MeanStdFilter"
    config["batch_size"] = 2
    self._temp_dir = tempfile.mkdtemp("a3c_evaluator_test")
    self.e = A3CEvaluator(
        get_registry(),
        lambda config: gym.make("CartPole-v0"),
        config,
        logdir=self._temp_dir)
def __init__(
        self, config={}, env=None, registry=get_registry(),
        logger_creator=None):
    """Initialize an RLLib agent.

    Args:
        config (dict): Algorithm-specific configuration data.
        env (str): Name of the environment to use. Note that this can also
            be specified as the `env` key in config.
        registry (obj): Object registry for user-defined envs, models, etc.
            If unspecified, the default registry will be used.
        logger_creator (func): Function that creates a ray.tune.Logger
            object. If unspecified, a default logger is created.
    """
    # Agents allow env ids to be passed directly to the constructor.
    self._env_id = env or config.get("env")
    Trainable.__init__(self, config, registry, logger_creator)
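A minimal usage sketch, not part of the source: the docstring above notes that the environment can be given either as the `env` constructor argument or as the `env` key inside `config`. Assuming `ppo` refers to the `ray.rllib.ppo` module used elsewhere in these examples, the two calls below would be equivalent ways of selecting the environment:

    import ray
    import ray.rllib.ppo as ppo
    from ray.tune.registry import get_registry

    ray.init()

    # Pass the env id directly to the constructor ...
    agent = ppo.PPOAgent(env="CartPole-v0", registry=get_registry())

    # ... or supply it through the config dict instead.
    agent = ppo.PPOAgent(config={"env": "CartPole-v0"},
                         registry=get_registry())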
def get_compute_action_rllib(path_to_dir, checkpoint_num, alg):
    """Collect the compute_action method from RLlib's serialized files.

    Parameters
    ----------
    path_to_dir : str
        RLlib directory containing training results
    checkpoint_num : int
        checkpoint number / training iteration of the learned policy
    alg : str
        name of the RLlib algorithm that was used during the training
        procedure

    Returns
    -------
    method
        the compute_action method from the algorithm along with the trained
        parameters
    """
    # collect the configuration information from the RLlib checkpoint
    result_dir = path_to_dir if path_to_dir[-1] != '/' else path_to_dir[:-1]
    config = get_rllib_config(result_dir)

    # run on only one cpu for rendering purposes
    ray.init(num_cpus=1)
    config["num_workers"] = 1

    # create and register a gym+rllib env
    flow_params = get_flow_params(config)
    create_env, env_name = make_create_env(
        params=flow_params, version=9999, render=False)
    register_env(env_name, create_env)

    # recreate the agent
    agent_cls = get_agent_class(alg)
    agent = agent_cls(env=env_name, registry=get_registry(), config=config)

    # restore the trained parameters into the policy
    checkpoint = result_dir + '/checkpoint-{}'.format(checkpoint_num)
    agent._restore(checkpoint)

    return agent.compute_action
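A hedged usage sketch, not from the source; the results directory and checkpoint number are placeholders, and `env` stands in for an environment built from the same flow parameters as the training run:

    # Placeholder path and checkpoint number; point these at a real RLlib
    # results directory produced by training.
    compute_action = get_compute_action_rllib(
        "~/ray_results/my_flow_experiment/", checkpoint_num=100, alg="PPO")

    # Roll out one episode, assuming `env` matches the training environment.
    obs = env.reset()
    done = False
    while not done:
        obs, reward, done, _ = env.step(compute_action(obs))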
def __init__(self, config=None, registry=None, logger_creator=None):
    """Initialize a Trainable.

    Subclasses should prefer defining ``_setup()`` instead of overriding
    ``__init__()`` directly.

    Args:
        config (dict): Trainable-specific configuration data.
        registry (obj): Object registry for user-defined envs, models, etc.
            If unspecified, the default registry will be used.
        logger_creator (func): Function that creates a ray.tune.Logger
            object. If unspecified, a default logger is created.
    """
    if registry is None:
        from ray.tune.registry import get_registry
        registry = get_registry()

    self._initialize_ok = False
    self._experiment_id = uuid.uuid4().hex
    self.config = config or {}
    self.registry = registry

    if logger_creator:
        self._result_logger = logger_creator(self.config)
        self.logdir = self._result_logger.logdir
    else:
        logdir_prefix = datetime.today().strftime("%Y-%m-%d_%H-%M-%S")
        if not os.path.exists(DEFAULT_RESULTS_DIR):
            os.makedirs(DEFAULT_RESULTS_DIR)
        self.logdir = tempfile.mkdtemp(
            prefix=logdir_prefix, dir=DEFAULT_RESULTS_DIR)
        self._result_logger = UnifiedLogger(self.config, self.logdir, None)

    self._iteration = 0
    self._time_total = 0.0
    self._timesteps_total = 0
    self._setup()
    self._initialize_ok = True
    self._local_ip = ray.services.get_node_ip_address()
def __init__(self, config=None, registry=None, logger_creator=None):
    """Initialize a Trainable.

    Subclasses should prefer defining ``_setup()`` instead of overriding
    ``__init__()`` directly.

    Args:
        config (dict): Trainable-specific configuration data.
        registry (obj): Object registry for user-defined envs, models, etc.
            If unspecified, the default registry will be used.
        logger_creator (func): Function that creates a ray.tune.Logger
            object. If unspecified, a default logger is created.
    """
    if registry is None:
        from ray.tune.registry import get_registry
        registry = get_registry()

    self._initialize_ok = False
    self._experiment_id = uuid.uuid4().hex
    self.config = config or {}
    self.registry = registry

    if logger_creator:
        self._result_logger = logger_creator(self.config)
        self.logdir = self._result_logger.logdir
    else:
        logdir_prefix = datetime.today().strftime("%Y-%m-%d_%H-%M-%S")
        if not os.path.exists(DEFAULT_RESULTS_DIR):
            os.makedirs(DEFAULT_RESULTS_DIR)
        self.logdir = tempfile.mkdtemp(prefix=logdir_prefix,
                                       dir=DEFAULT_RESULTS_DIR)
        self._result_logger = UnifiedLogger(self.config, self.logdir, None)

    self._iteration = 0
    self._time_total = 0.0
    self._timesteps_total = 0
    self._setup()
    self._initialize_ok = True
    self._local_ip = ray.services.get_node_ip_address()
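A minimal subclass sketch, not part of the source, following the docstring's advice to define `_setup()` rather than override `__init__()`. The other Trainable hooks (such as the training step itself) are not shown in this section, so only the setup hook is illustrated here:

    class MyTrainable(Trainable):
        def _setup(self):
            # By the time _setup() runs, Trainable.__init__ above has already
            # populated self.config, self.registry, and self.logdir.
            self.iterations_done = 0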
def test_ray(self):
    """
    Integration test for ray/rllib + flow
    """
    # Test 1: test_two_level_ray
    config = ppo.DEFAULT_CONFIG.copy()
    num_workers = 1
    ray.init(num_cpus=num_workers, redirect_output=False)
    config["num_workers"] = num_workers
    config["timesteps_per_batch"] = min(HORIZON * num_workers, 128)
    config["num_sgd_iter"] = 1
    config["model"].update({"fcnet_hiddens": [3, 3]})
    config["gamma"] = 0.999
    config["min_steps_per_task"] = HORIZON
    config["horizon"] = HORIZON
    config["sgd_batchsize"] = 4

    additional_env_params = {
        "target_velocity": 8,
        "scenario_type": LoopScenario
    }
    additional_net_params = {
        "length": 260,
        "lanes": 1,
        "speed_limit": 30,
        "resolution": 40
    }
    vehicle_params = [
        dict(veh_id="rl",
             num_vehicles=1,
             acceleration_controller=(RLController, {}),
             routing_controller=(ContinuousRouter, {})),
        dict(veh_id="idm",
             num_vehicles=21,
             acceleration_controller=(IDMController, {}),
             routing_controller=(ContinuousRouter, {}))
    ]

    flow_params = dict(
        sumo=dict(sim_step=0.1, no_step_log=False),
        env=dict(horizon=HORIZON, additional_params=additional_env_params),
        net=dict(no_internal_links=False,
                 additional_params=additional_net_params),
        veh=vehicle_params,
        initial=dict(spacing="uniform", bunching=30, min_gap=0))

    flow_env_name = "WaveAttenuationPOEnv"
    create_env, env_name = make_create_env(flow_env_name, flow_params, 0)

    # Register as rllib env
    registry.register_env(env_name, create_env)

    alg = ppo.PPOAgent(env=env_name, registry=registry.get_registry(),
                       config=config)
    for i in range(1):
        alg.train()
        checkpoint_path = alg.save()
        self.assertTrue(os.path.exists("%s.index" % checkpoint_path))

    # Test 2: test_two_level_ray
    # Integration test for two-level fcnet policy
    # FIXME(cathywu) ray restart currently not supported, so need to tie
    # integration tests together for the time being.
    # reload(ppo)
    # reload(registry)
    config = ppo.DEFAULT_CONFIG.copy()
    num_workers = 1
    # ray.init(num_cpus=num_workers, redirect_output=True)
    config["num_workers"] = num_workers
    config["timesteps_per_batch"] = min(HORIZON * num_workers, 128)
    config["num_sgd_iter"] = 1
    config["model"].update({"fcnet_hiddens": [3, 3]})
    config["gamma"] = 0.999
    config["min_steps_per_task"] = HORIZON
    config["horizon"] = HORIZON
    config["sgd_batchsize"] = 4
    config["model"].update({"fcnet_hiddens": [5, 3]})
    options = {
        "num_subpolicies": 2,
        "fn_choose_subpolicy": fn_choose_subpolicy,
        "hierarchical_fcnet_hiddens": [[3, 3]] * 2
    }
    config["model"].update({"custom_options": options})
if not args.env:
    if not args.config.get("env"):
        parser.error("the following arguments are required: --env")
    args.env = args.config.get("env")

ray.init()

cls = get_agent_class(args.run)
agent = cls(env=args.env, config=args.config)
agent.restore(args.checkpoint)
num_steps = int(args.steps)

if args.run == "DQN":
    env = gym.make(args.env)
    env = wrap_dqn(get_registry(), env, args.config.get("model", {}))
else:
    env = ModelCatalog.get_preprocessor_as_wrapper(get_registry(),
                                                   gym.make(args.env))
if args.out is not None:
    rollouts = []
steps = 0
while steps < (num_steps or steps + 1):
    if args.out is not None:
        rollout = []
    state = env.reset()
    done = False
    reward_total = 0.0
    while not done and steps < (num_steps or steps + 1):
        action = agent.compute_action(state)
        next_state, reward, done, _ = env.step(action)
def testCustomModel(self):
    ray.init()
    ModelCatalog.register_custom_model("foo", CustomModel)
    p1 = ModelCatalog.get_model(
        get_registry(), 1, 5, {"custom_model": "foo"})
    self.assertEqual(str(type(p1)), str(CustomModel))
config = get_rllib_config(result_dir)

# Run on only one cpu for rendering purposes
ray.init(num_cpus=1)
config["num_workers"] = 1

flow_params = get_flow_params(config)

# Create and register a gym+rllib env
create_env, env_name = make_create_env(
    params=flow_params, version=0, sumo_binary="sumo")
register_env(env_name, create_env)

agent_cls = get_agent_class(args.run)
agent = agent_cls(env=env_name, registry=get_registry(), config=config)
checkpoint = result_dir + '/checkpoint-' + args.checkpoint_num
agent._restore(checkpoint)

# Recreate the scenario from the pickled parameters
exp_tag = flow_params["exp_tag"]
net_params = flow_params['net']
vehicles = flow_params['veh']
initial_config = flow_params['initial']

module = __import__("flow.scenarios", fromlist=[flow_params["scenario"]])
scenario_class = getattr(module, flow_params["scenario"])

module = __import__("flow.scenarios", fromlist=[flow_params["generator"]])
generator_class = getattr(module, flow_params["generator"])

scenario = scenario_class(name=exp_tag,
                          generator_class=generator_class,
"num_subpolicies": 2, "fn_choose_subpolicy": fn_choose_subpolicy, "hierarchical_fcnet_hiddens": [[32, 32]] * 2 } config["model"].update({"custom_options": options}) flow_env_name = "TwoLoopsMergePOEnv" exp_tag = "merge_two_level_policy_example" this_file = os.path.basename(__file__)[:-3] # filename without '.py' flow_params["flowenv"] = flow_env_name flow_params["exp_tag"] = exp_tag flow_params["module"] = os.path.basename(__file__)[:-3] config['model']['custom_options'].update({ 'flowenv': flow_env_name, 'exp_tag': exp_tag, 'module': this_file }) create_env, env_name = make_create_env(flow_env_name, flow_params, version=0, exp_tag=exp_tag) # Register as rllib env register_rllib_env(env_name, create_env) alg = ppo.PPOAgent(env=env_name, registry=get_registry(), config=config) for i in range(2): alg.train() if i % 20 == 0: alg.save() # save checkpoint
def __init__(self, env_creator, policy_graph,
             tf_session_creator=None,
             batch_steps=100,
             batch_mode="truncate_episodes",
             preprocessor_pref="rllib",
             sample_async=False,
             compress_observations=False,
             observation_filter="NoFilter",
             registry=None,
             env_config=None,
             model_config=None,
             policy_config=None):
    """Initialize a policy evaluator.

    Arguments:
        env_creator (func): Function that returns a gym.Env given an env
            config dict.
        policy_graph (class): A class implementing rllib.PolicyGraph or
            rllib.TFPolicyGraph.
        tf_session_creator (func): A function that returns a TF session.
            This is optional and only useful with TFPolicyGraph.
        batch_steps (int): The target number of env transitions to include
            in each sample batch returned from this evaluator.
        batch_mode (str): One of the following choices:
            complete_episodes: each batch will be at least batch_steps
                in size, and will include one or more complete episodes.
            truncate_episodes: each batch will be around batch_steps
                in size, and include transitions from one episode only.
            pack_episodes: each batch will be exactly batch_steps in
                size, and may include transitions from multiple episodes.
        preprocessor_pref (str): Whether to prefer RLlib preprocessors
            ("rllib") or deepmind ("deepmind") when applicable.
        sample_async (bool): Whether to compute samples asynchronously in
            the background, which improves throughput but can cause samples
            to be slightly off-policy.
        compress_observations (bool): If true, compress the observations
            returned.
        observation_filter (str): Name of observation filter to use.
        registry (tune.Registry): User-registered objects. Pass in the
            value from tune.registry.get_registry() if you're having
            trouble resolving things like custom envs.
        env_config (dict): Config to pass to the env creator.
        model_config (dict): Config to use when creating the policy model.
        policy_config (dict): Config to pass to the policy.
    """
    registry = registry or get_registry()
    env_config = env_config or {}
    policy_config = policy_config or {}
    model_config = model_config or {}

    assert batch_mode in [
        "complete_episodes", "truncate_episodes", "pack_episodes"
    ]
    self.env_creator = env_creator
    self.policy_graph = policy_graph
    self.batch_steps = batch_steps
    self.batch_mode = batch_mode
    self.compress_observations = compress_observations

    self.env = env_creator(env_config)
    is_atari = hasattr(self.env.unwrapped, "ale")
    if is_atari and "custom_preprocessor" not in model_config and \
            preprocessor_pref == "deepmind":
        self.env = wrap_deepmind(self.env, dim=model_config.get("dim", 80))
    else:
        self.env = ModelCatalog.get_preprocessor_as_wrapper(
            registry, self.env, model_config)

    self.vectorized = hasattr(self.env, "vector_reset")
    self.policy_map = {}

    if issubclass(policy_graph, TFPolicyGraph):
        with tf.Graph().as_default():
            if tf_session_creator:
                self.sess = tf_session_creator()
            else:
                self.sess = tf.Session(config=tf.ConfigProto(
                    gpu_options=tf.GPUOptions(allow_growth=True)))
            with self.sess.as_default():
                policy = policy_graph(self.env.observation_space,
                                      self.env.action_space,
                                      registry, policy_config)
    else:
        policy = policy_graph(self.env.observation_space,
                              self.env.action_space,
                              registry, policy_config)
    self.policy_map = {"default": policy}

    self.obs_filter = get_filter(observation_filter,
                                 self.env.observation_space.shape)
    self.filters = {"obs_filter": self.obs_filter}

    if self.vectorized:
        raise NotImplementedError("Vector envs not yet supported")
    else:
        if batch_mode not in [
            "pack_episodes", "truncate_episodes", "complete_episodes"
        ]:
            raise NotImplementedError("Batch mode not yet supported")
        pack = batch_mode == "pack_episodes"
        if batch_mode == "complete_episodes":
            batch_steps = 999999
        if sample_async:
            self.sampler = AsyncSampler(self.env,
                                        self.policy_map["default"],
                                        self.obs_filter, batch_steps,
                                        pack=pack)
            self.sampler.start()
        else:
            self.sampler = SyncSampler(self.env,
                                       self.policy_map["default"],
                                       self.obs_filter, batch_steps,
                                       pack=pack)
)


def create_env(env_config):
    pass_params_to_gym(env_name)
    env = gym.envs.make(env_name)
    return env


if __name__ == '__main__':
    register_env(env_name, lambda env_config: create_env(env_config))
    config = ppo.DEFAULT_CONFIG.copy()
    horizon = 10
    num_cpus = 4
    ray.init(num_cpus=num_cpus, redirect_output=True)
    config["num_workers"] = num_cpus
    config["timesteps_per_batch"] = 10
    config["num_sgd_iter"] = 10
    config["gamma"] = 0.999
    config["horizon"] = horizon
    config["use_gae"] = False
    config["model"].update({"fcnet_hiddens": [256, 256]})
    options = {"multiagent_obs_shapes": [2, 2],
               "multiagent_act_shapes": [1, 1],
               "multiagent_shared_model": False,
               "multiagent_fcnet_hiddens": [[32, 32]] * 2}
    config["model"].update({"custom_options": options})
    alg = ppo.PPOAgent(env=env_name, registry=get_registry(),
                       config=config)
    for i in range(1):
        alg.train()
with open(config_path) as f:
    args.config = json.load(f)

if not args.env:
    if not args.config.get("env"):
        parser.error("the following arguments are required: --env")
    args.env = args.config.get("env")

ray.init()

cls = get_agent_class(args.run)
agent = cls(env=args.env)
agent.restore(args.checkpoint)
num_steps = int(args.steps)

env = ModelCatalog.get_preprocessor_as_wrapper(get_registry(),
                                               gym.make(args.env))
if args.out is not None:
    rollouts = []
steps = 0
while steps < (num_steps or steps + 1):
    if args.out is not None:
        rollout = []
    state = env.reset()
    done = False
    while not done and steps < (num_steps or steps + 1):
        action = agent.compute_action(state)
        next_state, reward, done, _ = env.step(action)
        if not args.no_render:
            env.render()
        if args.out is not None:
output_path = join(args.output,
                   datetime.today().strftime('%Y-%m-%d-%H-%M-%S'))
logging_path = join(output_path, 'log.txt')
if not isdir(output_path):
    makedirs(output_path)
if isfile('usernames'):
    remove('usernames')

env_creator_name = "PokeBattleEnv-v0"
register_env(env_creator_name,
             lambda config: PokeBattleEnv(
                 ShowdownSimulator(self_play=False,
                                   logging_file=logging_path)))

ray.init()
config = ppo.DEFAULT_CONFIG.copy()
config['num_workers'] = args.workers
config['timesteps_per_batch'] = args.batch_steps
config['horizon'] = 500
config['min_steps_per_task'] = 1
config['gamma'] = 1
config['model']['fcnet_hiddens'] = [2000, 500, 100]

agent = ppo.PPOAgent(config=config, env=env_creator_name,
                     registry=get_registry())
if args.restore is not None:
    agent.restore(args.restore)

for i in range(args.iterations):
    result = agent.train()
    print(f"result: {result}")
    if i % args.save_iterations == 0:
        agent.save(checkpoint_dir=output_path)
def __init__(self, env_creator, policy_graph,
             tf_session_creator=None,
             batch_steps=100,
             batch_mode="truncate_episodes",
             episode_horizon=None,
             preprocessor_pref="rllib",
             sample_async=False,
             compress_observations=False,
             num_envs=1,
             observation_filter="NoFilter",
             registry=None,
             env_config=None,
             model_config=None,
             policy_config=None):
    """Initialize a policy evaluator.

    Arguments:
        env_creator (func): Function that returns a gym.Env given an env
            config dict.
        policy_graph (class): A class implementing rllib.PolicyGraph or
            rllib.TFPolicyGraph.
        tf_session_creator (func): A function that returns a TF session.
            This is optional and only useful with TFPolicyGraph.
        batch_steps (int): The target number of env transitions to include
            in each sample batch returned from this evaluator.
        batch_mode (str): One of the following batch modes:
            "truncate_episodes": Each call to sample() will return a batch
                of exactly `batch_steps` in size. Episodes may be truncated
                in order to meet this size requirement. When
                `num_envs > 1`, episodes will be truncated to sequences of
                `batch_size / num_envs` in length.
            "complete_episodes": Each call to sample() will return a batch
                of at least `batch_steps` in size. Episodes will not be
                truncated, but multiple episodes may be packed within one
                batch to meet the batch size. Note that when
                `num_envs > 1`, episode steps will be buffered until the
                episode completes, and hence batches may contain
                significant amounts of off-policy data.
        episode_horizon (int): Horizon at which to stop episodes, if given.
        preprocessor_pref (str): Whether to prefer RLlib preprocessors
            ("rllib") or deepmind ("deepmind") when applicable.
        sample_async (bool): Whether to compute samples asynchronously in
            the background, which improves throughput but can cause samples
            to be slightly off-policy.
        compress_observations (bool): If true, compress the observations
            returned.
        num_envs (int): If more than one, will create multiple envs
            and vectorize the computation of actions. This has no effect if
            the env already implements VectorEnv.
        observation_filter (str): Name of observation filter to use.
        registry (tune.Registry): User-registered objects. Pass in the
            value from tune.registry.get_registry() if you're having
            trouble resolving things like custom envs.
        env_config (dict): Config to pass to the env creator.
        model_config (dict): Config to use when creating the policy model.
        policy_config (dict): Config to pass to the policy.
    """
    registry = registry or get_registry()
    env_config = env_config or {}
    policy_config = policy_config or {}
    model_config = model_config or {}

    self.env_creator = env_creator
    self.policy_graph = policy_graph
    self.batch_steps = batch_steps
    self.batch_mode = batch_mode
    self.compress_observations = compress_observations

    self.env = env_creator(env_config)
    if isinstance(self.env, VectorEnv) or \
            isinstance(self.env, ServingEnv) or \
            isinstance(self.env, AsyncVectorEnv):

        def wrap(env):
            return env  # we can't auto-wrap these env types

    elif is_atari(self.env) and \
            "custom_preprocessor" not in model_config and \
            preprocessor_pref == "deepmind":

        def wrap(env):
            return wrap_deepmind(env, dim=model_config.get("dim", 80))

    else:

        def wrap(env):
            return ModelCatalog.get_preprocessor_as_wrapper(
                registry, env, model_config)

    self.env = wrap(self.env)

    def make_env():
        return wrap(env_creator(env_config))

    self.policy_map = {}

    if issubclass(policy_graph, TFPolicyGraph):
        with tf.Graph().as_default():
            if tf_session_creator:
                self.sess = tf_session_creator()
            else:
                self.sess = tf.Session(config=tf.ConfigProto(
                    gpu_options=tf.GPUOptions(allow_growth=True)))
            with self.sess.as_default():
                policy = policy_graph(self.env.observation_space,
                                      self.env.action_space,
                                      registry, policy_config)
    else:
        policy = policy_graph(self.env.observation_space,
                              self.env.action_space,
                              registry, policy_config)
    self.policy_map = {"default": policy}

    self.obs_filter = get_filter(observation_filter,
                                 self.env.observation_space.shape)
    self.filters = {"obs_filter": self.obs_filter}

    # Always use vector env for consistency even if num_envs = 1
    if not isinstance(self.env, AsyncVectorEnv):
        if isinstance(self.env, ServingEnv):
            self.vector_env = _ServingEnvToAsync(self.env)
        else:
            if not isinstance(self.env, VectorEnv):
                self.env = VectorEnv.wrap(make_env, [self.env],
                                          num_envs=num_envs)
            self.vector_env = _VectorEnvToAsync(self.env)
    else:
        self.vector_env = self.env

    if self.batch_mode == "truncate_episodes":
        if batch_steps % num_envs != 0:
            raise ValueError(
                "In 'truncate_episodes' batch mode, `batch_steps` must be "
                "evenly divisible by `num_envs`. Got {} and {}.".format(
                    batch_steps, num_envs))
        batch_steps = batch_steps // num_envs
        pack_episodes = True
    elif self.batch_mode == "complete_episodes":
        batch_steps = float("inf")  # never cut episodes
        pack_episodes = False  # sampler will return 1 episode per poll
    else:
        raise ValueError("Unsupported batch mode: {}".format(
            self.batch_mode))
    if sample_async:
        self.sampler = AsyncSampler(self.vector_env,
                                    self.policy_map["default"],
                                    self.obs_filter, batch_steps,
                                    horizon=episode_horizon,
                                    pack=pack_episodes)
        self.sampler.start()
    else:
        self.sampler = SyncSampler(self.vector_env,
                                   self.policy_map["default"],
                                   self.obs_filter, batch_steps,
                                   horizon=episode_horizon,
                                   pack=pack_episodes)
def testCustomModel(self):
    ray.init()
    ModelCatalog.register_custom_model("foo", CustomModel)
    p1 = ModelCatalog.get_model(
        get_registry(), 1, 5, {"custom_model": "foo"})
    assert type(p1) == CustomModel
parser.add_argument('-b', '--battles', type=int, default=1000,
                    help='Amount of battles to test the model')
args = parser.parse_args()

env = PokeBattleEnv(ShowdownSimulator(self_play=False))
env_creator_name = "PokeBattleEnv-v0"
register_env(env_creator_name, lambda config: env)

ray.init()
config = ppo.DEFAULT_CONFIG.copy()
config['num_workers'] = 1
config['timesteps_per_batch'] = 200
config['horizon'] = 500
config['min_steps_per_task'] = 1

agent = ppo.PPOAgent(config=config, env=env_creator_name,
                     registry=get_registry())
agent.restore(args.load)

for battle in range(args.battles):
    observation = env.reset()
    env.render()
    done = False
    while not done:
        action = agent.compute_action(observation)
        observation, _, done, _ = env.step(action)