def _register_all():
    """Register all RLlib algorithms (built-in and contrib) as trainables.

    Besides the regular and `contrib/`-prefixed keys, every contrib
    algorithm is also registered under its un-prefixed alias (unless a
    built-in algorithm of that name exists). The alias maps to a dummy
    Trainer whose `setup()` raises a `NameError` pointing the user to the
    proper `contrib/...` name.
    """
    from ray.rllib.agents.trainer import Trainer
    from ray.rllib.agents.registry import ALGORITHMS, get_trainer_class
    from ray.rllib.contrib.registry import CONTRIBUTED_ALGORITHMS

    def _see_contrib(name):
        """Returns dummy agent class warning algo is in contrib/."""

        class _SeeContrib(Trainer):
            def setup(self, config):
                raise NameError(
                    "Please run `contrib/{}` instead.".format(name))

        return _SeeContrib

    # Regular algos, contrib algos, and a few internal test trainables.
    internal_keys = ["__fake", "__sigmoid_fake_data", "__parameter_tuning"]
    all_keys = (
        list(ALGORITHMS.keys()) + list(CONTRIBUTED_ALGORITHMS.keys()) +
        internal_keys)
    for algo_key in all_keys:
        register_trainable(algo_key, get_trainer_class(algo_key))

    # Also register the aliases minus contrib/ to give a good error message.
    for contrib_key in list(CONTRIBUTED_ALGORITHMS.keys()):
        assert contrib_key.startswith("contrib/")
        short_name = contrib_key.split("/", 1)[1]
        if short_name in ALGORITHMS:
            # A built-in algorithm already owns this name.
            continue
        register_trainable(short_name, _see_contrib(short_name))
def _do_test_fault_fatal_but_recreate(self, alg, config):
    """Check that training continues when failed workers are recreated.

    Both remote workers (indices 1 and 2) are configured to fail through
    the `bad_indices` env setting. With `recreate_failed_workers=True`,
    every `train()` call is expected to report two healthy workers and,
    after the first call, all workers must be flagged as recreated.

    Args:
        alg: Algorithm name, resolved via `get_trainer_class`.
        config: Trainer config dict. Modified in place.
    """
    register_env("fault_env", lambda c: FaultInjectEnv(c))
    agent_cls = get_trainer_class(alg)

    # Two remote workers, which get recreated (not dropped) on failure.
    config["num_workers"] = 2
    config["recreate_failed_workers"] = True
    # Make both worker idx=1 and 2 fail.
    config["env_config"] = {"bad_indices": [1, 2]}

    for _ in framework_iterator(config, frameworks=("tf2", "torch")):
        a = agent_cls(config=config, env="fault_env")

        # Before training: no worker (or its env context) is a recreation.
        self.assertFalse(
            any(
                ray.get(
                    worker.apply.remote(
                        lambda w: w.recreated_worker or
                        w.env_context.recreated_worker))
                for worker in a.workers.remote_workers()))

        # Expect this to go well and all faulty workers are recovered.
        result = a.train()
        # NOTE: `assertTrue(x, 2)` would only use `2` as the failure
        # message; `assertEqual` actually compares the worker count.
        self.assertEqual(result["num_healthy_workers"], 2)

        # After the first iteration, every worker must be a recreated one.
        self.assertTrue(
            all(
                ray.get(
                    worker.apply.remote(
                        lambda w: w.recreated_worker and
                        w.env_context.recreated_worker))
                for worker in a.workers.remote_workers()))

        # This should also work several times.
        result = a.train()
        self.assertEqual(result["num_healthy_workers"], 2)
        a.stop()
def _do_check(alg, config, a_name, o_name):
    """Build `alg` on a RandomEnv with the given spaces and verify its model.

    Prints "ok" if the Trainer could be built (and optionally trained) or
    "unsupported" if construction failed with an
    `UnsupportedSpaceException` (raised directly or carried inside a
    `RayActorError`). Any other error is re-raised.

    `check_bounds` and `train` are not parameters of this function; they
    are read from the surrounding scope.

    Args:
        alg: Algorithm name, resolved via `get_trainer_class`.
        config: Trainer config dict; its "env_config" is overwritten.
        a_name: Key into `ACTION_SPACES_TO_TEST`.
        o_name: Key into `OBSERVATION_SPACES_TO_TEST`.
    """
    fw = config["framework"]
    action_space = ACTION_SPACES_TO_TEST[a_name]
    obs_space = OBSERVATION_SPACES_TO_TEST[o_name]
    print(
        "=== Testing {} (fw={}) A={} S={} ===".format(
            alg, fw, action_space, obs_space
        )
    )
    config.update(
        dict(
            env_config=dict(
                action_space=action_space,
                observation_space=obs_space,
                reward_space=Box(1.0, 1.0, shape=(), dtype=np.float32),
                p_done=1.0,
                check_action_bounds=check_bounds,
            )
        )
    )
    stat = "ok"

    try:
        a = get_trainer_class(alg)(config=config, env=RandomEnv)
    except ray.exceptions.RayActorError as e:
        # The original cause may sit at index 2 of the error's args, or at
        # index 2 of the args of a nested error stored at index 0. Check
        # the lengths first so that an unexpected layout re-raises the
        # actual RayActorError instead of an IndexError/AttributeError.
        outer_args = e.args
        inner_args = getattr(outer_args[0], "args", ()) if outer_args else ()
        if len(outer_args) >= 3 and isinstance(
            outer_args[2], UnsupportedSpaceException
        ):
            stat = "unsupported"
        elif len(inner_args) >= 3 and isinstance(
            inner_args[2], UnsupportedSpaceException
        ):
            stat = "unsupported"
        else:
            raise
    except UnsupportedSpaceException:
        stat = "unsupported"
    else:
        if alg not in ["DDPG", "ES", "ARS", "SAC"]:
            # 2D (image) input: Expect VisionNet.
            if o_name in ["atari", "image"]:
                if fw == "torch":
                    assert isinstance(a.get_policy().model, TorchVisionNet)
                else:
                    assert isinstance(a.get_policy().model, VisionNet)
            # 1D input: Expect FCNet.
            elif o_name == "vector1d":
                if fw == "torch":
                    assert isinstance(a.get_policy().model, TorchFCNet)
                else:
                    assert isinstance(a.get_policy().model, FCNet)
            # Could be either one: ComplexNet (if disabled Preprocessor)
            # or FCNet (w/ Preprocessor).
            elif o_name == "vector2d":
                if fw == "torch":
                    assert isinstance(
                        a.get_policy().model, (TorchComplexNet, TorchFCNet)
                    )
                else:
                    assert isinstance(a.get_policy().model, (ComplexNet, FCNet))
        if train:
            a.train()
        a.stop()
    print(stat)
def test_pettingzoo_env(self):
    """Run one PPO training iteration on PettingZoo's simple_spread env.

    All agents share a single policy "av", built from the obs/action
    spaces of a temporary env instance.
    """
    register_env("simple_spread",
                 lambda _: PettingZooEnv(simple_spread_v2.env()))

    # Temporary env instance, only used for reading the spaces.
    probe_env = PettingZooEnv(simple_spread_v2.env())
    obs_space = probe_env.observation_space
    act_space = probe_env.action_space
    del probe_env

    ppo_cls = get_trainer_class("PPO")
    config = deepcopy(ppo_cls.get_default_config())

    config["multiagent"] = {
        # Set of policy IDs (by default, will use Trainer's
        # default policy class, the env's obs/act spaces and config={}).
        "policies": {
            "av": (None, obs_space, act_space, {})
        },
        # Mapping function that always returns "av" as policy ID to use
        # (for any agent).
        "policy_mapping_fn": lambda agent_id, episode, **kwargs: "av",
    }
    config.update({
        "log_level": "DEBUG",
        "num_workers": 0,
        "rollout_fragment_length": 30,
        "train_batch_size": 200,
        # After n steps, force reset simulation
        "horizon": 200,
        "no_done_at_end": False,
    })

    agent = ppo_cls(env="simple_spread", config=config)
    agent.train()
def check_support(alg, config, test_eager=False, test_trace=True):
    """Run `alg` for one Tune iteration under the "tfe" framework.

    Tries a continuous-action env ("Pendulum-v1") and a discrete-action
    env ("CartPole-v0"), skipping the combinations listed below.

    Args:
        alg: Algorithm name, resolved via `get_trainer_class`.
        config: Trainer config dict. Modified in place.
        test_eager: Whether to run with `eager_tracing=False`.
        test_trace: Whether to run with `eager_tracing=True`.
    """
    config["framework"] = "tfe"
    config["log_level"] = "ERROR"

    skip_for_continuous = ["DQN", "APEX", "SimpleQ"]
    skip_for_discrete = ["DDPG", "APEX_DDPG", "TD3"]

    # Test both continuous and discrete actions.
    for cont in (True, False):
        skipped = skip_for_continuous if cont else skip_for_discrete
        if alg in skipped:
            continue

        config["env"] = "Pendulum-v1" if cont else "CartPole-v0"
        a = get_trainer_class(alg)

        if test_eager:
            print("tf-eager: alg={} cont.act={}".format(alg, cont))
            config["eager_tracing"] = False
            tune.run(
                a,
                config=config,
                stop={"training_iteration": 1},
                verbose=1)

        if test_trace:
            config["eager_tracing"] = True
            print("tf-eager-tracing: alg={} cont.act={}".format(alg, cont))
            tune.run(
                a,
                config=config,
                stop={"training_iteration": 1},
                verbose=1)
def test_pettingzoo_env(self):
    """Run one PPO training iteration on PettingZoo's simple_spread env.

    Starts from the trainer class' `_default_config` and maps every agent
    to one shared policy "av".
    """
    register_env("simple_spread",
                 lambda _: PettingZooEnv(simple_spread_v2.env()))

    ppo_cls = get_trainer_class("PPO")
    config = deepcopy(ppo_cls._default_config)

    # Temporary env instance, only used for reading the spaces.
    probe_env = PettingZooEnv(simple_spread_v2.env())
    obs_space = probe_env.observation_space
    act_space = probe_env.action_space
    probe_env.close()

    config["multiagent"] = {
        "policies": {
            # the first tuple value is None -> uses default policy
            "av": (None, obs_space, act_space, {}),
        },
        "policy_mapping_fn": lambda agent_id: "av"
    }
    config.update({
        "log_level": "DEBUG",
        "num_workers": 0,
        "rollout_fragment_length": 30,
        "train_batch_size": 200,
        # After n steps, force reset simulation
        "horizon": 200,
        "no_done_at_end": False,
    })

    agent = ppo_cls(env="simple_spread", config=config)
    agent.train()
def run_heuristic_vs_learned(args, use_lstm=False, trainer="PG"):
    """Run heuristic policies vs a learned agent.

    The learned agent should eventually reach a reward of ~5 with
    use_lstm=False, and ~7 with use_lstm=True. The reason the LSTM policy
    can perform better is since it can distinguish between the always_same vs
    beat_last heuristics.

    Args:
        args: Parsed command line args; reads `torch`, `stop_iters`,
            `stop_timesteps`, `stop_reward` and `as_test`.
        use_lstm: Whether the learned policy's model uses an LSTM.
        trainer: Trainer class, or the name of one (resolved via
            `get_trainer_class`).

    Raises:
        ValueError: If `args.as_test` is set and the loop ends without the
            desired score difference having been reached.
    """

    def select_policy(agent_id):
        # "player1" is the learner; everyone else plays a random heuristic.
        if agent_id == "player1":
            return "learned"
        return random.choice(["always_same", "beat_last"])

    framework = "torch" if args.torch else "tf"
    space = Discrete(3)

    policies = {
        "always_same": (AlwaysSameHeuristic, space, Discrete(3), {}),
        "beat_last": (BeatLastHeuristic, Discrete(3), Discrete(3), {}),
        "learned": (None, Discrete(3), Discrete(3), {
            "model": {
                "use_lstm": use_lstm
            },
            "framework": framework,
        }),
    }

    config = {
        "env": RockPaperScissors,
        "gamma": 0.9,
        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
        "num_workers": 0,
        "num_envs_per_worker": 4,
        "rollout_fragment_length": 10,
        "train_batch_size": 200,
        "multiagent": {
            "policies_to_train": ["learned"],
            "policies": policies,
            "policy_mapping_fn": select_policy,
        },
        "framework": framework,
    }

    if isinstance(trainer, str):
        cls = get_trainer_class(trainer)
    else:
        cls = trainer
    trainer_obj = cls(config=config)
    env = trainer_obj.workers.local_worker().env

    for _ in range(args.stop_iters):
        results = trainer_obj.train()
        print(results)
        # Timesteps reached.
        if results["timesteps_total"] > args.stop_timesteps:
            break
        # Reward (difference) reached -> all good, return.
        if env.player1_score - env.player2_score > args.stop_reward:
            return

    # Reward (difference) not reached: Error if `as_test`.
    if args.as_test:
        raise ValueError(
            "Desired reward difference ({}) not reached! Only got to {}.".
            format(args.stop_reward, env.player1_score - env.player2_score))
def train_and_export(algo_name, num_steps, model_dir, ckpt_dir, prefix):
    """Train an algorithm on CartPole-v0, then export its policy.

    Args:
        algo_name: Algorithm name, resolved via `get_trainer_class`.
        num_steps: Number of `train()` calls to perform.
        model_dir: Target directory for the exported policy model.
        ckpt_dir: Target directory for the exported policy checkpoint.
        prefix: Filename prefix used for the checkpoint files.
    """
    trainer_cls = get_trainer_class(algo_name)
    alg = trainer_cls(config={}, env="CartPole-v0")

    step = 0
    while step < num_steps:
        alg.train()
        step += 1

    # Export tensorflow checkpoint for fine-tuning
    alg.export_policy_checkpoint(ckpt_dir, filename_prefix=prefix)
    # Export tensorflow SavedModel for online serving
    alg.export_policy_model(model_dir)
def check_support_multiagent(alg, config):
    """Train `alg` for one iteration on a two-agent env, per framework.

    DDPG, APEX_DDPG and SAC run on the multi-agent MountainCar env, all
    other algorithms on the multi-agent CartPole env.

    Args:
        alg: Algorithm name, resolved via `get_trainer_class`.
        config: Trainer config dict. Modified in place.
    """
    register_env("multi_agent_mountaincar",
                 lambda _: MultiAgentMountainCar({"num_agents": 2}))
    register_env("multi_agent_cartpole",
                 lambda _: MultiAgentCartPole({"num_agents": 2}))
    config["log_level"] = "ERROR"

    for fw in framework_iterator(config):
        # These algorithms are not run under the eager frameworks.
        eager = fw in ["tf2", "tfe"]
        if eager and alg in ["A3C", "APEX", "APEX_DDPG", "IMPALA"]:
            continue

        if alg in ["DDPG", "APEX_DDPG", "SAC"]:
            env_name = "multi_agent_mountaincar"
        else:
            env_name = "multi_agent_cartpole"

        a = get_trainer_class(alg)(config=config, env=env_name)
        print(a.train())
        a.stop()
def _do_test_fault_fatal(self, alg, config):
    """Check that `train()` raises once all remote workers have failed.

    Args:
        alg: Algorithm name, resolved via `get_trainer_class`.
        config: Trainer config dict. Modified in place.
    """
    register_env("fault_env", lambda c: FaultInjectEnv(c))
    agent_cls = get_trainer_class(alg)

    # Test raises real error when out of workers
    config.update({
        "num_workers": 2,
        "ignore_worker_failures": True,
        "env_config": {"bad_indices": [1, 2]},
    })

    for _ in framework_iterator(config, frameworks=("torch", "tf")):
        a = agent_cls(config=config, env="fault_env")
        with self.assertRaises(Exception):
            a.train()
        a.stop()
def export_test(alg_name, failures, framework="tf"):
    """Train `alg_name` for one iteration and verify its export functions.

    The algorithm name is appended to `failures` once for each export
    whose output files are missing. The export directory is removed after
    each export step.

    NOTE(review): the source this block was recovered from had lost its
    indentation; the assumption here is that everything from "Exporting
    checkpoint" up to the last `rmtree` belongs to the `framework == "tf"`
    branch - confirm against the upstream file.

    Args:
        alg_name: Algorithm name; key into `CONFIGS` and resolved via
            `get_trainer_class`.
        failures: List collecting the names of failing algorithms.
            Modified in place.
        framework: Value written to the config's "framework" key.
    """

    def valid_tf_model(model_dir):
        # A saved model needs the protobuf file and a non-empty
        # "variables" sub-directory.
        return os.path.exists(os.path.join(
            model_dir, "saved_model.pb")) and os.listdir(
                os.path.join(model_dir, "variables"))

    def valid_tf_checkpoint(checkpoint_dir):
        # A checkpoint consists of the meta, index and "checkpoint" files.
        return (os.path.exists(os.path.join(checkpoint_dir, "model.meta"))
                and os.path.exists(
                    os.path.join(checkpoint_dir, "model.index"))
                and os.path.exists(os.path.join(checkpoint_dir, "checkpoint")))

    cls = get_trainer_class(alg_name)
    # Copy, so that setting the framework does not alter the CONFIGS entry.
    config = CONFIGS[alg_name].copy()
    config["framework"] = framework
    # DDPG/SAC variants run on Pendulum, everything else on CartPole.
    if "DDPG" in alg_name or "SAC" in alg_name:
        algo = cls(config=config, env="Pendulum-v1")
    else:
        algo = cls(config=config, env="CartPole-v0")

    for _ in range(1):
        res = algo.train()
        print("current status: " + str(res))

    export_dir = os.path.join(ray._private.utils.get_user_temp_dir(),
                              "export_dir_%s" % alg_name)
    # Step 1: Export the policy model (file check only done for "tf").
    print("Exporting model ", alg_name, export_dir)
    algo.export_policy_model(export_dir)
    if framework == "tf" and not valid_tf_model(export_dir):
        failures.append(alg_name)
    shutil.rmtree(export_dir)

    if framework == "tf":
        # Step 2: Export the policy checkpoint.
        print("Exporting checkpoint", alg_name, export_dir)
        algo.export_policy_checkpoint(export_dir)
        if framework == "tf" and not valid_tf_checkpoint(export_dir):
            failures.append(alg_name)
        shutil.rmtree(export_dir)

        # Step 3: Export both formats at once via `export_model`; each
        # format is expected in its own sub-directory of `export_dir`.
        print("Exporting default policy", alg_name, export_dir)
        algo.export_model([ExportFormat.CHECKPOINT, ExportFormat.MODEL],
                          export_dir)
        if not valid_tf_model(os.path.join(
                export_dir, ExportFormat.MODEL)) or not valid_tf_checkpoint(
                    os.path.join(export_dir, ExportFormat.CHECKPOINT)):
            failures.append(alg_name)

        # Test loading the exported model.
        model = tf.saved_model.load(
            os.path.join(export_dir, ExportFormat.MODEL))
        assert model

        shutil.rmtree(export_dir)
    algo.stop()
def test_pettingzoo_pistonball_v6_policies_are_dict_env(self):
    """Run one PPO iteration on pistonball_v6 with a dict of policies.

    All agents share the single policy "av", whose spaces are read from a
    temporary env instance.
    """

    def env_creator(config):
        # Wrapper chain: dtype cast, color reduction, obs normalization.
        raw_env = pistonball_v6.env()
        float_env = dtype_v0(raw_env, dtype=float32)
        reduced_env = color_reduction_v0(float_env, mode="R")
        return normalize_obs_v0(reduced_env)

    config = deepcopy(get_trainer_class("PPO").get_default_config())
    config["env_config"] = {"local_ratio": 0.5}

    # Register env
    register_env("pistonball",
                 lambda config: PettingZooEnv(env_creator(config)))

    # Temporary env instance, only used for reading the spaces.
    probe_env = PettingZooEnv(env_creator(config))
    obs_space = probe_env.observation_space
    act_space = probe_env.action_space
    del probe_env

    config["multiagent"] = {
        # Setup a single, shared policy for all agents.
        "policies": {
            "av": (None, obs_space, act_space, {})
        },
        # Map all agents to that policy.
        "policy_mapping_fn": lambda agent_id, episode, **kwargs: "av",
    }
    config.update({
        "log_level": "DEBUG",
        "num_workers": 1,
        # Fragment length, collected at once from each worker
        # and for each agent!
        "rollout_fragment_length": 30,
        # Training batch size -> Fragments are concatenated up to this
        # point.
        "train_batch_size": 200,
        # After n steps, force reset simulation
        "horizon": 200,
        # Default: False
        "no_done_at_end": False,
    })

    trainer = get_trainer_class("PPO")(env="pistonball", config=config)
    trainer.train()
def _do_test_fault_recover(self, alg, config):
    """Check that training continues after one of two workers has failed.

    Worker index 1 is configured to fail through the `bad_indices` env
    setting. With `ignore_worker_failures=True`, `train()` is expected to
    succeed and to report exactly one healthy worker.

    Args:
        alg: Algorithm name, resolved via `get_trainer_class`.
        config: Trainer config dict. Modified in place.
    """
    register_env("fault_env", lambda c: FaultInjectEnv(c))
    agent_cls = get_trainer_class(alg)

    # Test fault handling
    config["num_workers"] = 2
    config["ignore_worker_failures"] = True
    config["env_config"] = {"bad_indices": [1]}

    for _ in framework_iterator(config, frameworks=("torch", "tf")):
        a = agent_cls(config=config, env="fault_env")
        result = a.train()
        # NOTE: `assertTrue(x, 1)` would only use `1` as the failure
        # message; `assertEqual` actually compares the worker count.
        self.assertEqual(result["num_healthy_workers"], 1)
        a.stop()
def setup(self, config):
    """Build the wrapped Trainer from `config`.

    Wrapper-specific keys are popped from `config` before the remainder
    is handed to the actual Trainer (created on the "yaniv" env).

    Args:
        config: Config dict. Must contain "algorithm"; may contain
            "evaluation_weights", "export_model_every" and
            "update_self_play_param_win_rate". Modified in place.
    """
    algo_name = config.pop("algorithm")
    initial_eval_weights = config.pop("evaluation_weights", None)
    self.export_model_every = config.pop("export_model_every", 10)
    self.update_winrate = config.pop("update_self_play_param_win_rate",
                                     0.5)

    trainer_cls = get_trainer_class(algo_name)
    self.trainer = trainer_cls(env="yaniv", config=config)

    # Optionally seed the "eval_policy" with the given weights.
    if initial_eval_weights is not None:
        self.trainer.set_weights({"eval_policy": initial_eval_weights})

    self.config = config
def check_support_multiagent(alg, config):
    """Train `alg` for one iteration in a two-policy multi-agent setup.

    DDPG, APEX_DDPG and SAC run on the multi-agent MountainCar env, all
    other algorithms on the multi-agent CartPole env. The training
    results are validated through `check_train_results`.

    Args:
        alg: Algorithm name, resolved via `get_trainer_class`.
        config: Trainer config dict. Modified in place.
    """
    register_env("multi_agent_mountaincar",
                 lambda _: MultiAgentMountainCar({"num_agents": 2}))
    register_env("multi_agent_cartpole",
                 lambda _: MultiAgentCartPole({"num_agents": 2}))

    # Simulate a simple multi-agent setup.
    policies = {
        "policy_0": PolicySpec(config={"gamma": 0.99}),
        "policy_1": PolicySpec(config={"gamma": 0.95}),
    }
    policy_ids = list(policies.keys())

    def policy_mapping_fn(agent_id, episode, worker, **kwargs):
        # The agent ID is used as an index into the list of policy IDs.
        return policy_ids[agent_id]

    config["multiagent"] = {
        "policies": policies,
        "policy_mapping_fn": policy_mapping_fn,
    }

    for fw in framework_iterator(config):
        # These algorithms are not run under the eager frameworks.
        eager = fw in ["tf2", "tfe"]
        if eager and alg in ["A3C", "APEX", "APEX_DDPG", "IMPALA"]:
            continue

        if alg in ["DDPG", "APEX_DDPG", "SAC"]:
            env_name = "multi_agent_mountaincar"
        else:
            env_name = "multi_agent_cartpole"

        a = get_trainer_class(alg)(config=config, env=env_name)
        results = a.train()
        check_train_results(results)
        print(results)
        a.stop()
def model_import_test(algo, config, env):
    """Check that importing model weights from an h5 file changes them.

    For each of "tf" and "torch": build the agent with the matching custom
    model, import the weights, train once and save/restore, comparing one
    probe weight of the value function after every step.

    Args:
        algo: Algorithm name, resolved via `get_trainer_class`.
        config: Trainer config dict; `config["model"]["custom_model"]` is
            overwritten per framework.
        env: Env passed to the Trainer constructor.
    """
    # Get the abs-path to use (bazel-friendly).
    rllib_dir = Path(__file__).parent.parent
    import_file = str(rllib_dir) + "/tests/data/model_weights/weights.h5"

    agent_cls = get_trainer_class(algo)

    for fw in framework_iterator(config, ["tf", "torch"]):
        if fw == "torch":
            config["model"]["custom_model"] = "torch_model"
        else:
            config["model"]["custom_model"] = "keras_model"

        agent = agent_cls(config, env)

        def current_weight(agent):
            # Pick one framework-specific probe value from the weights.
            policy_weights = agent.get_weights()[DEFAULT_POLICY_ID]
            if fw == "tf":
                return policy_weights["default_policy/value/kernel"][0]
            if fw == "torch":
                return float(policy_weights["value_branch.weight"][0][0])
            return policy_weights[4][0]

        # Import weights for our custom model from an h5 file.
        weight_before_import = current_weight(agent)
        agent.import_model(import_file=import_file)
        weight_after_import = current_weight(agent)
        check(weight_before_import, weight_after_import, false=True)

        # Train for a while.
        agent.train()
        weight_after_train = current_weight(agent)
        # Weights should have changed.
        check(weight_before_import, weight_after_train, false=True)
        check(weight_after_import, weight_after_train, false=True)

        # We can save the entire Agent and restore, weights should remain the
        # same.
        checkpoint = agent.save("after_train")
        check(weight_after_train, current_weight(agent))
        agent.restore(checkpoint)
        check(weight_after_train, current_weight(agent))

        # Import (untrained) weights again.
        agent.import_model(import_file=import_file)
        check(current_weight(agent), weight_after_import)
def _do_check(alg, config, a_name, o_name):
    """Build `alg` on a RandomEnv with the given spaces and verify its model.

    Prints "ok" if the Trainer could be built (and optionally trained) or
    "unsupported" if construction failed with an
    `UnsupportedSpaceException` (raised directly or carried inside a
    `RayActorError`). Any other error is re-raised.

    `check_bounds` and `train` are not parameters of this function; they
    are read from the surrounding scope.

    Args:
        alg: Algorithm name, resolved via `get_trainer_class`.
        config: Trainer config dict; its "env_config" is overwritten.
        a_name: Key into `ACTION_SPACES_TO_TEST`.
        o_name: Key into `OBSERVATION_SPACES_TO_TEST`.
    """
    fw = config["framework"]
    action_space = ACTION_SPACES_TO_TEST[a_name]
    obs_space = OBSERVATION_SPACES_TO_TEST[o_name]
    print("=== Testing {} (fw={}) A={} S={} ===".format(
        alg, fw, action_space, obs_space))
    config.update(
        dict(
            env_config=dict(
                action_space=action_space,
                observation_space=obs_space,
                reward_space=Box(1.0, 1.0, shape=(), dtype=np.float32),
                p_done=1.0,
                check_action_bounds=check_bounds)))
    stat = "ok"

    try:
        a = get_trainer_class(alg)(config=config, env=RandomEnv)
    except ray.exceptions.RayActorError as e:
        # The original cause is expected at index 2 of the error's args.
        # Check the length first so that a shorter args tuple re-raises
        # the actual RayActorError rather than an IndexError.
        if len(e.args) >= 3 and isinstance(e.args[2],
                                           UnsupportedSpaceException):
            stat = "unsupported"
        else:
            raise
    except UnsupportedSpaceException:
        stat = "unsupported"
    else:
        if alg not in ["DDPG", "ES", "ARS", "SAC"]:
            # Image-like input: Expect a vision net.
            if o_name in ["atari", "image"]:
                if fw == "torch":
                    assert isinstance(a.get_policy().model, TorchVisionNetV2)
                else:
                    assert isinstance(a.get_policy().model, VisionNetV2)
            # Vector input: Expect a fully connected net.
            elif o_name in ["vector", "vector2"]:
                if fw == "torch":
                    assert isinstance(a.get_policy().model, TorchFCNetV2)
                else:
                    assert isinstance(a.get_policy().model, FCNetV2)
        if train:
            a.train()
        a.stop()
    print(stat)
def export_test(alg_name, failures):
    """Train `alg_name` for one iteration and verify its export functions.

    The algorithm name is appended to `failures` once for each export
    whose output files are missing. The export directory is removed after
    each export step, and the trainer is stopped when the test is done
    (also if one of the steps raises).

    Args:
        alg_name: Algorithm name; key into `CONFIGS` and resolved via
            `get_trainer_class`.
        failures: List collecting the names of failing algorithms.
            Modified in place.
    """

    def valid_tf_model(model_dir):
        # A saved model needs the protobuf file and a non-empty
        # "variables" sub-directory.
        return os.path.exists(os.path.join(model_dir, "saved_model.pb")) \
            and os.listdir(os.path.join(model_dir, "variables"))

    def valid_tf_checkpoint(checkpoint_dir):
        # A checkpoint consists of the meta, index and "checkpoint" files.
        return os.path.exists(os.path.join(checkpoint_dir, "model.meta")) \
            and os.path.exists(os.path.join(checkpoint_dir, "model.index")) \
            and os.path.exists(os.path.join(checkpoint_dir, "checkpoint"))

    cls = get_trainer_class(alg_name)
    # DDPG/SAC variants run on Pendulum, everything else on CartPole.
    if "DDPG" in alg_name or "SAC" in alg_name:
        algo = cls(config=CONFIGS[alg_name], env="Pendulum-v0")
    else:
        algo = cls(config=CONFIGS[alg_name], env="CartPole-v0")

    try:
        for _ in range(1):
            res = algo.train()
            print("current status: " + str(res))

        export_dir = os.path.join(ray.utils.get_user_temp_dir(),
                                  "export_dir_%s" % alg_name)

        # Step 1: Export the policy model.
        print("Exporting model ", alg_name, export_dir)
        algo.export_policy_model(export_dir)
        if not valid_tf_model(export_dir):
            failures.append(alg_name)
        shutil.rmtree(export_dir)

        # Step 2: Export the policy checkpoint.
        print("Exporting checkpoint", alg_name, export_dir)
        algo.export_policy_checkpoint(export_dir)
        if not valid_tf_checkpoint(export_dir):
            failures.append(alg_name)
        shutil.rmtree(export_dir)

        # Step 3: Export both formats at once via `export_model`; each
        # format is expected in its own sub-directory of `export_dir`.
        print("Exporting default policy", alg_name, export_dir)
        algo.export_model([ExportFormat.CHECKPOINT, ExportFormat.MODEL],
                          export_dir)
        if not valid_tf_model(os.path.join(export_dir, ExportFormat.MODEL)) \
                or not valid_tf_checkpoint(os.path.join(
                    export_dir, ExportFormat.CHECKPOINT)):
            failures.append(alg_name)
        shutil.rmtree(export_dir)
    finally:
        # Stop the trainer so its workers do not outlive this test.
        algo.stop()
# Disable OPE, since the rollouts are coming from online clients. "off_policy_estimation_methods": {}, # Other settings. "train_batch_size": 256, "rollout_fragment_length": 20, # Multi-agent setup for the given env. "multiagent": { "policies": policies, "policy_mapping_fn": policy_mapping_fn, }, # DL framework to use. "framework": args.framework, } # Create the Trainer used for Policy serving. trainer = get_trainer_class(args.run)(config=config) # Attempt to restore from checkpoint if possible. checkpoint_path = CHECKPOINT_FILE.format(args.env) if not args.no_restore and os.path.exists(checkpoint_path): checkpoint_path = open(checkpoint_path).read() print("Restoring from checkpoint path", checkpoint_path) trainer.restore(checkpoint_path) # Serving and training loop. count = 0 while True: # Calls to train() will block on the configured `input` in the Trainer # config above (PolicyServerInput). print(trainer.train()) if count % args.checkpoint_freq == 0:
def check_for_saved_config(args):
    """Load a saved Trainer config and merge command line updates into it.

    The config is read from a `params.pkl` next to (or one directory
    above) `args.checkpoint_path`. Without a checkpoint path, the default
    config of `args.model_name` is used.

    Side effects: sets `args.save_info = True` and, if `args.env_name` is
    empty, fills it from the config's "env" entry.

    NOTE(review): `parser` is not a parameter of this function; it must
    be available from the surrounding scope for the missing-env error
    path to work.

    Args:
        args: Argparse arguments.

    Returns:
        The saved (or default) config, merged with the evaluation config
        and the command line `--config` settings.

    Raises:
        ValueError: If a checkpoint path is given, but no `params.pkl`
            can be found.
    """
    args.save_info = True
    config = None

    # If there is a checkpoint, find parameters
    config_path = ""
    if args.checkpoint_path:
        config_dir = os.path.dirname(args.checkpoint_path)
        config_path = os.path.join(config_dir, "params.pkl")
        # Try parent directory.
        if not os.path.exists(config_path):
            config_path = os.path.join(config_dir, "../params.pkl")

    if os.path.exists(config_path):
        # Load the config from pickled.
        with open(config_path, "rb") as f:
            config = cloudpickle.load(f)
    elif args.checkpoint_path:
        # If no config in given checkpoint -> Error.
        raise ValueError(
            "Could not find params.pkl in either the checkpoint dir or "
            "its parent directory AND no `--config` given on command "
            "line!")
    else:
        # Use default config for given agent.
        _, config = get_trainer_class(args.model_name, return_config=True)

    # Make sure worker 0 has an Env.
    config["create_env_on_driver"] = True

    # Merge with `evaluation_config` (first try from command line, then from
    # pkl file).
    saved_eval_config = config.get("evaluation_config", {})
    evaluation_config = copy.deepcopy(
        args.config.get("evaluation_config", saved_eval_config))
    config = merge_dicts(config, evaluation_config)

    # Merge with command line `--config` settings (if not already the same
    # anyways).
    # Adds any custom arguments here
    config = merge_dicts(config, args.config)

    if not args.env_name:
        if not config.get("env"):
            parser.error("the following arguments are required: --env")
        args.env_name = config.get("env")

    # Make sure we have evaluation workers.
    if not config.get("evaluation_num_workers"):
        config["evaluation_num_workers"] = config.get("num_workers", 0)
    if not config.get("evaluation_num_episodes"):
        config["evaluation_num_episodes"] = 1

    return config
"policies_to_train": ["policy_1"], }, "callbacks": YanivCallbacks, "log_level": "INFO", "evaluation_num_workers": 0, "evaluation_config": {"explore": False}, "evaluation_interval": args.eval_int, "custom_eval_function": make_eval_func(env_config, args.eval_num), # hyper params "model": { "custom_model": "yaniv_mask", "fcnet_hiddens": [512, 512], }, } resources = get_trainer_class(config["algorithm"]).default_resource_request(config) results = tune.run( YanivTrainer, resources_per_trial=resources, name=args.name, config=config, stop={"training_iteration": 1000}, checkpoint_freq=5, checkpoint_at_end=True, verbose=Verbosity.V3_TRIAL_DETAILS, callbacks=[ WandbLoggerCallback( project="rllib_yaniv", log_config=True, id=args.wandb_id,
def run_heuristic_vs_learned(args, use_lstm=False, trainer="PG"):
    """Run heuristic policies vs a learned agent.

    The learned agent should eventually reach a reward of ~5 with
    use_lstm=False, and ~7 with use_lstm=True. The reason the LSTM policy
    can perform better is since it can distinguish between the always_same vs
    beat_last heuristics.

    Args:
        args: Parsed command line args; reads `framework`, `stop_iters`,
            `stop_timesteps`, `stop_reward` and `as_test`.
        use_lstm: Whether the learned policy's model uses an LSTM.
        trainer: Trainer class, or the name of one (resolved via
            `get_trainer_class`).

    Raises:
        ValueError: If `args.as_test` is set and the loop ends without the
            desired reward having been reached.
    """

    def select_policy(agent_id, episode, **kwargs):
        if agent_id == "player_0":
            return "learned"
        else:
            return random.choice(["always_same", "beat_last"])

    config = {
        "env": "RockPaperScissors",
        "gamma": 0.9,
        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
        "num_workers": 0,
        "num_envs_per_worker": 4,
        "rollout_fragment_length": 10,
        "train_batch_size": 200,
        "metrics_num_episodes_for_smoothing": 200,
        "multiagent": {
            "policies_to_train": ["learned"],
            "policies": {
                "always_same": PolicySpec(policy_class=AlwaysSameHeuristic),
                "beat_last": PolicySpec(policy_class=BeatLastHeuristic),
                "learned": PolicySpec(config={
                    "model": {
                        "use_lstm": use_lstm
                    },
                    "framework": args.framework,
                }),
            },
            "policy_mapping_fn": select_policy,
        },
        "framework": args.framework,
    }
    cls = get_trainer_class(trainer) if isinstance(trainer, str) else trainer
    trainer_obj = cls(config=config)

    # Bind up front: the error message below reads this value even if the
    # loop body never runs (e.g. `args.stop_iters == 0`).
    reward_diff = 0
    for _ in range(args.stop_iters):
        results = trainer_obj.train()
        # No per-policy reward stats yet -> nothing to evaluate.
        if "policy_always_same_reward" not in results["hist_stats"]:
            reward_diff = 0
            continue
        # Sum of the learned policy's rewards in the reported history.
        reward_diff = sum(results["hist_stats"]["policy_learned_reward"])
        # Timesteps reached.
        if results["timesteps_total"] > args.stop_timesteps:
            break
        # Reward (difference) reached -> all good, return.
        elif reward_diff > args.stop_reward:
            return

    # Reward (difference) not reached: Error if `as_test`.
    if args.as_test:
        raise ValueError(
            "Desired reward difference ({}) not reached! Only got to {}.".
            format(args.stop_reward, reward_diff))
def run(args, parser):
    """Restore a Trainer (optionally from a checkpoint) and run a rollout.

    The config is read from a `params.pkl` next to (or one directory
    above) `args.checkpoint`. Without a checkpoint, the default config of
    `args.run` is used. The evaluation config and the command line
    `--config` settings are merged on top of it.

    Args:
        args: Parsed command line args.
        parser: The arg parser; used for reporting a missing `--env`.

    Raises:
        ValueError: If a checkpoint is given, but no `params.pkl` can be
            found.
    """
    # Load configuration from checkpoint file.
    config_path = ""
    if args.checkpoint:
        config_dir = os.path.dirname(args.checkpoint)
        config_path = os.path.join(config_dir, "params.pkl")
        # Try parent directory.
        if not os.path.exists(config_path):
            config_path = os.path.join(config_dir, "../params.pkl")

    if os.path.exists(config_path):
        # Load the config from pickled.
        with open(config_path, "rb") as f:
            config = cloudpickle.load(f)
    elif args.checkpoint:
        # If no config in given checkpoint -> Error.
        raise ValueError(
            "Could not find params.pkl in either the checkpoint dir or "
            "its parent directory AND no `--config` given on command "
            "line!")
    else:
        # Use default config for given agent.
        _, config = get_trainer_class(args.run, return_config=True)

    # Make sure worker 0 has an Env.
    config["create_env_on_driver"] = True

    # Merge with `evaluation_config` (first try from command line, then from
    # pkl file).
    saved_eval_config = config.get("evaluation_config", {})
    evaluation_config = copy.deepcopy(
        args.config.get("evaluation_config", saved_eval_config))
    config = merge_dicts(config, evaluation_config)
    # Merge with command line `--config` settings (if not already the same
    # anyways).
    config = merge_dicts(config, args.config)

    if not args.env:
        if not config.get("env"):
            parser.error("the following arguments are required: --env")
        args.env = config.get("env")

    # Make sure we have evaluation workers.
    if not config.get("evaluation_num_workers"):
        config["evaluation_num_workers"] = config.get("num_workers", 0)
    if not config.get("evaluation_num_episodes"):
        config["evaluation_num_episodes"] = 1

    config["render_env"] = not args.no_render
    config["record_env"] = args.video_dir

    ray.init(local_mode=args.local_mode)

    # Create the Trainer from config.
    trainable_cls = get_trainable_cls(args.run)
    agent = trainable_cls(env=args.env, config=config)

    # Load state from checkpoint, if provided.
    if args.checkpoint:
        agent.restore(args.checkpoint)

    num_steps = int(args.steps)
    num_episodes = int(args.episodes)

    # Determine the video output directory.
    # Allow user to specify a video output path.
    if args.video_dir:
        video_dir = os.path.expanduser(args.video_dir)
    else:
        video_dir = None

    # Do the actual rollout.
    with RolloutSaver(
            args.out,
            args.use_shelve,
            write_update_file=args.track_progress,
            target_steps=num_steps,
            target_episodes=num_episodes,
            save_info=args.save_info) as saver:
        rollout(agent, args.env, num_steps, num_episodes, saver,
                args.no_render, video_dir)
    agent.stop()
""" alg_name = "PPO" # Function that outputs the environment you wish to register. def env_creator(config): env = pistonball_v4.env(local_ratio=config.get("local_ratio", 0.2)) env = dtype_v0(env, dtype=float32) env = color_reduction_v0(env, mode="R") env = normalize_obs_v0(env) return env num_cpus = 1 num_rollouts = 2 # Gets default training configuration and specifies the POMgame to load. config = deepcopy(get_trainer_class(alg_name)._default_config) # Set environment config. This will be passed to # the env_creator function via the register env lambda below. config["env_config"] = {"local_ratio": 0.5} # Register env register_env("pistonball", lambda config: PettingZooEnv(env_creator(config))) # Configuration for multiagent setup with policy sharing: config["multiagent"] = { # Setup a single, shared policy for all agents. "policies": {"av"}, # Map all agents to that policy. "policy_mapping_fn": lambda agent_id, episode, **kwargs: "av",
def ckpt_restore_test(alg_name, tfe=False, object_store=False):
    """Check that a restored Trainer computes the same actions.

    Two Trainers are built per framework; the first one is trained for an
    iteration and its state is transferred to the second one. Optimizer
    states and the mean actions for a random observation are compared.

    Args:
        alg_name: Algorithm name; key into `CONFIGS` and resolved via
            `get_trainer_class`.
        tfe: Whether to also run the "tfe" framework.
        object_store: Whether to additionally transfer the state through
            `save_to_object()`/`restore_from_object()`.

    Raises:
        AssertionError: If the two mean actions differ by more than 0.1.
    """
    config = CONFIGS[alg_name]
    frameworks = (["tfe"] if tfe else []) + ["torch", "tf"]

    # DDPG/SAC variants run on Pendulum (3 obs values), all others on
    # CartPole (4 obs values).
    on_pendulum = "DDPG" in alg_name or "SAC" in alg_name
    env_name = "Pendulum-v0" if on_pendulum else "CartPole-v0"
    obs_size = 3 if on_pendulum else 4
    store_modes = [False, True] if object_store else [False]

    for fw in framework_iterator(config, frameworks=frameworks):
        for use_object_store in store_modes:
            print("use_object_store={}".format(use_object_store))
            cls = get_trainer_class(alg_name)
            alg1 = cls(config=config, env=env_name)
            alg2 = cls(config=config, env=env_name)

            policy1 = alg1.get_policy()

            res = alg1.train()
            print("current status: " + str(res))

            # Check optimizer state as well.
            optim_state = policy1.get_state().get("_optimizer_variables")

            # Sync the models
            if use_object_store:
                alg2.restore_from_object(alg1.save_to_object())
            else:
                alg2.restore(alg1.save())

            # Compare optimizer state with re-loaded one.
            if optim_state:
                s2 = alg2.get_policy().get_state().get("_optimizer_variables")
                if fw in ["tf2", "tf", "tfe"]:
                    # Tf -> Compare states 1:1.
                    check(s2, optim_state)
                else:
                    # For torch, optimizers have state_dicts with keys=params,
                    # which are different for the two models (ignore these
                    # different keys, but compare all values nevertheless).
                    for i, s2_ in enumerate(s2):
                        restored_values = list(s2_["state"].values())
                        trained_values = list(
                            optim_state[i]["state"].values())
                        check(restored_values, trained_values)

            # Compare the mean actions for one random observation.
            obs = np.clip(
                np.random.uniform(size=obs_size),
                policy1.observation_space.low,
                policy1.observation_space.high)
            a1 = get_mean_action(alg1, obs)
            a2 = get_mean_action(alg2, obs)
            print("Checking computed actions", alg1, obs, a1, a2)
            if abs(a1 - a2) > .1:
                raise AssertionError("algo={} [a1={} a2={}]".format(
                    alg_name, a1, a2))

            # Stop both Trainers.
            alg1.stop()
            alg2.stop()
def load_agent_config(args):
    """Build an agent (and its config) from a checkpoint or from defaults.

    The config is read from a `params.pkl` next to (or one directory
    above) `args.checkpoint`. Without a checkpoint, the default config of
    `args.run` is used. The env is registered under `args.env` using
    `env_creator` (not defined in this function; taken from the
    surrounding scope).

    Args:
        args: Parsed command line args.

    Returns:
        Tuple of (agent, config).

    Raises:
        ValueError: If a checkpoint is given, but no `params.pkl` can be
            found.
    """
    # Load configuration from checkpoint file.
    config_path = ""
    if args.checkpoint:
        config_dir = os.path.dirname(args.checkpoint)
        config_path = os.path.join(config_dir, "params.pkl")
        # Try parent directory.
        if not os.path.exists(config_path):
            config_path = os.path.join(config_dir, "../params.pkl")

    if os.path.exists(config_path):
        # Load the config from pickled.
        with open(config_path, "rb") as f:
            config = cloudpickle.load(f)
    elif args.checkpoint:
        # If no config in given checkpoint -> Error.
        raise ValueError(
            "Could not find params.pkl in either the checkpoint dir or "
            "its parent directory AND no `--config` given on command "
            "line!")
    else:
        # Use default config for given agent.
        _, config = get_trainer_class(args.run, return_config=True)

    # Make sure worker 0 has an Env.
    config["num_workers"] = 0
    config["num_envs_per_worker"] = 1
    config["create_env_on_driver"] = True

    # Merge with `evaluation_config` (first try from command line, then from
    # pkl file).
    saved_eval_config = config.get("evaluation_config", {})
    evaluation_config = copy.deepcopy(
        args.config.get("evaluation_config", saved_eval_config))
    config = merge_dicts(config, evaluation_config)
    # Merge with command line `--config` settings (if not already the same
    # anyways).
    config = merge_dicts(config, args.config)

    if not args.env:
        args.env = config.get("env")

    if not config.get("evaluation_num_episodes"):
        config["evaluation_num_episodes"] = 1

    config["render_env"] = args.render
    config["record_env"] = args.video_dir

    # Hand the agent speeds to the env through its env_config.
    if config.get("env_config") is None:
        config["env_config"] = {}
    print(args.agent_speeds)
    config["env_config"]["agent_speeds"] = args.agent_speeds

    register_env(args.env, env_creator)

    # Create the Trainer from config.
    trainable_cls = get_trainable_cls(args.run)
    agent = trainable_cls(env=args.env, config=config)

    # Load state from checkpoint, if provided.
    if args.checkpoint:
        agent.restore(args.checkpoint)

    return agent, config
"train_batch_size": 4000, "model": { "use_lstm": args.use_lstm }, }) checkpoint_path = CHECKPOINT_FILE.format(args.run) # Attempt to restore from checkpoint, if possible. if not args.no_restore and os.path.exists(checkpoint_path): checkpoint_path = open(checkpoint_path).read() else: checkpoint_path = None # Manual training loop (no Ray tune). if args.no_tune: trainer_cls = get_trainer_class(args.run) trainer = trainer_cls(config=config) if checkpoint_path: print("Restoring from checkpoint path", checkpoint_path) trainer.restore(checkpoint_path) # Serving and training loop. ts = 0 for _ in range(args.stop_iters): results = trainer.train() print(pretty_print(results)) checkpoint = trainer.save() print("Last checkpoint", checkpoint) with open(checkpoint_path, "w") as f: f.write(checkpoint)
experiment["config"]["eager_tracing"] = True # experiment["config"]["callbacks"] = MemoryTrackingCallbacks # Move "env" specifier into config. experiment["config"]["env"] = experiment["env"] experiment.pop("env", None) # Print out the actual config. print("== Test config ==") print(yaml.dump(experiment)) # Construct the trainer instance based on the given config. leaking = True try: ray.init(num_cpus=5, local_mode=args.local_mode) trainer = get_trainer_class(experiment["run"])( experiment["config"]) results = check_memory_leaks( trainer, to_check=set(args.to_check), ) if not results: leaking = False finally: ray.shutdown() _register_all() if not leaking: print("Memory leak test PASSED") else: print("Memory leak test FAILED. Exiting with Error.") sys.exit(1)
ray.init(num_cpus=5) # TRAIN results = tune.run("RNNSAC", **config) # TEST best_checkpoint = results.best_checkpoint print("Loading checkpoint: {}".format(best_checkpoint)) checkpoint_config_path = str( Path(best_checkpoint).parent.parent / "params.json") with open(checkpoint_config_path, "rb") as f: checkpoint_config = json.load(f) checkpoint_config["explore"] = False agent = get_trainer_class("RNNSAC")(env=config["config"]["env"], config=checkpoint_config) agent.restore(best_checkpoint) env = agent.env_creator({}) state = agent.get_policy().get_initial_state() prev_action = 0 prev_reward = 0 obs = env.reset() eps = 0 ep_reward = 0 while eps < 10: action, state, info_trainer = agent.compute_action( obs, state=state, prev_action=prev_action,
# ppo # "algorithm": "PPO", # "sgd_minibatch_size": 2048, # "train_batch_size": 65536, # "rollout_fragment_length": 100 # dqn "algorithm": "APEX", "train_batch_size": 4096, "dueling": False, "hiddens": [], "" # "double_q": False, } trainer_class = get_trainer_class(config["algorithm"]) # config.pop("algorithm") # trainer = trainer_class(env="yaniv", config=config) # trainer.train() resources = trainer_class.default_resource_request(config) results = tune.run( YanivTrainer, resources_per_trial=resources, name=args.name, config=config, stop={"training_iteration": 20000}, checkpoint_freq=5, checkpoint_at_end=True, verbose=Verbosity.V3_TRIAL_DETAILS,