def test_cql_compilation(self):
    """Test whether a CQLTrainer can be built with all frameworks."""

    # Learns from a historic-data file.
    # To generate this data, first run:
    # $ ./train.py --run=SAC --env=Pendulum-v0 \
    #   --stop='{"timesteps_total": 50000}' \
    #   --config='{"output": "/tmp/out"}'
    rllib_dir = Path(__file__).parent.parent.parent.parent
    print("rllib dir={}".format(rllib_dir))
    data_file = os.path.join(rllib_dir, "tests/data/pendulum/small.json")
    print("data_file={} exists={}".format(
        data_file, os.path.isfile(data_file)))

    config = cql.CQL_DEFAULT_CONFIG.copy()
    config["env"] = "Pendulum-v0"
    config["input"] = [data_file]
    config["num_workers"] = 0  # Run locally.
    config["twin_q"] = True
    config["clip_actions"] = False
    config["normalize_actions"] = True
    config["learning_starts"] = 0
    config["rollout_fragment_length"] = 1
    config["train_batch_size"] = 10

    num_iterations = 2

    # Test for torch framework (tf not implemented yet).
    for _ in framework_iterator(config, frameworks="torch"):
        trainer = cql.CQLTrainer(config=config)
        for i in range(num_iterations):
            trainer.train()
        check_compute_single_action(trainer)
        trainer.stop()
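# The shell command in the comment above also has a Python-API equivalent.
# A minimal sketch, assuming RLlib's 1.x-style `sac.SACTrainer` and the
# `"output"` config key; the exact SAC settings used to produce the
# checked-in `small.json` are an assumption here, and the helper name is
# hypothetical.
def generate_pendulum_offline_data(out_dir="/tmp/out"):
    import ray
    from ray.rllib.agents import sac

    ray.init(ignore_reinit_error=True)
    config = sac.DEFAULT_CONFIG.copy()
    config["env"] = "Pendulum-v0"
    # Write all collected experiences as JSON files into `out_dir`.
    config["output"] = out_dir
    trainer = sac.SACTrainer(config=config)
    # Keep training until ~50k env steps have been sampled.
    while trainer.train()["timesteps_total"] < 50000:
        pass
    trainer.stop()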
def test_cql_compilation(self):
    """Test whether a CQLTrainer can be built with all frameworks."""
    config = cql.CQL_DEFAULT_CONFIG.copy()
    config["num_workers"] = 0  # Run locally.
    config["twin_q"] = True
    config["clip_actions"] = False
    config["normalize_actions"] = True
    config["learning_starts"] = 0
    config["rollout_fragment_length"] = 10
    config["train_batch_size"] = 10

    num_iterations = 1

    # Test for torch framework (tf not implemented yet).
    for fw in framework_iterator(config, frameworks="torch"):
        for env in [
                "MountainCarContinuous-v0",
        ]:
            print("env={}".format(env))
            trainer = cql.CQLTrainer(config=config, env=env)
            for i in range(num_iterations):
                trainer.train()
            check_compute_single_action(trainer)
            trainer.stop()
def test_cql_compilation(self):
    """Test whether a CQLTrainer can be built with all frameworks."""

    # Learns from a historic-data file.
    # To generate this data, first run:
    # $ ./train.py --run=SAC --env=Pendulum-v0 \
    #   --stop='{"timesteps_total": 50000}' \
    #   --config='{"output": "/tmp/out"}'
    rllib_dir = Path(__file__).parent.parent.parent.parent
    print("rllib dir={}".format(rllib_dir))
    data_file = os.path.join(rllib_dir, "tests/data/pendulum/small.json")
    print("data_file={} exists={}".format(
        data_file, os.path.isfile(data_file)))

    config = cql.CQL_DEFAULT_CONFIG.copy()
    config["env"] = "Pendulum-v0"
    config["input"] = [data_file]
    config["num_workers"] = 0  # Run locally.
    config["twin_q"] = True
    config["clip_actions"] = False
    config["normalize_actions"] = True
    config["learning_starts"] = 0
    config["rollout_fragment_length"] = 1
    config["train_batch_size"] = 10
    # Switch on off-policy evaluation.
    config["input_evaluation"] = ["is"]

    num_iterations = 2

    # Test for torch framework (tf not implemented yet).
    for _ in framework_iterator(config, frameworks="torch"):
        trainer = cql.CQLTrainer(config=config)
        for i in range(num_iterations):
            print(trainer.train())
        check_compute_single_action(trainer)

        # Get policy, model, and replay-buffer.
        pol = trainer.get_policy()
        cql_model = pol.model
        from ray.rllib.agents.cql.cql import replay_buffer

        # Example on how to do evaluation on the trained Trainer
        # using the data from our buffer.
        # Get a sample (MultiAgentBatch -> SampleBatch).
        batch = replay_buffer.replay().policy_batches["default_policy"]
        obs = torch.from_numpy(batch["obs"])
        # Pass the observations through our model to get the
        # features, which we then pass through the Q-head.
        model_out, _ = cql_model({"obs": obs})
        # The estimated Q-values from the (historic) actions in the batch.
        q_values_old = cql_model.get_q_values(
            model_out, torch.from_numpy(batch["actions"]))
        # The estimated Q-values for the new actions computed
        # by our trainer policy.
        actions_new = pol.compute_actions_from_input_dict({"obs": obs})[0]
        q_values_new = cql_model.get_q_values(
            model_out, torch.from_numpy(actions_new))
        print(f"Q-val batch={q_values_old}")
        print(f"Q-val policy={q_values_new}")

        trainer.stop()
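# For reference: the `config["input_evaluation"] = ["is"]` setting above
# switches on RLlib's step-wise importance-sampling (IS) off-policy
# estimator. A rough sketch of the underlying ratio computation, assuming
# a torch policy and a SampleBatch whose offline file logged the behavior
# policy's "action_prob" column; the helper name is hypothetical.
def importance_sampling_ratios(pol, batch):
    import numpy as np
    # Log-likelihoods of the logged actions under the *current* policy.
    log_likelihoods = pol.compute_log_likelihoods(
        actions=batch["actions"], obs_batch=batch["obs"])
    new_prob = np.exp(log_likelihoods.detach().cpu().numpy())
    # Per-step ratios pi(a|s) / beta(a|s), which weight the logged
    # rewards in the IS return estimate.
    return new_prob / batch["action_prob"]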
config["evaluation_num_workers"] = 1 config["evaluation_interval"] = 1 config["evaluation_duration"] = 10 # This should be False b/c iterations are very long and this would # cause evaluation to lag one iter behind training. config["evaluation_parallel_to_training"] = False # Evaluate on actual environment. config["evaluation_config"] = {"input": "sampler"} # Check, whether we can learn from the given file in `num_iterations` # iterations, up to a reward of `min_reward`. num_iterations = 5 min_reward = -300 # Test for torch framework (tf not implemented yet). trainer = cql.CQLTrainer(config=config) learnt = False for i in range(num_iterations): print(f"Iter {i}") eval_results = trainer.train().get("evaluation") if eval_results: print("... R={}".format(eval_results["episode_reward_mean"])) # Learn until some reward is reached on an actual live env. if eval_results["episode_reward_mean"] >= min_reward: learnt = True break if not learnt: raise ValueError("CQLTrainer did not reach {} reward from expert " "offline data!".format(min_reward)) # Get policy, model, and replay-buffer.
def test_cql_compilation(self):
    """Test whether a CQLTrainer can be built with all frameworks."""

    # Learns from a historic-data file.
    # To generate this data, first run:
    # $ ./train.py --run=SAC --env=Pendulum-v1 \
    #   --stop='{"timesteps_total": 50000}' \
    #   --config='{"output": "/tmp/out"}'
    rllib_dir = Path(__file__).parent.parent.parent.parent
    print("rllib dir={}".format(rllib_dir))
    data_file = os.path.join(rllib_dir, "tests/data/pendulum/small.json")
    print("data_file={} exists={}".format(
        data_file, os.path.isfile(data_file)))

    config = cql.CQL_DEFAULT_CONFIG.copy()
    config["env"] = "Pendulum-v1"
    config["input"] = [data_file]

    # In the files we use here for testing, actions have already
    # been normalized.
    # This is usually the case when the file was generated by another
    # RLlib algorithm (e.g. PPO or SAC).
    config["actions_in_input_normalized"] = False
    config["clip_actions"] = True
    config["train_batch_size"] = 2000

    config["num_workers"] = 0  # Run locally.
    config["twin_q"] = True
    config["learning_starts"] = 0
    config["bc_iters"] = 2  # 2 BC iters, 2 CQL iters.
    config["rollout_fragment_length"] = 1

    # Switch on off-policy evaluation.
    config["input_evaluation"] = ["is"]

    config["evaluation_interval"] = 2
    config["evaluation_duration"] = 10
    config["evaluation_config"]["input"] = "sampler"
    config["evaluation_parallel_to_training"] = False
    config["evaluation_num_workers"] = 2

    num_iterations = 4

    # Test for tf/torch frameworks.
    for fw in framework_iterator(config, with_eager_tracing=True):
        trainer = cql.CQLTrainer(config=config)
        for i in range(num_iterations):
            results = trainer.train()
            check_train_results(results)
            print(results)
            eval_results = results.get("evaluation")
            if eval_results:
                print(f"iter={trainer.iteration} "
                      f"R={eval_results['episode_reward_mean']}")
        check_compute_single_action(trainer)

        # Get policy and model.
        pol = trainer.get_policy()
        cql_model = pol.model
        if fw == "tf":
            pol.get_session().__enter__()

        # Example on how to do evaluation on the trained Trainer
        # using the data from CQL's global replay buffer.
        # Get a sample (MultiAgentBatch -> SampleBatch).
        batch = trainer.local_replay_buffer.replay().policy_batches[
            "default_policy"]

        if fw == "torch":
            obs = torch.from_numpy(batch["obs"])
        else:
            obs = batch["obs"]
            batch["actions"] = batch["actions"].astype(np.float32)

        # Pass the observations through our model to get the
        # features, which we then pass through the Q-head.
        model_out, _ = cql_model({"obs": obs})
        # The estimated Q-values from the (historic) actions in the batch.
        if fw == "torch":
            q_values_old = cql_model.get_q_values(
                model_out, torch.from_numpy(batch["actions"]))
        else:
            q_values_old = cql_model.get_q_values(
                tf.convert_to_tensor(model_out), batch["actions"])

        # The estimated Q-values for the new actions computed
        # by our trainer policy.
        actions_new = pol.compute_actions_from_input_dict({"obs": obs})[0]
        if fw == "torch":
            q_values_new = cql_model.get_q_values(
                model_out, torch.from_numpy(actions_new))
        else:
            q_values_new = cql_model.get_q_values(model_out, actions_new)

        if fw == "tf":
            q_values_old, q_values_new = pol.get_session().run(
                [q_values_old, q_values_new])

        print(f"Q-val batch={q_values_old}")
        print(f"Q-val policy={q_values_new}")

        if fw == "tf":
            pol.get_session().__exit__(None, None, None)

        trainer.stop()
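# Beyond the in-test checks, a trained CQLTrainer can be checkpointed and
# then used for plain inference on a live env. A minimal sketch, assuming
# RLlib's 1.x Trainer API (`save()`, `restore()`, `compute_single_action()`)
# and gym's pre-0.26 reset/step API; `checkpoint_and_infer` is a
# hypothetical helper.
def checkpoint_and_infer(trainer):
    import gym
    checkpoint_path = trainer.save()  # Write a checkpoint to disk.
    trainer.restore(checkpoint_path)  # Restore weights from it.
    env = gym.make("Pendulum-v1")
    obs = env.reset()
    done = False
    total_reward = 0.0
    while not done:
        # Greedy (non-exploratory) action from the restored policy.
        action = trainer.compute_single_action(obs, explore=False)
        obs, reward, done, _ = env.step(action)
        total_reward += reward
    env.close()
    print(f"Episode reward after restore: {total_reward}")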