Code Example #1
    def test_cql_compilation(self):
        """Test whether a CQLTrainer can be built with all frameworks."""

        # Learns from a historic-data file.
        # To generate this data, first run:
        # $ ./train.py --run=SAC --env=Pendulum-v0 \
        #   --stop='{"timesteps_total": 50000}' \
        #   --config='{"output": "/tmp/out"}'
        rllib_dir = Path(__file__).parent.parent.parent.parent
        print("rllib dir={}".format(rllib_dir))
        data_file = os.path.join(rllib_dir, "tests/data/pendulum/small.json")
        print("data_file={} exists={}".format(data_file,
                                              os.path.isfile(data_file)))

        config = cql.CQL_DEFAULT_CONFIG.copy()
        config["env"] = "Pendulum-v0"
        config["input"] = [data_file]

        config["num_workers"] = 0  # Run locally.
        config["twin_q"] = True
        config["clip_actions"] = False
        config["normalize_actions"] = True
        config["learning_starts"] = 0
        config["rollout_fragment_length"] = 1
        config["train_batch_size"] = 10

        num_iterations = 2

        # Test for torch framework (tf not implemented yet).
        for _ in framework_iterator(config, frameworks="torch"):
            trainer = cql.CQLTrainer(config=config)
            for i in range(num_iterations):
                trainer.train()
            check_compute_single_action(trainer)
            trainer.stop()
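
The examples in this listing assume a common set of imports and RLlib test utilities. The block below is a minimal sketch of that header, assuming the `ray.rllib.agents`-era module layout these tests were written against; newer RLlib releases have since moved some of these modules.

    # Sketch of the imports assumed by the examples in this listing
    # (module paths follow the ray.rllib.agents-era layout).
    import os
    from pathlib import Path

    import numpy as np

    import ray.rllib.agents.cql as cql
    from ray.rllib.utils.framework import try_import_tf, try_import_torch
    from ray.rllib.utils.test_utils import (check_compute_single_action,
                                            check_train_results,
                                            framework_iterator)

    tf1, tf, tfv = try_import_tf()
    torch, _ = try_import_torch()
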
Code Example #2
    def test_cql_compilation(self):
        """Test whether a MAMLTrainer can be built with all frameworks."""
        config = cql.CQL_DEFAULT_CONFIG.copy()
        config["num_workers"] = 0  # Run locally.
        config["twin_q"] = True
        config["clip_actions"] = False
        config["normalize_actions"] = True
        config["learning_starts"] = 0
        config["rollout_fragment_length"] = 10
        config["train_batch_size"] = 10
        num_iterations = 1

        # Test for torch framework (tf not implemented yet).
        for fw in framework_iterator(config, frameworks="torch"):
            for env in [
                    "MountainCarContinuous-v0",
            ]:
                print("env={}".format(env))
                trainer = cql.CQLTrainer(config=config, env=env)
                for i in range(num_iterations):
                    trainer.train()
                check_compute_single_action(trainer)
                trainer.stop()
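
For completeness: in the RLlib test files these methods come from, each test is wrapped in a unittest class that starts and stops Ray once for the whole class. The scaffolding below is a sketch of that pattern, not a verbatim copy of any particular file; the class name and the pytest entry point are assumptions.

    # Hedged sketch of the surrounding test scaffolding.
    import unittest

    import ray


    class TestCQL(unittest.TestCase):
        @classmethod
        def setUpClass(cls) -> None:
            ray.init()

        @classmethod
        def tearDownClass(cls) -> None:
            ray.shutdown()

        # ... the test_cql_compilation() methods shown above go here ...


    if __name__ == "__main__":
        import sys

        import pytest

        sys.exit(pytest.main(["-v", __file__]))
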
Code Example #3
    def test_cql_compilation(self):
        """Test whether a CQLTrainer can be built with all frameworks."""

        # Learns from a historic-data file.
        # To generate this data, first run:
        # $ ./train.py --run=SAC --env=Pendulum-v0 \
        #   --stop='{"timesteps_total": 50000}' \
        #   --config='{"output": "/tmp/out"}'
        rllib_dir = Path(__file__).parent.parent.parent.parent
        print("rllib dir={}".format(rllib_dir))
        data_file = os.path.join(rllib_dir, "tests/data/pendulum/small.json")
        print("data_file={} exists={}".format(data_file,
                                              os.path.isfile(data_file)))

        config = cql.CQL_DEFAULT_CONFIG.copy()
        config["env"] = "Pendulum-v0"
        config["input"] = [data_file]

        config["num_workers"] = 0  # Run locally.
        config["twin_q"] = True
        config["clip_actions"] = False
        config["normalize_actions"] = True
        config["learning_starts"] = 0
        config["rollout_fragment_length"] = 1
        config["train_batch_size"] = 10

        # Switch on off-policy evaluation.
        config["input_evaluation"] = ["is"]

        num_iterations = 2

        # Test for torch framework (tf not implemented yet).
        for _ in framework_iterator(config, frameworks="torch"):
            trainer = cql.CQLTrainer(config=config)
            for i in range(num_iterations):
                print(trainer.train())

            check_compute_single_action(trainer)

            # Get policy, model, and replay-buffer.
            pol = trainer.get_policy()
            cql_model = pol.model
            from ray.rllib.agents.cql.cql import replay_buffer

            # Example on how to do evaluation on the trained Trainer
            # using the data from our buffer.
            # Get a sample (MultiAgentBatch -> SampleBatch).
            batch = replay_buffer.replay().policy_batches["default_policy"]
            obs = torch.from_numpy(batch["obs"])
            # Pass the observations through our model to get the
            # features, which we then pass through the Q-head.
            model_out, _ = cql_model({"obs": obs})
            # The estimated Q-values from the (historic) actions in the batch.
            q_values_old = cql_model.get_q_values(
                model_out, torch.from_numpy(batch["actions"]))
            # The estimated Q-values for the new actions computed
            # by our trainer policy.
            actions_new = pol.compute_actions_from_input_dict({"obs": obs})[0]
            q_values_new = cql_model.get_q_values(
                model_out, torch.from_numpy(actions_new))
            print(f"Q-val batch={q_values_old}")
            print(f"Q-val policy={q_values_new}")

            trainer.stop()
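
The tests above learn from `tests/data/pendulum/small.json`, a small file of SAC-generated offline experiences. If you want to peek at what such a file contains before handing it to CQL, RLlib's JsonReader can load it directly; the sketch below is illustrative only and reuses the `data_file` path from the examples.

    # Sketch: inspect an RLlib offline JSON file with JsonReader.
    from ray.rllib.offline import JsonReader

    reader = JsonReader(data_file)   # or point it at the /tmp/out directory
    batch = reader.next()            # one SampleBatch of logged experiences
    print(batch.keys())              # e.g. obs, actions, rewards, dones, ...
    print("batch size:", batch.count)
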
Code Example #4
    config["evaluation_num_workers"] = 1
    config["evaluation_interval"] = 1
    config["evaluation_duration"] = 10
    # This should be False because iterations are very long and evaluation
    # would otherwise lag one iteration behind training.
    config["evaluation_parallel_to_training"] = False
    # Evaluate on actual environment.
    config["evaluation_config"] = {"input": "sampler"}

    # Check whether we can learn from the given file within `num_iterations`
    # iterations, reaching at least `min_reward`.
    num_iterations = 5
    min_reward = -300

    # Test for torch framework (tf not implemented yet).
    trainer = cql.CQLTrainer(config=config)
    learnt = False
    for i in range(num_iterations):
        print(f"Iter {i}")
        eval_results = trainer.train().get("evaluation")
        if eval_results:
            print("... R={}".format(eval_results["episode_reward_mean"]))
            # Learn until some reward is reached on an actual live env.
            if eval_results["episode_reward_mean"] >= min_reward:
                learnt = True
                break
    if not learnt:
        raise ValueError("CQLTrainer did not reach {} reward from expert "
                         "offline data!".format(min_reward))

    # Get policy, model, and replay-buffer.
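
Code Example #4 stops as soon as evaluation on the real environment reaches `min_reward`. A quick way to exercise the trained trainer on a live environment afterwards is a manual rollout like the sketch below; the gym import, the env name, and the older 4-tuple gym step API are assumptions consistent with the Pendulum setup used in these tests.

    # Hedged sketch: roll out the trained trainer on a live Pendulum env.
    import gym

    env = gym.make("Pendulum-v1")
    obs = env.reset()
    done, total_reward = False, 0.0
    while not done:
        # Deterministic action from the trained policy.
        action = trainer.compute_single_action(obs, explore=False)
        obs, reward, done, _ = env.step(action)
        total_reward += reward
    print(f"Live-env episode reward: {total_reward}")
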
Code Example #5
    def test_cql_compilation(self):
        """Test whether a CQLTrainer can be built with all frameworks."""

        # Learns from a historic-data file.
        # To generate this data, first run:
        # $ ./train.py --run=SAC --env=Pendulum-v1 \
        #   --stop='{"timesteps_total": 50000}' \
        #   --config='{"output": "/tmp/out"}'
        rllib_dir = Path(__file__).parent.parent.parent.parent
        print("rllib dir={}".format(rllib_dir))
        data_file = os.path.join(rllib_dir, "tests/data/pendulum/small.json")
        print("data_file={} exists={}".format(data_file,
                                              os.path.isfile(data_file)))

        config = cql.CQL_DEFAULT_CONFIG.copy()
        config["env"] = "Pendulum-v1"
        config["input"] = [data_file]

        # In the files we use here for testing, actions have already
        # been normalized.
        # This is usually the case when the file was generated by another
        # RLlib algorithm (e.g. PPO or SAC).
        config["actions_in_input_normalized"] = False
        config["clip_actions"] = True
        config["train_batch_size"] = 2000

        config["num_workers"] = 0  # Run locally.
        config["twin_q"] = True
        config["learning_starts"] = 0
        config["bc_iters"] = 2  # 2 BC iters, 2 CQL iters.
        config["rollout_fragment_length"] = 1

        # Switch on off-policy evaluation.
        config["input_evaluation"] = ["is"]

        config["evaluation_interval"] = 2
        config["evaluation_duration"] = 10
        config["evaluation_config"]["input"] = "sampler"
        config["evaluation_parallel_to_training"] = False
        config["evaluation_num_workers"] = 2

        num_iterations = 4

        # Test for tf/torch frameworks.
        for fw in framework_iterator(config, with_eager_tracing=True):
            trainer = cql.CQLTrainer(config=config)
            for i in range(num_iterations):
                results = trainer.train()
                check_train_results(results)
                print(results)
                eval_results = results.get("evaluation")
                if eval_results:
                    print(f"iter={trainer.iteration} "
                          f"R={eval_results['episode_reward_mean']}")

            check_compute_single_action(trainer)

            # Get policy and model.
            pol = trainer.get_policy()
            cql_model = pol.model
            if fw == "tf":
                pol.get_session().__enter__()

            # Example on how to do evaluation on the trained Trainer
            # using the data from CQL's global replay buffer.
            # Get a sample (MultiAgentBatch -> SampleBatch).
            batch = trainer.local_replay_buffer.replay().policy_batches[
                "default_policy"]

            if fw == "torch":
                obs = torch.from_numpy(batch["obs"])
            else:
                obs = batch["obs"]
                batch["actions"] = batch["actions"].astype(np.float32)

            # Pass the observations through our model to get the
            # features, which we then pass through the Q-head.
            model_out, _ = cql_model({"obs": obs})
            # The estimated Q-values from the (historic) actions in the batch.
            if fw == "torch":
                q_values_old = cql_model.get_q_values(
                    model_out, torch.from_numpy(batch["actions"]))
            else:
                q_values_old = cql_model.get_q_values(
                    tf.convert_to_tensor(model_out), batch["actions"])

            # The estimated Q-values for the new actions computed
            # by our trainer policy.
            actions_new = pol.compute_actions_from_input_dict({"obs": obs})[0]
            if fw == "torch":
                q_values_new = cql_model.get_q_values(
                    model_out, torch.from_numpy(actions_new))
            else:
                q_values_new = cql_model.get_q_values(model_out, actions_new)

            if fw == "tf":
                q_values_old, q_values_new = pol.get_session().run(
                    [q_values_old, q_values_new])

            print(f"Q-val batch={q_values_old}")
            print(f"Q-val policy={q_values_new}")

            if fw == "tf":
                pol.get_session().__exit__(None, None, None)

            trainer.stop()
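
As a small follow-up to the Q-value printout at the end of Code Example #5, the sketch below shows one way to summarize the two tensors in the torch case (in the tf branch they are already NumPy arrays after the `session.run` call above).

    # Hedged sketch: summarize the Q-values printed above (torch case only).
    q_old = q_values_old.detach().cpu().numpy()
    q_new = q_values_new.detach().cpu().numpy()
    print("mean Q over batch actions: ", float(np.mean(q_old)))
    print("mean Q over policy actions:", float(np.mean(q_new)))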