Example 1
def experiment(config):
    iterations = config.pop("train-iterations")
    train_agent = ppo.PPO(config=config, env="CartPole-v0")
    checkpoint = None
    train_results = {}

    # Train
    for i in range(iterations):
        train_results = train_agent.train()
        if i % 2 == 0 or i == iterations - 1:
            checkpoint = train_agent.save(tune.get_trial_dir())
        tune.report(**train_results)
    train_agent.stop()

    # Manual Eval
    config["num_workers"] = 0
    eval_agent = ppo.PPO(config=config, env="CartPole-v0")
    eval_agent.restore(checkpoint)
    env = eval_agent.workers.local_worker().env

    obs = env.reset()
    done = False
    eval_results = {"eval_reward": 0, "eval_eps_length": 0}
    while not done:
        action = eval_agent.compute_single_action(obs)
        next_obs, reward, done, info = env.step(action)
        obs = next_obs
        eval_results["eval_reward"] += reward
        eval_results["eval_eps_length"] += 1
    results = {**train_results, **eval_results}
    tune.report(**results)
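The function above is a Tune trainable (it reports via tune.report), so it is meant to be passed to tune.run rather than called directly. A minimal launch sketch, assuming the same ray/tune/ppo imports the snippet relies on; the "train-iterations" key and the resource request are assumptions, not part of the original excerpt:

import ray
from ray import tune
from ray.rllib.algorithms import ppo  # import path assumed (older Ray: ray.rllib.agents.ppo)

if __name__ == "__main__":
    ray.init()
    config = ppo.DEFAULT_CONFIG.copy()
    # Custom key consumed by experiment() via config.pop("train-iterations").
    config["train-iterations"] = 2
    tune.run(
        experiment,
        config=config,
        # Reserve the resources the PPO built inside the trainable would
        # otherwise request on its own (assumption).
        resources_per_trial=ppo.PPO.default_resource_request(config),
    )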
Example 2
    def test_ppo_legacy_config(self):
        """Tests whether the old PPO config dict is still functional."""
        ppo_config = ppo.DEFAULT_CONFIG
        # Expect warning.
        print(f"Accessing learning-rate from legacy config dict: {ppo_config['lr']}")
        # Build Algorithm.
        ppo_trainer = ppo.PPO(config=ppo_config, env="CartPole-v1")
        print(ppo_trainer.train())
Example 3
def test_dont_import_torch_error():
    """Check that an error is thrown when torch isn't installed
    but we try to run a torch experiment.
    """
    # Do not import torch for testing purposes.
    os.environ["RLLIB_TEST_NO_TORCH_IMPORT"] = "1"
    config = {"framework": "torch"}
    with pytest.raises(ImportError,
                       match="However, there was no installation found."):
        ppo.PPO(config, env="CartPole-v1")
Example 4
    def test_curiosity_on_frozen_lake(self):
        config = ppo.DEFAULT_CONFIG.copy()
        # A very large frozen-lake that's hard for a random policy to solve
        # due to 0.0 feedback.
        config["env"] = "FrozenLake-v1"
        config["env_config"] = {
            "desc": [
                "SFFFFFFF",
                "FFFFFFFF",
                "FFFFFFFF",
                "FFFFFFFF",
                "FFFFFFFF",
                "FFFFFFFF",
                "FFFFFFFF",
                "FFFFFFFG",
            ],
            "is_slippery":
            False,
        }
        # Print out observations to see how far we already get inside the Env.
        config["callbacks"] = MyCallBack
        # Limit horizon to make it really hard for non-curious agent to reach
        # the goal state.
        config["horizon"] = 16
        # Local only.
        config["num_workers"] = 0
        config["lr"] = 0.001

        num_iterations = 10
        for _ in framework_iterator(config, frameworks=("tf", "torch")):
            # W/ Curiosity. Expect to learn something.
            config["exploration_config"] = {
                "type": "Curiosity",
                "eta": 0.2,
                "lr": 0.001,
                "feature_dim": 128,
                "feature_net_config": {
                    "fcnet_hiddens": [],
                    "fcnet_activation": "relu",
                },
                "sub_exploration": {
                    "type": "StochasticSampling",
                },
            }
            algo = ppo.PPO(config=config)
            learnt = False
            for i in range(num_iterations):
                result = algo.train()
                print(result)
                if result["episode_reward_max"] > 0.0:
                    print("Reached goal after {} iters!".format(i))
                    learnt = True
                    break
            algo.stop()
            self.assertTrue(learnt)
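MyCallBack is not defined in this excerpt. A minimal sketch of what such a callback could look like, assuming RLlib's DefaultCallbacks API; the class body is an assumption based only on the comment about printing observations:

from ray.rllib.algorithms.callbacks import DefaultCallbacks  # path assumed (older Ray: ray.rllib.agents.callbacks)


class MyCallBack(DefaultCallbacks):
    def on_episode_step(self, *, worker, base_env, policies=None, episode, **kwargs):
        # FrozenLake observations are Discrete indices into the 8x8 grid, so
        # printing them shows how far into the env the agent already gets.
        print("obs:", episode.last_obs_for())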
Example 5
    def test_ppo_free_log_std(self):
        """Tests the free log std option works."""
        config = (
            ppo.PPOConfig()
            .rollouts(
                num_rollout_workers=0,
            )
            .training(
                gamma=0.99,
                model=dict(
                    fcnet_hiddens=[10],
                    fcnet_activation="linear",
                    free_log_std=True,
                    vf_share_layers=True,
                ),
            )
        )

        for fw, sess in framework_iterator(config, session=True):
            trainer = ppo.PPO(config=config, env="CartPole-v0")
            policy = trainer.get_policy()

            # Check the free log std var is created.
            if fw == "torch":
                matching = [
                    v for (n, v) in policy.model.named_parameters() if "log_std" in n
                ]
            else:
                matching = [
                    v for v in policy.model.trainable_variables() if "log_std" in str(v)
                ]
            assert len(matching) == 1, matching
            log_std_var = matching[0]

            def get_value():
                if fw == "tf":
                    return policy.get_session().run(log_std_var)[0]
                elif fw == "torch":
                    return log_std_var.detach().cpu().numpy()[0]
                else:
                    return log_std_var.numpy()[0]

            # Check the variable is initially zero.
            init_std = get_value()
            assert init_std == 0.0, init_std
            batch = compute_gae_for_sample_batch(policy, FAKE_BATCH.copy())
            if fw == "torch":
                batch = policy._lazy_tensor_dict(batch)
            policy.learn_on_batch(batch)

            # Check the variable is updated.
            post_std = get_value()
            assert post_std != 0.0, post_std
            trainer.stop()
Example 6
def test_dont_import_tf_error():
    """Check that an error is thrown when tf isn't installed
    but we try to run a tf experiment.
    """
    # Do not import tf for testing purposes.
    os.environ["RLLIB_TEST_NO_TF_IMPORT"] = "1"

    config = {}
    for _ in framework_iterator(config, frameworks=("tf", "tf2", "tfe")):
        with pytest.raises(ImportError,
                           match="However, there was no installation found."):
            ppo.PPO(config, env="CartPole-v1")
Example 7
    def test_traj_view_lstm_prev_actions_and_rewards(self):
        """Tests, whether Policy/Model return correct LSTM ViewRequirements."""
        config = ppo.DEFAULT_CONFIG.copy()
        config["model"] = config["model"].copy()
        # Activate LSTM + prev-action + rewards.
        config["model"]["use_lstm"] = True
        config["model"]["lstm_use_prev_action"] = True
        config["model"]["lstm_use_prev_reward"] = True

        for _ in framework_iterator(config):
            trainer = ppo.PPO(config, env="CartPole-v0")
            policy = trainer.get_policy()
            view_req_model = policy.model.view_requirements
            view_req_policy = policy.view_requirements
            # 7=obs, prev-a + r, 2x state-in, 2x state-out.
            assert len(view_req_model) == 7, view_req_model
            assert len(view_req_policy) == 20, (len(view_req_policy),
                                                view_req_policy)
            for key in [
                    SampleBatch.OBS,
                    SampleBatch.ACTIONS,
                    SampleBatch.REWARDS,
                    SampleBatch.DONES,
                    SampleBatch.NEXT_OBS,
                    SampleBatch.VF_PREDS,
                    SampleBatch.PREV_ACTIONS,
                    SampleBatch.PREV_REWARDS,
                    "advantages",
                    "value_targets",
                    SampleBatch.ACTION_DIST_INPUTS,
                    SampleBatch.ACTION_LOGP,
            ]:
                assert key in view_req_policy

                if key == SampleBatch.PREV_ACTIONS:
                    assert view_req_policy[key].data_col == SampleBatch.ACTIONS
                    assert view_req_policy[key].shift == -1
                elif key == SampleBatch.PREV_REWARDS:
                    assert view_req_policy[key].data_col == SampleBatch.REWARDS
                    assert view_req_policy[key].shift == -1
                elif key not in [
                        SampleBatch.NEXT_OBS,
                        SampleBatch.PREV_ACTIONS,
                        SampleBatch.PREV_REWARDS,
                ]:
                    assert view_req_policy[key].data_col is None
                else:
                    assert view_req_policy[key].data_col == SampleBatch.OBS
                    assert view_req_policy[key].shift == 1
            trainer.stop()
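Each entry in view_requirements is a ViewRequirement object; a quick, hypothetical way to inspect one outside the asserts above:

# Hypothetical inspection (not part of the test):
vr = policy.view_requirements[SampleBatch.PREV_ACTIONS]
print(vr.data_col, vr.shift)  # -> "actions", -1 (matching the asserts above)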
Example 8
    def test_leaky_env(self):
        """Tests whether our diagnostics tools can detect leaks in an env."""
        config = ppo.DEFAULT_CONFIG.copy()
        # Make sure we have an env to test on the local worker.
        # Otherwise, `check_memory_leaks` will complain.
        config["create_env_on_driver"] = True
        config["env"] = MemoryLeakingEnv
        config["env_config"] = {
            "static_samples": True,
        }
        trainer = ppo.PPO(config=config)
        results = check_memory_leaks(trainer, to_check={"env"}, repeats=150)
        assert results["env"]
        trainer.stop()
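MemoryLeakingEnv comes from RLlib's example envs and leaks on purpose, so the check has something to find. A rough, hypothetical sketch of what an intentionally leaking env can look like (not the actual implementation):

import gym


class LeakyEnv(gym.Env):
    """Toy env that leaks by growing a class-level list on every step."""

    _leak = []  # never cleared

    def __init__(self, config=None):
        self.observation_space = gym.spaces.Discrete(2)
        self.action_space = gym.spaces.Discrete(2)

    def reset(self):
        return 0

    def step(self, action):
        self._leak.append([0.0] * 1000)  # the leak
        return 0, 0.0, True, {}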
Example 9
    def test_ppo_exploration_setup(self):
        """Tests, whether PPO runs with different exploration setups."""
        config = (
            ppo.PPOConfig()
            .environment(
                env_config={"is_slippery": False, "map_name": "4x4"},
            )
            .rollouts(
                # Run locally.
                num_rollout_workers=0,
            )
        )
        obs = np.array(0)

        # Test against all frameworks.
        for fw in framework_iterator(config):
            # Default Agent should be setup with StochasticSampling.
            trainer = ppo.PPO(config=config, env="FrozenLake-v1")
            # explore=False, always expect the same (deterministic) action.
            a_ = trainer.compute_single_action(
                obs, explore=False, prev_action=np.array(2), prev_reward=np.array(1.0)
            )
            # Test whether this is really the argmax action over the logits.
            if fw != "tf":
                last_out = trainer.get_policy().model.last_output()
                if fw == "torch":
                    check(a_, np.argmax(last_out.detach().cpu().numpy(), 1)[0])
                else:
                    check(a_, np.argmax(last_out.numpy(), 1)[0])
            for _ in range(50):
                a = trainer.compute_single_action(
                    obs,
                    explore=False,
                    prev_action=np.array(2),
                    prev_reward=np.array(1.0),
                )
                check(a, a_)

            # With explore=True (default), expect stochastic actions.
            actions = []
            for _ in range(300):
                actions.append(
                    trainer.compute_single_action(
                        obs, prev_action=np.array(2), prev_reward=np.array(1.0)
                    )
                )
            check(np.mean(actions), 1.5, atol=0.2)
            trainer.stop()
Example 10
    def test_modelv3(self):
        config = {
            "env": "CartPole-v0",
            "model": {
                "custom_model": RNNModel,
                "custom_model_config": {
                    "hiddens_size": 64,
                    "cell_size": 128,
                },
            },
            "num_workers": 0,
        }
        trainer = ppo.PPO(config=config)
        for _ in range(2):
            results = trainer.train()
            print(results)
Example 11
    def test_preprocessing_disabled(self):
        config = ppo.DEFAULT_CONFIG.copy()
        config["seed"] = 42
        config["env"] = "ray.rllib.examples.env.random_env.RandomEnv"
        config["env_config"] = {
            "config": {
                "observation_space":
                Dict({
                    "a":
                    Discrete(5),
                    "b":
                    Dict({
                        "ba": Discrete(4),
                        "bb": Box(-1.0, 1.0, (2, 3), dtype=np.float32),
                    }),
                    "c":
                    Tuple((MultiDiscrete([2, 3]), Discrete(1))),
                    "d":
                    Box(-1.0, 1.0, (1, ), dtype=np.int32),
                }),
            },
        }
        # Set this to True to enforce no preprocessors being used.
        # Complex observations now arrive directly in the model as
        # structures of batches, e.g. {"a": tensor, "b": [tensor, tensor]}
        # for obs-space=Dict(a=..., b=Tuple(..., ...)).
        config["_disable_preprocessor_api"] = True
        # Speed things up a little.
        config["train_batch_size"] = 100
        config["sgd_minibatch_size"] = 10
        config["rollout_fragment_length"] = 5
        config["num_sgd_iter"] = 1

        num_iterations = 1
        # Only supported for tf so far.
        for _ in framework_iterator(config):
            algo = ppo.PPO(config=config)
            for i in range(num_iterations):
                results = algo.train()
                check_train_results(results)
                print(results)
            check_compute_single_action(algo)
            algo.stop()
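With _disable_preprocessor_api=True, the model's forward() receives the observation as a nested structure of batched tensors instead of one flattened vector. A rough shape sketch for the space above (batch size B; exact dtypes and shapes are assumptions):

# input_dict["obs"] arrives roughly as:
# {
#     "a": <int tensor, shape [B]>,
#     "b": {"ba": <[B]>, "bb": <float tensor, [B, 2, 3]>},
#     "c": (<[B, 2]>, <[B]>),
#     "d": <[B, 1]>,
# }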
Example 12
    def test_counting_by_agent_steps(self):
        config = copy.deepcopy(ppo.DEFAULT_CONFIG)

        num_agents = 3

        config["num_workers"] = 2
        config["num_sgd_iter"] = 2
        config["framework"] = "torch"
        config["rollout_fragment_length"] = 21
        config["train_batch_size"] = 147
        config["multiagent"] = {
            "policies": {f"p{i}"
                         for i in range(num_agents)},
            "policy_mapping_fn": lambda aid, **kwargs: "p{}".format(aid),
            "count_steps_by": "agent_steps",
        }
        # Env setup.
        config["env"] = MultiAgentPendulum
        config["env_config"] = {"num_agents": num_agents}

        num_iterations = 2
        trainer = ppo.PPO(config=config)
        results = None
        for i in range(num_iterations):
            results = trainer.train()
        self.assertEqual(results["agent_timesteps_total"],
                         results["timesteps_total"])
        self.assertEqual(
            results["num_env_steps_trained"] * num_agents,
            results["num_agent_steps_trained"],
        )
        self.assertGreaterEqual(
            results["agent_timesteps_total"],
            num_iterations * config["train_batch_size"],
        )
        self.assertLessEqual(
            results["agent_timesteps_total"],
            (num_iterations + 1) * config["train_batch_size"],
        )
        trainer.stop()
Example 13
    def test_traj_view_attention_net(self):
        config = ppo.DEFAULT_CONFIG.copy()
        # Setup attention net.
        config["model"] = config["model"].copy()
        config["model"]["max_seq_len"] = 50
        config["model"]["custom_model"] = GTrXLNet
        config["model"]["custom_model_config"] = {
            "num_transformer_units": 1,
            "attention_dim": 64,
            "num_heads": 2,
            "memory_inference": 50,
            "memory_training": 50,
            "head_dim": 32,
            "ff_hidden_dim": 32,
        }
        # Test with odd batch numbers.
        config["train_batch_size"] = 1031
        config["sgd_minibatch_size"] = 201
        config["num_sgd_iter"] = 5
        config["num_workers"] = 0
        config["callbacks"] = MyCallbacks
        config["env_config"] = {
            "config": {
                "start_at_t": 1
            }
        }  # first obs is [1.0]

        for _ in framework_iterator(config, frameworks="tf2"):
            trainer = ppo.PPO(
                config,
                env="ray.rllib.examples.env.debug_counter_env.DebugCounterEnv",
            )
            rw = trainer.workers.local_worker()
            sample = rw.sample()
            assert sample.count == trainer.config["rollout_fragment_length"]
            results = trainer.train()
            assert results["timesteps_total"] == config["train_batch_size"]
            trainer.stop()
Example 14
    # use stop conditions passed via CLI (or defaults)
    stop = {
        "training_iteration": args.stop_iters,
        "timesteps_total": args.stop_timesteps,
        "episode_reward_mean": args.stop_reward,
    }

    # manual training loop using PPO without tune.run()
    if args.no_tune:
        if args.run != "PPO":
            raise ValueError("Only support --run PPO with --no-tune.")
        ppo_config = ppo.DEFAULT_CONFIG.copy()
        ppo_config.update(config)
        algo = ppo.PPO(config=ppo_config, env=CorrelatedActionsEnv)
        # run manual training loop and print results after each iteration
        for _ in range(args.stop_iters):
            result = algo.train()
            print(pretty_print(result))
            # stop training if the target train steps or reward are reached
            if (result["timesteps_total"] >= args.stop_timesteps
                    or result["episode_reward_mean"] >= args.stop_reward):
                break

        # run manual test loop: 1 iteration until done
        print("Finished training. Running manual test/inference loop.")
        env = CorrelatedActionsEnv({})
        obs = env.reset()
        done = False
        total_reward = 0
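The excerpt stops here; a minimal, hypothetical continuation of the inference loop (standard gym obs/reward/done API, greedy actions) could look like:

        # Hypothetical continuation (not part of the original excerpt):
        while not done:
            action = algo.compute_single_action(obs, explore=False)
            obs, reward, done, info = env.step(action)
            total_reward += reward
        print(f"Total eval. reward: {total_reward}")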
Example 15
        self.action_space = gym.spaces.Discrete(2)  # 0: left, 1: right (towards goal)
        self.observation_space = gym.spaces.Discrete(self.end_pos)

    def reset(self):
        self.cur_pos = 0
        return self.cur_pos

    def step(self, action):
        if action == 0 and self.cur_pos > 0:  # move left (towards start)
            self.cur_pos -= 1
        elif action == 1:  # move right (towards goal)
            self.cur_pos += 1
        if self.cur_pos >= self.end_pos:
            return 0, 1.0, True, {}
        else:
            return self.cur_pos, -0.1, False, {}


ray.init()
config = {
    "env": SimpleCorridor,
    "env_config": {
        "corridor_length": 5,
    },
}

algo = ppo.PPO(config=config)
for _ in range(3):
    print(algo.train())
# __rllib-custom-gym-env-end__
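A short, hypothetical follow-up (not in the original) that saves a checkpoint and runs one greedy rollout, assuming SimpleCorridor's constructor takes the env_config dict shown above:

checkpoint_path = algo.save()
print(f"Checkpoint saved to {checkpoint_path}")

env = SimpleCorridor({"corridor_length": 5})
obs = env.reset()
done = False
while not done:
    action = algo.compute_single_action(obs, explore=False)
    obs, reward, done, info = env.step(action)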
Example 16
config["framework"] = "tf"

outdir = "export_tf"
if os.path.exists(outdir):
    shutil.rmtree(outdir)

np.random.seed(1234)

# We will run inference with this test batch
test_data = {
    "obs": np.random.uniform(0, 1.0, size=(10, 4)).astype(np.float32),
}

# Start Ray and initialize a PPO Algorithm.
ray.init()
algo = ppo.PPO(config=config, env="CartPole-v0")

# You could train the model here
# algo.train()

# Let's run inference on the tensorflow model
policy = algo.get_policy()
result_tf, _ = policy.model(test_data)

# Evaluate tensor to fetch numpy array
with policy._sess.as_default():
    result_tf = result_tf.eval()

# This line will export the model to ONNX
res = algo.export_policy_model(outdir, onnx=11)
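To sanity-check the export, the ONNX file can be loaded back with onnxruntime and compared against result_tf. A sketch assuming the exported file is named model.onnx and the graph input is called "observations" (both depend on the RLlib/ONNX versions in use; inspect the session's inputs/outputs for the real names):

import onnxruntime

session = onnxruntime.InferenceSession(os.path.join(outdir, "model.onnx"))
# File and tensor names above are assumptions, not guaranteed by RLlib.
result_onnx = session.run(None, {"observations": test_data["obs"]})[0]
np.testing.assert_allclose(result_tf, result_onnx, rtol=1e-5, atol=1e-5)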
Example 17
    def value_function(self):
        return torch.from_numpy(np.zeros(shape=(self._last_batch_size, )))


if __name__ == "__main__":
    ray.init()

    # Register the above custom model.
    ModelCatalog.register_custom_model("my_torch_model", MyCustomModel)

    # Create the Trainer.
    trainer = ppo.PPO(
        env="CartPole-v0",
        config={
            "framework": "torch",
            "model": {
                # Auto-wrap the custom(!) model with an LSTM.
                "use_lstm": True,
                # To further customize the LSTM auto-wrapper.
                "lstm_cell_size": 64,
                # Specify our custom model from above.
                "custom_model": "my_torch_model",
                # Extra kwargs to be passed to your model's c'tor.
                "custom_model_config": {},
            },
        },
    )
    trainer.train()

# __sphinx_doc_end__
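Because the custom model is auto-wrapped with an LSTM, single-step inference has to carry the recurrent state explicitly. A hypothetical usage sketch (env handling and variable names are assumptions, not part of the original example):

import gym

env = gym.make("CartPole-v0")
obs = env.reset()
# Initial RNN state of the LSTM-wrapped policy.
state = trainer.get_policy().get_initial_state()
done = False
while not done:
    action, state, _ = trainer.compute_single_action(obs, state=state)
    obs, reward, done, info = env.step(action)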
Example 18
        self.action_space = gym.spaces.Discrete(2)  # 0: left, 1: right (towards goal)
        self.observation_space = gym.spaces.Discrete(self.end_pos)

    def reset(self):
        self.cur_pos = 0
        return self.cur_pos

    def step(self, action):
        if action == 0 and self.cur_pos > 0:  # move left (towards start)
            self.cur_pos -= 1
        elif action == 1:  # move right (towards goal)
            self.cur_pos += 1
        if self.cur_pos >= self.end_pos:
            return 0, 1.0, True, {}
        else:
            return self.cur_pos, -0.1, False, {}


ray.init()
config = {
    "env": SimpleCorridor,
    "env_config": {
        "corridor_length": 5,
    },
}

trainer = ppo.PPO(config=config)
for _ in range(3):
    print(trainer.train())
# __rllib-custom-gym-env-end__
Example 19
        "eager_tracing": args.eager_tracing,
    }

    stop = {
        "training_iteration": args.stop_iters,
        "timesteps_total": args.stop_timesteps,
        "episode_reward_mean": args.stop_reward,
    }

    # manual training loop (no Ray tune)
    if args.no_tune:
        if args.run not in {"APPO", "PPO"}:
            raise ValueError("This example only supports APPO and PPO.")
        ppo_config = ppo.DEFAULT_CONFIG.copy()
        ppo_config.update(config)
        trainer = ppo.PPO(config=ppo_config, env=ActionMaskEnv)
        # run manual training loop and print results after each iteration
        for _ in range(args.stop_iters):
            result = trainer.train()
            print(pretty_print(result))
            # stop training if the target train steps or reward are reached
            if (result["timesteps_total"] >= args.stop_timesteps
                    or result["episode_reward_mean"] >= args.stop_reward):
                break

        # manual test loop
        print("Finished training. Running manual test/inference loop.")
        # prepare environment with max 10 steps
        config["env_config"]["max_episode_len"] = 10
        env = ActionMaskEnv(config["env_config"])
        obs = env.reset()
Example 20
    stop = {
        "training_iteration": args.stop_iters,
        "timesteps_total": args.stop_timesteps,
        "episode_reward_mean": args.stop_reward,
    }

    if args.no_tune:
        # manual training with train loop using PPO and fixed learning rate
        if args.run != "PPO":
            raise ValueError("Only support --run PPO with --no-tune.")
        print("Running manual train loop without Ray Tune.")
        ppo_config = ppo.DEFAULT_CONFIG.copy()
        ppo_config.update(config)
        # use fixed learning rate instead of grid search (needs tune)
        ppo_config["lr"] = 1e-3
        trainer = ppo.PPO(config=ppo_config, env=SimpleCorridor)
        # run manual training loop and print results after each iteration
        for _ in range(args.stop_iters):
            result = trainer.train()
            print(pretty_print(result))
            # stop training if the target train steps or reward are reached
            if (
                result["timesteps_total"] >= args.stop_timesteps
                or result["episode_reward_mean"] >= args.stop_reward
            ):
                break
    else:
        # automated run with Tune, grid search, and TensorBoard
        print("Training automatically with Ray Tune")
        results = tune.run(args.run, config=config, stop=stop)
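A brief, hypothetical follow-up that pulls the best trial out of the returned ExperimentAnalysis (attribute names assume this Ray version's tune.run return type):

        best_trial = results.get_best_trial("episode_reward_mean", mode="max")
        print("Best trial final reward:",
              best_trial.last_result["episode_reward_mean"])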
Example 21
    def test_ppo_loss_function(self):
        """Tests the PPO loss function math."""
        config = (
            ppo.PPOConfig()
            .rollouts(
                num_rollout_workers=0,
            )
            .training(
                gamma=0.99,
                model=dict(
                    fcnet_hiddens=[10],
                    fcnet_activation="linear",
                    vf_share_layers=True,
                ),
            )
        )

        for fw, sess in framework_iterator(config, session=True):
            trainer = ppo.PPO(config=config, env="CartPole-v0")
            policy = trainer.get_policy()

            # Check no free log std var by default.
            if fw == "torch":
                matching = [
                    v for (n, v) in policy.model.named_parameters()
                    if "log_std" in n
                ]
            else:
                matching = [
                    v for v in policy.model.trainable_variables()
                    if "log_std" in str(v)
                ]
            assert len(matching) == 0, matching

            # Post-process (calculate simple (non-GAE) advantages) and attach
            # to train_batch dict.
            # A = [0.99^2 * 0.5 + 0.99 * -1.0 + 1.0, 0.99 * 0.5 - 1.0, 0.5] =
            # [0.50005, -0.505, 0.5]
            train_batch = compute_gae_for_sample_batch(policy,
                                                       FAKE_BATCH.copy())
            if fw == "torch":
                train_batch = policy._lazy_tensor_dict(train_batch)

            # Check Advantage values.
            check(train_batch[Postprocessing.VALUE_TARGETS],
                  [0.50005, -0.505, 0.5])

            # Calculate actual PPO loss.
            if fw in ["tf2", "tfe"]:
                PPOTF2Policy.loss(policy, policy.model, Categorical,
                                  train_batch)
            elif fw == "torch":
                PPOTorchPolicy.loss(policy, policy.model, policy.dist_class,
                                    train_batch)

            vars = (policy.model.variables()
                    if fw != "torch" else list(policy.model.parameters()))
            if fw == "tf":
                vars = policy.get_session().run(vars)
            expected_shared_out = fc(
                train_batch[SampleBatch.CUR_OBS],
                vars[0 if fw != "torch" else 2],
                vars[1 if fw != "torch" else 3],
                framework=fw,
            )
            expected_logits = fc(
                expected_shared_out,
                vars[2 if fw != "torch" else 0],
                vars[3 if fw != "torch" else 1],
                framework=fw,
            )
            expected_value_outs = fc(expected_shared_out,
                                     vars[4],
                                     vars[5],
                                     framework=fw)

            kl, entropy, pg_loss, vf_loss, overall_loss = self._ppo_loss_helper(
                policy,
                policy.model,
                Categorical if fw != "torch" else TorchCategorical,
                train_batch,
                expected_logits,
                expected_value_outs,
                sess=sess,
            )
            if sess:
                policy_sess = policy.get_session()
                k, e, pl, v, tl = policy_sess.run(
                    [
                        policy._mean_kl_loss,
                        policy._mean_entropy,
                        policy._mean_policy_loss,
                        policy._mean_vf_loss,
                        policy._total_loss,
                    ],
                    feed_dict=policy._get_loss_inputs_dict(train_batch,
                                                           shuffle=False),
                )
                check(k, kl)
                check(e, entropy)
                check(pl, np.mean(-pg_loss))
                check(v, np.mean(vf_loss), decimals=4)
                check(tl, overall_loss, decimals=4)
            elif fw == "torch":
                check(policy.model.tower_stats["mean_kl_loss"], kl)
                check(policy.model.tower_stats["mean_entropy"], entropy)
                check(policy.model.tower_stats["mean_policy_loss"],
                      np.mean(-pg_loss))
                check(
                    policy.model.tower_stats["mean_vf_loss"],
                    np.mean(vf_loss),
                    decimals=4,
                )
                check(policy.model.tower_stats["total_loss"],
                      overall_loss,
                      decimals=4)
            else:
                check(policy._mean_kl_loss, kl)
                check(policy._mean_entropy, entropy)
                check(policy._mean_policy_loss, np.mean(-pg_loss))
                check(policy._mean_vf_loss, np.mean(vf_loss), decimals=4)
                check(policy._total_loss, overall_loss, decimals=4)
            trainer.stop()
Example 22
outdir = "export_torch"
if os.path.exists(outdir):
    shutil.rmtree(outdir)

np.random.seed(1234)

# We will run inference with this test batch
test_data = {
    "obs": np.random.uniform(0, 1.0, size=(10, 4)).astype(np.float32),
    "state_ins": np.array([0.0], dtype=np.float32),
}

# Start Ray and initialize a PPO trainer
ray.init()
trainer = ppo.PPO(config=config, env="CartPole-v0")

# You could train the model here
# trainer.train()

# Let's run inference on the torch model
policy = trainer.get_policy()
result_pytorch, _ = policy.model(
    {
        "obs": torch.tensor(test_data["obs"]),
    }
)

# Evaluate tensor to fetch numpy array
result_pytorch = result_pytorch.detach().numpy()
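The excerpt stops before the export itself; by analogy with the TensorFlow example above, the torch policy would be exported with the same call (opset number reused from there):

# Export the torch policy model to ONNX (mirrors the TF example above).
res = trainer.export_policy_model(outdir, onnx=11)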
Example 23
    "PPO",
    stop={"timesteps_total": train_steps},
    config={
        "env": env_name,
        "lr": learning_rate
    },
    checkpoint_at_end=True,
    local_dir=save_dir,
)
# retrieve the checkpoint path
analysis.default_metric = "episode_reward_mean"
analysis.default_mode = "max"
checkpoint_path = analysis.get_best_checkpoint(trial=analysis.get_best_trial())
print(f"Trained model saved at {checkpoint_path}")

# load and restore model
agent = ppo.PPO(env=env_name)
agent.restore(checkpoint_path)
print(f"Agent loaded from saved model at {checkpoint_path}")

# inference
env = gym.make(env_name)
obs = env.reset()
for i in range(1000):
    action = agent.compute_single_action(obs)
    obs, reward, done, info = env.step(action)
    env.render()
    if done:
        print(f"Cart pole dropped after {i} steps.")
        break
Example 24
    }

    stop = {
        "training_iteration": args.stop_iters,
        "timesteps_total": args.stop_timesteps,
        "episode_reward_mean": args.stop_reward,
    }

    # Manual training loop (no Ray tune).
    if args.no_tune:
        # manual training loop using PPO and manually keeping track of state
        if args.run != "PPO":
            raise ValueError("Only support --run PPO with --no-tune.")
        ppo_config = ppo.DEFAULT_CONFIG.copy()
        ppo_config.update(config)
        algo = ppo.PPO(config=ppo_config, env=args.env)
        # run manual training loop and print results after each iteration
        for _ in range(args.stop_iters):
            result = algo.train()
            print(pretty_print(result))
            # stop training if the target train steps or reward are reached
            if (result["timesteps_total"] >= args.stop_timesteps
                    or result["episode_reward_mean"] >= args.stop_reward):
                break

        # Run manual test loop (only for RepeatAfterMe env).
        if args.env == "RepeatAfterMeEnv":
            print("Finished training. Running manual test/inference loop.")
            # prepare env
            env = RepeatAfterMeEnv(config["env_config"])
            obs = env.reset()