Example #1
    def test_traj_view_simple_performance(self):
        """Test whether PPOTrainer runs faster w/ `_use_trajectory_view_api`.
        """
        config = copy.deepcopy(ppo.DEFAULT_CONFIG)
        action_space = Discrete(2)
        obs_space = Box(-1.0, 1.0, shape=(700, ))

        from ray.rllib.examples.env.random_env import RandomMultiAgentEnv
        from ray.tune import register_env
        register_env(
            "ma_env",
            lambda c: RandomMultiAgentEnv({
                "num_agents": 2,
                "p_done": 0.0,
                "max_episode_len": 104,
                "action_space": action_space,
                "observation_space": obs_space
            }))

        config["num_workers"] = 3
        config["num_envs_per_worker"] = 8
        config["num_sgd_iter"] = 1  # Put less weight on training.

        policies = {
            "pol0": (None, obs_space, action_space, {}),
        }

        def policy_fn(agent_id):
            return "pol0"

        config["multiagent"] = {
            "policies": policies,
            "policy_mapping_fn": policy_fn,
        }
        num_iterations = 2
        for _ in framework_iterator(config, frameworks="torch"):
            print("w/ traj. view API")
            config["_use_trajectory_view_api"] = True
            trainer = ppo.PPOTrainer(config=config, env="ma_env")
            learn_time_w = 0.0
            sampler_perf_w = {}
            start = time.time()
            for i in range(num_iterations):
                out = trainer.train()
                ts = out["timesteps_total"]
                sampler_perf_ = out["sampler_perf"]
                sampler_perf_w = {
                    k: sampler_perf_w.get(k, 0.0) +
                    (sampler_perf_[k] * 1000 / ts)
                    for k, v in sampler_perf_.items()
                }
                delta = out["timers"]["learn_time_ms"] / ts
                learn_time_w += delta
                print("{}={}s".format(i, delta))
            sampler_perf_w = {
                k: sampler_perf_w[k] / (num_iterations if "mean_" in k else 1)
                for k, v in sampler_perf_w.items()
            }
            duration_w = time.time() - start
            print("Duration: {}s "
                  "sampler-perf.={} learn-time/iter={}s".format(
                      duration_w, sampler_perf_w,
                      learn_time_w / num_iterations))
            trainer.stop()

            print("w/o traj. view API")
            config["_use_trajectory_view_api"] = False
            trainer = ppo.PPOTrainer(config=config, env="ma_env")
            learn_time_wo = 0.0
            sampler_perf_wo = {}
            start = time.time()
            for i in range(num_iterations):
                out = trainer.train()
                ts = out["timesteps_total"]
                sampler_perf_ = out["sampler_perf"]
                sampler_perf_wo = {
                    k: sampler_perf_wo.get(k, 0.0) +
                    (sampler_perf_[k] * 1000 / ts)
                    for k, v in sampler_perf_.items()
                }
                delta = out["timers"]["learn_time_ms"] / ts
                learn_time_wo += delta
                print("{}={}s".format(i, delta))
            sampler_perf_wo = {
                k: sampler_perf_wo[k] / (num_iterations if "mean_" in k else 1)
                for k, v in sampler_perf_wo.items()
            }
            duration_wo = time.time() - start
            print("Duration: {}s "
                  "sampler-perf.={} learn-time/iter={}s".format(
                      duration_wo, sampler_perf_wo,
                      learn_time_wo / num_iterations))
            trainer.stop()

            # Assert `_use_trajectory_view_api` is faster.
            self.assertLess(sampler_perf_w["mean_raw_obs_processing_ms"],
                            sampler_perf_wo["mean_raw_obs_processing_ms"])
            self.assertLess(sampler_perf_w["mean_action_processing_ms"],
                            sampler_perf_wo["mean_action_processing_ms"])
            self.assertLess(duration_w, duration_wo)
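The test above uses RandomMultiAgentEnv purely as a load generator. For reference, the snippet below is a minimal sketch (not part of the test) of stepping the env directly to see the per-agent dicts that RLlib's samplers consume; the `__all__` done flag follows RLlib's MultiAgentEnv convention, and the small max_episode_len is just for illustration.

from gym.spaces import Box, Discrete

from ray.rllib.examples.env.random_env import RandomMultiAgentEnv

action_space = Discrete(2)
env = RandomMultiAgentEnv({
    "num_agents": 2,
    "p_done": 0.0,
    "max_episode_len": 10,
    "action_space": action_space,
    "observation_space": Box(-1.0, 1.0, shape=(700, )),
})
obs = env.reset()  # Dict: agent id -> random observation.
# Sample one random action per agent id returned by reset().
actions = {agent_id: action_space.sample() for agent_id in obs}
obs, rewards, dones, infos = env.step(actions)
print(rewards, dones["__all__"])  # Done only once max_episode_len is hit.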
Example #2
    default=10,
    help="The frequency with which to create checkpoint files of the learnt "
    "Policies.")
parser.add_argument("--no-restore",
                    action="store_true",
                    help="Whether to load the Policy "
                    "weights from a previous checkpoint")

if __name__ == "__main__":
    args = parser.parse_args()
    ray.init()

    # Create a fake-env for the server. This env will never be used (neither
    # for sampling, nor for evaluation) and its obs/action Spaces do not
    # matter either (multi-agent config below defines Spaces per Policy).
    register_env("fake_unity", lambda c: RandomMultiAgentEnv(c))

    policies, policy_mapping_fn = \
        Unity3DEnv.get_policy_configs_for_game(args.env)

    # The entire config will be sent to connecting clients so they can
    # build their own samplers (and also Policy objects iff
    # `inference_mode=local` on clients' command line).
    config = {
        # Use the connector server to generate experiences.
        "input":
        (lambda ioctx: PolicyServerInput(ioctx, SERVER_ADDRESS, args.port)),
        # Use a single worker process (w/ SyncSampler) to run the server.
        "num_workers":
        0,
        # Disable OPE, since the rollouts are coming from online clients.
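The comment above notes that this entire config gets shipped to connecting clients, which only matters when a client runs with `inference_mode=local` and builds its own Policy objects from it. Below is a minimal, hedged sketch of the client-side counterpart; the address/port are placeholders (not values from this script), and the keyword names assume RLlib's PolicyClient API of the same era.

from ray.rllib.env.policy_client import PolicyClient

# "local": build Policies from the server-provided config and only sync
# weights periodically. "remote": round-trip every compute-action call to
# the PolicyServerInput above.
client = PolicyClient(
    "http://localhost:9900",  # Placeholder address/port.
    inference_mode="local",
    update_interval=10.0)  # Seconds between weight syncs in local mode.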
Example #3
    def test_traj_view_lstm_performance(self):
        """Test whether PPOTrainer runs faster w/ `_use_trajectory_view_api`.
        """
        config = copy.deepcopy(ppo.DEFAULT_CONFIG)
        action_space = Discrete(2)
        obs_space = Box(-1.0, 1.0, shape=(700, ))

        from ray.rllib.examples.env.random_env import RandomMultiAgentEnv

        from ray.tune import register_env
        register_env(
            "ma_env",
            lambda c: RandomMultiAgentEnv({
                "num_agents": 2,
                "p_done": 0.01,
                "action_space": action_space,
                "observation_space": obs_space
            }))

        config["num_workers"] = 3
        config["num_envs_per_worker"] = 8
        config["num_sgd_iter"] = 6
        config["model"]["use_lstm"] = True
        config["model"]["lstm_use_prev_action_reward"] = True
        config["model"]["max_seq_len"] = 100

        policies = {
            "pol0": (None, obs_space, action_space, {}),
        }

        def policy_fn(agent_id):
            return "pol0"

        config["multiagent"] = {
            "policies": policies,
            "policy_mapping_fn": policy_fn,
        }
        num_iterations = 1
        # Only works in torch so far.
        for _ in framework_iterator(config, frameworks="torch"):
            print("w/ traj. view API (and time-major)")
            config["_use_trajectory_view_api"] = True
            config["model"]["_time_major"] = True
            trainer = ppo.PPOTrainer(config=config, env="ma_env")
            learn_time_w = 0.0
            sampler_perf = {}
            start = time.time()
            for i in range(num_iterations):
                out = trainer.train()
                sampler_perf_ = out["sampler_perf"]
                sampler_perf = {
                    k: sampler_perf.get(k, 0.0) + sampler_perf_[k]
                    for k, v in sampler_perf_.items()
                }
                delta = out["timers"]["learn_time_ms"] / 1000
                learn_time_w += delta
                print("{}={}s".format(i, delta))
            sampler_perf = {
                k: sampler_perf[k] / (num_iterations if "mean_" in k else 1)
                for k, v in sampler_perf.items()
            }
            duration_w = time.time() - start
            print("Duration: {}s "
                  "sampler-perf.={} learn-time/iter={}s".format(
                      duration_w, sampler_perf, learn_time_w / num_iterations))
            trainer.stop()

            print("w/o traj. view API (and w/o time-major)")
            config["_use_trajectory_view_api"] = False
            config["model"]["_time_major"] = False
            trainer = ppo.PPOTrainer(config=config, env="ma_env")
            learn_time_wo = 0.0
            sampler_perf = {}
            start = time.time()
            for i in range(num_iterations):
                out = trainer.train()
                sampler_perf_ = out["sampler_perf"]
                sampler_perf = {
                    k: sampler_perf.get(k, 0.0) + sampler_perf_[k]
                    for k, v in sampler_perf_.items()
                }
                delta = out["timers"]["learn_time_ms"] / 1000
                learn_time_wo += delta
                print("{}={}s".format(i, delta))
            sampler_perf = {
                k: sampler_perf[k] / (num_iterations if "mean_" in k else 1)
                for k, v in sampler_perf.items()
            }
            duration_wo = time.time() - start
            print("Duration: {}s "
                  "sampler-perf.={} learn-time/iter={}s".format(
                      duration_wo, sampler_perf,
                      learn_time_wo / num_iterations))
            trainer.stop()

            # Assert `_use_trajectory_view_api` is much faster.
            self.assertLess(duration_w, duration_wo)
            self.assertLess(learn_time_w, learn_time_wo * 0.6)
Example #4
    # Make sure all policies' obs- and action spaces are the same.
    # If not, we won't be able to mimic the Unity3D env using RLlib's
    # RandomMultiAgentEnv.
    first_policy_spec = next(iter(policies.values()))
    for pid, policy_spec in policies.items():
        assert policy_spec.observation_space == \
            first_policy_spec.observation_space
        assert policy_spec.action_space == first_policy_spec.action_space

    # Instead of starting the actual Unity3DEnv (a running Unity3D editor or
    # an auto-started game binary), mimic it with a RandomMultiAgentEnv.
    env = RandomMultiAgentEnv({
        # Same number of agents as the actual Unity3D game would have.
        "num_agents": len(policies),
        # Make sure we stick to the user given horizons using our
        # RandomMultiAgentEnv options.
        "max_episode_len": args.horizon,
        "p_done": 0.0,
        # Same obs- action spaces as the actual Unity3D game would have.
        "observation_space": first_policy_spec.observation_space,
        "action_space": first_policy_spec.action_space,
    })
    obs = env.reset()
    eid = client.start_episode(training_enabled=not args.no_train)

    # Keep track of the total reward per episode.
    total_rewards_this_episode = 0.0

    # Loop through the env until n episodes completed.
    num_episodes = 0
    while True:
        # Get actions from the Policy server given our current obs.
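        # NOTE: The original example is truncated here. The rest below is a
        # hedged sketch of how such a PolicyClient loop typically continues,
        # not code from this file. `client`, `env`, `eid`, `args`,
        # `total_rewards_this_episode` and `num_episodes` are defined above;
        # `args.num_episodes` is a hypothetical CLI arg.
        actions = client.get_action(eid, obs)
        # Apply the actions to the (random) env and accumulate rewards.
        obs, rewards, dones, infos = env.step(actions)
        total_rewards_this_episode += sum(rewards.values())
        # Report rewards back to the policy server so it can train on them.
        client.log_returns(eid, rewards, infos)
        if dones["__all__"]:
            print("Episode done: R={}".format(total_rewards_this_episode))
            client.end_episode(eid, obs)
            num_episodes += 1
            if num_episodes >= args.num_episodes:
                break
            # Start a fresh episode.
            obs = env.reset()
            total_rewards_this_episode = 0.0
            eid = client.start_episode(training_enabled=not args.no_train)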
Example #5
def init_policy_server(config):
    ''' Start the policy server that receives (state, action, reward) batches
        and computes gradients for RL. '''
    # By default, Ray will parallelize its workload. However, if you need to
    # debug your Ray program, it may be easier to do everything on a single
    # process. You can force all Ray functions to occur on a single process
    # with local_mode=True.
    memory_limit = 1024 * 1024 * 1024 * 10  # 10 GiB (in bytes).
    ray.init(local_mode=config["debug_mode"], object_store_memory=memory_limit)

    trainer_config = config["trainer_config"]
    assert torch.cuda.device_count() >= trainer_config["num_gpus"]
    trainer_config["input"] = lambda ioctx: PolicyServerInput(
        ioctx, config["policy_server_host"], int(config["policy_server_port"]))

    custom_model = config.get("custom_model", None)
    if custom_model:
        register_custom_model(custom_model)
        trainer_config["model"]["custom_model"] = custom_model

    agent_obs_space, agent_action_space = \
        make_observation_space_and_action_space(config, True)
    if config["multiagent"]:
        user_obs_space, user_action_space = \
            make_observation_space_and_action_space(config, False)
        trainer_config["multiagent"] = {
            "policies": {
                # the first tuple value is None -> uses default policy
                "agent": (None, agent_obs_space, agent_action_space, {}),
                "user": (None, user_obs_space, user_action_space, {})
            },
            "policy_mapping_fn": lambda agent_id: agent_id
        }
        from ray.rllib.examples.env.random_env import RandomMultiAgentEnv
        # Create a fake env for the server. This env will never be used (neither
        # for sampling, nor for evaluation) and its obs/action Spaces do not
        # matter either (multi-agent config above defines Spaces per Policy).
        register_env("custom_env", lambda c: RandomMultiAgentEnv(c))
    else:

        def custom_env(env_config):
            ''' Create an env stub for policy training. Only the
                action_space and observation_space attributes need to be
                defined; no other env methods (e.g. step(), reset()) are
                needed. '''
            env = gym.Env()
            env.action_space = agent_action_space
            env.observation_space = agent_obs_space
            return env

        register_env("custom_env", custom_env)

    trainer = SUPPORTED_TRAINER_CLASSES[config["trainer_class"]](
        env="custom_env", config=trainer_config)

    os.makedirs(config["model_checkpoint_dir"], exist_ok=True)
    # Attempt to restore from checkpoint if possible.
    latest_checkpoint = os.path.join(config["model_checkpoint_dir"],
                                     "latest_ckpt")
    if os.path.exists(latest_checkpoint):
        # "latest_ckpt" stores the path of the last saved checkpoint; keep
        # `latest_checkpoint` pointing at that file so the loop below can
        # keep updating it, and restore from the path it contains.
        with open(latest_checkpoint) as f:
            restore_path = f.read()
        print("Restoring from checkpoint", restore_path)
        trainer.restore(restore_path)

    # Serving and training loop
    print("######## Training loop begins... ########")
    count = 0
    while True:
        train_log = trainer.train()
        # print(pretty_print(train_log))
        count += 1
        if count % config["checkpoint_freq"] == 0:
            checkpoint = trainer.save()
            print("#### Writing checkpoint for train iteration", count, "####")
            with open(latest_checkpoint, "w") as f:
                f.write(checkpoint)
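For orientation, here is a hedged sketch of a config dict that would drive init_policy_server(). The keys are the ones the function reads above; the values are illustrative placeholders, and a real config also needs whatever keys make_observation_space_and_action_space() expects (not shown in this excerpt).

example_config = {
    # Passed to ray.init(local_mode=...).
    "debug_mode": False,
    # Placeholder key into SUPPORTED_TRAINER_CLASSES.
    "trainer_class": "PPO",
    "trainer_config": {"num_gpus": 0, "model": {}},
    "policy_server_host": "localhost",
    "policy_server_port": 9900,
    # Or the name of a custom model to register.
    "custom_model": None,
    "multiagent": False,
    "model_checkpoint_dir": "/tmp/policy_server_ckpts",
    "checkpoint_freq": 10,
}
init_policy_server(example_config)  # Blocks in the serving/training loop.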