Example #1
    def setup(self, config: PartialTrainerConfigDict):
        super().setup(config)

        # Shortcut: If the execution_plan API is still enabled, the learner
        # thread and replay buffer actors will be created inside
        # `execution_plan()`, so return early here.
        if self.config["_disable_execution_plan_api"] is False:
            return

        # Tag those workers (top 1/3rd indices) that we should collect
        # episodes from for metrics, since the `PerWorkerEpsilonGreedy`
        # exploration strategy gives them the lowest epsilons.
        if self.workers.remote_workers():
            self._remote_workers_for_metrics = self.workers.remote_workers()[
                -len(self.workers.remote_workers()) // 3:
            ]

        num_replay_buffer_shards = self.config["optimizer"][
            "num_replay_buffer_shards"]

        # Create copy here so that we can modify without breaking other logic
        replay_actor_config = copy.deepcopy(
            self.config["replay_buffer_config"])

        replay_actor_config["capacity"] = (
            self.config["replay_buffer_config"]["capacity"] //
            num_replay_buffer_shards)

        ReplayActor = ray.remote(num_cpus=0)(replay_actor_config["type"])

        # Place all replay buffer shards on the same node as the learner
        # (driver process that runs this execution plan).
        if replay_actor_config["replay_buffer_shards_colocated_with_driver"]:
            self.replay_actors = create_colocated_actors(
                actor_specs=[  # (class, args, kwargs={}, count)
                    (
                        ReplayActor,
                        None,
                        replay_actor_config,
                        num_replay_buffer_shards,
                    )
                ],
                node=platform.node(),  # localhost
            )[0]  # [0]=only one item in `actor_specs`.
        # Place replay buffer shards on any node(s).
        else:
            self.replay_actors = [
                ReplayActor.remote(**replay_actor_config)
                for _ in range(num_replay_buffer_shards)
            ]
        self.learner_thread = LearnerThread(self.workers.local_worker())
        self.learner_thread.start()
        self.steps_since_update = defaultdict(int)
        weights = self.workers.local_worker().get_weights()
        self.curr_learner_weights = ray.put(weights)
        self.remote_sampling_requests_in_flight: DefaultDict[
            ActorHandle, Set[ray.ObjectRef]] = defaultdict(set)
        self.remote_replay_requests_in_flight: DefaultDict[
            ActorHandle, Set[ray.ObjectRef]] = defaultdict(set)
        self.curr_num_samples_collected = 0
        self.replay_sample_batches = []
        self._num_ts_trained_since_last_target_update = 0
Example #2
    def setup(self, config: PartialTrainerConfigDict):
        super().setup(config)
        self.remote_sampling_requests_in_flight: DefaultDict[
            ActorHandle, Set[ray.ObjectRef]] = defaultdict(set)

        if self.config["_disable_execution_plan_api"]:
            # Create extra aggregation workers and assign each rollout worker to
            # one of them.
            self.batches_to_place_on_learner = []
            self.batch_being_built = []
            if self.config["num_aggregation_workers"] > 0:
                # This spawns `num_aggregation_workers` actors that aggregate
                # experiences coming from RolloutWorkers in parallel. We force
                # colocation on the same node (localhost) to maximize data bandwidth
                # between them and the learner.
                localhost = platform.node()
                assert localhost != "", (
                    "ERROR: Cannot determine local node name! "
                    "`platform.node()` returned empty string.")
                all_co_located = create_colocated_actors(
                    actor_specs=[
                        # (class, args, kwargs={}, count)
                        (
                            AggregatorWorker,
                            [
                                self.config,
                            ],
                            {},
                            self.config["num_aggregation_workers"],
                        )
                    ],
                    node=localhost,
                )
                self.aggregator_workers = [
                    actor for actor_groups in all_co_located
                    for actor in actor_groups
                ]
                self.remote_aggregator_requests_in_flight: DefaultDict[
                    ActorHandle, Set[ray.ObjectRef]] = defaultdict(set)

            else:
                # Create our local mixin buffer if the number of aggregation
                # workers is 0.
                self.local_mixin_buffer = MixInMultiAgentReplayBuffer(
                    capacity=(self.config["replay_buffer_num_slots"]
                              if self.config["replay_buffer_num_slots"] > 0
                              else 1),
                    replay_ratio=self.config["replay_ratio"],
                )

            # Create and start the learner thread.
            self._learner_thread = make_learner_thread(
                self.workers.local_worker(), self.config)
            self._learner_thread.start()
            self.workers_that_need_updates = set()
Example #3
def gather_experiences_tree_aggregation(workers: WorkerSet,
                                        config: Dict) -> "LocalIterator[Any]":
    """Tree aggregation version of gather_experiences_directly()."""

    rollouts = ParallelRollouts(workers, mode="raw")

    # Divide up the workers between aggregators.
    worker_assignments = [[] for _ in range(config["num_aggregation_workers"])]
    i = 0
    for worker_idx in range(len(workers.remote_workers())):
        worker_assignments[i].append(worker_idx)
        i += 1
        i %= len(worker_assignments)
    logger.info("Worker assignments: {}".format(worker_assignments))

    # Create parallel iterators that represent each aggregation group.
    rollout_groups: List["ParallelIterator[SampleBatchType]"] = [
        rollouts.select_shards(assigned) for assigned in worker_assignments
    ]

    # This spawns |num_aggregation_workers| intermediate actors that aggregate
    # experiences in parallel. We force colocation on the same node (localhost)
    # to maximize data bandwidth between them and the driver.
    localhost = platform.node()
    assert localhost != "", ("ERROR: Cannot determine local node name! "
                             "`platform.node()` returned empty string.")
    all_co_located = create_colocated_actors(
        actor_specs=[
            # (class, args, kwargs={}, count=1)
            (Aggregator, [config, g], {}, 1) for g in rollout_groups
        ],
        node=localhost,
    )

    # Use the first ([0]) of each created group (each group only has one
    # actor: count=1).
    train_batches = from_actors([group[0] for group in all_co_located])

    # TODO(ekl) properly account for replay.
    def record_steps_sampled(batch):
        metrics = _get_shared_metrics()
        metrics.counters[STEPS_SAMPLED_COUNTER] += batch.count
        if isinstance(batch, MultiAgentBatch):
            metrics.counters[AGENT_STEPS_SAMPLED_COUNTER] += batch.agent_steps()
        else:
            metrics.counters[AGENT_STEPS_SAMPLED_COUNTER] += batch.count
        return batch

    return train_batches.gather_async().for_each(record_steps_sampled)
Example #4
    def add_policy(self, policy_id: PolicyID, policy_spec: PolicySpec):
        # Merge the policies config overrides with the main config.
        # Also, adjust `num_gpus` (to indicate an individual policy's
        # num_gpus, not the total number of GPUs).
        cfg = Algorithm.merge_trainer_configs(
            self.config,
            dict(policy_spec.config, **{"num_gpus": self.num_gpus_per_policy}),
        )

        # Need to create the replay actor first. Then add the first policy.
        if self.replay_actor is None:
            return self._add_replay_buffer_and_policy(policy_id, policy_spec,
                                                      cfg)

        # Replay actor already exists -> Just add a new policy here.

        assert len(self.policy_actors) < self.max_num_policies

        actual_policy_class = get_tf_eager_cls_if_necessary(
            policy_spec.policy_class, cfg)

        colocated = create_colocated_actors(
            actor_specs=[(
                ray.remote(
                    num_cpus=1,
                    num_gpus=self.num_gpus_per_policy
                    if not cfg["_fake_gpus"] else 0,
                )(actual_policy_class),
                # Policy c'tor args.
                (policy_spec.observation_space, policy_spec.action_space, cfg),
                # Policy c'tor kwargs={}.
                {},
                # Count=1,
                1,
            )],
            # Force co-locate on the already existing replay actor's node.
            node=ray.get(self.replay_actor.get_host.remote()),
        )

        self.policy_actors[policy_id] = colocated[0][0]

        return self.policy_actors[policy_id]
Example #5
    def _add_replay_buffer_and_policy(
        self,
        policy_id: PolicyID,
        policy_spec: PolicySpec,
        config: AlgorithmConfigDict,
    ):
        assert self.replay_actor is None
        assert len(self.policy_actors) == 0

        actual_policy_class = get_tf_eager_cls_if_necessary(
            policy_spec.policy_class, config)

        colocated = create_colocated_actors(
            actor_specs=[
                (self.replay_actor_class, self.replay_actor_args, {}, 1),
            ] + [(
                ray.remote(
                    num_cpus=1,
                    num_gpus=self.num_gpus_per_policy
                    if not config["_fake_gpus"] else 0,
                )(actual_policy_class),
                # Policy c'tor args.
                (policy_spec.observation_space, policy_spec.action_space,
                 config),
                # Policy c'tor kwargs={}.
                {},
                # Count=1,
                1,
            )],
            node=None,  # Place on any available node.
        )

        self.replay_actor = colocated[0][0]
        self.policy_actors[policy_id] = colocated[1][0]
        self.has_replay_buffer = True

        return self.policy_actors[policy_id]
Example #6
    def execution_plan(workers: WorkerSet, config: dict,
                       **kwargs) -> LocalIterator[dict]:
        assert (
            len(kwargs) == 0
        ), "Apex execution_plan does NOT take any additional parameters"

        # Create a number of replay buffer actors.
        num_replay_buffer_shards = config["optimizer"][
            "num_replay_buffer_shards"]
        buffer_size = (config["replay_buffer_config"]["capacity"] //
                       num_replay_buffer_shards)
        replay_actor_args = [
            num_replay_buffer_shards,
            config["learning_starts"],
            buffer_size,
            config["train_batch_size"],
            config["replay_buffer_config"]["prioritized_replay_alpha"],
            config["replay_buffer_config"]["prioritized_replay_beta"],
            config["replay_buffer_config"]["prioritized_replay_eps"],
            config["multiagent"]["replay_mode"],
            config["replay_buffer_config"].get("replay_sequence_length", 1),
        ]
        # Place all replay buffer shards on the same node as the learner
        # (driver process that runs this execution plan).
        if config["replay_buffer_shards_colocated_with_driver"]:
            replay_actors = create_colocated_actors(
                actor_specs=[
                    # (class, args, kwargs={}, count)
                    (ReplayActor, replay_actor_args, {},
                     num_replay_buffer_shards)
                ],
                node=platform.node(),  # localhost
            )[0]  # [0]=only one item in `actor_specs`.
        # Place replay buffer shards on any node(s).
        else:
            replay_actors = [
                ReplayActor.remote(*replay_actor_args)
                for _ in range(num_replay_buffer_shards)
            ]

        # Start the learner thread.
        learner_thread = LearnerThread(workers.local_worker())
        learner_thread.start()

        # Update experience priorities post learning.
        def update_prio_and_stats(
                item: Tuple[ActorHandle, dict, int, int]) -> None:
            actor, prio_dict, env_count, agent_count = item
            if config.get("prioritized_replay"):
                actor.update_priorities.remote(prio_dict)
            metrics = _get_shared_metrics()
            # Manually update the steps trained counter since the learner
            # thread is executing outside the pipeline.
            metrics.counters[STEPS_TRAINED_THIS_ITER_COUNTER] = env_count
            metrics.counters[STEPS_TRAINED_COUNTER] += env_count
            metrics.timers["learner_dequeue"] = learner_thread.queue_timer
            metrics.timers["learner_grad"] = learner_thread.grad_timer
            metrics.timers["learner_overall"] = learner_thread.overall_timer

        # We execute the following steps concurrently:
        # (1) Generate rollouts and store them in one of our replay buffer
        # actors. Update the weights of the worker that generated the batch.
        rollouts = ParallelRollouts(workers, mode="async", num_async=2)
        store_op = rollouts.for_each(StoreToReplayBuffer(actors=replay_actors))
        # Only need to update workers if there are remote workers.
        if workers.remote_workers():
            store_op = store_op.zip_with_source_actor().for_each(
                UpdateWorkerWeights(
                    learner_thread,
                    workers,
                    max_weight_sync_delay=(
                        config["optimizer"]["max_weight_sync_delay"]),
                ))

        # (2) Read experiences from one of the replay buffer actors and send
        # to the learner thread via its in-queue.
        post_fn = config.get("before_learn_on_batch") or (lambda b, *a: b)
        replay_op = (
            Replay(actors=replay_actors, num_async=4)
            .for_each(lambda x: post_fn(x, workers, config))
            .zip_with_source_actor()
            .for_each(Enqueue(learner_thread.inqueue))
        )

        # (3) Get priorities back from learner thread and apply them to the
        # replay buffer actors.
        update_op = (
            Dequeue(learner_thread.outqueue, check=learner_thread.is_alive)
            .for_each(update_prio_and_stats)
            .for_each(
                UpdateTargetNetwork(
                    workers,
                    config["target_network_update_freq"],
                    by_steps_trained=True,
                )
            )
        )

        if config["training_intensity"]:
            # Execute (1) and (2) round-robin at a fixed ratio (the training
            # intensity); (3) runs whenever its results are ready ("*").
            rr_weights = calculate_rr_weights(config) + ["*"]
            merged_op = Concurrently(
                [store_op, replay_op, update_op],
                mode="round_robin",
                output_indexes=[2],
                round_robin_weights=rr_weights,
            )
        else:
            # Execute (1), (2), (3) asynchronously as fast as possible. Only
            # output items from (3) since metrics aren't available before
            # then.
            merged_op = Concurrently(
                [store_op, replay_op, update_op],
                mode="async",
                output_indexes=[2],
            )

        # Add in extra replay and learner metrics to the training result.
        def add_apex_metrics(result: dict) -> dict:
            replay_stats = ray.get(replay_actors[0].stats.remote(
                config["optimizer"].get("debug")))
            exploration_infos = workers.foreach_policy_to_train(
                lambda p, _: p.get_exploration_state())
            result["info"].update({
                "exploration_infos":
                exploration_infos,
                "learner_queue":
                learner_thread.learner_queue_size.stats(),
                LEARNER_INFO:
                copy.deepcopy(learner_thread.learner_info),
                "replay_shard_0":
                replay_stats,
            })
            return result

        # Only report metrics from the workers with the lowest 1/3 of
        # epsilons.
        selected_workers = workers.remote_workers()[
            -len(workers.remote_workers()) // 3:
        ]

        return StandardMetricsReporting(
            merged_op, workers, config,
            selected_workers=selected_workers).for_each(add_apex_metrics)
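
All six snippets above pass the same `(class, args, kwargs, count)` actor-spec format to `create_colocated_actors`. Below is a minimal, self-contained sketch of that call pattern outside of any Trainer; it is not taken from RLlib itself. The import path `ray.rllib.utils.actors` and the `Counter` actor (including its `get_host()` method, which mirrors the one the RLlib actors above expose) are assumptions for illustration and may need adjusting for your Ray/RLlib version.

import platform

import ray
# Import path is an assumption; it may differ across Ray/RLlib versions.
from ray.rllib.utils.actors import create_colocated_actors


# Hypothetical actor, used only to illustrate the call pattern.
@ray.remote(num_cpus=0)
class Counter:
    def __init__(self, start: int = 0):
        self.value = start

    def inc(self) -> int:
        self.value += 1
        return self.value

    def get_host(self) -> str:
        # The RLlib actors used above (replay buffers, aggregation workers,
        # policies) all expose `get_host()`; it is assumed the colocation
        # utility calls it to check which node an actor landed on.
        return platform.node()


if __name__ == "__main__":
    ray.init()

    # One actor group is returned per spec; here a single spec asking for
    # two co-located `Counter` actors on the driver's node.
    all_co_located = create_colocated_actors(
        actor_specs=[(Counter, (0,), {}, 2)],
        node=platform.node(),  # localhost
    )
    counters = all_co_located[0]  # [0]=only one item in `actor_specs`.
    print(ray.get([c.inc.remote() for c in counters]))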