Code example #1
def execution_plan(workers, config):
    rollouts = ParallelRollouts(workers, mode="bulk_sync")

    # Collect large batches of relevant experiences & standardize.
    rollouts = rollouts.for_each(
        SelectExperiences(workers.trainable_policies()))
    rollouts = rollouts.combine(
        ConcatBatches(min_batch_size=config["train_batch_size"]))
    rollouts = rollouts.for_each(StandardizeFields(["advantages"]))

    if config["simple_optimizer"]:
        train_op = rollouts.for_each(
            TrainOneStep(
                workers,
                num_sgd_iter=config["num_sgd_iter"],
                sgd_minibatch_size=config["sgd_minibatch_size"]))
    else:
        train_op = rollouts.for_each(
            TrainTFMultiGPU(
                workers,
                sgd_minibatch_size=config["sgd_minibatch_size"],
                num_sgd_iter=config["num_sgd_iter"],
                num_gpus=config["num_gpus"],
                rollout_fragment_length=config["rollout_fragment_length"],
                num_envs_per_worker=config["num_envs_per_worker"],
                train_batch_size=config["train_batch_size"],
                shuffle_sequences=config["shuffle_sequences"],
                _fake_gpus=config["_fake_gpus"]))

    # Update KL after each round of training.
    train_op = train_op.for_each(lambda t: t[1]).for_each(UpdateKL(workers))

    return StandardMetricsReporting(train_op, workers, config) \
        .for_each(lambda result: warn_about_bad_reward_scales(config, result))
Code example #2
def execution_plan(workers: WorkerSet,
                   config: TrainerConfigDict) -> LocalIterator[dict]:
    """Execution plan of the PPO algorithm. Defines the distributed dataflow.

    Args:
        workers (WorkerSet): The WorkerSet for training the Polic(y/ies)
            of the Trainer.
        config (TrainerConfigDict): The trainer's configuration dict.

    Returns:
        LocalIterator[dict]: A local iterator over training metrics.
    """
    rollouts = ParallelRollouts(workers, mode="bulk_sync")

    # Collect batches for the trainable policies.
    rollouts = rollouts.for_each(
        SelectExperiences(workers.trainable_policies()))
    # Concatenate the SampleBatches into one.
    rollouts = rollouts.combine(
        ConcatBatches(
            min_batch_size=config["train_batch_size"],
            count_steps_by=config["multiagent"]["count_steps_by"],
        ))
    # Standardize advantages.
    rollouts = rollouts.for_each(StandardizeFields(["advantages"]))

    # Perform one training step on the combined + standardized batch.
    if config["simple_optimizer"]:
        train_op = rollouts.for_each(
            TrainOneStep(
                workers,
                num_sgd_iter=config["num_sgd_iter"],
                sgd_minibatch_size=config["sgd_minibatch_size"]))
    else:
        train_op = rollouts.for_each(
            TrainTFMultiGPU(
                workers,
                sgd_minibatch_size=config["sgd_minibatch_size"],
                num_sgd_iter=config["num_sgd_iter"],
                num_gpus=config["num_gpus"],
                rollout_fragment_length=config["rollout_fragment_length"],
                num_envs_per_worker=config["num_envs_per_worker"],
                train_batch_size=config["train_batch_size"],
                shuffle_sequences=config["shuffle_sequences"],
                _fake_gpus=config["_fake_gpus"],
                framework=config.get("framework")))

    # Update KL after each round of training.
    train_op = train_op.for_each(lambda t: t[1]).for_each(UpdateKL(workers))

    # Warn about bad reward scales and return training metrics.
    return StandardMetricsReporting(train_op, workers, config) \
        .for_each(lambda result: warn_about_bad_reward_scales(config, result))
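
As a usage note: an execution_plan like this is not called directly; it is handed to RLlib's trainer-building machinery, which pulls one result dict from the returned LocalIterator per training iteration. A minimal sketch, assuming the legacy (pre-2.0) `build_trainer` API from `ray.rllib.agents.trainer_template`; the trainer name and config values here are illustrative, not taken from the example above:

# Hedged sketch: wiring an execution_plan into a Trainer class
# (assumes RLlib's legacy trainer_template API; details vary by Ray version).
from ray.rllib.agents.ppo import DEFAULT_CONFIG
from ray.rllib.agents.ppo.ppo_tf_policy import PPOTFPolicy
from ray.rllib.agents.trainer_template import build_trainer

MyPPOTrainer = build_trainer(
    name="MyPPO",
    default_config=DEFAULT_CONFIG,
    default_policy=PPOTFPolicy,
    execution_plan=execution_plan,  # the function defined above
)

# trainer = MyPPOTrainer(config={"num_workers": 2}, env="CartPole-v0")
# result = trainer.train()  # each call drives the plan one iteration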
Code example #3
File: test_execution.py Project: wuisawesome/ray
    def test_standardize(self):
        workers = make_workers(0)
        a = ParallelRollouts(workers, mode="async")
        b = a.for_each(StandardizeFields([SampleBatch.EPS_ID]))
        batch = next(b)
        assert abs(np.mean(batch[SampleBatch.EPS_ID])) < 0.001, batch
        assert abs(np.std(batch[SampleBatch.EPS_ID]) - 1.0) < 0.001, batch
Code example #4
    def test_store_to_replay_actor(self):
        ReplayActor = ray.remote(num_cpus=0)(MultiAgentReplayBuffer)
        actor = ReplayActor.remote(
            num_shards=1,
            learning_starts=200,
            capacity=1000,
            replay_batch_size=100,
            prioritized_replay_alpha=0.6,
            prioritized_replay_beta=0.4,
            prioritized_replay_eps=0.0001,
        )
        assert len(ray.get(actor.sample.remote(100))) == 0

        workers = make_workers(0)
        a = ParallelRollouts(workers, mode="bulk_sync")
        b = a.for_each(StoreToReplayBuffer(actors=[actor]))

        next(b)
        assert len(ray.get(
            actor.sample.remote(100))) == 0  # learning hasn't started
        next(b)
        assert ray.get(actor.sample.remote(100)).count == 100

        replay_op = Replay(actors=[actor])
        assert next(replay_op).count == 100
Code example #5
File: simple_q.py Project: longshotsyndicate/ray
    def execution_plan(workers, config, **kwargs):
        assert "local_replay_buffer" in kwargs, (
            "GenericOffPolicy execution plan requires a local replay buffer.")

        local_replay_buffer = kwargs["local_replay_buffer"]

        rollouts = ParallelRollouts(workers, mode="bulk_sync")

        # (1) Generate rollouts and store them in our local replay buffer.
        store_op = rollouts.for_each(
            StoreToReplayBuffer(local_buffer=local_replay_buffer))

        if config["simple_optimizer"]:
            train_step_op = TrainOneStep(workers)
        else:
            train_step_op = MultiGPUTrainOneStep(
                workers=workers,
                sgd_minibatch_size=config["train_batch_size"],
                num_sgd_iter=1,
                num_gpus=config["num_gpus"],
                _fake_gpus=config["_fake_gpus"])

        # (2) Read and train on experiences from the replay buffer.
        replay_op = Replay(local_buffer=local_replay_buffer) \
            .for_each(train_step_op) \
            .for_each(UpdateTargetNetwork(
                workers, config["target_network_update_freq"]))

        # Alternate deterministically between (1) and (2).
        train_op = Concurrently([store_op, replay_op],
                                mode="round_robin",
                                output_indexes=[1])

        return StandardMetricsReporting(train_op, workers, config)
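
The Concurrently(..., mode="round_robin", output_indexes=[1]) call above alternates deterministically between the store branch (index 0) and the replay/train branch (index 1), while only forwarding the train branch's results to the metrics reporter. A rough, self-contained sketch of those semantics (illustrative only; this is not the actual RLlib implementation):

# Hypothetical toy version of round-robin concurrency with output filtering.
from typing import Iterator, List, Sequence


def round_robin(children: Sequence[Iterator], output_indexes: List[int]) -> Iterator:
    """Step every child iterator once per cycle; yield only selected outputs."""
    while True:
        for i, child in enumerate(children):
            value = next(child)        # the step is always executed...
            if i in output_indexes:    # ...but only some outputs are forwarded
                yield value


store_op = iter(lambda: "stored", None)        # endless stream of store acks
replay_op = iter(lambda: {"loss": 0.1}, None)  # endless stream of train results
train_op = round_robin([store_op, replay_op], output_indexes=[1])
print(next(train_op))  # {'loss': 0.1} -- the store step still ran first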
Code example #6
    def execution_plan(workers, config, **kwargs):
        assert (
            len(kwargs) == 0
        ), "Dreamer execution_plan does NOT take any additional parameters"

        # Special replay buffer for Dreamer agent.
        episode_buffer = EpisodicBuffer(length=config["batch_length"])

        local_worker = workers.local_worker()

        # Prefill episode buffer with initial exploration (uniform sampling)
        while total_sampled_timesteps(
                local_worker) < config["prefill_timesteps"]:
            samples = local_worker.sample()
            episode_buffer.add(samples)

        batch_size = config["batch_size"]
        dreamer_train_iters = config["dreamer_train_iters"]
        act_repeat = config["action_repeat"]

        rollouts = ParallelRollouts(workers)
        rollouts = rollouts.for_each(
            DreamerIteration(
                local_worker,
                episode_buffer,
                dreamer_train_iters,
                batch_size,
                act_repeat,
            ))
        return rollouts
Code example #7
    def execution_plan(
        workers: WorkerSet, config: TrainerConfigDict, **kwargs
    ) -> LocalIterator[dict]:
        assert (
            len(kwargs) == 0
        ), "Marwill execution_plan does NOT take any additional parameters"

        rollouts = ParallelRollouts(workers, mode="bulk_sync")
        replay_buffer = MultiAgentReplayBuffer(
            learning_starts=config["learning_starts"],
            capacity=config["replay_buffer_size"],
            replay_batch_size=config["train_batch_size"],
            replay_sequence_length=1,
        )

        store_op = rollouts.for_each(StoreToReplayBuffer(local_buffer=replay_buffer))

        replay_op = (
            Replay(local_buffer=replay_buffer)
            .combine(
                ConcatBatches(
                    min_batch_size=config["train_batch_size"],
                    count_steps_by=config["multiagent"]["count_steps_by"],
                )
            )
            .for_each(TrainOneStep(workers))
        )

        train_op = Concurrently(
            [store_op, replay_op], mode="round_robin", output_indexes=[1]
        )

        return StandardMetricsReporting(train_op, workers, config)
Code example #8
File: slateq.py Project: wuisawesome/ray
    def execution_plan(workers: WorkerSet, config: TrainerConfigDict,
                       **kwargs) -> LocalIterator[dict]:
        assert (
            "local_replay_buffer"
            in kwargs), "SlateQ execution plan requires a local replay buffer."

        rollouts = ParallelRollouts(workers, mode="bulk_sync")

        # We execute the following steps concurrently:
        # (1) Generate rollouts and store them in our local replay buffer.
        # Calling next() on store_op drives this.
        store_op = rollouts.for_each(
            StoreToReplayBuffer(local_buffer=kwargs["local_replay_buffer"]))

        # (2) Read and train on experiences from the replay buffer. Every batch
        # returned from the LocalReplay() iterator is passed to TrainOneStep to
        # take an SGD step.
        replay_op = (Replay(
            local_buffer=kwargs["local_replay_buffer"]).for_each(
                TrainOneStep(workers)).for_each(
                    UpdateTargetNetwork(workers,
                                        config["target_network_update_freq"])))

        # Alternate deterministically between (1) and (2). Only return the
        # output of (2) since training metrics are not available until (2)
        # runs.
        train_op = Concurrently(
            [store_op, replay_op],
            mode="round_robin",
            output_indexes=[1],
            round_robin_weights=calculate_round_robin_weights(config),
        )

        return StandardMetricsReporting(train_op, workers, config)
Code example #9
def execution_plan_nfsp(workers, config):
    # 1. define buffers
    replay_size = config["replay_buffer_size"]
    reservoir_size = config["reservoir_buffer_size"]
    replay_buffers = MultiAgentSimpleReplayBuffer(
        replay_size, config["multiagent"]["policies"])
    reservoir_buffers = MultiAgentReservoirBuffer(
        reservoir_size, config["multiagent"]["policies"])
    rollouts = ParallelRollouts(workers, mode="bulk_sync")

    # 2. define store operations
    store_op = rollouts.for_each(
        StoreToBuffers(replay_buffers, reservoir_buffers,
                       config['multiagent']['policies_to_train']))  # Sampling

    # 3. define replay/reservoir operations
    replay_op = SimpleLocalReplayMultiagent(
        replay_buffers, config["replay_train_batch_size"],
        config["replay_min_size_to_learn"],
        config["replay_train_every"]) \
        .for_each(TrainOneStep(workers)) \
        .for_each(UpdateTargetNetwork(
            workers, config["dqn_policy"]["target_network_update_freq"]))

    reservoir_op = LocalReservoirMultiagent(
        reservoir_buffers, config["reservoir_train_batch_size"],
        config["reservoir_min_size_to_learn"],
        config["reservoir_train_every"]) \
        .for_each(TrainOneStep(workers))

    # 4. define main train loop
    train_op = Concurrently([replay_op, reservoir_op, store_op],
                            mode="round_robin")
    return LowMemoryMetricsReporting(train_op, workers, config)
Code example #10
File: qmix.py Project: wuisawesome/ray
    def execution_plan(
        workers: WorkerSet, config: TrainerConfigDict, **kwargs
    ) -> LocalIterator[dict]:
        assert (
            len(kwargs) == 0
        ), "QMIX execution_plan does NOT take any additional parameters"

        rollouts = ParallelRollouts(workers, mode="bulk_sync")
        replay_buffer = SimpleReplayBuffer(config["buffer_size"])

        store_op = rollouts.for_each(StoreToReplayBuffer(local_buffer=replay_buffer))

        train_op = (
            Replay(local_buffer=replay_buffer)
            .combine(
                ConcatBatches(
                    min_batch_size=config["train_batch_size"],
                    count_steps_by=config["multiagent"]["count_steps_by"],
                )
            )
            .for_each(TrainOneStep(workers))
            .for_each(
                UpdateTargetNetwork(workers, config["target_network_update_freq"])
            )
        )

        merged_op = Concurrently(
            [store_op, train_op], mode="round_robin", output_indexes=[1]
        )

        return StandardMetricsReporting(merged_op, workers, config)
Code example #11
    def execution_plan(workers: WorkerSet, config: TrainerConfigDict,
                       **kwargs) -> LocalIterator[dict]:
        assert (
            len(kwargs) == 0
        ), "Alpha zero execution_plan does NOT take any additional parameters"

        rollouts = ParallelRollouts(workers, mode="bulk_sync")

        if config["simple_optimizer"]:
            train_op = rollouts.combine(
                ConcatBatches(
                    min_batch_size=config["train_batch_size"],
                    count_steps_by=config["multiagent"]["count_steps_by"],
                )).for_each(
                    TrainOneStep(workers, num_sgd_iter=config["num_sgd_iter"]))
        else:
            replay_buffer = SimpleReplayBuffer(config["buffer_size"])

            store_op = rollouts.for_each(
                StoreToReplayBuffer(local_buffer=replay_buffer))

            replay_op = (Replay(local_buffer=replay_buffer).filter(
                WaitUntilTimestepsElapsed(config["learning_starts"])).combine(
                    ConcatBatches(
                        min_batch_size=config["train_batch_size"],
                        count_steps_by=config["multiagent"]["count_steps_by"],
                    )).for_each(
                        TrainOneStep(workers,
                                     num_sgd_iter=config["num_sgd_iter"])))

            train_op = Concurrently([store_op, replay_op],
                                    mode="round_robin",
                                    output_indexes=[1])

        return StandardMetricsReporting(train_op, workers, config)
Code example #12
def gather_experiences_directly(workers, config):
    rollouts = ParallelRollouts(
        workers,
        mode="async",
        num_async=config["max_requests_in_flight_per_sampler_worker"],
    )

    # Augment with replay and concat to desired train batch size.
    train_batches = (
        rollouts.for_each(lambda batch: batch.decompress_if_needed())
        .for_each(
            MixInReplay(
                num_slots=config["replay_buffer_num_slots"],
                replay_proportion=config["replay_proportion"],
            )
        )
        .flatten()
        .combine(
            ConcatBatches(
                min_batch_size=config["train_batch_size"],
                count_steps_by=config["multiagent"]["count_steps_by"],
            )
        )
    )

    return train_batches
Code example #13
def test_standardize(ray_start_regular_shared):
    workers = make_workers(0)
    a = ParallelRollouts(workers, mode="async")
    b = a.for_each(StandardizeFields(["t"]))
    batch = next(b)
    assert abs(np.mean(batch["t"])) < 0.001, batch
    assert abs(np.std(batch["t"]) - 1.0) < 0.001, batch
Code example #14
def test_avg_gradients(ray_start_regular_shared):
    workers = make_workers(0)
    a = ParallelRollouts(workers, mode="bulk_sync")
    b = a.for_each(ComputeGradients(workers)).batch(4)
    c = b.for_each(AverageGradients())
    grads, counts = next(c)
    assert counts == 400, counts
Code example #15
def execution_plan(workers: WorkerSet,
                   config: TrainerConfigDict) -> LocalIterator[dict]:
    local_replay_buffer = LocalReplayBuffer(
        num_shards=1,
        learning_starts=config["learning_starts"],
        buffer_size=config["buffer_size"],
        replay_batch_size=config["train_batch_size"],
        replay_mode=config["multiagent"]["replay_mode"],
        replay_sequence_length=config["replay_sequence_length"])

    rollouts = ParallelRollouts(workers, mode="bulk_sync")

    # (1) Generate rollouts and store them in our local replay buffer.
    store_op = rollouts.for_each(
        StoreToReplayBuffer(local_buffer=local_replay_buffer))

    # (2) Read and train on experiences from the replay buffer.
    replay_op = Replay(local_buffer=local_replay_buffer) \
        .for_each(TrainOneStep(workers)) \
        .for_each(UpdateTargetNetwork(
            workers, config["target_network_update_freq"]))

    # Alternate deterministically between (1) and (2).
    train_op = Concurrently([store_op, replay_op],
                            mode="round_robin",
                            output_indexes=[1])

    return StandardMetricsReporting(train_op, workers, config)
Code example #16
File: dqn.py Project: zhangjiekui/ray
def execution_plan(workers, config):
    if config.get("prioritized_replay"):
        prio_args = {
            "prioritized_replay_alpha": config["prioritized_replay_alpha"],
            "prioritized_replay_beta": config["prioritized_replay_beta"],
            "prioritized_replay_eps": config["prioritized_replay_eps"],
        }
    else:
        prio_args = {}

    local_replay_buffer = LocalReplayBuffer(
        num_shards=1,
        learning_starts=config["learning_starts"],
        buffer_size=config["buffer_size"],
        replay_batch_size=config["train_batch_size"],
        multiagent_sync_replay=config.get("multiagent_sync_replay"),
        **prio_args)

    rollouts = ParallelRollouts(workers, mode="bulk_sync")

    # We execute the following steps concurrently:
    # (1) Generate rollouts and store them in our local replay buffer. Calling
    # next() on store_op drives this.
    store_op = rollouts.for_each(
        StoreToReplayBuffer(local_buffer=local_replay_buffer))

    def update_prio(item):
        samples, info_dict = item
        if config.get("prioritized_replay"):
            prio_dict = {}
            for policy_id, info in info_dict.items():
                # TODO(sven): This is currently structured differently for
                #  torch/tf. Clean up these results/info dicts across
                #  policies (note: fixing this in torch_policy.py will
                #  break e.g. DDPPO!).
                td_error = info.get("td_error",
                                    info[LEARNER_STATS_KEY].get("td_error"))
                prio_dict[policy_id] = (samples.policy_batches[policy_id]
                                        .data.get("batch_indexes"), td_error)
            local_replay_buffer.update_priorities(prio_dict)
        return info_dict

    # (2) Read and train on experiences from the replay buffer. Every batch
    # returned from the LocalReplay() iterator is passed to TrainOneStep to
    # take an SGD step, and then we decide whether to update the target network.
    post_fn = config.get("before_learn_on_batch") or (lambda b, *a: b)
    replay_op = Replay(local_buffer=local_replay_buffer) \
        .for_each(lambda x: post_fn(x, workers, config)) \
        .for_each(TrainOneStep(workers)) \
        .for_each(update_prio) \
        .for_each(UpdateTargetNetwork(
            workers, config["target_network_update_freq"]))

    # Alternate deterministically between (1) and (2). Only return the output
    # of (2) since training metrics are not available until (2) runs.
    train_op = Concurrently(
        [store_op, replay_op], mode="round_robin", output_indexes=[1])

    return StandardMetricsReporting(train_op, workers, config)
Code example #17
def test_compute_gradients(ray_start_regular_shared):
    workers = make_workers(0)
    a = ParallelRollouts(workers, mode="bulk_sync")
    b = a.for_each(ComputeGradients(workers))
    grads, counts = next(b)
    assert counts == 100, counts
    timers = a.shared_metrics.get().timers
    assert "compute_grads" in timers
Code example #18
def execution_plan(workers, config):
    rollouts = ParallelRollouts(workers, mode="bulk_sync")

    # Collect large batches of relevant experiences & standardize.
    rollouts = rollouts.for_each(
        SelectExperiences(workers.trainable_policies()))
    rollouts = rollouts.combine(
        ConcatBatches(min_batch_size=config["train_batch_size"]))
    rollouts = rollouts.for_each(StandardizeFields(["advantages"]))

    if config["simple_optimizer"]:
        train_op = rollouts.for_each(
            TrainOneStep(workers,
                         num_sgd_iter=config["num_sgd_iter"],
                         sgd_minibatch_size=config["sgd_minibatch_size"]))
    else:
        train_op = rollouts.for_each(
            TrainTFMultiGPU(
                workers,
                sgd_minibatch_size=config["sgd_minibatch_size"],
                num_sgd_iter=config["num_sgd_iter"],
                num_gpus=config["num_gpus"],
                rollout_fragment_length=config["rollout_fragment_length"],
                num_envs_per_worker=config["num_envs_per_worker"],
                train_batch_size=config["train_batch_size"],
                shuffle_sequences=config["shuffle_sequences"],
                _fake_gpus=config["_fake_gpus"]))

    # Callback to update the KL based on optimization info.
    def update_kl(item):
        _, fetches = item

        def update(pi, pi_id):
            if pi_id in fetches:
                pi.update_kl(fetches[pi_id]["kl"])
            else:
                logger.warning("No data for {}, not updating kl".format(pi_id))

        workers.local_worker().foreach_trainable_policy(update)

    # Update KL after each round of training.
    train_op = train_op.for_each(update_kl)

    return StandardMetricsReporting(train_op, workers, config) \
        .for_each(lambda result: _warn_about_bad_reward_scales(config, result))
Code example #19
File: ppo.py Project: ijrsvt/ray
    def execution_plan(workers: WorkerSet, config: TrainerConfigDict,
                       **kwargs) -> LocalIterator[dict]:
        assert (len(kwargs) == 0
                ), "PPO execution_plan does NOT take any additional parameters"

        rollouts = ParallelRollouts(workers, mode="bulk_sync")

        # Collect batches for the trainable policies.
        rollouts = rollouts.for_each(
            SelectExperiences(local_worker=workers.local_worker()))
        # Concatenate the SampleBatches into one.
        rollouts = rollouts.combine(
            ConcatBatches(
                min_batch_size=config["train_batch_size"],
                count_steps_by=config["multiagent"]["count_steps_by"],
            ))
        # Standardize advantages.
        rollouts = rollouts.for_each(StandardizeFields(["advantages"]))

        # Perform one training step on the combined + standardized batch.
        if config["simple_optimizer"]:
            train_op = rollouts.for_each(
                TrainOneStep(
                    workers,
                    num_sgd_iter=config["num_sgd_iter"],
                    sgd_minibatch_size=config["sgd_minibatch_size"],
                ))
        else:
            train_op = rollouts.for_each(
                MultiGPUTrainOneStep(
                    workers=workers,
                    sgd_minibatch_size=config["sgd_minibatch_size"],
                    num_sgd_iter=config["num_sgd_iter"],
                    num_gpus=config["num_gpus"],
                    _fake_gpus=config["_fake_gpus"],
                ))

        # Update KL after each round of training.
        train_op = train_op.for_each(lambda t: t[1]).for_each(
            UpdateKL(workers))

        # Warn about bad reward scales and return training metrics.
        return StandardMetricsReporting(train_op, workers, config).for_each(
            lambda result: warn_about_bad_reward_scales(config, result))
Code example #20
def execution_plan(workers: WorkerSet, config: TrainerConfigDict,
                   **kwargs) -> LocalIterator[dict]:
    rollouts = ParallelRollouts(workers, mode="async")

    # Collect batches for the trainable policies.
    rollouts = rollouts.for_each(
        SelectExperiences(local_worker=workers.local_worker()))

    # Return training metrics.
    return StandardMetricsReporting(rollouts, workers, config)
Code example #21
def execution_plan(trainer: Trainer, workers: WorkerSet,
                   config: TrainerConfigDict, **kwargs) -> LocalIterator[dict]:
    """Execution plan of the Simple Q algorithm. Defines the distributed dataflow.

    Args:
        trainer (Trainer): The Trainer object creating the execution plan.
        workers (WorkerSet): The WorkerSet for training the Polic(y/ies)
            of the Trainer.
        config (TrainerConfigDict): The trainer's configuration dict.

    Returns:
        LocalIterator[dict]: A local iterator over training metrics.
    """
    local_replay_buffer = LocalReplayBuffer(
        num_shards=1,
        learning_starts=config["learning_starts"],
        buffer_size=config["buffer_size"],
        replay_batch_size=config["train_batch_size"],
        replay_mode=config["multiagent"]["replay_mode"],
        replay_sequence_length=config["replay_sequence_length"])
    # Assign to Trainer, so we can store the LocalReplayBuffer's
    # data when we save checkpoints.
    trainer.local_replay_buffer = local_replay_buffer

    rollouts = ParallelRollouts(workers, mode="bulk_sync")

    # (1) Generate rollouts and store them in our local replay buffer.
    store_op = rollouts.for_each(
        StoreToReplayBuffer(local_buffer=local_replay_buffer))

    if config["simple_optimizer"]:
        train_step_op = TrainOneStep(workers)
    else:
        train_step_op = MultiGPUTrainOneStep(
            workers=workers,
            sgd_minibatch_size=config["train_batch_size"],
            num_sgd_iter=1,
            num_gpus=config["num_gpus"],
            shuffle_sequences=True,
            _fake_gpus=config["_fake_gpus"],
            framework=config.get("framework"))

    # (2) Read and train on experiences from the replay buffer.
    replay_op = Replay(local_buffer=local_replay_buffer) \
        .for_each(train_step_op) \
        .for_each(UpdateTargetNetwork(
            workers, config["target_network_update_freq"]))

    # Alternate deterministically between (1) and (2).
    train_op = Concurrently([store_op, replay_op],
                            mode="round_robin",
                            output_indexes=[1])

    return StandardMetricsReporting(train_op, workers, config)
Code example #22
File: cql.py Project: zivzone/ray
def execution_plan(workers, config):
    if config.get("prioritized_replay"):
        prio_args = {
            "prioritized_replay_alpha": config["prioritized_replay_alpha"],
            "prioritized_replay_beta": config["prioritized_replay_beta"],
            "prioritized_replay_eps": config["prioritized_replay_eps"],
        }
    else:
        prio_args = {}

    local_replay_buffer = LocalReplayBuffer(
        num_shards=1,
        learning_starts=config["learning_starts"],
        buffer_size=config["buffer_size"],
        replay_batch_size=config["train_batch_size"],
        replay_mode=config["multiagent"]["replay_mode"],
        replay_sequence_length=config.get("replay_sequence_length", 1),
        **prio_args)

    global replay_buffer
    replay_buffer = local_replay_buffer

    rollouts = ParallelRollouts(workers, mode="bulk_sync")

    store_op = rollouts.for_each(
        NoOpReplayBuffer(local_buffer=local_replay_buffer))

    def update_prio(item):
        samples, info_dict = item
        if config.get("prioritized_replay"):
            prio_dict = {}
            for policy_id, info in info_dict.items():
                td_error = info.get("td_error",
                                    info[LEARNER_STATS_KEY].get("td_error"))
                prio_dict[policy_id] = (
                    samples.policy_batches[policy_id].get("batch_indexes"),
                    td_error)
            local_replay_buffer.update_priorities(prio_dict)
        return info_dict

    post_fn = config.get("before_learn_on_batch") or (lambda b, *a: b)
    replay_op = Replay(local_buffer=local_replay_buffer) \
        .for_each(lambda x: post_fn(x, workers, config)) \
        .for_each(TrainOneStep(workers)) \
        .for_each(update_prio) \
        .for_each(UpdateTargetNetwork(
            workers, config["target_network_update_freq"]))

    train_op = Concurrently([store_op, replay_op],
                            mode="round_robin",
                            output_indexes=[1],
                            round_robin_weights=calculate_rr_weights(config))

    return StandardMetricsReporting(train_op, workers, config)
Code example #23
File: test_execution.py Project: wolegechu/ray
def test_train_one_step(ray_start_regular_shared):
    workers = make_workers(0)
    a = ParallelRollouts(workers, mode="bulk_sync")
    b = a.for_each(TrainOneStep(workers))
    assert "learner_stats" in next(b)
    counters = a.shared_metrics.get().counters
    assert counters["num_steps_sampled"] == 100, counters
    assert counters["num_steps_trained"] == 100, counters
    timers = a.shared_metrics.get().timers
    assert "learn" in timers
    workers.stop()
Code example #24
def test_train_one_step(ray_start_regular_shared):
    workers = make_workers(0)
    a = ParallelRollouts(workers, mode="bulk_sync")
    b = a.for_each(TrainOneStep(workers))
    batch, stats = next(b)
    assert isinstance(batch, SampleBatch)
    assert "default_policy" in stats
    assert "learner_stats" in stats["default_policy"]
    counters = a.shared_metrics.get().counters
    assert counters["num_steps_sampled"] == 100, counters
    assert counters["num_steps_trained"] == 100, counters
    timers = a.shared_metrics.get().timers
    assert "learn" in timers
    workers.stop()
Code example #25
File: off_policy.py Project: radovankavicky/raylab
def off_policy_execution_plan(workers: WorkerSet, config: TrainerConfigDict):
    """RLlib's default execution plan with an added warmup phase."""
    # Collects experiences in parallel from multiple RolloutWorker actors.
    rollouts = ParallelRollouts(workers, mode="bulk_sync")
    # On the first iteration, combine experience batches until we hit `learning_starts`
    # in size.
    rollouts = rollouts.combine(
        LearningStarts(learning_starts=config["learning_starts"]))
    # Then, train the policy on those experiences and update the workers.
    train_op = rollouts.for_each(TrainOneStep(workers))

    # Add on the standard episode reward, etc. metrics reporting. This returns
    # a LocalIterator[metrics_dict] representing metrics for each train step.
    return StandardMetricsReporting(train_op, workers, config)
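
LearningStarts above is raylab's own combine operator and its implementation is not shown here; presumably it behaves much like RLlib's ConcatBatches, buffering incoming SampleBatches and emitting nothing until `learning_starts` timesteps have accumulated. A rough sketch of such a warmup combiner under that assumption (hypothetical, not raylab's actual code):

# Hypothetical warmup combiner for use with LocalIterator.combine().
from ray.rllib.policy.sample_batch import SampleBatch


class WarmupConcat:
    """Buffer SampleBatches until `learning_starts` env steps are collected,
    then emit one combined batch and pass later batches through unchanged."""

    def __init__(self, learning_starts: int):
        self.learning_starts = learning_starts
        self.buffer = []
        self.count = 0
        self.warmed_up = False

    def __call__(self, batch):
        if self.warmed_up:
            return [batch]            # warmup done: forward batches as-is
        self.buffer.append(batch)
        self.count += batch.count
        if self.count < self.learning_starts:
            return []                 # still warming up: emit nothing yet
        self.warmed_up = True
        out = SampleBatch.concat_samples(self.buffer)
        self.buffer = []
        return [out]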
Code example #26
File: test_execution.py Project: stjordanis/ray
def test_train_one_step(ray_start_regular_shared):
    workers = make_workers(0)
    a = ParallelRollouts(workers, mode="bulk_sync")
    b = a.for_each(TrainOneStep(workers))
    batch, stats = next(b)
    assert isinstance(batch, SampleBatch)
    assert DEFAULT_POLICY_ID in stats
    assert "learner_stats" in stats[DEFAULT_POLICY_ID]
    counters = a.shared_metrics.get().counters
    assert counters[STEPS_SAMPLED_COUNTER] == 100, counters
    assert counters[STEPS_TRAINED_COUNTER] == 100, counters
    timers = a.shared_metrics.get().timers
    assert "learn" in timers
    workers.stop()
Code example #27
File: slateq.py Project: zivzone/ray
def execution_plan(workers: WorkerSet,
                   config: TrainerConfigDict) -> LocalIterator[dict]:
    """Execution plan of the SlateQ algorithm. Defines the distributed dataflow.

    Args:
        workers (WorkerSet): The WorkerSet for training the Polic(y/ies)
            of the Trainer.
        config (TrainerConfigDict): The trainer's configuration dict.

    Returns:
        LocalIterator[dict]: A local iterator over training metrics.
    """
    local_replay_buffer = LocalReplayBuffer(
        num_shards=1,
        learning_starts=config["learning_starts"],
        buffer_size=config["buffer_size"],
        replay_batch_size=config["train_batch_size"],
        replay_mode=config["multiagent"]["replay_mode"],
        replay_sequence_length=config["replay_sequence_length"],
    )

    rollouts = ParallelRollouts(workers, mode="bulk_sync")

    # We execute the following steps concurrently:
    # (1) Generate rollouts and store them in our local replay buffer. Calling
    # next() on store_op drives this.
    store_op = rollouts.for_each(
        StoreToReplayBuffer(local_buffer=local_replay_buffer))

    # (2) Read and train on experiences from the replay buffer. Every batch
    # returned from the LocalReplay() iterator is passed to TrainOneStep to
    # take an SGD step.
    replay_op = Replay(local_buffer=local_replay_buffer) \
        .for_each(TrainOneStep(workers))

    if config["slateq_strategy"] != "RANDOM":
        # Alternate deterministically between (1) and (2). Only return the
        # output of (2) since training metrics are not available until (2)
        # runs.
        train_op = Concurrently(
            [store_op, replay_op],
            mode="round_robin",
            output_indexes=[1],
            round_robin_weights=calculate_round_robin_weights(config))
    else:
        # No training is needed for the RANDOM strategy.
        train_op = rollouts

    return StandardMetricsReporting(train_op, workers, config)
Code example #28
def execution_plan(workers, config):
    # Special Replay Buffer for Dreamer agent
    episode_buffer = EpisodicBuffer(length=config["batch_length"])

    local_worker = workers.local_worker()

    # Prefill episode buffer with initial exploration (uniform sampling)
    while total_sampled_timesteps(local_worker) < config["prefill_timesteps"]:
        samples = local_worker.sample()
        episode_buffer.add(samples)

    batch_size = config["batch_size"]
    dreamer_train_iters = config["dreamer_train_iters"]
    act_repeat = config["action_repeat"]

    rollouts = ParallelRollouts(workers)
    rollouts = rollouts.for_each(
        DreamerIteration(local_worker, episode_buffer, dreamer_train_iters,
                         batch_size, act_repeat))
    return rollouts
Code example #29
File: test_execution.py Project: stjordanis/ray
def test_store_to_replay_local(ray_start_regular_shared):
    buf = MultiAgentReplayBuffer(num_shards=1,
                                 learning_starts=200,
                                 capacity=1000,
                                 replay_batch_size=100,
                                 prioritized_replay_alpha=0.6,
                                 prioritized_replay_beta=0.4,
                                 prioritized_replay_eps=0.0001)
    assert buf.replay() is None

    workers = make_workers(0)
    a = ParallelRollouts(workers, mode="bulk_sync")
    b = a.for_each(StoreToReplayBuffer(local_buffer=buf))

    next(b)
    assert buf.replay() is None  # learning hasn't started yet
    next(b)
    assert buf.replay().count == 100

    replay_op = Replay(local_buffer=buf)
    assert next(replay_op).count == 100
Code example #30
File: test_execution.py Project: stjordanis/ray
def test_store_to_replay_actor(ray_start_regular_shared):
    actor = ReplayActor.remote(num_shards=1,
                               learning_starts=200,
                               buffer_size=1000,
                               replay_batch_size=100,
                               prioritized_replay_alpha=0.6,
                               prioritized_replay_beta=0.4,
                               prioritized_replay_eps=0.0001)
    assert ray.get(actor.replay.remote()) is None

    workers = make_workers(0)
    a = ParallelRollouts(workers, mode="bulk_sync")
    b = a.for_each(StoreToReplayBuffer(actors=[actor]))

    next(b)
    assert ray.get(actor.replay.remote()) is None  # learning hasn't started
    next(b)
    assert ray.get(actor.replay.remote()).count == 100

    replay_op = Replay(actors=[actor])
    assert next(replay_op).count == 100