Exemple #1
0
def execution_plan(workers, config):
    """Experimental A3C-style plan that splits the remote workers into two
    pools and trains each pool with its own multi-GPU train op, merging the
    two metrics streams at the end.

    NOTE(review): ``temp1`` and ``temp2`` are two references to the SAME
    WorkerSet object, so the second ``reset()`` call replaces the worker
    list installed by the first one — both rollout iterators likely end up
    reading from ``rem2`` only. A true split needs two independent
    WorkerSet instances; confirm intent with the author.

    Args:
        workers: WorkerSet whose remote workers are split into two pools.
        config: Trainer config dict (train_batch_size, num_gpus, ...).

    Returns:
        The union of the two StandardMetricsReporting iterators.
    """
    # Split the remote workers: first 6 vs. next 5.
    rem1 = workers.remote_workers()[0:6]
    rem2 = workers.remote_workers()[6:11]
    temp1 = workers
    temp2 = workers
    temp1.reset(rem1)
    temp2.reset(rem2)

    rollouts1 = ParallelRollouts(temp1, mode="bulk_sync")
    rollouts2 = ParallelRollouts(temp2, mode="bulk_sync")

    def _make_train_op(worker_set, rollouts):
        # One multi-GPU SGD step per concatenated train batch.
        train_step = TrainTFMultiGPU(
            workers=worker_set,
            sgd_minibatch_size=config["train_batch_size"],
            num_sgd_iter=1,
            num_gpus=config["num_gpus"],
            shuffle_sequences=True,
            _fake_gpus=config["_fake_gpus"],
            framework=config.get("framework"))
        return rollouts.combine(
            ConcatBatches(
                min_batch_size=config["train_batch_size"],
                count_steps_by=config["multiagent"][
                    "count_steps_by"])).for_each(train_step)

    train_op1 = _make_train_op(temp1, rollouts1)
    train_op2 = _make_train_op(temp2, rollouts2)

    return StandardMetricsReporting(train_op1, temp1, config).union(
        StandardMetricsReporting(train_op2, temp2, config))
Exemple #2
0
def test_rollouts(ray_start_regular_shared):
    """Check sample counts for bulk_sync vs. async rollout collection."""
    workers = make_workers(2)
    # bulk_sync concatenates one batch per remote worker: 2 * 100 steps.
    rollouts = ParallelRollouts(workers, mode="bulk_sync")
    assert next(rollouts).count == 200
    sampled = rollouts.shared_metrics.get().counters
    assert sampled[STEPS_SAMPLED_COUNTER] == 200, sampled
    # async yields each worker's batch individually: 100 steps apiece.
    rollouts = ParallelRollouts(workers, mode="async")
    assert next(rollouts).count == 100
    sampled = rollouts.shared_metrics.get().counters
    assert sampled[STEPS_SAMPLED_COUNTER] == 100, sampled
    workers.stop()
Exemple #3
0
def test_rollouts(ray_start_regular_shared):
    """Check sampled-step counters for both rollout collection modes."""
    workers = make_workers(2)
    # Two remote workers, 100 steps each -> 200 in one bulk_sync batch.
    it = ParallelRollouts(workers, mode="bulk_sync")
    assert next(it).count == 200
    metrics = it.shared_metrics.get().counters
    assert metrics["num_steps_sampled"] == 200, metrics
    # Async mode emits per-worker batches of 100 steps.
    it = ParallelRollouts(workers, mode="async")
    assert next(it).count == 100
    metrics = it.shared_metrics.get().counters
    assert metrics["num_steps_sampled"] == 100, metrics
    workers.stop()
Exemple #4
0
    def test_store_to_replay_actor(self):
        """Samples flow through StoreToReplayBuffer into a remote replay actor."""
        ReplayActor = ray.remote(num_cpus=0)(MultiAgentReplayBuffer)
        actor = ReplayActor.remote(
            num_shards=1,
            learning_starts=200,
            capacity=1000,
            replay_batch_size=100,
            prioritized_replay_alpha=0.6,
            prioritized_replay_beta=0.4,
            prioritized_replay_eps=0.0001,
        )
        # Nothing stored yet -> sampling returns an empty result.
        assert len(ray.get(actor.sample.remote(100))) == 0

        workers = make_workers(0)
        pipeline = ParallelRollouts(workers, mode="bulk_sync").for_each(
            StoreToReplayBuffer(actors=[actor]))

        next(pipeline)
        # Still below learning_starts=200 after one batch of 100.
        assert len(ray.get(
            actor.sample.remote(100))) == 0  # learning hasn't started
        next(pipeline)
        assert ray.get(actor.sample.remote(100)).count == 100

        reader = Replay(actors=[actor])
        assert next(reader).count == 100
Exemple #5
0
def execution_plan(workers: WorkerSet,
                   config: TrainerConfigDict) -> LocalIterator[dict]:
    """Off-policy plan: store rollouts locally, then replay/train/update.

    Alternates deterministically between storing new experiences and
    training on replayed ones; only training metrics are emitted.
    """
    buffer = LocalReplayBuffer(
        num_shards=1,
        learning_starts=config["learning_starts"],
        buffer_size=config["buffer_size"],
        replay_batch_size=config["train_batch_size"],
        replay_mode=config["multiagent"]["replay_mode"],
        replay_sequence_length=config["replay_sequence_length"])

    rollouts = ParallelRollouts(workers, mode="bulk_sync")

    # (1) Every rollout batch is written into the local replay buffer.
    store_op = rollouts.for_each(StoreToReplayBuffer(local_buffer=buffer))

    # (2) Replayed batches drive one SGD step plus target-net maintenance.
    replay_op = (Replay(local_buffer=buffer)
                 .for_each(TrainOneStep(workers))
                 .for_each(UpdateTargetNetwork(
                     workers, config["target_network_update_freq"])))

    # Round-robin between (1) and (2); report only (2)'s output.
    train_op = Concurrently([store_op, replay_op],
                            mode="round_robin",
                            output_indexes=[1])

    return StandardMetricsReporting(train_op, workers, config)
Exemple #6
0
def execution_plan_nfsp(workers, config):
    """NFSP plan: best-response (replay) plus average-policy (reservoir) flows."""
    # 1. Per-policy buffers for both learning modes.
    policies = config["multiagent"]["policies"]
    replay_buffers = MultiAgentSimpleReplayBuffer(
        config["replay_buffer_size"], policies)
    reservoir_buffers = MultiAgentReservoirBuffer(
        config["reservoir_buffer_size"], policies)
    rollouts = ParallelRollouts(workers, mode="bulk_sync")

    # 2. Every sampled batch feeds both buffer families.
    store_op = rollouts.for_each(
        StoreToBuffers(replay_buffers, reservoir_buffers,
                       config["multiagent"]["policies_to_train"]))  # Sampling

    # 3a. Best-response training from replay, with periodic target updates.
    replay_op = (
        SimpleLocalReplayMultiagent(replay_buffers,
                                    config["replay_train_batch_size"],
                                    config["replay_min_size_to_learn"],
                                    config["replay_train_every"])
        .for_each(TrainOneStep(workers))
        .for_each(UpdateTargetNetwork(
            workers, config["dqn_policy"]["target_network_update_freq"])))

    # 3b. Supervised average-policy training from the reservoir.
    reservoir_op = (
        LocalReservoirMultiagent(reservoir_buffers,
                                 config["reservoir_train_batch_size"],
                                 config["reservoir_min_size_to_learn"],
                                 config["reservoir_train_every"])
        .for_each(TrainOneStep(workers)))

    # 4. Interleave all three flows deterministically.
    train_op = Concurrently([replay_op, reservoir_op, store_op],
                            mode="round_robin")
    return LowMemoryMetricsReporting(train_op, workers, config)
Exemple #7
0
    def execution_plan(workers: WorkerSet, config: TrainerConfigDict,
                       **kwargs) -> LocalIterator[dict]:
        """SlateQ plan: store rollouts and replay-train concurrently."""
        assert "local_replay_buffer" in kwargs, \
            "SlateQ execution plan requires a local replay buffer."

        replay_buffer = kwargs["local_replay_buffer"]
        rollouts = ParallelRollouts(workers, mode="bulk_sync")

        # (1) Write every sampled batch into the shared replay buffer;
        # driven by next() calls on store_op.
        store_op = rollouts.for_each(
            StoreToReplayBuffer(local_buffer=replay_buffer))

        # (2) Each replayed batch takes one SGD step, then the target
        # network is (possibly) refreshed.
        replay_op = (
            Replay(local_buffer=replay_buffer)
            .for_each(TrainOneStep(workers))
            .for_each(UpdateTargetNetwork(
                workers, config["target_network_update_freq"])))

        # Interleave (1) and (2); only (2) produces training metrics, so
        # only its output is reported.
        train_op = Concurrently(
            [store_op, replay_op],
            mode="round_robin",
            output_indexes=[1],
            round_robin_weights=calculate_round_robin_weights(config),
        )

        return StandardMetricsReporting(train_op, workers, config)
Exemple #8
0
def execution_plan(workers, config):
    """PPO plan: gather and standardize experiences, then train and tune KL."""
    rollouts = ParallelRollouts(workers, mode="bulk_sync")

    # Keep only trainable policies' samples, concat up to the full train
    # batch size, and standardize advantages.
    rollouts = (
        rollouts
        .for_each(SelectExperiences(workers.trainable_policies()))
        .combine(ConcatBatches(min_batch_size=config["train_batch_size"]))
        .for_each(StandardizeFields(["advantages"])))

    if config["simple_optimizer"]:
        train_op = rollouts.for_each(
            TrainOneStep(
                workers,
                num_sgd_iter=config["num_sgd_iter"],
                sgd_minibatch_size=config["sgd_minibatch_size"]))
    else:
        train_op = rollouts.for_each(
            TrainTFMultiGPU(
                workers,
                sgd_minibatch_size=config["sgd_minibatch_size"],
                num_sgd_iter=config["num_sgd_iter"],
                num_gpus=config["num_gpus"],
                rollout_fragment_length=config["rollout_fragment_length"],
                num_envs_per_worker=config["num_envs_per_worker"],
                train_batch_size=config["train_batch_size"],
                shuffle_sequences=config["shuffle_sequences"],
                _fake_gpus=config["_fake_gpus"]))

    # Drop the count element of each result tuple, then adapt KL coeffs.
    train_op = train_op.for_each(lambda t: t[1]).for_each(UpdateKL(workers))

    # Report metrics and warn on badly scaled rewards.
    return StandardMetricsReporting(train_op, workers, config) \
        .for_each(lambda result: warn_about_bad_reward_scales(config, result))
Exemple #9
0
def execution_plan(workers, config):
    """A2C-style plan; optionally averages gradients over microbatches."""
    rollouts = ParallelRollouts(workers, mode="bulk_sync")

    if config["microbatch_size"]:
        # Microbatch mode: compute per-microbatch gradients, average a
        # fixed number of them, then apply once. Saves GPU memory while
        # permitting very large effective train batches.
        num_microbatches = math.ceil(
            config["train_batch_size"] / config["microbatch_size"])
        train_op = (
            rollouts
            .combine(ConcatBatches(
                min_batch_size=config["microbatch_size"]))
            .for_each(ComputeGradients(workers))  # (grads, info)
            .batch(num_microbatches)  # List[(grads, info)]
            .for_each(AverageGradients())  # (avg_grads, info)
            .for_each(ApplyGradients(workers)))
    else:
        # Plain mode: one SGD step per full train batch.
        train_op = (
            rollouts
            .combine(ConcatBatches(
                min_batch_size=config["train_batch_size"]))
            .for_each(TrainOneStep(workers)))

    return StandardMetricsReporting(train_op, workers, config)
Exemple #10
0
def gather_experiences_directly(workers, config):
    """Async rollout gathering with replay mix-in and train-batch concat."""
    rollouts = ParallelRollouts(
        workers,
        mode="async",
        num_async=config["max_requests_in_flight_per_sampler_worker"],
    )

    # Decompress incoming batches, mix in replayed ones, then flatten and
    # concatenate up to the configured train batch size.
    decompressed = rollouts.for_each(
        lambda batch: batch.decompress_if_needed())
    mixed = decompressed.for_each(
        MixInReplay(
            num_slots=config["replay_buffer_num_slots"],
            replay_proportion=config["replay_proportion"],
        ))
    train_batches = mixed.flatten().combine(
        ConcatBatches(
            min_batch_size=config["train_batch_size"],
            count_steps_by=config["multiagent"]["count_steps_by"],
        ))

    return train_batches
Exemple #11
0
def evaluate(trainer, num_episodes=20):
    """Roll out at least `num_episodes` episodes and report mean statistics."""
    rewards, lengths = [], []
    successes, outs, crashes = [], [], []
    start = time.time()
    episode_count = 0
    while episode_count < num_episodes:
        # Pull one synchronized batch and slice it into whole episodes.
        batch = next(ParallelRollouts(trainer.workers, mode="bulk_sync"))
        episodes = batch.split_by_episode()

        for ep in episodes:
            rewards.append(ep["rewards"].sum())
            lengths.append(ep.count)
            last_info = ep["infos"][-1]
            successes.append(last_info["arrive_dest"])
            outs.append(last_info["out_of_road"])
            crashes.append(last_info["crash"])

        episode_count += len(episodes)
        print("Finish {} episodes".format(episode_count))

    ret = dict(
        reward=np.mean(rewards),
        length=np.mean(lengths),
        success_rate=np.mean(successes),
        out_rate=np.mean(outs),
        crash_rate=np.mean(crashes)
    )
    print("We collected {} episodes. Spent: {:.3f} s.\nResult: {}".format(
        episode_count, time.time() - start, {k: round(v, 3) for k, v in ret.items()}
    ))
    return ret
Exemple #12
0
def execution_plan(workers: WorkerSet,
                   config: TrainerConfigDict) -> LocalIterator[dict]:
    """Execution plan of the MARWIL/BC algorithm. Defines the distributed
    dataflow.

    Args:
        workers (WorkerSet): The WorkerSet for training the Polic(y/ies)
            of the Trainer.
        config (TrainerConfigDict): The trainer's configuration dict.

    Returns:
        LocalIterator[dict]: A local iterator over training metrics.
    """
    rollouts = ParallelRollouts(workers, mode="bulk_sync")
    replay_buffer = SimpleReplayBuffer(config["replay_buffer_size"])

    # Store every sampled batch into the replay buffer.
    store_op = rollouts.for_each(
        StoreToReplayBuffer(local_buffer=replay_buffer))

    # Replay, concat up to train_batch_size, then one SGD step.
    replay_op = (
        Replay(local_buffer=replay_buffer)
        .combine(ConcatBatches(
            min_batch_size=config["train_batch_size"],
            count_steps_by=config["multiagent"]["count_steps_by"],
        ))
        .for_each(TrainOneStep(workers)))

    # Alternate storing and training; report only the training output.
    train_op = Concurrently([store_op, replay_op],
                            mode="round_robin",
                            output_indexes=[1])

    return StandardMetricsReporting(train_op, workers, config)
Exemple #13
0
 def test_standardize(self):
     """StandardizeFields should zero-center and unit-scale the EPS_ID column."""
     workers = make_workers(0)
     pipeline = ParallelRollouts(workers, mode="async").for_each(
         StandardizeFields([SampleBatch.EPS_ID]))
     batch = next(pipeline)
     # Mean ~0 and std ~1 after standardization.
     assert abs(np.mean(batch[SampleBatch.EPS_ID])) < 0.001, batch
     assert abs(np.std(batch[SampleBatch.EPS_ID]) - 1.0) < 0.001, batch
Exemple #14
0
 def test_rollouts_local(self):
     """With zero remote workers, bulk_sync samples from the local worker."""
     workers = make_workers(0)
     it = ParallelRollouts(workers, mode="bulk_sync")
     assert next(it).count == 100
     sampled = it.shared_metrics.get().counters
     assert sampled[STEPS_SAMPLED_COUNTER] == 100, sampled
     workers.stop()
Exemple #15
0
    def execution_plan(
        workers: WorkerSet, config: TrainerConfigDict, **kwargs
    ) -> LocalIterator[dict]:
        """QMIX plan: simple replay buffer, concat-and-train with target sync."""
        assert (
            len(kwargs) == 0
        ), "QMIX execution_plan does NOT take any additional parameters"

        rollouts = ParallelRollouts(workers, mode="bulk_sync")
        buffer = SimpleReplayBuffer(config["buffer_size"])

        # Store every sampled batch.
        store_op = rollouts.for_each(
            StoreToReplayBuffer(local_buffer=buffer))

        # Replay, build full train batches, train, and maintain the
        # target network.
        train_op = Replay(local_buffer=buffer) \
            .combine(ConcatBatches(
                min_batch_size=config["train_batch_size"],
                count_steps_by=config["multiagent"]["count_steps_by"])) \
            .for_each(TrainOneStep(workers)) \
            .for_each(UpdateTargetNetwork(
                workers, config["target_network_update_freq"]))

        # Alternate storing and training; only training metrics are kept.
        merged_op = Concurrently(
            [store_op, train_op], mode="round_robin", output_indexes=[1]
        )

        return StandardMetricsReporting(merged_op, workers, config)
Exemple #16
0
def default_execution_plan(workers: WorkerSet, config: TrainerConfigDict):
    """Default plan: parallel rollouts, concat, then (multi-GPU) train step."""
    # Collect experiences in parallel from the rollout workers.
    rollouts = ParallelRollouts(workers, mode="bulk_sync")

    # Accumulate experience batches until train_batch_size is reached.
    train_op = rollouts.combine(
        ConcatBatches(
            min_batch_size=config["train_batch_size"],
            count_steps_by=config["multiagent"]["count_steps_by"],
        ))

    # Pick the training operator: simple single-process SGD vs. multi-GPU.
    if config.get("simple_optimizer") is True:
        step = TrainOneStep(workers)
    else:
        step = MultiGPUTrainOneStep(
            workers=workers,
            sgd_minibatch_size=config.get("sgd_minibatch_size",
                                          config["train_batch_size"]),
            num_sgd_iter=config.get("num_sgd_iter", 1),
            num_gpus=config["num_gpus"],
            shuffle_sequences=config.get("shuffle_sequences", False),
            _fake_gpus=config["_fake_gpus"],
            framework=config["framework"])
    train_op = train_op.for_each(step)

    # Attach standard episode-reward etc. metrics reporting; yields a
    # LocalIterator[metrics_dict], one dict per train step.
    return StandardMetricsReporting(train_op, workers, config)
Exemple #17
0
    def execution_plan(workers, config, **kwargs):
        """Dreamer plan: prefill the episode buffer, then iterate training."""
        assert (
            len(kwargs) == 0
        ), "Dreamer execution_plan does NOT take any additional parameters"

        # Dreamer keeps whole episodes in a dedicated buffer.
        episode_buffer = EpisodicBuffer(length=config["batch_length"])

        local_worker = workers.local_worker()

        # Prefill with uniform exploration until enough timesteps exist.
        while total_sampled_timesteps(
                local_worker) < config["prefill_timesteps"]:
            episode_buffer.add(local_worker.sample())

        # Each rollout triggers one full Dreamer training iteration.
        rollouts = ParallelRollouts(workers).for_each(
            DreamerIteration(
                local_worker,
                episode_buffer,
                config["dreamer_train_iters"],
                config["batch_size"],
                config["action_repeat"],
            ))
        return rollouts
Exemple #18
0
def test_avg_gradients(ray_start_regular_shared):
    """Averaging 4 gradient batches of 100 steps reports a count of 400."""
    workers = make_workers(0)
    pipeline = (ParallelRollouts(workers, mode="bulk_sync")
                .for_each(ComputeGradients(workers))
                .batch(4)
                .for_each(AverageGradients()))
    grads, counts = next(pipeline)
    assert counts == 400, counts
Exemple #19
0
    def execution_plan(workers, config, **kwargs):
        """Generic off-policy plan: store + replay-train concurrently."""
        assert "local_replay_buffer" in kwargs, (
            "GenericOffPolicy execution plan requires a local replay buffer.")

        buffer = kwargs["local_replay_buffer"]

        rollouts = ParallelRollouts(workers, mode="bulk_sync")

        # (1) Every rollout batch is stored in the shared replay buffer.
        store_op = rollouts.for_each(
            StoreToReplayBuffer(local_buffer=buffer))

        # Single-process vs. multi-GPU training operator.
        if config["simple_optimizer"]:
            train_step_op = TrainOneStep(workers)
        else:
            train_step_op = MultiGPUTrainOneStep(
                workers=workers,
                sgd_minibatch_size=config["train_batch_size"],
                num_sgd_iter=1,
                num_gpus=config["num_gpus"],
                _fake_gpus=config["_fake_gpus"])

        # (2) Replayed batches: train, then maintain the target network.
        replay_op = (
            Replay(local_buffer=buffer)
            .for_each(train_step_op)
            .for_each(UpdateTargetNetwork(
                workers, config["target_network_update_freq"])))

        # Alternate (1)/(2); only (2) emits metrics.
        train_op = Concurrently([store_op, replay_op],
                                mode="round_robin",
                                output_indexes=[1])

        return StandardMetricsReporting(train_op, workers, config)
def execution_plan(workers, config, **kwargs):
    """AlphaZero plan: direct training, or buffered replay when not simple."""
    assert len(kwargs) == 0, (
        "Alpha zero execution_plan does NOT take any additional parameters")

    rollouts = ParallelRollouts(workers, mode="bulk_sync")

    if config["simple_optimizer"]:
        # Train straight off concatenated rollout batches.
        train_op = (
            rollouts
            .combine(ConcatBatches(
                min_batch_size=config["train_batch_size"],
                count_steps_by=config["multiagent"]["count_steps_by"],
            ))
            .for_each(TrainOneStep(
                workers, num_sgd_iter=config["num_sgd_iter"])))
    else:
        # Buffer experiences and train from replay once learning starts.
        buffer = SimpleReplayBuffer(config["buffer_size"])

        store_op = rollouts.for_each(
            StoreToReplayBuffer(local_buffer=buffer))

        replay_op = (
            Replay(local_buffer=buffer)
            .filter(WaitUntilTimestepsElapsed(config["learning_starts"]))
            .combine(ConcatBatches(
                min_batch_size=config["train_batch_size"],
                count_steps_by=config["multiagent"]["count_steps_by"],
            ))
            .for_each(TrainOneStep(
                workers, num_sgd_iter=config["num_sgd_iter"])))

        train_op = Concurrently(
            [store_op, replay_op], mode="round_robin", output_indexes=[1])

    return StandardMetricsReporting(train_op, workers, config)
Exemple #21
0
def test_standardize(ray_start_regular_shared):
    """StandardizeFields should zero-center and unit-scale column "t"."""
    workers = make_workers(0)
    pipeline = ParallelRollouts(workers, mode="async").for_each(
        StandardizeFields(["t"]))
    batch = next(pipeline)
    # After standardization: mean ~0, std ~1.
    assert abs(np.mean(batch["t"])) < 0.001, batch
    assert abs(np.std(batch["t"]) - 1.0) < 0.001, batch
Exemple #22
0
    def execution_plan(workers: WorkerSet, config: TrainerConfigDict,
                       **kwargs) -> LocalIterator[dict]:
        """MARWIL plan: buffer rollouts, replay + concat + one train step.

        Args:
            workers: WorkerSet holding the trainer's policies.
            config: Trainer configuration dict.

        Returns:
            LocalIterator[dict]: Iterator over training metrics.
        """
        # Fixed typo in the assertion message ("Marwill" -> "MARWIL").
        assert len(kwargs) == 0, (
            "MARWIL execution_plan does NOT take any additional parameters")

        rollouts = ParallelRollouts(workers, mode="bulk_sync")
        replay_buffer = MultiAgentReplayBuffer(
            learning_starts=config["learning_starts"],
            capacity=config["replay_buffer_size"],
            replay_batch_size=config["train_batch_size"],
            replay_sequence_length=1,
        )

        # (1) Store every sampled batch into the replay buffer.
        store_op = rollouts \
            .for_each(StoreToReplayBuffer(local_buffer=replay_buffer))

        # (2) Replay, build full train batches, and take one SGD step.
        replay_op = Replay(local_buffer=replay_buffer) \
            .combine(
            ConcatBatches(
                min_batch_size=config["train_batch_size"],
                count_steps_by=config["multiagent"]["count_steps_by"],
            )) \
            .for_each(TrainOneStep(workers))

        # Alternate (1)/(2); report only training output.
        train_op = Concurrently([store_op, replay_op],
                                mode="round_robin",
                                output_indexes=[1])

        return StandardMetricsReporting(train_op, workers, config)
Exemple #23
0
def test_concat_batches(ray_start_regular_shared):
    """ConcatBatches should emit batches of exactly min_batch_size steps."""
    workers = make_workers(0)
    pipeline = ParallelRollouts(workers, mode="async").combine(
        ConcatBatches(1000))
    assert next(pipeline).count == 1000
    # Sampling time must have been recorded in the shared timers.
    timers = pipeline.shared_metrics.get().timers
    assert "sample" in timers
def execution_plan(workers, config):
    """Plan that trains directly, or via a replay buffer with warm-up."""
    rollouts = ParallelRollouts(workers, mode="bulk_sync")

    if config["simple_optimizer"]:
        # Train straight off concatenated rollout batches.
        train_op = (
            rollouts
            .combine(ConcatBatches(
                min_batch_size=config["train_batch_size"]))
            .for_each(TrainOneStep(
                workers, num_sgd_iter=config["num_sgd_iter"])))
    else:
        # Buffer rollouts and train from replay after learning_starts.
        buffer = SimpleReplayBuffer(config["buffer_size"])

        store_op = rollouts.for_each(
            StoreToReplayBuffer(local_buffer=buffer))

        replay_op = (
            Replay(local_buffer=buffer)
            .filter(WaitUntilTimestepsElapsed(config["learning_starts"]))
            .combine(ConcatBatches(
                min_batch_size=config["train_batch_size"]))
            .for_each(TrainOneStep(
                workers, num_sgd_iter=config["num_sgd_iter"])))

        train_op = Concurrently(
            [store_op, replay_op], mode="round_robin", output_indexes=[1])

    return StandardMetricsReporting(train_op, workers, config)
Exemple #25
0
def gather_experiences_tree_aggregation(workers, config):
    """Tree aggregation version of gather_experiences_directly()."""

    rollouts = ParallelRollouts(workers, mode="raw")

    # Round-robin the rollout workers over the aggregation workers.
    num_agg = config["num_aggregation_workers"]
    worker_assignments = [[] for _ in range(num_agg)]
    for w in range(len(workers.remote_workers())):
        worker_assignments[w % num_agg].append(w)
    logger.info("Worker assignments: {}".format(worker_assignments))

    # One shard-selected iterator per aggregation group.
    rollout_groups: List["ParallelIterator[SampleBatchType]"] = [
        rollouts.select_shards(assigned) for assigned in worker_assignments
    ]

    # Spawn |num_aggregation_workers| intermediate actors that aggregate
    # experiences in parallel, colocated with the driver to maximize
    # data bandwidth.
    train_batches = from_actors([
        create_colocated(Aggregator, [config, g], 1)[0] for g in rollout_groups
    ])

    # TODO(ekl) properly account for replay.
    def record_steps_sampled(batch):
        metrics = _get_shared_metrics()
        metrics.counters[STEPS_SAMPLED_COUNTER] += batch.count
        return batch

    return train_batches.gather_async().for_each(record_steps_sampled)
Exemple #26
0
def execution_plan(workers, config):
    """DQN-style plan with optional prioritized replay.

    Stores rollouts into a local replay buffer and concurrently trains on
    replayed batches, updating priorities (when prioritized replay is on)
    and the target network after each train step.
    """
    # Only forward the prioritized-replay knobs when the feature is enabled.
    if config.get("prioritized_replay"):
        prio_args = {
            "prioritized_replay_alpha": config["prioritized_replay_alpha"],
            "prioritized_replay_beta": config["prioritized_replay_beta"],
            "prioritized_replay_eps": config["prioritized_replay_eps"],
        }
    else:
        prio_args = {}

    local_replay_buffer = LocalReplayBuffer(
        num_shards=1,
        learning_starts=config["learning_starts"],
        buffer_size=config["buffer_size"],
        replay_batch_size=config["train_batch_size"],
        multiagent_sync_replay=config.get("multiagent_sync_replay"),
        **prio_args)

    rollouts = ParallelRollouts(workers, mode="bulk_sync")

    # We execute the following steps concurrently:
    # (1) Generate rollouts and store them in our local replay buffer. Calling
    # next() on store_op drives this.
    store_op = rollouts.for_each(
        StoreToReplayBuffer(local_buffer=local_replay_buffer))

    def update_prio(item):
        # item is the (samples, info_dict) pair produced by TrainOneStep;
        # feeds per-policy TD-errors back into the buffer priorities.
        samples, info_dict = item
        if config.get("prioritized_replay"):
            prio_dict = {}
            for policy_id, info in info_dict.items():
                # TODO(sven): This is currently structured differently for
                #  torch/tf. Clean up these results/info dicts across
                #  policies (note: fixing this in torch_policy.py will
                #  break e.g. DDPPO!).
                td_error = info.get("td_error",
                                    info[LEARNER_STATS_KEY].get("td_error"))
                prio_dict[policy_id] = (samples.policy_batches[policy_id]
                                        .data.get("batch_indexes"), td_error)
            local_replay_buffer.update_priorities(prio_dict)
        return info_dict

    # (2) Read and train on experiences from the replay buffer. Every batch
    # returned from the LocalReplay() iterator is passed to TrainOneStep to
    # take a SGD step, and then we decide whether to update the target network.
    post_fn = config.get("before_learn_on_batch") or (lambda b, *a: b)
    replay_op = Replay(local_buffer=local_replay_buffer) \
        .for_each(lambda x: post_fn(x, workers, config)) \
        .for_each(TrainOneStep(workers)) \
        .for_each(update_prio) \
        .for_each(UpdateTargetNetwork(
            workers, config["target_network_update_freq"]))

    # Alternate deterministically between (1) and (2). Only return the output
    # of (2) since training metrics are not available until (2) runs.
    train_op = Concurrently(
        [store_op, replay_op], mode="round_robin", output_indexes=[1])

    return StandardMetricsReporting(train_op, workers, config)
Exemple #27
0
def test_compute_gradients(ray_start_regular_shared):
    """ComputeGradients yields (grads, count) and records its timer."""
    workers = make_workers(0)
    rollouts = ParallelRollouts(workers, mode="bulk_sync")
    grad_iter = rollouts.for_each(ComputeGradients(workers))
    grads, counts = next(grad_iter)
    # One local batch of 100 steps was used for the gradient.
    assert counts == 100, counts
    timers = rollouts.shared_metrics.get().timers
    assert "compute_grads" in timers
Exemple #28
0
def custom_training_workflow(workers: WorkerSet, config: dict):
    """Joint PPO + DQN training workflow over a shared rollout stream.

    Rollouts are duplicated: one copy feeds a DQN store/replay sub-flow,
    the other a PPO concat/standardize/train sub-flow; both run
    concurrently and only the DQN branch's output is reported.
    """
    local_replay_buffer = LocalReplayBuffer(num_shards=1,
                                            learning_starts=1000,
                                            buffer_size=50000,
                                            replay_batch_size=64)

    def add_ppo_metrics(batch):
        # NOTE(review): both "env steps" and "agent steps" print
        # batch.env_steps(); the latter presumably meant
        # batch.agent_steps() -- confirm with the author.
        print("PPO policy learning on samples from",
              batch.policy_batches.keys(), "env steps", batch.env_steps(),
              "agent steps", batch.env_steps())
        metrics = _get_shared_metrics()
        metrics.counters["agent_steps_trained_PPO"] += batch.env_steps()
        return batch

    def add_dqn_metrics(batch):
        # Same env_steps()/agent-steps caveat as add_ppo_metrics above.
        print("DQN policy learning on samples from",
              batch.policy_batches.keys(), "env steps", batch.env_steps(),
              "agent steps", batch.env_steps())
        metrics = _get_shared_metrics()
        metrics.counters["agent_steps_trained_DQN"] += batch.env_steps()
        return batch

    # Generate common experiences.
    rollouts = ParallelRollouts(workers, mode="bulk_sync")
    r1, r2 = rollouts.duplicate(n=2)

    # DQN sub-flow: store dqn_policy samples, then replay/train/target-sync.
    dqn_store_op = r1.for_each(SelectExperiences(["dqn_policy"])) \
        .for_each(
            StoreToReplayBuffer(local_buffer=local_replay_buffer))
    dqn_replay_op = Replay(local_buffer=local_replay_buffer) \
        .for_each(add_dqn_metrics) \
        .for_each(TrainOneStep(workers, policies=["dqn_policy"])) \
        .for_each(UpdateTargetNetwork(
            workers, target_update_freq=500, policies=["dqn_policy"]))
    dqn_train_op = Concurrently([dqn_store_op, dqn_replay_op],
                                mode="round_robin",
                                output_indexes=[1])

    # PPO sub-flow: concat to 200 env steps, standardize advantages, train.
    ppo_train_op = r2.for_each(SelectExperiences(["ppo_policy"])) \
        .combine(ConcatBatches(
            min_batch_size=200, count_steps_by="env_steps")) \
        .for_each(add_ppo_metrics) \
        .for_each(StandardizeFields(["advantages"])) \
        .for_each(TrainOneStep(
            workers,
            policies=["ppo_policy"],
            num_sgd_iter=10,
            sgd_minibatch_size=128))

    # Combined training flow; only the DQN branch (index 1) is reported.
    train_op = Concurrently([ppo_train_op, dqn_train_op],
                            mode="async",
                            output_indexes=[1])

    return StandardMetricsReporting(train_op, workers, config)
def execution_plan(workers: WorkerSet, config: TrainerConfigDict,
                   **kwargs) -> LocalIterator[dict]:
    """Minimal plan: async rollouts filtered to trainable policies."""
    rollouts = ParallelRollouts(workers, mode="async")

    # Restrict batches to the policies the local worker trains.
    selector = SelectExperiences(local_worker=workers.local_worker())
    rollouts = rollouts.for_each(selector)

    # Report standard training metrics over the filtered stream.
    return StandardMetricsReporting(rollouts, workers, config)
Exemple #30
0
def execution_plan(workers, config):
    """DQN-style plan variant that exposes its replay buffer via a module
    global and uses NoOpReplayBuffer on the store path.

    Side effect: rebinds the module-level ``replay_buffer`` global to the
    newly created LocalReplayBuffer (presumably so external code can
    inspect or feed it -- confirm with callers).
    """
    # Forward prioritized-replay knobs only when the feature is enabled.
    if config.get("prioritized_replay"):
        prio_args = {
            "prioritized_replay_alpha": config["prioritized_replay_alpha"],
            "prioritized_replay_beta": config["prioritized_replay_beta"],
            "prioritized_replay_eps": config["prioritized_replay_eps"],
        }
    else:
        prio_args = {}

    local_replay_buffer = LocalReplayBuffer(
        num_shards=1,
        learning_starts=config["learning_starts"],
        buffer_size=config["buffer_size"],
        replay_batch_size=config["train_batch_size"],
        replay_mode=config["multiagent"]["replay_mode"],
        replay_sequence_length=config.get("replay_sequence_length", 1),
        **prio_args)

    # Publish the buffer for external access (see docstring note).
    global replay_buffer
    replay_buffer = local_replay_buffer

    rollouts = ParallelRollouts(workers, mode="bulk_sync")

    # Store path uses NoOpReplayBuffer -- batches pass through rather than
    # being written by this op; presumably storage happens elsewhere.
    store_op = rollouts.for_each(
        NoOpReplayBuffer(local_buffer=local_replay_buffer))

    def update_prio(item):
        # Feed per-policy TD-errors from the train info back into the
        # buffer's priorities (prioritized replay only).
        samples, info_dict = item
        if config.get("prioritized_replay"):
            prio_dict = {}
            for policy_id, info in info_dict.items():
                td_error = info.get("td_error",
                                    info[LEARNER_STATS_KEY].get("td_error"))
                prio_dict[policy_id] = (
                    samples.policy_batches[policy_id].get("batch_indexes"),
                    td_error)
            local_replay_buffer.update_priorities(prio_dict)
        return info_dict

    # Optional hook applied to each replayed batch before learning.
    post_fn = config.get("before_learn_on_batch") or (lambda b, *a: b)
    replay_op = Replay(local_buffer=local_replay_buffer) \
        .for_each(lambda x: post_fn(x, workers, config)) \
        .for_each(TrainOneStep(workers)) \
        .for_each(update_prio) \
        .for_each(UpdateTargetNetwork(
            workers, config["target_network_update_freq"]))

    # Weighted round-robin between storing and training; only training
    # output (index 1) is reported.
    train_op = Concurrently([store_op, replay_op],
                            mode="round_robin",
                            output_indexes=[1],
                            round_robin_weights=calculate_rr_weights(config))

    return StandardMetricsReporting(train_op, workers, config)