Example #1
def execution_plan(workers: WorkerSet,
                   config: TrainerConfigDict) -> LocalIterator[dict]:
    local_replay_buffer = LocalReplayBuffer(
        num_shards=1,
        learning_starts=config["learning_starts"],
        buffer_size=config["buffer_size"],
        replay_batch_size=config["train_batch_size"],
        replay_mode=config["multiagent"]["replay_mode"],
        replay_sequence_length=config["replay_sequence_length"])

    rollouts = ParallelRollouts(workers, mode="bulk_sync")

    # (1) Generate rollouts and store them in our local replay buffer.
    store_op = rollouts.for_each(
        StoreToReplayBuffer(local_buffer=local_replay_buffer))

    # (2) Read and train on experiences from the replay buffer.
    replay_op = Replay(local_buffer=local_replay_buffer) \
        .for_each(TrainOneStep(workers)) \
        .for_each(UpdateTargetNetwork(
            workers, config["target_network_update_freq"]))

    # Alternate deterministically between (1) and (2).
    train_op = Concurrently([store_op, replay_op],
                            mode="round_robin",
                            output_indexes=[1])

    return StandardMetricsReporting(train_op, workers, config)
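(Usage note: an execution_plan like this is not called directly; in the pre-2.0 RLlib API it is passed to the Trainer builder. A minimal sketch, assuming ray.rllib.agents.trainer_template.build_trainer and the DQN defaults are available in the Ray version these examples target; the trainer name is illustrative only.)

from ray.rllib.agents import dqn
from ray.rllib.agents.trainer_template import build_trainer

# Wire the execution_plan above into RLlib's standard Trainer machinery.
MyDQNLikeTrainer = build_trainer(
    name="MyDQNLikeTrainer",
    default_config=dqn.DEFAULT_CONFIG,
    default_policy=dqn.DQNTFPolicy,
    execution_plan=execution_plan)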
Example #2
def custom_training_workflow(workers: WorkerSet, config: dict):
    local_replay_buffer = LocalReplayBuffer(num_shards=1,
                                            learning_starts=1000,
                                            buffer_size=50000,
                                            replay_batch_size=64)

    def add_ppo_metrics(batch):
        print("PPO policy learning on samples from",
              batch.policy_batches.keys(), "env steps", batch.env_steps(),
              "agent steps", batch.env_steps())
        metrics = _get_shared_metrics()
        metrics.counters["agent_steps_trained_PPO"] += batch.env_steps()
        return batch

    def add_dqn_metrics(batch):
        print("DQN policy learning on samples from",
              batch.policy_batches.keys(), "env steps", batch.env_steps(),
              "agent steps", batch.env_steps())
        metrics = _get_shared_metrics()
        metrics.counters["agent_steps_trained_DQN"] += batch.env_steps()
        return batch

    # Generate common experiences.
    rollouts = ParallelRollouts(workers, mode="bulk_sync")
    r1, r2 = rollouts.duplicate(n=2)

    # DQN sub-flow.
    dqn_store_op = r1.for_each(SelectExperiences(["dqn_policy"])) \
        .for_each(
            StoreToReplayBuffer(local_buffer=local_replay_buffer))
    dqn_replay_op = Replay(local_buffer=local_replay_buffer) \
        .for_each(add_dqn_metrics) \
        .for_each(TrainOneStep(workers, policies=["dqn_policy"])) \
        .for_each(UpdateTargetNetwork(
            workers, target_update_freq=500, policies=["dqn_policy"]))
    dqn_train_op = Concurrently([dqn_store_op, dqn_replay_op],
                                mode="round_robin",
                                output_indexes=[1])

    # PPO sub-flow.
    ppo_train_op = r2.for_each(SelectExperiences(["ppo_policy"])) \
        .combine(ConcatBatches(
            min_batch_size=200, count_steps_by="env_steps")) \
        .for_each(add_ppo_metrics) \
        .for_each(StandardizeFields(["advantages"])) \
        .for_each(TrainOneStep(
            workers,
            policies=["ppo_policy"],
            num_sgd_iter=10,
            sgd_minibatch_size=128))

    # Combined training flow
    train_op = Concurrently([ppo_train_op, dqn_train_op],
                            mode="async",
                            output_indexes=[1])

    return StandardMetricsReporting(train_op, workers, config)
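(Usage note: this workflow assumes the multiagent config defines the policy IDs "ppo_policy" and "dqn_policy". A rough sketch of such a config follows; the observation/action spaces and the mapping rule are placeholders, not taken from the snippet.)

import gym
from ray.rllib.agents.dqn.dqn_torch_policy import DQNTorchPolicy
from ray.rllib.agents.ppo.ppo_torch_policy import PPOTorchPolicy

# Illustrative spaces only; the real ones come from the environment.
obs_space = gym.spaces.Box(-1.0, 1.0, (4,))
act_space = gym.spaces.Discrete(2)

config = {
    "multiagent": {
        "policies": {
            "ppo_policy": (PPOTorchPolicy, obs_space, act_space, {}),
            "dqn_policy": (DQNTorchPolicy, obs_space, act_space, {}),
        },
        # Hypothetical mapping: even-numbered agents use PPO, odd ones DQN.
        "policy_mapping_fn": lambda agent_id: (
            "ppo_policy" if agent_id % 2 == 0 else "dqn_policy"),
    },
}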
Example #3
def execution_plan(trainer: Trainer, workers: WorkerSet,
                   config: TrainerConfigDict, **kwargs) -> LocalIterator[dict]:
    """Execution plan of the Simple Q algorithm. Defines the distributed dataflow.

    Args:
        trainer (Trainer): The Trainer object creating the execution plan.
        workers (WorkerSet): The WorkerSet for training the Polic(y/ies)
            of the Trainer.
        config (TrainerConfigDict): The trainer's configuration dict.

    Returns:
        LocalIterator[dict]: A local iterator over training metrics.
    """
    local_replay_buffer = LocalReplayBuffer(
        num_shards=1,
        learning_starts=config["learning_starts"],
        buffer_size=config["buffer_size"],
        replay_batch_size=config["train_batch_size"],
        replay_mode=config["multiagent"]["replay_mode"],
        replay_sequence_length=config["replay_sequence_length"])
    # Assign to Trainer, so we can store the LocalReplayBuffer's
    # data when we save checkpoints.
    trainer.local_replay_buffer = local_replay_buffer

    rollouts = ParallelRollouts(workers, mode="bulk_sync")

    # (1) Generate rollouts and store them in our local replay buffer.
    store_op = rollouts.for_each(
        StoreToReplayBuffer(local_buffer=local_replay_buffer))

    if config["simple_optimizer"]:
        train_step_op = TrainOneStep(workers)
    else:
        train_step_op = MultiGPUTrainOneStep(
            workers=workers,
            sgd_minibatch_size=config["train_batch_size"],
            num_sgd_iter=1,
            num_gpus=config["num_gpus"],
            shuffle_sequences=True,
            _fake_gpus=config["_fake_gpus"],
            framework=config.get("framework"))

    # (2) Read and train on experiences from the replay buffer.
    replay_op = Replay(local_buffer=local_replay_buffer) \
        .for_each(train_step_op) \
        .for_each(UpdateTargetNetwork(
            workers, config["target_network_update_freq"]))

    # Alternate deterministically between (1) and (2).
    train_op = Concurrently([store_op, replay_op],
                            mode="round_robin",
                            output_indexes=[1])

    return StandardMetricsReporting(train_op, workers, config)
Example #4
File: cql.py Project: zivzone/ray
def execution_plan(workers, config):
    if config.get("prioritized_replay"):
        prio_args = {
            "prioritized_replay_alpha": config["prioritized_replay_alpha"],
            "prioritized_replay_beta": config["prioritized_replay_beta"],
            "prioritized_replay_eps": config["prioritized_replay_eps"],
        }
    else:
        prio_args = {}

    local_replay_buffer = LocalReplayBuffer(
        num_shards=1,
        learning_starts=config["learning_starts"],
        buffer_size=config["buffer_size"],
        replay_batch_size=config["train_batch_size"],
        replay_mode=config["multiagent"]["replay_mode"],
        replay_sequence_length=config.get("replay_sequence_length", 1),
        **prio_args)

    global replay_buffer
    replay_buffer = local_replay_buffer

    rollouts = ParallelRollouts(workers, mode="bulk_sync")

    store_op = rollouts.for_each(
        NoOpReplayBuffer(local_buffer=local_replay_buffer))

    def update_prio(item):
        samples, info_dict = item
        if config.get("prioritized_replay"):
            prio_dict = {}
            for policy_id, info in info_dict.items():
                td_error = info.get("td_error",
                                    info[LEARNER_STATS_KEY].get("td_error"))
                prio_dict[policy_id] = (
                    samples.policy_batches[policy_id].get("batch_indexes"),
                    td_error)
            local_replay_buffer.update_priorities(prio_dict)
        return info_dict

    post_fn = config.get("before_learn_on_batch") or (lambda b, *a: b)
    replay_op = Replay(local_buffer=local_replay_buffer) \
        .for_each(lambda x: post_fn(x, workers, config)) \
        .for_each(TrainOneStep(workers)) \
        .for_each(update_prio) \
        .for_each(UpdateTargetNetwork(
            workers, config["target_network_update_freq"]))

    train_op = Concurrently([store_op, replay_op],
                            mode="round_robin",
                            output_indexes=[1],
                            round_robin_weights=calculate_rr_weights(config))

    return StandardMetricsReporting(train_op, workers, config)
Example #5
def test_store_to_replay_local(ray_start_regular_shared):
    buf = LocalReplayBuffer(num_shards=1,
                            learning_starts=200,
                            buffer_size=1000,
                            replay_batch_size=100,
                            prioritized_replay_alpha=0.6,
                            prioritized_replay_beta=0.4,
                            prioritized_replay_eps=0.0001)
    assert buf.replay() is None

    workers = make_workers(0)
    a = ParallelRollouts(workers, mode="bulk_sync")
    b = a.for_each(StoreToReplayBuffer(local_buffer=buf))

    next(b)
    assert buf.replay() is None  # learning hasn't started yet
    next(b)
    assert buf.replay().count == 100

    replay_op = Replay(local_buffer=buf)
    assert next(replay_op).count == 100
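(Usage note: the buffer can also be fed directly, outside the iterator pipeline. A minimal sketch, assuming a plain SampleBatch with the usual fields is accepted by add_batch() in this Ray version; all field values are dummies.)

from ray.rllib.execution.replay_buffer import LocalReplayBuffer
from ray.rllib.policy.sample_batch import SampleBatch

buf = LocalReplayBuffer(num_shards=1,
                        learning_starts=10,
                        buffer_size=100,
                        replay_batch_size=5)
n = 20  # Number of dummy transitions to store.
buf.add_batch(SampleBatch({
    "obs": [[0.0]] * n,
    "new_obs": [[0.0]] * n,
    "actions": [0] * n,
    "rewards": [1.0] * n,
    "dones": [False] * n,
    "eps_id": [0] * n,
    "weights": [1.0] * n,
}))
# Returns a MultiAgentBatch of size 5 once learning_starts is exceeded.
print(buf.replay())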
Example #6
def execution_plan(workers: WorkerSet,
                   config: TrainerConfigDict) -> LocalIterator[dict]:
    """Execution plan of the SlateQ algorithm. Defines the distributed dataflow.

    Args:
        workers (WorkerSet): The WorkerSet for training the Polic(y/ies)
            of the Trainer.
        config (TrainerConfigDict): The trainer's configuration dict.

    Returns:
        LocalIterator[dict]: A local iterator over training metrics.
    """
    local_replay_buffer = LocalReplayBuffer(
        num_shards=1,
        learning_starts=config["learning_starts"],
        buffer_size=config["buffer_size"],
        replay_batch_size=config["train_batch_size"],
        replay_mode=config["multiagent"]["replay_mode"],
        replay_sequence_length=config["replay_sequence_length"],
    )

    rollouts = ParallelRollouts(workers, mode="bulk_sync")

    # We execute the following steps concurrently:
    # (1) Generate rollouts and store them in our local replay buffer. Calling
    # next() on store_op drives this.
    store_op = rollouts.for_each(
        StoreToReplayBuffer(local_buffer=local_replay_buffer))

    # (2) Read and train on experiences from the replay buffer. Every batch
    # returned from the LocalReplay() iterator is passed to TrainOneStep to
    # take a SGD step.
    replay_op = Replay(local_buffer=local_replay_buffer) \
        .for_each(TrainOneStep(workers))

    if config["slateq_strategy"] != "RANDOM":
        # Alternate deterministically between (1) and (2). Only return the
        # output of (2) since training metrics are not available until (2)
        # runs.
        train_op = Concurrently(
            [store_op, replay_op],
            mode="round_robin",
            output_indexes=[1],
            round_robin_weights=calculate_round_robin_weights(config))
    else:
        # No training is needed for the RANDOM strategy.
        train_op = rollouts

    return StandardMetricsReporting(train_op, workers, config)
Example #7
def execution_plan(workers: WorkerSet, config: TrainerConfigDict,
                   **kwargs) -> LocalIterator[dict]:
    """Execution plan of the MARWIL/BC algorithm. Defines the distributed
    dataflow.

    Args:
        workers (WorkerSet): The WorkerSet for training the Polic(y/ies)
            of the Trainer.
        config (TrainerConfigDict): The trainer's configuration dict.

    Returns:
        LocalIterator[dict]: A local iterator over training metrics.
    """
    assert len(kwargs) == 0, (
        "Marwill execution_plan does NOT take any additional parameters")

    rollouts = ParallelRollouts(workers, mode="bulk_sync")
    replay_buffer = LocalReplayBuffer(
        learning_starts=config["learning_starts"],
        capacity=config["replay_buffer_size"],
        replay_batch_size=config["train_batch_size"],
        replay_sequence_length=1,
    )

    store_op = rollouts \
        .for_each(StoreToReplayBuffer(local_buffer=replay_buffer))

    replay_op = Replay(local_buffer=replay_buffer) \
        .combine(
            ConcatBatches(
                min_batch_size=config["train_batch_size"],
                count_steps_by=config["multiagent"]["count_steps_by"],
            )) \
        .for_each(TrainOneStep(workers))

    train_op = Concurrently([store_op, replay_op],
                            mode="round_robin",
                            output_indexes=[1])

    return StandardMetricsReporting(train_op, workers, config)
Example #8
def execution_plan(workers, config):
    """The main execution plan is defined here. Since RLlib supports the main sequences
       in a typical RL job, its easy to define each step without any custom code involved."""
    local_replay_buffer = LocalReplayBuffer(
        num_shards=1,
        learning_starts=config['learning_starts'],
        buffer_size=config['buffer_size'],
        replay_batch_size=config['train_batch_size'],
    )

    store_op = ParallelRollouts(workers, mode='bulk_sync') \
        .for_each(StoreToReplayBuffer(local_buffer=local_replay_buffer))

    replay_op = Replay(local_buffer=local_replay_buffer) \
        .for_each(TrainOneStep(workers)) \
        .for_each(UpdateTargetNetwork(workers, config['target_network_update_freq']))

    train_op = Concurrently([store_op, replay_op],
                            mode='round_robin',
                            output_indexes=[1])

    return StandardMetricsReporting(train_op, workers, config)
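(Usage note: the LocalIterator returned by plans like this is normally driven by Trainer.train(), which pulls one metrics dict per call. A rough sketch of that driving loop, assuming workers and config are already built.)

# Hypothetical driver; in practice Trainer.train() does this internally.
metrics_iter = execution_plan(workers, config)
for _ in range(3):
    result = next(metrics_iter)  # One training iteration's metrics dict.
    print(result.get("episode_reward_mean"))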
Example #9
def execution_plan(workers: WorkerSet,
                   config: TrainerConfigDict) -> LocalIterator[dict]:
    """Execution plan of the Simple Q algorithm. Defines the distributed dataflow.

    Args:
        workers (WorkerSet): The WorkerSet for training the Polic(y/ies)
            of the Trainer.
        config (TrainerConfigDict): The trainer's configuration dict.

    Returns:
        LocalIterator[dict]: A local iterator over training metrics.
    """
    local_replay_buffer = LocalReplayBuffer(
        num_shards=1,
        learning_starts=config["learning_starts"],
        buffer_size=config["buffer_size"],
        replay_batch_size=config["train_batch_size"],
        replay_mode=config["multiagent"]["replay_mode"],
        replay_sequence_length=config["replay_sequence_length"])

    rollouts = ParallelRollouts(workers, mode="bulk_sync")

    # (1) Generate rollouts and store them in our local replay buffer.
    store_op = rollouts.for_each(
        StoreToReplayBuffer(local_buffer=local_replay_buffer))

    # (2) Read and train on experiences from the replay buffer.
    replay_op = Replay(local_buffer=local_replay_buffer) \
        .for_each(TrainOneStep(workers)) \
        .for_each(UpdateTargetNetwork(
            workers, config["target_network_update_freq"]))

    # Alternate deterministically between (1) and (2).
    train_op = Concurrently([store_op, replay_op],
                            mode="round_robin",
                            output_indexes=[1])

    return StandardMetricsReporting(train_op, workers, config)
Example #10
    def test_sac_loss_function(self):
        """Tests SAC loss function results across all frameworks."""
        config = sac.DEFAULT_CONFIG.copy()
        # Run locally.
        config["num_workers"] = 0
        config["learning_starts"] = 0
        config["twin_q"] = False
        config["gamma"] = 0.99
        # Switch on deterministic loss so we can compare the loss values.
        config["_deterministic_loss"] = True
        # Use very simple nets.
        config["Q_model"]["fcnet_hiddens"] = [10]
        config["policy_model"]["fcnet_hiddens"] = [10]
        # Make sure timing differences do not affect trainer.train().
        config["min_iter_time_s"] = 0
        # Test SAC with Simplex action space.
        config["env_config"] = {"simplex_actions": True}

        map_ = {
            # Action net.
            "default_policy/fc_1/kernel": "action_model._hidden_layers.0."
            "_model.0.weight",
            "default_policy/fc_1/bias": "action_model._hidden_layers.0."
            "_model.0.bias",
            "default_policy/fc_out/kernel": "action_model."
            "_logits._model.0.weight",
            "default_policy/fc_out/bias": "action_model._logits._model.0.bias",
            "default_policy/value_out/kernel": "action_model."
            "_value_branch._model.0.weight",
            "default_policy/value_out/bias": "action_model."
            "_value_branch._model.0.bias",
            # Q-net.
            "default_policy/fc_1_1/kernel": "q_net."
            "_hidden_layers.0._model.0.weight",
            "default_policy/fc_1_1/bias": "q_net."
            "_hidden_layers.0._model.0.bias",
            "default_policy/fc_out_1/kernel": "q_net._logits._model.0.weight",
            "default_policy/fc_out_1/bias": "q_net._logits._model.0.bias",
            "default_policy/value_out_1/kernel": "q_net."
            "_value_branch._model.0.weight",
            "default_policy/value_out_1/bias": "q_net."
            "_value_branch._model.0.bias",
            "default_policy/log_alpha": "log_alpha",
            # Target action-net.
            "default_policy/fc_1_2/kernel": "action_model."
            "_hidden_layers.0._model.0.weight",
            "default_policy/fc_1_2/bias": "action_model."
            "_hidden_layers.0._model.0.bias",
            "default_policy/fc_out_2/kernel": "action_model."
            "_logits._model.0.weight",
            "default_policy/fc_out_2/bias": "action_model."
            "_logits._model.0.bias",
            "default_policy/value_out_2/kernel": "action_model."
            "_value_branch._model.0.weight",
            "default_policy/value_out_2/bias": "action_model."
            "_value_branch._model.0.bias",
            # Target Q-net
            "default_policy/fc_1_3/kernel": "q_net."
            "_hidden_layers.0._model.0.weight",
            "default_policy/fc_1_3/bias": "q_net."
            "_hidden_layers.0._model.0.bias",
            "default_policy/fc_out_3/kernel": "q_net."
            "_logits._model.0.weight",
            "default_policy/fc_out_3/bias": "q_net."
            "_logits._model.0.bias",
            "default_policy/value_out_3/kernel": "q_net."
            "_value_branch._model.0.weight",
            "default_policy/value_out_3/bias": "q_net."
            "_value_branch._model.0.bias",
            "default_policy/log_alpha_1": "log_alpha",
        }

        env = SimpleEnv
        batch_size = 100
        obs_size = (batch_size, 1)
        actions = np.random.random(size=(batch_size, 2))

        # Batch of size=n.
        input_ = self._get_batch_helper(obs_size, actions, batch_size)

        # Simply compare loss values AND grads of all frameworks with each
        # other.
        prev_fw_loss = weights_dict = None
        expect_c, expect_a, expect_e, expect_t = None, None, None, None
        # History of tf-updated NN-weights over n training steps.
        tf_updated_weights = []
        # History of input batches used.
        tf_inputs = []
        for fw, sess in framework_iterator(config,
                                           frameworks=("tf", "torch"),
                                           session=True):
            # Generate Trainer and get its default Policy object.
            trainer = sac.SACTrainer(config=config, env=env)
            policy = trainer.get_policy()
            p_sess = None
            if sess:
                p_sess = policy.get_session()

            # Set all weights (of all nets) to fixed values.
            if weights_dict is None:
                # Start with the tf vars-dict.
                assert fw in ["tf2", "tf", "tfe"]
                weights_dict = policy.get_weights()
                if fw == "tfe":
                    log_alpha = weights_dict[10]
                    weights_dict = self._translate_tfe_weights(
                        weights_dict, map_)
            else:
                assert fw == "torch"  # Then transfer that to torch Model.
                model_dict = self._translate_weights_to_torch(
                    weights_dict, map_)
                # Have to add this here (not a parameter in tf, but must be
                # one in torch, so it gets properly copied to the GPU(s)).
                model_dict["target_entropy"] = policy.model.target_entropy
                policy.model.load_state_dict(model_dict)
                policy.target_model.load_state_dict(model_dict)

            if fw == "tf":
                log_alpha = weights_dict["default_policy/log_alpha"]
            elif fw == "torch":
                # Actually convert to torch tensors (by accessing everything).
                input_ = policy._lazy_tensor_dict(input_)
                input_ = {k: input_[k] for k in input_.keys()}
                log_alpha = policy.model.log_alpha.detach().cpu().numpy()[0]

            # Only run the expectation once; it should be the same anyway
            # for all frameworks.
            if expect_c is None:
                expect_c, expect_a, expect_e, expect_t = \
                    self._sac_loss_helper(input_, weights_dict,
                                          sorted(weights_dict.keys()),
                                          log_alpha, fw,
                                          gamma=config["gamma"], sess=sess)

            # Get actual outs and compare to expectation AND previous
            # framework. c=critic, a=actor, e=entropy, t=td-error.
            if fw == "tf":
                c, a, e, t, tf_c_grads, tf_a_grads, tf_e_grads = \
                    p_sess.run([
                        policy.critic_loss,
                        policy.actor_loss,
                        policy.alpha_loss,
                        policy.td_error,
                        policy.optimizer().compute_gradients(
                            policy.critic_loss[0],
                            [v for v in policy.model.q_variables() if
                             "value_" not in v.name]),
                        policy.optimizer().compute_gradients(
                            policy.actor_loss,
                            [v for v in policy.model.policy_variables() if
                             "value_" not in v.name]),
                        policy.optimizer().compute_gradients(
                            policy.alpha_loss, policy.model.log_alpha)],
                        feed_dict=policy._get_loss_inputs_dict(
                            input_, shuffle=False))
                tf_c_grads = [g for g, v in tf_c_grads]
                tf_a_grads = [g for g, v in tf_a_grads]
                tf_e_grads = [g for g, v in tf_e_grads]

            elif fw == "tfe":
                with tf.GradientTape() as tape:
                    tf_loss(policy, policy.model, None, input_)
                c, a, e, t = policy.critic_loss, policy.actor_loss, \
                    policy.alpha_loss, policy.td_error
                vars = tape.watched_variables()
                tf_c_grads = tape.gradient(c[0], vars[6:10])
                tf_a_grads = tape.gradient(a, vars[2:6])
                tf_e_grads = tape.gradient(e, vars[10])

            elif fw == "torch":
                loss_torch(policy, policy.model, None, input_)
                c, a, e, t = policy.critic_loss, policy.actor_loss, \
                    policy.alpha_loss, policy.model.td_error

                # Test actor gradients.
                policy.actor_optim.zero_grad()
                assert all(v.grad is None for v in policy.model.q_variables())
                assert all(v.grad is None
                           for v in policy.model.policy_variables())
                assert policy.model.log_alpha.grad is None
                a.backward()
                # `actor_loss` depends on Q-net vars (but these grads must
                # be ignored and overridden in critic_loss.backward!).
                assert not all(
                    torch.mean(v.grad) == 0
                    for v in policy.model.policy_variables())
                assert not all(
                    torch.min(v.grad) == 0
                    for v in policy.model.policy_variables())
                assert policy.model.log_alpha.grad is None
                # Compare with tf ones.
                torch_a_grads = [
                    v.grad for v in policy.model.policy_variables()
                    if v.grad is not None
                ]
                check(tf_a_grads[2],
                      np.transpose(torch_a_grads[0].detach().cpu()))

                # Test critic gradients.
                policy.critic_optims[0].zero_grad()
                assert all(
                    torch.mean(v.grad) == 0.0
                    for v in policy.model.q_variables() if v.grad is not None)
                assert all(
                    torch.min(v.grad) == 0.0
                    for v in policy.model.q_variables() if v.grad is not None)
                assert policy.model.log_alpha.grad is None
                c[0].backward()
                assert not all(
                    torch.mean(v.grad) == 0
                    for v in policy.model.q_variables() if v.grad is not None)
                assert not all(
                    torch.min(v.grad) == 0
                    for v in policy.model.q_variables() if v.grad is not None)
                assert policy.model.log_alpha.grad is None
                # Compare with tf ones.
                torch_c_grads = [v.grad for v in policy.model.q_variables()]
                check(tf_c_grads[0],
                      np.transpose(torch_c_grads[2].detach().cpu()))
                # Compare (unchanged(!) actor grads) with tf ones.
                torch_a_grads = [
                    v.grad for v in policy.model.policy_variables()
                ]
                check(tf_a_grads[2],
                      np.transpose(torch_a_grads[0].detach().cpu()))

                # Test alpha gradient.
                policy.alpha_optim.zero_grad()
                assert policy.model.log_alpha.grad is None
                e.backward()
                assert policy.model.log_alpha.grad is not None
                check(policy.model.log_alpha.grad, tf_e_grads)

            check(c, expect_c)
            check(a, expect_a)
            check(e, expect_e)
            check(t, expect_t)

            # Store this framework's losses in prev_fw_loss to compare with
            # next framework's outputs.
            if prev_fw_loss is not None:
                check(c, prev_fw_loss[0])
                check(a, prev_fw_loss[1])
                check(e, prev_fw_loss[2])
                check(t, prev_fw_loss[3])

            prev_fw_loss = (c, a, e, t)

            # Update weights from our batch (n times).
            for update_iteration in range(5):
                print("train iteration {}".format(update_iteration))
                if fw == "tf":
                    in_ = self._get_batch_helper(obs_size, actions, batch_size)
                    tf_inputs.append(in_)
                    # Set a fake-batch to use
                    # (instead of sampling from replay buffer).
                    buf = LocalReplayBuffer.get_instance_for_testing()
                    buf._fake_batch = in_
                    trainer.train()
                    updated_weights = policy.get_weights()
                    # Net must have changed.
                    if tf_updated_weights:
                        check(updated_weights["default_policy/fc_1/kernel"],
                              tf_updated_weights[-1]
                              ["default_policy/fc_1/kernel"],
                              false=True)
                    tf_updated_weights.append(updated_weights)

                # Compare with updated tf-weights. Must all be the same.
                else:
                    tf_weights = tf_updated_weights[update_iteration]
                    in_ = tf_inputs[update_iteration]
                    # Set a fake-batch to use
                    # (instead of sampling from replay buffer).
                    buf = LocalReplayBuffer.get_instance_for_testing()
                    buf._fake_batch = in_
                    trainer.train()
                    # Compare updated model.
                    for tf_key in sorted(tf_weights.keys()):
                        if re.search("_[23]|alpha", tf_key):
                            continue
                        tf_var = tf_weights[tf_key]
                        torch_var = policy.model.state_dict()[map_[tf_key]]
                        if tf_var.shape != torch_var.shape:
                            check(tf_var,
                                  np.transpose(torch_var.detach().cpu()),
                                  atol=0.003)
                        else:
                            check(tf_var, torch_var, atol=0.003)
                    # And alpha.
                    check(policy.model.log_alpha,
                          tf_weights["default_policy/log_alpha"])
                    # Compare target nets.
                    for tf_key in sorted(tf_weights.keys()):
                        if not re.search("_[23]", tf_key):
                            continue
                        tf_var = tf_weights[tf_key]
                        torch_var = policy.target_model.state_dict()[
                            map_[tf_key]]
                        if tf_var.shape != torch_var.shape:
                            check(tf_var,
                                  np.transpose(torch_var.detach().cpu()),
                                  atol=0.003)
                        else:
                            check(tf_var, torch_var, atol=0.003)
            trainer.stop()
Example #11
    def test_ddpg_loss_function(self):
        """Tests DDPG loss function results across all frameworks."""
        config = ddpg.DEFAULT_CONFIG.copy()
        # Run locally.
        config["num_workers"] = 0
        config["learning_starts"] = 0
        config["twin_q"] = True
        config["use_huber"] = True
        config["huber_threshold"] = 1.0
        config["gamma"] = 0.99
        # Make this small (seems to introduce errors).
        config["l2_reg"] = 1e-10
        config["prioritized_replay"] = False
        # Use very simple nets.
        config["actor_hiddens"] = [10]
        config["critic_hiddens"] = [10]
        # Make sure timing differences do not affect trainer.train().
        config["min_iter_time_s"] = 0
        config["timesteps_per_iteration"] = 100

        map_ = {
            # Normal net.
            "default_policy/actor_hidden_0/kernel":
                "policy_model.action_0._model.0.weight",
            "default_policy/actor_hidden_0/bias":
                "policy_model.action_0._model.0.bias",
            "default_policy/actor_out/kernel":
                "policy_model.action_out._model.0.weight",
            "default_policy/actor_out/bias":
                "policy_model.action_out._model.0.bias",
            "default_policy/sequential/q_hidden_0/kernel":
                "q_model.q_hidden_0._model.0.weight",
            "default_policy/sequential/q_hidden_0/bias":
                "q_model.q_hidden_0._model.0.bias",
            "default_policy/sequential/q_out/kernel":
                "q_model.q_out._model.0.weight",
            "default_policy/sequential/q_out/bias":
                "q_model.q_out._model.0.bias",
            # -- twin.
            "default_policy/sequential_1/twin_q_hidden_0/kernel":
                "twin_q_model.twin_q_hidden_0._model.0.weight",
            "default_policy/sequential_1/twin_q_hidden_0/bias":
                "twin_q_model.twin_q_hidden_0._model.0.bias",
            "default_policy/sequential_1/twin_q_out/kernel":
                "twin_q_model.twin_q_out._model.0.weight",
            "default_policy/sequential_1/twin_q_out/bias":
                "twin_q_model.twin_q_out._model.0.bias",
            # Target net.
            "default_policy/actor_hidden_0_1/kernel":
                "policy_model.action_0._model.0.weight",
            "default_policy/actor_hidden_0_1/bias":
                "policy_model.action_0._model.0.bias",
            "default_policy/actor_out_1/kernel":
                "policy_model.action_out._model.0.weight",
            "default_policy/actor_out_1/bias":
                "policy_model.action_out._model.0.bias",
            "default_policy/sequential_2/q_hidden_0/kernel":
                "q_model.q_hidden_0._model.0.weight",
            "default_policy/sequential_2/q_hidden_0/bias":
                "q_model.q_hidden_0._model.0.bias",
            "default_policy/sequential_2/q_out/kernel":
                "q_model.q_out._model.0.weight",
            "default_policy/sequential_2/q_out/bias":
                "q_model.q_out._model.0.bias",
            # -- twin.
            "default_policy/sequential_3/twin_q_hidden_0/kernel":
                "twin_q_model.twin_q_hidden_0._model.0.weight",
            "default_policy/sequential_3/twin_q_hidden_0/bias":
                "twin_q_model.twin_q_hidden_0._model.0.bias",
            "default_policy/sequential_3/twin_q_out/kernel":
                "twin_q_model.twin_q_out._model.0.weight",
            "default_policy/sequential_3/twin_q_out/bias":
                "twin_q_model.twin_q_out._model.0.bias",
        }

        env = SimpleEnv
        batch_size = 100
        obs_size = (batch_size, 1)
        actions = np.random.random(size=(batch_size, 1))

        # Batch of size=n.
        input_ = self._get_batch_helper(obs_size, actions, batch_size)

        # Simply compare loss values AND grads of all frameworks with each
        # other.
        prev_fw_loss = weights_dict = None
        expect_c, expect_a, expect_t = None, None, None
        # History of tf-updated NN-weights over n training steps.
        tf_updated_weights = []
        # History of input batches used.
        tf_inputs = []
        for fw, sess in framework_iterator(config,
                                           frameworks=("tf", "torch"),
                                           session=True):
            # Generate Trainer and get its default Policy object.
            trainer = ddpg.DDPGTrainer(config=config, env=env)
            policy = trainer.get_policy()
            p_sess = None
            if sess:
                p_sess = policy.get_session()

            # Set all weights (of all nets) to fixed values.
            if weights_dict is None:
                assert fw == "tf"  # Start with the tf vars-dict.
                weights_dict = policy.get_weights()
            else:
                assert fw == "torch"  # Then transfer that to torch Model.
                model_dict = self._translate_weights_to_torch(
                    weights_dict, map_)
                policy.model.load_state_dict(model_dict)
                policy.target_model.load_state_dict(model_dict)

            if fw == "torch":
                # Actually convert to torch tensors.
                input_ = policy._lazy_tensor_dict(input_)
                input_ = {k: input_[k] for k in input_.keys()}

            # Only run the expectation once; it should be the same anyway
            # for all frameworks.
            if expect_c is None:
                expect_c, expect_a, expect_t = \
                    self._ddpg_loss_helper(
                        input_, weights_dict, sorted(weights_dict.keys()), fw,
                        gamma=config["gamma"],
                        huber_threshold=config["huber_threshold"],
                        l2_reg=config["l2_reg"],
                        sess=sess)

            # Get actual outs and compare to expectation AND previous
            # framework. c=critic, a=actor, e=entropy, t=td-error.
            if fw == "tf":
                c, a, t, tf_c_grads, tf_a_grads = \
                    p_sess.run([
                        policy.critic_loss,
                        policy.actor_loss,
                        policy.td_error,
                        policy._critic_optimizer.compute_gradients(
                            policy.critic_loss,
                            policy.model.q_variables()),
                        policy._actor_optimizer.compute_gradients(
                            policy.actor_loss,
                            policy.model.policy_variables())],
                        feed_dict=policy._get_loss_inputs_dict(
                            input_, shuffle=False))
                # Check pure loss values.
                check(c, expect_c)
                check(a, expect_a)
                check(t, expect_t)

                tf_c_grads = [g for g, v in tf_c_grads]
                tf_a_grads = [g for g, v in tf_a_grads]

            elif fw == "torch":
                loss_torch(policy, policy.model, None, input_)
                c, a, t = policy.critic_loss, policy.actor_loss, \
                    policy.td_error
                # Check pure loss values.
                check(c, expect_c)
                check(a, expect_a)
                check(t, expect_t)

                # Test actor gradients.
                policy._actor_optimizer.zero_grad()
                assert all(v.grad is None for v in policy.model.q_variables())
                assert all(v.grad is None
                           for v in policy.model.policy_variables())
                a.backward()
                # `actor_loss` depends on Q-net vars
                # (but not twin-Q-net vars!).
                assert not any(v.grad is None
                               for v in policy.model.q_variables()[:4])
                assert all(v.grad is None
                           for v in policy.model.q_variables()[4:])
                assert not all(
                    torch.mean(v.grad) == 0
                    for v in policy.model.policy_variables())
                assert not all(
                    torch.min(v.grad) == 0
                    for v in policy.model.policy_variables())
                # Compare with tf ones.
                torch_a_grads = [
                    v.grad for v in policy.model.policy_variables()
                ]
                for tf_g, torch_g in zip(tf_a_grads, torch_a_grads):
                    if tf_g.shape != torch_g.shape:
                        check(tf_g, np.transpose(torch_g.cpu()))
                    else:
                        check(tf_g, torch_g)

                # Test critic gradients.
                policy._critic_optimizer.zero_grad()
                assert all(v.grad is None or torch.mean(v.grad) == 0.0
                           for v in policy.model.q_variables())
                assert all(v.grad is None or torch.min(v.grad) == 0.0
                           for v in policy.model.q_variables())
                c.backward()
                assert not all(
                    torch.mean(v.grad) == 0
                    for v in policy.model.q_variables())
                assert not all(
                    torch.min(v.grad) == 0 for v in policy.model.q_variables())
                # Compare with tf ones.
                torch_c_grads = [v.grad for v in policy.model.q_variables()]
                for tf_g, torch_g in zip(tf_c_grads, torch_c_grads):
                    if tf_g.shape != torch_g.shape:
                        check(tf_g, np.transpose(torch_g.cpu()))
                    else:
                        check(tf_g, torch_g)
                # Compare (unchanged(!) actor grads) with tf ones.
                torch_a_grads = [
                    v.grad for v in policy.model.policy_variables()
                ]
                for tf_g, torch_g in zip(tf_a_grads, torch_a_grads):
                    if tf_g.shape != torch_g.shape:
                        check(tf_g, np.transpose(torch_g.cpu()))
                    else:
                        check(tf_g, torch_g)

            # Store this framework's losses in prev_fw_loss to compare with
            # next framework's outputs.
            if prev_fw_loss is not None:
                check(c, prev_fw_loss[0])
                check(a, prev_fw_loss[1])
                check(t, prev_fw_loss[2])

            prev_fw_loss = (c, a, t)

            # Update weights from our batch (n times).
            for update_iteration in range(6):
                print("train iteration {}".format(update_iteration))
                if fw == "tf":
                    in_ = self._get_batch_helper(obs_size, actions, batch_size)
                    tf_inputs.append(in_)
                    # Set a fake-batch to use
                    # (instead of sampling from replay buffer).
                    buf = LocalReplayBuffer.get_instance_for_testing()
                    buf._fake_batch = in_
                    trainer.train()
                    updated_weights = policy.get_weights()
                    # Net must have changed.
                    if tf_updated_weights:
                        check(updated_weights[
                            "default_policy/actor_hidden_0/kernel"],
                              tf_updated_weights[-1]
                              ["default_policy/actor_hidden_0/kernel"],
                              false=True)
                    tf_updated_weights.append(updated_weights)

                # Compare with updated tf-weights. Must all be the same.
                else:
                    tf_weights = tf_updated_weights[update_iteration]
                    in_ = tf_inputs[update_iteration]
                    # Set a fake-batch to use
                    # (instead of sampling from replay buffer).
                    buf = LocalReplayBuffer.get_instance_for_testing()
                    buf._fake_batch = in_
                    trainer.train()
                    # Compare updated model and target weights.
                    for tf_key in tf_weights.keys():
                        tf_var = tf_weights[tf_key]
                        # Target model.
                        if re.search(
                                "actor_out_1|actor_hidden_0_1|sequential_"
                                "[23]", tf_key):
                            torch_var = policy.target_model.state_dict()[
                                map_[tf_key]]
                        # Model.
                        else:
                            torch_var = policy.model.state_dict()[map_[tf_key]]
                        if tf_var.shape != torch_var.shape:
                            check(tf_var,
                                  np.transpose(torch_var.cpu()),
                                  atol=0.1)
                        else:
                            check(tf_var, torch_var, atol=0.1)

            trainer.stop()
Example #12
File: cql.py Project: hngenc/ray
def execution_plan(workers, config):
    if config.get("prioritized_replay"):
        prio_args = {
            "prioritized_replay_alpha": config["prioritized_replay_alpha"],
            "prioritized_replay_beta": config["prioritized_replay_beta"],
            "prioritized_replay_eps": config["prioritized_replay_eps"],
        }
    else:
        prio_args = {}

    local_replay_buffer = LocalReplayBuffer(
        num_shards=1,
        learning_starts=config["learning_starts"],
        buffer_size=config["buffer_size"],
        replay_batch_size=config["train_batch_size"],
        replay_mode=config["multiagent"]["replay_mode"],
        replay_sequence_length=config.get("replay_sequence_length", 1),
        replay_burn_in=config.get("burn_in", 0),
        replay_zero_init_states=config.get("zero_init_states", True),
        **prio_args)

    global replay_buffer
    replay_buffer = local_replay_buffer

    def update_prio(item):
        samples, info_dict = item
        if config.get("prioritized_replay"):
            prio_dict = {}
            for policy_id, info in info_dict.items():
                # TODO(sven): This is currently structured differently for
                #  torch/tf. Clean up these results/info dicts across
                #  policies (note: fixing this in torch_policy.py will
                #  break e.g. DDPPO!).
                td_error = info.get("td_error",
                                    info[LEARNER_STATS_KEY].get("td_error"))
                samples.policy_batches[policy_id].set_get_interceptor(None)
                prio_dict[policy_id] = (
                    samples.policy_batches[policy_id].get("batch_indexes"),
                    td_error)
            local_replay_buffer.update_priorities(prio_dict)
        return info_dict

    # (2) Read and train on experiences from the replay buffer. Every batch
    # returned from the LocalReplay() iterator is passed to TrainOneStep to
    # take a SGD step, and then we decide whether to update the target network.
    post_fn = config.get("before_learn_on_batch") or (lambda b, *a: b)

    if config["simple_optimizer"]:
        train_step_op = TrainOneStep(workers)
    else:
        train_step_op = MultiGPUTrainOneStep(
            workers=workers,
            sgd_minibatch_size=config["train_batch_size"],
            num_sgd_iter=1,
            num_gpus=config["num_gpus"],
            shuffle_sequences=True,
            _fake_gpus=config["_fake_gpus"],
            framework=config.get("framework"))

    replay_op = Replay(local_buffer=local_replay_buffer) \
        .for_each(lambda x: post_fn(x, workers, config)) \
        .for_each(train_step_op) \
        .for_each(update_prio) \
        .for_each(UpdateTargetNetwork(
            workers, config["target_network_update_freq"]))

    return StandardMetricsReporting(replay_op,
                                    workers,
                                    config,
                                    by_steps_trained=True)
Example #13
def execution_plan(workers: WorkerSet,
                   config: TrainerConfigDict) -> LocalIterator[dict]:
    """Execution plan of the DQN algorithm. Defines the distributed dataflow.

    Args:
        workers (WorkerSet): The WorkerSet for training the Polic(y/ies)
            of the Trainer.
        config (TrainerConfigDict): The trainer's configuration dict.

    Returns:
        LocalIterator[dict]: A local iterator over training metrics.
    """
    if config.get("prioritized_replay"):
        prio_args = {
            "prioritized_replay_alpha": config["prioritized_replay_alpha"],
            "prioritized_replay_beta": config["prioritized_replay_beta"],
            "prioritized_replay_eps": config["prioritized_replay_eps"],
        }
    else:
        prio_args = {}

    local_replay_buffer = LocalReplayBuffer(
        num_shards=1,
        learning_starts=config["learning_starts"],
        buffer_size=config["buffer_size"],
        replay_batch_size=config["train_batch_size"],
        replay_mode=config["multiagent"]["replay_mode"],
        replay_sequence_length=config.get("replay_sequence_length", 1),
        replay_burn_in=config.get("burn_in", 0),
        replay_zero_init_states=config.get("zero_init_states", True),
        **prio_args)

    rollouts = ParallelRollouts(workers, mode="bulk_sync")

    # We execute the following steps concurrently:
    # (1) Generate rollouts and store them in our local replay buffer. Calling
    # next() on store_op drives this.
    store_op = rollouts.for_each(
        StoreToReplayBuffer(local_buffer=local_replay_buffer))

    def update_prio(item):
        samples, info_dict = item
        if config.get("prioritized_replay"):
            prio_dict = {}
            for policy_id, info in info_dict.items():
                # TODO(sven): This is currently structured differently for
                #  torch/tf. Clean up these results/info dicts across
                #  policies (note: fixing this in torch_policy.py will
                #  break e.g. DDPPO!).
                td_error = info.get("td_error",
                                    info[LEARNER_STATS_KEY].get("td_error"))
                samples.policy_batches[policy_id].set_get_interceptor(None)
                prio_dict[policy_id] = (
                    samples.policy_batches[policy_id].get("batch_indexes"),
                    td_error)
            local_replay_buffer.update_priorities(prio_dict)
        return info_dict

    # (2) Read and train on experiences from the replay buffer. Every batch
    # returned from the LocalReplay() iterator is passed to TrainOneStep to
    # take a SGD step, and then we decide whether to update the target network.
    post_fn = config.get("before_learn_on_batch") or (lambda b, *a: b)

    if config["simple_optimizer"]:
        train_step_op = TrainOneStep(workers)
    else:
        train_step_op = TrainTFMultiGPU(
            workers=workers,
            sgd_minibatch_size=config["train_batch_size"],
            num_sgd_iter=1,
            num_gpus=config["num_gpus"],
            shuffle_sequences=True,
            _fake_gpus=config["_fake_gpus"],
            framework=config.get("framework"))

    replay_op = Replay(local_buffer=local_replay_buffer) \
        .for_each(lambda x: post_fn(x, workers, config)) \
        .for_each(train_step_op) \
        .for_each(update_prio) \
        .for_each(UpdateTargetNetwork(
            workers, config["target_network_update_freq"]))

    # Alternate deterministically between (1) and (2). Only return the output
    # of (2) since training metrics are not available until (2) runs.
    train_op = Concurrently([store_op, replay_op],
                            mode="round_robin",
                            output_indexes=[1],
                            round_robin_weights=calculate_rr_weights(config))

    return StandardMetricsReporting(train_op, workers, config)
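(Note: calculate_rr_weights is referenced but not shown in these snippets. A rough reconstruction of its intent, assuming DQN-style training_intensity semantics, i.e. a target ratio of trained to sampled steps; not necessarily the exact upstream implementation.)

def calculate_rr_weights(config):
    """Round-robin weights for [store_op, replay_op] (sketch)."""
    if not config.get("training_intensity"):
        return [1, 1]
    # Natural ratio of trained to sampled steps per round-robin cycle.
    native_ratio = (
        config["train_batch_size"] / config["rollout_fragment_length"])
    # Weight the replay op so that trained/sampled approaches the
    # configured training_intensity.
    return [1, config["training_intensity"] / native_ratio]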
Example #14
    def __init__(
        self,
        env,
        batch_size,
        trace_length,
        grid_size,
        exploiter_base_lr,
        exploiter_decay_lr_in_n_epi,
        exploiter_stop_training_after_n_epi,
        train_exploiter_n_times_per_epi,
    ):

        self.stop_training_after_n_epi = exploiter_stop_training_after_n_epi
        self.train_exploiter_n_times_per_epi = train_exploiter_n_times_per_epi

        # with tf.variable_scope(f"dqn_exploiter"):
        # Create the dqn policy for the exploiter
        dqn_config = copy.deepcopy(DEFAULT_CONFIG)
        dqn_config.update({
            "prioritized_replay": False,
            "double_q": True,
            "buffer_size": 50000,
            "dueling": False,
            "learning_starts": min(
                int((batch_size - 1) * (trace_length - 1)), 64),
            "model": {
                "dim": grid_size,
                "conv_filters": [[16, [3, 3], 1], [32, [3, 3], 1]],
                # [Channel, [Kernel, Kernel], Stride]]
                # "fcnet_hiddens": [self.env.NUM_ACTIONS],
                "max_seq_len": trace_length,
                # Number of hidden layers for fully connected net
                "fcnet_hiddens": [64],
                # Nonlinearity for fully connected net (tanh, relu)
                "fcnet_activation": "relu",
            },
            # Update the replay buffer with this many samples at once. Note that
            # this setting applies per-worker if num_workers > 1.
            "rollout_fragment_length": 1,
            # Size of a batch sampled from replay buffer for training. Note that
            # if async_updates is set, then each worker returns gradients for a
            # batch of this size.
            "train_batch_size": min(int(batch_size * trace_length), 64),
            "explore": False,
            "grad_clip": 1,
            "gamma": 0.5,
            "lr": exploiter_base_lr,
            # Learning rate schedule
            "lr_schedule": [
                (0, exploiter_base_lr / 1000),
                (100, exploiter_base_lr),
                (exploiter_decay_lr_in_n_epi, exploiter_base_lr / 1e9),
            ],
            "sgd_momentum": 0.9,
        })
        print("dqn_config", dqn_config)

        self.local_replay_buffer = LocalReplayBuffer(
            num_shards=1,
            learning_starts=dqn_config["learning_starts"],
            buffer_size=dqn_config["buffer_size"],
            replay_batch_size=dqn_config["train_batch_size"],
            replay_mode=dqn_config["multiagent"]["replay_mode"],
            replay_sequence_length=dqn_config["replay_sequence_length"],
        )

        # self.dqn_exploiter = DQNTFPolicy(obs_space=self.env.OBSERVATION_SPACE,
        #                                  action_space=self.env.ACTION_SPACE,
        #                                  config=dqn_config)

        def sgd_optimizer_dqn(policy, config) -> "torch.optim.Optimizer":
            return torch.optim.SGD(
                policy.q_func_vars,
                lr=policy.cur_lr,
                momentum=config["sgd_momentum"],
            )

        MyDQNTorchPolicy = DQNTorchPolicy.with_updates(
            optimizer_fn=sgd_optimizer_dqn)
        self.dqn_policy = MyDQNTorchPolicy(
            obs_space=env.OBSERVATION_SPACE,
            action_space=env.ACTION_SPACE,
            config=dqn_config,
        )

        self.multi_agent_batch_builders = [
            MultiAgentSampleBatchBuilder(
                policy_map={"player_blue": self.dqn_policy},
                clip_rewards=False,
                callbacks=DefaultCallbacks(),
            )
            # for _ in range(self.batch_size)
        ]
Example #15
class DQNAgent:
    def __init__(
        self,
        env,
        batch_size,
        trace_length,
        grid_size,
        exploiter_base_lr,
        exploiter_decay_lr_in_n_epi,
        exploiter_stop_training_after_n_epi,
        train_exploiter_n_times_per_epi,
    ):

        self.stop_training_after_n_epi = exploiter_stop_training_after_n_epi
        self.train_exploiter_n_times_per_epi = train_exploiter_n_times_per_epi

        # with tf.variable_scope(f"dqn_exploiter"):
        # Create the dqn policy for the exploiter
        dqn_config = copy.deepcopy(DEFAULT_CONFIG)
        dqn_config.update({
            "prioritized_replay": False,
            "double_q": True,
            "buffer_size": 50000,
            "dueling": False,
            "learning_starts": min(
                int((batch_size - 1) * (trace_length - 1)), 64),
            "model": {
                "dim": grid_size,
                "conv_filters": [[16, [3, 3], 1], [32, [3, 3], 1]],
                # [Channel, [Kernel, Kernel], Stride]]
                # "fcnet_hiddens": [self.env.NUM_ACTIONS],
                "max_seq_len": trace_length,
                # Number of hidden layers for fully connected net
                "fcnet_hiddens": [64],
                # Nonlinearity for fully connected net (tanh, relu)
                "fcnet_activation": "relu",
            },
            # Update the replay buffer with this many samples at once. Note that
            # this setting applies per-worker if num_workers > 1.
            "rollout_fragment_length": 1,
            # Size of a batch sampled from replay buffer for training. Note that
            # if async_updates is set, then each worker returns gradients for a
            # batch of this size.
            "train_batch_size": min(int(batch_size * trace_length), 64),
            "explore": False,
            "grad_clip": 1,
            "gamma": 0.5,
            "lr": exploiter_base_lr,
            # Learning rate schedule
            "lr_schedule": [
                (0, exploiter_base_lr / 1000),
                (100, exploiter_base_lr),
                (exploiter_decay_lr_in_n_epi, exploiter_base_lr / 1e9),
            ],
            "sgd_momentum": 0.9,
        })
        print("dqn_config", dqn_config)

        self.local_replay_buffer = LocalReplayBuffer(
            num_shards=1,
            learning_starts=dqn_config["learning_starts"],
            buffer_size=dqn_config["buffer_size"],
            replay_batch_size=dqn_config["train_batch_size"],
            replay_mode=dqn_config["multiagent"]["replay_mode"],
            replay_sequence_length=dqn_config["replay_sequence_length"],
        )

        # self.dqn_exploiter = DQNTFPolicy(obs_space=self.env.OBSERVATION_SPACE,
        #                                  action_space=self.env.ACTION_SPACE,
        #                                  config=dqn_config)

        def sgd_optimizer_dqn(policy, config) -> "torch.optim.Optimizer":
            return torch.optim.SGD(
                policy.q_func_vars,
                lr=policy.cur_lr,
                momentum=config["sgd_momentum"],
            )

        MyDQNTorchPolicy = DQNTorchPolicy.with_updates(
            optimizer_fn=sgd_optimizer_dqn)
        self.dqn_policy = MyDQNTorchPolicy(
            obs_space=env.OBSERVATION_SPACE,
            action_space=env.ACTION_SPACE,
            config=dqn_config,
        )

        self.multi_agent_batch_builders = [
            MultiAgentSampleBatchBuilder(
                policy_map={"player_blue": self.dqn_policy},
                clip_rewards=False,
                callbacks=DefaultCallbacks(),
            )
            # for _ in range(self.batch_size)
        ]

    def compute_actions(self, obs_batch):
        # Delegate to the underlying DQN policy; returns a tuple of
        # (actions, rnn state outs, extra fetches dict).
        return self.dqn_policy.compute_actions(obs_batch=obs_batch)

    def add_data_in_rllib_batch_builder(self, s, s1P, trainBatch1, d,
                                        timestep):
        if timestep <= self.stop_training_after_n_epi:
            # for i in range(self.batch_size):
            i = 0
            step_player_values = {
                "eps_id": timestep,
                "obs": s[i],
                "new_obs": s1P[i],
                "actions": trainBatch1[1][-1][i],
                "prev_actions": (trainBatch1[1][-2][i]
                                 if len(trainBatch1[1]) > 1 else 0),
                "rewards": trainBatch1[2][-1][i],
                "prev_rewards": (trainBatch1[2][-2][i]
                                 if len(trainBatch1[2]) > 1 else 0),
                # done is the same for every episode in the batch.
                "dones": d[0],
            }
            self.multi_agent_batch_builders[i].add_values(
                agent_id="player_blue",
                policy_id="player_blue",
                **step_player_values,
            )
            self.multi_agent_batch_builders[i].count += 1

    def train_dqn_policy(self, timestep):
        stats = {"learner_stats": {}}
        if timestep <= self.stop_training_after_n_epi:
            # Add episodes in replay buffer
            # for i in range(self.batch_size):
            i = 0
            multiagent_batch = self.multi_agent_batch_builders[
                i].build_and_reset()
            self.local_replay_buffer.add_batch(multiagent_batch)

            # Update the learning rate in the scheduler and in the optimizer.
            self.dqn_policy.on_global_var_update({"timestep": timestep})
            self.dqn_policy.optimizer()
            if hasattr(self.dqn_policy, "cur_lr"):
                for opt in self.dqn_policy._optimizers:
                    for p in opt.param_groups:
                        p["lr"] = self.dqn_policy.cur_lr
            # Generate training batch and train
            for _ in range(self.train_exploiter_n_times_per_epi):
                replay_batch = self.local_replay_buffer.replay()
                # replay() returns None while the buffer does not yet hold
                # enough steps (learning_starts).
                if replay_batch is not None:
                    stats = self.dqn_policy.learn_on_batch(
                        replay_batch.policy_batches["player_blue"])

        stats["learner_stats"]["exploiter_lr_cur"] = self.dqn_policy.cur_lr
        for j, opt in enumerate(self.dqn_policy._optimizers):
            stats["learner_stats"]["exploiter_lr_from_params"] = [
                p["lr"] for p in opt.param_groups
            ][0]
        return stats
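
The lr_schedule entry in the exploiter config above is a list of
(timestep, learning_rate) breakpoints. As a hedged illustration of how such a
piecewise schedule is typically evaluated (linear interpolation between
breakpoints, clamped to the first/last value outside of them), here is a small
self-contained sketch; it is not RLlib's own scheduler, and the numbers in the
usage comments are made up:

def piecewise_lr(schedule, t):
    """Evaluate a [(timestep, lr), ...] schedule at timestep t."""
    if t <= schedule[0][0]:
        return schedule[0][1]
    for (t0, lr0), (t1, lr1) in zip(schedule, schedule[1:]):
        if t0 <= t <= t1:
            return lr0 + (t - t0) / float(t1 - t0) * (lr1 - lr0)
    return schedule[-1][1]

# Example: exploiter_base_lr=0.1, exploiter_decay_lr_in_n_epi=2000.
# piecewise_lr([(0, 1e-4), (100, 0.1), (2000, 1e-10)], 50)   -> ~0.05
# piecewise_lr([(0, 1e-4), (100, 0.1), (2000, 1e-10)], 5000) -> 1e-10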
Example #16
def custom_training_workflow_ppo_ddpg(workers: WorkerSet, config: dict):
    local_replay_buffer = LocalReplayBuffer(num_shards=1,
                                            learning_starts=1000,
                                            buffer_size=50000,
                                            replay_batch_size=64)

    def add_ppo_metrics(batch):
        print("PPO policy learning on samples from",
              batch.policy_batches.keys(), "env steps", batch.env_steps(),
              "agent steps", batch.env_steps())
        metrics = _get_shared_metrics()
        metrics.counters["agent_steps_trained_PPO"] += batch.env_steps()
        return batch

    def add_ddpg_metrics(batch):
        print("DDPG policy learning on samples from",
              batch.policy_batches.keys(), "env steps", batch.env_steps(),
              "agent steps", batch.env_steps())
        metrics = _get_shared_metrics()
        metrics.counters["agent_steps_trained_DDPG"] += batch.env_steps()
        return batch

    # Generate common experiences.
    rollouts = ParallelRollouts(workers, mode="bulk_sync")
    r1, r2 = rollouts.duplicate(n=2)

    # PPO sub-flow.
    ppo_train_op = r2.for_each(SelectExperiences(["PPO_policy"])) \
        .combine(ConcatBatches(
            min_batch_size=200)) \
        .for_each(add_ppo_metrics) \
        .for_each(StandardizeFields(["advantages"])) \
        .for_each(TrainOneStep(
            workers,
            policies=["PPO_policy"],
            num_sgd_iter=10,
            sgd_minibatch_size=128))

    # DDPG sub-flow.
    ddpg_train_op = r1.for_each(SelectExperiences(["DDPG_policy"])) \
        .combine(ConcatBatches(
            min_batch_size=200)) \
        .for_each(add_ddpg_metrics) \
        .for_each(StandardizeFields(["advantages"])) \
        .for_each(TrainOneStep(
            workers,
            policies=["DDPG_policy"],
            num_sgd_iter=10,
            sgd_minibatch_size=128))

    # Combined training flow
    train_op = Concurrently([ppo_train_op, ddpg_train_op],
                            mode="async",
                            output_indexes=[1])

    return StandardMetricsReporting(train_op, workers, config)
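
Note that SelectExperiences(["PPO_policy"]) and SelectExperiences(["DDPG_policy"])
above only pass batches through when the trainer's multiagent config defines
policies under exactly those IDs. Below is a minimal sketch of such a config in
the spirit of the commented-out driver further down; the spaces, the mapping
function and the PPO/DDPG import paths are assumptions, not part of the
original snippet:

import gym

from ray.rllib.agents.ddpg.ddpg import DEFAULT_CONFIG as DDPG_CONFIG
from ray.rllib.agents.ddpg.ddpg_torch_policy import DDPGTorchPolicy
from ray.rllib.agents.ppo.ppo import DEFAULT_CONFIG as PPO_CONFIG
from ray.rllib.agents.ppo.ppo_torch_policy import PPOTorchPolicy

# Hypothetical spaces; DDPG requires a continuous action space.
obs_space = gym.spaces.Box(-1.0, 1.0, (4,))
act_space = gym.spaces.Box(-1.0, 1.0, (2,))

policies = {
    "PPO_policy": (PPOTorchPolicy, obs_space, act_space, PPO_CONFIG),
    "DDPG_policy": (DDPGTorchPolicy, obs_space, act_space, DDPG_CONFIG),
}

config = {
    "multiagent": {
        "policies": policies,
        # Hypothetical mapping: even agent ids use PPO, odd ones use DDPG.
        "policy_mapping_fn": lambda agent_id: ("PPO_policy"
                                               if agent_id % 2 == 0 else
                                               "DDPG_policy"),
        "policies_to_train": ["PPO_policy", "DDPG_policy"],
    },
}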


# if __name__ == "__main__":
#     args = parser.parse_args()
#     assert not (args.torch and args.mixed_torch_tf),\
#         "Use either --torch or --mixed-torch-tf, not both!"

#     ray.init()

#     # Simple environment with 4 independent cartpole entities
#     register_env("multi_agent_cartpole",
#                  lambda _: MultiAgentCartPole({"num_agents": 4}))
#     single_env = gym.make("CartPole-v0")
#     obs_space = single_env.observation_space
#     act_space = single_env.action_space

#     # Note that since the trainer below does not include a default policy or
#     # policy configs, we have to explicitly set it in the multiagent config:
#     policies = {
#         "ppo_policy": (PPOTorchPolicy if args.torch or args.mixed_torch_tf else
#                        PPOTFPolicy, obs_space, act_space, PPO_CONFIG),
#         "dqn_policy": (DQNTorchPolicy if args.torch else DQNTFPolicy,
#                        obs_space, act_space, DQN_CONFIG),
#     }

#     def policy_mapping_fn(agent_id):
#         if agent_id % 2 == 0:
#             return "ppo_policy"
#         else:
#             return "dqn_policy"

#     MyTrainer = build_trainer(
#         name="PPO_DQN_MultiAgent",
#         default_policy=None,
#         execution_plan=custom_training_workflow)

#     config = {
#         "rollout_fragment_length": 50,
#         "num_workers": 0,
#         "env": "multi_agent_cartpole",
#         "multiagent": {
#             "policies": policies,
#             "policy_mapping_fn": policy_mapping_fn,
#             "policies_to_train": ["dqn_policy", "ppo_policy"],
#         },
#         # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
#         "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
#         "framework": "torch" if args.torch else "tf",
#         "_use_trajectory_view_api": True,
#     }

#     stop = {
#         "training_iteration": args.stop_iters,
#         "timesteps_total": args.stop_timesteps,
#         "episode_reward_mean": args.stop_reward,
#     }

#     results = tune.run(MyTrainer, config=config, stop=stop)

#     if args.as_test:
#         check_learning_achieved(results, args.stop_reward)

#     ray.shutdown()
Example #17
def execution_plan(workers, config):
    if config.get("prioritized_replay"):
        prio_args = {
            "prioritized_replay_alpha": config["prioritized_replay_alpha"],
            "prioritized_replay_beta": config["prioritized_replay_beta"],
            "prioritized_replay_eps": config["prioritized_replay_eps"],
        }
    else:
        prio_args = {}

    local_replay_buffer = LocalReplayBuffer(
        num_shards=1,
        learning_starts=config["learning_starts"],
        buffer_size=config["buffer_size"],
        replay_batch_size=config["train_batch_size"],
        multiagent_sync_replay=config.get("multiagent_sync_replay"),
        **prio_args)

    rollouts = ParallelRollouts(workers, mode="bulk_sync")

    # We execute the following steps concurrently:
    # (1) Generate rollouts and store them in our local replay buffer. Calling
    # next() on store_op drives this.
    store_op = rollouts.for_each(
        StoreToReplayBuffer(local_buffer=local_replay_buffer))

    def update_prio(item):
        samples, info_dict = item
        if config.get("prioritized_replay"):
            prio_dict = {}
            for policy_id, info in info_dict.items():
                # TODO(sven): This is currently structured differently for
                #  torch/tf. Clean up these results/info dicts across
                #  policies (note: fixing this in torch_policy.py will
                #  break e.g. DDPPO!).
                td_error = info.get("td_error",
                                    info[LEARNER_STATS_KEY].get("td_error"))
                prio_dict[policy_id] = (samples.policy_batches[policy_id].data.
                                        get("batch_indexes"), td_error)
            local_replay_buffer.update_priorities(prio_dict)
        return info_dict

    # (2) Read and train on experiences from the replay buffer. Every batch
    # returned from the LocalReplay() iterator is passed to TrainOneStep to
    # take a SGD step, and then we decide whether to update the target network.
    post_fn = config.get("before_learn_on_batch") or (lambda b, *a: b)
    replay_op = Replay(local_buffer=local_replay_buffer) \
        .for_each(lambda x: post_fn(x, workers, config)) \
        .for_each(TrainOneStep(workers)) \
        .for_each(update_prio) \
        .for_each(UpdateTargetNetwork(
            workers, config["target_network_update_freq"]))

    # Alternate deterministically between (1) and (2). Only return the output
    # of (2) since training metrics are not available until (2) runs.
    train_op = Concurrently([store_op, replay_op],
                            mode="round_robin",
                            output_indexes=[1],
                            round_robin_weights=calculate_rr_weights(config))

    return StandardMetricsReporting(train_op, workers, config)
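
calculate_rr_weights(config) above decides how many times the round-robin
executor pulls from the storage op (1) relative to the replay/training op (2).
A hedged sketch of the idea, assuming a "training_intensity"-style config key
(the actual RLlib helper may differ in detail):

def calculate_rr_weights_sketch(config):
    """Illustrative only: weight op (2) so that roughly `training_intensity`
    steps are trained for every env step sampled."""
    if not config.get("training_intensity"):
        return [1, 1]
    # One replay/training pass consumes train_batch_size steps for every
    # rollout_fragment_length steps that op (1) stores.
    native_ratio = (config["train_batch_size"] /
                    config["rollout_fragment_length"])
    return [1, config["training_intensity"] / native_ratio]

# e.g. train_batch_size=32, rollout_fragment_length=4, training_intensity=16
# -> [1, 2]: pull from the training op twice per storage step.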
Example #18
    def test_sac_loss_function(self):
        """Tests SAC loss function results across all frameworks."""
        config = sac.DEFAULT_CONFIG.copy()
        # Run locally.
        config["num_workers"] = 0
        config["learning_starts"] = 0
        config["twin_q"] = False
        config["gamma"] = 0.99
        # Switch on deterministic loss so we can compare the loss values.
        config["_deterministic_loss"] = True
        # Use very simple nets.
        config["Q_model"]["fcnet_hiddens"] = [10]
        config["policy_model"]["fcnet_hiddens"] = [10]
        # Make sure, timing differences do not affect trainer.train().
        config["min_iter_time_s"] = 0

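        # Maps tf variable names to the matching torch state_dict keys, so the
        # tf-initialized weights can later be loaded into the torch model and
        # target model (via _translate_weights_to_torch below).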
        map_ = {
            # Normal net.
            "default_policy/sequential/action_1/kernel": "action_model."
            "action_0._model.0.weight",
            "default_policy/sequential/action_1/bias": "action_model."
            "action_0._model.0.bias",
            "default_policy/sequential/action_out/kernel": "action_model."
            "action_out._model.0.weight",
            "default_policy/sequential/action_out/bias": "action_model."
            "action_out._model.0.bias",
            "default_policy/sequential_1/q_hidden_0/kernel": "q_net."
            "q_hidden_0._model.0.weight",
            "default_policy/sequential_1/q_hidden_0/bias": "q_net."
            "q_hidden_0._model.0.bias",
            "default_policy/sequential_1/q_out/kernel": "q_net."
            "q_out._model.0.weight",
            "default_policy/sequential_1/q_out/bias": "q_net."
            "q_out._model.0.bias",
            "default_policy/value_out/kernel": "_value_branch."
            "_model.0.weight",
            "default_policy/value_out/bias": "_value_branch."
            "_model.0.bias",
            # Target net.
            "default_policy/sequential_2/action_1/kernel": "action_model."
            "action_0._model.0.weight",
            "default_policy/sequential_2/action_1/bias": "action_model."
            "action_0._model.0.bias",
            "default_policy/sequential_2/action_out/kernel": "action_model."
            "action_out._model.0.weight",
            "default_policy/sequential_2/action_out/bias": "action_model."
            "action_out._model.0.bias",
            "default_policy/sequential_3/q_hidden_0/kernel": "q_net."
            "q_hidden_0._model.0.weight",
            "default_policy/sequential_3/q_hidden_0/bias": "q_net."
            "q_hidden_0._model.0.bias",
            "default_policy/sequential_3/q_out/kernel": "q_net."
            "q_out._model.0.weight",
            "default_policy/sequential_3/q_out/bias": "q_net."
            "q_out._model.0.bias",
            "default_policy/value_out_1/kernel": "_value_branch."
            "_model.0.weight",
            "default_policy/value_out_1/bias": "_value_branch."
            "_model.0.bias",
        }

        env = SimpleEnv
        batch_size = 100
        if env is SimpleEnv:
            obs_size = (batch_size, 1)
            actions = np.random.random(size=(batch_size, 1))
        elif env == "CartPole-v0":
            obs_size = (batch_size, 4)
            actions = np.random.randint(0, 2, size=(batch_size, ))
        else:
            obs_size = (batch_size, 3)
            actions = np.random.random(size=(batch_size, 1))

        # Batch of size=n.
        input_ = self._get_batch_helper(obs_size, actions, batch_size)

        # Simply compare loss values AND grads of all frameworks with each
        # other.
        prev_fw_loss = weights_dict = None
        expect_c, expect_a, expect_e, expect_t = None, None, None, None
        # History of tf-updated NN-weights over n training steps.
        tf_updated_weights = []
        # History of input batches used.
        tf_inputs = []
        for fw, sess in framework_iterator(config,
                                           frameworks=("tf", "torch"),
                                           session=True):
            # Generate Trainer and get its default Policy object.
            trainer = sac.SACTrainer(config=config, env=env)
            policy = trainer.get_policy()
            p_sess = None
            if sess:
                p_sess = policy.get_session()

            # Set all weights (of all nets) to fixed values.
            if weights_dict is None:
                assert fw == "tf"  # Start with the tf vars-dict.
                weights_dict = policy.get_weights()
            else:
                assert fw == "torch"  # Then transfer that to torch Model.
                model_dict = self._translate_weights_to_torch(
                    weights_dict, map_)
                policy.model.load_state_dict(model_dict)
                policy.target_model.load_state_dict(model_dict)

            if fw == "tf":
                log_alpha = weights_dict["default_policy/log_alpha"]
            elif fw == "torch":
                # Actually convert to torch tensors.
                input_ = policy._lazy_tensor_dict(input_)
                input_ = {k: input_[k] for k in input_.keys()}
                log_alpha = policy.model.log_alpha.detach().numpy()[0]

            # Only run the expectation once, should be the same anyways
            # for all frameworks.
            if expect_c is None:
                expect_c, expect_a, expect_e, expect_t = \
                    self._sac_loss_helper(input_, weights_dict,
                                          sorted(weights_dict.keys()),
                                          log_alpha, fw,
                                          gamma=config["gamma"], sess=sess)

            # Get actual outs and compare to expectation AND previous
            # framework. c=critic, a=actor, e=entropy, t=td-error.
            if fw == "tf":
                c, a, e, t, tf_c_grads, tf_a_grads, tf_e_grads = \
                    p_sess.run([
                        policy.critic_loss,
                        policy.actor_loss,
                        policy.alpha_loss,
                        policy.td_error,
                        policy.optimizer().compute_gradients(
                            policy.critic_loss[0],
                            policy.model.q_variables()),
                        policy.optimizer().compute_gradients(
                            policy.actor_loss,
                            policy.model.policy_variables()),
                        policy.optimizer().compute_gradients(
                            policy.alpha_loss, policy.model.log_alpha)],
                        feed_dict=policy._get_loss_inputs_dict(
                            input_, shuffle=False))
                tf_c_grads = [g for g, v in tf_c_grads]
                tf_a_grads = [g for g, v in tf_a_grads]
                tf_e_grads = [g for g, v in tf_e_grads]

            elif fw == "torch":
                loss_torch(policy, policy.model, None, input_)
                c, a, e, t = policy.critic_loss, policy.actor_loss, \
                    policy.alpha_loss, policy.td_error

                # Test actor gradients.
                policy.actor_optim.zero_grad()
                assert all(v.grad is None for v in policy.model.q_variables())
                assert all(v.grad is None
                           for v in policy.model.policy_variables())
                assert policy.model.log_alpha.grad is None
                a.backward()
                # `actor_loss` depends on Q-net vars (but these grads must
                # be ignored and overridden in critic_loss.backward!).
                assert not any(v.grad is None
                               for v in policy.model.q_variables())
                assert not all(
                    torch.mean(v.grad) == 0
                    for v in policy.model.policy_variables())
                assert not all(
                    torch.min(v.grad) == 0
                    for v in policy.model.policy_variables())
                assert policy.model.log_alpha.grad is None
                # Compare with tf ones.
                torch_a_grads = [
                    v.grad for v in policy.model.policy_variables()
                ]
                for tf_g, torch_g in zip(tf_a_grads, torch_a_grads):
                    if tf_g.shape != torch_g.shape:
                        check(tf_g, np.transpose(torch_g))
                    else:
                        check(tf_g, torch_g)

                # Test critic gradients.
                policy.critic_optims[0].zero_grad()
                assert all(
                    torch.mean(v.grad) == 0.0
                    for v in policy.model.q_variables())
                assert all(
                    torch.min(v.grad) == 0.0
                    for v in policy.model.q_variables())
                assert policy.model.log_alpha.grad is None
                c[0].backward()
                assert not all(
                    torch.mean(v.grad) == 0
                    for v in policy.model.q_variables())
                assert not all(
                    torch.min(v.grad) == 0 for v in policy.model.q_variables())
                assert policy.model.log_alpha.grad is None
                # Compare with tf ones.
                torch_c_grads = [v.grad for v in policy.model.q_variables()]
                for tf_g, torch_g in zip(tf_c_grads, torch_c_grads):
                    if tf_g.shape != torch_g.shape:
                        check(tf_g, np.transpose(torch_g))
                    else:
                        check(tf_g, torch_g)
                # Compare (unchanged(!) actor grads) with tf ones.
                torch_a_grads = [
                    v.grad for v in policy.model.policy_variables()
                ]
                for tf_g, torch_g in zip(tf_a_grads, torch_a_grads):
                    if tf_g.shape != torch_g.shape:
                        check(tf_g, np.transpose(torch_g))
                    else:
                        check(tf_g, torch_g)

                # Test alpha gradient.
                policy.alpha_optim.zero_grad()
                assert policy.model.log_alpha.grad is None
                e.backward()
                assert policy.model.log_alpha.grad is not None
                check(policy.model.log_alpha.grad, tf_e_grads)

            check(c, expect_c)
            check(a, expect_a)
            check(e, expect_e)
            check(t, expect_t)

            # Store this framework's losses in prev_fw_loss to compare with
            # next framework's outputs.
            if prev_fw_loss is not None:
                check(c, prev_fw_loss[0])
                check(a, prev_fw_loss[1])
                check(e, prev_fw_loss[2])
                check(t, prev_fw_loss[3])

            prev_fw_loss = (c, a, e, t)

            # Update weights from our batch (n times).
            for update_iteration in range(10):
                print("train iteration {}".format(update_iteration))
                if fw == "tf":
                    in_ = self._get_batch_helper(obs_size, actions, batch_size)
                    tf_inputs.append(in_)
                    # Set a fake-batch to use
                    # (instead of sampling from replay buffer).
                    buf = LocalReplayBuffer.get_instance_for_testing()
                    buf._fake_batch = in_
                    trainer.train()
                    updated_weights = policy.get_weights()
                    # Net must have changed.
                    if tf_updated_weights:
                        check(updated_weights[
                            "default_policy/sequential/action_1/kernel"],
                              tf_updated_weights[-1]
                              ["default_policy/sequential/action_1/kernel"],
                              false=True)
                    tf_updated_weights.append(updated_weights)

                # Compare with updated tf-weights. Must all be the same.
                else:
                    tf_weights = tf_updated_weights[update_iteration]
                    in_ = tf_inputs[update_iteration]
                    # Set a fake-batch to use
                    # (instead of sampling from replay buffer).
                    buf = LocalReplayBuffer.get_instance_for_testing()
                    buf._fake_batch = in_
                    trainer.train()
                    # Compare updated model.
                    for tf_key in sorted(tf_weights.keys())[2:10]:
                        tf_var = tf_weights[tf_key]
                        torch_var = policy.model.state_dict()[map_[tf_key]]
                        if tf_var.shape != torch_var.shape:
                            check(tf_var, np.transpose(torch_var), rtol=0.05)
                        else:
                            check(tf_var, torch_var, rtol=0.05)
                    # And alpha.
                    check(policy.model.log_alpha,
                          tf_weights["default_policy/log_alpha"])
                    # Compare target nets.
                    for tf_key in sorted(tf_weights.keys())[10:18]:
                        tf_var = tf_weights[tf_key]
                        torch_var = policy.target_model.state_dict()[
                            map_[tf_key]]
                        if tf_var.shape != torch_var.shape:
                            check(tf_var, np.transpose(torch_var), rtol=0.05)
                        else:
                            check(tf_var, torch_var, rtol=0.05)