def build_a3c_moa_trainer(moa_config):
    tf.keras.backend.set_floatx("float32")
    trainer_name = "MOAA3CTrainer"
    moa_config["use_gae"] = False

    a3c_tf_policy = build_tf_policy(
        name="A3CAuxTFPolicy",
        get_default_config=lambda: moa_config,
        loss_fn=actor_critic_loss,
        stats_fn=stats,
        grad_stats_fn=grad_stats,
        gradients_fn=clip_gradients,
        postprocess_fn=postprocess_a3c_moa,
        extra_action_fetches_fn=add_value_function_fetch,
        before_loss_init=setup_mixins,
        mixins=[ValueNetworkMixin, LearningRateSchedule] + get_moa_mixins(),
    )

    trainer = build_trainer(
        name=trainer_name,
        default_policy=a3c_tf_policy,
        default_config=moa_config,
        validate_config=validate_config,
    )

    return trainer
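# Usage sketch: the class returned above can be instantiated and trained
# directly. This assumes ray has been initialized and that `moa_config` is a
# complete MOA/A3C config dict with a valid "env" entry (hypothetical here).
import ray

ray.init(ignore_reinit_error=True)
MOAA3CTrainerCls = build_a3c_moa_trainer(moa_config)
trainer = MOAA3CTrainerCls(config=moa_config)
for _ in range(3):
    result = trainer.train()  # runs one training iteration
    print(result["episode_reward_mean"])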
Example #2
def get_moa_vtrace_policy():
    moa_vtrace_policy = build_tf_policy(
        name="MOAVTracePolicy",
        get_default_config=lambda: MOA_CONFIG,
        loss_fn=build_vtrace_loss,
        stats_fn=moa_stats,
        grad_stats_fn=grad_stats,
        postprocess_fn=postprocess_trajectory,
        optimizer_fn=choose_optimizer,
        gradients_fn=clip_gradients,
        extra_action_fetches_fn=add_behaviour_logits,
        before_init=validate_config_policy,
        before_loss_init=setup_mixins,
        mixins=[LearningRateSchedule, EntropyCoeffSchedule] + get_moa_mixins(),
        get_batch_divisibility_req=lambda p: p.config["rollout_fragment_length"],
    )
    return moa_vtrace_policy
def build_ppo_moa_trainer(moa_config):
    """
    Creates a MOA+PPO policy class, then creates a trainer with this policy.
    :param moa_config: The configuration dictionary.
    :return: A new MOA+PPO trainer.
    """
    tf.keras.backend.set_floatx("float32")

    trainer_name = "MOAPPOTrainer"

    moa_ppo_policy = build_tf_policy(
        name="MOAPPOTFPolicy",
        get_default_config=lambda: moa_config,
        loss_fn=loss_with_moa,
        make_model=build_model,
        stats_fn=extra_moa_stats,
        extra_action_fetches_fn=extra_moa_fetches,
        postprocess_fn=postprocess_ppo_moa,
        gradients_fn=clip_gradients,
        before_init=setup_config,
        before_loss_init=setup_ppo_moa_mixins,
        mixins=[
            LearningRateSchedule, EntropyCoeffSchedule, KLCoeffMixin,
            ValueNetworkMixin
        ] + get_moa_mixins(),
    )

    moa_ppo_trainer = build_trainer(
        name=trainer_name,
        default_policy=moa_ppo_policy,
        make_policy_optimizer=choose_policy_optimizer,
        default_config=moa_config,
        validate_config=validate_ppo_moa_config,
        after_optimizer_step=update_kl,
        after_train_result=warn_about_bad_reward_scales,
        mixins=[MOAResetConfigMixin],
    )

    return moa_ppo_trainer
Example #4
        return penalty

    logits, _ = model.from_batch(train_batch)
    action_dist = dist_class(logits, model)

    actions = train_batch[SampleBatch.ACTIONS]
    rewards = train_batch[SampleBatch.REWARDS]
    penalty = tf.py_function(compute_penalty, [actions, rewards],
                             Tout=tf.float32)

    return penalty - tf.reduce_mean(action_dist.logp(actions) * rewards)


# <class 'ray.rllib.policy.tf_policy_template.MyTFPolicy'>
MyTFPolicy = build_tf_policy(
    name="MyTFPolicy",
    loss_fn=policy_gradient_loss,
)

# <class 'ray.rllib.agents.trainer_template.MyCustomTrainer'>
MyTrainer = build_trainer(
    name="MyCustomTrainer",
    default_policy=MyTFPolicy,
)

if __name__ == "__main__":
    ray.init()
    args = parser.parse_args()
    ModelCatalog.register_custom_model("eager_model", EagerModel)

    config = {
        "env": "CartPole-v0",
Example #5
    elif isinstance(action_space,
                    (Box, Simplex)) and len(action_space.shape) > 1:
        raise UnsupportedSpaceException(
            "Action space ({}) of {} has multiple dimensions "
            "{}. ".format(action_space, policy, action_space.shape) +
            "Consider reshaping this into a single dimension, "
            "using a Tuple action space, or the multi-agent API.")


# Build a child class of `DynamicTFPolicy`, given the custom functions defined
# above.
SACTFPolicy = build_tf_policy(
    name="SACTFPolicy",
    get_default_config=lambda: ray.rllib.agents.sac.sac.DEFAULT_CONFIG,
    make_model=build_sac_model,
    postprocess_fn=postprocess_trajectory,
    action_distribution_fn=get_distribution_inputs_and_class,
    loss_fn=sac_actor_critic_loss,
    stats_fn=stats,
    compute_gradients_fn=compute_and_clip_gradients,
    apply_gradients_fn=apply_gradients,
    extra_learn_fetches_fn=lambda policy: {"td_error": policy.td_error},
    mixins=[
        TargetNetworkMixin, ActorCriticOptimizerMixin, ComputeTDErrorMixin
    ],
    validate_spaces=validate_spaces,
    before_init=setup_early_mixins,
    before_loss_init=setup_mid_mixins,
    after_init=setup_late_mixins,
)
Example #6
            "DDPG.".format(action_space, policy))
    elif len(action_space.shape) > 1:
        raise UnsupportedSpaceException(
            "Action space ({}) of {} has multiple dimensions "
            "{}. ".format(action_space, policy, action_space.shape) +
            "Consider reshaping this into a single dimension, "
            "using a Tuple action space, or the multi-agent API.")


DDPGTFPolicy = build_tf_policy(
    name="DDPGTFPolicy",
    get_default_config=lambda: ray.rllib.agents.ddpg.ddpg.DEFAULT_CONFIG,
    make_model=build_ddpg_models,
    action_distribution_fn=get_distribution_inputs_and_class,
    loss_fn=ddpg_actor_critic_loss,
    stats_fn=build_ddpg_stats,
    postprocess_fn=postprocess_nstep_and_prio,
    compute_gradients_fn=gradients_fn,
    apply_gradients_fn=build_apply_op,
    extra_learn_fetches_fn=lambda policy: {"td_error": policy.td_error},
    validate_spaces=validate_spaces,
    before_init=setup_early_mixins,
    before_loss_init=setup_mid_mixins,
    after_init=setup_late_mixins,
    mixins=[
        TargetNetworkMixin,
        ActorCriticOptimizerMixin,
        ComputeTDErrorMixin,
    ],
)
Example #7
        self._value = value


def setup_config(policy, obs_space, action_space, config):
    # Automatically set the model option for value function layer sharing.
    config["model"]["vf_share_layers"] = config["vf_share_layers"]


def setup_mixins(policy, obs_space, action_space, config):
    ValueNetworkMixin.__init__(policy, obs_space, action_space, config)
    KLCoeffMixin.__init__(policy, config)
    EntropyCoeffSchedule.__init__(policy, config["entropy_coeff"],
                                  config["entropy_coeff_schedule"])
    LearningRateSchedule.__init__(policy, config["lr"], config["lr_schedule"])


PPOTFPolicy = build_tf_policy(
    name="PPOTFPolicy",
    get_default_config=lambda: ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG,
    loss_fn=ppo_surrogate_loss,
    stats_fn=kl_and_loss_stats,
    extra_action_fetches_fn=vf_preds_fetches,
    postprocess_fn=postprocess_ppo_gae,
    gradients_fn=clip_gradients,
    before_init=setup_config,
    before_loss_init=setup_mixins,
    mixins=[
        LearningRateSchedule, EntropyCoeffSchedule, KLCoeffMixin,
        ValueNetworkMixin
    ])
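# Sketch: classes produced by `build_tf_policy` also expose a `with_updates()`
# helper that rebuilds the class with selected arguments overridden. The
# customized stats function below is hypothetical.
def custom_kl_and_loss_stats(policy, train_batch):
    # Reuse the stock PPO stats and additionally report the entropy coeff.
    stats = kl_and_loss_stats(policy, train_batch)
    stats["cur_entropy_coeff"] = policy.entropy_coeff
    return stats


CustomPPOTFPolicy = PPOTFPolicy.with_updates(
    name="CustomPPOTFPolicy",
    stats_fn=custom_kl_and_loss_stats,
)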
Example #8
        return td_err


def setup_early_mixins(policy, obs_space, action_space, config):
    ExplorationStateMixin.__init__(policy, obs_space, action_space, config)
    ActorCriticOptimizerMixin.__init__(policy, config)


def setup_late_mixins(policy, obs_space, action_space, config):
    TargetNetworkMixin.__init__(policy, config)


SACTFPolicy = build_tf_policy(
    name="SACTFPolicy",
    get_default_config=lambda: ray.rllib.agents.sac.sac.DEFAULT_CONFIG,
    make_model=build_sac_model,
    postprocess_fn=postprocess_trajectory,
    extra_action_feed_fn=exploration_setting_inputs,
    action_sampler_fn=build_action_output,
    loss_fn=actor_critic_loss,
    stats_fn=stats,
    gradients_fn=gradients,
    extra_learn_fetches_fn=lambda policy: {"td_error": policy.td_error},
    mixins=[
        TargetNetworkMixin, ExplorationStateMixin, ActorCriticOptimizerMixin,
        ComputeTDErrorMixin
    ],
    before_init=setup_early_mixins,
    after_init=setup_late_mixins,
    obs_include_prev_action_reward=False)
Example #9
                       config: TrainerConfigDict) -> None:
    LearningRateSchedule.__init__(policy, config["lr"], config["lr_schedule"])


def before_loss_init(policy: Policy, obs_space: gym.spaces.Space,
                     action_space: gym.spaces.Space,
                     config: TrainerConfigDict) -> None:
    ComputeTDErrorMixin.__init__(policy)
    TargetNetworkMixin.__init__(policy, obs_space, action_space, config)


R2D2TFPolicy = build_tf_policy(
    name="R2D2TFPolicy",
    loss_fn=r2d2_loss,
    get_default_config=lambda: ray.rllib.agents.dqn.r2d2.DEFAULT_CONFIG,
    postprocess_fn=postprocess_nstep_and_prio,
    stats_fn=build_q_stats,
    make_model=build_r2d2_model,
    action_distribution_fn=get_distribution_inputs_and_class,
    optimizer_fn=adam_optimizer,
    extra_action_out_fn=lambda policy: {"q_values": policy.q_values},
    compute_gradients_fn=clip_gradients,
    extra_learn_fetches_fn=lambda policy: {"td_error": policy._td_error},
    before_init=setup_early_mixins,
    before_loss_init=before_loss_init,
    mixins=[
        TargetNetworkMixin,
        ComputeTDErrorMixin,
        LearningRateSchedule,
    ])
Example #10
    return {
        "policy_loss": policy.loss.p_loss,
        "vf_loss": policy.loss.v_loss,
        "total_loss": policy.loss.total_loss,
        "vf_explained_var": policy.loss.explained_variance,
    }


def setup_mixins(policy: Policy, obs_space: gym.spaces.Space,
                 action_space: gym.spaces.Space,
                 config: TrainerConfigDict) -> None:
    ValueNetworkMixin.__init__(policy, obs_space, action_space, config)
    # Set up a tf-var for the moving avg (do this here to make it work with
    # eager mode); "c^2" in the paper.
    policy._moving_average_sqd_adv_norm = get_variable(
        100.0,
        framework="tf",
        tf_name="moving_average_of_advantage_norm",
        trainable=False)


MARWILTFPolicy = build_tf_policy(
    name="MARWILTFPolicy",
    get_default_config=lambda: ray.rllib.agents.marwil.marwil.DEFAULT_CONFIG,
    loss_fn=marwil_loss,
    stats_fn=stats,
    postprocess_fn=postprocess_advantages,
    before_loss_init=setup_mixins,
    compute_gradients_fn=compute_and_clip_gradients,
    mixins=[ValueNetworkMixin])
Example #11
                                              config["epsilon"])


def clip_gradients(policy, optimizer, loss):
    grads_and_vars = optimizer.compute_gradients(
        loss, policy.model.trainable_variables())
    grads = [g for (g, v) in grads_and_vars]
    policy.grads, _ = tf.clip_by_global_norm(grads, policy.config["grad_clip"])
    clipped_grads = list(zip(policy.grads, policy.model.trainable_variables()))
    return clipped_grads


def setup_mixins(policy, obs_space, action_space, config):
    LearningRateSchedule.__init__(policy, config["lr"], config["lr_schedule"])
    EntropyCoeffSchedule.__init__(policy, config["entropy_coeff"],
                                  config["entropy_coeff_schedule"])


VTraceTFPolicy = build_tf_policy(
    name="VTraceTFPolicy",
    get_default_config=lambda: ray.rllib.agents.impala.impala.DEFAULT_CONFIG,
    loss_fn=build_vtrace_loss,
    stats_fn=stats,
    grad_stats_fn=grad_stats,
    postprocess_fn=postprocess_trajectory,
    optimizer_fn=choose_optimizer,
    gradients_fn=clip_gradients,
    before_loss_init=setup_mixins,
    mixins=[LearningRateSchedule, EntropyCoeffSchedule],
    get_batch_divisibility_req=lambda p: p.config["rollout_fragment_length"])
Example #12
                model_cls = DiscreteLinearModelThompsonSampling
        elif exploration_config["type"] == "UpperConfidenceBound":
            if isinstance(original_space, spaces.Dict):
                assert (
                    "item" in original_space.spaces
                ), "Cannot find 'item' key in observation space"
                model_cls = ParametricLinearModelUCB
            else:
                model_cls = DiscreteLinearModelUCB

    model = model_cls(
        obs_space, action_space, logit_dim, config["model"], name="LinearModel"
    )
    return model


def after_init(policy, *args):
    policy.regrets = []
    BanditPolicyOverrides.__init__(policy)


BanditTFPolicy = build_tf_policy(
    name="BanditTFPolicy",
    get_default_config=lambda: ray.rllib.algorithms.bandit.bandit.DEFAULT_CONFIG,
    validate_spaces=validate_spaces,
    make_model=make_model,
    loss_fn=None,
    mixins=[BanditPolicyOverrides],
    after_init=after_init,
)
Example #13
from benchmark.networks.communicate import NetworkedMixin, postprocess_trajectory


def networked_pg_loss(policy, model, dist_class, train_batch):
    # Access these keys so they are registered as loss inputs.
    for k in train_batch.keys():
        if "var" in k or "gamma" in k:
            _ = train_batch[k].shape

    return pg_tf_loss(policy, model, dist_class, train_batch)


def setupmixin(policy, obs_space, action_space, config):
    NetworkedMixin.__init__(policy)


NetworkedPG = build_tf_policy(
    name="NetworkedPG",
    get_default_config=lambda: PG_DEFAULT_CONFIG,
    postprocess_fn=postprocess_trajectory,
    loss_fn=networked_pg_loss,
    mixins=[NetworkedMixin],
    after_init=setupmixin,
)


NetworkedPGTrainer = build_trainer(
    name="NetworkedPGTrainer",
    default_policy=NetworkedPG,
)
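# Usage sketch: a trainer assembled with `build_trainer` can be handed
# straight to Tune, mirroring the tune.run() pattern used further below.
# The environment and stopping criterion here are hypothetical.
import ray
from ray import tune

ray.init(ignore_reinit_error=True)
tune.run(
    NetworkedPGTrainer,
    config={"env": "CartPole-v0", "num_workers": 1},
    stop={"training_iteration": 10},
)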
Example #14
    if get_custom_option(policy, 'use_vf_adv'):
        vf_preds = trajectory[SampleBatch.VF_PREDS]
        rewards = (rewards - vf_preds)
    
    trajectory[Postprocessing.ADVANTAGES] = rewards.copy().astype(np.float32)
    return trajectory

def extra_action_fetches(policy):
    fetches = { SampleBatch.VF_PREDS: policy.model.value_function() }
    if has_method(policy.model, 'extra_compute_action_fetches'):
        model_fetches = policy.model.extra_compute_action_fetches()
        fetches.update(model_fetches)
    return fetches

def stats(policy, train_batch):
    stats = {
        'action_logp_min': tf.reduce_min(train_batch[ACTION_LOGP]),
        'action_logp_max': tf.reduce_max(train_batch[ACTION_LOGP]),
        'action_logp_mean': tf.reduce_mean(train_batch[ACTION_LOGP]),
    }
    return stats

PGPolicy = build_tf_policy(
    name='pg_policy',
    loss_fn=policy_gradient_loss,
    get_default_config=ConstantFunctor(DEFAULT_CONFIG),
    postprocess_fn=postprocess_sample_batch,
    extra_action_fetches_fn=extra_action_fetches,
    stats_fn=stats
)
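# `ConstantFunctor` is not defined in this snippet. Judging from its use as
# `get_default_config`, it is presumably a zero-argument callable that always
# returns the wrapped config; a minimal sketch under that assumption:
class ConstantFunctor:
    """Callable that always returns the value it was constructed with."""

    def __init__(self, value):
        self._value = value

    def __call__(self):
        return self._value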
Example #15
    @override(TFPolicy)
    def variables(self):
        return self.model.variables() + self.target_model.variables()


def setup_late_mixins(policy, obs_space, action_space, config):
    TargetNetworkMixin.__init__(policy, config)


DDPGTFPolicy = build_tf_policy(
    name="DQNTFPolicy",
    get_default_config=lambda: ray.rllib.agents.ddpg.ddpg.DEFAULT_CONFIG,
    make_model=build_ddpg_models,
    action_distribution_fn=get_distribution_inputs_and_class,
    loss_fn=ddpg_actor_critic_loss,
    stats_fn=build_ddpg_stats,
    postprocess_fn=postprocess_nstep_and_prio,
    optimizer_fn=make_ddpg_optimizers,
    gradients_fn=gradients_fn,
    apply_gradients_fn=build_apply_op,
    extra_learn_fetches_fn=lambda policy: {"td_error": policy.td_error},
    before_init=before_init_fn,
    before_loss_init=setup_mid_mixins,
    after_init=setup_late_mixins,
    obs_include_prev_action_reward=False,
    mixins=[
        TargetNetworkMixin,
        ComputeTDErrorMixin,
    ])

Example #16
def compute_q_values(policy, model, obs, explore, is_training=None):
    model_out, _ = model(
        {
            SampleBatch.CUR_OBS: obs,
            "is_training": (is_training if is_training is not None else
                            policy._get_is_training_placeholder()),
        }, [], None)

    return model_out


def setup_late_mixins(policy, obs_space, action_space, config):
    TargetNetworkMixin.__init__(policy, obs_space, action_space, config)


SimpleQTFPolicy = build_tf_policy(
    name="SimpleQTFPolicy",
    get_default_config=lambda: ray.rllib.agents.dqn.dqn.DEFAULT_CONFIG,
    make_model=build_q_models,
    action_distribution_fn=get_distribution_inputs_and_class,
    loss_fn=build_q_losses,
    extra_action_fetches_fn=lambda policy: {"q_values": policy.q_values},
    extra_learn_fetches_fn=lambda policy: {"td_error": policy.td_error},
    after_init=setup_late_mixins,
    obs_include_prev_action_reward=False,
    mixins=[TargetNetworkMixin])
Example #17
        new_priorities = (np.abs(td_errors) +
                          policy.config["prioritized_replay_eps"])
        batch.data[PRIO_WEIGHTS] = new_priorities

    return batch


DQNTFPolicy = build_tf_policy(
    name="DQNTFPolicy",
    get_default_config=lambda: ray.rllib.agents.dqn.dqn.DEFAULT_CONFIG,
    make_model=build_q_model,
    action_sampler_fn=build_q_networks,
    loss_fn=build_q_losses,
    stats_fn=build_q_stats,
    postprocess_fn=postprocess_trajectory,
    optimizer_fn=adam_optimizer,
    gradients_fn=clip_gradients,
    extra_action_feed_fn=exploration_setting_inputs,
    extra_action_fetches_fn=lambda policy: {"q_values": policy.q_values},
    extra_learn_fetches_fn=lambda policy: {"td_error": policy.q_loss.td_error},
    update_ops_fn=lambda policy: policy.q_batchnorm_update_ops,
    before_init=setup_early_mixins,
    after_init=setup_late_mixins,
    obs_include_prev_action_reward=False,
    mixins=[
        ExplorationStateMixin,
        TargetNetworkMixin,
        ComputeTDErrorMixin,
        LearningRateSchedule,
    ])
Example #18
DEFAULT_CONFIG = with_common_config({
    "gamma": 0.95,
    "lambda": 1.0,  # if gae=true, work for it.
    "use_gae": False,
    "vf_loss_coeff": 0.5,
    "entropy_coeff": 0.01,
    "truncate_episodes": True,
    "use_critic": True,
    "grad_clip": 40.0,
    "lr": 0.0001,
    "min_iter_time_s": 5,
    "sample_async": True,
    "lr_schedule": None,
})

CA2CTFPolicy = build_tf_policy(
    name="CA2CTFPolicy",
    stats_fn=stats,
    grad_stats_fn=central_vf_stats,
    loss_fn=ac_loss_func,
    postprocess_fn=postprocess_trajectory,
    before_loss_init=setup_mixins,
    make_model=build_cac_model,
    mixins=[CentralizedValueMixin],
    get_default_config=lambda: DEFAULT_CONFIG,
)

CA2CTrainer = build_trainer(name="CA2C",
                            default_policy=CA2CTFPolicy,
                            default_config=DEFAULT_CONFIG)
Example #19
    return -tf.reduce_mean(
        action_dist.logp(train_batch["actions"]) * train_batch["returns"])


def calculate_advantages(policy,
                         sample_batch,
                         other_agent_batches=None,
                         episode=None):
    sample_batch["returns"] = discount_cumsum(sample_batch["rewards"], 0.99)
    return sample_batch


# <class 'ray.rllib.policy.tf_policy_template.MyTFPolicy'>
MyTFPolicy = build_tf_policy(
    name="MyTFPolicy",
    loss_fn=policy_gradient_loss,
    postprocess_fn=calculate_advantages,
)

# <class 'ray.rllib.agents.trainer_template.MyCustomTrainer'>
MyTrainer = build_trainer(
    name="MyCustomTrainer",
    default_policy=MyTFPolicy,
)

if __name__ == "__main__":
    args = parser.parse_args()
    ray.init(num_cpus=args.num_cpus or None)
    tune.run(
        MyTrainer,
        stop={"training_iteration": args.stop_iters},
Example #20
            return
        # Tf static graph -> Return grouped op.
        else:
            alpha_prime_apply_op = policy._alpha_prime_optimizer.apply_gradients(
                policy._alpha_prime_grads_and_vars,
                global_step=tf1.train.get_or_create_global_step(),
            )
            return tf.group([sac_results, alpha_prime_apply_op])
    return sac_results


# Build a child class of `TFPolicy`, given the custom functions defined
# above.
CQLTFPolicy = build_tf_policy(
    name="CQLTFPolicy",
    loss_fn=cql_loss,
    get_default_config=lambda: ray.rllib.agents.cql.cql.CQL_DEFAULT_CONFIG,
    validate_spaces=validate_spaces,
    stats_fn=cql_stats,
    postprocess_fn=postprocess_trajectory,
    before_init=setup_early_mixins,
    after_init=setup_late_mixins,
    make_model=build_sac_model,
    mixins=[
        ActorCriticOptimizerMixin, TargetNetworkMixin, ComputeTDErrorMixin
    ],
    action_distribution_fn=get_distribution_inputs_and_class,
    compute_gradients_fn=compute_gradients_fn,
    apply_gradients_fn=apply_gradients_fn,
)
Example #21
    return loss


def pg_loss_stats(policy: Policy,
                  train_batch: SampleBatch) -> Dict[str, TensorType]:
    """Returns the calculated loss in a stats dict.

    Args:
        policy (Policy): The Policy object.
        train_batch (SampleBatch): The data used for training.

    Returns:
        Dict[str, TensorType]: The stats dict.
    """

    return {
        "policy_loss": policy.policy_loss,
    }


# Build a child class of `DynamicTFPolicy`, given the extra options:
# - trajectory post-processing function (to calculate advantages)
# - PG loss function
PGTFPolicy = build_tf_policy(
    name="PGTFPolicy",
    get_default_config=lambda: ray.rllib.agents.pg.DEFAULT_CONFIG,
    postprocess_fn=post_process_advantages,
    stats_fn=pg_loss_stats,
    loss_fn=pg_tf_loss)
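# `pg_tf_loss` itself is not shown above. A minimal sketch of a vanilla
# policy-gradient loss with the `loss_fn(policy, model, dist_class,
# train_batch)` signature, consistent with the other loss functions in this
# collection but not necessarily RLlib's exact implementation:
def pg_tf_loss(policy, model, dist_class, train_batch):
    # Forward pass for the action logits, then build the action distribution.
    logits, _ = model.from_batch(train_batch)
    action_dist = dist_class(logits, model)
    # Vanilla policy gradient: maximize log-prob weighted by advantages.
    policy.policy_loss = -tf.reduce_mean(
        action_dist.logp(train_batch[SampleBatch.ACTIONS]) *
        train_batch[Postprocessing.ADVANTAGES])
    return policy.policy_loss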
Example #22
            self.get_placeholder(SampleBatch.CUR_OBS): [ob],
            self.seq_lens: [1]
        }
        assert len(args) == len(self.state_in), \
            (args, self.state_in)
        for k, v in zip(self.state_in, args):
            feed_dict[k] = v
        vf = self.get_session().run(self.value_function, feed_dict)
        return vf[0]


def setup_mixins(policy, obs_space, action_space, config):
    LearningRateSchedule.__init__(policy, config["lr"], config["lr_schedule"])
    ValueNetworkMixin.__init__(policy)


AsyncPPOTFPolicy = build_tf_policy(
    name="AsyncPPOTFPolicy",
    get_default_config=lambda: ray.rllib.agents.impala.impala.DEFAULT_CONFIG,
    loss_fn=build_appo_surrogate_loss,
    stats_fn=stats,
    grad_stats_fn=grad_stats,
    postprocess_fn=postprocess_trajectory,
    optimizer_fn=choose_optimizer,
    gradients_fn=clip_gradients,
    extra_action_fetches_fn=add_values_and_logits,
    before_init=validate_config,
    before_loss_init=setup_mixins,
    mixins=[LearningRateSchedule, ValueNetworkMixin],
    get_batch_divisibility_req=lambda p: p.config["sample_batch_size"])
Example #23
        explained_variance(
            policy.get_placeholder(Postprocessing.VALUE_TARGETS), policy.vf),
    }


def clip_gradients(policy, optimizer, loss):
    grads = tf.gradients(loss, policy.var_list)
    grads, _ = tf.clip_by_global_norm(grads, policy.config["grad_clip"])
    clipped_grads = list(zip(grads, policy.var_list))
    return clipped_grads


def setup_mixins(policy, obs_space, action_space, config):
    ValueNetworkMixin.__init__(policy)
    LearningRateSchedule.__init__(policy, config["lr"], config["lr_schedule"])
    policy.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                        tf.get_variable_scope().name)


A3CTFPolicy = build_tf_policy(
    name="A3CTFPolicy",
    get_default_config=lambda: ray.rllib.agents.a3c.a3c.DEFAULT_CONFIG,
    loss_fn=actor_critic_loss,
    stats_fn=stats,
    grad_stats_fn=grad_stats,
    gradients_fn=clip_gradients,
    postprocess_fn=postprocess_advantages,
    extra_action_fetches_fn=add_value_function_fetch,
    before_loss_init=setup_mixins,
    mixins=[ValueNetworkMixin, LearningRateSchedule])
Example #24

def setup_mixins(policy, obs_space, action_space, config):
    ValueNetworkMixin.__init__(policy, config)
    KLCoeffMixin.__init__(policy, config)

    # Create the `split` placeholder.
    policy._loss_input_dict["split"] = tf1.placeholder(
        tf.int32,
        name="Meta-Update-Splitting",
        shape=(
            policy.config["inner_adaptation_steps"] + 1,
            policy.config["num_workers"],
        ),
    )


MAMLTFPolicy = build_tf_policy(
    name="MAMLTFPolicy",
    get_default_config=lambda: ray.rllib.algorithms.maml.maml.DEFAULT_CONFIG,
    loss_fn=maml_loss,
    stats_fn=maml_stats,
    optimizer_fn=maml_optimizer_fn,
    extra_action_out_fn=vf_preds_fetches,
    postprocess_fn=compute_gae_for_sample_batch,
    compute_gradients_fn=compute_and_clip_gradients,
    before_init=setup_config,
    before_loss_init=setup_mixins,
    mixins=[KLCoeffMixin],
)
Example #25
                      config: TrainerConfigDict) -> None:
    """Call all mixin classes' constructors after APPOPolicy initialization.

    Args:
        policy (Policy): The Policy object.
        obs_space (gym.spaces.Space): The Policy's observation space.
        action_space (gym.spaces.Space): The Policy's action space.
        config (TrainerConfigDict): The Policy's config.
    """
    TargetNetworkMixin.__init__(policy, obs_space, action_space, config)


# Build a child class of `DynamicTFPolicy`, given the custom functions defined
# above.
AsyncPPOTFPolicy = build_tf_policy(
    name="AsyncPPOTFPolicy",
    make_model=make_appo_model,
    loss_fn=appo_surrogate_loss,
    stats_fn=stats,
    postprocess_fn=postprocess_trajectory,
    optimizer_fn=choose_optimizer,
    gradients_fn=clip_gradients,
    extra_action_out_fn=add_values,
    before_loss_init=setup_mixins,
    after_init=setup_late_mixins,
    mixins=[
        LearningRateSchedule, KLCoeffMixin, TargetNetworkMixin,
        ValueNetworkMixin
    ],
    get_batch_divisibility_req=lambda p: p.config["rollout_fragment_length"])
Example #26
    policy: Policy,
    obs_space: gym.spaces.Space,
    action_space: gym.spaces.Space,
    config: TrainerConfigDict,
) -> None:
    """Call all mixin classes' constructors before SimpleQTFPolicy initialization.

    Args:
        policy (Policy): The Policy object.
        obs_space (gym.spaces.Space): The Policy's observation space.
        action_space (gym.spaces.Space): The Policy's action space.
        config (TrainerConfigDict): The Policy's config.
    """
    TargetNetworkMixin.__init__(policy, obs_space, action_space, config)


# Build a child class of `DynamicTFPolicy`, given the custom functions defined
# above.
SimpleQTFPolicy: Type[DynamicTFPolicy] = build_tf_policy(
    name="SimpleQTFPolicy",
    get_default_config=lambda: ray.rllib.algorithms.dqn.simple_q.DEFAULT_CONFIG,
    make_model=build_q_models,
    action_distribution_fn=get_distribution_inputs_and_class,
    loss_fn=build_q_losses,
    extra_action_out_fn=lambda policy: {"q_values": policy.q_values},
    extra_learn_fetches_fn=lambda policy: {"td_error": policy.td_error},
    after_init=setup_late_mixins,
    mixins=[TargetNetworkMixin],
)
Example #27
        return self.kl_coeff_val


def maml_optimizer_fn(policy, config):
    """
    Workers use simple SGD for inner adaptation
    Meta-Policy uses Adam optimizer for meta-update
    """
    if not config["worker_index"]:
        return tf1.train.AdamOptimizer(learning_rate=config["lr"])
    return tf1.train.GradientDescentOptimizer(learning_rate=config["inner_lr"])


def setup_mixins(policy, obs_space, action_space, config):
    ValueNetworkMixin.__init__(policy, obs_space, action_space, config)
    KLCoeffMixin.__init__(policy, config)


MAMLTFPolicy = build_tf_policy(
    name="MAMLTFPolicy",
    get_default_config=lambda: ray.rllib.agents.maml.maml.DEFAULT_CONFIG,
    loss_fn=maml_loss,
    stats_fn=maml_stats,
    optimizer_fn=maml_optimizer_fn,
    extra_action_fetches_fn=vf_preds_fetches,
    postprocess_fn=postprocess_ppo_gae,
    gradients_fn=clip_gradients,
    before_init=setup_config,
    before_loss_init=setup_mixins,
    mixins=[KLCoeffMixin])
Example #28
        "obs": obs,
        "is_training": policy._get_is_training_placeholder(),
    }
    model_out, _ = model(input_dict, [], None)
    return model.get_q_values(model_out)


def setup_early_mixins(policy, obs_space, action_space, config):
    ExplorationStateMixin.__init__(policy, obs_space, action_space, config)


def setup_late_mixins(policy, obs_space, action_space, config):
    TargetNetworkMixin.__init__(policy, obs_space, action_space, config)


SimpleQPolicy = build_tf_policy(
    name="SimpleQPolicy",
    get_default_config=lambda: ray.rllib.agents.dqn.dqn.DEFAULT_CONFIG,
    make_model=build_q_models,
    action_sampler_fn=build_action_sampler,
    loss_fn=build_q_losses,
    extra_action_fetches_fn=lambda policy: {"q_values": policy.q_values},
    extra_learn_fetches_fn=lambda policy: {"td_error": policy.td_error},
    before_init=setup_early_mixins,
    after_init=setup_late_mixins,
    obs_include_prev_action_reward=False,
    mixins=[
        ExplorationStateMixin,
        TargetNetworkMixin,
    ])
Example #29
                                            batch[SampleBatch.DONES],
                                            batch[PRIO_WEIGHTS])
        new_priorities = (np.abs(convert_to_numpy(td_errors)) +
                          policy.config["prioritized_replay_eps"])
        batch[PRIO_WEIGHTS] = new_priorities

    return batch


DQNTFPolicy = build_tf_policy(
    name="DQNTFPolicy",
    get_default_config=lambda: ray.rllib.agents.dqn.dqn.DEFAULT_CONFIG,
    make_model=build_q_model,
    action_distribution_fn=get_distribution_inputs_and_class,
    loss_fn=build_q_losses,
    stats_fn=build_q_stats,
    postprocess_fn=postprocess_nstep_and_prio,
    optimizer_fn=adam_optimizer,
    gradients_fn=clip_gradients,
    extra_action_out_fn=lambda policy: {"q_values": policy.q_values},
    extra_learn_fetches_fn=lambda policy: {"td_error": policy.q_loss.td_error},
    before_init=setup_early_mixins,
    before_loss_init=setup_mid_mixins,
    after_init=setup_late_mixins,
    obs_include_prev_action_reward=False,
    mixins=[
        TargetNetworkMixin,
        ComputeTDErrorMixin,
        LearningRateSchedule,
    ])
Example #30
def clip_gradients(policy, optimizer, loss):
    grads_and_vars = optimizer.compute_gradients(
        loss, policy.model.trainable_variables())
    grads = [g for (g, v) in grads_and_vars]
    policy.grads, _ = tf.clip_by_global_norm(grads, policy.config["grad_clip"])
    clipped_grads = list(zip(policy.grads, policy.model.trainable_variables()))
    return clipped_grads


def setup_mixins(policy, obs_space, action_space, config):
    LearningRateSchedule.__init__(policy, config["lr"], config["lr_schedule"])
    EntropyCoeffSchedule.__init__(policy, config["entropy_coeff"],
                                  config["entropy_coeff_schedule"])


VTraceTFPolicy = build_tf_policy(
    name="VTraceTFPolicy",
    get_default_config=lambda: ray.rllib.agents.impala.impala.DEFAULT_CONFIG,
    loss_fn=build_vtrace_loss,
    stats_fn=stats,
    grad_stats_fn=grad_stats,
    postprocess_fn=postprocess_trajectory,
    optimizer_fn=choose_optimizer,
    gradients_fn=clip_gradients,
    extra_action_fetches_fn=add_behaviour_logits,
    before_init=validate_config,
    before_loss_init=setup_mixins,
    mixins=[LearningRateSchedule, EntropyCoeffSchedule],
    get_batch_divisibility_req=lambda p: p.config["sample_batch_size"])