def build_a3c_moa_trainer(moa_config):
    tf.keras.backend.set_floatx("float32")
    trainer_name = "MOAA3CTrainer"
    moa_config["use_gae"] = False

    a3c_tf_policy = build_tf_policy(
        name="A3CAuxTFPolicy",
        get_default_config=lambda: moa_config,
        loss_fn=actor_critic_loss,
        stats_fn=stats,
        grad_stats_fn=grad_stats,
        gradients_fn=clip_gradients,
        postprocess_fn=postprocess_a3c_moa,
        extra_action_fetches_fn=add_value_function_fetch,
        before_loss_init=setup_mixins,
        mixins=[ValueNetworkMixin, LearningRateSchedule] + get_moa_mixins(),
    )

    trainer = build_trainer(
        name=trainer_name,
        default_policy=a3c_tf_policy,
        default_config=moa_config,
        validate_config=validate_config,
    )
    return trainer
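# Hedged usage sketch for the builder above. `my_moa_config` is a placeholder
# for a complete MOA config dict (not shown in this file) and "my_moa_env" for
# a registered env name; both are assumptions, not part of the code above.
#
# import ray
# ray.init()
# MOAA3CTrainer = build_a3c_moa_trainer(my_moa_config)
# trainer = MOAA3CTrainer(config=my_moa_config, env="my_moa_env")
# for _ in range(10):
#     result = trainer.train()
#     print(result["episode_reward_mean"])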
def get_moa_vtrace_policy():
    moa_vtrace_policy = build_tf_policy(
        name="MOAVTracePolicy",
        get_default_config=lambda: MOA_CONFIG,
        loss_fn=build_vtrace_loss,
        stats_fn=moa_stats,
        grad_stats_fn=grad_stats,
        postprocess_fn=postprocess_trajectory,
        optimizer_fn=choose_optimizer,
        gradients_fn=clip_gradients,
        extra_action_fetches_fn=add_behaviour_logits,
        before_init=validate_config_policy,
        before_loss_init=setup_mixins,
        mixins=[LearningRateSchedule, EntropyCoeffSchedule] + get_moa_mixins(),
        get_batch_divisibility_req=lambda p: p.config["rollout_fragment_length"],
    )
    return moa_vtrace_policy
def build_ppo_moa_trainer(moa_config):
    """
    Creates a MOA+PPO policy class, then creates a trainer with this policy.

    :param moa_config: The configuration dictionary.
    :return: A new MOA+PPO trainer.
    """
    tf.keras.backend.set_floatx("float32")
    trainer_name = "MOAPPOTrainer"

    moa_ppo_policy = build_tf_policy(
        name="MOAPPOTFPolicy",
        get_default_config=lambda: moa_config,
        loss_fn=loss_with_moa,
        make_model=build_model,
        stats_fn=extra_moa_stats,
        extra_action_fetches_fn=extra_moa_fetches,
        postprocess_fn=postprocess_ppo_moa,
        gradients_fn=clip_gradients,
        before_init=setup_config,
        before_loss_init=setup_ppo_moa_mixins,
        mixins=[
            LearningRateSchedule, EntropyCoeffSchedule, KLCoeffMixin,
            ValueNetworkMixin
        ] + get_moa_mixins(),
    )

    moa_ppo_trainer = build_trainer(
        name=trainer_name,
        default_policy=moa_ppo_policy,
        make_policy_optimizer=choose_policy_optimizer,
        default_config=moa_config,
        validate_config=validate_ppo_moa_config,
        after_optimizer_step=update_kl,
        after_train_result=warn_about_bad_reward_scales,
        mixins=[MOAResetConfigMixin],
    )
    return moa_ppo_trainer
        return penalty

    logits, _ = model.from_batch(train_batch)
    action_dist = dist_class(logits, model)
    actions = train_batch[SampleBatch.ACTIONS]
    rewards = train_batch[SampleBatch.REWARDS]
    penalty = tf.py_function(
        compute_penalty, [actions, rewards], Tout=tf.float32)

    return penalty - tf.reduce_mean(action_dist.logp(actions) * rewards)


# <class 'ray.rllib.policy.tf_policy_template.MyTFPolicy'>
MyTFPolicy = build_tf_policy(
    name="MyTFPolicy",
    loss_fn=policy_gradient_loss,
)

# <class 'ray.rllib.agents.trainer_template.MyCustomTrainer'>
MyTrainer = build_trainer(
    name="MyCustomTrainer",
    default_policy=MyTFPolicy,
)

if __name__ == "__main__":
    ray.init()
    args = parser.parse_args()
    ModelCatalog.register_custom_model("eager_model", EagerModel)
    config = {
        "env": "CartPole-v0",
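# Side note on the tf.py_function call above: it wraps a plain Python (eager)
# function so it can run inside a graph-mode loss. A self-contained sketch,
# independent of the policy code above:
#
# import tensorflow as tf
#
# def double(x):
#     return 2.0 * x  # runs eagerly; may use print() / NumPy freely
#
# y = tf.py_function(double, [tf.constant([1.0, 2.0])], Tout=tf.float32)
# # y == [2.0, 4.0]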
    elif isinstance(action_space, (Box, Simplex)) and \
            len(action_space.shape) > 1:
        raise UnsupportedSpaceException(
            "Action space ({}) of {} has multiple dimensions "
            "{}. ".format(action_space, policy, action_space.shape) +
            "Consider reshaping this into a single dimension, "
            "using a Tuple action space, or the multi-agent API.")


# Build a child class of `DynamicTFPolicy`, given the custom functions defined
# above.
SACTFPolicy = build_tf_policy(
    name="SACTFPolicy",
    get_default_config=lambda: ray.rllib.agents.sac.sac.DEFAULT_CONFIG,
    make_model=build_sac_model,
    postprocess_fn=postprocess_trajectory,
    action_distribution_fn=get_distribution_inputs_and_class,
    loss_fn=sac_actor_critic_loss,
    stats_fn=stats,
    compute_gradients_fn=compute_and_clip_gradients,
    apply_gradients_fn=apply_gradients,
    extra_learn_fetches_fn=lambda policy: {"td_error": policy.td_error},
    mixins=[
        TargetNetworkMixin, ActorCriticOptimizerMixin, ComputeTDErrorMixin
    ],
    validate_spaces=validate_spaces,
    before_init=setup_early_mixins,
    before_loss_init=setup_mid_mixins,
    after_init=setup_late_mixins,
)
"DDPG.".format(action_space, policy)) elif len(action_space.shape) > 1: raise UnsupportedSpaceException( "Action space ({}) of {} has multiple dimensions " "{}. ".format(action_space, policy, action_space.shape) + "Consider reshaping this into a single dimension, " "using a Tuple action space, or the multi-agent API.") DDPGTFPolicy = build_tf_policy( name="DDPGTFPolicy", get_default_config=lambda: ray.rllib.agents.ddpg.ddpg.DEFAULT_CONFIG, make_model=build_ddpg_models, action_distribution_fn=get_distribution_inputs_and_class, loss_fn=ddpg_actor_critic_loss, stats_fn=build_ddpg_stats, postprocess_fn=postprocess_nstep_and_prio, compute_gradients_fn=gradients_fn, apply_gradients_fn=build_apply_op, extra_learn_fetches_fn=lambda policy: {"td_error": policy.td_error}, validate_spaces=validate_spaces, before_init=setup_early_mixins, before_loss_init=setup_mid_mixins, after_init=setup_late_mixins, mixins=[ TargetNetworkMixin, ActorCriticOptimizerMixin, ComputeTDErrorMixin, ], )
        self._value = value


def setup_config(policy, obs_space, action_space, config):
    # Auto-set the model option for value-function layer sharing.
    config["model"]["vf_share_layers"] = config["vf_share_layers"]


def setup_mixins(policy, obs_space, action_space, config):
    ValueNetworkMixin.__init__(policy, obs_space, action_space, config)
    KLCoeffMixin.__init__(policy, config)
    EntropyCoeffSchedule.__init__(policy, config["entropy_coeff"],
                                  config["entropy_coeff_schedule"])
    LearningRateSchedule.__init__(policy, config["lr"], config["lr_schedule"])


PPOTFPolicy = build_tf_policy(
    name="PPOTFPolicy",
    get_default_config=lambda: ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG,
    loss_fn=ppo_surrogate_loss,
    stats_fn=kl_and_loss_stats,
    extra_action_fetches_fn=vf_preds_fetches,
    postprocess_fn=postprocess_ppo_gae,
    gradients_fn=clip_gradients,
    before_init=setup_config,
    before_loss_init=setup_mixins,
    mixins=[
        LearningRateSchedule, EntropyCoeffSchedule, KLCoeffMixin,
        ValueNetworkMixin
    ])
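# The mixin pattern used above: each mixin is a plain class whose __init__
# attaches extra state or methods to the policy object; `before_loss_init`
# invokes the constructors once placeholders exist. A minimal hypothetical
# example (not an RLlib class; the "baseline" config key is invented):

class ConstantBaselineMixin:
    """Hypothetical mixin: attaches a constant-baseline helper to the policy."""

    def __init__(self, config):
        self._baseline = config.get("baseline", 0.0)  # hypothetical key

    def baseline(self):
        return self._baseline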
    return td_err


def setup_early_mixins(policy, obs_space, action_space, config):
    ExplorationStateMixin.__init__(policy, obs_space, action_space, config)
    ActorCriticOptimizerMixin.__init__(policy, config)


def setup_late_mixins(policy, obs_space, action_space, config):
    TargetNetworkMixin.__init__(policy, config)


SACTFPolicy = build_tf_policy(
    name="SACTFPolicy",
    get_default_config=lambda: ray.rllib.agents.sac.sac.DEFAULT_CONFIG,
    make_model=build_sac_model,
    postprocess_fn=postprocess_trajectory,
    extra_action_feed_fn=exploration_setting_inputs,
    action_sampler_fn=build_action_output,
    loss_fn=actor_critic_loss,
    stats_fn=stats,
    gradients_fn=gradients,
    extra_learn_fetches_fn=lambda policy: {"td_error": policy.td_error},
    mixins=[
        TargetNetworkMixin, ExplorationStateMixin, ActorCriticOptimizerMixin,
        ComputeTDErrorMixin
    ],
    before_init=setup_early_mixins,
    after_init=setup_late_mixins,
    obs_include_prev_action_reward=False)
                       config: TrainerConfigDict) -> None:
    LearningRateSchedule.__init__(policy, config["lr"], config["lr_schedule"])


def before_loss_init(policy: Policy, obs_space: gym.spaces.Space,
                     action_space: gym.spaces.Space,
                     config: TrainerConfigDict) -> None:
    ComputeTDErrorMixin.__init__(policy)
    TargetNetworkMixin.__init__(policy, obs_space, action_space, config)


R2D2TFPolicy = build_tf_policy(
    name="R2D2TFPolicy",
    loss_fn=r2d2_loss,
    get_default_config=lambda: ray.rllib.agents.dqn.r2d2.DEFAULT_CONFIG,
    postprocess_fn=postprocess_nstep_and_prio,
    stats_fn=build_q_stats,
    make_model=build_r2d2_model,
    action_distribution_fn=get_distribution_inputs_and_class,
    optimizer_fn=adam_optimizer,
    extra_action_out_fn=lambda policy: {"q_values": policy.q_values},
    compute_gradients_fn=clip_gradients,
    extra_learn_fetches_fn=lambda policy: {"td_error": policy._td_error},
    before_init=setup_early_mixins,
    before_loss_init=before_loss_init,
    mixins=[
        TargetNetworkMixin,
        ComputeTDErrorMixin,
        LearningRateSchedule,
    ])
    return {
        "policy_loss": policy.loss.p_loss,
        "vf_loss": policy.loss.v_loss,
        "total_loss": policy.loss.total_loss,
        "vf_explained_var": policy.loss.explained_variance,
    }


def setup_mixins(policy: Policy, obs_space: gym.spaces.Space,
                 action_space: gym.spaces.Space,
                 config: TrainerConfigDict) -> None:
    ValueNetworkMixin.__init__(policy, obs_space, action_space, config)
    # Set up a tf-var for the moving avg (do this here to make it work with
    # eager mode); "c^2" in the paper.
    policy._moving_average_sqd_adv_norm = get_variable(
        100.0,
        framework="tf",
        tf_name="moving_average_of_advantage_norm",
        trainable=False)


MARWILTFPolicy = build_tf_policy(
    name="MARWILTFPolicy",
    get_default_config=lambda: ray.rllib.agents.marwil.marwil.DEFAULT_CONFIG,
    loss_fn=marwil_loss,
    stats_fn=stats,
    postprocess_fn=postprocess_advantages,
    before_loss_init=setup_mixins,
    compute_gradients_fn=compute_and_clip_gradients,
    mixins=[ValueNetworkMixin])
config["epsilon"]) def clip_gradients(policy, optimizer, loss): grads_and_vars = optimizer.compute_gradients( loss, policy.model.trainable_variables()) grads = [g for (g, v) in grads_and_vars] policy.grads, _ = tf.clip_by_global_norm(grads, policy.config["grad_clip"]) clipped_grads = list(zip(policy.grads, policy.model.trainable_variables())) return clipped_grads def setup_mixins(policy, obs_space, action_space, config): LearningRateSchedule.__init__(policy, config["lr"], config["lr_schedule"]) EntropyCoeffSchedule.__init__(policy, config["entropy_coeff"], config["entropy_coeff_schedule"]) VTraceTFPolicy = build_tf_policy( name="VTraceTFPolicy", get_default_config=lambda: ray.rllib.agents.impala.impala.DEFAULT_CONFIG, loss_fn=build_vtrace_loss, stats_fn=stats, grad_stats_fn=grad_stats, postprocess_fn=postprocess_trajectory, optimizer_fn=choose_optimizer, gradients_fn=clip_gradients, before_loss_init=setup_mixins, mixins=[LearningRateSchedule, EntropyCoeffSchedule], get_batch_divisibility_req=lambda p: p.config["rollout_fragment_length"])
        model_cls = DiscreteLinearModelThompsonSampling
    elif exploration_config["type"] == "UpperConfidenceBound":
        if isinstance(original_space, spaces.Dict):
            assert (
                "item" in original_space.spaces
            ), "Cannot find 'item' key in observation space"
            model_cls = ParametricLinearModelUCB
        else:
            model_cls = DiscreteLinearModelUCB

    model = model_cls(
        obs_space, action_space, logit_dim, config["model"], name="LinearModel"
    )
    return model


def after_init(policy, *args):
    policy.regrets = []
    BanditPolicyOverrides.__init__(policy)


BanditTFPolicy = build_tf_policy(
    name="BanditTFPolicy",
    get_default_config=lambda: ray.rllib.algorithms.bandit.bandit.DEFAULT_CONFIG,
    validate_spaces=validate_spaces,
    make_model=make_model,
    loss_fn=None,
    mixins=[BanditPolicyOverrides],
    after_init=after_init,
)
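# Hedged config sketch showing how the UCB branch above gets selected. The
# "exploration_config"/"type" keys match the code above; everything else here
# is illustrative only.
#
# config = {
#     "exploration_config": {"type": "UpperConfidenceBound"},
#     # A Dict observation space containing an "item" key selects
#     # ParametricLinearModelUCB; otherwise DiscreteLinearModelUCB is used.
# }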
from benchmark.networks.communicate import NetworkedMixin, postprocess_trajectory


def networked_pg_loss(policy, model, dist_class, train_batch):
    # Touch the networked-communication tensors so TF registers them as loss
    # inputs and computes gradients for them.
    for k in train_batch.keys():
        if "var" in k or "gamma" in k:
            _ = train_batch[k].shape
    return pg_tf_loss(policy, model, dist_class, train_batch)


def setup_mixin(policy, obs_space, action_space, config):
    NetworkedMixin.__init__(policy)


NetworkedPG = build_tf_policy(
    name="NetworkedPG",
    get_default_config=lambda: PG_DEFAULT_CONFIG,
    postprocess_fn=postprocess_trajectory,
    loss_fn=networked_pg_loss,
    mixins=[NetworkedMixin],
    after_init=setup_mixin,
)

NetworkedPGTrainer = build_trainer(
    name="NetworkedPGTrainer",
    default_policy=NetworkedPG,
)
    if get_custom_option(policy, 'use_vf_adv'):
        vf_preds = trajectory[SampleBatch.VF_PREDS]
        rewards = (rewards - vf_preds)
    trajectory[Postprocessing.ADVANTAGES] = rewards.copy().astype(np.float32)
    return trajectory


def extra_action_fetches(policy):
    fetches = {
        SampleBatch.VF_PREDS: policy.model.value_function(),
    }
    if has_method(policy.model, 'extra_compute_action_fetches'):
        model_fetches = policy.model.extra_compute_action_fetches()
        fetches.update(model_fetches)
    return fetches


def stats(policy, train_batch):
    stats = {
        'action_logp_min': tf.reduce_min(train_batch[ACTION_LOGP]),
        'action_logp_max': tf.reduce_max(train_batch[ACTION_LOGP]),
        'action_logp_mean': tf.reduce_mean(train_batch[ACTION_LOGP]),
    }
    return stats


PGPolicy = build_tf_policy(
    name='pg_policy',
    loss_fn=policy_gradient_loss,
    get_default_config=ConstantFunctor(DEFAULT_CONFIG),
    postprocess_fn=postprocess_sample_batch,
    extra_action_fetches_fn=extra_action_fetches,
    stats_fn=stats)
    @override(TFPolicy)
    def variables(self):
        return self.model.variables() + self.target_model.variables()


def setup_late_mixins(policy, obs_space, action_space, config):
    TargetNetworkMixin.__init__(policy, config)


DDPGTFPolicy = build_tf_policy(
    name="DDPGTFPolicy",
    get_default_config=lambda: ray.rllib.agents.ddpg.ddpg.DEFAULT_CONFIG,
    make_model=build_ddpg_models,
    action_distribution_fn=get_distribution_inputs_and_class,
    loss_fn=ddpg_actor_critic_loss,
    stats_fn=build_ddpg_stats,
    postprocess_fn=postprocess_nstep_and_prio,
    optimizer_fn=make_ddpg_optimizers,
    gradients_fn=gradients_fn,
    apply_gradients_fn=build_apply_op,
    extra_learn_fetches_fn=lambda policy: {"td_error": policy.td_error},
    before_init=before_init_fn,
    before_loss_init=setup_mid_mixins,
    after_init=setup_late_mixins,
    obs_include_prev_action_reward=False,
    mixins=[
        TargetNetworkMixin,
        ComputeTDErrorMixin,
    ])
def compute_q_values(policy, model, obs, explore, is_training=None):
    model_out, _ = model(
        {
            SampleBatch.CUR_OBS: obs,
            "is_training": is_training if is_training is not None
            else policy._get_is_training_placeholder(),
        }, [], None)
    return model_out


def setup_late_mixins(policy, obs_space, action_space, config):
    TargetNetworkMixin.__init__(policy, obs_space, action_space, config)


SimpleQTFPolicy = build_tf_policy(
    name="SimpleQTFPolicy",
    get_default_config=lambda: ray.rllib.agents.dqn.dqn.DEFAULT_CONFIG,
    make_model=build_q_models,
    action_distribution_fn=get_distribution_inputs_and_class,
    loss_fn=build_q_losses,
    extra_action_fetches_fn=lambda policy: {"q_values": policy.q_values},
    extra_learn_fetches_fn=lambda policy: {"td_error": policy.td_error},
    after_init=setup_late_mixins,
    obs_include_prev_action_reward=False,
    mixins=[TargetNetworkMixin])
    new_priorities = (np.abs(td_errors) +
                      policy.config["prioritized_replay_eps"])
    batch.data[PRIO_WEIGHTS] = new_priorities
    return batch


DQNTFPolicy = build_tf_policy(
    name="DQNTFPolicy",
    get_default_config=lambda: ray.rllib.agents.dqn.dqn.DEFAULT_CONFIG,
    make_model=build_q_model,
    action_sampler_fn=build_q_networks,
    loss_fn=build_q_losses,
    stats_fn=build_q_stats,
    postprocess_fn=postprocess_trajectory,
    optimizer_fn=adam_optimizer,
    gradients_fn=clip_gradients,
    extra_action_feed_fn=exploration_setting_inputs,
    extra_action_fetches_fn=lambda policy: {"q_values": policy.q_values},
    extra_learn_fetches_fn=lambda policy: {"td_error": policy.q_loss.td_error},
    update_ops_fn=lambda policy: policy.q_batchnorm_update_ops,
    before_init=setup_early_mixins,
    after_init=setup_late_mixins,
    obs_include_prev_action_reward=False,
    mixins=[
        ExplorationStateMixin,
        TargetNetworkMixin,
        ComputeTDErrorMixin,
        LearningRateSchedule,
    ])
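# The priority update above is proportional prioritization:
# p_i = |td_error_i| + eps, where eps keeps zero-error transitions sampleable.
# Tiny numeric check (NumPy only):
#
# import numpy as np
# td_errors = np.array([0.0, -2.0, 0.5])
# print(np.abs(td_errors) + 1e-6)   # -> [1.0e-06, 2.000001, 0.500001]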
DEFAULT_CONFIG = with_common_config({
    "gamma": 0.95,
    "lambda": 1.0,  # Only used when `use_gae` is True.
    "use_gae": False,
    "vf_loss_coeff": 0.5,
    "entropy_coeff": 0.01,
    "truncate_episodes": True,
    "use_critic": True,
    "grad_clip": 40.0,
    "lr": 0.0001,
    "min_iter_time_s": 5,
    "sample_async": True,
    "lr_schedule": None,
})

CA2CTFPolicy = build_tf_policy(
    name="CA2CTFPolicy",
    stats_fn=stats,
    grad_stats_fn=central_vf_stats,
    loss_fn=ac_loss_func,
    postprocess_fn=postprocess_trajectory,
    before_loss_init=setup_mixins,
    make_model=build_cac_model,
    mixins=[CentralizedValueMixin],
    get_default_config=lambda: DEFAULT_CONFIG,
)

CA2CTrainer = build_trainer(
    name="CA2C",
    default_policy=CA2CTFPolicy,
    default_config=DEFAULT_CONFIG,
)
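# Hedged usage sketch for CA2CTrainer: "my_multiagent_env" is a placeholder
# for a registered env name, not defined above.
#
# from ray import tune
# tune.run(
#     CA2CTrainer,
#     config=dict(DEFAULT_CONFIG, **{"env": "my_multiagent_env"}),
#     stop={"training_iteration": 100},
# )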
    return -tf.reduce_mean(
        action_dist.logp(train_batch["actions"]) * train_batch["returns"])


def calculate_advantages(policy,
                         sample_batch,
                         other_agent_batches=None,
                         episode=None):
    sample_batch["returns"] = discount_cumsum(sample_batch["rewards"], 0.99)
    return sample_batch


# <class 'ray.rllib.policy.tf_policy_template.MyTFPolicy'>
MyTFPolicy = build_tf_policy(
    name="MyTFPolicy",
    loss_fn=policy_gradient_loss,
    postprocess_fn=calculate_advantages,
)

# <class 'ray.rllib.agents.trainer_template.MyCustomTrainer'>
MyTrainer = build_trainer(
    name="MyCustomTrainer",
    default_policy=MyTFPolicy,
)

if __name__ == "__main__":
    args = parser.parse_args()
    ray.init(num_cpus=args.num_cpus or None)
    tune.run(
        MyTrainer,
        stop={"training_iteration": args.stop_iters},
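# For reference, a minimal NumPy sketch of the computation that
# `discount_cumsum` above performs (RLlib ships its own implementation; this
# just shows the recurrence out[t] = r[t] + gamma * out[t + 1]):

import numpy as np

def discount_cumsum_sketch(rewards, gamma):
    out = np.zeros_like(rewards, dtype=np.float32)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        out[t] = running
    return out

# discount_cumsum_sketch(np.array([1.0, 1.0, 1.0]), 0.99)
# -> [2.9701, 1.99, 1.0]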
            return
        # Tf static graph -> Return grouped op.
        else:
            alpha_prime_apply_op = policy._alpha_prime_optimizer.apply_gradients(
                policy._alpha_prime_grads_and_vars,
                global_step=tf1.train.get_or_create_global_step(),
            )
            return tf.group([sac_results, alpha_prime_apply_op])
    return sac_results


# Build a child class of `TFPolicy`, given the custom functions defined
# above.
CQLTFPolicy = build_tf_policy(
    name="CQLTFPolicy",
    loss_fn=cql_loss,
    get_default_config=lambda: ray.rllib.agents.cql.cql.CQL_DEFAULT_CONFIG,
    validate_spaces=validate_spaces,
    stats_fn=cql_stats,
    postprocess_fn=postprocess_trajectory,
    before_init=setup_early_mixins,
    after_init=setup_late_mixins,
    make_model=build_sac_model,
    mixins=[
        ActorCriticOptimizerMixin, TargetNetworkMixin, ComputeTDErrorMixin
    ],
    action_distribution_fn=get_distribution_inputs_and_class,
    compute_gradients_fn=compute_gradients_fn,
    apply_gradients_fn=apply_gradients_fn,
)
    return loss


def pg_loss_stats(policy: Policy,
                  train_batch: SampleBatch) -> Dict[str, TensorType]:
    """Returns the calculated loss in a stats dict.

    Args:
        policy (Policy): The Policy object.
        train_batch (SampleBatch): The data used for training.

    Returns:
        Dict[str, TensorType]: The stats dict.
    """
    return {
        "policy_loss": policy.policy_loss,
    }


# Build a child class of `DynamicTFPolicy`, given the extra options:
# - trajectory post-processing function (to calculate advantages)
# - PG loss function
PGTFPolicy = build_tf_policy(
    name="PGTFPolicy",
    get_default_config=lambda: ray.rllib.agents.pg.DEFAULT_CONFIG,
    postprocess_fn=post_process_advantages,
    stats_fn=pg_loss_stats,
    loss_fn=pg_tf_loss)
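# For context, a hedged sketch of what a matching `pg_tf_loss` looks like
# (RLlib's actual implementation may differ in details). Note it stores the
# scalar on `policy.policy_loss`, which `pg_loss_stats` above reports:

def pg_tf_loss_sketch(policy, model, dist_class, train_batch):
    logits, _ = model.from_batch(train_batch)
    action_dist = dist_class(logits, model)
    # Vanilla policy-gradient objective: -E[logp(a) * advantage].
    policy.policy_loss = -tf.reduce_mean(
        action_dist.logp(train_batch[SampleBatch.ACTIONS]) *
        tf.cast(train_batch[Postprocessing.ADVANTAGES], dtype=tf.float32))
    return policy.policy_loss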
            self.get_placeholder(SampleBatch.CUR_OBS): [ob],
            self.seq_lens: [1],
        }
        assert len(args) == len(self.state_in), (args, self.state_in)
        for k, v in zip(self.state_in, args):
            feed_dict[k] = v
        vf = self.get_session().run(self.value_function, feed_dict)
        return vf[0]


def setup_mixins(policy, obs_space, action_space, config):
    LearningRateSchedule.__init__(policy, config["lr"], config["lr_schedule"])
    ValueNetworkMixin.__init__(policy)


AsyncPPOTFPolicy = build_tf_policy(
    name="AsyncPPOTFPolicy",
    get_default_config=lambda: ray.rllib.agents.impala.impala.DEFAULT_CONFIG,
    loss_fn=build_appo_surrogate_loss,
    stats_fn=stats,
    grad_stats_fn=grad_stats,
    postprocess_fn=postprocess_trajectory,
    optimizer_fn=choose_optimizer,
    gradients_fn=clip_gradients,
    extra_action_fetches_fn=add_values_and_logits,
    before_init=validate_config,
    before_loss_init=setup_mixins,
    mixins=[LearningRateSchedule, ValueNetworkMixin],
    get_batch_divisibility_req=lambda p: p.config["sample_batch_size"])
        explained_variance(
            policy.get_placeholder(Postprocessing.VALUE_TARGETS), policy.vf),
    }


def clip_gradients(policy, optimizer, loss):
    grads = tf.gradients(loss, policy.var_list)
    grads, _ = tf.clip_by_global_norm(grads, policy.config["grad_clip"])
    clipped_grads = list(zip(grads, policy.var_list))
    return clipped_grads


def setup_mixins(policy, obs_space, action_space, config):
    ValueNetworkMixin.__init__(policy)
    LearningRateSchedule.__init__(policy, config["lr"], config["lr_schedule"])
    policy.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                        tf.get_variable_scope().name)


A3CTFPolicy = build_tf_policy(
    name="A3CTFPolicy",
    get_default_config=lambda: ray.rllib.agents.a3c.a3c.DEFAULT_CONFIG,
    loss_fn=actor_critic_loss,
    stats_fn=stats,
    grad_stats_fn=grad_stats,
    gradients_fn=clip_gradients,
    postprocess_fn=postprocess_advantages,
    extra_action_fetches_fn=add_value_function_fetch,
    before_loss_init=setup_mixins,
    mixins=[ValueNetworkMixin, LearningRateSchedule])
def setup_mixins(policy, obs_space, action_space, config):
    ValueNetworkMixin.__init__(policy, config)
    KLCoeffMixin.__init__(policy, config)
    # Create the `split` placeholder.
    policy._loss_input_dict["split"] = tf1.placeholder(
        tf.int32,
        name="Meta-Update-Splitting",
        shape=(
            policy.config["inner_adaptation_steps"] + 1,
            policy.config["num_workers"],
        ),
    )


MAMLTFPolicy = build_tf_policy(
    name="MAMLTFPolicy",
    get_default_config=lambda: ray.rllib.algorithms.maml.maml.DEFAULT_CONFIG,
    loss_fn=maml_loss,
    stats_fn=maml_stats,
    optimizer_fn=maml_optimizer_fn,
    extra_action_out_fn=vf_preds_fetches,
    postprocess_fn=compute_gae_for_sample_batch,
    compute_gradients_fn=compute_and_clip_gradients,
    before_init=setup_config,
    before_loss_init=setup_mixins,
    mixins=[KLCoeffMixin],
)
                      config: TrainerConfigDict) -> None:
    """Call all mixin classes' constructors after APPOPolicy initialization.

    Args:
        policy (Policy): The Policy object.
        obs_space (gym.spaces.Space): The Policy's observation space.
        action_space (gym.spaces.Space): The Policy's action space.
        config (TrainerConfigDict): The Policy's config.
    """
    TargetNetworkMixin.__init__(policy, obs_space, action_space, config)


# Build a child class of `DynamicTFPolicy`, given the custom functions defined
# above.
AsyncPPOTFPolicy = build_tf_policy(
    name="AsyncPPOTFPolicy",
    make_model=make_appo_model,
    loss_fn=appo_surrogate_loss,
    stats_fn=stats,
    postprocess_fn=postprocess_trajectory,
    optimizer_fn=choose_optimizer,
    gradients_fn=clip_gradients,
    extra_action_out_fn=add_values,
    before_loss_init=setup_mixins,
    after_init=setup_late_mixins,
    mixins=[
        LearningRateSchedule, KLCoeffMixin, TargetNetworkMixin,
        ValueNetworkMixin
    ],
    get_batch_divisibility_req=lambda p: p.config["rollout_fragment_length"])
        policy: Policy,
        obs_space: gym.spaces.Space,
        action_space: gym.spaces.Space,
        config: TrainerConfigDict,
) -> None:
    """Call all mixin classes' constructors after SimpleQTFPolicy
    initialization.

    Args:
        policy (Policy): The Policy object.
        obs_space (gym.spaces.Space): The Policy's observation space.
        action_space (gym.spaces.Space): The Policy's action space.
        config (TrainerConfigDict): The Policy's config.
    """
    TargetNetworkMixin.__init__(policy, obs_space, action_space, config)


# Build a child class of `DynamicTFPolicy`, given the custom functions defined
# above.
SimpleQTFPolicy: Type[DynamicTFPolicy] = build_tf_policy(
    name="SimpleQTFPolicy",
    get_default_config=lambda: ray.rllib.algorithms.dqn.simple_q.DEFAULT_CONFIG,
    make_model=build_q_models,
    action_distribution_fn=get_distribution_inputs_and_class,
    loss_fn=build_q_losses,
    extra_action_out_fn=lambda policy: {"q_values": policy.q_values},
    extra_learn_fetches_fn=lambda policy: {"td_error": policy.td_error},
    after_init=setup_late_mixins,
    mixins=[TargetNetworkMixin],
)
    return self.kl_coeff_val


def maml_optimizer_fn(policy, config):
    """
    Workers use plain SGD for inner adaptation; the meta-policy uses the
    Adam optimizer for the meta-update.
    """
    if not config["worker_index"]:
        return tf1.train.AdamOptimizer(learning_rate=config["lr"])
    return tf1.train.GradientDescentOptimizer(learning_rate=config["inner_lr"])


def setup_mixins(policy, obs_space, action_space, config):
    ValueNetworkMixin.__init__(policy, obs_space, action_space, config)
    KLCoeffMixin.__init__(policy, config)


MAMLTFPolicy = build_tf_policy(
    name="MAMLTFPolicy",
    get_default_config=lambda: ray.rllib.agents.maml.maml.DEFAULT_CONFIG,
    loss_fn=maml_loss,
    stats_fn=maml_stats,
    optimizer_fn=maml_optimizer_fn,
    extra_action_fetches_fn=vf_preds_fetches,
    postprocess_fn=postprocess_ppo_gae,
    gradients_fn=clip_gradients,
    before_init=setup_config,
    before_loss_init=setup_mixins,
    mixins=[KLCoeffMixin])
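# Note on the branch in `maml_optimizer_fn` above: `worker_index` is 0 on the
# driver (which holds the meta-policy) and >= 1 on rollout workers, so the
# meta-update gets Adam with `lr` while inner adaptation uses plain SGD with
# `inner_lr`.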
"obs": obs, "is_training": policy._get_is_training_placeholder(), } model_out, _ = model(input_dict, [], None) return model.get_q_values(model_out) def setup_early_mixins(policy, obs_space, action_space, config): ExplorationStateMixin.__init__(policy, obs_space, action_space, config) def setup_late_mixins(policy, obs_space, action_space, config): TargetNetworkMixin.__init__(policy, obs_space, action_space, config) SimpleQPolicy = build_tf_policy( name="SimpleQPolicy", get_default_config=lambda: ray.rllib.agents.dqn.dqn.DEFAULT_CONFIG, make_model=build_q_models, action_sampler_fn=build_action_sampler, loss_fn=build_q_losses, extra_action_fetches_fn=lambda policy: {"q_values": policy.q_values}, extra_learn_fetches_fn=lambda policy: {"td_error": policy.td_error}, before_init=setup_early_mixins, after_init=setup_late_mixins, obs_include_prev_action_reward=False, mixins=[ ExplorationStateMixin, TargetNetworkMixin, ])
        batch[SampleBatch.DONES], batch[PRIO_WEIGHTS])
    new_priorities = (np.abs(convert_to_numpy(td_errors)) +
                      policy.config["prioritized_replay_eps"])
    batch[PRIO_WEIGHTS] = new_priorities
    return batch


DQNTFPolicy = build_tf_policy(
    name="DQNTFPolicy",
    get_default_config=lambda: ray.rllib.agents.dqn.dqn.DEFAULT_CONFIG,
    make_model=build_q_model,
    action_distribution_fn=get_distribution_inputs_and_class,
    loss_fn=build_q_losses,
    stats_fn=build_q_stats,
    postprocess_fn=postprocess_nstep_and_prio,
    optimizer_fn=adam_optimizer,
    gradients_fn=clip_gradients,
    extra_action_out_fn=lambda policy: {"q_values": policy.q_values},
    extra_learn_fetches_fn=lambda policy: {"td_error": policy.q_loss.td_error},
    before_init=setup_early_mixins,
    before_loss_init=setup_mid_mixins,
    after_init=setup_late_mixins,
    obs_include_prev_action_reward=False,
    mixins=[
        TargetNetworkMixin,
        ComputeTDErrorMixin,
        LearningRateSchedule,
    ])
def clip_gradients(policy, optimizer, loss):
    grads_and_vars = optimizer.compute_gradients(
        loss, policy.model.trainable_variables())
    grads = [g for (g, v) in grads_and_vars]
    policy.grads, _ = tf.clip_by_global_norm(grads, policy.config["grad_clip"])
    clipped_grads = list(zip(policy.grads, policy.model.trainable_variables()))
    return clipped_grads


def setup_mixins(policy, obs_space, action_space, config):
    LearningRateSchedule.__init__(policy, config["lr"], config["lr_schedule"])
    EntropyCoeffSchedule.__init__(policy, config["entropy_coeff"],
                                  config["entropy_coeff_schedule"])


VTraceTFPolicy = build_tf_policy(
    name="VTraceTFPolicy",
    get_default_config=lambda: ray.rllib.agents.impala.impala.DEFAULT_CONFIG,
    loss_fn=build_vtrace_loss,
    stats_fn=stats,
    grad_stats_fn=grad_stats,
    postprocess_fn=postprocess_trajectory,
    optimizer_fn=choose_optimizer,
    gradients_fn=clip_gradients,
    extra_action_fetches_fn=add_behaviour_logits,
    before_init=validate_config,
    before_loss_init=setup_mixins,
    mixins=[LearningRateSchedule, EntropyCoeffSchedule],
    get_batch_divisibility_req=lambda p: p.config["sample_batch_size"])