def build_config(ckpt,
                 extra_config=None,
                 is_es_agent=False,
                 change_model=None,
                 use_activation_model=True):
    if extra_config is None:
        extra_config = {}
    config = {"log_level": "ERROR"}
    if ckpt is not None:
        ckpt = os.path.abspath(os.path.expanduser(ckpt))  # Remove relative dir
        # Load configuration from file
        config_dir = os.path.dirname(ckpt)
        config_path = os.path.join(config_dir, "params.pkl")
        if not os.path.exists(config_path):
            config_path = os.path.join(config_dir, "../params.pkl")
        if os.path.exists(config_path):
            with open(config_path, "rb") as f:
                old_config = pickle.load(f)
            old_config.update(copy.deepcopy(config))
            config = copy.deepcopy(old_config)
    if "num_workers" in config:
        # Use at most one rollout worker when rebuilding the config.
        config["num_workers"] = min(1, config["num_workers"])
    if is_es_agent or (not use_activation_model):
        args_config = {}
    else:
        args_config = {"model": fc_with_activation_model_config}
    if has_gpu():
        args_config.update({"num_gpus_per_worker": 0.1})
    config = merge_dicts(config, args_config)
    config = merge_dicts(config, extra_config)
    if is_es_agent:
        config["num_workers"] = 1
        config["num_gpus_per_worker"] = 0
        config["num_gpus"] = 0
    if change_model:
        assert isinstance(change_model, str)
        config["model"]["custom_model"] = change_model
    return config
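# A minimal usage sketch (not part of the original file): rebuild a config from
# a checkpoint and restore a trainer with it. The checkpoint path and the
# choice of PPOTrainer are illustrative assumptions only.
def _example_restore_from_checkpoint():
    from ray.rllib.agents.ppo import PPOTrainer

    ckpt = "~/ray_results/some_exp/checkpoint_100/checkpoint-100"  # hypothetical
    config = build_config(ckpt, extra_config={"num_workers": 0})
    trainer = PPOTrainer(config=config, env=config.get("env"))
    trainer.restore(os.path.abspath(os.path.expanduser(ckpt)))
    return trainer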
dice_sac_default_config = merge_dicts(
    sac_default_config,
    {
        # PPO loss for diversity
        # "clip_param": 0.3,
        # "lambda": 1.0,
        "grad_clip": 40.0,
        # "rollout_fragment_length": 50,
        constants.USE_BISECTOR: True,
        constants.USE_DIVERSITY_VALUE_NETWORK: False,
        constants.DELAY_UPDATE: True,
        # constants.TWO_SIDE_CLIP_LOSS: True,
        constants.ONLY_TNB: False,
        constants.NORMALIZE_ADVANTAGE: False,
        constants.CLIP_DIVERSITY_GRADIENT: True,
        constants.DIVERSITY_REWARD_TYPE: "mse",
        constants.PURE_OFF_POLICY: False,
        "normalize_actions": False,
        "env_config": {"normalize_actions": False},
        # "tau": 5e-3,  # <<== SAC already has this
        "callbacks": {
            # "on_train_result": constants.on_train_result,
            "on_postprocess_traj": constants.on_postprocess_traj
        }
    })
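# A hedged sketch (not from the original file): merge_dicts returns a new dict
# in which the second argument's entries override the DiCE-SAC defaults, so a
# single experiment can toggle components such as USE_BISECTOR without copying
# the whole config. The overridden values below are arbitrary examples.
_example_ablation_config = merge_dicts(
    dice_sac_default_config,
    {
        constants.USE_BISECTOR: False,  # e.g. ablate the gradient bisector
        "train_batch_size": 256,
    })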
NORMALIZE_ADVANTAGE = "normalize_advantage"

dice_appo_default_config = merge_dicts(
    APPO_DEFAULT,
    {
        USE_BISECTOR: True,
        USE_DIVERSITY_VALUE_NETWORK: False,
        DELAY_UPDATE: True,
        NORMALIZE_ADVANTAGE: False,
        CLIP_DIVERSITY_GRADIENT: True,
        DIVERSITY_REWARD_TYPE: "mse",

        "num_agents": 1,  # Control the agent population size
        "num_sgd_iter": 10,  # In PPO this is 10
        "train_batch_size": 500,
        "sample_batch_size": 50,
        "tau": 5e-3,
        "clip_param": 0.3,
        "lr": 5e-4,
        "max_sample_requests_in_flight_per_worker": 2,  # originally 2
        "shuffle_sequences": True,
        "sgd_minibatch_size": 200,
        "sync_sampling": False,
        "vf_share_layers": False

        # "replay_buffer_num_slots": 0,  # disable replay
        # "broadcast_interval": 1,
        # "num_data_loader_buffers": 1,
        # "vf_loss_coeff": 0.5,
        # "vtrace": False,
    })
ValueNetworkMixin.__init__(policy, obs_space, action_space, config)
    KLCoeffMixin.__init__(policy, config)
    EntropyCoeffSchedule.__init__(policy, config["entropy_coeff"],
                                  config["entropy_coeff_schedule"])
    LearningRateSchedule.__init__(policy, config["lr"], config["lr_schedule"])
    AddMaskInfoMixinForPolicy.__init__(policy)


fc_with_mask_model_config = {
    "model": {
        "custom_model": "fc_with_mask",
        "custom_options": {}
    }
}

ppo_agent_default_config_with_mask = merge_dicts(DEFAULT_CONFIG,
                                                 fc_with_mask_model_config)

PPOTFPolicyWithMask = PPOTFPolicy.with_updates(
    name="PPOTFPolicyWithMask",
    get_default_config=lambda: ppo_agent_default_config_with_mask,
    extra_action_fetches_fn=vf_preds_and_logits_fetches_new,
    before_loss_init=setup_mixins,
    mixins=[
        LearningRateSchedule, EntropyCoeffSchedule, KLCoeffMixin,
        ValueNetworkMixin, AddMaskInfoMixinForPolicy
    ])


class AddMaskInfoMixin(object):
    def get_mask_info(self):
        return self.get_mask()
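# A hedged sketch (not from the original file): assuming the RLlib version in
# use exposes Trainer.with_updates for trainers created via build_trainer, the
# mask policy above could be plugged into a PPO trainer roughly as follows.
# The trainer name is hypothetical.
def _example_build_mask_trainer():
    from ray.rllib.agents.ppo import PPOTrainer

    return PPOTrainer.with_updates(
        name="PPOTrainerWithMask",
        default_policy=PPOTFPolicyWithMask,
        default_config=ppo_agent_default_config_with_mask)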
from toolbox.task_novelty_bisector.tnb_utils import *
from toolbox.utils import merge_dicts

tf = try_import_tf()
logger = logging.getLogger(__name__)

tnb_default_config = merge_dicts(
    DEFAULT_CONFIG,
    dict(
        novelty_threshold=0.5,
        use_preoccupied_agent=False,
        disable_tnb=False,
        use_tnb_plus=True,
        checkpoint_dict="{}",  # a dict serialized into a string with json
        # Disabling the novelty value network saves the cost of an extra NN
        # and prevents a misleading novelty policy gradient.
        use_novelty_value_network=True,

        # Do not modify these parameters.
        distance_mode="min",
        tnb_plus_threshold=0.0,
        clip_novelty_gradient=False,
        use_second_component=True,
        model={"custom_model": "ActorDoubleCriticNetwork"},
        callbacks={"on_train_result": on_train_result}))

ModelCatalog.register_custom_model("ActorDoubleCriticNetwork",
                                   ActorDoubleCriticNetwork)


def get_action_mean(logits):
DIVERSITY_VALUES = "diversity_values"
DIVERSITY_ADVANTAGES = "diversity_advantages"
DIVERSITY_VALUE_TARGETS = "diversity_value_targets"
PURE_OFF_POLICY = "pure_off_policy"
NORMALIZE_ADVANTAGE = "normalize_advantage"

dice_default_config = merge_dicts(
    PPO_DEFAULT,
    {
        USE_BISECTOR: True,
        USE_DIVERSITY_VALUE_NETWORK: False,
        DELAY_UPDATE: True,
        TWO_SIDE_CLIP_LOSS: True,
        ONLY_TNB: False,
        NORMALIZE_ADVANTAGE: False,
        CLIP_DIVERSITY_GRADIENT: True,
        DIVERSITY_REWARD_TYPE: "mse",
        PURE_OFF_POLICY: False,
        "tau": 5e-3,
        "vf_ratio_clip_param": 0.05,
        "callbacks": {
            "on_train_result": on_train_result,
            "on_postprocess_traj": on_postprocess_traj
        },
        "grad_clip": 10.0
    })


def get_kl_divergence(source, target, mean=True):
    assert source.ndim == 2
    assert target.ndim == 2
# Frozen logits of the policy that computed the action
BEHAVIOUR_LOGITS = "behaviour_logits"

OPPONENT_OBS = "opponent_obs"
OPPONENT_ACTION = "opponent_action"
PEER_ACTION = "other_replay"
JOINT_OBS = "joint_obs"
# NO_SPLIT_OBS = "no_split_obs"

mixin_list = [
    LearningRateSchedule, EntropyCoeffSchedule, KLCoeffMixin, ValueNetworkMixin
]

extra_loss_ppo_default_config = merge_dicts(
    DEFAULT_CONFIG,
    dict(
        novelty_loss_param=0.5,
        joint_dataset_sample_batch_size=200,
        novelty_mode="mean",
        use_joint_dataset=True))


def postprocess_ppo_gae_modified(policy,
                                 sample_batch,
                                 other_agent_batches=None,
                                 episode=None):
    """Add extra placeholders by creating new entries in the batch, which the
    following RLlib procedure detects and uses to create placeholders based
    on their shapes."""
    batch = postprocess_ppo_gae(policy, sample_batch, other_agent_batches,
                                episode)
    if not policy.loss_initialized():
        batch[JOINT_OBS] = np.zeros_like(sample_batch[SampleBatch.CUR_OBS],
dece_default_config = merge_dicts(
    DEFAULT_CONFIG,
    dict(
        tau=5e-3,
        callbacks={
            "on_train_result": on_train_result,
            "on_postprocess_traj": on_postprocess_traj
        },
        **{
            DIVERSITY_ENCOURAGING: True,
            USE_BISECTOR: True,
            USE_DIVERSITY_VALUE_NETWORK: True,
            CLIP_DIVERSITY_GRADIENT: True,
            DELAY_UPDATE: True,
            DIVERSITY_REWARD_TYPE: "mse",
            REPLAY_VALUES: False,
            TWO_SIDE_CLIP_LOSS: True,
            I_AM_CLONE: False,
            ONLY_TNB: False,
            CONSTRAIN_NOVELTY: "soft",
            PURE_OFF_POLICY: False,
            NORMALIZE_ADVANTAGE: True,

            # vtrace
            # "use_vtrace": False,
            "use_kl_loss": True,
            "clip_rho_threshold": 1.0,  # TODO
            "clip_pg_rho_threshold": 1.0,  # TODO

            # "normalize_advantage": True,
            "novelty_target_multiplier": 1.0,
            "novelty_stat_length": 100,
            "alpha_coefficient": 0.01,
        }))
from toolbox import initialize_ray
from toolbox.distance import get_kl_divergence
from toolbox.ipd.tnb import validate_config as validate_config_TNBTrainer
from toolbox.ipd.tnb_policy import NoveltyValueNetworkMixin, TNBPolicy, \
    tnb_default_config, BEHAVIOUR_LOGITS
from toolbox.ipd.tnb_utils import *
from toolbox.marl import MultiAgentEnvWrapper
from toolbox.modified_rllib.multi_gpu_optimizer import \
    LocalMultiGPUOptimizerCorrectedNumberOfSampled
from toolbox.dies.ppo_es import PPOESTrainer, \
    validate_config as validate_config_PPOESTrainer

tnbes_config = merge_dicts(
    merge_dicts(DEFAULT_CONFIG, tnb_default_config),
    dict(
        update_steps=100000,
        use_tnb_plus=False,
        novelty_type="mse",  # must be in ['mse', 'kl']
        use_novelty_value_network=False))

"""
The main difference between the TNBTrainer in toolbox.ipd and the one here is
how weight swapping is done: there, it is driven by the checkpoint_dict passed
in the TNBTrainer config, whereas here weights are shared between policies in
place, immediately after each training iteration.

TNBESPolicy removes the AgentPoolMixin. TNBESTrainer merges TNBTrainer and
PPOESTrainer.
"""
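# For reference, a hedged numpy sketch of the kind of diagonal-Gaussian KL that
# get_kl_divergence (imported above from toolbox.distance) could compute for
# the "kl" novelty type. The [mean, log_std] logit layout is an assumption for
# illustration, not taken from this repo.
def _gaussian_kl_sketch(source_logits, target_logits, mean=True):
    import numpy as np

    mu1, log_std1 = np.split(source_logits, 2, axis=1)
    mu2, log_std2 = np.split(target_logits, 2, axis=1)
    var1, var2 = np.exp(2 * log_std1), np.exp(2 * log_std2)
    # Per-dimension KL(N(mu1, var1) || N(mu2, var2)), summed over action dims.
    kl = log_std2 - log_std1 + (var1 + (mu1 - mu2) ** 2) / (2 * var2) - 0.5
    kl = kl.sum(axis=1)
    return kl.mean() if mean else kl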