Code Example #1
def build_config(ckpt,
                 extra_config=None,
                 is_es_agent=False,
                 change_model=None,
                 use_activation_model=True):
    if extra_config is None:
        extra_config = {}
    config = {"log_level": "ERROR"}
    if ckpt is not None:
        ckpt = os.path.abspath(os.path.expanduser(ckpt))  # Resolve to an absolute path
        # Load configuration from file
        config_dir = os.path.dirname(ckpt)
        config_path = os.path.join(config_dir, "params.pkl")
        if not os.path.exists(config_path):
            config_path = os.path.join(config_dir, "../params.pkl")
        if os.path.exists(config_path):
            with open(config_path, "rb") as f:
                old_config = pickle.load(f)
                old_config.update(copy.deepcopy(config))
                config = copy.deepcopy(old_config)
    if "num_workers" in config:
        config["num_workers"] = min(1, config["num_workers"])
    if is_es_agent or (not use_activation_model):
        args_config = {}
    else:
        args_config = {"model": fc_with_activation_model_config}
    if has_gpu():
        args_config.update({"num_gpus_per_worker": 0.1})
    config = merge_dicts(config, args_config)
    config = merge_dicts(config, extra_config)
    if is_es_agent:
        config['num_workers'] = 1
        config['num_gpus_per_worker'] = 0
        config["num_gpus"] = 0
    if change_model:
        assert isinstance(change_model, str)
        config['model']['custom_model'] = change_model
    return config
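
A minimal usage sketch of build_config, assuming the surrounding module provides os, pickle, copy, merge_dicts, has_gpu and fc_with_activation_model_config; the checkpoint path below is hypothetical:

# extra_config is merged last, so it overrides anything restored from params.pkl.
config = build_config(
    ckpt="~/ray_results/PPO/checkpoint_100/checkpoint-100",  # hypothetical path
    extra_config={"num_workers": 0, "num_gpus": 0})
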
Code Example #2
dice_sac_default_config = merge_dicts(
    sac_default_config,
    {

        # PPO loss for diversity
        # "clip_param": 0.3,
        # "lambda": 1.0,
        "grad_clip": 40.0,

        # "rollout_fragment_length": 50,
        constants.USE_BISECTOR: True,
        constants.USE_DIVERSITY_VALUE_NETWORK: False,
        constants.DELAY_UPDATE: True,
        # constants.TWO_SIDE_CLIP_LOSS: True,
        constants.ONLY_TNB: False,
        constants.NORMALIZE_ADVANTAGE: False,
        constants.CLIP_DIVERSITY_GRADIENT: True,
        constants.DIVERSITY_REWARD_TYPE: "mse",
        constants.PURE_OFF_POLICY: False,
        "normalize_actions": False,
        "env_config": {
            "normalize_actions": False
        },

        # "tau": 5e-3,  # <<== SAC already have this
        "callbacks": {
            # "on_train_result": constants.on_train_result,
            "on_postprocess_traj": constants.on_postprocess_traj
        }
    })
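
These defaults are composed with merge_dicts. A minimal sketch of the intended semantics, under the assumption that toolbox.utils.merge_dicts mirrors Ray's deep-merge helper (values from the second dict override the first, nested dicts are merged rather than replaced):

base = {"tau": 5e-3, "env_config": {"normalize_actions": True}}
override = {"env_config": {"normalize_actions": False}}
merged = merge_dicts(base, override)
assert merged["tau"] == 5e-3                               # untouched key kept
assert merged["env_config"]["normalize_actions"] is False  # nested key overridden
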
Code Example #3
NORMALIZE_ADVANTAGE = "normalize_advantage"

dice_appo_default_config = merge_dicts(
    APPO_DEFAULT,
    {
        USE_BISECTOR: True,
        USE_DIVERSITY_VALUE_NETWORK: False,
        DELAY_UPDATE: True,
        NORMALIZE_ADVANTAGE: False,
        CLIP_DIVERSITY_GRADIENT: True,
        DIVERSITY_REWARD_TYPE: "mse",
        "num_agents": 1,  # Control the agent population size
        "num_sgd_iter": 10,  # In PPO this is 10
        "train_batch_size": 500,
        "sample_batch_size": 50,
        "tau": 5e-3,
        "clip_param": 0.3,
        "lr": 5e-4,
        "max_sample_requests_in_flight_per_worker": 2,  # originally 2
        "shuffle_sequences": True,
        "sgd_minibatch_size": 200,
        "sync_sampling": False,
        "vf_share_layers": False

        # "replay_buffer_num_slots": 0,  # disable replay
        # "broadcast_interval": 1,
        # "num_data_loader_buffers": 1,
        # "vf_loss_coeff": 0.5,
        # "vtrace": False,
    })
Code Example #4
def setup_mixins(policy, obs_space, action_space, config):
    # Initialize the mixins listed in PPOTFPolicyWithMask below.
    ValueNetworkMixin.__init__(policy, obs_space, action_space, config)
    KLCoeffMixin.__init__(policy, config)
    EntropyCoeffSchedule.__init__(policy, config["entropy_coeff"],
                                  config["entropy_coeff_schedule"])
    LearningRateSchedule.__init__(policy, config["lr"], config["lr_schedule"])
    AddMaskInfoMixinForPolicy.__init__(policy)


fc_with_mask_model_config = {
    "model": {
        "custom_model": "fc_with_mask",
        "custom_options": {}
    }
}

ppo_agent_default_config_with_mask = merge_dicts(DEFAULT_CONFIG,
                                                 fc_with_mask_model_config)

PPOTFPolicyWithMask = PPOTFPolicy.with_updates(
    name="PPOTFPolicyWithMask",
    get_default_config=lambda: ppo_agent_default_config_with_mask,
    extra_action_fetches_fn=vf_preds_and_logits_fetches_new,
    before_loss_init=setup_mixins,
    mixins=[
        LearningRateSchedule, EntropyCoeffSchedule, KLCoeffMixin,
        ValueNetworkMixin, AddMaskInfoMixinForPolicy
    ])


class AddMaskInfoMixin(object):
    def get_mask_info(self):
        return self.get_mask()
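
A hedged sketch of how the policy above would typically be plugged into a trainer with this era's RLlib API. FCWithMaskModel is a hypothetical stand-in for whatever class the repo actually registers under "fc_with_mask"; PPOTrainer.with_updates and ModelCatalog.register_custom_model are standard RLlib calls:

from ray.rllib.agents.ppo import PPOTrainer
from ray.rllib.models import ModelCatalog

# Register the custom model name referenced by fc_with_mask_model_config.
ModelCatalog.register_custom_model("fc_with_mask", FCWithMaskModel)

PPOTrainerWithMask = PPOTrainer.with_updates(
    name="PPOTrainerWithMask",
    default_policy=PPOTFPolicyWithMask,
    get_policy_class=lambda config: PPOTFPolicyWithMask,
    default_config=ppo_agent_default_config_with_mask)
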
Code Example #5
import logging

from ray.rllib.models import ModelCatalog
from ray.rllib.utils import try_import_tf

from toolbox.task_novelty_bisector.tnb_utils import *
from toolbox.utils import merge_dicts

tf = try_import_tf()
logger = logging.getLogger(__name__)

tnb_default_config = merge_dicts(
    DEFAULT_CONFIG,
    dict(
        novelty_threshold=0.5,
        use_preoccupied_agent=False,
        disable_tnb=False,
        use_tnb_plus=True,
        checkpoint_dict="{}",  # use json to parse a dict into string.
        # disabling novelty value network can save the cost of extra NN and
        # prevents misleading novelty policy gradient.
        use_novelty_value_network=True,

        # Do not modify these parameters.
        distance_mode="min",
        tnb_plus_threshold=0.0,
        clip_novelty_gradient=False,
        use_second_component=True,
        model={"custom_model": "ActorDoubleCriticNetwork"},
        callbacks={"on_train_result": on_train_result}))

ModelCatalog.register_custom_model("ActorDoubleCriticNetwork",
                                   ActorDoubleCriticNetwork)


def get_action_mean(logits):
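
As the comment in tnb_default_config notes, checkpoint_dict is passed as a JSON string rather than a dict. A small sketch (the agent key and checkpoint path are hypothetical):

import json

config = merge_dicts(tnb_default_config, {
    "checkpoint_dict": json.dumps({"agent0": "/tmp/ckpt/checkpoint-100"})})
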
Code Example #6
DIVERSITY_VALUES = "diversity_values"
DIVERSITY_ADVANTAGES = "diversity_advantages"
DIVERSITY_VALUE_TARGETS = "diversity_value_targets"
PURE_OFF_POLICY = "pure_off_policy"
NORMALIZE_ADVANTAGE = "normalize_advantage"

dice_default_config = merge_dicts(
    PPO_DEFAULT, {
        USE_BISECTOR: True,
        USE_DIVERSITY_VALUE_NETWORK: False,
        DELAY_UPDATE: True,
        TWO_SIDE_CLIP_LOSS: True,
        ONLY_TNB: False,
        NORMALIZE_ADVANTAGE: False,
        CLIP_DIVERSITY_GRADIENT: True,
        DIVERSITY_REWARD_TYPE: "mse",
        PURE_OFF_POLICY: False,
        "tau": 5e-3,
        "vf_ratio_clip_param": 0.05,
        "callbacks": {
            "on_train_result": on_train_result,
            "on_postprocess_traj": on_postprocess_traj
        },
        "grad_clip": 10.0
    })


def get_kl_divergence(source, target, mean=True):
    assert source.ndim == 2
    assert target.ndim == 2
Code Example #7
# Frozen logits of the policy that computed the action
BEHAVIOUR_LOGITS = "behaviour_logits"
OPPONENT_OBS = "opponent_obs"
OPPONENT_ACTION = "opponent_action"

PEER_ACTION = "other_replay"
JOINT_OBS = "joint_obs"
# NO_SPLIT_OBS = "no_split_obs"

mixin_list = [
    LearningRateSchedule, EntropyCoeffSchedule, KLCoeffMixin, ValueNetworkMixin
]

extra_loss_ppo_default_config = merge_dicts(
    DEFAULT_CONFIG,
    dict(novelty_loss_param=0.5,
         joint_dataset_sample_batch_size=200,
         novelty_mode="mean",
         use_joint_dataset=True))


def postprocess_ppo_gae_modified(policy,
                                 sample_batch,
                                 other_agent_batches=None,
                                 episode=None):
    """This function add extra placeholder, by creating new entries in batch
    which the following RLLib procedure would detect and create placeholder
    based on the shape of them."""
    batch = postprocess_ppo_gae(policy, sample_batch, other_agent_batches,
                                episode)
    if not policy.loss_initialized():
        batch[JOINT_OBS] = np.zeros_like(sample_batch[SampleBatch.CUR_OBS],
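
The snippet is truncated here, but the placeholder trick described in the docstring is simple: before the loss is initialized, seed the batch with dummy arrays so that RLlib builds loss-input placeholders of matching dtype and shape. A minimal sketch with an illustrative key (not the repo's exact entries):

if not policy.loss_initialized():
    # Only dtype and shape matter; RLlib uses them to create the placeholders.
    batch["my_extra_key"] = np.zeros_like(sample_batch[SampleBatch.CUR_OBS])
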
Code Example #8
dece_default_config = merge_dicts(
    DEFAULT_CONFIG,
    dict(
        tau=5e-3,
        callbacks={
            "on_train_result": on_train_result,
            "on_postprocess_traj": on_postprocess_traj
        },
        **{
            DIVERSITY_ENCOURAGING: True,
            USE_BISECTOR: True,
            USE_DIVERSITY_VALUE_NETWORK: True,
            CLIP_DIVERSITY_GRADIENT: True,
            DELAY_UPDATE: True,
            DIVERSITY_REWARD_TYPE: "mse",
            REPLAY_VALUES: False,
            TWO_SIDE_CLIP_LOSS: True,
            I_AM_CLONE: False,
            ONLY_TNB: False,
            CONSTRAIN_NOVELTY: "soft",
            PURE_OFF_POLICY: False,
            NORMALIZE_ADVANTAGE: True,

            # vtrace
            # "use_vtrace": False,
            'use_kl_loss': True,
            "clip_rho_threshold": 1.0,  # TODO
            "clip_pg_rho_threshold": 1.0,  # TODO
            # "normalize_advantage": True,
            "novelty_target_multiplier": 1.0,
            "novelty_stat_length": 100,
            "alpha_coefficient": 0.01,
        }))
Code Example #9
from toolbox import initialize_ray
from toolbox.distance import get_kl_divergence
from toolbox.ipd.tnb import validate_config as validate_config_TNBTrainer
from toolbox.ipd.tnb_policy import NoveltyValueNetworkMixin, TNBPolicy, \
    tnb_default_config, BEHAVIOUR_LOGITS
from toolbox.ipd.tnb_utils import *
from toolbox.marl import MultiAgentEnvWrapper
from toolbox.modified_rllib.multi_gpu_optimizer import \
    LocalMultiGPUOptimizerCorrectedNumberOfSampled
from toolbox.dies.ppo_es import PPOESTrainer, \
    validate_config as validate_config_PPOESTrainer

tnbes_config = merge_dicts(
    merge_dicts(DEFAULT_CONFIG, tnb_default_config),
    dict(
        update_steps=100000,
        use_tnb_plus=False,
        novelty_type="mse",  # must in ['mse', 'kl']
        use_novelty_value_network=False))
"""
The main difference between the TNBTrainer in toolbox.ipd and the one here is
that TNBTrainer does the weight-swapping operation by passing checkpoint_dict
in the config, whereas here we share weights between policies in place
immediately after each training iteration.

TNBESPolicy removes the AgentPoolMixin.

TNBESTrainer merges TNBTrainer and PPOESTrainer.
"""