Code Example #1
File: test_wrappers.py Project: tobiasbaumann1/amd
def test_add_RewardUncertaintyEnvClassWrapper():
    max_steps, grid_size = 20, 3
    n_steps = int(max_steps * 8.25)
    reward_uncertainty_mean, reward_uncertainty_std = 10, 1
    MyCoinGame = add_RewardUncertaintyEnvClassWrapper(CoinGame, reward_uncertainty_std, reward_uncertainty_mean)
    MyAsymCoinGame = add_RewardUncertaintyEnvClassWrapper(AsymCoinGame, reward_uncertainty_std, reward_uncertainty_mean)
    coin_game = init_env(max_steps, MyCoinGame, grid_size)
    asymm_coin_game = init_env(max_steps, MyAsymCoinGame, grid_size)

    all_rewards = []
    for env in [coin_game, asymm_coin_game]:
        obs = env.reset()

        step_i = 0
        for _ in range(n_steps):
            step_i += 1
            actions = {policy_id: random.randint(0, env.NUM_ACTIONS - 1) for policy_id in env.players_ids}
            obs, reward, done, info = env.step(actions)
            print("reward", reward)
            all_rewards.append(reward[env.player_red_id])
            all_rewards.append(reward[env.player_blue_id])

            if done["__all__"]:
                obs = env.reset()
                step_i = 0

    assert np.array(all_rewards).mean() > reward_uncertainty_mean - 1.0
    assert np.array(all_rewards).mean() < reward_uncertainty_mean + 1.0

    assert np.array(all_rewards).std() > reward_uncertainty_std - 0.1
    assert np.array(all_rewards).std() < reward_uncertainty_std + 0.1
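
The wrapper under test is not shown on this page. Below is a minimal sketch of what the test implies it does: it returns a subclass of the given env class whose step() adds Gaussian noise N(reward_uncertainty_mean, reward_uncertainty_std) to every player's reward, which is what the mean/std assertions above check. The name and behavior are assumptions inferred from the test, not the project's actual implementation.

import numpy as np


def add_RewardUncertaintyEnvClassWrapper_sketch(env_class,
                                                reward_uncertainty_std,
                                                reward_uncertainty_mean=0.0):
    class RewardUncertaintyEnv(env_class):
        def step(self, actions):
            obs, rewards, done, info = super().step(actions)
            # Add i.i.d. Gaussian noise to each player's reward.
            noisy_rewards = {
                player_id: reward + np.random.normal(
                    loc=reward_uncertainty_mean,
                    scale=reward_uncertainty_std)
                for player_id, reward in rewards.items()
            }
            return obs, noisy_rewards, done, info

    return RewardUncertaintyEnv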
Code Example #2
def modify_hyperparams_for_the_selected_env(hp):
    hp["plot_keys"] = (amTFT.PLOT_KEYS +
                       aggregate_and_plot_tensorboard_data.PLOT_KEYS)
    hp["plot_assemblage_tags"] = (
        amTFT.PLOT_ASSEMBLAGE_TAGS +
        aggregate_and_plot_tensorboard_data.PLOT_ASSEMBLAGE_TAGS)
    mul_temp = 1.0

    hp["punishment_multiplier"] = 3.0
    hp["buf_frac"] = 0.125
    hp["training_intensity"] = 10
    # hp["rollout_length"] = 40
    # hp["n_rollout_replicas"] = 20
    hp["rollout_length"] = 4
    hp["n_rollout_replicas"] = 5

    if "CoinGame" in hp["env_name"]:
        hp["plot_keys"] += vectorized_coin_game.PLOT_KEYS
        hp["plot_assemblage_tags"] += vectorized_coin_game.PLOT_ASSEMBLAGE_TAGS

        hp["n_steps_per_epi"] = 20 if hp["debug"] else 100
        hp["n_epi"] = 10 if hp["debug"] else 4000
        hp["base_lr"] = 0.1
        hp["bs_epi_mul"] = 1
        hp["both_players_can_pick_the_same_coin"] = False
        hp["sgd_momentum"] = 0.9

        hp["lambda"] = 0.96
        hp["alpha"] = 0.0
        hp["beta"] = 0.5

        hp["debit_threshold"] = 30.0
        hp["jitter"] = 0.02
        hp["filter_utilitarian"] = False

        hp["target_network_update_freq"] = 100 * hp["n_steps_per_epi"]
        hp["last_exploration_temp_value"] = 0.03 * mul_temp

        hp["temperature_schedule"] = PiecewiseSchedule(
            endpoints=[
                (0, 2.0 * mul_temp),
                (
                    int(hp["n_steps_per_epi"] * hp["n_epi"] * 0.20),
                    0.5 * mul_temp,
                ),
                (
                    int(hp["n_steps_per_epi"] * hp["n_epi"] * 0.60),
                    hp["last_exploration_temp_value"],
                ),
            ],
            outside_value=hp["last_exploration_temp_value"],
            framework="torch",
        )

        if "AsymCoinGame" in hp["env_name"]:
            hp["x_limits"] = (-0.5, 3.0)
            hp["y_limits"] = (-1.1, 0.6)
            hp["env_class"] = vectorized_coin_game.AsymVectorizedCoinGame
        elif "MixedMotiveCoinGame" in hp["env_name"]:
            if "SSDMixedMotiveCoinGame" in hp["env_name"]:
                hp["debit_threshold"] = 3.0
                hp["x_limits"] = (-0.25, 1.0)
                hp["y_limits"] = (-0.25, 1.5)
                hp["env_class"] = ssd_mixed_motive_coin_game.SSDMixedMotiveCoinGame
            else:
                hp["x_limits"] = (-2.0, 2.0)
                hp["y_limits"] = (-0.5, 3.0)
                hp["env_class"] = vectorized_mixed_motive_coin_game.VectMixedMotiveCG
            hp["both_players_can_pick_the_same_coin"] = True
        else:
            hp["x_limits"] = (-0.5, 0.6)
            hp["y_limits"] = (-0.5, 0.6)
            hp["env_class"] = vectorized_coin_game.VectorizedCoinGame
    else:

        hp["plot_keys"] += matrix_sequential_social_dilemma.PLOT_KEYS
        hp["plot_assemblage_tags"] += matrix_sequential_social_dilemma.PLOT_ASSEMBLAGE_TAGS

        hp["base_lr"] = 0.03
        hp["bs_epi_mul"] = 1
        hp["n_steps_per_epi"] = 20
        hp["n_epi"] = 10 if hp["debug"] else 800
        hp["lambda"] = 0.96
        hp["alpha"] = 0.0
        hp["beta"] = 1.0
        hp["sgd_momentum"] = 0.0

        hp["debit_threshold"] = 10.0

        hp["target_network_update_freq"] = 30 * hp["n_steps_per_epi"]
        hp["last_exploration_temp_value"] = 0.1 * mul_temp

        hp["temperature_schedule"] = PiecewiseSchedule(
            endpoints=[
                (0, 2.0 * mul_temp),
                (
                    int(hp["n_steps_per_epi"] * hp["n_epi"] * 0.33),
                    0.5 * mul_temp,
                ),
                (
                    int(hp["n_steps_per_epi"] * hp["n_epi"] * 0.66),
                    hp["last_exploration_temp_value"],
                ),
            ],
            outside_value=hp["last_exploration_temp_value"],
            framework="torch",
        )

        if "IteratedPrisonersDilemma" in hp["env_name"]:
            hp["filter_utilitarian"] = False
            hp["x_limits"] = (-3.5, 0.5)
            hp["y_limits"] = (-3.5, 0.5)
            hp["utilitarian_filtering_threshold"] = -2.5
            hp["env_class"] = matrix_sequential_social_dilemma.IteratedPrisonersDilemma
        elif "IteratedAsymBoS" in hp["env_name"]:
            hp["x_limits"] = (-0.1, 4.1)
            hp["y_limits"] = (-0.1, 4.1)
            hp["utilitarian_filtering_threshold"] = 3.2
            hp["env_class"] = matrix_sequential_social_dilemma.IteratedAsymBoS
        else:
            raise NotImplementedError(f'hp["env_name"]: {hp["env_name"]}')

    hp["lr_schedule"] = [
        (0, 0.0),
        (int(hp["n_steps_per_epi"] * hp["n_epi"] * 0.05), hp["base_lr"]),
        (int(hp["n_steps_per_epi"] * hp["n_epi"]), hp["base_lr"] / 1e9),
    ]

    hp["plot_axis_scale_multipliers"] = (
        (1 / hp["n_steps_per_epi"]),  # for x axis
        (1 / hp["n_steps_per_epi"]),
    )  # for y axis

    hp["env_class"] = add_RewardUncertaintyEnvClassWrapper(
        env_class=hp["env_class"],
        reward_uncertainty_std=hp["reward_uncertainty"],
    )

    return hp
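
A hypothetical usage sketch for the function above. The minimal set of keys that must already be present in hp ("env_name", "debug", "reward_uncertainty") is inferred from the function body; the real experiment scripts likely pass a larger dict.

hp = {
    "env_name": "IteratedPrisonersDilemma",
    "debug": True,
    "reward_uncertainty": 0.1,
}
hp = modify_hyperparams_for_the_selected_env(hp)
print(hp["env_class"], hp["n_epi"], hp["lr_schedule"])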
Code Example #3
def get_rllib_config(hp: dict):
    stop = {
        "episodes_total": hp["n_epi"],  # 4000 steps in 200 epi
    }

    env_config = {
        "players_ids": ["player_row", "player_col"],
        "max_steps": hp["n_steps_per_epi"],
    }

    MyDQNTorchPolicy = DQNTorchPolicy.with_updates(
        optimizer_fn=sgd_optimizer_dqn,
        stats_fn=log.stats_fn_wt_additionnal_logs(build_q_stats))

    ltft_config = merge_dicts(
        LTFT_DEFAULT_CONFIG_UPDATE,
        {
            "sgd_momentum": 0.9,
            'nested_policies': [
                # Here the trainer needs to be a DQNTrainer to provide the config for the 3 DQNTorchPolicy instances
                {"Policy_class": MyDQNTorchPolicy, "config_update": {}},
                {"Policy_class": MyDQNTorchPolicy, "config_update": {}},
                {"Policy_class": MyDQNTorchPolicy, "config_update": {}},
                {"Policy_class": SPLTorchPolicy.with_updates(optimizer_fn=sgd_optimizer_spl), "config_update": {
                    "learn_action": True,
                    "learn_reward": False,
                    "sgd_momentum": 0.75,
                    "explore": False,
                    "timesteps_per_iteration": hp["n_steps_per_epi"],
                    # === Optimization ===
                    # Learning rate for adam optimizer
                    "lr": hp["base_lr"] * hp["spl_lr_mul"],
                    # Learning rate schedule
                    "lr_schedule": [(0, hp["base_lr"] * hp["spl_lr_mul"]),
                                    (int(hp["n_steps_per_epi"] * hp["n_epi"]), hp["base_lr"] / 1e9)],
                    "loss_fn": torch.nn.CrossEntropyLoss(
                        weight=None,
                        size_average=None,
                        ignore_index=-100,
                        reduce=None,
                        reduction='mean')
                }},
            ],
        }
    )

    MyUncertainIPD = add_RewardUncertaintyEnvClassWrapper(
        IteratedPrisonersDilemma,
        reward_uncertainty_std=0.1)

    rllib_config = {
        "env": MyUncertainIPD,
        "env_config": env_config,
        "multiagent": {
            "policies": {
                "player_row": (
                    # The default policy is DQNTorchPolicy (defined in DQNTrainer), but we override it to use the LTFT policy
                    LTFT,
                    IteratedPrisonersDilemma.OBSERVATION_SPACE,
                    IteratedPrisonersDilemma.ACTION_SPACE,
                    copy.deepcopy(ltft_config)),
                "player_col": (
                    LTFT,
                    IteratedPrisonersDilemma.OBSERVATION_SPACE,
                    IteratedPrisonersDilemma.ACTION_SPACE,
                    copy.deepcopy(ltft_config)),
            },
            "policy_mapping_fn": lambda agent_id: agent_id,
        },

        # === DQN Models ===
        # Minimum env steps to optimize for per train call. This value does
        # not affect learning, only the length of iterations.
        "timesteps_per_iteration": hp["n_steps_per_epi"],
        # Update the target network every `target_network_update_freq` steps.
        "target_network_update_freq": hp["n_steps_per_epi"],
        # === Replay buffer ===
        # Size of the replay buffer. Note that if async_updates is set, then
        # each worker will have a replay buffer of this size.
        "buffer_size": int(hp["n_steps_per_epi"] * hp["n_epi"]),
        # Whether to use dueling dqn
        "dueling": False,
        # Dense-layer setup for each of the advantage branch and the value
        # branch in a dueling architecture.
        "hiddens": [4],
        # Whether to use double dqn
        "double_q": True,
        # If True prioritized replay buffer will be used.
        "prioritized_replay": False,
        "model": {
            # Number of hidden layers for fully connected net
            "fcnet_hiddens": [4, 2],
            # Nonlinearity for fully connected net (tanh, relu)
            "fcnet_activation": "relu",
        },

        "gamma": 0.5,
        "min_iter_time_s": 0.33,
        "seed": tune.grid_search(hp["seeds"]),

        # === Optimization ===
        # Learning rate for adam optimizer
        "lr": hp["base_lr"],
        # Learning rate schedule
        "lr_schedule": [(0, hp["base_lr"]),
                        (int(hp["n_steps_per_epi"] * hp["n_epi"]), hp["base_lr"] / 1e9)],
        # Adam epsilon hyper parameter
        # "adam_epsilon": 1e-8,
        # If not None, clip gradients during optimization at this value
        "grad_clip": 1,
        # How many steps of the model to sample before learning starts.
        "learning_starts": int(hp["n_steps_per_epi"] * hp["bs_epi_mul"]),
        # Update the replay buffer with this many samples at once. Note that
        # this setting applies per-worker if num_workers > 1.
        "rollout_fragment_length": hp["n_steps_per_epi"],
        # Size of a batch sampled from replay buffer for training. Note that
        # if async_updates is set, then each worker returns gradients for a
        # batch of this size.
        "train_batch_size": int(hp["n_steps_per_epi"] * hp["bs_epi_mul"]),

        # === Exploration Settings ===
        # Default exploration behavior, iff `explore`=None is passed into
        # compute_action(s).
        # Set to False for no exploration behavior (e.g., for evaluation).
        "explore": True,
        # Provide a dict specifying the Exploration object's config.
        "exploration_config": {
            # The Exploration class to use. In the simplest case, this is the name
            # (str) of any class present in the `rllib.utils.exploration` package.
            # You can also provide the python class directly or the full location
            # of your class (e.g. "ray.rllib.utils.exploration.epsilon_greedy.
            # EpsilonGreedy").
            "type": exploration.SoftQSchedule,
            # Add constructor kwargs here (if any).
            "temperature_schedule": PiecewiseSchedule(
                endpoints=[
                    (0, 1.0), (int(hp["n_steps_per_epi"] * hp["n_epi"] * 0.75), 0.1)],
                outside_value=0.1,
                framework="torch")
        },

        # General config
        "framework": "torch",
        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
        # LTFT supports only 1 worker; otherwise it would mix several opponents' trajectories
        "num_workers": 0,
        # LTFT supports only 1 env per worker; otherwise several episodes would be played at the same time
        "num_envs_per_worker": 1,
        "batch_mode": "complete_episodes",

        # # === Debug Settings ===
        # # Whether to write episode stats and videos to the agent log dir. This is
        # # typically located in ~/ray_results.
        # "monitor": True,
        # # Set the ray.rllib.* log level for the agent process and its workers.
        # # Should be one of DEBUG, INFO, WARN, or ERROR. The DEBUG level will also
        # # periodically print out summaries of relevant internal dataflow (this is
        # # also printed out once at startup at the INFO level). When using the
        # # `rllib train` command, you can also use the `-v` and `-vv` flags as
        # # shorthand for INFO and DEBUG.
        # "log_level": "INFO",
        # Callbacks that will be run during various phases of training. See the
        # `DefaultCallbacks` class and `examples/custom_metrics_and_callbacks.py`
        # for more usage information.
        # "callbacks": DefaultCallbacks,
        "callbacks": miscellaneous.merge_callbacks(LTFTCallbacks,
                                                   log.get_logging_callbacks_class()),
        # # Whether to attempt to continue training if a worker crashes. The number
        # # of currently healthy workers is reported as the "num_healthy_workers"
        # # metric.
        # "ignore_worker_failures": False,
        # # Log system resource metrics to results. This requires `psutil` to be
        # # installed for sys stats, and `gputil` for GPU metrics.
        # "log_sys_usage": True,
        # # Use fake (infinite speed) sampler. For testing only.
        # "fake_sampler": False,
    }

    return rllib_config, env_config, stop
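
A hypothetical driver sketch showing how the returned values could be fed to Ray Tune. The choice of DQNTrainer as the trainable and the experiment name are assumptions; the project may launch the run differently.

from ray import tune
from ray.rllib.agents.dqn import DQNTrainer

rllib_config, env_config, stop = get_rllib_config(hp)
tune.run(
    DQNTrainer,
    config=rllib_config,
    stop=stop,
    checkpoint_freq=0,
    name="LTFT_IPD",  # hypothetical experiment name
)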
Code Example #4
def _get_rllib_config(hp: dict):
    stop = {
        "episodes_total": hp["n_epi"],
    }

    env_config = _get_env_config(hp)

    my_uncertain_env_class = add_RewardUncertaintyEnvClassWrapper(
        hp["env_class"], reward_uncertainty_std=hp["reward_uncertainty_std"])

    rllib_config = copy.deepcopy(ltft.DEFAULT_CONFIG)
    rllib_config.update({
        "env": my_uncertain_env_class,
        "env_config": env_config,
        "multiagent": {
            "policies": {
                env_config["players_ids"][0]: (
                    None,
                    hp["env_class"]({}).OBSERVATION_SPACE,
                    hp["env_class"].ACTION_SPACE,
                    {},
                ),
                env_config["players_ids"][1]: (
                    None,
                    hp["env_class"]({}).OBSERVATION_SPACE,
                    hp["env_class"].ACTION_SPACE,
                    {},
                ),
            },
            "policy_mapping_fn": lambda agent_id: agent_id,
            # When replay_mode=lockstep, RLlib will replay all the agent
            # transitions at a particular timestep together in a batch. This
            # allows the policy to implement differentiable shared
            # computations between agents it controls at that timestep. When
            # replay_mode=independent, transitions are replayed independently
            # per policy.
            # "replay_mode": "lockstep",
            "observation_fn": ltft.observation_fn,
        },
        # === DQN Models ===
        # Update the target network every `target_network_update_freq` steps.
        "target_network_update_freq": 30 * hp["n_steps_per_epi"],
        # === Replay buffer ===
        # Size of the replay buffer. Note that if async_updates is set, then
        # each worker will have a replay buffer of this size.
        "buffer_size": max(
            int(hp["n_steps_per_epi"] * hp["n_epi"] * hp["buf_frac"]), 5),
        # Whether to use dueling dqn
        "dueling": False,
        # Dense-layer setup for each of the advantage branch and the value
        # branch in a dueling architecture.
        "hiddens": hp["hiddens"],
        # Whether to use double dqn
        "double_q": True,
        # If True prioritized replay buffer will be used.
        "prioritized_replay": False,
        "model": {
            # Number of hidden layers for fully connected net
            "fcnet_hiddens": hp["hiddens"],
            # Nonlinearity for fully connected net (tanh, relu)
            "fcnet_activation": "relu",
        },
        # === Exploration Settings ===
        # Default exploration behavior, iff `explore`=None is passed into
        # compute_action(s).
        # Set to False for no exploration behavior (e.g., for evaluation).
        "explore": True,
        # Provide a dict specifying the Exploration object's config.
        "exploration_config": {
            # The Exploration class to use. In the simplest case,
            # this is the name (str) of any class present in the
            # `rllib.utils.exploration` package.
            # You can also provide the python class directly or
            # the full location of your class (e.g.
            # "ray.rllib.utils.exploration.epsilon_greedy.EpsilonGreedy").
            "type": exploration.SoftQScheduleWtClustering,
            # Add constructor kwargs here (if any).
            "temperature_schedule": hp["temperature_schedule"],
            "clustering_distance": hp["clustering_distance"],
        },
        "gamma": hp["gamma"],
        # Minimum env steps to optimize for per train call. This value does
        # not affect learning, only the length of iterations.
        "timesteps_per_iteration": hp["n_steps_per_epi"] if hp["debug"]
        else int(hp["n_steps_per_epi"] * hp["n_epi"] / hp["log_n_points"]),
        "min_iter_time_s": 0.0,
        "seed": tune.grid_search(hp["seeds"]),
        # === Optimization ===
        "optimizer": {
            "sgd_momentum": hp["sgd_momentum"],
        },
        # Learning rate for adam optimizer
        "lr": hp["base_lr"],
        # Learning rate schedule
        "lr_schedule": hp["lr_schedule"],
        # If not None, clip gradients during optimization at this value
        "grad_clip": 1,
        # How many steps of the model to sample before learning starts.
        "learning_starts": int(hp["n_steps_per_epi"] * hp["bs_epi_mul"]),
        # Update the replay buffer with this many samples at once. Note that
        # this setting applies per-worker if num_workers > 1.
        "rollout_fragment_length": hp["n_steps_per_epi"],
        "training_intensity": hp["training_intensity"],
        # Size of a batch sampled from replay buffer for training. Note that
        # if async_updates is set, then each worker returns gradients for a
        # batch of this size.
        "train_batch_size": int(hp["n_steps_per_epi"] * hp["bs_epi_mul"]),
        # General config
        "framework": "torch",
        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
        # LTFTTorchPolicy supports only 1 worker; otherwise it would mix
        # several opponents' trajectories.
        "num_workers": 0,
        # LTFTTorchPolicy supports only 1 env per worker; otherwise several
        # episodes would be played at the same time.
        "num_envs_per_worker": 1,
        "batch_mode": "complete_episodes",
        "logger_config": {
            "wandb": {
                "project": "LTFT",
                "group": hp["exp_name"],
                "api_key_file": os.path.join(
                    os.path.dirname(__file__), "../../../api_key_wandb"),
                "log_config": True,
            },
        },
        # === Debug Settings ===
        "log_level": "INFO",
        # Callbacks that will be run during various phases of training. See the
        # `DefaultCallbacks` class and
        # `examples/custom_metrics_and_callbacks.py`
        # for more usage information.
        "callbacks": callbacks.merge_callbacks(
            ltft.LTFTCallbacks,
            log.get_logging_callbacks_class(log_full_epi=True),
        ),
    })

    hp, rllib_config, env_config, stop = _modify_config_for_coin_game(
        hp, rllib_config, env_config, stop)

    nested_policies_config = rllib_config["nested_policies"]
    nested_spl_policy_config = nested_policies_config[3]["config_update"]
    nested_spl_policy_config["train_batch_size"] = (int(
        hp["n_steps_per_epi"] * hp["bs_epi_mul_spl"]), )
    rllib_config["nested_policies"] = nested_policies_config

    return rllib_config, env_config, stop
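
For reference, an illustration of how a PiecewiseSchedule like hp["temperature_schedule"] (built in Code Example #2 and consumed by the exploration_config above) behaves: values are linearly interpolated between the endpoints and clamped to outside_value afterwards. The endpoint numbers below are made up for the illustration.

from ray.rllib.utils.schedules import PiecewiseSchedule

schedule = PiecewiseSchedule(
    endpoints=[(0, 2.0), (4000, 0.5), (12000, 0.1)],
    outside_value=0.1,
    framework="torch",
)
for t in [0, 2000, 4000, 12000, 50000]:
    # Temperature decays from 2.0 down to 0.1, then stays at outside_value.
    print(t, schedule.value(t))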