Example #1
    def __init__(self, entropy_coeff, entropy_coeff_schedule):
        self._entropy_coeff_schedule = None
        if entropy_coeff_schedule is None:
            self.entropy_coeff = get_variable(entropy_coeff,
                                              framework="tf",
                                              tf_name="entropy_coeff",
                                              trainable=False)
        else:
            # Allows for custom schedule similar to lr_schedule format
            if isinstance(entropy_coeff_schedule, list):
                self._entropy_coeff_schedule = PiecewiseSchedule(
                    entropy_coeff_schedule,
                    outside_value=entropy_coeff_schedule[-1][-1],
                    framework=None)
            else:
                # Implements previous version but enforces outside_value
                self._entropy_coeff_schedule = PiecewiseSchedule(
                    [[0, entropy_coeff], [entropy_coeff_schedule, 0.0]],
                    outside_value=0.0,
                    framework=None)

            self.entropy_coeff = get_variable(
                self._entropy_coeff_schedule.value(0),
                framework="tf",
                tf_name="entropy_coeff",
                trainable=False)
            if self.framework == "tf":
                self._entropy_coeff_placeholder = tf1.placeholder(
                    dtype=tf.float32, name="entropy_coeff")
                self._entropy_coeff_update = self.entropy_coeff.assign(
                    self._entropy_coeff_placeholder, read_value=False)
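All of the snippets on this page rely on the same schedule semantics: by default, PiecewiseSchedule linearly interpolates between its (timestep, value) endpoints and returns outside_value once the timestep passes the last endpoint. Below is a minimal, dependency-free sketch of that behaviour; it is not RLlib's actual implementation and the piecewise_value helper name is illustrative only.

def piecewise_value(endpoints, t, outside_value):
    """Return the scheduled value at timestep `t` (linear interpolation)."""
    for (left_t, left_v), (right_t, right_v) in zip(endpoints[:-1],
                                                    endpoints[1:]):
        if left_t <= t < right_t:
            alpha = (t - left_t) / (right_t - left_t)
            return left_v + alpha * (right_v - left_v)
    # Before the first endpoint -> first value; at/after the last -> outside_value.
    return outside_value if t >= endpoints[-1][0] else endpoints[0][1]

# E.g. the [[0, entropy_coeff], [entropy_coeff_schedule, 0.0]] form used in
# Example #1 decays linearly from entropy_coeff at t=0 down to 0.0:
assert abs(piecewise_value([(0, 0.01), (10000, 0.0)], 5000, 0.0) - 0.005) < 1e-9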
Example #2
    def __init__(self,
                 action_space,
                 initial_epsilon=1.0,
                 final_epsilon=0.05,
                 epsilon_timesteps=int(1e5),
                 epsilon_schedule=None,
                 framework="tf",
                 **kwargs):
        """Create an EpsilonGreedy exploration class.

        Args:
            action_space (Space): The gym action space used by the environment.
            initial_epsilon (float): The initial epsilon value to use.
            final_epsilon (float): The final epsilon value to use.
            epsilon_timesteps (int): The time step after which epsilon should
                always be `final_epsilon`.
            epsilon_schedule (Optional[Schedule]): An optional Schedule object
                to use (instead of constructing one from the given parameters).
            framework (Optional[str]): One of None, "tf", "torch".
        """
        assert framework is not None
        super().__init__(action_space=action_space,
                         framework=framework,
                         **kwargs)

        self.epsilon_schedule = epsilon_schedule or PiecewiseSchedule(
            endpoints=[(0, initial_epsilon),
                       (epsilon_timesteps, final_epsilon)],
            outside_value=final_epsilon,
            framework=self.framework)

        # The current timestep value (tf-var or python int).
        self.last_timestep = get_variable(0,
                                          framework=framework,
                                          tf_name="timestep")
Example #3
 def __init__(self, lr, lr_schedule):
     self.cur_lr = tf1.get_variable("lr", initializer=lr, trainable=False)
     if lr_schedule is None:
         self.lr_schedule = ConstantSchedule(lr, framework=None)
     else:
         self.lr_schedule = PiecewiseSchedule(
             lr_schedule, outside_value=lr_schedule[-1][-1], framework=None)
Example #4
def modify_config_for_evaluation(config_eval, hp, env_config):
    config_eval["explore"] = False
    config_eval["seed"] = None
    policies = config_eval["multiagent"]["policies"]
    for policy_id in policies.keys():
        policy_config = policies[policy_id][3]
        policy_config["working_state"] = "eval_amtft"
    if not hp["self_play"]:
        naive_player_id = env_config["players_ids"][-1]
        naive_player_policy_config = policies[naive_player_id][3]
        naive_player_policy_config["working_state"] = "eval_naive_selfish"

    if hp["explore_during_evaluation"]:
        tmp_mul = 1.0
        config_eval["explore"] = (miscellaneous.OVERWRITE_KEY, True)
        config_eval["exploration_config"] = {
            "type":
            config_eval["exploration_config"]["type"],
            "temperature_schedule":
            PiecewiseSchedule(
                endpoints=[
                    (0, tmp_mul * hp["last_exploration_temp_value"]),
                    (0, tmp_mul * hp["last_exploration_temp_value"]),
                ],
                outside_value=tmp_mul * hp["last_exploration_temp_value"],
                framework="torch",
            ),
        }

    if hp["debug"] and hp.get("debit_threshold_debug_override", True):
        for policy_id in policies.keys():
            policies[policy_id][3]["debit_threshold"] = 0.5
            policies[policy_id][3]["last_k"] = hp["n_steps_per_epi"] - 1

    return config_eval
Example #5
    def __init__(self,
                 action_space,
                 *,
                 framework: str,
                 model: ModelV2,
                 initial_epsilon=1.0,
                 final_epsilon=0.05,
                 epsilon_timesteps=int(1e5),
                 epsilon_schedule=None,
                 **kwargs):
        """Initializes a StochasticSampling Exploration object.

        Args:
            action_space (Space): The gym action space used by the environment.
            framework (str): One of None, "tf", "torch".
        """
        assert framework is not None
        super().__init__(action_space,
                         model=model,
                         framework=framework,
                         **kwargs)

        self.epsilon_schedule = \
            from_config(Schedule, epsilon_schedule, framework=framework) or \
            PiecewiseSchedule(
                endpoints=[
                    (0, initial_epsilon), (epsilon_timesteps, final_epsilon)],
                outside_value=final_epsilon,
                framework=self.framework)

        self.last_timestep = get_variable(0,
                                          framework=framework,
                                          tf_name="timestep")
Example #6
 def __init__(self, lr, lr_schedule):
     self.cur_lr = lr
     if lr_schedule is None:
         self.lr_schedule = ConstantSchedule(lr, framework=None)
     else:
         self.lr_schedule = PiecewiseSchedule(
             lr_schedule, outside_value=lr_schedule[-1][-1], framework=None)
Example #7
 def __init__(self, lr, lr_schedule):
     self.cur_lr = tf.get_variable("lr", initializer=lr)
     if lr_schedule is None:
         self.lr_schedule = ConstantSchedule(lr)
     else:
         self.lr_schedule = PiecewiseSchedule(
             lr_schedule, outside_value=lr_schedule[-1][-1])
Example #8
    def __init__(self, entropy_coeff, entropy_coeff_schedule):
        self.entropy_coeff = entropy_coeff

        if entropy_coeff_schedule is None:
            self.entropy_coeff_schedule = ConstantSchedule(entropy_coeff)
        else:
            # Allows for custom schedule similar to lr_schedule format
            if isinstance(entropy_coeff_schedule, list):
                self.entropy_coeff_schedule = PiecewiseSchedule(
                    entropy_coeff_schedule,
                    outside_value=entropy_coeff_schedule[-1][-1])
            else:
                # Implements previous version but enforces outside_value
                self.entropy_coeff_schedule = PiecewiseSchedule(
                    [[0, entropy_coeff], [entropy_coeff_schedule, 0.0]],
                    outside_value=0.0)
Example #9
    def __init__(self,
                 action_space,
                 *,
                 framework,
                 initial_temperature=1.0,
                 final_temperature=0.0,
                 temperature_timesteps=int(1e5),
                 temperature_schedule=None,
                 **kwargs):
        """Initializes a SoftQ Exploration object.

        Args:
            action_space (Space): The gym action space used by the environment.
            temperature (Schedule): The temperature to divide model outputs by
                before creating the Categorical distribution to sample from.
            framework (str): One of None, "tf", "torch".
            temperature_schedule (Optional[Schedule]): An optional Schedule object
                to use (instead of constructing one from the given parameters).
        """
        assert isinstance(action_space, Discrete)
        super().__init__(action_space, framework=framework, **kwargs)

        self.temperature_schedule = \
            from_config(Schedule, temperature_schedule, framework=framework) or \
            PiecewiseSchedule(
                endpoints=[
                    (0, initial_temperature), (temperature_timesteps, final_temperature)],
                outside_value=final_temperature,
                framework=self.framework)

        # The current timestep value (tf-var or python int).
        self.last_timestep = get_variable(0,
                                          framework=framework,
                                          tf_name="timestep")
        self.temperature = self.temperature_schedule(self.last_timestep)
Example #10
 def __init__(self, lr, lr_schedule):
     self._lr_schedule = None
     if lr_schedule is None:
         self.cur_lr = lr
     else:
         self._lr_schedule = PiecewiseSchedule(
             lr_schedule, outside_value=lr_schedule[-1][-1], framework=None)
         self.cur_lr = self._lr_schedule.value(0)
Example #11
    def __init__(
        self,
        action_space: gym.spaces.Space,
        *,
        framework: str,
        initial_epsilon: float = 1.0,
        final_epsilon: float = 0.05,
        warmup_timesteps: int = 0,
        epsilon_timesteps: int = int(1e5),
        epsilon_schedule: Optional[Schedule] = None,
        **kwargs,
    ):
        """Create an EpsilonGreedy exploration class.

        Args:
            action_space: The action space the exploration should occur in.
            framework: The framework specifier.
            initial_epsilon: The initial epsilon value to use.
            final_epsilon: The final epsilon value to use.
            warmup_timesteps: The timesteps over which to not change epsilon in the
                beginning.
            epsilon_timesteps: The timesteps (additional to `warmup_timesteps`)
                after which epsilon should always be `final_epsilon`.
                E.g.: warmup_timesteps=20k epsilon_timesteps=50k -> After 70k timesteps,
                epsilon will reach its final value.
            epsilon_schedule: An optional Schedule object
                to use (instead of constructing one from the given parameters).
        """
        assert framework is not None
        super().__init__(action_space=action_space,
                         framework=framework,
                         **kwargs)

        self.epsilon_schedule = from_config(
            Schedule, epsilon_schedule,
            framework=framework) or PiecewiseSchedule(
                endpoints=[
                    (0, initial_epsilon),
                    (warmup_timesteps, initial_epsilon),
                    (warmup_timesteps + epsilon_timesteps, final_epsilon),
                ],
                outside_value=final_epsilon,
                framework=self.framework,
            )

        # The current timestep value (tf-var or python int).
        self.last_timestep = get_variable(
            np.array(0, np.int64),
            framework=framework,
            tf_name="timestep",
            dtype=np.int64,
        )

        # Build the tf-info-op.
        if self.framework == "tf":
            self._tf_state_op = self.get_state()
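To make the warmup shape above concrete, here is a small usage sketch, assuming ray[rllib] is installed and exposes the same PiecewiseSchedule import used by these examples: with warmup_timesteps=20000 and epsilon_timesteps=50000, epsilon holds at 1.0 for the first 20k steps, anneals linearly, and then stays at final_epsilon from step 70k on.

from ray.rllib.utils.schedules import PiecewiseSchedule

# Same endpoint layout as Example #11, with assumed concrete numbers:
# warmup_timesteps=20000, epsilon_timesteps=50000, final_epsilon=0.05.
eps_schedule = PiecewiseSchedule(
    endpoints=[(0, 1.0), (20000, 1.0), (20000 + 50000, 0.05)],
    outside_value=0.05,
    framework=None,  # plain-python schedule, no tf/torch variables
)
for t in (0, 10000, 20000, 45000, 70000, 200000):
    print(t, round(eps_schedule.value(t), 3))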
Example #12
 def __init__(self, lr, lr_schedule):
     self.cur_lr = tf1.get_variable("lr", initializer=lr, trainable=False)
     self._lr_schedule = lr_schedule
     if self._lr_schedule is not None:
         self._lr_schedule = PiecewiseSchedule(
             lr_schedule, outside_value=lr_schedule[-1][-1], framework=None)
         if self.framework == "tf":
             self._lr_placeholder = tf1.placeholder(dtype=tf.float32,
                                                    name="lr")
             self._lr_update = self.cur_lr.assign(self._lr_placeholder,
                                                  read_value=False)
Example #13
    def __init__(self, entropy_coeff, entropy_coeff_schedule):
        self.entropy_coeff = tf.get_variable(
            "entropy_coeff", initializer=entropy_coeff, trainable=False)

        if entropy_coeff_schedule is None:
            self.entropy_coeff_schedule = ConstantSchedule(
                entropy_coeff, framework=None)
        else:
            # Allows for custom schedule similar to lr_schedule format
            if isinstance(entropy_coeff_schedule, list):
                self.entropy_coeff_schedule = PiecewiseSchedule(
                    entropy_coeff_schedule,
                    outside_value=entropy_coeff_schedule[-1][-1],
                    framework=None)
            else:
                # Implements previous version but enforces outside_value
                self.entropy_coeff_schedule = PiecewiseSchedule(
                    [[0, entropy_coeff], [entropy_coeff_schedule, 0.0]],
                    outside_value=0.0,
                    framework=None)
Example #14
 def arrange_for_multi_step_wt_coin_game(self):
     self.initial_temperature = 0.0
     self.final_temperature = 0.0
     self.temperature_timesteps = 0.0
     self.temperature_schedule = PiecewiseSchedule(
         endpoints=[
             (0, 2.0),
             (1000, 0.5),
             (2000, 0.1)],
         outside_value=0.1,
         framework="torch")
     self.init_coin_game_scheduler()
Example #15
 def __init__(self, lr, lr_schedule):
     self.cur_lr = tf.Variable(lr, name="lr", trainable=False)
     # self.cur_lr = tf.get_variable("lr", initializer=lr, trainable=False)
     if lr_schedule is None:
         self.lr_schedule = ConstantSchedule(lr, framework=None)
     else:
         self.lr_schedule = PiecewiseSchedule(
             lr_schedule,
             interpolation=_left_constant_interpolation,
             outside_value=lr_schedule[-1][-1],
             framework=None,
         )
Example #16
 def __init__(self, lr, lr_schedule):
     self.cur_lr = tf.get_variable("lr", initializer=lr, trainable=False)
     if lr_schedule is None:
         self.lr_schedule = ConstantSchedule(lr)
     elif isinstance(lr_schedule, list):
         self.lr_schedule = PiecewiseSchedule(
             lr_schedule, outside_value=lr_schedule[-1][-1])
     elif isinstance(lr_schedule, dict):
         self.lr_schedule = LinearSchedule(
             schedule_timesteps=lr_schedule["schedule_timesteps"],
             initial_p=lr,
             final_p=lr_schedule["final_lr"])
     else:
         raise ValueError('lr_schedule must be either list, dict or None')
Example #17
def modify_hyperparams_for_the_selected_env(hp):
    if "IPD" in hp["env"].NAME:
        hp["n_epi"] = 10 if hp["debug"] else 400
        hp["base_lr"] = 0.01
        hp["x_limits"] = (-3.5, 0.5)
        hp["y_limits"] = (-3.5, 0.5)
    elif "IteratedChicken" in hp["env"].NAME:
        hp["n_epi"] = 10 if hp["debug"] else 400
        hp["debit_threshold"] = 2.0
        hp["x_limits"] = (-11.0, 4.5)
        hp["y_limits"] = (-11.0, 4.5)
        hp["use_adam"] = True
        if hp["use_adam"]:
            hp["base_lr"] = 0.04
        else:
            hp["base_lr"] = 0.01 / 5
    elif "IteratedBoS" in hp["env"].NAME:
        hp["n_epi"] = 10 if hp["debug"] else 800
        hp["base_lr"] = 0.01
        hp["x_limits"] = (0.0, 4.0)
        hp["y_limits"] = (0.0, 4.0)
    elif "CoinGame" in hp["env"].NAME:
        hp["n_epi"] = 10 if hp["debug"] else 4000
        hp["base_lr"] = 0.1
        hp["x_limits"] = (-1.0, 3.0)
        hp["y_limits"] = (-1.0, 1.0)
        hp["gamma"] = 0.9
        hp["lambda"] = 0.9
        hp["alpha"] = 0.0
        hp["beta"] = 0.5
        hp["temperature_schedule"] = PiecewiseSchedule(endpoints=[
            (0, 2.0), (int(hp["n_steps_per_epi"] * hp["n_epi"] * 0.50), 0.1)
        ],
                                                       outside_value=0.1,
                                                       framework="torch")
        hp["debit_threshold"] = 2.0
        hp["jitter"] = 0.02
    else:
        raise NotImplementedError(f'hp["env"]: {hp["env"]}')

    hp["plot_axis_scale_multipliers"] = (
        (1 / hp["n_steps_per_epi"]),  # for x axis
        (1 / hp["n_steps_per_epi"]))  # for y axis

    return hp
Example #18
    def __init__(self,
                 action_space,
                 initial_epsilon=1.0,
                 final_epsilon=0.05,
                 epsilon_timesteps=int(1e5),
                 num_workers=None,
                 worker_index=None,
                 epsilon_schedule=None,
                 framework="tf"):
        """

        Args:
            action_space (Space): The gym action space used by the environment.
            initial_epsilon (float): The initial epsilon value to use.
            final_epsilon (float): The final epsilon value to use.
            epsilon_timesteps (int): The time step after which epsilon should
                always be `final_epsilon`.
            num_workers (Optional[int]): The overall number of workers used.
            worker_index (Optional[int]): The index of the Worker using this
                Exploration.
            epsilon_schedule (Optional[Schedule]): An optional Schedule object
                to use (instead of constructing one from the given parameters).
            framework (Optional[str]): One of None, "tf", "torch".
        """
        # For now, require Discrete action space (may loosen this restriction
        # in the future).
        assert isinstance(action_space, gym.spaces.Discrete)
        assert framework is not None
        super().__init__(action_space=action_space,
                         num_workers=num_workers,
                         worker_index=worker_index,
                         framework=framework)

        self.epsilon_schedule = epsilon_schedule or PiecewiseSchedule(
            endpoints=[(0, initial_epsilon),
                       (epsilon_timesteps, final_epsilon)],
            outside_value=final_epsilon,
            framework=self.framework)

        # The current timestep value (tf-var or python int).
        self.last_timestep = get_variable(0,
                                          framework=framework,
                                          tf_name="timestep")
Example #19
    def __init__(self,
                 action_space,
                 *,
                 framework: str,
                 initial_epsilon=1.0,
                 final_epsilon=0.05,
                 epsilon_timesteps=int(1e5),
                 epsilon_schedule=None,
                 **kwargs):
        """Create an EpsilonGreedy exploration class.

        Args:
            initial_epsilon (float): The initial epsilon value to use.
            final_epsilon (float): The final epsilon value to use.
            epsilon_timesteps (int): The time step after which epsilon should
                always be `final_epsilon`.
            epsilon_schedule (Optional[Schedule]): An optional Schedule object
                to use (instead of constructing one from the given parameters).
        """
        assert framework is not None
        super().__init__(action_space=action_space,
                         framework=framework,
                         **kwargs)

        self.epsilon_schedule = \
            from_config(Schedule, epsilon_schedule, framework=framework) or \
            PiecewiseSchedule(
                endpoints=[
                    (0, initial_epsilon), (epsilon_timesteps, final_epsilon)],
                outside_value=final_epsilon,
                framework=self.framework)

        # The current timestep value (tf-var or python int).
        self.last_timestep = get_variable(0,
                                          framework=framework,
                                          tf_name="timestep")

        # Build the tf-info-op.
        if self.framework == "tf":
            raise ValueError("Torch version does not support "
                             "multiobj epsilon-greedy yet!")
Example #20
    def __init__(self,
                 action_space,
                 *,
                 framework: str,
                 initial_epsilon: float = 1.0,
                 final_epsilon: float = 0.05,
                 epsilon_timesteps: int = int(1e5),
                 epsilon_schedule: Optional[Schedule] = None,
                 **kwargs):
        """Create an EpsilonGreedy exploration class.

        Args:
            initial_epsilon (float): The initial epsilon value to use.
            final_epsilon (float): The final epsilon value to use.
            epsilon_timesteps (int): The time step after which epsilon should
                always be `final_epsilon`.
            epsilon_schedule (Optional[Schedule]): An optional Schedule object
                to use (instead of constructing one from the given parameters).
        """
        assert framework is not None
        super().__init__(action_space=action_space,
                         framework=framework,
                         **kwargs)

        self.epsilon_schedule = \
            from_config(Schedule, epsilon_schedule, framework=framework) or \
            PiecewiseSchedule(
                endpoints=[
                    (0, initial_epsilon), (epsilon_timesteps, final_epsilon)],
                outside_value=final_epsilon,
                framework=self.framework)

        # The current timestep value (tf-var or python int).
        self.last_timestep = get_variable(np.array(0, np.int64),
                                          framework=framework,
                                          tf_name="timestep",
                                          dtype=np.int64)

        # Build the tf-info-op.
        if self.framework in ["tf2", "tf", "tfe"]:
            self._tf_info_op = self.get_info()
Example #21
def make_exploration_schedule(config, worker_index):
    # Use either a different `eps` per worker, or a linear schedule.
    if config["per_worker_exploration"]:
        assert config["num_workers"] > 1, \
            "This requires multiple workers"
        if worker_index >= 0:
            # Exploration constants from the Ape-X paper
            exponent = (
                1 + worker_index / float(config["num_workers"] - 1) * 7)
            return ConstantSchedule(0.4**exponent)
        else:
            # local ev should have zero exploration so that eval rollouts
            # run properly
            return ConstantSchedule(0.0)

    return PiecewiseSchedule(
        endpoints=[
            (0, config["exploration_initial_eps"]),
            (int(config["exploration_fraction"] *
                 config["schedule_max_timesteps"]),
             config["exploration_final_eps"]),
        ],
        outside_value=config["exploration_final_eps"])
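The per-worker branch above applies the Ape-X constants, epsilon_i = 0.4 ** (1 + i / (num_workers - 1) * 7). A quick sketch of the epsilons this yields, assuming a hypothetical config with 8 workers:

# Per-worker epsilons from the Ape-X formula used above, for an assumed
# num_workers=8: worker 0 explores with ~0.4, the last with ~0.0007.
num_workers = 8
epsilons = [
    0.4 ** (1 + i / float(num_workers - 1) * 7) for i in range(num_workers)
]
print([round(eps, 4) for eps in epsilons])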
Example #22
def make_exploration_schedule(config, worker_index):
    # Modification of DQN's schedule to take into account
    # `exploration_ou_noise_scale`
    if config["per_worker_exploration"]:
        assert config["num_workers"] > 1, "This requires multiple workers"
        if worker_index >= 0:
            # FIXME: what do magic constants mean? (0.4, 7)
            max_index = float(config["num_workers"] - 1)
            exponent = 1 + worker_index / max_index * 7
            return ConstantSchedule(0.4**exponent)
        else:
            # local ev should have zero exploration so that eval rollouts
            # run properly
            return ConstantSchedule(0.0)
    elif config["exploration_should_anneal"]:
        return PiecewiseSchedule(
            endpoints=[(0, 1.0), (int(config["exploration_fraction"] *
                                      config["schedule_max_timesteps"]),
                                  config["exploration_final_scale"])],
            outside_value=config["exploration_final_scale"])
    else:
        # *always* add exploration noise
        return ConstantSchedule(1.0)
Example #23
    def __init__(
        self,
        workers,
        learning_starts=1000,
        buffer_size=10000,
        prioritized_replay=True,
        prioritized_replay_alpha=0.6,
        prioritized_replay_beta=0.4,
        prioritized_replay_eps=1e-6,
        final_prioritized_replay_beta=0.4,
        train_batch_size=32,
        before_learn_on_batch=None,
        synchronize_sampling=False,
        prioritized_replay_beta_annealing_timesteps=100000 * 0.2,
    ):
        """Initialize a sync replay optimizer.

        Args:
            workers (WorkerSet): all workers
            learning_starts (int): wait until this many steps have been sampled
                before starting optimization.
            buffer_size (int): max size of the replay buffer
            prioritized_replay (bool): whether to enable prioritized replay
            prioritized_replay_alpha (float): replay alpha hyperparameter
            prioritized_replay_beta (float): replay beta hyperparameter
            prioritized_replay_eps (float): replay eps hyperparameter
            final_prioritized_replay_beta (float): Final value of beta.
            train_batch_size (int): size of batches to learn on
            before_learn_on_batch (function): callback to run before passing
                the sampled batch to learn on
            synchronize_sampling (bool): whether to sample the experiences for
                all policies with the same indices (used in MADDPG).
            prioritized_replay_beta_annealing_timesteps (int): The timestep at
                which PR-beta annealing should end.
        """
        PolicyOptimizer.__init__(self, workers)

        self.replay_starts = learning_starts

        # Linearly annealing beta used in Rainbow paper, stopping at
        # `final_prioritized_replay_beta`.
        self.prioritized_replay_beta = PiecewiseSchedule(
            endpoints=[(0, prioritized_replay_beta),
                       (prioritized_replay_beta_annealing_timesteps,
                        final_prioritized_replay_beta)],
            outside_value=final_prioritized_replay_beta,
            framework=None)
        self.prioritized_replay_eps = prioritized_replay_eps
        self.train_batch_size = train_batch_size
        self.before_learn_on_batch = before_learn_on_batch
        self.synchronize_sampling = synchronize_sampling

        # Stats
        self.update_weights_timer = TimerStat()
        self.sample_timer = TimerStat()
        self.replay_timer = TimerStat()
        self.grad_timer = TimerStat()
        self.learner_stats = {}

        # Set up replay buffer
        if prioritized_replay:

            def new_buffer():
                return PrioritizedReplayBuffer(buffer_size,
                                               alpha=prioritized_replay_alpha)
        else:

            def new_buffer():
                return ReplayBuffer(buffer_size)

        self.replay_buffers = collections.defaultdict(new_buffer)

        if buffer_size < self.replay_starts:
            logger.warning("buffer_size={} < replay_starts={}".format(
                buffer_size, self.replay_starts))
Example #24
def get_rllib_config(hp: dict):
    stop = {
        "episodes_total": hp["n_epi"],  # 4000 steps in 200 epi
    }

    env_config = {
        "players_ids": ["player_row", "player_col"],
        "max_steps": hp["n_steps_per_epi"],
    }

    MyDQNTorchPolicy = DQNTorchPolicy.with_updates(
        optimizer_fn=sgd_optimizer_dqn,
        stats_fn=log.stats_fn_wt_additionnal_logs(build_q_stats))

    ltft_config = merge_dicts(
        LTFT_DEFAULT_CONFIG_UPDATE,
        {
            "sgd_momentum": 0.9,
            'nested_policies': [
                # Here the trainer needs to be a DQNTrainer to provide the config for the 3 DQNTorchPolicy instances
                {"Policy_class": MyDQNTorchPolicy, "config_update": {}},
                {"Policy_class": MyDQNTorchPolicy, "config_update": {}},
                {"Policy_class": MyDQNTorchPolicy, "config_update": {}},
                {"Policy_class": SPLTorchPolicy.with_updates(optimizer_fn=sgd_optimizer_spl), "config_update": {
                    "learn_action": True,
                    "learn_reward": False,
                    "sgd_momentum": 0.75,
                    "explore": False,
                    "timesteps_per_iteration": hp["n_steps_per_epi"],
                    # === Optimization ===
                    # Learning rate for adam optimizer
                    "lr": hp["base_lr"] * hp["spl_lr_mul"],
                    # Learning rate schedule
                    "lr_schedule": [(0, hp["base_lr"] * hp["spl_lr_mul"]),
                                    (int(hp["n_steps_per_epi"] * hp["n_epi"]), hp["base_lr"] / 1e9)],
                    "loss_fn": torch.nn.CrossEntropyLoss(
                        weight=None,
                        size_average=None,
                        ignore_index=-100,
                        reduce=None,
                        reduction='mean')
                }},
            ],
        }
    )

    MyUncertainIPD = add_RewardUncertaintyEnvClassWrapper(
        IteratedPrisonersDilemma,
        reward_uncertainty_std=0.1)

    rllib_config = {
        "env": MyUncertainIPD,
        "env_config": env_config,
        "multiagent": {
            "policies": {
                "player_row": (
                    # The default policy is the DQNTorchPolicy defined in DQNTrainer, but we override it to use the LTFT policy
                    LTFT,
                    IteratedPrisonersDilemma.OBSERVATION_SPACE,
                    IteratedPrisonersDilemma.ACTION_SPACE,
                    copy.deepcopy(ltft_config)),
                "player_col": (
                    LTFT,
                    IteratedPrisonersDilemma.OBSERVATION_SPACE,
                    IteratedPrisonersDilemma.ACTION_SPACE,
                    copy.deepcopy(ltft_config)),
            },
            "policy_mapping_fn": lambda agent_id: agent_id,
        },

        # === DQN Models ===
        # Minimum env steps to optimize for per train call. This value does
        # not affect learning, only the length of iterations.
        "timesteps_per_iteration": hp["n_steps_per_epi"],
        # Update the target network every `target_network_update_freq` steps.
        "target_network_update_freq": hp["n_steps_per_epi"],
        # === Replay buffer ===
        # Size of the replay buffer. Note that if async_updates is set, then
        # each worker will have a replay buffer of this size.
        "buffer_size": int(hp["n_steps_per_epi"] * hp["n_epi"]),
        # Whether to use dueling dqn
        "dueling": False,
        # Dense-layer setup for each the advantage branch and the value branch
        # in a dueling architecture.
        "hiddens": [4],
        # Whether to use double dqn
        "double_q": True,
        # If True prioritized replay buffer will be used.
        "prioritized_replay": False,
        "model": {
            # Number of hidden layers for fully connected net
            "fcnet_hiddens": [4, 2],
            # Nonlinearity for fully connected net (tanh, relu)
            "fcnet_activation": "relu",
        },

        "gamma": 0.5,
        "min_iter_time_s": 0.33,
        "seed": tune.grid_search(hp["seeds"]),

        # === Optimization ===
        # Learning rate for adam optimizer
        "lr": hp["base_lr"],
        # Learning rate schedule
        "lr_schedule": [(0, hp["base_lr"]),
                        (int(hp["n_steps_per_epi"] * hp["n_epi"]), hp["base_lr"] / 1e9)],
        # Adam epsilon hyper parameter
        # "adam_epsilon": 1e-8,
        # If not None, clip gradients during optimization at this value
        "grad_clip": 1,
        # How many steps of the model to sample before learning starts.
        "learning_starts": int(hp["n_steps_per_epi"] * hp["bs_epi_mul"]),
        # Update the replay buffer with this many samples at once. Note that
        # this setting applies per-worker if num_workers > 1.
        "rollout_fragment_length": hp["n_steps_per_epi"],
        # Size of a batch sampled from replay buffer for training. Note that
        # if async_updates is set, then each worker returns gradients for a
        # batch of this size.
        "train_batch_size": int(hp["n_steps_per_epi"] * hp["bs_epi_mul"]),

        # === Exploration Settings ===
        # Default exploration behavior, iff `explore`=None is passed into
        # compute_action(s).
        # Set to False for no exploration behavior (e.g., for evaluation).
        "explore": True,
        # Provide a dict specifying the Exploration object's config.
        "exploration_config": {
            # The Exploration class to use. In the simplest case, this is the name
            # (str) of any class present in the `rllib.utils.exploration` package.
            # You can also provide the python class directly or the full location
            # of your class (e.g. "ray.rllib.utils.exploration.epsilon_greedy.
            # EpsilonGreedy").
            "type": exploration.SoftQSchedule,
            # Add constructor kwargs here (if any).
            "temperature_schedule": PiecewiseSchedule(
                endpoints=[
                    (0, 1.0), (int(hp["n_steps_per_epi"] * hp["n_epi"] * 0.75), 0.1)],
                outside_value=0.1,
                framework="torch")
        },

        # General config
        "framework": "torch",
        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
        # LTFT supports only 1 worker, otherwise it would mix several opponents' trajectories
        "num_workers": 0,
        # LTFT supports only 1 env per worker, otherwise several episodes would be played at the same time
        "num_envs_per_worker": 1,
        "batch_mode": "complete_episodes",

        # # === Debug Settings ===
        # # Whether to write episode stats and videos to the agent log dir. This is
        # # typically located in ~/ray_results.
        # "monitor": True,
        # # Set the ray.rllib.* log level for the agent process and its workers.
        # # Should be one of DEBUG, INFO, WARN, or ERROR. The DEBUG level will also
        # # periodically print out summaries of relevant internal dataflow (this is
        # # also printed out once at startup at the INFO level). When using the
        # # `rllib train` command, you can also use the `-v` and `-vv` flags as
        # # shorthand for INFO and DEBUG.
        # "log_level": "INFO",
        # Callbacks that will be run during various phases of training. See the
        # `DefaultCallbacks` class and `examples/custom_metrics_and_callbacks.py`
        # for more usage information.
        # "callbacks": DefaultCallbacks,
        "callbacks": miscellaneous.merge_callbacks(LTFTCallbacks,
                                                   log.get_logging_callbacks_class()),
        # # Whether to attempt to continue training if a worker crashes. The number
        # # of currently healthy workers is reported as the "num_healthy_workers"
        # # metric.
        # "ignore_worker_failures": False,
        # # Log system resource metrics to results. This requires `psutil` to be
        # # installed for sys stats, and `gputil` for GPU metrics.
        # "log_sys_usage": True,
        # # Use fake (infinite speed) sampler. For testing only.
        # "fake_sampler": False,
    }

    return rllib_config, env_config, stop
Example #25
def modify_hyperparams_for_the_selected_env(hp):
    hp["plot_keys"] = (amTFT.PLOT_KEYS +
                       aggregate_and_plot_tensorboard_data.PLOT_KEYS)
    hp["plot_assemblage_tags"] = (
        amTFT.PLOT_ASSEMBLAGE_TAGS +
        aggregate_and_plot_tensorboard_data.PLOT_ASSEMBLAGE_TAGS)
    mul_temp = 1.0

    hp["punishment_multiplier"] = 3.0
    hp["buf_frac"] = 0.125
    hp["training_intensity"] = 10
    # hp["rollout_length"] = 40
    # hp["n_rollout_replicas"] = 20
    hp["rollout_length"] = 4
    hp["n_rollout_replicas"] = 5

    if "CoinGame" in hp["env_name"]:
        hp["plot_keys"] += vectorized_coin_game.PLOT_KEYS
        hp["plot_assemblage_tags"] += vectorized_coin_game.PLOT_ASSEMBLAGE_TAGS

        hp["n_steps_per_epi"] = 20 if hp["debug"] else 100
        hp["n_epi"] = 10 if hp["debug"] else 4000
        hp["base_lr"] = 0.1
        hp["bs_epi_mul"] = 1
        hp["both_players_can_pick_the_same_coin"] = False
        hp["sgd_momentum"] = 0.9

        hp["lambda"] = 0.96
        hp["alpha"] = 0.0
        hp["beta"] = 0.5

        hp["debit_threshold"] = 30.0
        hp["jitter"] = 0.02
        hp["filter_utilitarian"] = False

        hp["target_network_update_freq"] = 100 * hp["n_steps_per_epi"]
        hp["last_exploration_temp_value"] = 0.03 * mul_temp

        hp["temperature_schedule"] = PiecewiseSchedule(
            endpoints=[
                (0, 2.0 * mul_temp),
                (
                    int(hp["n_steps_per_epi"] * hp["n_epi"] * 0.20),
                    0.5 * mul_temp,
                ),
                (
                    int(hp["n_steps_per_epi"] * hp["n_epi"] * 0.60),
                    hp["last_exploration_temp_value"],
                ),
            ],
            outside_value=hp["last_exploration_temp_value"],
            framework="torch",
        )

        if "AsymCoinGame" in hp["env_name"]:
            hp["x_limits"] = (-0.5, 3.0)
            hp["y_limits"] = (-1.1, 0.6)
            hp["env_class"] = vectorized_coin_game.AsymVectorizedCoinGame
        elif "MixedMotiveCoinGame" in hp["env_name"]:
            if "SSDMixedMotiveCoinGame" in hp["env_name"]:
                hp["debit_threshold"] = 3.0
                hp["x_limits"] = (-0.25, 1.0)
                hp["y_limits"] = (-0.25, 1.5)
                hp["env_class"] = ssd_mixed_motive_coin_game.SSDMixedMotiveCoinGame
            else:
                hp["x_limits"] = (-2.0, 2.0)
                hp["y_limits"] = (-0.5, 3.0)
                hp["env_class"] = vectorized_mixed_motive_coin_game.VectMixedMotiveCG
            hp["both_players_can_pick_the_same_coin"] = True
        else:
            hp["x_limits"] = (-0.5, 0.6)
            hp["y_limits"] = (-0.5, 0.6)
            hp["env_class"] = vectorized_coin_game.VectorizedCoinGame
    else:

        hp["plot_keys"] += matrix_sequential_social_dilemma.PLOT_KEYS
        hp["plot_assemblage_tags"] += matrix_sequential_social_dilemma.PLOT_ASSEMBLAGE_TAGS

        hp["base_lr"] = 0.03
        hp["bs_epi_mul"] = 1
        hp["n_steps_per_epi"] = 20
        hp["n_epi"] = 10 if hp["debug"] else 800
        hp["lambda"] = 0.96
        hp["alpha"] = 0.0
        hp["beta"] = 1.0
        hp["sgd_momentum"] = 0.0

        hp["debit_threshold"] = 10.0

        hp["target_network_update_freq"] = 30 * hp["n_steps_per_epi"]
        hp["last_exploration_temp_value"] = 0.1 * mul_temp

        hp["temperature_schedule"] = PiecewiseSchedule(
            endpoints=[
                (0, 2.0 * mul_temp),
                (
                    int(hp["n_steps_per_epi"] * hp["n_epi"] * 0.33),
                    0.5 * mul_temp,
                ),
                (
                    int(hp["n_steps_per_epi"] * hp["n_epi"] * 0.66),
                    hp["last_exploration_temp_value"],
                ),
            ],
            outside_value=hp["last_exploration_temp_value"],
            framework="torch",
        )

        if "IteratedPrisonersDilemma" in hp["env_name"]:
            hp["filter_utilitarian"] = False
            hp["x_limits"] = (-3.5, 0.5)
            hp["y_limits"] = (-3.5, 0.5)
            hp["utilitarian_filtering_threshold"] = -2.5
            hp["env_class"] = matrix_sequential_social_dilemma.IteratedPrisonersDilemma
        elif "IteratedAsymBoS" in hp["env_name"]:
            hp["x_limits"] = (-0.1, 4.1)
            hp["y_limits"] = (-0.1, 4.1)
            hp["utilitarian_filtering_threshold"] = 3.2
            hp["env_class"] = matrix_sequential_social_dilemma.IteratedAsymBoS
        else:
            raise NotImplementedError(f'hp["env_name"]: {hp["env_name"]}')

    hp["lr_schedule"] = [
        (0, 0.0),
        (int(hp["n_steps_per_epi"] * hp["n_epi"] * 0.05), hp["base_lr"]),
        (int(hp["n_steps_per_epi"] * hp["n_epi"]), hp["base_lr"] / 1e9),
    ]

    hp["plot_axis_scale_multipliers"] = (
        (1 / hp["n_steps_per_epi"]),  # for x axis
        (1 / hp["n_steps_per_epi"]),
    )  # for y axis

    hp["env_class"] = add_RewardUncertaintyEnvClassWrapper(
        env_class=hp["env_class"],
        reward_uncertainty_std=hp["reward_uncertainty"],
    )

    return hp
Example #26
def get_rllib_config(hp, welfare_fn):
    stop = {
        "episodes_total": hp["n_epi"],
    }

    env_config = get_env_config(hp)
    policies = get_policies(hp, env_config, welfare_fn)

    selected_seeds = hp["seeds"][:hp["train_n_replicates"]]
    hp["seeds"] = hp["seeds"][hp["train_n_replicates"]:]

    trainer_config_update = {
        "env":
        hp["env"],
        "env_config":
        env_config,
        "multiagent": {
            "policies": policies,
            "policy_mapping_fn": lambda agent_id: agent_id,
        },
        "gamma":
        hp["gamma"],
        "min_iter_time_s":
        hp["min_iter_time_s"],
        "seed":
        tune.grid_search(selected_seeds),

        # === Optimization ===
        # Learning rate for adam optimizer
        "lr":
        hp["base_lr"],
        # Learning rate schedule
        "lr_schedule":
        [(0, hp["base_lr"]),
         (int(hp["n_steps_per_epi"] * hp["n_epi"]), hp["base_lr"] / 1e9)],
        # Adam epsilon hyper parameter
        # "adam_epsilon": 1e-8,
        # If not None, clip gradients during optimization at this value
        "grad_clip":
        1,
        # Update the replay buffer with this many samples at once. Note that
        # this setting applies per-worker if num_workers > 1.
        "rollout_fragment_length":
        hp["n_steps_per_epi"],
        # Size of a batch sampled from replay buffer for training. Note that
        # if async_updates is set, then each worker returns gradients for a
        # batch of this size.
        "train_batch_size":
        int(hp["n_steps_per_epi"] * hp["bs_epi_mul"]),

        # Minimum env steps to optimize for per train call. This value does
        # not affect learning, only the length of iterations.
        "timesteps_per_iteration":
        hp["n_steps_per_epi"],

        # General config
        "framework":
        "torch",
        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
        "num_gpus":
        int(os.environ.get("RLLIB_NUM_GPUS", "0")),
        # LE supports only 1 worker, otherwise it would mix several opponents' trajectories
        "num_workers":
        0,
        # LE supports only 1 env per worker, otherwise several episodes would be played at the same time
        "num_envs_per_worker":
        1,

        # Callbacks that will be run during various phases of training. See the
        # `DefaultCallbacks` class and `examples/custom_metrics_and_callbacks.py`
        # for more usage information.
        "callbacks":
        amTFT.get_amTFTCallBacks(additionnal_callbacks=[
            log.get_logging_callbacks_class(),
            # This only overwrites the reward used for training, not the one in the metrics
            postprocessing.OverwriteRewardWtWelfareCallback
        ]),
        # "log_level": "INFO",
    }

    trainer_config_update.update({
        # === DQN Models ===
        # Update the target network every `target_network_update_freq` steps.
        "target_network_update_freq":
        hp["n_steps_per_epi"],
        # === Replay buffer ===
        # Size of the replay buffer. Note that if async_updates is set, then
        # each worker will have a replay buffer of this size.
        "buffer_size":
        int(hp["n_steps_per_epi"] * hp["n_epi"]) // 4,
        # Whether to use dueling dqn
        "dueling":
        False,
        # Dense-layer setup for each the advantage branch and the value branch
        # in a dueling architecture.
        "hiddens":
        hp["hiddens"],
        # Whether to use double dqn
        "double_q":
        True,
        # If True prioritized replay buffer will be used.
        "prioritized_replay":
        False,
        "model": {
            # Number of hidden layers for fully connected net
            "fcnet_hiddens": hp["hiddens"],
            # Nonlinearity for fully connected net (tanh, relu)
            "fcnet_activation": "relu",
        },

        # How many steps of the model to sample before learning starts.
        "learning_starts":
        int(hp["n_steps_per_epi"] * hp["bs_epi_mul"]),

        # === Exploration Settings ===
        # Default exploration behavior, iff `explore`=None is passed into
        # compute_action(s).
        # Set to False for no exploration behavior (e.g., for evaluation).
        "explore":
        True,
        # Provide a dict specifying the Exploration object's config.
        "exploration_config": {
            # The Exploration class to use. In the simplest case, this is the name
            # (str) of any class present in the `rllib.utils.exploration` package.
            # You can also provide the python class directly or the full location
            # of your class (e.g. "ray.rllib.utils.exploration.epsilon_greedy.
            # EpsilonGreedy").
            "type":
            exploration.SoftQSchedule,
            # Add constructor kwargs here (if any).
            "temperature_schedule":
            hp["temperature_schedule"] or PiecewiseSchedule(endpoints=[
                (0, 10.0),
                (int(hp["n_steps_per_epi"] * hp["n_epi"] * 0.33), 1.0),
                (int(hp["n_steps_per_epi"] * hp["n_epi"] * 0.66), 0.1)
            ],
                                                            outside_value=0.1,
                                                            framework="torch"),
        },
    })

    if hp["env"] in [coin_game.CoinGame, coin_game.AsymCoinGame]:
        trainer_config_update["model"] = {
            "dim": env_config["grid_size"],
            # [Channel, [Kernel, Kernel], Stride]
            "conv_filters": [[16, [3, 3], 1], [32, [3, 3], 1]],
        }

    return stop, env_config, trainer_config_update
Example #27
def get_rllib_config(hp: dict, lvl1_idx: list, lvl1_training: bool):
    assert lvl1_training

    tune_config, _, env_config = get_tune_config(hp=hp)
    tune_config["seed"] = 2020

    stop = {"episodes_total": hp["n_epi"]}

    after_init_fn = functools.partial(
        miscellaneous.sequence_of_fn_wt_same_args,
        function_list=[restore.after_init_load_policy_checkpoint, after_init],
    )

    def sgd_optimizer_dqn(policy, config) -> "torch.optim.Optimizer":
        return torch.optim.SGD(
            policy.q_func_vars,
            lr=policy.cur_lr,
            momentum=config["sgd_momentum"],
        )

    MyDQNTorchPolicy = DQNTorchPolicy.with_updates(
        stats_fn=log.augment_stats_fn_wt_additionnal_logs(build_q_stats),
        optimizer_fn=sgd_optimizer_dqn,
        after_init=after_init_fn,
    )

    if tune_config["env_class"] in (
            IteratedPrisonersDilemma,
            IteratedBoS,
            IteratedAsymChicken,
            IteratedAsymBoS,
    ):
        env_config.update({
            "max_steps": hp["n_steps_per_epi"],
        })

    elif tune_config["env_class"] in (
            VectorizedCoinGame,
            AsymVectorizedCoinGame,
    ):
        env_config.update({
            "max_steps": hp["n_steps_per_epi"],
            "batch_size": 1,
        })

    else:
        raise ValueError()

    tune_config["TuneTrainerClass"] = hp["tune_class"]
    tune_config["env_config"] = env_config
    policies = {}
    for policy_idx, policy_id in enumerate(env_config["players_ids"]):
        if policy_idx not in lvl1_idx:
            policies[policy_id] = (
                policy.get_tune_policy_class(DQNTorchPolicy),
                tune_config["env_class"](env_config).OBSERVATION_SPACE,
                tune_config["env_class"].ACTION_SPACE,
                {
                    "sgd_momentum": hp["sgd_momentum"],
                    "tune_config": tune_config,
                },
            )
        else:
            policies[policy_id] = (
                MyDQNTorchPolicy,
                tune_config["env_class"](env_config).OBSERVATION_SPACE,
                tune_config["env_class"].ACTION_SPACE,
                {
                    "sgd_momentum": hp["sgd_momentum"]
                },
            )

    rllib_config = {
        "env":
        tune_config["env_class"],
        "env_config":
        env_config,
        "multiagent": {
            "policies": policies,
            "policy_mapping_fn": lambda agent_id: agent_id,
        },
        # === DQN Models ===
        # Minimum env steps to optimize for per train call. This value does
        # not affect learning, only the length of iterations.
        "timesteps_per_iteration":
        hp["n_steps_per_epi"],
        # Update the target network every `target_network_update_freq` steps.
        "target_network_update_freq":
        hp["n_steps_per_epi"],
        # === Replay buffer ===
        # Size of the replay buffer. Note that if async_updates is set, then
        # each worker will have a replay buffer of this size.
        "buffer_size":
        int(hp["n_steps_per_epi"] * hp["n_epi"]) // 4,
        # Whether to use dueling dqn
        "dueling":
        False,
        # Dense-layer setup for each the advantage branch and the value branch
        # in a dueling architecture.
        "hiddens": [64],
        # Whether to use double dqn
        "double_q":
        True,
        # If True prioritized replay buffer will be used.
        "prioritized_replay":
        False,
        "model": {
            # Number of hidden layers for fully connected net
            "fcnet_hiddens": [64],
            # Nonlinearity for fully connected net (tanh, relu)
            "fcnet_activation": "relu",
        },
        "gamma":
        hp["gamma"],
        "min_iter_time_s":
        3.0,
        # Can't restore stuff with search
        # "seed": hp["seed"],
        "seed":
        tune.grid_search(
            hp["lvl1_seeds"] if lvl1_training else hp["lvl0_seeds"]),
        # "evaluation_num_episodes": 100,
        # "evaluation_interval": hparams["n_epi"],
        # === Optimization ===
        # Learning rate for adam optimizer
        "lr":
        hp["base_lr"],
        # Learning rate schedule
        "lr_schedule": [
            (0, hp["base_lr"]),
            (int(hp["n_steps_per_epi"] * hp["n_epi"]), hp["base_lr"] / 1e9),
        ],
        # Adam epsilon hyper parameter
        # "adam_epsilon": 1e-8,
        # If not None, clip gradients during optimization at this value
        "grad_clip":
        1,
        # How many steps of the model to sample before learning starts.
        "learning_starts":
        int(hp["n_steps_per_epi"] * hp["bs_epi_mul"]),
        # Update the replay buffer with this many samples at once. Note that
        # this setting applies per-worker if num_workers > 1.
        "rollout_fragment_length":
        hp["n_steps_per_epi"],
        # Size of a batch sampled from replay buffer for training. Note that
        # if async_updates is set, then each worker returns gradients for a
        # batch of this size.
        "train_batch_size":
        int(hp["n_steps_per_epi"] * hp["bs_epi_mul"]),
        # === Exploration Settings ===
        # Default exploration behavior, iff `explore`=None is passed into
        # compute_action(s).
        # Set to False for no exploration behavior (e.g., for evaluation).
        "explore":
        True,
        # Provide a dict specifying the Exploration object's config.
        "exploration_config": {
            # The Exploration class to use. In the simplest case,
            # this is the name (str) of any class present in the
            # `rllib.utils.exploration` package.
            # You can also provide the python class directly or
            # the full location of your class (e.g.
            # "ray.rllib.utils.exploration.epsilon_greedy.EpsilonGreedy").
            "type":
            exploration.SoftQSchedule,
            # Add constructor kwargs here (if any).
            "temperature_schedule":
            hp["temperature_schedule"] or PiecewiseSchedule(
                endpoints=[
                    (0, 10.0),
                    (int(hp["n_steps_per_epi"] * hp["n_epi"] * 0.33), 1.0),
                    (int(hp["n_steps_per_epi"] * hp["n_epi"] * 0.66), 0.1),
                ],
                outside_value=0.1,
                framework="torch",
            ),
        },
        # General config
        "framework":
        "torch",
        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
        "num_gpus":
        int(os.environ.get("RLLIB_NUM_GPUS", "0")),
        # LE supports only 1 worker,
        # otherwise it would mix several opponents' trajectories
        "num_workers":
        0,
        # LE supports only 1 env per worker,
        # otherwise several episodes would be played at the same time
        "num_envs_per_worker":
        1,
        # Callbacks that will be run during various phases of training. See the
        # `DefaultCallbacks` class and
        # `examples/custom_metrics_and_callbacks.py`
        # for more usage information.
        "callbacks":
        callbacks.merge_callbacks(
            log.get_logging_callbacks_class(), callbacks.PolicyCallbacks
            # population.PopulationOfIdenticalAlgoCallBacks
        ),
        "log_level":
        "INFO",
    }

    if "CoinGame" in hp["env_name"]:
        rllib_config["model"] = {
            "dim": env_config["grid_size"],
            # [Channel, [Kernel, Kernel], Stride]
            "conv_filters": [[16, [3, 3], 1], [32, [3, 3], 1]],
        }

    return stop, env_config, rllib_config
Example #28
def train_lvl1_agents(tune_hp, rllib_hp, results_list_lvl0):
    lvl0_policy_idx = 1
    lvl1_policy_idx = 0

    if tune_hp["env_name"] == "IteratedPrisonersDilemma":
        rllib_hp["n_epi"] = 3 if rllib_hp["debug"] else 400
        rllib_hp["base_lr"] = 0.04
        rllib_hp["x_limits"] = ((-3.5, 0.5), )
        rllib_hp["y_limits"] = ((-3.5, 0.5), )
    elif tune_hp["env_name"] == "IteratedAsymChicken":
        rllib_hp["n_epi"] = 3 if rllib_hp["debug"] else 400
        rllib_hp["base_lr"] = 0.04
        rllib_hp["x_limits"] = ((-11.0, 4.0), )
        rllib_hp["y_limits"] = ((-11.0, 4.0), )
    elif tune_hp["env_name"] in ("IteratedBoS", "IteratedAsymBoS"):
        rllib_hp["n_epi"] = 3 if rllib_hp["debug"] else 800
        rllib_hp["base_lr"] = 0.01
        rllib_hp["x_limits"] = ((-0.5, 4.5), )
        rllib_hp["y_limits"] = ((-0.5, 4.5), )
        rllib_hp["temperature_schedule"] = PiecewiseSchedule(
            endpoints=[
                (0, 10.0),
                (
                    int(tune_hp["n_steps_per_epi"] * tune_hp["n_epi"] * 0.33),
                    2.0,
                ),
                (
                    int(tune_hp["n_steps_per_epi"] * tune_hp["n_epi"] * 0.66),
                    0.1,
                ),
            ],
            outside_value=0.1,
            framework="torch",
        )
    elif "CoinGame" in tune_hp["env_name"]:
        rllib_hp["n_epi"] = 3 if rllib_hp["debug"] else 4000
        rllib_hp["base_lr"] = 0.1
        rllib_hp["x_limits"] = ((-1.0, 3.0), )
        rllib_hp["y_limits"] = ((-1.0, 1.0), )
        rllib_hp["temperature_schedule"] = PiecewiseSchedule(
            endpoints=[
                (0, 2.0),
                (
                    int(rllib_hp["n_steps_per_epi"] * rllib_hp["n_epi"] *
                        0.50),
                    0.1,
                ),
            ],
            outside_value=0.1,
            framework="torch",
        )
        rllib_hp["jitter"] = 0.02
    else:
        raise NotImplementedError(f'rllib_hp["env"]: {rllib_hp["env"]}')

    tune_hp.update(rllib_hp)
    stop, env_config, rllib_config = get_rllib_config(
        tune_hp, lvl1_idx=[lvl1_policy_idx], lvl1_training=True)

    if tune_hp["load_population"] is None:
        lvl0_checkpoints = miscellaneous.extract_checkpoints(results_list_lvl0)
    else:
        lvl0_checkpoints = tune_hp["load_population"]
    lvl0_policy_id = env_config["players_ids"][lvl0_policy_idx]
    lvl1_policy_id = env_config["players_ids"][lvl1_policy_idx]

    l1br_configuration_helper = lvl1_best_response.L1BRConfigurationHelper(
        rllib_config, lvl0_policy_id, lvl1_policy_id)
    l1br_configuration_helper.define_exp(
        use_n_lvl0_agents_in_each_population=len(tune_hp["lvl0_seeds"]) //
        len(tune_hp["lvl1_seeds"]),
        train_n_lvl1_agents=len(tune_hp["lvl1_seeds"]),
        lvl0_checkpoints=lvl0_checkpoints,
    )
    rllib_config = l1br_configuration_helper.prepare_config_for_lvl1_training()

    results = ray.tune.run(
        DQNTrainer,
        config=rllib_config,
        stop=stop,
        name=tune_hp["exp_name"],
        checkpoint_at_end=True,
        metric="episode_reward_mean",
        mode="max",
    )

    return results
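
To make the temperature schedules above concrete, here is a small illustration (not from the original code) of how RLlib's `PiecewiseSchedule` evaluates them; the episode and step counts are assumed values. Between endpoints the schedule interpolates linearly by default, and past the last endpoint it returns `outside_value`.

from ray.rllib.utils.schedules import PiecewiseSchedule

n_steps_per_epi, n_epi = 20, 4000  # assumed values, for illustration only
temperature_schedule = PiecewiseSchedule(
    endpoints=[(0, 2.0), (int(n_steps_per_epi * n_epi * 0.50), 0.1)],
    outside_value=0.1,
    framework="torch",
)
print(temperature_schedule.value(0))                        # 2.0 at the first step
print(temperature_schedule.value(n_steps_per_epi * n_epi))  # 0.1 once past the last endpoint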
Example #29
0
def _add_search_to_config(rllib_config, stop_config, hp):
    rllib_config["num_envs_per_worker"] = tune.grid_search([1, 4, 8, 16, 32])
    rllib_config["lr"] = tune.grid_search([0.1, 0.1 * 2, 0.1 * 4])
    rllib_config["model"] = {
        "dim": 3,
        "conv_filters": [[16, [3, 3], 1], [16, [3, 3], 1]],
        "fcnet_hiddens": [256, 256],
    }
    rllib_config["hiddens"] = [32]
    rllib_config["env_config"] = {
        "players_ids": ["player_red", "player_blue"],
        "max_steps": 100,
        "grid_size": 3,
        "get_additional_info": True,
        "temp_mid_step": 0.6,
        "bs_epi_mul": tune.grid_search([2, 4, 8]),
    }
    rllib_config["training_intensity"] = 10

    stop_config["episodes_total"] = tune.grid_search([1000, 2000])

    rllib_config["exploration_config"] = {
        # The Exploration class to use. In the simplest case,
        # this is the name (str) of any class present in the
        # `rllib.utils.exploration` package.
        # You can also provide the python class directly or
        # the full location of your class (e.g.
        # "ray.rllib.utils.exploration.epsilon_greedy.EpsilonGreedy").
        # "type": exploration.SoftQSchedule,
        "type": exploration.SoftQSchedule,
        # Add constructor kwargs here (if any).
        "temperature_schedule": tune.sample_from(
            lambda spec: PiecewiseSchedule(
                endpoints=[
                    (0, 2.0),
                    (
                        int(
                            spec.config["env_config"]["max_steps"]
                            * spec.stop["episodes_total"]
                            * 0.20
                        ),
                        0.5,
                    ),
                    (
                        int(
                            spec.config["env_config"]["max_steps"]
                            * spec.stop["episodes_total"]
                            * spec.config["env_config"]["temp_mid_step"]
                        ),
                        hp["last_exploration_temp_value"],
                    ),
                ],
                outside_value=hp["last_exploration_temp_value"],
                framework="torch",
            )
        ),
    }
    rllib_config["train_batch_size"] = tune.sample_from(
        lambda spec: int(
            spec.config["env_config"]["max_steps"]
            * spec.config["env_config"]["bs_epi_mul"]
        )
    )

    return rllib_config, stop_config
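
As a rough guide to what the `tune.sample_from` lambdas above resolve to, here is the same arithmetic in plain Python for one hypothetical trial; the grid-search picks and the `hp` temperature value are assumptions.

max_steps = 100        # env_config["max_steps"] above
temp_mid_step = 0.6    # env_config["temp_mid_step"] above
episodes_total = 1000  # one of the grid-searched stop values
bs_epi_mul = 4         # one of the grid-searched batch multipliers
last_temp = 0.003      # assumed hp["last_exploration_temp_value"]

train_batch_size = int(max_steps * bs_epi_mul)                      # 400
temp_breakpoints = [
    (0, 2.0),
    (int(max_steps * episodes_total * 0.20), 0.5),                  # (20000, 0.5)
    (int(max_steps * episodes_total * temp_mid_step), last_temp),   # (60000, 0.003)
]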
Example #30
0
def _get_rllib_configs(hp, env_class=None):
    stop_config = {
        "episodes_total": 2 if hp["debug"] else hp["n_epi"],
    }

    env_config = {
        "players_ids": ["player_red", "player_blue"],
        "max_steps": hp["n_steps_per_epi"],
        "grid_size": 3,
        "get_additional_info": True,
    }

    env_class = coin_game.CoinGame if env_class is None else env_class
    rllib_config = {
        "env": env_class,
        "env_config": env_config,

        "multiagent": {
            "policies": {
                env_config["players_ids"][0]: (
                    augmented_dqn.MyDQNTorchPolicy,
                    env_class(env_config).OBSERVATION_SPACE,
                    env_class.ACTION_SPACE,
                    {}),
                env_config["players_ids"][1]: (
                    augmented_dqn.MyDQNTorchPolicy,
                    env_class(env_config).OBSERVATION_SPACE,
                    env_class.ACTION_SPACE,
                    {}),
            },
            "policy_mapping_fn": lambda agent_id: agent_id,
        },

        # === DQN Models ===

        # Update the target network every `target_network_update_freq` steps.
        "target_network_update_freq": tune.sample_from(
            lambda spec: int(spec.config["env_config"]["max_steps"] * 30)),
        # === Replay buffer ===
        # Size of the replay buffer. Note that if async_updates is set, then
        # each worker will have a replay buffer of this size.
        "buffer_size": tune.sample_from(
            lambda spec: int(spec.config["env_config"]["max_steps"] *
                             spec.stop["episodes_total"] * hp["buf_frac"])),
        # Whether to use dueling dqn
        "dueling": False,
        # Whether to use double dqn
        "double_q": True,
        # If True prioritized replay buffer will be used.
        "prioritized_replay": False,

        "rollout_fragment_length": tune.sample_from(
            lambda spec: spec.config["env_config"]["max_steps"]),
        "training_intensity": 10,
        # Size of a batch sampled from replay buffer for training. Note that
        # if async_updates is set, then each worker returns gradients for a
        # batch of this size.
        "train_batch_size": tune.sample_from(
            lambda spec: int(spec.config["env_config"]["max_steps"] *
                             hp["bs_epi_mul"])),
        "batch_mode": "complete_episodes",

        # === Exploration Settings ===
        # Default exploration behavior, iff `explore`=None is passed into
        # compute_action(s).
        # Set to False for no exploration behavior (e.g., for evaluation).
        "explore": True,
        # Provide a dict specifying the Exploration object's config.
        "exploration_config": {
            # The Exploration class to use. In the simplest case,
            # this is the name (str) of any class present in the
            # `rllib.utils.exploration` package.
            # You can also provide the python class directly or
            # the full location of your class (e.g.
            # "ray.rllib.utils.exploration.epsilon_greedy.EpsilonGreedy").
            # "type": exploration.SoftQSchedule,
            "type": exploration.SoftQSchedule,
            # Add constructor kwargs here (if any).
            "temperature_schedule": tune.sample_from(
                lambda spec: PiecewiseSchedule(
                    endpoints=[
                        (0, 2.0),
                        (int(spec.config["env_config"]["max_steps"] *
                             spec.stop["episodes_total"] * 0.20),
                         0.5),
                        (int(spec.config["env_config"]["max_steps"] *
                             spec.stop["episodes_total"] * 0.60),
                         hp["last_exploration_temp_value"])],
                    outside_value=hp["last_exploration_temp_value"],
                    framework="torch")),
        },

        # === Model ===
        "model": {
            "dim": env_config["grid_size"],
            # [Channel, [Kernel, Kernel], Stride]
            "conv_filters": [[16, [3, 3], 1], [32, [3, 3], 1]]
        },
        "gamma": 0.96,
        "optimizer": {"sgd_momentum": 0.9, },
        "lr": 0.1,
        "lr_schedule": tune.sample_from(
            lambda spec: [
                (0, 0.0),
                (int(spec.config["env_config"]["max_steps"] *
                     spec.stop["episodes_total"] * 0.05),
                 spec.config.lr),
                (int(spec.config["env_config"]["max_steps"] *
                     spec.stop["episodes_total"]),
                 spec.config.lr / 1e9)
            ]
        ),

        "seed": tune.grid_search(hp["seeds"]),
        "callbacks": log.get_logging_callbacks_class(),
        "framework": "torch",

        "logger_config": {
            "wandb": {
                "project": "DQN_CG",
                "group": hp["exp_name"],
                "api_key_file":
                    os.path.join(os.path.dirname(__file__),
                                 "../../../api_key_wandb"),
                "log_config": True
            },
        },

    }

    return rllib_config, stop_config
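
For reference, a small illustration (not part of the original code) of what the `lr_schedule` sample above produces once a trial is resolved, using assumed values for `max_steps`, `episodes_total`, and `lr`: the learning rate warms up from 0 to `lr` over the first 5% of environment steps and then decays toward ~0 by the last step.

max_steps, episodes_total, lr = 20, 4000, 0.1  # assumed values for illustration
lr_schedule = [
    (0, 0.0),
    (int(max_steps * episodes_total * 0.05), lr),  # (4000, 0.1): end of warm-up
    (int(max_steps * episodes_total), lr / 1e9),   # (80000, 1e-10): ~0 at the end
]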