Example #1
def __init__(self, lr, lr_schedule):
    self.cur_lr = tf.get_variable("lr", initializer=lr)
    if lr_schedule is None:
        self.lr_schedule = ConstantSchedule(lr)
    else:
        self.lr_schedule = PiecewiseSchedule(
            lr_schedule, outside_value=lr_schedule[-1][-1])
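Here lr_schedule is expected to be a list of [timestep, lr] breakpoints, and outside_value=lr_schedule[-1][-1] simply holds the last learning rate once the final breakpoint has been passed. A minimal sketch of that behaviour, assuming an RLlib version whose PiecewiseSchedule accepts a framework keyword (as in the later examples); the breakpoint numbers are made up for illustration:

from ray.rllib.utils.schedules import PiecewiseSchedule

# [timestep, lr] breakpoints; values in between are linearly interpolated.
lr_schedule = [[0, 1e-3], [50000, 5e-4], [100000, 1e-4]]

schedule = PiecewiseSchedule(
    lr_schedule,
    outside_value=lr_schedule[-1][-1],  # hold the final lr after 100k steps
    framework=None)

print(schedule.value(0))       # 0.001
print(schedule.value(25000))   # 0.00075 (halfway between the first two points)
print(schedule.value(500000))  # 0.0001  (outside_value)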
Example #2
def __init__(self, lr, lr_schedule):
    self.cur_lr = tf1.get_variable("lr", initializer=lr, trainable=False)
    self._lr_schedule = lr_schedule
    if self._lr_schedule is not None:
        self._lr_schedule = PiecewiseSchedule(
            lr_schedule, outside_value=lr_schedule[-1][-1], framework=None)
        if self.framework == "tf":
            self._lr_placeholder = tf1.placeholder(dtype=tf.float32, name="lr")
            self._lr_update = self.cur_lr.assign(
                self._lr_placeholder, read_value=False)
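In this variant the learning rate is pushed into the tf variable through a placeholder and an explicit assign op rather than Variable.load(). A rough sketch of the corresponding update step, assuming a tf1-style session stored on the policy as self._sess (as in the later examples) and a new global timestep value:

# Evaluate the schedule at the current global timestep and feed the result
# into the assign op built above.
new_lr = self._lr_schedule.value(global_timestep)
self._sess.run(self._lr_update,
               feed_dict={self._lr_placeholder: new_lr})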
Example #3
class ExtRewardCoeffSchedule:
    @DeveloperAPI
    def __init__(self, ext_reward_coeff, ext_reward_coeff_schedule):
        self.ext_reward_coeff = tf.get_variable(
            "ext_reward_coeff",
            initializer=float(ext_reward_coeff),
            trainable=False)

        if ext_reward_coeff_schedule is None:
            self.ext_reward_coeff_schedule = ConstantSchedule(ext_reward_coeff,
                                                              framework=None)
        else:
            # Allows for custom schedule similar to lr_schedule format
            if isinstance(ext_reward_coeff_schedule, list):
                self.ext_reward_coeff_schedule = PiecewiseSchedule(
                    ext_reward_coeff_schedule,
                    outside_value=ext_reward_coeff_schedule[-1][-1],
                    framework=None)
            else:
                # Implements previous version but enforces outside_value
                self.ext_reward_coeff_schedule = PiecewiseSchedule(
                    [[0, ext_reward_coeff], [ext_reward_coeff_schedule, 0.0]],
                    outside_value=0.0,
                    framework=None)

    @override(Policy)
    def on_global_var_update(self, global_vars):
        super(ExtRewardCoeffSchedule, self).on_global_var_update(global_vars)
        self.ext_reward_coeff.load(
            self.ext_reward_coeff_schedule.value(global_vars["timestep"]),
            session=self._sess)
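The constructor therefore accepts two shapes for ext_reward_coeff_schedule: a list of [timestep, coeff] breakpoints (the same format as lr_schedule), or a single timestep by which the coefficient has decayed linearly from ext_reward_coeff down to 0.0. A hedged sketch of both shapes, with made-up numbers:

# List form: explicit breakpoints; the last coefficient is held afterwards.
ext_reward_coeff_schedule = [[0, 1.0], [100000, 0.1]]

# Scalar form: linear decay from ext_reward_coeff to 0.0 by this timestep,
# then held at 0.0 (outside_value=0.0).
ext_reward_coeff_schedule = 200000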
Example #4
    def __init__(self,
                 action_space,
                 *,
                 framework,
                 initial_temperature=1.0,
                 final_temperature=0.0,
                 temperature_timesteps=int(1e5),
                 temperature_schedule=None,
                 **kwargs):
        """Initializes a SoftQ Exploration object.

        Args:
            action_space (Space): The gym action space used by the environment.
            framework (str): One of None, "tf", "torch".
            initial_temperature (float): The initial temperature to divide model
                outputs by before creating the Categorical distribution to
                sample from.
            final_temperature (float): The final temperature value to anneal
                towards.
            temperature_timesteps (int): The timestep after which the
                temperature should always be `final_temperature`.
            temperature_schedule (Optional[Schedule]): An optional Schedule
                object to use (instead of constructing one from the given
                parameters).
        """
        assert isinstance(action_space, Discrete)
        super().__init__(action_space, framework=framework, **kwargs)

        self.temperature_schedule = \
            from_config(Schedule, temperature_schedule, framework=framework) or \
            PiecewiseSchedule(
                endpoints=[
                    (0, initial_temperature), (temperature_timesteps, final_temperature)],
                outside_value=final_temperature,
                framework=self.framework)

        # The current timestep value (tf-var or python int).
        self.last_timestep = get_variable(0,
                                          framework=framework,
                                          tf_name="timestep")
        self.temperature = self.temperature_schedule(self.last_timestep)
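The docstring's description of how the temperature is used can be sketched directly: the model's outputs are divided by the current temperature before a Categorical distribution is built and sampled. A minimal, hypothetical torch illustration (logits made up; this is not the class's actual sampling code):

import torch
from torch.distributions import Categorical

# Dividing by a temperature below 1.0 makes the distribution sharper (closer
# to greedy); a temperature above 1.0 makes it more uniform.
logits = torch.tensor([2.0, 1.0, 0.1])
temperature = 0.5
action_dist = Categorical(logits=logits / temperature)
action = action_dist.sample()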
Example #5
def __init__(self, lr, lr_schedule):
    self.cur_lr = tf.get_variable("lr", initializer=lr, trainable=False)
    if lr_schedule is None:
        self.lr_schedule = ConstantSchedule(lr)
    elif isinstance(lr_schedule, list):
        self.lr_schedule = PiecewiseSchedule(
            lr_schedule, outside_value=lr_schedule[-1][-1])
    elif isinstance(lr_schedule, dict):
        self.lr_schedule = LinearSchedule(
            schedule_timesteps=lr_schedule["schedule_timesteps"],
            initial_p=lr,
            final_p=lr_schedule["final_lr"])
    else:
        raise ValueError("lr_schedule must be either list, dict or None")
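This version accepts either the piecewise list format or a dict describing a linear decay handled by LinearSchedule. A hedged sketch of the two accepted shapes (key names taken from the code above, numbers made up):

# Piecewise form: [timestep, lr] breakpoints; the final lr is held afterwards.
lr_schedule = [[0, 1e-3], [100000, 1e-4]]

# Linear form: anneal from lr down to final_lr over schedule_timesteps steps.
lr_schedule = {"schedule_timesteps": 100000, "final_lr": 1e-5}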
Example #6
class ManualLearningRateSchedule:
    """Mixin for TFPolicy that adds a learning rate schedule."""
    def __init__(self, lr, lr_schedule):
        self.cur_lr = lr
        if lr_schedule is None:
            self.lr_schedule = ConstantSchedule(lr, framework=None)
        else:
            self.lr_schedule = PiecewiseSchedule(
                lr_schedule, outside_value=lr_schedule[-1][-1], framework=None)

    # Not called automatically by any RLlib logic; call it from your training
    # script or a trainer callback (see the sketch below).
    def update_lr(self, timesteps_total):
        print(f"cur lr {self.cur_lr}")
        self.cur_lr = self.lr_schedule.value(timesteps_total)
        for opt in self._optimizers:
            for p in opt.param_groups:
                p["lr"] = self.cur_lr
Example #7
    def __init__(self,
                 action_space,
                 *,
                 framework: str,
                 initial_epsilon=1.0,
                 final_epsilon=0.05,
                 epsilon_timesteps=int(1e5),
                 epsilon_schedule=None,
                 **kwargs):
        """Create an EpsilonGreedy exploration class.

        Args:
            initial_epsilon (float): The initial epsilon value to use.
            final_epsilon (float): The final epsilon value to use.
            epsilon_timesteps (int): The time step after which epsilon should
                always be `final_epsilon`.
            epsilon_schedule (Optional[Schedule]): An optional Schedule object
                to use (instead of constructing one from the given parameters).
        """
        assert framework is not None
        super().__init__(action_space=action_space,
                         framework=framework,
                         **kwargs)

        self.epsilon_schedule = \
            from_config(Schedule, epsilon_schedule, framework=framework) or \
            PiecewiseSchedule(
                endpoints=[
                    (0, initial_epsilon), (epsilon_timesteps, final_epsilon)],
                outside_value=final_epsilon,
                framework=self.framework)

        # The current timestep value (tf-var or python int).
        self.last_timestep = get_variable(0,
                                          framework=framework,
                                          tf_name="timestep")

        # The tf variant is not implemented here; fail early.
        if self.framework == "tf":
            raise ValueError("The tf version does not support "
                             "multiobj epsilon-greedy yet!")
Example #8
class LearningRateSchedule(object):
    """Mixin for TFPolicyGraph that adds a learning rate schedule."""

    def __init__(self, lr, lr_schedule):
        self.cur_lr = tf.get_variable("lr", initializer=lr)
        if lr_schedule is None:
            self.lr_schedule = ConstantSchedule(lr)
        else:
            self.lr_schedule = PiecewiseSchedule(
                lr_schedule, outside_value=lr_schedule[-1][-1])

    @override(PolicyGraph)
    def on_global_var_update(self, global_vars):
        super(LearningRateSchedule, self).on_global_var_update(global_vars)
        self.cur_lr.load(
            self.lr_schedule.value(global_vars["timestep"]),
            session=self._sess)

    @override(TFPolicyGraph)
    def optimizer(self):
        return tf.train.AdamOptimizer(self.cur_lr)