def __init__(self, action_space: Space, *, framework: str,
                 num_workers: Optional[int], worker_index: Optional[int],
                 **kwargs):
        """Create a PerWorkerEpsilonGreedy exploration class.

        Args:
            action_space: The gym action space used by the environment.
            num_workers: The overall number of workers used.
            worker_index: The index of the Worker using this
                Exploration.
            framework: One of None, "tf", "torch".
        """
        epsilon_schedule = None
        # Use a fixed, different epsilon per worker. See: Ape-X paper.
        assert worker_index <= num_workers, (worker_index, num_workers)
        if num_workers > 0:
            if worker_index > 0:
                # From page 5 of https://arxiv.org/pdf/1803.00933.pdf
                alpha, eps, i = 7, 0.4, worker_index - 1
                num_workers_minus_1 = float(num_workers - 1) \
                    if num_workers > 1 else 1.0
                constant_eps = eps**(1 + (i / num_workers_minus_1) * alpha)
                epsilon_schedule = ConstantSchedule(constant_eps,
                                                    framework=framework)
            # Local worker should have zero exploration so that eval
            # rollouts run properly.
            else:
                epsilon_schedule = ConstantSchedule(0.0, framework=framework)

        super().__init__(action_space,
                         epsilon_schedule=epsilon_schedule,
                         framework=framework,
                         num_workers=num_workers,
                         worker_index=worker_index,
                         **kwargs)
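
For intuition, the snippet above gives each of the N workers a fixed epsilon of 0.4 ** (1 + i / (N - 1) * 7), where i is the zero-based worker position (worker_index - 1). A small standalone sketch (not taken from RLlib) that prints those values:

# Standalone illustration of the per-worker epsilons produced above
# (eps = 0.4, alpha = 7, as on page 5 of the Ape-X paper).
def per_worker_epsilons(num_workers, eps=0.4, alpha=7):
    if num_workers <= 1:
        return [eps]
    return [
        eps ** (1 + i / float(num_workers - 1) * alpha)
        for i in range(num_workers)
    ]

print(per_worker_epsilons(8))
# approx. [0.4, 0.16, 0.064, 0.026, 0.01, 0.0041, 0.0016, 0.00066]
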
    def __init__(self, action_space: Space, *, framework: Optional[str],
                 num_workers: Optional[int], worker_index: Optional[int],
                 **kwargs):
        """
        Args:
            action_space: The gym action space used by the environment.
            num_workers: The overall number of workers used.
            worker_index: The index of the Worker using this
                Exploration.
            framework: One of None, "tf", "torch".
        """
        scale_schedule = None
        # Use a fixed, different noise scale per worker. See: Ape-X paper.
        if num_workers > 0:
            if worker_index > 0:
                num_workers_minus_1 = \
                    float(num_workers - 1) if num_workers > 1 else 1.0
                exponent = 1 + (worker_index / num_workers_minus_1) * 7
                scale_schedule = ConstantSchedule(0.4**exponent,
                                                  framework=framework)
            # Local worker should have zero exploration so that eval
            # rollouts run properly.
            else:
                scale_schedule = ConstantSchedule(0.0, framework=framework)

        super().__init__(action_space,
                         scale_schedule=scale_schedule,
                         framework=framework,
                         **kwargs)
Example no. 3
    def explore(self, trainer, src, dest):
        """Copy the `src` policy's state into `dest` and perturb its
        hyperparameters (lr, clip_param, entropy_coeff), PBT-style."""
        policy_src = trainer.get_policy(src)
        policy_dest = trainer.get_policy(dest)
        # trainer.get_policy(dest).set_state(trainer.get_policy(src).get_state())

        src_state = copy.deepcopy(policy_src.get_state())

        new_lr = self.explore_helper(policy_src.cur_lr,
                                     self.hyper_params["lr"])
        policy_dest.lr_schedule = ConstantSchedule(new_lr, framework="torch")
        policy_dest.config["cur_lr"] = new_lr

        src_state["_optimizer_variables"][0]["param_groups"][0]["lr"] = new_lr
        policy_dest.set_state(src_state)

        new_clip_param = self.explore_helper(policy_src.config["clip_param"],
                                             self.hyper_params["clip_param"])
        policy_dest.config["clip_param"] = new_clip_param

        new_entropy_coeff = self.explore_helper(
            policy_src.config["entropy_coeff"],
            self.hyper_params["entropy_coeff"])
        policy_dest.entropy_coeff_schedule = ConstantSchedule(
            new_entropy_coeff, framework="torch")

        return {
            "lr": new_lr,
            "clip_param": new_clip_param,
            "entropy_coeff": new_entropy_coeff
        }
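
Note that explore_helper is not shown in this snippet. Purely as an assumption (not the author's implementation), a PBT-style helper compatible with the two-argument calls above might either resample from the given hyperparameter distribution or perturb the current value:

import random

# Hypothetical stand-in for the explore_helper used above: resample with
# some probability, otherwise perturb the current value by a random factor.
def explore_helper(current_value, distribution, resample_prob=0.25):
    if random.random() < resample_prob:
        return random.choice(distribution)
    return current_value * random.choice([0.8, 1.2])
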
    def __init__(self, action_space, *, framework, num_workers, worker_index,
                 **kwargs):
        """
        Args:
            action_space (Space): The gym action space used by the environment.
            num_workers (Optional[int]): The overall number of workers used.
            worker_index (Optional[int]): The index of the Worker using this
                Exploration.
            framework (Optional[str]): One of None, "tf", "torch".
        """
        scale_schedule = None
        # Use a fixed, different noise scale per worker. See: Ape-X paper.
        if num_workers > 0:
            if worker_index > 0:
                exponent = (1 + worker_index / float(num_workers - 1) * 7)
                scale_schedule = ConstantSchedule(0.4**exponent,
                                                  framework=framework)
            # Local worker should have zero exploration so that eval
            # rollouts run properly.
            else:
                scale_schedule = ConstantSchedule(0.0, framework=framework)

        super().__init__(action_space,
                         scale_schedule=scale_schedule,
                         num_workers=num_workers,
                         worker_index=worker_index,
                         framework=framework,
                         **kwargs)
Example no. 5
 def __init__(self, lr, lr_schedule):
     self.cur_lr = lr
     if lr_schedule is None:
         self.lr_schedule = ConstantSchedule(lr, framework=None)
     else:
         self.lr_schedule = PiecewiseSchedule(
             lr_schedule, outside_value=lr_schedule[-1][-1], framework=None)
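
The lr_schedule handled here is a list of [timestep, value] pairs. A short usage sketch (the import path and the .value() accessor are assumed from recent RLlib versions):

from ray.rllib.utils.schedules import PiecewiseSchedule

# [timestep, value] pairs; values in between are interpolated linearly,
# and queries past the last timestep return outside_value.
lr_schedule = [[0, 1e-3], [100000, 1e-5]]
sched = PiecewiseSchedule(
    lr_schedule, outside_value=lr_schedule[-1][-1], framework=None)
print(sched.value(0))        # 0.001
print(sched.value(50000))    # ~0.000505 (linear interpolation)
print(sched.value(200000))   # 1e-05 (outside_value)
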
Example no. 6
def explore(agent, policy_reward_mean, args):
    """
    Helper function to explore hyperparams (lr, gamma, and/or entropy_coeff,
    as selected by args.explore_params)
    """
    sorted_rewards = sorted(policy_reward_mean.items(), key=lambda kv: kv[1])
    upper_quantile = [
        kv[0] for kv in
        sorted_rewards[int(math.floor(args.quantile * -args.num_agents)):]
    ]
    lower_quantile = [
        kv[0] for kv in
        sorted_rewards[:int(math.ceil(args.quantile * args.num_agents))]
    ]
    for agent_id in lower_quantile:
        policy_graph = agent.get_policy(agent_id)
        new_policy_graph = agent.get_policy(random.choice(upper_quantile))
        if "lr" in args.explore_params:
            exemplar = new_policy_graph.cur_lr
            distribution = args.lr
            new_val = explore_helper(exemplar, distribution, args)
            policy_graph.lr_schedule = ConstantSchedule(new_val)
        if "gamma" in args.explore_params:
            param = "gamma"
            exemplar = new_policy_graph.config[param]
            distribution = args.gammas
            new_val = explore_helper(exemplar, distribution, args)
            policy_graph.config[param] = new_val
        if "entropy_coeff" in args.explore_params:
            param = "entropy_coeff"
            exemplar = new_policy_graph.config[param]
            distribution = args.entropy_coeffs
            new_val = explore_helper(exemplar, distribution, args)
            policy_graph.config[param] = new_val
Example no. 7
 def __init__(self, lr, lr_schedule):
     self.cur_lr = tf1.get_variable("lr", initializer=lr, trainable=False)
     if lr_schedule is None:
         self.lr_schedule = ConstantSchedule(lr, framework=None)
     else:
         self.lr_schedule = PiecewiseSchedule(
             lr_schedule, outside_value=lr_schedule[-1][-1], framework=None)
Example no. 8
 def __init__(self, lr, lr_schedule):
     self.cur_lr = tf.get_variable("lr", initializer=lr)
     if lr_schedule is None:
         self.lr_schedule = ConstantSchedule(lr)
     else:
         self.lr_schedule = PiecewiseSchedule(
             lr_schedule, outside_value=lr_schedule[-1][-1])
Example no. 9
def make_exploration_schedule(config, worker_index):
    # Use either a different `eps` per worker, or a linear schedule.
    if config["per_worker_exploration"]:
        assert config["num_workers"] > 1, \
            "This requires multiple workers"
        if worker_index >= 0:
            exponent = (1 +
                        worker_index / float(config["num_workers"] - 1) * 7)
            return ConstantSchedule(0.4**exponent)
        else:
            # local ev should have zero exploration so that eval rollouts
            # run properly
            return ConstantSchedule(0.0)
    return LinearSchedule(schedule_timesteps=int(
        config["exploration_fraction"] * config["schedule_max_timesteps"]),
                          initial_p=1.0,
                          final_p=config["exploration_final_eps"])
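
For reference, a self-contained sketch of how the linear branch above anneals epsilon (the config values are made up, and the .value() accessor is assumed from the older LinearSchedule API this snippet targets):

# Illustration only: query the annealed epsilon at a few timesteps.
config = {
    "per_worker_exploration": False,
    "exploration_fraction": 0.1,
    "schedule_max_timesteps": 100000,
    "exploration_final_eps": 0.02,
}
schedule = make_exploration_schedule(config, worker_index=0)
for t in (0, 5000, 10000, 50000):
    print(t, schedule.value(t))   # 1.0, 0.51, 0.02, 0.02
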
Example no. 10
 def _make_exploration_schedule(self, worker_index):
     # Use either a different `eps` per worker, or a linear schedule.
     if self.config["per_worker_exploration"]:
         assert self.config["num_workers"] > 1, \
             "This requires multiple workers"
         return ConstantSchedule(0.4**(
             1 + worker_index / float(self.config["num_workers"] - 1) * 7))
     return LinearSchedule(
         schedule_timesteps=int(self.config["exploration_fraction"] *
                                self.config["schedule_max_timesteps"]),
         initial_p=1.0,
         final_p=self.config["exploration_final_eps"])
 def __init__(self, lr, lr_schedule):
     self.cur_lr = tf.Variable(lr, name="lr", trainable=False)
     # self.cur_lr = tf.get_variable("lr", initializer=lr, trainable=False)
     if lr_schedule is None:
         self.lr_schedule = ConstantSchedule(lr, framework=None)
     else:
         self.lr_schedule = PiecewiseSchedule(
             lr_schedule,
             interpolation=_left_constant_interpolation,
             outside_value=lr_schedule[-1][-1],
             framework=None,
         )
Example no. 12
    def __init__(self,
                 action_space,
                 initial_epsilon=1.0,
                 final_epsilon=0.1,
                 epsilon_timesteps=int(1e5),
                 num_workers=0,
                 worker_index=0,
                 framework="tf"):
        """
        Args:
            action_space (Space): The gym action space used by the environment.
            initial_epsilon (float): The initial epsilon value to use.
            final_epsilon (float): The final epsilon value to use.
            epsilon_timesteps (int): The time step after which epsilon should
                always be `final_epsilon`.
            num_workers (Optional[int]): The overall number of workers used.
            worker_index (Optional[int]): The index of the Worker using this
                Exploration.
            framework (Optional[str]): One of None, "tf", "torch".
        """
        epsilon_schedule = None
        # Use a fixed, different epsilon per worker. See: Ape-X paper.
        if num_workers > 0:
            if worker_index > 0:
                exponent = (1 + worker_index / float(num_workers - 1) * 7)
                epsilon_schedule = ConstantSchedule(0.4**exponent)
            # Local worker should have zero exploration so that eval
            # rollouts run properly.
            else:
                epsilon_schedule = ConstantSchedule(0.0)

        super().__init__(action_space=action_space,
                         initial_epsilon=initial_epsilon,
                         final_epsilon=final_epsilon,
                         epsilon_timesteps=epsilon_timesteps,
                         num_workers=num_workers,
                         worker_index=worker_index,
                         framework=framework,
                         epsilon_schedule=epsilon_schedule)
Example no. 13
 def _make_exploration_schedule(self, worker_index):
     # Override DQN's schedule to take into account `noise_scale`
     if self.config["per_worker_exploration"]:
         assert self.config["num_workers"] > 1, \
             "This requires multiple workers"
         if worker_index >= 0:
             exponent = (
                 1 +
                 worker_index / float(self.config["num_workers"] - 1) * 7)
             return ConstantSchedule(self.config["noise_scale"] *
                                     0.4**exponent)
         else:
             # local ev should have zero exploration so that eval rollouts
             # run properly
             return ConstantSchedule(0.0)
     else:
         return LinearSchedule(
             schedule_timesteps=int(self.config["exploration_fraction"] *
                                    self.config["schedule_max_timesteps"]),
             initial_p=self.config["noise_scale"] * 1.0,
             final_p=self.config["noise_scale"] *
             self.config["exploration_final_eps"])
Example no. 14
def make_exploration_schedule(config, worker_index):
    # Modification of DQN's schedule to take into account
    # `exploration_ou_noise_scale`
    if config["per_worker_exploration"]:
        assert config["num_workers"] > 1, "This requires multiple workers"
        if worker_index >= 0:
            # 0.4 and 7 are the per-worker epsilon constants (eps and alpha)
            # from the Ape-X paper (https://arxiv.org/abs/1803.00933).
            max_index = float(config["num_workers"] - 1)
            exponent = 1 + worker_index / max_index * 7
            return ConstantSchedule(0.4**exponent)
        else:
            # local ev should have zero exploration so that eval rollouts
            # run properly
            return ConstantSchedule(0.0)
    elif config["exploration_should_anneal"]:
        return LinearSchedule(schedule_timesteps=int(
            config["exploration_fraction"] * config["schedule_max_timesteps"]),
                              initial_p=1.0,
                              final_p=config["exploration_final_scale"])
    else:
        # *always* add exploration noise
        return ConstantSchedule(1.0)
Example no. 15
def make_exploration_schedule(config, worker_index):
    # Use either a different `eps` per worker, or a linear schedule.
    if config["per_worker_exploration"]:
        assert config["num_workers"] > 1, \
            "This requires multiple workers"
        if worker_index >= 0:
            # Exploration constants from the Ape-X paper
            exponent = (
                1 + worker_index / float(config["num_workers"] - 1) * 7)
            return ConstantSchedule(0.4**exponent)
        else:
            # local ev should have zero exploration so that eval rollouts
            # run properly
            return ConstantSchedule(0.0)

    return PiecewiseSchedule(
        endpoints=[
            (0, config["exploration_initial_eps"]),
            (int(config["exploration_fraction"] *
                 config["schedule_max_timesteps"]),
             config["exploration_final_eps"]),
        ],
        outside_value=config["exploration_final_eps"])
Example no. 16
 def __init__(self, lr, lr_schedule):
     self.cur_lr = tf.get_variable("lr", initializer=lr, trainable=False)
     if lr_schedule is None:
         self.lr_schedule = ConstantSchedule(lr)
     elif isinstance(lr_schedule, list):
         self.lr_schedule = PiecewiseSchedule(
             lr_schedule, outside_value=lr_schedule[-1][-1])
     elif isinstance(lr_schedule, dict):
         self.lr_schedule = LinearSchedule(
             schedule_timesteps=lr_schedule["schedule_timesteps"],
             initial_p=lr,
             final_p=lr_schedule["final_lr"])
     else:
         raise ValueError('lr_schedule must be either list, dict or None')
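
For clarity, the three lr_schedule forms accepted above (numbers are illustrative only):

# 1) None: keep a constant learning rate `lr`.
lr_schedule = None
# 2) List of [timestep, value] pairs -> PiecewiseSchedule.
lr_schedule = [[0, 1e-3], [100000, 1e-5]]
# 3) Dict -> LinearSchedule annealing from `lr` to `final_lr` over
#    `schedule_timesteps` steps.
lr_schedule = {"schedule_timesteps": 100000, "final_lr": 1e-5}
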
Example no. 17
 def _make_exploration_schedule(self, worker_index):
     # Override DQN's schedule to take into account `noise_scale`
     if self.config["per_worker_exploration"]:
         assert self.config["num_workers"] > 1, \
             "This requires multiple workers"
         return ConstantSchedule(
             self.config["noise_scale"] * 0.4 **
             (1 + worker_index / float(self.config["num_workers"] - 1) * 7))
     else:
         return LinearSchedule(
             schedule_timesteps=int(self.config["exploration_fraction"] *
                                    self.config["schedule_max_timesteps"]),
             initial_p=self.config["noise_scale"] * 1.0,
             final_p=self.config["noise_scale"] *
             self.config["exploration_final_eps"])
    def reset_config(self, new_config):
        config = copy.deepcopy(DEFAULT_CONFIG)
        config.update(new_config)
        self.config = config

        # see LearningRateSchedule.__init__(self, self.config["lr"],self.config["lr_schedule"])
        # in vtrace_policy_graph.py
        # see policy_evaluator.py

        ev = self.optimizer.local_evaluator
        p = ev.policy_map[DEFAULT_POLICY_ID]
        p.lr_schedule = ConstantSchedule(self.config["lr"])
        p.cur_lr.load(self.config["lr"], session=ev.tf_sess)

        return True
Example no. 19
    def __init__(self, entropy_coeff, entropy_coeff_schedule):
        self.entropy_coeff = entropy_coeff

        if entropy_coeff_schedule is None:
            self.entropy_coeff_schedule = ConstantSchedule(entropy_coeff)
        else:
            # Allows for custom schedule similar to lr_schedule format
            if isinstance(entropy_coeff_schedule, list):
                self.entropy_coeff_schedule = PiecewiseSchedule(
                    entropy_coeff_schedule,
                    outside_value=entropy_coeff_schedule[-1][-1])
            else:
                # Implements previous version but enforces outside_value
                self.entropy_coeff_schedule = PiecewiseSchedule(
                    [[0, entropy_coeff], [entropy_coeff_schedule, 0.0]],
                    outside_value=0.0)
Example no. 20
    def __init__(self, entropy_coeff, entropy_coeff_schedule):
        self.entropy_coeff = tf.get_variable(
            "entropy_coeff", initializer=entropy_coeff, trainable=False)

        if entropy_coeff_schedule is None:
            self.entropy_coeff_schedule = ConstantSchedule(
                entropy_coeff, framework=None)
        else:
            # Allows for custom schedule similar to lr_schedule format
            if isinstance(entropy_coeff_schedule, list):
                self.entropy_coeff_schedule = PiecewiseSchedule(
                    entropy_coeff_schedule,
                    outside_value=entropy_coeff_schedule[-1][-1],
                    framework=None)
            else:
                # Implements previous version but enforces outside_value
                self.entropy_coeff_schedule = PiecewiseSchedule(
                    [[0, entropy_coeff], [entropy_coeff_schedule, 0.0]],
                    outside_value=0.0,
                    framework=None)
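
Summarizing the two entropy_coeff_schedule forms handled in the last two examples (numbers are illustrative only):

# 1) List of [timestep, value] pairs -> custom PiecewiseSchedule.
entropy_coeff_schedule = [[0, 0.01], [50000, 0.001]]
# 2) A single number -> anneal linearly from `entropy_coeff` to 0.0 by that
#    timestep, staying at 0.0 afterwards (outside_value=0.0).
entropy_coeff_schedule = 50000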