def __init__(self, task, policy, population_size=20, sigma=0.5, num_workers=1):
    """
    Initialize the CMA-ES algorithm.

    Args:
        task (RLTask, Env): RL task/env to run
        policy (Policy): specify the policy (model) to optimize
        population_size (int): size of the population
        sigma (float): initial standard deviation for CMA-ES
        num_workers (int): number of workers/jobs to run in parallel
    """
    # create explorer
    # create evaluator
    # create updater
    # super(CMAES, self).__init__(explorer, evaluator, updater, num_workers=num_workers)

    if isinstance(task, Env):
        task = RLTask(task, policy)
    self.task = task
    self.policy = policy
    self.population_size = population_size
    self.sigma = sigma
    self.num_workers = num_workers

    self.es = None
    self.best_reward = -np.inf
    self.best_parameters = None
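
# A minimal usage sketch of the ask/tell loop that a CMA-ES wrapper like the one above typically delegates
# to; one plausible backend is the third-party `cma` package (the `self.es` attribute above would then hold
# the evolution strategy object). This is only an illustration, not the class implementation: `evaluate` is
# a hypothetical stand-in for setting the policy parameters and running rollouts on the task, and since
# `cma` minimizes, one would return the negative average reward there.
import cma
import numpy as np

def evaluate(params):
    """Hypothetical objective standing in for -average_reward(policy(params), task)."""
    return float(np.sum((params - 1.0) ** 2))

x0 = np.zeros(5)                                        # initial (flattened) policy parameters
es = cma.CMAEvolutionStrategy(x0, 0.5,                  # sigma=0.5 and population_size=20, as above
                              {'popsize': 20, 'maxiter': 50})
while not es.stop():
    solutions = es.ask()                                # sample a population of candidate parameter vectors
    fitnesses = [evaluate(np.asarray(x)) for x in solutions]
    es.tell(solutions, fitnesses)                       # update the mean, covariance and step size
best_parameters, best_fitness = es.result.xbest, es.result.fbest
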
def __init__(self, task=None, policy=None, domain=(-3., 3.), num_workers=1):
    """
    Initialize the Bayesian Optimization algorithm.

    Args:
        task (RLTask, Env): RL task/env to run
        policy (Policy): specify the policy (model) to optimize
        domain (tuple of float): lower and upper bound of the domain to search over for the parameters
        num_workers (int): number of workers/jobs to run in parallel
    """
    if isinstance(task, Env):
        task = RLTask(task, policy)
    self.task = task
    self.policy = policy
    self.num_workers = num_workers

    self.best_reward = -np.inf
    self.best_parameters = None

    self.num_steps = 1000
    self.num_rollouts = 1
    self.verbose = False
    self.episode = 0
    self.render = False
    self.domain = domain
    self.rewards = []
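
# A minimal Bayesian optimization sketch (illustrative only, not the class implementation above): fit a
# Gaussian-process surrogate to the rewards observed so far and pick the next parameters by maximizing the
# expected improvement over `domain`. The 1D objective `f` is a hypothetical stand-in for rolling out the
# policy; scikit-learn and scipy are used here purely for illustration and are not implied by the class.
import numpy as np
from scipy.stats import norm
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern

def f(x):
    """Hypothetical 1D objective standing in for the expected return of the policy with parameter x."""
    return -np.sin(3.0 * x) - x ** 2 + 0.7 * x

domain = (-3., 3.)                                      # same default search domain as above
X = np.random.uniform(*domain, size=(3, 1))             # a few initial random evaluations
y = np.array([f(x[0]) for x in X])

gp = GaussianProcessRegressor(kernel=Matern(nu=2.5), normalize_y=True)
candidates = np.linspace(domain[0], domain[1], 500).reshape(-1, 1)

for _ in range(20):                                     # Bayesian optimization iterations
    gp.fit(X, y)
    mu, std = gp.predict(candidates, return_std=True)
    z = (mu - y.max()) / (std + 1e-9)
    ei = (mu - y.max()) * norm.cdf(z) + std * norm.pdf(z)    # expected improvement (maximization)
    x_next = candidates[np.argmax(ei)]
    X = np.vstack([X, x_next])
    y = np.append(y, f(x_next[0]))

best_parameters, best_reward = X[np.argmax(y)], y.max()
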
def __init__(self, task, policy, population_size=20, elite_fraction=0.2, num_workers=1):
    """
    Initialize the CEM algorithm.

    Args:
        task (RLTask, Env): RL task/env to run
        policy (Policy): specify the policy (model) to optimize
        population_size (int): size of the population
        elite_fraction (float): fraction of elites used to compute the new mean and covariance matrix of
            the multivariate normal distribution
        num_workers (int): number of workers/jobs to run in parallel
    """
    # create explorer
    # create evaluator
    # create updater
    # super(CEM, self).__init__(explorer, evaluator, updater, num_workers=num_workers)

    if isinstance(task, Env):
        task = RLTask(task, policy)
    self.task = task
    self.policy = policy
    self.population_size = population_size
    self.elite_fraction = elite_fraction
    self.num_workers = num_workers

    self.best_reward = -np.inf
    self.best_parameters = None
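
# A minimal numpy sketch of the cross-entropy-method update described above: sample a population of
# parameter vectors from a multivariate normal, evaluate them, keep the top `elite_fraction`, and refit the
# mean and covariance on those elites. This is only an illustration; `evaluate` is a hypothetical stand-in
# for the average reward obtained by running the policy on the task.
import numpy as np

def evaluate(params):
    """Hypothetical stand-in for the average reward of the policy with the given parameters."""
    return -float(np.sum((params - 1.0) ** 2))

dim, population_size, elite_fraction = 5, 20, 0.2
num_elites = max(1, int(elite_fraction * population_size))
mean, cov = np.zeros(dim), np.eye(dim)

for _ in range(50):                                          # generations
    population = np.random.multivariate_normal(mean, cov, size=population_size)
    rewards = np.array([evaluate(p) for p in population])
    elites = population[np.argsort(rewards)[-num_elites:]]   # best `num_elites` parameter vectors
    mean = elites.mean(axis=0)                               # refit the sampling distribution on the elites
    cov = np.cov(elites, rowvar=False) + 1e-6 * np.eye(dim)  # small jitter keeps the covariance well-conditioned

best_parameters = mean
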
def __init__(self, task, policy, num_variations=None, std_dev=0.01, difference_type='central',
             learning_rate=0.001, normalize_grad=False, num_workers=1):  # hyperparameters
    r"""
    Initialize the FD algorithm.

    Args:
        task (RLTask, Env): RL task/env to run
        policy (Policy): specify the policy (model) to optimize
        num_variations (None, int): number of times we vary the parameters by a small increment. If None,
            it will be twice the number of parameters, which according to [1] yields very accurate
            gradient estimates.
        std_dev (float): the small increments are generated from a normal distribution centered at 0 with
            this standard deviation
        difference_type (str): there are two types of difference estimators: 'forward' or 'central'. The
            forward-difference estimator computes the gradient using
            :math:`J(\theta + \Delta\theta) - J(\theta)`, while the central-difference estimator computes
            the gradient using :math:`J(\theta + \Delta\theta) - J(\theta - \Delta\theta)`
        learning_rate (float): learning rate (=coefficient) for the gradient ascent step
        normalize_grad (bool): specify if we should normalize the gradients
        num_workers (int): number of workers/jobs to run in parallel
    """
    # create explorer
    # create evaluator
    # create updater
    # super(FD, self).__init__(explorer, evaluator, updater, num_workers=num_workers)

    if isinstance(task, Env):
        task = RLTask(task, policy)
    self.task = task
    self.policy = policy
    self.num_workers = num_workers

    # set the number of variations (small increments to vary the parameters)
    # From [1]: "Empirically it can be observed that taking the number of variations as twice the number
    # of parameters yields very accurate gradient estimates"
    if num_variations is None:
        num_variations = 2 * self.policy.num_parameters
    self.num_variations = num_variations

    # set standard deviation
    self.stddev = np.abs(std_dev)

    # set difference type
    if difference_type != 'forward' and difference_type != 'central':
        raise ValueError("Expecting the 'difference_type' argument to be 'forward' or 'central'. Instead "
                         "got '{}'".format(difference_type))
    self.difference_type = difference_type

    # set other parameters
    self.lr = learning_rate
    self.normalize_grad = bool(normalize_grad)

    # remember best parameters
    self.best_reward = -np.inf
    self.best_parameters = None
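
# A short numpy illustration of the two estimators described in the docstring above, on a toy objective
# J(theta): the forward difference uses J(theta + delta) - J(theta), the central difference uses
# J(theta + delta) - J(theta - delta), and the gradient is recovered by least squares from the stacked
# increments. This is only a sketch of the idea, not the class implementation; `J` is a hypothetical
# stand-in for the expected return of the policy.
import numpy as np

def J(theta):
    """Toy objective standing in for the expected return of the policy with parameters `theta`."""
    return -float(np.sum((theta - 1.0) ** 2))

theta = np.zeros(3)
std_dev = 0.01
num_variations = 2 * theta.size                         # twice the number of parameters, as suggested by [1]

deltas = np.random.normal(0.0, std_dev, size=(num_variations, theta.size))
forward_diffs = np.array([J(theta + d) - J(theta) for d in deltas])
central_diffs = np.array([J(theta + d) - J(theta - d) for d in deltas])

# least-squares gradient estimates: solve deltas @ grad ~= diffs (the central estimator needs a factor 1/2,
# since J(theta + d) - J(theta - d) ~= 2 * d.dot(grad))
grad_forward = np.linalg.lstsq(deltas, forward_diffs, rcond=None)[0]
grad_central = np.linalg.lstsq(2.0 * deltas, central_diffs, rcond=None)[0]

theta_new = theta + 0.001 * grad_central                # one gradient-ascent step with learning_rate=0.001
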
def task(self, task):
    """Set the RL task."""
    if isinstance(task, (tuple, list)):
        env, policy = None, None
        for t in task:
            if isinstance(t, Env):
                env = t
            if isinstance(t, Policy):  # TODO if multiple policies
                policy = t
        if env is None or policy is None:
            raise ValueError("Expecting the task to be an instance of `RLTask` or a list/tuple of an "
                             "environment and policy.")
        task = RLTask(env, policy)
    if not isinstance(task, RLTask):
        raise TypeError("Expecting the task to be an instance of `RLTask`, instead got: {}".format(type(task)))
    self._task = task
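
# Usage sketch for the setter above (assuming it is exposed as a `task` property on the algorithm class,
# here called `algo`, with `env` and `policy` being existing Env and Policy instances):
#
#     algo.task = RLTask(env, policy)    # pass a ready-made RLTask, or ...
#     algo.task = (env, policy)          # ... pass an (env, policy) pair; an RLTask is then built internally
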
def __init__(self, task, policy, std_params=1., num_best_rollouts=10, num_workers=1):
    """
    Initialize the PoWER algorithm.

    Args:
        task (RLTask, Env): RL task/env to run.
        policy (Policy): specify the policy (model) to optimize.
        std_params (float): standard deviation of the parameters.
        num_best_rollouts (int): number of best rollouts to keep in memory for the update.
        num_workers (int): number of workers/jobs to run in parallel.
    """
    # create explorer
    # create evaluator
    # create updater
    # super(PoWER, self).__init__(task, exploration_strategy, memory, hyperparameters)

    # set task
    if isinstance(task, Env):
        task = RLTask(task, policy)
    if not isinstance(task, RLTask):
        raise TypeError("Expecting task to be an instance of RLTask.")
    self.task = task

    # set policy
    self.policy = policy
    if not self.policy.is_parametric():
        raise ValueError("The policy should be parametric.")
    if not self.policy.is_linear():
        raise ValueError("The policy should be linear with respect to the parameters.")

    # set standard deviation of the parameters
    self.std_params = std_params

    # set number of best rollouts to keep in memory
    self.num_best_rollouts = num_best_rollouts

    # remember best parameters
    self.best_reward = -np.inf
    self.best_parameters = None
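
# A minimal numpy sketch of the reward-weighted update at the core of PoWER: perturb the (linear) policy
# parameters with Gaussian noise of standard deviation `std_params`, keep the best rollouts, and move the
# parameters by the return-weighted average of the exploration noise. This only illustrates the update
# rule, not the class implementation above; `rollout_return` is a hypothetical stand-in for running the
# perturbed policy on the task (PoWER assumes non-negative returns).
import numpy as np

def rollout_return(params):
    """Hypothetical non-negative return obtained by rolling out the policy with these parameters."""
    return float(np.exp(-np.sum((params - 1.0) ** 2)))

theta = np.zeros(4)
std_params, num_rollouts, num_best_rollouts = 1.0, 20, 10

for _ in range(30):                                          # episodes
    noise = np.random.normal(0.0, std_params, size=(num_rollouts, theta.size))
    returns = np.array([rollout_return(theta + eps) for eps in noise])
    best = np.argsort(returns)[-num_best_rollouts:]          # keep only the best rollouts, as in the memory above
    weights = returns[best]
    theta = theta + weights @ noise[best] / (weights.sum() + 1e-10)   # return-weighted average of the noise

best_parameters = theta
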
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Example on how to use the 'CartPole-v1' OpenAI Gym environment in PyRoboLearn using a random policy."""

from pyrobolearn.envs import gym
from pyrobolearn.policies import RandomPolicy
from pyrobolearn.tasks import RLTask


# create env, state, and action from gym
env = gym.make('CartPole-v1')
state, action = env.state, env.action
print("State and action space: {} and {}".format(state.space, action.space))

# create policy
policy = RandomPolicy(state, action)

# create task and run it
task = RLTask(env, policy)
task.run(num_steps=1000, dt=0.02, use_terminating_condition=False, render=True)
def __init__(self, task, policy, population_size=20, species_elitism=2, elitism=2, min_species_size=2,
             survival_threshold=0.2, max_stagnation=15, compatibility_threshold=3, num_workers=1):
    r"""
    Initialize the NEAT algorithm.

    Args:
        task (RLTask, Env): RL task/env to run
        policy (Policy): specify the policy (model) to optimize
        population_size (int): size of the population
        species_elitism (int): number of species that will be protected from stagnation
        elitism (int): the number of most-fit individuals in each species that will be preserved as-is
            from one generation to the next
        min_species_size (int): minimum number of genomes per species after reproduction
        survival_threshold (float): fraction of each species allowed to reproduce each generation
        max_stagnation (int): number of generations without improvement after which a species is
            considered stagnant
        compatibility_threshold (float): genomic distance below which two individuals are considered to
            belong to the same species
        num_workers (int): number of workers/jobs to run in parallel
    """
    # create explorer
    # create evaluator
    # create updater
    # super(NEAT, self).__init__(explorer, evaluator, updater, num_workers=num_workers)

    # set task
    if isinstance(task, Env):
        task = RLTask(task, policy)
    self.task = task

    # set policy
    if policy is None:
        policy = self.task.policies[0]  # TODO: currently assume only 1 policy
    if not isinstance(policy, NEATPolicy):
        raise TypeError("Expecting the policy to be an instance of 'NEATPolicy'.")
    self.policy = policy

    # set config file
    # more info about genome's config file: https://neat-python.readthedocs.io/en/latest/config_file.html
    # more info about activation fct: https://neat-python.readthedocs.io/en/latest/activation.html
    config_dict = {'[NEAT]': {'fitness_criterion': 'max',
                              'fitness_threshold': 100,
                              'no_fitness_termination': True,
                              'pop_size': population_size,
                              'reset_on_extinction': True},
                   '[DefaultSpeciesSet]': {'compatibility_threshold': compatibility_threshold},
                   '[DefaultStagnation]': {'species_fitness_func': 'max',
                                           'max_stagnation': max_stagnation,
                                           'species_elitism': species_elitism},
                   '[DefaultReproduction]': {'elitism': elitism,
                                             'survival_threshold': survival_threshold,
                                             'min_species_size': min_species_size}}

    # update config file of policy
    self.policy.update_config(config_dict)

    # get population
    self.population = self.policy.population

    # create useful variables
    self.num_steps = 1000
    self.num_rollouts = 1
    self.verbose = False
    self.episode = 0
    self.avg_rewards, self.max_rewards = [], []
    self.best_reward = -np.inf
    self.best_parameters = None
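
# A small sketch of how a config dict with the structure built above maps onto the INI-style text file
# that neat-python reads (see the config_file URL in the comments above). Only the sections shown here are
# serialized; the [DefaultGenome] section (network inputs/outputs, activation functions, ...) is assumed
# to be handled by the NEATPolicy itself. This illustrates the file format, not what `update_config`
# necessarily does internally.
config_dict = {'[NEAT]': {'fitness_criterion': 'max', 'fitness_threshold': 100,
                          'no_fitness_termination': True, 'pop_size': 20, 'reset_on_extinction': True},
               '[DefaultSpeciesSet]': {'compatibility_threshold': 3},
               '[DefaultStagnation]': {'species_fitness_func': 'max', 'max_stagnation': 15,
                                       'species_elitism': 2},
               '[DefaultReproduction]': {'elitism': 2, 'survival_threshold': 0.2, 'min_species_size': 2}}

lines = []
for section, params in config_dict.items():
    lines.append(section)                               # e.g. '[NEAT]'
    for key, value in params.items():
        lines.append('{} = {}'.format(key, value))      # e.g. 'pop_size = 20'
    lines.append('')                                    # blank line between sections

print('\n'.join(lines))                                 # or write it to the file handed to neat.Config(...)
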