def setup(self, algo, env): """Set up trainer for algorithm and environment. This method saves algo and env within trainer and creates a sampler. Note: After setup() is called all variables in session should have been initialized. setup() respects existing values in session so policy weights can be loaded before setup(). Args: algo (RLAlgorithm): An algorithm instance. env (Environment): An environment instance. Raises: ValueError: If sampler_cls is passed and the algorithm doesn't contain a `max_episode_length` field. """ self._algo = algo self._env = env self._seed = get_seed() self._sampler = self._algo.sampler self._has_setup = True
def make_sampler(self, sampler_cls, *, seed=None, n_workers=psutil.cpu_count(logical=False), max_path_length=None, worker_class=DefaultWorker, sampler_args=None, worker_args=None): """Construct a Sampler from a Sampler class. Args: sampler_cls (type): The type of sampler to construct. seed (int): Seed to use in sampler workers. max_path_length (int): Maximum path length to be sampled by the sampler. Paths longer than this will be truncated. n_workers (int): The number of workers the sampler should use. worker_class (type): Type of worker the Sampler should use. sampler_args (dict or None): Additional arguments that should be passed to the sampler. worker_args (dict or None): Additional arguments that should be passed to the worker. Raises: ValueError: If `max_path_length` isn't passed and the algorithm doesn't contain a `max_path_length` field, or if the algorithm doesn't have a policy field. Returns: sampler_cls: An instance of the sampler class. """ if not hasattr(self._algo, 'policy'): raise ValueError('If the runner is used to construct a sampler, ' 'the algorithm must have a `policy` field.') if max_path_length is None: if hasattr(self._algo, 'max_path_length'): max_path_length = self._algo.max_path_length else: raise ValueError('If `sampler_cls` is specified in ' 'runner.setup, the algorithm must have ' 'a `max_path_length` field.') if seed is None: seed = get_seed() if sampler_args is None: sampler_args = {} if worker_args is None: worker_args = {} if issubclass(sampler_cls, BaseSampler): return sampler_cls(self._algo, self._env, **sampler_args) else: return sampler_cls.from_worker_factory(WorkerFactory( seed=seed, max_path_length=max_path_length, n_workers=n_workers, worker_class=worker_class, worker_args=worker_args), agents=self._algo.policy, envs=self._env)
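# Hedged usage sketch for make_sampler(): `runner` is a hypothetical, already
# set-up runner whose algorithm exposes `policy` and `max_path_length`, as the
# checks above require. LocalSampler/DefaultWorker imports follow this repo's
# own imports; argument values are arbitrary.
from garage.sampler import DefaultWorker, LocalSampler

sampler = runner.make_sampler(
    LocalSampler,
    seed=42,                  # defaults to get_seed() when omitted
    n_workers=4,
    max_path_length=None,     # falls back to algo.max_path_length
    worker_class=DefaultWorker,
    sampler_args=None,
    worker_args=None,
)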
def setup(self, algo, env, sampler_cls=None, sampler_args=None): """Set up runner for algorithm and environment. This method saves algo and env within runner and creates a sampler. Note: After setup() is called all variables in session should have been initialized. setup() respects existing values in session so policy weights can be loaded before setup(). Args: algo (garage.np.algos.RLAlgorithm): An algorithm instance. env (garage.envs.GarageEnv): An environment instance. sampler_cls (garage.sampler.Sampler): A sampler class. sampler_args (dict): Arguments to be passed to sampler constructor. """ self._algo = algo self._env = env self._policy = self._algo.policy if sampler_args is None: sampler_args = {} if sampler_cls is None: sampler_cls = algo.sampler_cls self._sampler = sampler_cls(algo, env, **sampler_args) self._has_setup = True self._setup_args = SetupArgs(sampler_cls=sampler_cls, sampler_args=sampler_args, seed=get_seed())
def __init__(self, snapshot_config, max_cpus=1): self._snapshotter = Snapshotter(snapshot_config.snapshot_dir, snapshot_config.snapshot_mode, snapshot_config.snapshot_gap) parallel_sampler.initialize(max_cpus) seed = get_seed() if seed is not None: parallel_sampler.set_seed(seed) self._has_setup = False self._plot = False self._setup_args = None self._train_args = None self._stats = ExperimentStats(total_itr=0, total_env_steps=0, total_epoch=0, last_path=None) self._algo = None self._env = None self._policy = None self._sampler = None self._plotter = None self._start_time = None self._itr_start_time = None self.step_itr = None self.step_path = None
def setup(self, algo, env): """Set up trainer for algorithm and environment. This method saves algo and env within trainer and creates a sampler. Note: After setup() is called all variables in session should have been initialized. setup() respects existing values in session so policy weights can be loaded before setup(). Args: algo (RLAlgorithm): An algorithm instance. If this algo want to use samplers, it should have a `_sampler` field. env (Environment): An environment instance. """ self._algo = algo self._env = env self._seed = get_seed() if hasattr(self._algo, '_sampler'): # pylint: disable=protected-access self._sampler = self._algo._sampler self._has_setup = True
def __init__( self, agents, envs, *, # After this require passing by keyword. worker_factory=None, max_episode_length=None, is_tf_worker=False, seed=get_seed(), n_workers=psutil.cpu_count(logical=False), worker_class=DefaultWorker, worker_args=None): # pylint: disable=super-init-not-called if not ray.is_initialized(): ray.init(log_to_driver=False, ignore_reinit_error=True) if worker_factory is None and max_episode_length is None: raise TypeError('Must construct a sampler from WorkerFactory or ' 'parameters (at least max_episode_length)') if isinstance(worker_factory, WorkerFactory): self._worker_factory = worker_factory else: self._worker_factory = WorkerFactory( max_episode_length=max_episode_length, is_tf_worker=is_tf_worker, seed=seed, n_workers=n_workers, worker_class=worker_class, worker_args=worker_args) self._sampler_worker = ray.remote(SamplerWorker) self._agents = agents self._envs = self._worker_factory.prepare_worker_messages(envs) self._all_workers = defaultdict(None) self._workers_started = False self.start_worker() self.total_env_steps = 0
def setup(self, algo, env, sampler_cls=None, sampler_args=None, n_workers=psutil.cpu_count(logical=False), worker_class=None, worker_args=None): """Set up trainer for algorithm and environment. This method saves algo and env within trainer and creates a sampler. Note: After setup() is called all variables in session should have been initialized. setup() respects existing values in session so policy weights can be loaded before setup(). Args: algo (RLAlgorithm): An algorithm instance. env (Environment): An environment instance. sampler_cls (type): A class which implements :class:`Sampler`. sampler_args (dict): Arguments to be passed to sampler constructor. n_workers (int): The number of workers the sampler should use. worker_class (type): Type of worker the sampler should use. worker_args (dict or None): Additional arguments that should be passed to the worker. Raises: ValueError: If sampler_cls is passed and the algorithm doesn't contain a `max_episode_length` field. """ self._algo = algo self._env = env self._n_workers = n_workers self._worker_class = worker_class if sampler_args is None: sampler_args = {} if sampler_cls is None: sampler_cls = getattr(algo, 'sampler_cls', None) if worker_class is None: worker_class = getattr(algo, 'worker_cls', DefaultWorker) if worker_args is None: worker_args = {} self._worker_args = worker_args if sampler_cls is None: self._sampler = None else: self._sampler = self.make_sampler(sampler_cls, sampler_args=sampler_args, n_workers=n_workers, worker_class=worker_class, worker_args=worker_args) self._has_setup = True self._setup_args = SetupArgs(sampler_cls=sampler_cls, sampler_args=sampler_args, seed=get_seed())
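# Hedged usage sketch for this setup() signature: `trainer`, `my_algo`, and
# `my_env` are placeholders, and the train() call below assumes the trainer's
# usual (n_epochs, batch_size) interface; this is not a prescribed recipe.
from garage.sampler import DefaultWorker, LocalSampler

trainer.setup(
    algo=my_algo,              # must expose `max_episode_length` when a
    env=my_env,                # sampler_cls is given (see Raises above)
    sampler_cls=LocalSampler,
    sampler_args={},
    n_workers=2,
    worker_class=DefaultWorker,
    worker_args={},
)
trainer.train(n_epochs=100, batch_size=4000)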
def _setup_worker(self, env_indices, tasks): """Set up workers. Args: env_indices (List[Int]): Indices of environments to be assigned to workers for sampling. tasks (List[dict]): List of tasks to assign. """ if self._vec_env is not None: self._vec_env.close() vec_envs = [] for env_ind in env_indices: for _ in range(self._envs_per_worker): vec_env = copy.deepcopy(self.env) vec_env.set_task(tasks[env_ind]) vec_envs.append(vec_env) seed0 = deterministic.get_seed() if seed0 is not None: for (i, e) in enumerate(vec_envs): e.seed(seed0 + i) self._vec_env = VecEnvExecutor( envs=vec_envs, max_path_length=self.algo.max_path_length)
def evaluate(self, algo, test_rollouts_per_task=None): """Evaluate the Meta-RL algorithm on the test tasks. Args: algo (garage.np.algos.MetaRLAlgorithm): The algorithm to evaluate. test_rollouts_per_task (int or None): Number of rollouts per task. """ if test_rollouts_per_task is None: test_rollouts_per_task = self._n_test_rollouts adapted_trajectories = [] logger.log('Sampling for adaptation and meta-testing...') if self._test_sampler is None: self._test_sampler = self._sampler_class.from_worker_factory( WorkerFactory(seed=get_seed(), max_path_length=self._max_path_length, n_workers=1, worker_class=self._worker_class, worker_args=self._worker_args), agents=algo.get_exploration_policy(), envs=self._test_task_sampler.sample(1)) for env_up in self._test_task_sampler.sample(self._n_test_tasks): policy = algo.get_exploration_policy() traj = self._trajectory_batch_class.concatenate(*[ self._test_sampler.obtain_samples(self._eval_itr, 1, policy, env_up) for _ in range(self._n_exploration_traj) ]) adapted_policy = algo.adapt_policy(policy, traj) adapted_traj = self._test_sampler.obtain_samples( self._eval_itr, test_rollouts_per_task * self._max_path_length, adapted_policy) adapted_trajectories.append(adapted_traj) logger.log('Finished meta-testing...') if self._test_task_names is not None: name_map = dict(enumerate(self._test_task_names)) else: name_map = None with tabular.prefix(self._prefix + '/' if self._prefix else ''): log_multitask_performance( self._eval_itr, self._trajectory_batch_class.concatenate( *adapted_trajectories), getattr(algo, 'discount', 1.0), trajectory_class=self._trajectory_batch_class, name_map=name_map) self._eval_itr += 1 if self._trajectory_batch_class == TrajectoryBatch: rewards = self._trajectory_batch_class.concatenate( *adapted_trajectories).rewards else: rewards = self._trajectory_batch_class.concatenate( *adapted_trajectories).env_rewards return sum(rewards) / len(rewards)
def __init__( self, agents, envs, *, # After this require passing by keyword. worker_factory=None, max_episode_length=None, is_tf_worker=False, seed=get_seed(), n_workers=psutil.cpu_count(logical=False), worker_class=DefaultWorker, worker_args=None): # pylint: disable=super-init-not-called if worker_factory is None and max_episode_length is None: raise TypeError('Must construct a sampler from WorkerFactory or ' 'parameters (at least max_episode_length)') if isinstance(worker_factory, WorkerFactory): self._factory = worker_factory else: self._factory = WorkerFactory( max_episode_length=max_episode_length, is_tf_worker=is_tf_worker, seed=seed, n_workers=n_workers, worker_class=worker_class, worker_args=worker_args) self._agents = self._factory.prepare_worker_messages( agents, cloudpickle.dumps) self._envs = self._factory.prepare_worker_messages(envs) self._to_sampler = mp.Queue(2 * self._factory.n_workers) self._to_worker = [mp.Queue(1) for _ in range(self._factory.n_workers)] # If we crash from an exception, with full queues, we would rather not # hang forever, so we would like the process to close without flushing # the queues. # That's what cancel_join_thread does. for q in self._to_worker: q.cancel_join_thread() self._workers = [ mp.Process(target=run_worker, kwargs=dict( factory=self._factory, to_sampler=self._to_sampler, to_worker=self._to_worker[worker_number], worker_number=worker_number, agent=self._agents[worker_number], env=self._envs[worker_number], ), daemon=False) for worker_number in range(self._factory.n_workers) ] self._agent_version = 0 for w in self._workers: w.start() self.total_env_steps = 0
def start_worker(self): """Initialize the sampler.""" n_envs = self.n_envs envs = [pickle.loads(pickle.dumps(self.env)) for _ in range(n_envs)] # Deterministically set environment seeds based on the global seed. for (i, e) in enumerate(envs): e.seed(deterministic.get_seed() + i) self.vec_env = VecEnvExecutor( envs=envs, max_path_length=self.algo.max_path_length)
def setup(self, algo, env, sampler_cls=None, sampler_args=None, n_workers=psutil.cpu_count(logical=False), worker_class=DefaultWorker, worker_args=None): """Set up runner for algorithm and environment. This method saves algo and env within runner and creates a sampler. Note: After setup() is called all variables in session should have been initialized. setup() respects existing values in session so policy weights can be loaded before setup(). Args: algo (garage.np.algos.RLAlgorithm): An algorithm instance. env (garage.envs.GarageEnv): An environment instance. sampler_cls (garage.sampler.Sampler): A sampler class. sampler_args (dict): Arguments to be passed to sampler constructor. n_workers (int): The number of workers the sampler should use. worker_class (type): Type of worker the sampler should use. worker_args (dict or None): Additional arguments that should be passed to the worker. """ self._algo = algo self._env = env self._policy = self._algo.policy self._n_workers = n_workers self._worker_class = worker_class if sampler_args is None: sampler_args = {} if sampler_cls is None: sampler_cls = algo.sampler_cls if worker_args is None: worker_args = {} self._worker_args = worker_args self._sampler = self.make_sampler(sampler_cls, sampler_args=sampler_args, n_workers=n_workers, worker_class=worker_class, worker_args=worker_args) self._has_setup = True self._setup_args = SetupArgs(sampler_cls=sampler_cls, sampler_args=sampler_args, seed=get_seed())
def start_worker(self): """Start workers.""" n_envs = self._n_envs envs = [pickle.loads(pickle.dumps(self.env)) for _ in range(n_envs)] # Deterministically set environment seeds based on the global seed. seed0 = deterministic.get_seed() if seed0 is not None: for (i, e) in enumerate(envs): e.seed(seed0 + i) self._vec_env = VecEnvExecutor( envs=envs, max_path_length=self.algo.max_path_length)
def evaluate(self, algo, test_episodes_per_task=None): """Evaluate the Meta-RL algorithm on the test tasks. Args: algo (MetaRLAlgorithm): The algorithm to evaluate. test_episodes_per_task (int or None): Number of episodes per task. """ if test_episodes_per_task is None: test_episodes_per_task = self._n_test_episodes adapted_episodes = [] logger.log('Sampling for adaptation and meta-testing...') env_updates = self._test_task_sampler.sample(self._n_test_tasks) if self._test_sampler is None: env = env_updates[0]() self._max_episode_length = env.spec.max_episode_length self._test_sampler = LocalSampler.from_worker_factory( WorkerFactory(seed=get_seed(), max_episode_length=self._max_episode_length, n_workers=1, worker_class=self._worker_class, worker_args=self._worker_args), agents=algo.get_exploration_policy(), envs=env) for env_up in env_updates: policy = algo.get_exploration_policy() eps = EpisodeBatch.concatenate(*[ self._test_sampler.obtain_samples(self._eval_itr, 1, policy, env_up) for _ in range(self._n_exploration_eps) ]) adapted_policy = algo.adapt_policy(policy, eps) adapted_eps = self._test_sampler.obtain_samples( self._eval_itr, test_episodes_per_task * self._max_episode_length, adapted_policy) adapted_episodes.append(adapted_eps) logger.log('Finished meta-testing...') if self._test_task_names is not None: name_map = dict(enumerate(self._test_task_names)) else: name_map = None with tabular.prefix(self._prefix + '/' if self._prefix else ''): log_multitask_performance( self._eval_itr, EpisodeBatch.concatenate(*adapted_episodes), getattr(algo, 'discount', 1.0), name_map=name_map) self._eval_itr += 1
def make_sampler(self, sampler_cls, *, seed=None, n_workers=psutil.cpu_count(logical=False), max_path_length=None, worker_class=DefaultWorker, sampler_args=None, worker_args=None): """Construct a Sampler from a Sampler class. Args: sampler_cls (type): The type of sampler to construct. seed (int): Seed to use in sampler workers. max_path_length (int): Maximum path length to be sampled by the sampler. Paths longer than this will be truncated. n_workers (int): The number of workers the sampler should use. worker_class (type): Type of worker the Sampler should use. sampler_args (dict or None): Additional arguments that should be passed to the sampler. worker_args (dict or None): Additional arguments that should be passed to the worker. Returns: sampler_cls: An instance of the sampler class. """ if max_path_length is None: max_path_length = self._algo.max_path_length if seed is None: seed = get_seed() if sampler_args is None: sampler_args = {} if worker_args is None: worker_args = {} if issubclass(sampler_cls, BaseSampler): return sampler_cls(self._algo, self._env, **sampler_args) else: return sampler_cls.from_worker_factory(WorkerFactory( seed=seed, max_path_length=max_path_length, n_workers=n_workers, worker_class=worker_class, worker_args=worker_args), agents=self._algo.policy, envs=self._env)
def __init__( self, *, # Require passing by keyword. max_episode_length, is_tf_worker=False, seed=get_seed(), n_workers=psutil.cpu_count(logical=False), worker_class=DefaultWorker, worker_args=None): self.n_workers = n_workers self._seed = seed self._max_episode_length = max_episode_length if is_tf_worker: worker_class = TFWorkerClassWrapper(worker_class) self._worker_class = worker_class if worker_args is None: self._worker_args = {} else: self._worker_args = worker_args
def __init__( self, *, # Require passing by keyword. max_episode_length, is_tf_worker=False, seed=get_seed(), n_workers=psutil.cpu_count(logical=False), worker_class=DefaultWorker, worker_args=None): self.n_workers = n_workers self._seed = seed self._max_episode_length = max_episode_length if is_tf_worker: # Import here to avoid hard dependency on TF. # pylint: disable=import-outside-toplevel from garage.tf.samplers import TFWorkerClassWrapper worker_class = TFWorkerClassWrapper(worker_class) self._worker_class = worker_class if worker_args is None: self._worker_args = {} else: self._worker_args = worker_args
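# Hedged construction example for WorkerFactory: the factory only stores the
# sampling configuration; a sampler later uses it to build workers. Import
# paths follow this repo's own imports; the argument values are arbitrary.
from garage.sampler import DefaultWorker
from garage.sampler.worker_factory import WorkerFactory

factory = WorkerFactory(
    max_episode_length=200,
    is_tf_worker=False,        # wrap workers for a TF session when True
    seed=24,                   # defaults to get_seed() when omitted
    n_workers=2,
    worker_class=DefaultWorker,
    worker_args=None,
)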
def __init__(self, snapshot_config, max_cpus=1): self._snapshotter = Snapshotter( snapshot_config.snapshot_dir, snapshot_config.snapshot_mode, snapshot_config.snapshot_gap, ) parallel_sampler.initialize(max_cpus) seed = get_seed() if seed is not None: parallel_sampler.set_seed(seed) self._has_setup = False self._plot = False self._setup_args = None self._train_args = None self._stats = ExperimentStats( total_itr=0, total_env_steps=0, total_epoch=0, last_path=None ) self._algo = None self._env = None self._sampler = None self._plotter = None self._start_time = None self._itr_start_time = None self.step_itr = None self.step_path = None # only used for off-policy algorithms self.enable_logging = True self._n_workers = None self._worker_class = None self._worker_args = None
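# Hedged sketch of the snapshot configuration consumed by the constructor
# above (and positionally in run_task further below). The import path and the
# exact set of valid snapshot modes are assumptions; the field names match
# the attribute accesses above.
from garage.experiment import SnapshotConfig

snapshot_config = SnapshotConfig(
    snapshot_dir='data/local/experiment',  # where itr_*.pkl files are written
    snapshot_mode='last',                  # e.g. 'all', 'last', or 'gap'
    snapshot_gap=1,                        # used when snapshot_mode == 'gap'
)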
def __init__( self, agents, envs, *, # After this require passing by keyword. worker_factory=None, max_episode_length=None, is_tf_worker=False, seed=get_seed(), n_workers=psutil.cpu_count(logical=False), worker_class=DefaultWorker, worker_args=None): # pylint: disable=super-init-not-called if worker_factory is None and max_episode_length is None: raise TypeError('Must construct a sampler from WorkerFactory or ' 'parameters (at least max_episode_length)') if isinstance(worker_factory, WorkerFactory): self._factory = worker_factory else: self._factory = WorkerFactory( max_episode_length=max_episode_length, is_tf_worker=is_tf_worker, seed=seed, n_workers=n_workers, worker_class=worker_class, worker_args=worker_args) self._agents = self._factory.prepare_worker_messages(agents) self._envs = self._factory.prepare_worker_messages( envs, preprocess=copy.deepcopy) self._workers = [ self._factory(i) for i in range(self._factory.n_workers) ] for worker, agent, env in zip(self._workers, self._agents, self._envs): worker.update_agent(agent) worker.update_env(env) self.total_env_steps = 0
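# Illustrative stand-in for prepare_worker_messages, based on how it is called
# in the constructors above (this is an inferred behavior, not the library's
# implementation): broadcast a single object, or validate a per-worker list,
# applying an optional preprocess such as copy.deepcopy or cloudpickle.dumps
# so every worker receives its own copy.
import copy


def broadcast_to_workers(objs, n_workers, preprocess=lambda x: x):
    """Return one preprocessed message per worker."""
    if isinstance(objs, list):
        if len(objs) != n_workers:
            raise ValueError('Length of list must match n_workers')
        return [preprocess(obj) for obj in objs]
    return [preprocess(objs) for _ in range(n_workers)]


env_messages = broadcast_to_workers({'task': 1}, n_workers=3,
                                    preprocess=copy.deepcopy)
assert len(env_messages) == 3 and env_messages[0] is not env_messages[1]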
def _build(self, state_input): action_dim = self._output_dim with tf.variable_scope('dist_params'): if self._std_share_network: # mean and std networks share an MLP b = np.concatenate([ np.zeros(action_dim), np.full(action_dim, self._init_std_param) ], axis=0) # yapf: disable b = tf.constant_initializer(b) mean_std_network = mlp( state_input, output_dim=action_dim * 2, hidden_sizes=self._hidden_sizes, hidden_nonlinearity=self._hidden_nonlinearity, output_nonlinearity=self._output_nonlinearity, output_b_init=b, name='mean_std_network') with tf.variable_scope('mean_network'): mean_network = mean_std_network[..., :action_dim] with tf.variable_scope('std_network'): std_network = mean_std_network[..., action_dim:] else: # separate MLPs for mean and std networks # mean network mean_network = mlp( state_input, output_dim=action_dim, hidden_sizes=self._hidden_sizes, hidden_nonlinearity=self._hidden_nonlinearity, output_nonlinearity=self._output_nonlinearity, name='mean_network') # std network if self._adaptive_std: b = tf.constant_initializer(self._init_std_param) std_network = mlp( state_input, output_dim=action_dim, hidden_sizes=self._std_hidden_sizes, hidden_nonlinearity=self._std_hidden_nonlinearity, output_nonlinearity=self._std_output_nonlinearity, output_b_init=b, name='std_network') else: p = tf.constant_initializer(self._init_std_param) std_network = parameter(state_input, length=action_dim, initializer=p, trainable=self._learn_std, name='std_network') mean_var = mean_network std_param_var = std_network with tf.variable_scope('std_parameterization'): # build std_var with std parameterization if self._std_parameterization == 'exp': pass elif self._std_parameterization == 'softplus': std_param_var = tf.log(1. + tf.exp(std_param_var)) else: raise NotImplementedError with tf.variable_scope('std_limits'): std_var = std_param_var if self._min_std_param: std_var = tf.maximum(std_var, self._min_std_param) if self._max_std_param: std_var = tf.minimum(std_var, self._max_std_param) dist = DiagonalGaussian(action_dim) rnd = tf.random.normal(shape=mean_var.get_shape().as_list()[1:], seed=deterministic.get_seed()) action_var = rnd * tf.exp(std_var) + mean_var return action_var, mean_var, std_var, std_param_var, dist
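# Self-contained numpy sketch of the shared-head split used above: one network
# emits 2 * action_dim values per state, which are sliced into a mean half and
# a (log-)std half. The array values here are arbitrary.
import numpy as np

action_dim = 3
batch = np.arange(12.0).reshape(2, 2 * action_dim)  # fake network output
mean = batch[..., :action_dim]
log_std = batch[..., action_dim:]
assert mean.shape == log_std.shape == (2, action_dim)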
def _build(self, state_input): action_dim = self._output_dim with tf.variable_scope('dist_params'): if self._std_share_network: # mean and std networks share an MLP b = np.concatenate([ np.zeros(action_dim), np.full(action_dim, self._init_std_param) ], axis=0) # yapf: disable b = tf.constant_initializer(b) mean_std_network = mlp( state_input, output_dim=action_dim * 2, hidden_sizes=self._hidden_sizes, hidden_nonlinearity=self._hidden_nonlinearity, output_nonlinearity=self._output_nonlinearity, output_b_init=b, name='mean_std_network') with tf.variable_scope('mean_network'): mean_network = mean_std_network[..., :action_dim] with tf.variable_scope('std_network'): std_network = mean_std_network[..., action_dim:] else: # separate MLPs for mean and std networks # mean network mean_network = mlp( state_input, output_dim=action_dim, hidden_sizes=self._hidden_sizes, hidden_nonlinearity=self._hidden_nonlinearity, output_nonlinearity=self._output_nonlinearity, name='mean_network') # std network if self._adaptive_std: b = tf.constant_initializer(self._init_std_param) std_network = mlp( state_input, output_dim=action_dim, hidden_sizes=self._std_hidden_sizes, hidden_nonlinearity=self._std_hidden_nonlinearity, output_nonlinearity=self._std_output_nonlinearity, output_b_init=b, name='std_network') else: p = tf.constant_initializer(self._init_std_param) std_network = parameter(state_input, length=action_dim, initializer=p, trainable=self._learn_std, name='std_network') mean_var = mean_network log_std_var = std_network with tf.variable_scope('std_parameterization'): # build std_var with std parameterization if self._std_parameterization == 'exp': pass elif self._std_parameterization == 'softplus': softplus_std_var = tf.log(1. + tf.exp(log_std_var)) log_std_var = tf.log(softplus_std_var) else: raise NotImplementedError with tf.variable_scope('std_limits'): if self._min_std_param: log_std_var = tf.maximum(log_std_var, self._min_std_param) if self._max_std_param: log_std_var = tf.minimum(log_std_var, self._max_std_param) distribution = tfp.distributions.MultivariateNormalDiag( mean_var, tf.exp(log_std_var)) action_var = distribution.sample(seed=deterministic.get_seed()) return action_var, log_std_var, distribution
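# Self-contained numpy check of the two std parameterizations handled above:
# 'exp' treats the raw network output as log(std) directly, while 'softplus'
# maps it through log(1 + exp(x)) and then takes the log, so both branches end
# with a log-std that can be clamped and exponentiated the same way. Values
# are arbitrary.
import numpy as np

param = np.array([-2.0, 0.0, 2.0])                   # raw network output
log_std_exp = param                                  # 'exp' parameterization
log_std_softplus = np.log(np.log1p(np.exp(param)))   # 'softplus'
min_std_param, max_std_param = -1.5, 1.5
clamped = np.clip(log_std_exp, min_std_param, max_std_param)
print(np.exp(clamped))                               # std actually used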
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. # See the GNU Affero Public License for more details. # # You should have received a copy of the GNU Affero Public License # along with this program. If not, see http://www.gnu.org/licenses. # # http://numenta.org/licenses/ # # ------------------------------------------------------------------------------ import copy from garage.experiment.deterministic import get_seed from garage.sampler import FragmentWorker, LocalSampler from garage.sampler.worker_factory import WorkerFactory seed = get_seed() class SingleVecWorkSampler(LocalSampler): """Sampler class with a single vectorized worker that holds all the envs. The sampler needs to be created either from a worker factory or from parameters which can construct a worker factory. See the __init__ method of WorkerFactory for details on these parameters. Args: agents (Policy or List[Policy]): Agent(s) to use to sample episodes. If a list is passed in, it must have length exactly `worker_factory.n_workers`, and will be spread across the workers. envs (Environment or List[Environment]): Environment from which
def _build(self, state_input): action_dim = self._output_dim with tf.variable_scope('dist_params'): if self._std_share_network: # mean and std networks share an MLP b = np.concatenate([ np.zeros(action_dim), np.full(action_dim, self._init_std_param) ], axis=0) # yapf: disable mean_std_network = mlp( state_input, output_dim=action_dim * 2, hidden_sizes=self._hidden_sizes, hidden_nonlinearity=self._hidden_nonlinearity, hidden_w_init=self._hidden_w_init, hidden_b_init=self._hidden_b_init, output_nonlinearity=self._output_nonlinearity, output_w_init=self._output_w_init, output_b_init=tf.constant_initializer(b), name='mean_std_network', layer_normalization=self._layer_normalization) with tf.variable_scope('mean_network'): mean_network = mean_std_network[..., :action_dim] with tf.variable_scope('log_std_network'): log_std_network = mean_std_network[..., action_dim:] else: # separate MLPs for mean and std networks # mean network mean_network = mlp( state_input, output_dim=action_dim, hidden_sizes=self._hidden_sizes, hidden_nonlinearity=self._hidden_nonlinearity, hidden_w_init=self._hidden_w_init, hidden_b_init=self._hidden_b_init, output_nonlinearity=self._output_nonlinearity, output_w_init=self._output_w_init, output_b_init=self._output_b_init, name='mean_network', layer_normalization=self._layer_normalization) # std network if self._adaptive_std: log_std_network = mlp( state_input, output_dim=action_dim, hidden_sizes=self._std_hidden_sizes, hidden_nonlinearity=self._std_hidden_nonlinearity, hidden_w_init=self._std_hidden_w_init, hidden_b_init=self._std_hidden_b_init, output_nonlinearity=self._std_output_nonlinearity, output_w_init=self._std_output_w_init, output_b_init=tf.constant_initializer( self._init_std_param), name='log_std_network', layer_normalization=self._layer_normalization) else: log_std_network = parameter( state_input, length=action_dim, initializer=tf.constant_initializer( self._init_std_param), trainable=self._learn_std, name='log_std_network') mean_var = mean_network std_param = log_std_network with tf.variable_scope('std_parameterization'): # build std_var with std parameterization if self._std_parameterization == 'exp': log_std_var = std_param else: # we know it must be softplus here log_std_var = tf.log(1. + tf.exp(std_param)) with tf.variable_scope('std_limits'): if self._min_std_param is not None: log_std_var = tf.maximum(log_std_var, self._min_std_param) if self._max_std_param is not None: log_std_var = tf.minimum(log_std_var, self._max_std_param) dist = DiagonalGaussian(self._output_dim) rnd = tf.random.normal(shape=mean_var.get_shape().as_list()[1:], seed=deterministic.get_seed()) action_var = rnd * tf.exp(log_std_var) + mean_var return action_var, mean_var, log_std_var, std_param, dist
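# Self-contained numpy sketch of the sampling line above: drawing standard
# normal noise and computing rnd * exp(log_std) + mean is the
# reparameterization of a diagonal Gaussian, which is why the graph can
# backpropagate through mean and log_std. Values are arbitrary.
import numpy as np

rng = np.random.default_rng(0)       # analogous to seeding with get_seed()
mean = np.array([0.0, 1.0])
log_std = np.array([-1.0, 0.5])
rnd = rng.standard_normal(mean.shape)
action = rnd * np.exp(log_std) + mean
print(action)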
def make_sampler(self, sampler_cls, *, seed=None, n_workers=psutil.cpu_count(logical=False), max_episode_length=None, worker_class=None, sampler_args=None, worker_args=None): """Construct a Sampler from a Sampler class. Args: sampler_cls (type): The type of sampler to construct. seed (int): Seed to use in sampler workers. max_episode_length (int): Maximum episode length to be sampled by the sampler. Episodes longer than this will be truncated. n_workers (int): The number of workers the sampler should use. worker_class (type): Type of worker the Sampler should use. sampler_args (dict or None): Additional arguments that should be passed to the sampler. worker_args (dict or None): Additional arguments that should be passed to the worker. Raises: ValueError: If `max_episode_length` isn't passed and the algorithm doesn't contain a `max_episode_length` field, or if the algorithm doesn't have a policy field. Returns: sampler_cls: An instance of the sampler class. """ policy = getattr(self._algo, 'exploration_policy', None) if policy is None: policy = getattr(self._algo, 'policy', None) if policy is None: raise ValueError('If the trainer is used to construct a sampler, ' 'the algorithm must have a `policy` or ' '`exploration_policy` field.') if max_episode_length is None: if hasattr(self._algo, 'max_episode_length'): max_episode_length = self._algo.max_episode_length if max_episode_length is None: raise ValueError('If `sampler_cls` is specified in trainer.setup, ' 'the algorithm must specify `max_episode_length`') if worker_class is None: worker_class = getattr(self._algo, 'worker_cls', DefaultWorker) if seed is None: seed = get_seed() if sampler_args is None: sampler_args = {} if worker_args is None: worker_args = {} return sampler_cls.from_worker_factory(WorkerFactory( seed=seed, max_episode_length=max_episode_length, n_workers=n_workers, worker_class=worker_class, worker_args=worker_args), agents=policy, envs=self._env)
def _build_graph(self, from_input): latent_dim = self.latent_space.flat_dim small = 1e-5 with self._variable_scope: with tf.variable_scope("dist_params"): if self._std_share_network: # mean and std networks share an MLP b = np.concatenate([ np.zeros(latent_dim), np.full(latent_dim, self._init_std_param) ], axis=0) b = tf.constant_initializer(b) # b = tf.truncated_normal_initializer( # mean=b, stddev=small) mean_std_network = mlp( with_input=from_input, output_dim=latent_dim * 2, hidden_sizes=self._hidden_sizes, hidden_nonlinearity=self._hidden_nonlinearity, output_nonlinearity=self._output_nonlinearity, output_b_init=b, name="mean_std_network") with tf.variable_scope("mean_network"): mean_network = mean_std_network[..., :latent_dim] with tf.variable_scope("std_network"): std_network = mean_std_network[..., latent_dim:] else: # separate MLPs for mean and std networks # mean network mean_network = mlp( with_input=from_input, output_dim=latent_dim, hidden_sizes=self._hidden_sizes, hidden_nonlinearity=self._hidden_nonlinearity, output_nonlinearity=self._output_nonlinearity, name="mean_network") # std network if self._adaptive_std: b = tf.constant_initializer(self._init_std_param) # b = tf.truncated_normal_initializer( # mean=self._init_std_param, stddev=small) std_network = mlp( with_input=from_input, output_dim=latent_dim, hidden_sizes=self._std_hidden_sizes, hidden_nonlinearity=self._std_hidden_nonlinearity, output_nonlinearity=self._output_nonlinearity, output_b_init=b, name="std_network") else: p = tf.constant_initializer(self._init_std_param) # p = tf.truncated_normal_initializer( # mean=self._init_std_param, stddev=small) std_network = parameter(with_input=from_input, length=latent_dim, initializer=p, trainable=self._learn_std, name="std_network") mean_var = mean_network std_param_var = std_network with tf.variable_scope("std_limits"): if self._min_std_param: std_param_var = tf.maximum(std_param_var, self._min_std_param) if self._max_std_param: std_param_var = tf.minimum(std_param_var, self._max_std_param) with tf.variable_scope("std_parameterization"): # build std_var with std parameterization if self._std_parameterization == "exp": std_var = tf.exp(std_param_var) elif self._std_parameterization == "softplus": std_var = tf.log(1. + tf.exp(std_param_var)) else: raise NotImplementedError dist = tf.contrib.distributions.MultivariateNormalDiag( mean_var, std_var) latent_var = dist.sample(seed=deterministic.get_seed()) return latent_var, mean_var, std_param_var, dist
def run_task(snapshot_config, exp_config): logger.log(f"Config of this experiment is {exp_config}") env_config = exp_config["env"] env_name = env_config["name"] replay_buffer_size = exp_config.get("replay_buffer_size") n_epochs = exp_config.get("n_epochs") steps_per_epoch = exp_config.get("steps_per_epoch") sampler_batch_size = exp_config.get("sampler_batch_size") n_train_steps = exp_config.get("n_train_steps") learning_rate = exp_config.get("learning_rate") buffer_batch_size = exp_config.get("buffer_batch_size") target_network_update_freq = exp_config.get("target_network_update_freq") min_buffer_size = exp_config.get("min_buffer_size") net_config = exp_config.get("q-net") loss_weights = exp_config.get("loss_weights") deepmdp_config = exp_config.get("deepmdp") epsilon_greedy_config = exp_config.get("epsilon_greedy") plots = exp_config.get("plots") steps = n_epochs * steps_per_epoch * sampler_batch_size num_frames = env_config.get("num_frames") if num_frames is None: num_frames = 4 snapshot_config = SnapshotConfig( os.path.join(os.getcwd(), f'runs/{get_info()}/snapshots'), snapshot_config["snapshot_mode"], snapshot_config["snapshot_gap"]) if "LunarLander-v2" in env_name: # Pass either LunarLander-v2 or LunarLander-v2-img to have the env give back image or semantic observations. if env_name[-4:] == "-img": env = setup_lunar_lander_with_image_obs( env_name[:-4], num_frames, do_noops=env_config["do_noops"]) elif env_name[-4:] == "-obf": env = setup_lunar_lander_with_obfuscated_states( env_name[:-4], num_frames, do_noops=env_config["do_noops"]) elif env_name[-4:] == "-stk": env = setup_stacked_lunar_lander_env( env_name[:-4], num_frames, normalize=env_config["normalize"]) else: env = GarageEnv(gym.make(env_name)) elif "SpaceInvaders-v0" == env_name: env = setup_atari_env(env_name, num_frames) else: raise ValueError("Env name not known") # Set env seed env.seed(get_seed()) # Set seed for action space (needed for epsilon-greedy reproducibility) env.action_space.seed(get_seed()) # Init visualizer visualizer = Visualizer(get_info() + "_main", plots) visualizer.publish_config(exp_config) runner = LocalRunner(snapshot_config) replay_buffer = SimpleReplayBuffer(env.spec, size_in_transitions=replay_buffer_size, time_horizon=1) strategy = EpsilonGreedyStrategy(env.spec, steps, **epsilon_greedy_config) qf = DiscreteCNNQFunction(env_spec=env.spec, **net_config) aux_objectives = [] if deepmdp_config["use"]: reward_objective = RewardAuxiliaryObjective( env.spec, qf.embedding_size, deepmdp_config["reward_head"]) transition_objective = TransitionAuxiliaryObjective( env.spec, qf.embedding_size, deepmdp_config["transition_head"]) aux_objectives.append(reward_objective) aux_objectives.append(transition_objective) policy = DiscreteQfDerivedPolicy(env.spec, qf) algo = DQN(policy=policy, qf=qf, env_spec=env.spec, experiment_id=get_info(), plot_list=plots, visualizer=visualizer, replay_buffer=replay_buffer, qf_optimizer=torch.optim.Adam, exploration_strategy=strategy, n_train_steps=n_train_steps, buffer_batch_size=buffer_batch_size, min_buffer_size=min_buffer_size, n_epoch_cycles=steps_per_epoch, target_network_update_freq=target_network_update_freq, qf_lr=learning_rate, max_path_length=1000, auxiliary_objectives=aux_objectives, **loss_weights) # Use the modified off-policy sampler so summary statistics about episode Q-values are passed back to the algo object.
runner.setup(algo=algo, env=env, sampler_cls=OffPolicyVectorizedSampler) runner.train(n_epochs=n_epochs, batch_size=sampler_batch_size) # Bypass GarageEnv.close() since it requires a display env.env.close()
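# Hedged sketch of the seeding done in run_task above: both the environment
# and its action space are seeded so epsilon-greedy exploration (which samples
# random actions from action_space) is reproducible. The gym environment id is
# only an example, and env.seed() assumes the older gym API this codebase uses.
import gym

from garage.experiment.deterministic import get_seed, set_seed

set_seed(42)
env = gym.make('CartPole-v1')
env.seed(get_seed())
env.action_space.seed(get_seed())
print([env.action_space.sample() for _ in range(5)])  # same sequence each run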
def _build_graph(self, from_input): latent_dim = self.latent_space.flat_dim small = 1e-5 with self._variable_scope: with tf.variable_scope("word2vec"): lstm_cell = tf.nn.rnn_cell.BasicLSTMCell( num_units=self._sentence_embedding_dict_dim) self.sentence_embedding_dict = tf.Variable(np.zeros( (self._sentence_code_dim, self._sentence_embedding_dict_dim)), dtype=tf.float32) # (bs, max_sentence_len, sentence_embedding_dict_dim) self.sentence_embedding = tf.nn.embedding_lookup( params=self.sentence_embedding_dict, ids=from_input) data_mask = tf.cast(from_input, tf.bool) data_len = tf.reduce_sum(tf.cast(data_mask, tf.int32), axis=1) initial_state = lstm_cell.zero_state( tf.shape(self.sentence_embedding)[0], tf.float32) input_vec = tf.nn.dynamic_rnn( lstm_cell, self.sentence_embedding, sequence_length=data_len, initial_state=initial_state)[0][:, -1] with tf.variable_scope("dist_params"): if self._std_share_network: # mean and std networks share an MLP b = np.concatenate([ np.zeros(latent_dim), np.full(latent_dim, self._init_std_param) ], axis=0) b = tf.constant_initializer(b) mean_std_network = mlp( with_input=input_vec, output_dim=latent_dim * 2, hidden_sizes=self._hidden_sizes, hidden_nonlinearity=self._hidden_nonlinearity, output_nonlinearity=self._output_nonlinearity, # hidden_w_init=tf.orthogonal_initializer(1.0), # output_w_init=tf.orthogonal_initializer(1.0), output_b_init=b, name="mean_std_network") with tf.variable_scope("mean_network"): mean_network = mean_std_network[..., :latent_dim] with tf.variable_scope("std_network"): std_network = mean_std_network[..., latent_dim:] else: # separate MLPs for mean and std networks # mean network mean_network = mlp( with_input=input_vec, output_dim=latent_dim, hidden_sizes=self._hidden_sizes, hidden_nonlinearity=self._hidden_nonlinearity, output_nonlinearity=self._output_nonlinearity, name="mean_network") # std network if self._adaptive_std: b = tf.constant_initializer(self._init_std_param) std_network = mlp( with_input=input_vec, output_dim=latent_dim, hidden_sizes=self._std_hidden_sizes, hidden_nonlinearity=self._std_hidden_nonlinearity, output_nonlinearity=self._output_nonlinearity, output_b_init=b, name="std_network") else: p = tf.constant_initializer(self._init_std_param) std_network = parameter(with_input=input_vec, length=latent_dim, initializer=p, trainable=self._learn_std, name="std_network") if self._mean_scale != 1.: mean_var = tf.identity(mean_network * self._mean_scale, "mean_scale") else: mean_var = mean_network if self._mean_output_nonlinearity is not None: mean_var = self._mean_output_nonlinearity(mean_var) std_param_var = std_network with tf.variable_scope("std_limits"): if self._min_std_param: std_param_var = tf.maximum(std_param_var, self._min_std_param) if self._max_std_param: std_param_var = tf.minimum(std_param_var, self._max_std_param) with tf.variable_scope("std_parameterization"): # build std_var with std parameterization if self._std_parameterization == "exp": std_var = tf.exp(std_param_var) elif self._std_parameterization == "softplus": std_var = tf.log(1. + tf.exp(std_param_var)) else: raise NotImplementedError if self._normalize: mean_var = tf.nn.l2_normalize(mean_var) #std_var = tf.nn.l2_normalize(std_var) dist = tf.contrib.distributions.MultivariateNormalDiag( mean_var, std_var) latent_var = dist.sample(seed=deterministic.get_seed()) return latent_var, mean_var, std_param_var, dist
def setup(self, algo, env, irl, baseline, n_itr=200, start_itr=0, sampler_cls=None, sampler_args=None, n_workers=psutil.cpu_count(logical=False), worker_class=None, worker_args=None, discount=0.99, gae_lambda=1, discrim_train_itrs=10, discrim_batch_size=32, irl_model_wt=1.0, zero_environment_reward=False): """Set up runner for IRL training. This method saves algo, env, irl and baseline within the runner and creates a sampler. Args: discount (float): Discount factor. irl_model_wt (float): Weight of the IRL model. """ self._algo = algo self._env = env self._irl = irl self._baseline = baseline self._n_workers = n_workers self._worker_class = worker_class self.n_itr = n_itr self.start_itr = start_itr if sampler_args is None: sampler_args = {} if sampler_cls is None: sampler_cls = getattr(algo, 'sampler_cls', None) if worker_class is None: worker_class = getattr(algo, 'worker_cls', DefaultWorker) if worker_args is None: worker_args = {} self._worker_args = worker_args if sampler_cls is None: self._sampler = None else: self._sampler = self.make_sampler(sampler_cls, sampler_args=sampler_args, n_workers=n_workers, worker_class=worker_class, worker_args=worker_args) self._has_setup = True self._setup_args = SetupArgs(sampler_cls=sampler_cls, sampler_args=sampler_args, seed=get_seed()) self.irl_model_wt = irl_model_wt self.discount = discount self.gae_lambda = gae_lambda self.discrim_train_itrs = discrim_train_itrs self.discrim_batch_size = discrim_batch_size self.no_reward = zero_environment_reward