def __init__(self, snapshot_config, max_cpus=1):
    self._snapshotter = Snapshotter(snapshot_config.snapshot_dir,
                                    snapshot_config.snapshot_mode,
                                    snapshot_config.snapshot_gap)

    if max_cpus > 1:
        # pylint: disable=import-outside-toplevel
        from garage.sampler import singleton_pool
        singleton_pool.initialize(max_cpus)

    self._has_setup = False
    self._plot = False

    self._setup_args = None
    self._train_args = None
    self._stats = ExperimentStats(total_itr=0,
                                  total_env_steps=0,
                                  total_epoch=0,
                                  last_path=None)

    self._algo = None
    self._env = None
    self._policy = None
    self._sampler = None
    self._plotter = None

    self._start_time = None
    self._itr_start_time = None
    self.step_itr = None
    self.step_path = None
def __init__(self, snapshot_config, max_cpus=1):
    self._snapshotter = Snapshotter(snapshot_config.snapshot_dir,
                                    snapshot_config.snapshot_mode,
                                    snapshot_config.snapshot_gap)

    parallel_sampler.initialize(max_cpus)

    seed = get_seed()
    if seed is not None:
        parallel_sampler.set_seed(seed)

    self._has_setup = False
    self._plot = False

    self._setup_args = None
    self._train_args = None
    self._stats = ExperimentStats(total_itr=0,
                                  total_env_steps=0,
                                  total_epoch=0,
                                  last_path=None)

    self._algo = None
    self._env = None
    self._sampler = None
    self._plotter = None

    self._start_time = None
    self._itr_start_time = None
    self.step_itr = None
    self.step_path = None

    # only used for off-policy algorithms
    self.enable_logging = True

    self._n_workers = None
    self._worker_class = None
    self._worker_args = None
class LocalRunner:
    """Base class of local runner.

    Use Runner.setup(algo, env) to set up the algorithm and environment for
    the runner, and Runner.train() to start training.

    Args:
        snapshot_config (garage.experiment.SnapshotConfig): The snapshot
            configuration used by LocalRunner to create the snapshotter.
            If None, it will create one with default settings.

    Note:
        For the use of any TensorFlow environments, policies and algorithms,
        please use LocalTFRunner().

    Examples:
        | # to train
        | runner = LocalRunner()
        | env = Env(...)
        | policy = Policy(...)
        | algo = Algo(
        |         env=env,
        |         policy=policy,
        |         ...)
        | runner.setup(algo, env)
        | runner.train(n_epochs=100, batch_size=4000)

        | # to resume immediately.
        | runner = LocalRunner()
        | runner.restore(resume_from_dir)
        | runner.resume()

        | # to resume with modified training arguments.
        | runner = LocalRunner()
        | runner.restore(resume_from_dir)
        | runner.resume(n_epochs=20)

    """

    def __init__(self, snapshot_config):
        self._snapshotter = Snapshotter(snapshot_config.snapshot_dir,
                                        snapshot_config.snapshot_mode,
                                        snapshot_config.snapshot_gap)

        self._has_setup = False
        self._plot = False

        self._setup_args = None
        self._train_args = None
        self._stats = ExperimentStats(total_itr=0,
                                      total_env_steps=0,
                                      total_epoch=0,
                                      last_path=None)

        self._algo = None
        self._env = None
        self._sampler = None
        self._plotter = None

        self._start_time = None
        self._itr_start_time = None
        self.step_itr = None
        self.step_path = None

        # only used for off-policy algorithms
        self.enable_logging = True

        self._n_workers = None
        self._worker_class = None
        self._worker_args = None

    def make_sampler(self,
                     sampler_cls,
                     *,
                     seed=None,
                     n_workers=psutil.cpu_count(logical=False),
                     max_episode_length=None,
                     worker_class=None,
                     sampler_args=None,
                     worker_args=None):
        """Construct a Sampler from a Sampler class.

        Args:
            sampler_cls (type): The type of sampler to construct.
            seed (int): Seed to use in sampler workers.
            max_episode_length (int): Maximum path length to be sampled by
                the sampler. Paths longer than this will be truncated.
            n_workers (int): The number of workers the sampler should use.
            worker_class (type): Type of worker the sampler should use.
            sampler_args (dict or None): Additional arguments that should be
                passed to the sampler.
            worker_args (dict or None): Additional arguments that should be
                passed to the worker.

        Raises:
            ValueError: If `max_episode_length` isn't passed and the
                algorithm doesn't contain a `max_episode_length` field, or
                if the algorithm doesn't have a policy field.

        Returns:
            sampler_cls: An instance of the sampler class.

        """
        policy = getattr(self._algo, 'exploration_policy', None)
        if policy is None:
            policy = getattr(self._algo, 'policy', None)
        if policy is None:
            raise ValueError('If the runner is used to construct a sampler, '
                             'the algorithm must have a `policy` or '
                             '`exploration_policy` field.')
        if max_episode_length is None:
            if hasattr(self._algo, 'max_episode_length'):
                max_episode_length = self._algo.max_episode_length
        if max_episode_length is None:
            raise ValueError('If `sampler_cls` is specified in runner.setup, '
                             'the algorithm must specify '
                             '`max_episode_length`.')
        if worker_class is None:
            worker_class = getattr(self._algo, 'worker_cls', DefaultWorker)
        if seed is None:
            seed = get_seed()
        if sampler_args is None:
            sampler_args = {}
        if worker_args is None:
            worker_args = {}
        return sampler_cls.from_worker_factory(
            WorkerFactory(seed=seed,
                          max_episode_length=max_episode_length,
                          n_workers=n_workers,
                          worker_class=worker_class,
                          worker_args=worker_args),
            agents=policy,
            envs=self._env)

    def setup(self,
              algo,
              env,
              sampler_cls=None,
              sampler_args=None,
              n_workers=psutil.cpu_count(logical=False),
              worker_class=DefaultWorker,
              worker_args=None):
        """Set up runner for algorithm and environment.

        This method saves algo and env within runner and creates a sampler.

        Note:
            After setup() is called all variables in session should have been
            initialized. setup() respects existing values in session so
            policy weights can be loaded before setup().

        Args:
            algo (garage.np.algos.RLAlgorithm): An algorithm instance.
            env (garage.envs.GarageEnv): An environment instance.
            sampler_cls (garage.sampler.Sampler): A sampler class.
            sampler_args (dict): Arguments to be passed to sampler
                constructor.
            n_workers (int): The number of workers the sampler should use.
            worker_class (type): Type of worker the sampler should use.
            worker_args (dict or None): Additional arguments that should be
                passed to the worker.

        Raises:
            ValueError: If sampler_cls is passed and the algorithm doesn't
                contain a `max_episode_length` field.

        """
        self._algo = algo
        self._env = env
        self._n_workers = n_workers
        self._worker_class = worker_class
        if sampler_args is None:
            sampler_args = {}
        if sampler_cls is None:
            sampler_cls = getattr(algo, 'sampler_cls', None)
        if worker_args is None:
            worker_args = {}

        self._worker_args = worker_args
        if sampler_cls is None:
            self._sampler = None
        else:
            self._sampler = self.make_sampler(sampler_cls,
                                              sampler_args=sampler_args,
                                              n_workers=n_workers,
                                              worker_class=worker_class,
                                              worker_args=worker_args)

        self._has_setup = True

        self._setup_args = SetupArgs(sampler_cls=sampler_cls,
                                     sampler_args=sampler_args,
                                     seed=get_seed())

    def _start_worker(self):
        """Start Plotter and Sampler workers."""
        if self._plot:
            # pylint: disable=import-outside-toplevel
            from garage.plotter import Plotter
            self._plotter = Plotter()
            self._plotter.init_plot(self.get_env_copy(), self._algo.policy)

    def _shutdown_worker(self):
        """Shutdown Plotter and Sampler workers."""
        if self._sampler is not None:
            self._sampler.shutdown_worker()
        if self._plot:
            self._plotter.close()

    def obtain_trajectories(self,
                            itr,
                            batch_size=None,
                            agent_update=None,
                            env_update=None):
        """Obtain one batch of trajectories.

        Args:
            itr (int): Index of iteration (epoch).
            batch_size (int): Number of steps in batch. This is a hint that
                the sampler may or may not respect.
            agent_update (object): Value which will be passed into the
                `agent_update_fn` before doing rollouts. If a list is passed
                in, it must have length exactly `factory.n_workers`, and
                will be spread across the workers.
            env_update (object): Value which will be passed into the
                `env_update_fn` before doing rollouts. If a list is passed
                in, it must have length exactly `factory.n_workers`, and
                will be spread across the workers.

        Raises:
            ValueError: Raised if the runner was initialized without a
                sampler, or batch_size wasn't provided here or to train.

        Returns:
            TrajectoryBatch: Batch of trajectories.

        """
        if self._sampler is None:
            raise ValueError('Runner was not initialized with `sampler_cls`. '
                             'Either provide `sampler_cls` to runner.setup, '
                             'or set `algo.sampler_cls`.')
        if batch_size is None and self._train_args.batch_size is None:
            raise ValueError('Runner was not initialized with `batch_size`. '
                             'Either provide `batch_size` to runner.train, '
                             'or pass `batch_size` to runner.obtain_samples.')
        paths = None
        if agent_update is None:
            agent_update = self._algo.policy.get_param_values()
        paths = self._sampler.obtain_samples(
            itr, (batch_size or self._train_args.batch_size),
            agent_update=agent_update,
            env_update=env_update)
        self._stats.total_env_steps += sum(paths.lengths)
        return paths

    def obtain_samples(self,
                       itr,
                       batch_size=None,
                       agent_update=None,
                       env_update=None):
        """Obtain one batch of samples.

        Args:
            itr (int): Index of iteration (epoch).
            batch_size (int): Number of steps in batch. This is a hint that
                the sampler may or may not respect.
            agent_update (object): Value which will be passed into the
                `agent_update_fn` before doing rollouts. If a list is passed
                in, it must have length exactly `factory.n_workers`, and
                will be spread across the workers.
            env_update (object): Value which will be passed into the
                `env_update_fn` before doing rollouts. If a list is passed
                in, it must have length exactly `factory.n_workers`, and
                will be spread across the workers.

        Raises:
            ValueError: Raised if the runner was initialized without a
                sampler, or batch_size wasn't provided here or to train.

        Returns:
            list[dict]: One batch of samples.

        """
        trajs = self.obtain_trajectories(itr, batch_size, agent_update,
                                         env_update)
        return trajs.to_trajectory_list()

    def save(self, epoch):
        """Save snapshot of current batch.

        Args:
            epoch (int): Epoch.

        Raises:
            NotSetupError: if save() is called before the runner is set up.

        """
        if not self._has_setup:
            raise NotSetupError('Use setup() to setup runner before saving.')

        logger.log('Saving snapshot...')

        params = dict()
        # Save arguments
        params['setup_args'] = self._setup_args
        params['train_args'] = self._train_args
        params['stats'] = self._stats

        # Save states
        params['env'] = self._env
        params['algo'] = self._algo
        params['n_workers'] = self._n_workers
        params['worker_class'] = self._worker_class
        params['worker_args'] = self._worker_args

        self._snapshotter.save_itr_params(epoch, params)

        logger.log('Saved')

    def restore(self, from_dir, from_epoch='last'):
        """Restore experiment from snapshot.

        Args:
            from_dir (str): Directory of the pickle file to resume
                experiment from.
            from_epoch (str or int): The epoch to restore from. Can be
                'first', 'last' or a number. Not applicable when
                snapshot_mode='last'.

        Returns:
            TrainArgs: Arguments for train().

        """
        saved = self._snapshotter.load(from_dir, from_epoch)

        self._setup_args = saved['setup_args']
        self._train_args = saved['train_args']
        self._stats = saved['stats']

        set_seed(self._setup_args.seed)

        self.setup(env=saved['env'],
                   algo=saved['algo'],
                   sampler_cls=self._setup_args.sampler_cls,
                   sampler_args=self._setup_args.sampler_args,
                   n_workers=saved['n_workers'],
                   worker_class=saved['worker_class'],
                   worker_args=saved['worker_args'])

        n_epochs = self._train_args.n_epochs
        last_epoch = self._stats.total_epoch
        last_itr = self._stats.total_itr
        total_env_steps = self._stats.total_env_steps
        batch_size = self._train_args.batch_size
        store_paths = self._train_args.store_paths
        pause_for_plot = self._train_args.pause_for_plot

        fmt = '{:<20} {:<15}'
        logger.log('Restore from snapshot saved in %s' %
                   self._snapshotter.snapshot_dir)
        logger.log(fmt.format('-- Train Args --', '-- Value --'))
        logger.log(fmt.format('n_epochs', n_epochs))
        logger.log(fmt.format('last_epoch', last_epoch))
        logger.log(fmt.format('batch_size', batch_size))
        logger.log(fmt.format('store_paths', store_paths))
        logger.log(fmt.format('pause_for_plot', pause_for_plot))
        logger.log(fmt.format('-- Stats --', '-- Value --'))
        logger.log(fmt.format('last_itr', last_itr))
        logger.log(fmt.format('total_env_steps', total_env_steps))

        self._train_args.start_epoch = last_epoch + 1
        return copy.copy(self._train_args)

    def log_diagnostics(self, pause_for_plot=False):
        """Log diagnostics.

        Args:
            pause_for_plot (bool): Pause for plot.

        """
        logger.log('Time %.2f s' % (time.time() - self._start_time))
        logger.log('EpochTime %.2f s' % (time.time() - self._itr_start_time))
        tabular.record('TotalEnvSteps', self._stats.total_env_steps)
        logger.log(tabular)

        if self._plot:
            self._plotter.update_plot(self._algo.policy,
                                      self._algo.max_episode_length)
            if pause_for_plot:
                input('Plotting evaluation run: Press Enter to continue...')

    def train(self,
              n_epochs,
              batch_size=None,
              plot=False,
              store_paths=False,
              pause_for_plot=False):
        """Start training.

        Args:
            n_epochs (int): Number of epochs.
            batch_size (int or None): Number of environment steps in one
                batch.
            plot (bool): Visualize policy by doing rollout after each epoch.
            store_paths (bool): Save paths in snapshot.
            pause_for_plot (bool): Pause for plot.

        Raises:
            NotSetupError: If train() is called before setup().

        Returns:
            float: The average return in last epoch cycle.

        """
        if not self._has_setup:
            raise NotSetupError(
                'Use setup() to setup runner before training.')

        # Save arguments for restore
        self._train_args = TrainArgs(n_epochs=n_epochs,
                                     batch_size=batch_size,
                                     plot=plot,
                                     store_paths=store_paths,
                                     pause_for_plot=pause_for_plot,
                                     start_epoch=0)
        self._plot = plot

        average_return = self._algo.train(self)
        self._shutdown_worker()

        return average_return

    def step_epochs(self):
        """Step through each epoch.

        This function returns a magic generator. When iterated through, this
        generator automatically performs services such as snapshotting and
        log management. It is used inside train() in each algorithm.

        The generator initializes two variables: `self.step_itr` and
        `self.step_path`. To use the generator, these two have to be
        updated manually in each epoch, as the example shows below.

        Yields:
            int: The next training epoch.

        Examples:
            for epoch in runner.step_epochs():
                runner.step_path = runner.obtain_samples(...)
                self.train_once(...)
                runner.step_itr += 1

        """
        self._start_worker()
        self._start_time = time.time()
        self.step_itr = self._stats.total_itr
        self.step_path = None

        # Used by integration tests to ensure examples can run one epoch.
        n_epochs = int(
            os.environ.get('GARAGE_EXAMPLE_TEST_N_EPOCHS',
                           self._train_args.n_epochs))

        logger.log('Obtaining samples...')

        for epoch in range(self._train_args.start_epoch, n_epochs):
            self._itr_start_time = time.time()
            with logger.prefix('epoch #%d | ' % epoch):
                yield epoch
                save_path = (self.step_path
                             if self._train_args.store_paths else None)

                self._stats.last_path = save_path
                self._stats.total_epoch = epoch
                self._stats.total_itr = self.step_itr

                self.save(epoch)

                if self.enable_logging:
                    self.log_diagnostics(self._train_args.pause_for_plot)
                    logger.dump_all(self.step_itr)
                    tabular.clear()

    def resume(self,
               n_epochs=None,
               batch_size=None,
               plot=None,
               store_paths=None,
               pause_for_plot=None):
        """Resume from restored experiment.

        This method provides the same interface as train().

        If not specified, an argument will default to the saved arguments
        from the last call to train().

        Args:
            n_epochs (int): Number of epochs.
            batch_size (int): Number of environment steps in one batch.
            plot (bool): Visualize policy by doing rollout after each epoch.
            store_paths (bool): Save paths in snapshot.
            pause_for_plot (bool): Pause for plot.

        Raises:
            NotSetupError: If resume() is called before restore().

        Returns:
            float: The average return in last epoch cycle.

        """
        if self._train_args is None:
            raise NotSetupError('You must call restore() before resume().')

        self._train_args.n_epochs = n_epochs or self._train_args.n_epochs
        self._train_args.batch_size = (batch_size
                                       or self._train_args.batch_size)

        if plot is not None:
            self._train_args.plot = plot
        if store_paths is not None:
            self._train_args.store_paths = store_paths
        if pause_for_plot is not None:
            self._train_args.pause_for_plot = pause_for_plot

        average_return = self._algo.train(self)
        self._shutdown_worker()

        return average_return

    def get_env_copy(self):
        """Get a copy of the environment.

        Returns:
            garage.envs.GarageEnv: An environment instance.

        """
        if self._env:
            return cloudpickle.loads(cloudpickle.dumps(self._env))
        else:
            return None

    @property
    def total_env_steps(self):
        """Total environment steps collected.

        Returns:
            int: Total environment steps collected.

        """
        return self._stats.total_env_steps
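# --- Usage sketch for the runner above (not from the original source) ---
# A minimal, assumption-laden example of the step_epochs() protocol: the stub
# algorithm below stands in for a real garage RLAlgorithm, no sampler is
# configured (sampler_cls=None), and snapshot_mode='none' keeps save() from
# writing pickles. `snapshot_config` only needs the three fields read by
# __init__, so a SimpleNamespace stands in for garage's SnapshotConfig.
import types


class _StubAlgo:
    """Stand-in algorithm: train() drives runner.step_epochs()."""

    def train(self, runner):
        last = None
        for epoch in runner.step_epochs():
            # A real algorithm would sample here, e.g.:
            # runner.step_path = runner.obtain_samples(runner.step_itr)
            last = epoch
            runner.step_itr += 1
        return last


snapshot_config = types.SimpleNamespace(snapshot_dir='/tmp/stub_exp',
                                        snapshot_mode='none',
                                        snapshot_gap=1)
runner = LocalRunner(snapshot_config)
runner.setup(algo=_StubAlgo(), env=None)  # no sampler_cls -> sampler is None
runner.train(n_epochs=3, batch_size=None)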
class LocalRunner:
    """Base class of local runner.

    Use Runner.setup(algo, env) to set up the algorithm and environment for
    the runner, and Runner.train() to start training.

    Args:
        snapshot_config (garage.experiment.SnapshotConfig): The snapshot
            configuration used by LocalRunner to create the snapshotter.
            If None, it will create one with default settings.
        max_cpus (int): The maximum number of parallel sampler workers.

    Note:
        For the use of any TensorFlow environments, policies and algorithms,
        please use LocalTFRunner().

    Examples:
        | # to train
        | runner = LocalRunner()
        | env = Env(...)
        | policy = Policy(...)
        | algo = Algo(
        |         env=env,
        |         policy=policy,
        |         ...)
        | runner.setup(algo, env)
        | runner.train(n_epochs=100, batch_size=4000)

        | # to resume immediately.
        | runner = LocalRunner()
        | runner.restore(resume_from_dir)
        | runner.resume()

        | # to resume with modified training arguments.
        | runner = LocalRunner()
        | runner.restore(resume_from_dir)
        | runner.resume(n_epochs=20)

    """

    def __init__(self, snapshot_config, max_cpus=1):
        self._snapshotter = Snapshotter(snapshot_config.snapshot_dir,
                                        snapshot_config.snapshot_mode,
                                        snapshot_config.snapshot_gap)

        parallel_sampler.initialize(max_cpus)

        seed = get_seed()
        if seed is not None:
            parallel_sampler.set_seed(seed)

        self._has_setup = False
        self._plot = False

        self._setup_args = None
        self._train_args = None
        self._stats = ExperimentStats(total_itr=0,
                                      total_env_steps=0,
                                      total_epoch=0,
                                      last_path=None)

        self._algo = None
        self._env = None
        self._policy = None
        self._sampler = None
        self._plotter = None

        self._start_time = None
        self._itr_start_time = None
        self.step_itr = None
        self.step_path = None

    def setup(self, algo, env, sampler_cls=None, sampler_args=None):
        """Set up runner for algorithm and environment.

        This method saves algo and env within runner and creates a sampler.

        Note:
            After setup() is called all variables in session should have been
            initialized. setup() respects existing values in session so
            policy weights can be loaded before setup().

        Args:
            algo (garage.np.algos.RLAlgorithm): An algorithm instance.
            env (garage.envs.GarageEnv): An environment instance.
            sampler_cls (garage.sampler.Sampler): A sampler class.
            sampler_args (dict): Arguments to be passed to sampler
                constructor.

        """
        self._algo = algo
        self._env = env
        self._policy = self._algo.policy

        if sampler_args is None:
            sampler_args = {}
        if sampler_cls is None:
            sampler_cls = algo.sampler_cls
        self._sampler = sampler_cls(algo, env, **sampler_args)

        self._has_setup = True

        self._setup_args = SetupArgs(sampler_cls=sampler_cls,
                                     sampler_args=sampler_args,
                                     seed=get_seed())

    def _start_worker(self):
        """Start Plotter and Sampler workers."""
        self._sampler.start_worker()
        if self._plot:
            # pylint: disable=import-outside-toplevel
            from garage.tf.plotter import Plotter
            self._plotter = Plotter(self._env, self._policy)
            self._plotter.start()

    def _shutdown_worker(self):
        """Shutdown Plotter and Sampler workers."""
        self._sampler.shutdown_worker()
        if self._plot:
            self._plotter.close()

    def obtain_samples(self, itr, batch_size=None):
        """Obtain one batch of samples.

        Args:
            itr (int): Index of iteration (epoch).
            batch_size (int): Number of steps in batch. This is a hint that
                the sampler may or may not respect.

        Returns:
            list[dict]: One batch of samples.

        """
        paths = self._sampler.obtain_samples(
            itr, (batch_size or self._train_args.batch_size))
        self._stats.total_env_steps += sum([len(p['rewards']) for p in paths])
        return paths

    def save(self, epoch):
        """Save snapshot of current batch.

        Args:
            epoch (int): Epoch.

        Raises:
            NotSetupError: if save() is called before the runner is set up.

        """
        if not self._has_setup:
            raise NotSetupError('Use setup() to setup runner before saving.')

        logger.log('Saving snapshot...')

        params = dict()
        # Save arguments
        params['setup_args'] = self._setup_args
        params['train_args'] = self._train_args
        params['stats'] = self._stats

        # Save states
        params['env'] = self._env
        params['algo'] = self._algo

        self._snapshotter.save_itr_params(epoch, params)

        logger.log('Saved')

    def restore(self, from_dir, from_epoch='last'):
        """Restore experiment from snapshot.

        Args:
            from_dir (str): Directory of the pickle file to resume
                experiment from.
            from_epoch (str or int): The epoch to restore from. Can be
                'first', 'last' or a number. Not applicable when
                snapshot_mode='last'.

        Returns:
            TrainArgs: Arguments for train().

        """
        saved = self._snapshotter.load(from_dir, from_epoch)

        self._setup_args = saved['setup_args']
        self._train_args = saved['train_args']
        self._stats = saved['stats']

        set_seed(self._setup_args.seed)

        self.setup(env=saved['env'],
                   algo=saved['algo'],
                   sampler_cls=self._setup_args.sampler_cls,
                   sampler_args=self._setup_args.sampler_args)

        n_epochs = self._train_args.n_epochs
        last_epoch = self._stats.total_epoch
        last_itr = self._stats.total_itr
        total_env_steps = self._stats.total_env_steps
        batch_size = self._train_args.batch_size
        store_paths = self._train_args.store_paths
        pause_for_plot = self._train_args.pause_for_plot

        fmt = '{:<20} {:<15}'
        logger.log('Restore from snapshot saved in %s' %
                   self._snapshotter.snapshot_dir)
        logger.log(fmt.format('-- Train Args --', '-- Value --'))
        logger.log(fmt.format('n_epochs', n_epochs))
        logger.log(fmt.format('last_epoch', last_epoch))
        logger.log(fmt.format('batch_size', batch_size))
        logger.log(fmt.format('store_paths', store_paths))
        logger.log(fmt.format('pause_for_plot', pause_for_plot))
        logger.log(fmt.format('-- Stats --', '-- Value --'))
        logger.log(fmt.format('last_itr', last_itr))
        logger.log(fmt.format('total_env_steps', total_env_steps))

        self._train_args.start_epoch = last_epoch + 1
        return copy.copy(self._train_args)

    def log_diagnostics(self, pause_for_plot=False):
        """Log diagnostics.

        Args:
            pause_for_plot (bool): Pause for plot.

        """
        logger.log('Time %.2f s' % (time.time() - self._start_time))
        logger.log('EpochTime %.2f s' % (time.time() - self._itr_start_time))
        logger.log(tabular)

        if self._plot:
            self._plotter.update_plot(self._policy,
                                      self._algo.max_path_length)
            if pause_for_plot:
                input('Plotting evaluation run: Press Enter to continue...')

    def train(self,
              n_epochs,
              batch_size,
              plot=False,
              store_paths=False,
              pause_for_plot=False):
        """Start training.

        Args:
            n_epochs (int): Number of epochs.
            batch_size (int): Number of environment steps in one batch.
            plot (bool): Visualize policy by doing rollout after each epoch.
            store_paths (bool): Save paths in snapshot.
            pause_for_plot (bool): Pause for plot.

        Raises:
            NotSetupError: If train() is called before setup().

        Returns:
            float: The average return in last epoch cycle.

        """
        if not self._has_setup:
            raise NotSetupError(
                'Use setup() to setup runner before training.')

        # Save arguments for restore
        self._train_args = TrainArgs(n_epochs=n_epochs,
                                     batch_size=batch_size,
                                     plot=plot,
                                     store_paths=store_paths,
                                     pause_for_plot=pause_for_plot,
                                     start_epoch=0)
        self._plot = plot

        return self._algo.train(self)

    def step_epochs(self):
        """Step through each epoch.

        This function returns a magic generator. When iterated through, this
        generator automatically performs services such as snapshotting and
        log management. It is used inside train() in each algorithm.

        The generator initializes two variables: `self.step_itr` and
        `self.step_path`. To use the generator, these two have to be
        updated manually in each epoch, as the example shows below.

        Yields:
            int: The next training epoch.

        Examples:
            for epoch in runner.step_epochs():
                runner.step_path = runner.obtain_samples(...)
                self.train_once(...)
                runner.step_itr += 1

        """
        self._start_worker()
        self._start_time = time.time()
        self.step_itr = self._stats.total_itr
        self.step_path = None

        # Used by integration tests to ensure examples can run one epoch.
        n_epochs = int(
            os.environ.get('GARAGE_EXAMPLE_TEST_N_EPOCHS',
                           self._train_args.n_epochs))

        logger.log('Obtaining samples...')

        for epoch in range(self._train_args.start_epoch, n_epochs):
            self._itr_start_time = time.time()
            with logger.prefix('epoch #%d | ' % epoch):
                yield epoch
                save_path = (self.step_path
                             if self._train_args.store_paths else None)

                self._stats.last_path = save_path
                self._stats.total_epoch = epoch
                self._stats.total_itr = self.step_itr

                self.save(epoch)
                self.log_diagnostics(self._train_args.pause_for_plot)
                logger.dump_all(self.step_itr)
                tabular.clear()

    def resume(self,
               n_epochs=None,
               batch_size=None,
               plot=None,
               store_paths=None,
               pause_for_plot=None):
        """Resume from restored experiment.

        This method provides the same interface as train().

        If not specified, an argument will default to the saved arguments
        from the last call to train().

        Args:
            n_epochs (int): Number of epochs.
            batch_size (int): Number of environment steps in one batch.
            plot (bool): Visualize policy by doing rollout after each epoch.
            store_paths (bool): Save paths in snapshot.
            pause_for_plot (bool): Pause for plot.

        Raises:
            NotSetupError: If resume() is called before restore().

        Returns:
            float: The average return in last epoch cycle.

        """
        if self._train_args is None:
            raise NotSetupError('You must call restore() before resume().')

        self._train_args.n_epochs = n_epochs or self._train_args.n_epochs
        self._train_args.batch_size = (batch_size
                                       or self._train_args.batch_size)

        if plot is not None:
            self._train_args.plot = plot
        if store_paths is not None:
            self._train_args.store_paths = store_paths
        if pause_for_plot is not None:
            self._train_args.pause_for_plot = pause_for_plot

        return self._algo.train(self)

    def get_env_copy(self):
        """Get a copy of the environment.

        Returns:
            garage.envs.GarageEnv: An environment instance.

        """
        return pickle.loads(pickle.dumps(self._env))

    @property
    def total_env_steps(self):
        """Total environment steps collected.

        Returns:
            int: Total environment steps collected.

        """
        return self._stats.total_env_steps
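# --- Restore/resume sketch (assumes a snapshot written by a prior run) ---
# restore() reloads setup_args/train_args/stats from the pickled snapshot,
# re-runs setup(), and returns a copy of TrainArgs with start_epoch advanced;
# resume() then re-enters algo.train() with any overridden arguments.
# 'path/to/experiment' is a placeholder directory.
runner = LocalRunner(snapshot_config)
train_args = runner.restore(from_dir='path/to/experiment', from_epoch='last')
runner.resume(n_epochs=train_args.n_epochs + 10)  # train 10 more epochs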
class LocalRunner:
    """Base class of local runner.

    Use Runner.setup(algo, env) to set up the algorithm and environment for
    the runner, and Runner.train() to start training.

    Args:
        snapshot_config (garage.experiment.SnapshotConfig): The snapshot
            configuration used by LocalRunner to create the snapshotter.
            If None, it will create one with default settings.
        max_cpus (int): The maximum number of parallel sampler workers.

    Note:
        For the use of any TensorFlow environments, policies and algorithms,
        please use LocalTFRunner().

    Examples:
        | # to train
        | runner = LocalRunner()
        | env = Env(...)
        | policy = Policy(...)
        | algo = Algo(
        |         env=env,
        |         policy=policy,
        |         ...)
        | runner.setup(algo, env)
        | runner.train(n_epochs=100, batch_size=4000)

        | # to resume immediately.
        | runner = LocalRunner()
        | runner.restore(resume_from_dir)
        | runner.resume()

        | # to resume with modified training arguments.
        | runner = LocalRunner()
        | runner.restore(resume_from_dir)
        | runner.resume(n_epochs=20)

    """

    def __init__(self, snapshot_config, max_cpus=1):
        self._snapshotter = Snapshotter(snapshot_config.snapshot_dir,
                                        snapshot_config.snapshot_mode,
                                        snapshot_config.snapshot_gap)

        if max_cpus > 1:
            from garage.sampler import singleton_pool
            singleton_pool.initialize(max_cpus)
        self.has_setup = False
        self.plot = False

        self._setup_args = None
        self.train_args = None

    def setup(self, algo, env, sampler_cls=None, sampler_args=None):
        """Set up runner for algorithm and environment.

        This method saves algo and env within runner and creates a sampler.

        Note:
            After setup() is called all variables in session should have been
            initialized. setup() respects existing values in session so
            policy weights can be loaded before setup().

        Args:
            algo (garage.np.algos.RLAlgorithm): An algorithm instance.
            env (garage.envs.GarageEnv): An environment instance.
            sampler_cls (garage.sampler.Sampler): A sampler class.
            sampler_args (dict): Arguments to be passed to sampler
                constructor.

        """
        self.algo = algo
        self.env = env
        self.policy = self.algo.policy

        if sampler_args is None:
            sampler_args = {}
        if sampler_cls is None:
            sampler_cls = algo.sampler_cls
        self.sampler = sampler_cls(algo, env, **sampler_args)

        self.has_setup = True

        self._setup_args = types.SimpleNamespace(sampler_cls=sampler_cls,
                                                 sampler_args=sampler_args)

    def _start_worker(self):
        """Start Plotter and Sampler workers."""
        self.sampler.start_worker()
        if self.plot:
            from garage.tf.plotter import Plotter
            self.plotter = Plotter(self.env, self.policy)
            self.plotter.start()

    def _shutdown_worker(self):
        """Shutdown Plotter and Sampler workers."""
        self.sampler.shutdown_worker()
        if self.plot:
            self.plotter.close()

    def obtain_samples(self, itr, batch_size=None):
        """Obtain one batch of samples.

        Args:
            itr (int): Index of iteration (epoch).
            batch_size (int): Number of steps in batch. This is a hint that
                the sampler may or may not respect.

        Returns:
            One batch of samples.

        """
        if self.train_args.n_epoch_cycles == 1:
            logger.log('Obtaining samples...')
        return self.sampler.obtain_samples(
            itr, (batch_size or self.train_args.batch_size))

    def save(self, epoch, paths=None):
        """Save snapshot of current batch.

        Args:
            epoch (int): Index of iteration (epoch).
            paths (dict): Batch of samples after preprocessing. If None,
                no paths will be logged to the snapshot.

        """
        if not self.has_setup:
            raise Exception('Use setup() to setup runner before saving.')

        logger.log('Saving snapshot...')

        params = dict()
        # Save arguments
        params['setup_args'] = self._setup_args
        params['train_args'] = self.train_args

        # Save states
        params['env'] = self.env
        params['algo'] = self.algo
        if paths:
            params['paths'] = paths
        params['last_epoch'] = epoch
        self._snapshotter.save_itr_params(epoch, params)

        logger.log('Saved')

    def restore(self, from_dir, from_epoch='last'):
        """Restore experiment from snapshot.

        Args:
            from_dir (str): Directory of the pickle file to resume
                experiment from.
            from_epoch (str or int): The epoch to restore from. Can be
                'first', 'last' or a number. Not applicable when
                snapshot_mode='last'.

        Returns:
            A SimpleNamespace for train()'s arguments.

        """
        saved = self._snapshotter.load(from_dir, from_epoch)

        self._setup_args = saved['setup_args']
        self.train_args = saved['train_args']

        self.setup(env=saved['env'],
                   algo=saved['algo'],
                   sampler_cls=self._setup_args.sampler_cls,
                   sampler_args=self._setup_args.sampler_args)

        n_epochs = self.train_args.n_epochs
        last_epoch = saved['last_epoch']
        n_epoch_cycles = self.train_args.n_epoch_cycles
        batch_size = self.train_args.batch_size
        store_paths = self.train_args.store_paths
        pause_for_plot = self.train_args.pause_for_plot

        fmt = '{:<20} {:<15}'
        logger.log('Restore from snapshot saved in %s' %
                   self._snapshotter.snapshot_dir)
        logger.log(fmt.format('Train Args', 'Value'))
        logger.log(fmt.format('n_epochs', n_epochs))
        logger.log(fmt.format('last_epoch', last_epoch))
        logger.log(fmt.format('n_epoch_cycles', n_epoch_cycles))
        logger.log(fmt.format('batch_size', batch_size))
        logger.log(fmt.format('store_paths', store_paths))
        logger.log(fmt.format('pause_for_plot', pause_for_plot))

        self.train_args.start_epoch = last_epoch + 1
        return copy.copy(self.train_args)

    def log_diagnostics(self, pause_for_plot=False):
        """Log diagnostics.

        Args:
            pause_for_plot (bool): Pause for plot.

        """
        logger.log('Time %.2f s' % (time.time() - self._start_time))
        logger.log('EpochTime %.2f s' % (time.time() - self._itr_start_time))
        logger.log(tabular)

        if self.plot:
            self.plotter.update_plot(self.policy, self.algo.max_path_length)
            if pause_for_plot:
                input('Plotting evaluation run: Press Enter to continue...')

    def train(self,
              n_epochs,
              batch_size,
              n_epoch_cycles=1,
              plot=False,
              store_paths=False,
              pause_for_plot=False):
        """Start training.

        Args:
            n_epochs (int): Number of epochs.
            batch_size (int): Number of environment steps in one batch.
            n_epoch_cycles (int): Number of batches of samples in each
                epoch. This is only useful for off-policy algorithms.
                For on-policy algorithms this value should always be 1.
            plot (bool): Visualize policy by doing rollout after each epoch.
            store_paths (bool): Save paths in snapshot.
            pause_for_plot (bool): Pause for plot.

        Returns:
            The average return in last epoch cycle.

        """
        if not self.has_setup:
            raise Exception('Use setup() to setup runner before training.')

        # Save arguments for restore
        self.train_args = types.SimpleNamespace(
            n_epochs=n_epochs,
            n_epoch_cycles=n_epoch_cycles,
            batch_size=batch_size,
            plot=plot,
            store_paths=store_paths,
            pause_for_plot=pause_for_plot,
            start_epoch=0)
        self.plot = plot

        return self.algo.train(self)

    def step_epochs(self):
        """Step through each epoch.

        This function returns a magic generator. When iterated through, this
        generator automatically performs services such as snapshotting and
        log management. It is used inside train() in each algorithm.

        The generator initializes two variables: `self.step_itr` and
        `self.step_path`. To use the generator, these two have to be
        updated manually in each epoch, as the example shows below.

        Yields:
            int: The next training epoch.

        Examples:
            for epoch in runner.step_epochs():
                runner.step_path = runner.obtain_samples(...)
                self.train_once(...)
                runner.step_itr += 1

        """
        try:
            self._start_worker()
            self._start_time = time.time()
            self.step_itr = (self.train_args.start_epoch *
                             self.train_args.n_epoch_cycles)
            self.step_path = None

            for epoch in range(self.train_args.start_epoch,
                               self.train_args.n_epochs):
                self._itr_start_time = time.time()
                with logger.prefix('epoch #%d | ' % epoch):
                    yield epoch

                    save_path = (self.step_path
                                 if self.train_args.store_paths else None)
                    self.save(epoch, save_path)
                    self.log_diagnostics(self.train_args.pause_for_plot)
                    logger.dump_all(self.step_itr)
                    tabular.clear()
        finally:
            self._shutdown_worker()

    def resume(self,
               n_epochs=None,
               batch_size=None,
               n_epoch_cycles=None,
               plot=None,
               store_paths=None,
               pause_for_plot=None):
        """Resume from restored experiment.

        This method provides the same interface as train(). If not
        specified, an argument will default to the saved arguments from
        the last call to train().

        Returns:
            The average return in last epoch cycle.

        """
        if self.train_args is None:
            raise Exception('You must call restore() before resume().')

        self.train_args.n_epochs = n_epochs or self.train_args.n_epochs
        self.train_args.batch_size = batch_size or self.train_args.batch_size
        self.train_args.n_epoch_cycles = (n_epoch_cycles
                                          or self.train_args.n_epoch_cycles)

        if plot is not None:
            self.train_args.plot = plot
        if store_paths is not None:
            self.train_args.store_paths = store_paths
        if pause_for_plot is not None:
            self.train_args.pause_for_plot = pause_for_plot

        return self.algo.train(self)
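# --- n_epoch_cycles sketch (placeholder algo/env names) ---
# In this variant an off-policy algorithm draws n_epoch_cycles sample batches
# per epoch, so step_itr starts at start_epoch * n_epoch_cycles and the
# algorithm is expected to advance it once per cycle. A hedged example,
# where `my_off_policy_algo` and `my_env` stand in for real instances:
runner = LocalRunner(snapshot_config)
runner.setup(algo=my_off_policy_algo, env=my_env)
runner.train(n_epochs=500, batch_size=64, n_epoch_cycles=20)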
class LocalRunner:
    """This class implements a local runner for TensorFlow algorithms.

    A local runner provides a default TensorFlow session using a Python
    context. This is useful for those experiment components (e.g. policy)
    that require a TensorFlow session during construction.

    Use Runner.setup(algo, env) to set up the algorithm and environment for
    the runner, and Runner.train() to start training.

    Args:
        snapshot_config (garage.experiment.SnapshotConfig): The snapshot
            configuration used by LocalRunner to create the snapshotter.
            If None, it will create one with default settings.
        max_cpus (int): The maximum number of parallel sampler workers.
        sess (tf.Session): An optional TensorFlow session. A new session
            will be created immediately if not provided.

    Note:
        The local runner will set up a joblib task pool of size max_cpus
        possibly later used by BatchSampler. If BatchSampler is not used,
        the processes in the pool will remain dormant.

        This setup is required to use TensorFlow in a multiprocess
        environment before a TensorFlow session is created because
        TensorFlow is not fork-safe. See
        https://github.com/tensorflow/tensorflow/issues/2448.

    Examples:
        with LocalRunner() as runner:
            env = gym.make('CartPole-v1')
            policy = CategoricalMLPPolicy(
                env_spec=env.spec,
                hidden_sizes=(32, 32))
            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                max_path_length=100,
                discount=0.99,
                max_kl_step=0.01)
            runner.setup(algo, env)
            runner.train(n_epochs=100, batch_size=4000)

    """

    def __init__(self, snapshot_config=None, sess=None, max_cpus=1):
        if snapshot_config:
            self._snapshotter = Snapshotter(snapshot_config.snapshot_dir,
                                            snapshot_config.snapshot_mode,
                                            snapshot_config.snapshot_gap)
        else:
            self._snapshotter = Snapshotter()

        if max_cpus > 1:
            from garage.sampler import singleton_pool
            singleton_pool.initialize(max_cpus)
        self.sess = sess or tf.Session()
        self.sess_entered = False
        self.has_setup = False
        self.plot = False

        self._setup_args = None
        self.train_args = None

    def __enter__(self):
        """Set self.sess as the default session.

        Returns:
            This local runner.

        """
        if tf.get_default_session() is not self.sess:
            self.sess.__enter__()
            self.sess_entered = True
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Leave session."""
        if tf.get_default_session() is self.sess and self.sess_entered:
            self.sess.__exit__(exc_type, exc_val, exc_tb)
            self.sess_entered = False

    def setup(self, algo, env, sampler_cls=None, sampler_args=None):
        """Set up runner for algorithm and environment.

        This method saves algo and env within runner and creates a sampler.

        Note:
            After setup() is called all variables in session should have been
            initialized. setup() respects existing values in session so
            policy weights can be loaded before setup().

        Args:
            algo (garage.np.algos.RLAlgorithm): An algorithm instance.
            env (garage.envs.GarageEnv): An environment instance.
            sampler_cls (garage.sampler.Sampler): A sampler class.
            sampler_args (dict): Arguments to be passed to sampler
                constructor.

        """
        self.algo = algo
        self.env = env
        self.policy = self.algo.policy

        if sampler_args is None:
            sampler_args = {}

        if sampler_cls is None:
            from garage.tf.algos.batch_polopt import BatchPolopt
            if isinstance(algo, BatchPolopt):
                if self.policy.vectorized:
                    from garage.tf.samplers import OnPolicyVectorizedSampler
                    sampler_cls = OnPolicyVectorizedSampler
                else:
                    from garage.tf.samplers import BatchSampler
                    sampler_cls = BatchSampler
            else:
                from garage.tf.samplers import OffPolicyVectorizedSampler
                sampler_cls = OffPolicyVectorizedSampler

        self.sampler = sampler_cls(algo, env, **sampler_args)

        self.initialize_tf_vars()
        logger.log(self.sess.graph)
        self.has_setup = True

        self._setup_args = types.SimpleNamespace(sampler_cls=sampler_cls,
                                                 sampler_args=sampler_args)

    def initialize_tf_vars(self):
        """Initialize all uninitialized variables in session."""
        with tf.name_scope('initialize_tf_vars'):
            uninited_set = [
                e.decode()
                for e in self.sess.run(tf.report_uninitialized_variables())
            ]
            self.sess.run(
                tf.variables_initializer([
                    v for v in tf.global_variables()
                    if v.name.split(':')[0] in uninited_set
                ]))

    def _start_worker(self):
        """Start Plotter and Sampler workers."""
        self.sampler.start_worker()
        if self.plot:
            from garage.tf.plotter import Plotter
            self.plotter = Plotter(self.env, self.policy)
            self.plotter.start()

    def _shutdown_worker(self):
        """Shutdown Plotter and Sampler workers."""
        self.sampler.shutdown_worker()
        if self.plot:
            self.plotter.close()

    def obtain_samples(self, itr, batch_size):
        """Obtain one batch of samples.

        Args:
            itr (int): Index of iteration (epoch).
            batch_size (int): Number of steps in batch. This is a hint that
                the sampler may or may not respect.

        Returns:
            One batch of samples.

        """
        if self.train_args.n_epoch_cycles == 1:
            logger.log('Obtaining samples...')
        return self.sampler.obtain_samples(itr, batch_size)

    def save(self, epoch, paths=None):
        """Save snapshot of current batch.

        Args:
            epoch (int): Index of iteration (epoch).
            paths (dict): Batch of samples after preprocessing. If None,
                no paths will be logged to the snapshot.

        """
        if not self.has_setup:
            raise Exception('Use setup() to setup runner before saving.')

        logger.log('Saving snapshot...')

        params = dict()
        # Save arguments
        params['setup_args'] = self._setup_args
        params['train_args'] = self.train_args

        # Save states
        params['env'] = self.env
        params['algo'] = self.algo
        if paths:
            params['paths'] = paths
        params['last_epoch'] = epoch
        self._snapshotter.save_itr_params(epoch, params)

        logger.log('Saved')

    def restore(self, from_dir, from_epoch='last'):
        """Restore experiment from snapshot.

        Args:
            from_dir (str): Directory of the pickle file to resume
                experiment from.
            from_epoch (str or int): The epoch to restore from. Can be
                'first', 'last' or a number. Not applicable when
                snapshot_mode='last'.

        Returns:
            A SimpleNamespace for train()'s arguments.

        Examples:
            1. Resume experiment immediately.
                with LocalRunner() as runner:
                    runner.restore(resume_from_dir)
                    runner.resume()

            2. Resume experiment with modified training arguments.
                with LocalRunner() as runner:
                    runner.restore(resume_from_dir)
                    runner.resume(n_epochs=20)

        Note:
            When resuming via the command line, new snapshots will be saved
            into the SAME directory if not specified. When resuming
            programmatically, the snapshot directory should be specified
            manually or through the run_experiment() interface.

        """
        saved = self._snapshotter.load(from_dir, from_epoch)

        self._setup_args = saved['setup_args']
        self.train_args = saved['train_args']

        self.setup(env=saved['env'],
                   algo=saved['algo'],
                   sampler_cls=self._setup_args.sampler_cls,
                   sampler_args=self._setup_args.sampler_args)

        n_epochs = self.train_args.n_epochs
        last_epoch = saved['last_epoch']
        n_epoch_cycles = self.train_args.n_epoch_cycles
        batch_size = self.train_args.batch_size
        store_paths = self.train_args.store_paths
        pause_for_plot = self.train_args.pause_for_plot

        fmt = '{:<20} {:<15}'
        logger.log('Restore from snapshot saved in %s' %
                   self._snapshotter.snapshot_dir)
        logger.log(fmt.format('Train Args', 'Value'))
        logger.log(fmt.format('n_epochs', n_epochs))
        logger.log(fmt.format('last_epoch', last_epoch))
        logger.log(fmt.format('n_epoch_cycles', n_epoch_cycles))
        logger.log(fmt.format('batch_size', batch_size))
        logger.log(fmt.format('store_paths', store_paths))
        logger.log(fmt.format('pause_for_plot', pause_for_plot))

        self.train_args.start_epoch = last_epoch + 1
        return copy.copy(self.train_args)

    def log_diagnostics(self, pause_for_plot=False):
        """Log diagnostics.

        Args:
            pause_for_plot (bool): Pause for plot.

        """
        logger.log('Time %.2f s' % (time.time() - self._start_time))
        logger.log('EpochTime %.2f s' % (time.time() - self._itr_start_time))
        logger.log(tabular)

        if self.plot:
            self.plotter.update_plot(self.policy, self.algo.max_path_length)
            if pause_for_plot:
                input('Plotting evaluation run: Press Enter to continue...')

    def train(self,
              n_epochs,
              batch_size,
              n_epoch_cycles=1,
              plot=False,
              store_paths=False,
              pause_for_plot=False):
        """Start training.

        Args:
            n_epochs (int): Number of epochs.
            batch_size (int): Number of environment steps in one batch.
            n_epoch_cycles (int): Number of batches of samples in each
                epoch. This is only useful for off-policy algorithms.
                For on-policy algorithms this value should always be 1.
            plot (bool): Visualize policy by doing rollout after each epoch.
            store_paths (bool): Save paths in snapshot.
            pause_for_plot (bool): Pause for plot.

        Returns:
            The average return in last epoch cycle.

        """
        if not self.has_setup:
            raise Exception('Use setup() to setup runner before training.')

        # Save arguments for restore
        self.train_args = types.SimpleNamespace(
            n_epochs=n_epochs,
            n_epoch_cycles=n_epoch_cycles,
            batch_size=batch_size,
            plot=plot,
            store_paths=store_paths,
            pause_for_plot=pause_for_plot,
            start_epoch=0)
        self.plot = plot

        return self.algo.train(self, batch_size)

    def step_epochs(self):
        """Generator for training.

        This function serves as a generator. It is used to separate services
        such as snapshotting and sampler control from the actual training
        loop. It is used inside train() in each algorithm.

        The generator initializes two variables: `self.step_itr` and
        `self.step_path`. To use the generator, these two have to be
        updated manually in each epoch, as the example shows below.

        Yields:
            int: The next training epoch.

        Examples:
            for epoch in runner.step_epochs():
                runner.step_path = runner.obtain_samples(...)
                self.train_once(...)
                runner.step_itr += 1

        """
        try:
            self._start_worker()
            self._start_time = time.time()
            self.step_itr = (self.train_args.start_epoch *
                             self.train_args.n_epoch_cycles)
            self.step_path = None

            for epoch in range(self.train_args.start_epoch,
                               self.train_args.n_epochs):
                self._itr_start_time = time.time()
                with logger.prefix('epoch #%d | ' % epoch):
                    yield epoch

                    save_path = (self.step_path
                                 if self.train_args.store_paths else None)
                    self.save(epoch, save_path)
                    self.log_diagnostics(self.train_args.pause_for_plot)
                    logger.dump_all(self.step_itr)
                    tabular.clear()
        finally:
            self._shutdown_worker()

    def resume(self,
               n_epochs=None,
               batch_size=None,
               n_epoch_cycles=None,
               plot=None,
               store_paths=None,
               pause_for_plot=None):
        """Resume from restored experiment.

        This method provides the same interface as train(). If not
        specified, an argument will default to the saved arguments from
        the last call to train().

        Returns:
            The average return in last epoch cycle.

        """
        assert self.train_args is not None, (
            'You must call restore() before resume().')

        self.train_args.n_epochs = n_epochs or self.train_args.n_epochs
        self.train_args.batch_size = batch_size or self.train_args.batch_size
        self.train_args.n_epoch_cycles = (n_epoch_cycles
                                          or self.train_args.n_epoch_cycles)

        if plot is not None:
            self.train_args.plot = plot
        if store_paths is not None:
            self.train_args.store_paths = store_paths
        if pause_for_plot is not None:
            self.train_args.pause_for_plot = pause_for_plot

        # Pass the resolved batch size: the raw `batch_size` argument may be
        # None when the caller relies on the saved train_args.
        return self.algo.train(self, self.train_args.batch_size)
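# --- initialize_tf_vars sketch (runnable with TF 1.x) ---
# Demonstrates the "initialize only what is uninitialized" pattern used by
# initialize_tf_vars() above, which is what lets weights loaded into the
# session before setup() survive it: `a` is initialized by hand and keeps
# its value, while `b` only gets its initializer run afterwards.
import tensorflow as tf

sess = tf.Session()
a = tf.get_variable('a', initializer=1.0)
b = tf.get_variable('b', initializer=2.0)
sess.run(a.initializer)  # simulate a variable restored before setup()

uninited = [
    e.decode() for e in sess.run(tf.report_uninitialized_variables())
]
sess.run(
    tf.variables_initializer([
        v for v in tf.global_variables()
        if v.name.split(':')[0] in uninited
    ]))
assert sess.run(a) == 1.0 and sess.run(b) == 2.0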
"""Experiment functions.""" from garage.experiment.experiment import run_experiment from garage.experiment.experiment import to_local_command from garage.experiment.experiment import variant from garage.experiment.experiment import VariantGenerator from garage.experiment.snapshotter import Snapshotter # LocalRunner needs snapshotter to be imported, so we have to use a strange # import order here snapshotter = Snapshotter() from garage.experiment.local_tf_runner import LocalRunner # noqa: I100,E402,E501,I202 pylint: disable=wrong-import-position __all__ = [ 'run_experiment', 'to_local_command', 'variant', 'VariantGenerator', 'LocalRunner', 'Snapshotter', 'snapshotter' ]
class IRLTrainer(Trainer):
    """Trainer that interleaves policy optimization with IRL reward learning."""

    def __init__(self, snapshot_config):
        super(IRLTrainer, self).__init__(snapshot_config)
        self._snapshotter = Snapshotter(snapshot_config.snapshot_dir,
                                        snapshot_config.snapshot_mode,
                                        snapshot_config.snapshot_gap)

        self._has_setup = False
        self._plot = False

        self._setup_args = None
        self._train_args = None
        self._stats = ExperimentStats(total_itr=0,
                                      total_env_steps=0,
                                      total_epoch=0,
                                      last_episode=None)

        self._algo = None
        self._env = None
        self._sampler = None
        self._plotter = None

        self._start_time = None
        self._itr_start_time = None
        self.step_itr = None
        self.step_episode = None

        # only used for off-policy algorithms
        self.enable_logging = True

        self._n_workers = None
        self._worker_class = None
        self._worker_args = None

    def setup(self,
              algo,
              env,
              irl,
              baseline,
              n_itr=200,
              start_itr=0,
              sampler_cls=None,
              sampler_args=None,
              n_workers=psutil.cpu_count(logical=False),
              worker_class=None,
              worker_args=None,
              discount=0.99,
              gae_lambda=1,
              discrim_train_itrs=10,
              discrim_batch_size=32,
              irl_model_wt=1.0,
              zero_environment_reward=False):
        """Set up trainer for algorithm, environment and IRL model.

        :param discount (float): Discount.
        :param irl_model_wt (float): Weight of the IRL model reward.
        """
        self._algo = algo
        self._env = env
        self._irl = irl
        self._baseline = baseline
        self._n_workers = n_workers
        self._worker_class = worker_class
        self.n_itr = n_itr
        self.start_itr = start_itr

        if sampler_args is None:
            sampler_args = {}
        if sampler_cls is None:
            sampler_cls = getattr(algo, 'sampler_cls', None)
        if worker_class is None:
            worker_class = getattr(algo, 'worker_cls', DefaultWorker)
        if worker_args is None:
            worker_args = {}

        self._worker_args = worker_args
        if sampler_cls is None:
            self._sampler = None
        else:
            self._sampler = self.make_sampler(sampler_cls,
                                              sampler_args=sampler_args,
                                              n_workers=n_workers,
                                              worker_class=worker_class,
                                              worker_args=worker_args)

        self._has_setup = True

        self._setup_args = SetupArgs(sampler_cls=sampler_cls,
                                     sampler_args=sampler_args,
                                     seed=get_seed())

        self.irl_model_wt = irl_model_wt
        self.discount = discount
        self.gae_lambda = gae_lambda
        self.discrim_train_itrs = discrim_train_itrs
        self.discrim_batch_size = discrim_batch_size
        self.no_reward = zero_environment_reward

    def obtain_episodes(self,
                        itr,
                        batch_size=None,
                        agent_update=None,
                        env_update=None):
        """Obtain one batch of episodes from the sampler."""
        if self._sampler is None:
            raise ValueError('Trainer was not initialized with '
                             '`sampler_cls`. Either provide `sampler_cls` '
                             'to trainer.setup, or set `algo.sampler_cls`.')
        episodes = None
        if agent_update is None:
            agent_update = self._algo.policy.get_param_values()
        episodes = self._sampler.obtain_samples(
            itr, (batch_size or self.batch_size),
            agent_update=agent_update,
            env_update=env_update)
        self._stats.total_env_steps += sum(episodes.lengths)
        return episodes

    def obtain_samples(self,
                       itr,
                       batch_size=None,
                       agent_update=None,
                       env_update=None):
        """Obtain one batch of samples as a list of path dicts."""
        eps = self.obtain_episodes(itr, batch_size, agent_update, env_update)
        return eps.to_list()

    def save(self, epoch, paths=None):
        """Save snapshot of the current batch."""
        if not self._has_setup:
            raise NotSetupError('Use setup() to setup trainer before saving.')

        logger.log('Saving snapshot...')

        params = dict()
        # Save arguments
        params['setup_args'] = self._setup_args
        params['train_args'] = self._train_args
        params['stats'] = self._stats

        # Save states
        params['env'] = self._env
        params['algo'] = self._algo
        params['irl'] = self._irl
        params['n_workers'] = self._n_workers
        params['worker_class'] = self._worker_class
        params['worker_args'] = self._worker_args
        params['paths'] = paths

        self._snapshotter.save_itr_params(epoch, params)

        logger.log('Saved')

    def _train_irl(self, paths, itr=0):
        """Train the IRL model and replace path rewards with learned ones."""
        if self.no_reward:
            total_rew = 0.
            for path in paths:
                total_rew += np.sum(path['rewards'])
                path['rewards'] *= 0
            tabular.record('OriginalTaskAverageReturn',
                           total_rew / float(len(paths)))

        if self.irl_model_wt <= 0:
            return paths

        max_iters = self.discrim_train_itrs
        mean_loss = self._irl.train(paths)
        tabular.record('IRLLoss', mean_loss)
        self.irl_params = self._irl.get_params()

        estimated_rewards = self._irl.eval(paths,
                                           gamma=self.discount,
                                           itr=itr)

        # The original recorded all three statistics under 'IRLRewardMean';
        # max/min get their own keys here.
        tabular.record('IRLRewardMean',
                       np.mean(np.concatenate(estimated_rewards)))
        tabular.record('IRLRewardMax',
                       np.max(np.concatenate(estimated_rewards)))
        tabular.record('IRLRewardMin',
                       np.min(np.concatenate(estimated_rewards)))

        # Replace the original reward signal with the learned reward signal.
        # This will be used by agents to learn the policy.
        if self._irl.score_trajectories:
            for i, path in enumerate(paths):
                path['rewards'][-1] += (self.irl_model_wt *
                                        estimated_rewards[i])
        else:
            for i, path in enumerate(paths):
                path['rewards'] += self.irl_model_wt * estimated_rewards[i]
        return paths

    def train(self,
              n_epochs,
              batch_size=None,
              plot=False,
              store_episodes=False,
              pause_for_plot=False):
        """Start training.

        Args:
            n_epochs (int): Number of epochs. Note that the training loop
                is bounded by `n_itr` from setup(), not by this argument.
            batch_size (int or None): Number of environment steps in one
                batch.
            plot (bool): Visualize an episode from the policy after each
                epoch.
            store_episodes (bool): Save episodes in snapshot.
            pause_for_plot (bool): Pause for plot.

        Raises:
            NotSetupError: If train() is called before setup().

        Returns:
            None: Statistics are logged; no return value is reported.

        """
        self.batch_size = batch_size
        self.store_episodes = store_episodes
        self.pause_for_plot = pause_for_plot

        if not self._has_setup:
            raise NotSetupError(
                'Use setup() to setup trainer before training.')

        self._plot = plot

        returns = []
        for itr in range(self.start_itr, self.n_itr):
            with logger.prefix(f'itr #{itr} | '):
                # train policy
                self._algo.train(self)

                # compute irl and update reward function
                logger.log('Obtaining paths...')
                paths = self.obtain_samples(itr)
                logger.log('Processing paths...')
                paths = self._train_irl(paths, itr=itr)
                samples_data = self.process_samples(itr, paths)

                logger.log('Logging diagnostics...')
                logger.log('Time %.2f s' % (time.time() - self._start_time))
                logger.log('EpochTime %.2f s' %
                           (time.time() - self._itr_start_time))
                tabular.record('TotalEnvSteps', self._stats.total_env_steps)
                self.log_diagnostics(paths)

                logger.log('Optimizing policy...')

                logger.log('Saving snapshot...')
                self.save(itr, paths=paths)
                logger.log('Saved')
                tabular.record('Time', time.time() - self._start_time)
                tabular.record('ItrTime', time.time() - self._itr_start_time)
                logger.dump_all(self.step_itr)
                tabular.clear()

        self._shutdown_worker()
        return

    def step_epochs(self):
        """Single-epoch variant of Trainer.step_epochs().

        Unlike the base generator, this yields exactly once; train() drives
        the outer iteration loop itself.
        """
        self._start_worker()
        self._start_time = time.time()
        self.step_itr = self._stats.total_itr
        self.step_episode = None
        logger.log('Obtaining samples...')
        yield 1
        # for epoch in range(self._train_args.start_epoch, n_epochs):
        self._itr_start_time = time.time()
        save_episode = (self.step_episode if self.store_episodes else None)

        self._stats.last_episode = save_episode
        self._stats.total_epoch = 1
        self._stats.total_itr = self.step_itr

    def get_env_copy(self):
        """Get a copy of the environment.

        Returns:
            Environment: An environment instance.

        """
        if self._env:
            return cloudpickle.loads(cloudpickle.dumps(self._env))
        else:
            return None

    def process_samples(self, itr, paths):
        """Compute returns and advantages, then flatten paths into tensors."""
        baselines, returns = [], []

        all_path_baselines = [self._baseline.predict(path) for path in paths]

        for idx, path in enumerate(paths):
            path_baselines = np.append(all_path_baselines[idx], 0)
            deltas = (path['rewards'] +
                      self._algo.discount * path_baselines[1:] -
                      path_baselines[:-1])
            path['advantages'] = special.discount_cumsum(
                deltas, self._algo.discount * self.gae_lambda)
            path['returns'] = special.discount_cumsum(path['rewards'],
                                                      self._algo.discount)
            baselines.append(path_baselines[:-1])
            returns.append(path['returns'])

        observations = tensor_utils.concat_tensor_list(
            [path['observations'] for path in paths])
        actions = tensor_utils.concat_tensor_list(
            [path['actions'] for path in paths])
        rewards = tensor_utils.concat_tensor_list(
            [path['rewards'] for path in paths])
        returns = tensor_utils.concat_tensor_list(
            [path['returns'] for path in paths])
        advantages = tensor_utils.concat_tensor_list(
            [path['advantages'] for path in paths])
        env_infos = tensor_utils.concat_tensor_dict_list(
            [path['env_infos'] for path in paths])
        agent_infos = tensor_utils.concat_tensor_dict_list(
            [path['agent_infos'] for path in paths])

        advantages = center_advantages(advantages)

        return dict(
            observations=observations,
            actions=actions,
            advantages=advantages,
            rewards=rewards,
            returns=returns,
            agent_infos=agent_infos,
            env_infos=env_infos,
            paths=paths,
        )

    @property
    def total_env_steps(self):
        """Total environment steps collected.

        Returns:
            int: Total environment steps collected.

        """
        return self._stats.total_env_steps