def test_load(self, load_mode, last_epoch):
    snapshotter = Snapshotter()
    saved = snapshotter.load(self.temp_dir.name, load_mode)

    assert isinstance(saved['algo'], VPG)
    assert isinstance(saved['env'], GymEnv)
    assert isinstance(saved['algo'].policy, CategoricalMLPPolicy)
    assert saved['stats'].total_epoch == last_epoch
def watch_atari(saved_dir, env=None, num_episodes=10):
    """Watch a trained agent play an atari game.

    Args:
        saved_dir (str): Directory containing the pickle file.
        env (str): Environment to run episodes on. If None, the pickled
            environment is used.
        num_episodes (int): Number of episodes to play. Note that when using
            the EpisodicLife wrapper, an episode is considered done when a
            life is lost. Defaults to 10.

    """
    snapshotter = Snapshotter()
    data = snapshotter.load(saved_dir)
    if env is not None:
        env = gym.make(env)
        env = Noop(env, noop_max=30)
        env = MaxAndSkip(env, skip=4)
        env = EpisodicLife(env)
        if 'FIRE' in env.unwrapped.get_action_meanings():
            env = FireReset(env)
        env = Grayscale(env)
        env = Resize(env, 84, 84)
        env = ClipReward(env)
        env = StackFrames(env, 4, axis=0)
        env = GymEnv(env)
    else:
        env = data['env']

    exploration_policy = data['algo'].exploration_policy
    exploration_policy.policy._qf.to('cpu')

    ep_rewards = np.asarray([])
    for _ in range(num_episodes):
        episode_data = rollout(env,
                               exploration_policy.policy,
                               animated=True,
                               pause_per_frame=0.02)
        ep_rewards = np.append(ep_rewards, np.sum(episode_data['rewards']))

    print('Average Reward {}'.format(np.mean(ep_rewards)))
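# A minimal usage sketch for the function above (not part of the original
# snippet): the snapshot directory and environment id are hypothetical
# placeholders for whatever experiment was actually trained.
if __name__ == '__main__':
    watch_atari('data/local/experiment/dqn_pong',  # assumed snapshot dir
                env='PongNoFrameskip-v4',
                num_episodes=5)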
def main():
    snapshotter = Snapshotter()
    with tf.compat.v1.Session():
        print("loading model...")
        data = snapshotter.load(MODEL_PATH)
        print("model", data)
        policy = data['algo'].policy
        env = data['env']

        steps, max_steps = 0, 1000
        done = False
        env.render()
        obs = env.reset()  # The initial observation
        policy.reset()

        while steps < max_steps and not done:
            action = policy.get_action(obs)[0]
            obs, rew, done, _ = env.step(action)
            env.render()  # Render the environment to see what's going on (optional)
            steps += 1

        env.close()
def test_load_with_invalid_load_mode(self):
    snapshotter = Snapshotter()
    with pytest.raises(ValueError):
        snapshotter.load(self.temp_dir.name, 'foo')
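# A brief sketch of the load modes exercised by the tests above, outside a
# test fixture; 'path/to/snapshot/dir' is a placeholder, not a real path.
snapshotter = Snapshotter()
last = snapshotter.load('path/to/snapshot/dir')            # latest snapshot (default)
first = snapshotter.load('path/to/snapshot/dir', 'first')  # earliest snapshot
epoch4 = snapshotter.load('path/to/snapshot/dir', 4)       # a specific epoch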
class LocalRunner:
    """Base class of local runner.

    Use Runner.setup(algo, env) to set up the algorithm and environment for
    the runner, and Runner.train() to start training.

    Args:
        snapshot_config (garage.experiment.SnapshotConfig): The snapshot
            configuration used by LocalRunner to create the snapshotter.
            If None, it will create one with default settings.
        max_cpus (int): The maximum number of parallel sampler workers.

    Examples:
        # to train
        runner = LocalRunner()
        env = Env(...)
        policy = Policy(...)
        algo = Algo(
            env=env,
            policy=policy,
            ...)
        runner.setup(algo, env)
        runner.train(n_epochs=100, batch_size=4000)

        # to resume immediately.
        runner = LocalRunner()
        runner.restore(resume_from_dir)
        runner.resume()

        # to resume with modified training arguments.
        runner = LocalRunner()
        runner.restore(resume_from_dir)
        runner.resume(n_epochs=20)

    """

    def __init__(self, snapshot_config=None, max_cpus=1):
        if snapshot_config:
            self._snapshotter = Snapshotter(snapshot_config.snapshot_dir,
                                            snapshot_config.snapshot_mode,
                                            snapshot_config.snapshot_gap)
        else:
            self._snapshotter = Snapshotter()

        if max_cpus > 1:
            from garage.sampler import singleton_pool
            singleton_pool.initialize(max_cpus)

        self.has_setup = False
        self.plot = False

        self._setup_args = None
        self.train_args = None

    def setup(self, algo, env, sampler_cls=None, sampler_args=None):
        """Set up runner for algorithm and environment.

        This method saves algo and env within runner and creates a sampler.

        Note:
            After setup() is called all variables in session should have been
            initialized. setup() respects existing values in session so
            policy weights can be loaded before setup().

        Args:
            algo (garage.np.algos.RLAlgorithm): An algorithm instance.
            env (garage.envs.GarageEnv): An environment instance.
            sampler_cls (garage.sampler.Sampler): A sampler class.
            sampler_args (dict): Arguments to be passed to sampler
                constructor.

        """
        self.algo = algo
        self.env = env
        self.policy = self.algo.policy

        if sampler_args is None:
            sampler_args = {}
        if sampler_cls is None:
            sampler_cls = algo.sampler_cls
        self.sampler = sampler_cls(env, self.policy, **sampler_args)

        self.has_setup = True

        self._setup_args = types.SimpleNamespace(sampler_cls=sampler_cls,
                                                 sampler_args=sampler_args)

    def _start_worker(self):
        """Start Plotter and Sampler workers."""
        self.sampler.start_worker()
        if self.plot:
            from garage.tf.plotter import Plotter
            self.plotter = Plotter(self.env, self.policy)
            self.plotter.start()

    def _shutdown_worker(self):
        """Shutdown Plotter and Sampler workers."""
        self.sampler.shutdown_worker()
        if self.plot:
            self.plotter.close()

    def obtain_samples(self, itr, batch_size):
        """Obtain one batch of samples.

        Args:
            itr (int): Index of iteration (epoch).
            batch_size (int): Number of steps in batch. This is a hint that
                the sampler may or may not respect.

        Returns:
            One batch of samples.

        """
        show_pbar = False
        if self.train_args.n_epoch_cycles == 1:
            logger.log('Obtaining samples...')
            show_pbar = True
        # TODO refactor logging to clean up args of obtain_samples
        return self.sampler.obtain_samples(itr,
                                           log=True,
                                           show_pbar=show_pbar)

    def save(self, epoch, paths=None):
        """Save snapshot of current batch.

        Args:
            epoch (int): Index of iteration (epoch).
            paths (dict): Batch of samples after preprocessing. If None, no
                paths will be logged to the snapshot.
""" if not self.has_setup: raise Exception('Use setup() to setup runner before saving.') logger.log('Saving snapshot...') params = dict() # Save arguments params['setup_args'] = self._setup_args params['train_args'] = self.train_args # Save states params['env'] = self.env # TODO: add this back # params['algo'] = self.algo if paths: params['paths'] = paths params['last_epoch'] = epoch self._snapshotter.save_itr_params(epoch, params) logger.log('Saved') def restore(self, from_dir, from_epoch='last'): """Restore experiment from snapshot. Args: from_dir (str): Directory of the pickle file to resume experiment from. from_epoch (str or int): The epoch to restore from. Can be 'first', 'last' or a number. Not applicable when snapshot_mode='last'. Returns: A SimpleNamespace for train()'s arguments. """ saved = self._snapshotter.load(from_dir, from_epoch) self._setup_args = saved['setup_args'] self.train_args = saved['train_args'] self.setup(env=saved['env'], algo=saved['algo'], sampler_cls=self._setup_args.sampler_cls, sampler_args=self._setup_args.sampler_args) n_epochs = self.train_args.n_epochs last_epoch = saved['last_epoch'] n_epoch_cycles = self.train_args.n_epoch_cycles batch_size = self.train_args.batch_size store_paths = self.train_args.store_paths pause_for_plot = self.train_args.pause_for_plot fmt = '{:<20} {:<15}' logger.log('Restore from snapshot saved in %s' % self._snapshotter.snapshot_dir) logger.log(fmt.format('Train Args', 'Value')) logger.log(fmt.format('n_epochs', n_epochs)) logger.log(fmt.format('last_epoch', last_epoch)) logger.log(fmt.format('n_epoch_cycles', n_epoch_cycles)) logger.log(fmt.format('batch_size', batch_size)) logger.log(fmt.format('store_paths', store_paths)) logger.log(fmt.format('pause_for_plot', pause_for_plot)) self.train_args.start_epoch = last_epoch + 1 return copy.copy(self.train_args) def log_diagnostics(self, pause_for_plot=False): """Log diagnostics. Args: pause_for_plot(bool): Pause for plot. """ logger.log('Time %.2f s' % (time.time() - self._start_time)) logger.log('EpochTime %.2f s' % (time.time() - self._itr_start_time)) logger.log(tabular) if self.plot: self.plotter.update_plot(self.policy, self.algo.max_path_length) if pause_for_plot: input('Plotting evaluation run: Press Enter to " "continue...') def train(self, n_epochs, batch_size, n_epoch_cycles=1, plot=False, store_paths=False, pause_for_plot=False): """Start training. Args: n_epochs(int): Number of epochs. batch_size(int): Number of environment steps in one batch. n_epoch_cycles(int): Number of batches of samples in each epoch. This is only useful for off-policy algorithm. For on-policy algorithm this value should always be 1. plot(bool): Visualize policy by doing rollout after each epoch. store_paths(bool): Save paths in snapshot. pause_for_plot(bool): Pause for plot. Returns: The average return in last epoch cycle. """ if not self.has_setup: raise Exception('Use setup() to setup runner before training.') if hasattr(self.algo, 'n_epoch_cycles'): n_epoch_cycles = self.algo.n_epoch_cycles # Save arguments for restore self.train_args = types.SimpleNamespace(n_epochs=n_epochs, n_epoch_cycles=n_epoch_cycles, batch_size=batch_size, plot=plot, store_paths=store_paths, pause_for_plot=pause_for_plot, start_epoch=0) self.plot = plot return self.algo.train(self, batch_size) def step_epochs(self): """Generator for training. This function serves as a generator. It is used to separate services such as snapshotting, sampler control from the actual training loop. 
        It is used inside train() in each algorithm.

        The generator initializes two variables: `self.step_itr` and
        `self.step_path`. To use the generator, these two have to be
        updated manually in each epoch, as the example below shows.

        Yields:
            int: The next training epoch.

        Examples:
            for epoch in runner.step_epochs():
                runner.step_path = runner.obtain_samples(...)
                self.train_once(...)
                runner.step_itr += 1

        """
        try:
            self._start_worker()
            self._start_time = time.time()
            self.step_itr = (self.train_args.start_epoch *
                             self.train_args.n_epoch_cycles)
            self.step_path = None

            for epoch in range(self.train_args.start_epoch,
                               self.train_args.n_epochs):
                self._itr_start_time = time.time()
                with logger.prefix('epoch #%d | ' % epoch):
                    yield epoch
                    save_path = (self.step_path
                                 if self.train_args.store_paths else None)
                    self.save(epoch, save_path)
                    self.log_diagnostics(self.train_args.pause_for_plot)
                    logger.dump_all(self.step_itr)
                    tabular.clear()
        finally:
            self._shutdown_worker()

    def resume(self,
               n_epochs=None,
               batch_size=None,
               n_epoch_cycles=None,
               plot=None,
               store_paths=None,
               pause_for_plot=None):
        """Resume from restored experiment.

        This method provides the same interface as train().

        If not specified, an argument will default to the saved arguments
        from the last call to train().

        Returns:
            The average return in last epoch cycle.

        """
        if self.train_args is None:
            raise Exception('You must call restore() before resume().')

        self.train_args.n_epochs = n_epochs or self.train_args.n_epochs
        self.train_args.batch_size = batch_size or self.train_args.batch_size
        self.train_args.n_epoch_cycles = (n_epoch_cycles
                                          or self.train_args.n_epoch_cycles)

        if plot is not None:
            self.train_args.plot = plot
        if store_paths is not None:
            self.train_args.store_paths = store_paths
        if pause_for_plot is not None:
            self.train_args.pause_for_plot = pause_for_plot

        # Use the (possibly restored) batch size so that resume() works even
        # when batch_size is not passed explicitly.
        return self.algo.train(self, self.train_args.batch_size)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        pass
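# A minimal sketch of the algorithm side of the contract assumed by
# step_epochs(); MyAlgo, sampler_cls and train_once() are hypothetical names,
# not part of the class above.
class MyAlgo:
    sampler_cls = None  # the concrete algorithm supplies a sampler class

    def train(self, runner, batch_size):
        last_return = None
        for epoch in runner.step_epochs():
            # Collect a batch, update the policy, and advance the counter
            # the runner uses when saving snapshots and dumping logs.
            runner.step_path = runner.obtain_samples(runner.step_itr,
                                                     batch_size)
            last_return = self.train_once(runner.step_itr, runner.step_path)
            runner.step_itr += 1
        return last_return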
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--render',
                        action='store_true',
                        help='Render trajectories')
    parser.add_argument('--random', action='store_true', help='Noisy actions')
    parser.add_argument('--num_samples',
                        type=int,
                        default=int(1e5),
                        help='Num samples to collect')
    args = parser.parse_args()

    buffer_data = reset_data()

    snapshotter = Snapshotter()
    with tf.compat.v1.Session():  # optional, only for TensorFlow
        data = snapshotter.load(MODEL_PATH)
        policy = data['algo'].policy
        env = data['env']

        steps, max_steps = 0, 1000
        if args.render:
            env.render()
        obs = env.reset()  # The initial observation
        policy.reset()
        done = False
        ts = 0
        rews = []

        for _ in range(args.num_samples):
            obs = env.reset()  # The initial observation
            policy.reset()
            done = False
            rew = 0.0
            ts = 0
            tot_rew = 0
            # if _ % 1000 == 0: print('episode: ', _)
            for _ in range(max_steps):
                if args.render:
                    env.render()
                # get_action returns the action and the agent info (e.g. the
                # action distribution parameters). Inspiration:
                # https://github.com/lcswillems/rl-starter-files/blob/3c7289765883ca681e586b51acf99df1351f8ead/utils/agent.py#L47
                act, prob = policy.get_action(obs)
                append_data(buffer_data, obs, act, prob, done, rew)
                new_obs, rew, done, _ = env.step(act)
                ts += 1
                tot_rew += rew

                if done:
                    # reset target here!
                    random_act = env.action_space.sample()
                    infos = {
                        'mean': np.random.rand(env.action_space.shape[0]),
                        'log_std': np.random.rand(env.action_space.shape[0])
                    }  # random action info
                    append_data(buffer_data, new_obs, random_act, infos, done,
                                rew)
                    break
                else:
                    # continue by setting current obs
                    obs = new_obs

            rews.append(tot_rew)

        print('Avg Rew: ', np.mean(rews))

        fname = 'generated_hopper_probs.hdf5'
        dataset = h5py.File(fname, 'w')
        npify(buffer_data)
        for key in buffer_data:
            dataset.create_dataset(key,
                                   data=buffer_data[key],
                                   compression='gzip')

        env.close()
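# A small read-back sketch (not part of the original script) for inspecting
# the generated file; the dataset keys depend on what reset_data() and
# append_data() store, so none are assumed here beyond iterating over them.
import h5py

with h5py.File('generated_hopper_probs.hdf5', 'r') as f:
    for key in f:
        print(key, f[key].shape)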