def eval_policy(
    _seed: int,
    env_name: str,
    eval_n_timesteps: Optional[int],
    eval_n_episodes: Optional[int],
    num_vec: int,
    parallel: bool,
    render: bool,
    render_fps: int,
    log_dir: str,
    policy_type: str,
    policy_path: str,
    reward_type: Optional[str] = None,
    reward_path: Optional[str] = None,
    max_episode_steps: Optional[int] = None,
):
    """Rolls a policy out in an environment, collecting statistics.

    Args:
        _seed: generated by Sacred.
        env_name: Gym environment identifier.
        eval_n_timesteps: Minimum number of timesteps to evaluate for. Set
            exactly one of `eval_n_episodes` and `eval_n_timesteps`.
        eval_n_episodes: Minimum number of episodes to evaluate for. Set
            exactly one of `eval_n_episodes` and `eval_n_timesteps`.
        num_vec: Number of environments to run simultaneously.
        parallel: If True, use `SubprocVecEnv` for true parallelism;
            otherwise, uses `DummyVecEnv`.
        max_episode_steps: If not None, then environments are wrapped by
            TimeLimit so that they have at most `max_episode_steps` steps per
            episode.
        render: If True, renders interactively to the screen.
        render_fps: The target number of frames per second to render on
            screen.
        log_dir: The directory to log intermediate output to. (As of
            2019-07-19 this is just episode-by-episode reward from
            bench.Monitor.)
        policy_type: A unique identifier for the saved policy, defined in
            POLICY_CLASSES.
        policy_path: A path to the serialized policy.
        reward_type: If specified, overrides the environment reward with a
            reward of this type.
        reward_path: If reward_type is specified, the path to a serialized
            reward of `reward_type` to override the environment reward with.

    Returns:
        Return value of `imitation.util.rollout.rollout_stats()`.
    """
    tf.logging.set_verbosity(tf.logging.INFO)
    tf.logging.info("Logging to %s", log_dir)
    sample_until = rollout.make_sample_until(eval_n_timesteps, eval_n_episodes)
    venv = util.make_vec_env(
        env_name,
        num_vec,
        seed=_seed,
        parallel=parallel,
        log_dir=log_dir,
        max_episode_steps=max_episode_steps,
    )
    if render:
        venv = InteractiveRender(venv, render_fps)
    # TODO(adam): add support for videos using VideoRecorder?

    with contextlib.ExitStack() as stack:
        if reward_type is not None:
            reward_fn_ctx = load_reward(reward_type, reward_path, venv)
            reward_fn = stack.enter_context(reward_fn_ctx)
            venv = reward_wrapper.RewardVecEnvWrapper(venv, reward_fn)
            tf.logging.info(
                f"Wrapped env in reward {reward_type} from {reward_path}.")

        with serialize.load_policy(policy_type, policy_path, venv) as policy:
            trajs = rollout.generate_trajectories(policy, venv, sample_until)
    return rollout.rollout_stats(trajs)
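# A minimal sketch of the `InteractiveRender` wrapper referenced above but not
# defined in this section. It assumes the `VecEnvWrapper` interface from
# Stable Baselines 2.x (matching the TF1-era code above); the actual class used
# by these scripts may differ, e.g. in how it throttles the frame rate.
import time

from stable_baselines.common.vec_env import VecEnvWrapper


class InteractiveRender(VecEnvWrapper):
    """Renders the wrapped VecEnv to the screen, throttled to `fps` if positive."""

    def __init__(self, venv, fps: int):
        super().__init__(venv)
        self.render_fps = fps

    def reset(self):
        obs = self.venv.reset()
        self.venv.render()
        return obs

    def step_wait(self):
        step_result = self.venv.step_wait()
        if self.render_fps > 0:
            # Sleep so rendering does not exceed the requested frame rate.
            time.sleep(1 / self.render_fps)
        self.venv.render()
        return step_result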
def eval_policy(
    _run,
    _seed: int,
    env_name: str,
    eval_n_timesteps: Optional[int],
    eval_n_episodes: Optional[int],
    num_vec: int,
    parallel: bool,
    render: bool,
    render_fps: int,
    videos: bool,
    video_kwargs: Mapping[str, Any],
    log_dir: str,
    policy_type: str,
    policy_path: str,
    reward_type: Optional[str] = None,
    reward_path: Optional[str] = None,
    max_episode_steps: Optional[int] = None,
):
    """Rolls a policy out in an environment, collecting statistics.

    Args:
        _run: Sacred run object; used to create a symlink to `log_dir`.
        _seed: generated by Sacred.
        env_name: Gym environment identifier.
        eval_n_timesteps: Minimum number of timesteps to evaluate for. Set
            exactly one of `eval_n_episodes` and `eval_n_timesteps`.
        eval_n_episodes: Minimum number of episodes to evaluate for. Set
            exactly one of `eval_n_episodes` and `eval_n_timesteps`.
        num_vec: Number of environments to run simultaneously.
        parallel: If True, use `SubprocVecEnv` for true parallelism;
            otherwise, uses `DummyVecEnv`.
        max_episode_steps: If not None, then environments are wrapped by
            TimeLimit so that they have at most `max_episode_steps` steps per
            episode.
        render: If True, renders interactively to the screen.
        render_fps: The target number of frames per second to render on
            screen.
        videos: If True, saves videos to `log_dir`.
        video_kwargs: Keyword arguments passed through to
            `video_wrapper.VideoWrapper`.
        log_dir: The directory to log intermediate output to, such as episode
            reward.
        policy_type: A unique identifier for the saved policy, defined in
            POLICY_CLASSES.
        policy_path: A path to the serialized policy.
        reward_type: If specified, overrides the environment reward with a
            reward of this type.
        reward_path: If reward_type is specified, the path to a serialized
            reward of `reward_type` to override the environment reward with.

    Returns:
        Return value of `imitation.util.rollout.rollout_stats()`.
    """
    os.makedirs(log_dir, exist_ok=True)
    sacred_util.build_sacred_symlink(log_dir, _run)

    logging.basicConfig(level=logging.INFO)
    logging.info("Logging to %s", log_dir)
    sample_until = rollout.make_sample_until(eval_n_timesteps, eval_n_episodes)
    post_wrappers = [video_wrapper_factory(log_dir, **video_kwargs)] if videos else None
    venv = util.make_vec_env(
        env_name,
        num_vec,
        seed=_seed,
        parallel=parallel,
        log_dir=log_dir,
        max_episode_steps=max_episode_steps,
        post_wrappers=post_wrappers,
    )

    try:
        if render:
            # As of July 31, 2020, DummyVecEnv rendering only works with
            # num_vec=1 due to a bug in Stable Baselines 3.
            venv = InteractiveRender(venv, render_fps)

        if reward_type is not None:
            reward_fn = load_reward(reward_type, reward_path, venv)
            venv = reward_wrapper.RewardVecEnvWrapper(venv, reward_fn)
            logging.info(f"Wrapped env in reward {reward_type} from {reward_path}.")

        policy = serialize.load_policy(policy_type, policy_path, venv)
        trajs = rollout.generate_trajectories(policy, venv, sample_until)
        return rollout.rollout_stats(trajs)
    finally:
        venv.close()
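# A minimal sketch of the `video_wrapper_factory` helper used above; it is not
# defined in this section. It assumes that `post_wrappers` passed to
# `util.make_vec_env` are callables of the form `wrapper(env, env_index)`, and
# that `video_wrapper.VideoWrapper(env, directory, **kwargs)` is imitation's
# video-recording wrapper; the real helper and wrapper signature may differ.
import pathlib

from imitation.util import video_wrapper


def video_wrapper_factory(log_dir: str, **kwargs):
    """Returns a callable that wraps a single env in a video recorder."""

    def f(env, i):
        # Each vectorized sub-environment writes videos to its own directory,
        # e.g. {log_dir}/videos/0, {log_dir}/videos/1, ...
        directory = pathlib.Path(log_dir) / "videos" / str(i)
        return video_wrapper.VideoWrapper(env, directory, **kwargs)

    return f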
def train_preferences(
    _seed: int,  # pylint:disable=invalid-name
    # Dataset
    env_name: str,
    discount: float,
    num_vec: int,
    policy_type: str,
    policy_path: str,
    # Target specification
    target_reward_type: str,
    target_reward_path: str,
    # Model parameters
    model_reward_type: regress_utils.EnvRewardFactory,
    trajectory_length: int,
    total_timesteps: int,
    batch_timesteps: int,
    learning_rate: float,
    weight_l2_reg: float,
    reward_l2_reg: float,
    accuracy_threshold: float,
    # Logging
    log_dir: str,
) -> Mapping[str, Any]:
    """Entry-point into script for synthetic preference comparisons."""
    venv = util.make_vec_env(env_name, n_envs=num_vec, seed=_seed)

    make_source = functools.partial(regress_utils.make_model, model_reward_type)

    def make_trainer(model, model_scope, target):
        del target
        model_params = model_scope.global_variables()
        batch_size = batch_timesteps // trajectory_length
        kwargs = {"learning_rate": learning_rate}
        return preferences.PreferenceComparisonTrainer(
            model,
            model_params,
            batch_size=batch_size,
            optimizer_kwargs=kwargs,
            weight_l2_reg=weight_l2_reg,
            reward_l2_reg=reward_l2_reg,
            accuracy_threshold=accuracy_threshold,
        )

    with policies_serialize.load_policy(policy_type, policy_path, venv) as policy:

        def do_training(target, trainer):
            # Specify in terms of total_timesteps so longer trajectory_length
            # does not give model more data.
            total_comparisons = total_timesteps // trajectory_length
            return trainer.fit_synthetic(
                venv,
                policy=policy,
                target=target,
                trajectory_length=trajectory_length,
                total_comparisons=total_comparisons,
            )

        return regress_utils.regress(
            seed=_seed,
            env_name=env_name,
            discount=discount,
            make_source=make_source,
            source_init=True,
            make_trainer=make_trainer,
            do_training=do_training,
            target_reward_type=target_reward_type,
            target_reward_path=target_reward_path,
            log_dir=log_dir,
        )
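# A rough, illustrative sketch of what synthetic preference comparison training
# does conceptually; this is NOT the `PreferenceComparisonTrainer`
# implementation. Pairs of equal-length trajectory segments are rolled out
# under the policy, each pair is labeled by which segment earns more
# ground-truth ("target") reward, and the learned reward is fit so that a
# Bradley-Terry / logistic model over its predicted returns matches those
# labels. All names below are illustrative placeholders.
import numpy as np


def synthetic_preference_loss(pred_returns_a: np.ndarray,
                              pred_returns_b: np.ndarray,
                              target_returns_a: np.ndarray,
                              target_returns_b: np.ndarray) -> float:
    """Cross-entropy between synthetic labels and Bradley-Terry predictions."""
    # Synthetic label: 1 if segment A has higher ground-truth return, else 0.
    labels = (target_returns_a > target_returns_b).astype(np.float64)
    # Bradley-Terry probability that A is preferred under the learned reward.
    logits = pred_returns_a - pred_returns_b
    probs = 1.0 / (1.0 + np.exp(-logits))
    eps = 1e-8
    return float(-np.mean(labels * np.log(probs + eps)
                          + (1 - labels) * np.log(1 - probs + eps)))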
def train(
    _run,
    _seed: int,
    env_name: str,
    rollout_path: str,
    normalize: bool,
    normalize_kwargs: dict,
    n_expert_demos: Optional[int],
    log_dir: str,
    init_trainer_kwargs: dict,
    total_timesteps: int,
    n_episodes_eval: int,
    init_tensorboard: bool,
    checkpoint_interval: int,
    dac: bool,
    rollout_save_n_timesteps: int,
    rollout_save_n_episodes: int,
    num_vec: int,
    parallel: bool,
    max_episode_steps: Optional[int],
) -> dict:
    """Train an adversarial-network-based imitation learning algorithm.

    Checkpoints:
        - DiscrimNets are saved to f"{log_dir}/checkpoints/{step}/discrim/",
          where step is either the training epoch or "final".
        - Generator policies are saved to
          f"{log_dir}/checkpoints/{step}/gen_policy/".

    Args:
        _seed: Random seed.
        env_name: The environment to train in.
        rollout_path: Path to pickle containing list of Trajectories. Used as
            expert demonstrations.
        normalize: If True, wrap the environment in `VecNormalize` to rescale
            observations and reward.
        normalize_kwargs: Keyword arguments for `VecNormalize`.
        n_expert_demos: The number of expert trajectories to actually use
            after loading them from `rollout_path`. If None, then use all
            available trajectories. If `n_expert_demos` is an `int`, then use
            exactly `n_expert_demos` trajectories, erroring if there aren't
            enough trajectories. If there are surplus trajectories, then use
            the first `n_expert_demos` trajectories and drop the rest.
        log_dir: Directory to save models and other logging to.
        init_trainer_kwargs: Keyword arguments passed to `init_trainer`,
            used to initialize the trainer.
        total_timesteps: The number of transitions to sample from the
            environment during training.
        n_episodes_eval: The number of episodes to average over when
            calculating the average episode reward of the imitation policy
            for return.
        init_tensorboard: If True, then write tensorboard logs to
            `{log_dir}/sb_tb`.
        checkpoint_interval: Save the discriminator and generator models
            every `checkpoint_interval` epochs and after training is
            complete. If 0, then only save weights after training is
            complete. If <0, then don't save weights at all.
        dac: Project-specific flag forwarded to `util.make_vec_env`.
        rollout_save_n_timesteps: Minimum number of timesteps to roll out when
            saving demonstrations of the final policy; passed to
            `util.rollout.make_sample_until`.
        rollout_save_n_episodes: Minimum number of episodes to roll out when
            saving demonstrations of the final policy; passed to
            `util.rollout.make_sample_until`.
        num_vec: Number of environments to run simultaneously.
        parallel: If True, use `SubprocVecEnv` for true parallelism;
            otherwise, uses `DummyVecEnv`.
        max_episode_steps: If not None, then environments are wrapped by
            TimeLimit so that they have at most `max_episode_steps` steps per
            episode.

    Returns:
        A dictionary with two keys. "imit_stats" gives the return value of
        `rollout_stats()` on rollouts in the test-reward-wrapped environment,
        using the final policy (remember that the ground-truth reward can be
        recovered from the "monitor_return" key). "expert_stats" gives the
        return value of `rollout_stats()` on the expert demonstrations loaded
        from `rollout_path`.
    """
    total_timesteps = int(total_timesteps)

    tf.logging.info("Logging to %s", log_dir)
    os.makedirs(log_dir, exist_ok=True)
    # try:
    #     sacred_util.build_sacred_symlink(log_dir, _run)
    # except Exception as e:
    #     print("didn't build symlink")
    # Calculate stats for expert rollouts. Used for plot and return value.
    # with open(rollout_path, "rb") as f:
    #     expert_trajs = pickle.load(f)
    # if n_expert_demos is not None:
    #     assert len(expert_trajs) >= n_expert_demos
    #     expert_trajs = expert_trajs[:n_expert_demos]
    #
    # expert_stats = util.rollout.rollout_stats(expert_trajs)

    sample_until = util.rollout.make_sample_until(rollout_save_n_timesteps,
                                                  rollout_save_n_episodes)

    with util.make_session():
        venv = util.make_vec_env(env_name, num_vec, seed=_seed,
                                 parallel=parallel, log_dir=log_dir,
                                 max_episode_steps=max_episode_steps,
                                 dac=dac)

        vec_normalize = None
        if normalize:
            venv = vec_normalize = VecNormalize(venv, **normalize_kwargs)
        tf.logging.info("Environment type: %s", type(venv))

        # Load the final generator policy checkpoint and save rollouts from it.
        gen_policy_path = os.path.join(log_dir, "checkpoints", "final",
                                       "gen_policy")
        tf.logging.info("Generator policy path: %s", gen_policy_path)

        with serialize.load_policy("ppo2", gen_policy_path, venv) as policy:
            util.rollout.save(gen_policy_path, policy, venv, sample_until)
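# A hedged sketch of reading back the rollouts written by `util.rollout.save`
# above, assuming (as the commented-out expert-stats block in `train` suggests)
# that they are stored as a pickled list of Trajectory objects. The path
# argument and the "return_mean" stats key are assumptions, not confirmed here.
import pickle


def load_saved_rollouts(path: str):
    """Loads a pickled list of trajectories and logs summary statistics."""
    with open(path, "rb") as f:
        trajs = pickle.load(f)
    # Same helper used in the commented-out expert-stats block above.
    stats = util.rollout.rollout_stats(trajs)
    tf.logging.info("Loaded %d trajectories; stats: %s", len(trajs), stats)
    return trajs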