def train(hparams, output_dir, report_fn=None):
  """Train."""
  hparams = initialize_env_specs(hparams)
  learner = rl_utils.LEARNERS[hparams.base_algo](
      hparams.frame_stack_size, FLAGS.output_dir, output_dir
  )
  policy_hparams = trainer_lib.create_hparams(hparams.base_algo_params)
  rl_utils.update_hparams_from_hparams(
      policy_hparams, hparams, hparams.base_algo + "_"
  )

  total_steps = policy_hparams.epochs_num
  eval_every_epochs = policy_hparams.eval_every_epochs
  if eval_every_epochs == 0:
    # In-learner evaluation is disabled, so evaluate only once at the end.
    eval_every_epochs = total_steps
  # Evaluation is driven by the loop below, not inside the learner.
  policy_hparams.eval_every_epochs = 0

  steps = list(range(eval_every_epochs, total_steps + 1, eval_every_epochs))
  if not steps or steps[-1] < eval_every_epochs:
    steps.append(eval_every_epochs)
  metric_name = rl_utils.get_metric_name(
      sampling_temp=hparams.eval_sampling_temps[0],
      max_num_noops=hparams.eval_max_num_noops,
      clipped=False
  )

  for step in steps:
    # Train up to `step` policy epochs in total, then evaluate.
    policy_hparams.epochs_num = step
    learner.train(
        hparams.env_fn, policy_hparams, simulated=False,
        save_continuously=True, epoch=0
    )
    eval_metrics = rl_utils.evaluate_all_configs(hparams, output_dir)
    tf.logging.info(
        "Agent eval metrics:\n{}".format(pprint.pformat(eval_metrics))
    )
    if report_fn:
      report_fn(eval_metrics[metric_name], step)

def evaluate(
    loop_hparams, planner_hparams, policy_dir, model_dir, eval_metrics_dir,
    agent_type, eval_with_learner, log_every_steps, report_fn=None,
    report_metric=None
):
  """Evaluate."""
  if eval_with_learner:
    assert agent_type == "policy"

  if report_fn:
    assert report_metric is not None

  eval_metrics_writer = tf.summary.FileWriter(eval_metrics_dir)
  kwargs = {}
  if not eval_with_learner:
    kwargs["eval_fn"] = make_eval_fn_with_agent(
        agent_type, planner_hparams, model_dir,
        log_every_steps=log_every_steps
    )
  eval_metrics = rl_utils.evaluate_all_configs(
      loop_hparams, policy_dir, **kwargs
  )
  rl_utils.summarize_metrics(eval_metrics_writer, eval_metrics, 0)

  # Report metrics
  if report_fn:
    if report_metric == "mean_reward":
      metric_name = rl_utils.get_metric_name(
          sampling_temp=loop_hparams.eval_sampling_temps[0],
          max_num_noops=loop_hparams.eval_max_num_noops,
          clipped=False
      )
      report_fn(eval_metrics[metric_name], 0)
    else:
      report_fn(eval_metrics[report_metric], 0)
  return eval_metrics

def evaluate(
    loop_hparams, planner_hparams, policy_dir, model_dir, eval_metrics_dir,
    agent_type, eval_mode, eval_with_learner, log_every_steps,
    debug_video_path, num_debug_videos=1, random_starts_step_limit=None,
    report_fn=None, report_metric=None
):
  """Evaluate."""
  if eval_with_learner:
    assert agent_type == "policy"

  if report_fn:
    assert report_metric is not None

  eval_metrics_writer = tf.summary.FileWriter(eval_metrics_dir)
  video_writers = ()
  kwargs = {}
  if eval_mode in ["agent_real", "agent_simulated"]:
    if not eval_with_learner:
      if debug_video_path:
        tf.gfile.MakeDirs(debug_video_path)
        video_writers = [
            common_video.WholeVideoWriter(  # pylint: disable=g-complex-comprehension
                fps=10,
                output_path=os.path.join(debug_video_path, "{}.avi".format(i)),
                file_format="avi",
            )
            for i in range(num_debug_videos)
        ]
      kwargs["eval_fn"] = make_eval_fn_with_agent(
          agent_type, eval_mode, planner_hparams, model_dir,
          log_every_steps=log_every_steps,
          video_writers=video_writers,
          random_starts_step_limit=random_starts_step_limit
      )
    eval_metrics = rl_utils.evaluate_all_configs(
        loop_hparams, policy_dir, **kwargs
    )
  else:
    eval_metrics = evaluate_world_model(
        agent_type, loop_hparams, planner_hparams, model_dir, policy_dir,
        random_starts_step_limit, debug_video_path, log_every_steps
    )
  rl_utils.summarize_metrics(eval_metrics_writer, eval_metrics, 0)

  for video_writer in video_writers:
    video_writer.finish_to_disk()

  # Report metrics
  if report_fn:
    if report_metric == "mean_reward":
      metric_name = rl_utils.get_metric_name(
          sampling_temp=loop_hparams.eval_sampling_temps[0],
          max_num_noops=loop_hparams.eval_max_num_noops,
          clipped=False
      )
      report_fn(eval_metrics[metric_name], 0)
    else:
      report_fn(eval_metrics[report_metric], 0)
  return eval_metrics

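
# A hedged usage sketch for the evaluate() variant above, not part of the
# library. It assumes the module-level imports used by the functions here
# (trainer_lib, rl_utils, tf) and registered hparams sets; the hparams-set
# names, directory paths, and flag values below are illustrative assumptions.
# With eval_mode="agent_real" and eval_with_learner=False this exercises the
# agent-evaluation branch; an empty debug_video_path keeps video writers
# disabled.
def _example_evaluate_agent():
  loop_hparams = trainer_lib.create_hparams("rlmb_base")        # assumed name
  planner_hparams = trainer_lib.create_hparams("planner_tiny")  # assumed name
  return evaluate(
      loop_hparams, planner_hparams,
      policy_dir="/tmp/rl_loop/policy",
      model_dir="/tmp/rl_loop/world_model",
      eval_metrics_dir="/tmp/rl_loop/eval_metrics",
      agent_type="policy",
      eval_mode="agent_real",
      eval_with_learner=False,
      log_every_steps=100,
      debug_video_path="",
  )
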
def evaluate_on_new_model(model_dir_path):
  # Relies on hparams, eval_metrics_writer, metric_name and report_fn from the
  # enclosing scope, plus the module-level `step` counter.
  global step
  eval_metrics = rl_utils.evaluate_all_configs(hparams, model_dir_path)
  tf.logging.info(
      "Agent eval metrics:\n{}".format(pprint.pformat(eval_metrics))
  )
  rl_utils.summarize_metrics(eval_metrics_writer, eval_metrics, step)
  if report_fn:
    report_fn(eval_metrics[metric_name], step)
  step += 1

def evaluate(loop_hparams, planner_hparams, policy_dir, model_dir,
             eval_metrics_dir, agent_type, eval_with_learner, log_every_steps,
             debug_video_path, report_fn=None, report_metric=None):
  """Evaluate."""
  if eval_with_learner:
    assert agent_type == "policy"

  if report_fn:
    assert report_metric is not None

  eval_metrics_writer = tf.summary.FileWriter(eval_metrics_dir)
  # Initialized up front so the finish_to_disk() check below is safe when
  # eval_with_learner is True.
  video_writer = None
  kwargs = {}
  if not eval_with_learner:
    if debug_video_path:
      video_writer = common_video.WholeVideoWriter(
          fps=10, output_path=debug_video_path, file_format="avi")
    kwargs["eval_fn"] = make_eval_fn_with_agent(
        agent_type, planner_hparams, model_dir,
        log_every_steps=log_every_steps, video_writer=video_writer)
  eval_metrics = rl_utils.evaluate_all_configs(
      loop_hparams, policy_dir, **kwargs)
  rl_utils.summarize_metrics(eval_metrics_writer, eval_metrics, 0)

  if video_writer is not None:
    video_writer.finish_to_disk()

  # Report metrics
  if report_fn:
    if report_metric == "mean_reward":
      metric_name = rl_utils.get_metric_name(
          sampling_temp=loop_hparams.eval_sampling_temps[0],
          max_num_noops=loop_hparams.eval_max_num_noops,
          clipped=False)
      report_fn(eval_metrics[metric_name], 0)
    else:
      report_fn(eval_metrics[report_metric], 0)
  return eval_metrics

def training_loop(hparams, output_dir, report_fn=None, report_metric=None):
  """Run the main training loop."""
  if report_fn:
    assert report_metric is not None

  # Directories
  subdirectories = [
      "data", "tmp", "world_model", ("world_model", "debug_videos"),
      "policy", "eval_metrics"
  ]
  directories = setup_directories(output_dir, subdirectories)

  epoch = -1
  data_dir = directories["data"]
  env = rl_utils.setup_env(
      hparams, batch_size=hparams.real_batch_size,
      max_num_noops=hparams.max_num_noops,
      rl_env_max_episode_steps=hparams.rl_env_max_episode_steps
  )
  env.start_new_epoch(epoch, data_dir)

  if hparams.wm_policy_param_sharing:
    policy_model_dir = directories["world_model"]
  else:
    policy_model_dir = directories["policy"]
  learner = rl_utils.LEARNERS[hparams.base_algo](
      hparams.frame_stack_size, policy_model_dir, policy_model_dir,
      hparams.epochs
  )

  # Timing log function
  log_relative_time = make_relative_timing_fn()

  # Per-epoch state
  epoch_metrics = []
  metrics = {}

  # Collect data from the real environment.
  tf.logging.info("Initial training of the policy in real environment.")
  train_agent_real_env(env, learner, hparams, epoch)
  metrics["mean_reward/train/clipped"] = rl_utils.compute_mean_reward(
      env.current_epoch_rollouts(), clipped=True
  )
  tf.logging.info("Mean training reward (initial): {}".format(
      metrics["mean_reward/train/clipped"]
  ))
  env.generate_data(data_dir)

  eval_metrics_writer = tf.summary.FileWriter(directories["eval_metrics"])
  world_model_steps_num = 0

  for epoch in range(hparams.epochs):
    log = make_log_fn(epoch, log_relative_time)

    # Train world model
    log("Training world model")
    world_model_steps_num = train_world_model(
        env, data_dir, directories["world_model"], hparams,
        world_model_steps_num, epoch
    )

    # Train agent
    log("Training policy in simulated environment.")
    train_agent(env, learner, directories["world_model"], hparams, epoch)

    env.start_new_epoch(epoch, data_dir)

    # Train agent on real env (short)
    log("Training policy in real environment.")
    train_agent_real_env(env, learner, hparams, epoch)

    if hparams.stop_loop_early:
      return 0.0

    env.generate_data(data_dir)

    metrics = load_metrics(directories["eval_metrics"], epoch)
    if metrics:
      # Skip eval if metrics have already been written for this epoch.
      # Otherwise we'd overwrite them with wrong data.
      log("Metrics found for this epoch, skipping evaluation.")
    else:
      metrics["mean_reward/train/clipped"] = rl_utils.compute_mean_reward(
          env.current_epoch_rollouts(), clipped=True
      )
      log("Mean training reward: {}".format(
          metrics["mean_reward/train/clipped"]
      ))

      eval_metrics = rl_utils.evaluate_all_configs(hparams, policy_model_dir)
      log("Agent eval metrics:\n{}".format(pprint.pformat(eval_metrics)))
      metrics.update(eval_metrics)

      if hparams.eval_world_model:
        debug_video_path = os.path.join(
            directories["world_model", "debug_videos"],
            "{}.avi".format(env.current_epoch)
        )
        wm_metrics = evaluate_world_model(
            env, hparams, directories["world_model"], debug_video_path
        )
        log("World model eval metrics:\n{}".format(pprint.pformat(wm_metrics)))
        metrics.update(wm_metrics)

      rl_utils.summarize_metrics(eval_metrics_writer, metrics, epoch)

      # Report metrics
      if report_fn:
        if report_metric == "mean_reward":
          metric_name = rl_utils.get_metric_name(
              sampling_temp=hparams.eval_sampling_temps[0],
              max_num_noops=hparams.eval_max_num_noops,
              clipped=False
          )
          report_fn(eval_metrics[metric_name], epoch)
        else:
          report_fn(eval_metrics[report_metric], epoch)

    epoch_metrics.append(metrics)

  # Return the evaluation metrics from the final epoch
  return epoch_metrics[-1]

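
# A hedged usage sketch for training_loop() above, not part of the library.
# It assumes the module-level imports already used here (trainer_lib, tf,
# pprint); "rlmb_base" and the output path are illustrative assumptions. The
# loop returns the metrics dict of the final epoch.
def _example_training_loop():
  loop_hparams = trainer_lib.create_hparams("rlmb_base")  # assumed hparams set
  final_metrics = training_loop(loop_hparams, output_dir="/tmp/rl_loop")
  tf.logging.info("Final epoch metrics:\n%s", pprint.pformat(final_metrics))
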
def train(hparams, output_dir, report_fn=None):
  """Train."""
  hparams = initialize_env_specs(hparams)

  tf.logging.vlog(1, "HParams in trainer_model_free.train : %s",
                  misc_utils.pprint_hparams(hparams))

  tf.logging.vlog(1, "Using hparams.base_algo: %s", hparams.base_algo)
  learner = rl_utils.LEARNERS[hparams.base_algo](
      hparams.frame_stack_size, output_dir, output_dir, total_num_epochs=1
  )

  policy_hparams = trainer_lib.create_hparams(hparams.base_algo_params)
  rl_utils.update_hparams_from_hparams(
      policy_hparams, hparams, hparams.base_algo + "_"
  )
  tf.logging.vlog(1, "Policy HParams : %s",
                  misc_utils.pprint_hparams(policy_hparams))

  total_steps = policy_hparams.epochs_num
  tf.logging.vlog(2, "total_steps: %d", total_steps)

  eval_every_epochs = policy_hparams.eval_every_epochs
  tf.logging.vlog(2, "eval_every_epochs: %d", eval_every_epochs)

  if eval_every_epochs == 0:
    eval_every_epochs = total_steps
  policy_hparams.eval_every_epochs = 0

  steps = list(range(eval_every_epochs, total_steps + 1, eval_every_epochs))
  if not steps or steps[-1] < eval_every_epochs:
    steps.append(eval_every_epochs)
  tf.logging.vlog(1, "steps: [%s]", ",".join([str(s) for s in steps]))

  metric_name = rl_utils.get_metric_name(
      sampling_temp=hparams.eval_sampling_temps[0],
      max_num_noops=hparams.eval_max_num_noops,
      clipped=False)
  tf.logging.vlog(1, "metric_name: %s", metric_name)

  for i, step in enumerate(steps):
    tf.logging.info("Starting training iteration [%d] for [%d] steps.",
                    i, step)
    policy_hparams.epochs_num = step
    learner.train(hparams.env_fn, policy_hparams, simulated=False,
                  save_continuously=True, epoch=0)
    tf.logging.info("Ended training iteration [%d] for [%d] steps.", i, step)

    eval_metrics = rl_utils.evaluate_all_configs(hparams, output_dir)
    tf.logging.info("Agent eval metrics:\n{}".format(
        pprint.pformat(eval_metrics)))

    if report_fn:
      report_fn(eval_metrics[metric_name], step)

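
# A hedged usage sketch for the model-free train() above, not part of the
# library. It assumes the module-level imports used here (trainer_lib, tf);
# "rlmf_original" and the output path are illustrative assumptions, and any
# hparams set accepted by initialize_env_specs() would work in its place.
def _example_train_model_free():
  loop_hparams = trainer_lib.create_hparams("rlmf_original")  # assumed name
  train(loop_hparams, output_dir="/tmp/rl_model_free")
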