def log(
    self,
    log_writer: SummaryWriter,
    task_outputs: Optional[List[Any]],
    render: Optional[Dict[str, List[Dict[str, Any]]]],
    num_steps: int,
):
    viz_order, all_episodes = self._auto_viz_order(task_outputs)
    if viz_order is None:
        get_logger().debug("trajectory viz returning without visualizing")
        return

    for page, current_ids in enumerate(viz_order):
        figs = []
        for episode_id in current_ids:
            # assert episode_id in all_episodes
            if episode_id not in all_episodes:
                get_logger().warning(
                    "skipping viz for missing episode {}".format(episode_id)
                )
                continue
            figs.append(self.make_fig(all_episodes[episode_id], episode_id))
        if len(figs) == 0:
            continue
        log_writer.add_figure(
            "{}/{}_group{}".format(self.mode, self.label, page),
            figs,
            global_step=num_steps,
        )
        plt.close(
            "all"
        )  # close all current figures (SummaryWriter already closes all figures we log)
def log(
    self,
    log_writer: SummaryWriter,
    task_outputs: Optional[List[Any]],
    render: Optional[Dict[str, List[Dict[str, Any]]]],
    num_steps: int,
):
    if render is None:
        return

    if (
        self.action_names is None
        and task_outputs is not None
        and len(task_outputs) > 0
        and self.action_names_path is not None
    ):
        self.action_names = list(
            self._access(task_outputs[0], self.action_names_path)
        )

    viz_order, _ = self._auto_viz_order(task_outputs)
    if viz_order is None:
        get_logger().debug("actor viz returning without visualizing")
        return

    for page, current_ids in enumerate(viz_order):
        figs = []
        for episode_id in current_ids:
            # assert episode_id in render
            if episode_id not in render:
                get_logger().warning(
                    "skipping viz for missing episode {}".format(episode_id)
                )
                continue
            episode_src = [
                step["actor_probs"]
                for step in render[episode_id]
                if "actor_probs" in step
            ]
            assert len(episode_src) == len(render[episode_id])
            figs.append(self.make_fig(episode_src, episode_id))
        if len(figs) == 0:
            continue
        log_writer.add_figure(
            "{}/{}_group{}".format(self.mode, self.label, page),
            figs,
            global_step=num_steps,
        )
        plt.close(
            "all"
        )  # close all current figures (SummaryWriter already closes all figures we log)
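# The `self._access(task_outputs[0], self.action_names_path)` call above walks a
# key path into a nested task-output structure. A generic helper with that
# contract might look like the following sketch; `access_sketch` and the sample
# data are illustrative assumptions, not the project's actual method.
from typing import Any, Sequence


def access_sketch(datum: Any, path: Sequence[Any]) -> Any:
    """Follow a sequence of dict keys / list indices into nested data."""
    for key in path:
        datum = datum[key]
    return datum


# e.g. access_sketch({"task_info": {"action_names": ["MoveAhead"]}},
#                    ["task_info", "action_names"]) -> ["MoveAhead"]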
def log(
    self,
    log_writer: SummaryWriter,
    task_outputs: Optional[List[Any]],
    render: Optional[Dict[str, List[Dict[str, Any]]]],
    num_steps: int,
):
    if render is None:
        return

    viz_order, _ = self._auto_viz_order(task_outputs)
    if viz_order is None:
        get_logger().debug("tensor viz returning without visualizing")
        return

    for page, current_ids in enumerate(viz_order):
        figs = []
        for episode_id in current_ids:
            if episode_id not in render or len(render[episode_id]) == 0:
                get_logger().warning(
                    "skipping viz for missing or 0-length episode {}".format(
                        episode_id
                    )
                )
                continue
            episode_src = [
                step[self.datum_id]
                for step in render[episode_id]
                if self.datum_id in step
            ]
            if len(episode_src) > 0:
                # If the last episode for an inference worker has length 1,
                # there are no captured rollout sources.
                figs.append(self.make_fig(episode_src, episode_id))
        if len(figs) == 0:
            continue
        log_writer.add_figure(
            "{}/{}_group{}".format(self.mode, self.label, page),
            figs,
            global_step=num_steps,
        )
        plt.close(
            "all"
        )  # close all current figures (SummaryWriter already closes all figures we log)
def log(
    self,
    log_writer: SummaryWriter,
    task_outputs: Optional[List[Any]],
    render: Optional[Dict[str, List[Dict[str, Any]]]],
    num_steps: int,
):
    if render is None:
        return

    datum_id = self._source_to_str(self.vector_task_sources[0], is_vector_task=True)

    viz_order, _ = self._auto_viz_order(task_outputs)
    if viz_order is None:
        get_logger().debug("agent view viz returning without visualizing")
        return

    for page, current_ids in enumerate(viz_order):
        images = []  # list of lists of rgb frames
        for episode_id in current_ids:
            # assert episode_id in render
            if episode_id not in render:
                get_logger().warning(
                    "skipping viz for missing episode {}".format(episode_id)
                )
                continue
            images.append(
                [
                    self._overlay_label(step[datum_id], episode_id)
                    for step in render[episode_id]
                ]
            )
        if len(images) == 0:
            continue
        vid = self.make_vid(images)
        if vid is not None:
            log_writer.add_vid(
                "{}/{}_group{}".format(self.mode, self.label, page),
                vid,
                global_step=num_steps,
            )
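# All four `log` methods above share one pagination pattern: `_auto_viz_order`
# groups episode ids into pages, and each page is logged under a
# "{mode}/{label}_group{page}" tag. A minimal standalone sketch of that pattern
# follows; `log_figure_groups`, `max_episodes_in_group`, and the plain line
# plots are illustrative assumptions, not the project's actual helpers.
from typing import Dict, List

import matplotlib

matplotlib.use("agg")  # headless backend; figures are rendered offscreen
import matplotlib.pyplot as plt
from torch.utils.tensorboard import SummaryWriter


def log_figure_groups(
    log_writer: SummaryWriter,
    episode_values: Dict[str, List[float]],  # episode_id -> per-step values
    mode: str = "valid",
    label: str = "demo_viz",
    max_episodes_in_group: int = 8,
    num_steps: int = 0,
):
    episode_ids = sorted(episode_values)
    # Chunk episode ids into fixed-size pages.
    pages = [
        episode_ids[i : i + max_episodes_in_group]
        for i in range(0, len(episode_ids), max_episodes_in_group)
    ]
    for page, current_ids in enumerate(pages):
        figs = []
        for episode_id in current_ids:
            fig, ax = plt.subplots()
            ax.plot(episode_values[episode_id])
            ax.set_title(episode_id)
            figs.append(fig)
        # torch's add_figure accepts a list of figures for a single tag.
        log_writer.add_figure(
            "{}/{}_group{}".format(mode, label, page), figs, global_step=num_steps
        )
        plt.close("all")  # add_figure has already serialized the figures


# e.g.:
# writer = SummaryWriter("/tmp/viz_demo")
# log_figure_groups(writer, {"ep0": [0.1, 0.4], "ep1": [0.2, 0.3]})
# writer.close()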
def log(
    self,
    start_time_str: str,
    nworkers: int,
    test_steps: Sequence[int] = (),
    metrics_file: Optional[str] = None,
):
    finalized = False

    log_writer: Optional[SummaryWriter] = None
    if not self.disable_tensorboard:
        log_writer = SummaryWriter(
            log_dir=self.log_writer_path(start_time_str),
            filename_suffix="__{}_{}".format(self.mode, self.local_start_time_str),
        )

    # To aggregate/buffer metrics from trainers/testers
    collected: List[LoggingPackage] = []
    last_train_steps = 0
    last_offpolicy_steps = 0
    last_train_time = time.time()

    test_results: List[Dict] = []
    unfinished_workers = nworkers

    try:
        while True:
            try:
                package: Union[
                    LoggingPackage, Union[Tuple[str, Any], Tuple[str, Any, Any]]
                ] = self.queues["results"].get(timeout=1)

                if isinstance(package, LoggingPackage):
                    pkg_mode = package.mode

                    if pkg_mode == "train":
                        collected.append(package)
                        if len(collected) >= nworkers:
                            collected = sorted(
                                collected,
                                key=lambda pkg: (
                                    pkg.training_steps,
                                    pkg.off_policy_steps,
                                ),
                            )
                            if (
                                collected[nworkers - 1].training_steps
                                == collected[0].training_steps
                                and collected[nworkers - 1].off_policy_steps
                                == collected[0].off_policy_steps
                            ):  # ensure all nworkers have provided the same num_steps
                                (
                                    last_train_steps,
                                    last_offpolicy_steps,
                                    last_train_time,
                                ) = self.process_train_packages(
                                    log_writer=log_writer,
                                    pkgs=collected[:nworkers],
                                    last_steps=last_train_steps,
                                    last_offpolicy_steps=last_offpolicy_steps,
                                    last_time=last_train_time,
                                )
                                collected = collected[nworkers:]
                            elif len(collected) > 2 * nworkers:
                                get_logger().warning(
                                    "Unable to aggregate train packages from all {} workers "
                                    "after {} packages collected".format(
                                        nworkers, len(collected)
                                    )
                                )
                    elif pkg_mode == "valid":  # they all come from a single worker
                        if package.training_steps is not None:  # no validation samplers
                            self.process_eval_package(
                                log_writer=log_writer, pkg=package
                            )
                        if (
                            finalized and self.queues["checkpoints"].empty()
                        ):  # assume queue is actually empty after trainer finished and no checkpoints in queue
                            break
                    elif pkg_mode == "test":
                        collected.append(package)
                        if len(collected) >= nworkers:
                            collected = sorted(
                                collected, key=lambda x: x.training_steps
                            )  # sort by num_steps
                            if (
                                collected[nworkers - 1].training_steps
                                == collected[0].training_steps
                            ):  # ensure all nworkers have provided the same num_steps
                                self.process_test_packages(
                                    log_writer=log_writer,
                                    pkgs=collected[:nworkers],
                                    all_results=test_results,
                                )
                                collected = collected[nworkers:]
                                if metrics_file is not None:
                                    with open(metrics_file, "w") as f:
                                        json.dump(
                                            test_results,
                                            f,
                                            indent=4,
                                            sort_keys=True,
                                            cls=NumpyJSONEncoder,
                                        )
                                    get_logger().info(
                                        "Updated {} up to checkpoint {}".format(
                                            metrics_file,
                                            test_steps[len(test_results) - 1],
                                        )
                                    )
                    else:
                        get_logger().error(
                            f"Runner received unknown package of type {pkg_mode}"
                        )
                else:
                    pkg_mode = package[0]

                    if pkg_mode == "train_stopped":
                        if package[1] == 0:
                            finalized = True
                            if not self.running_validation:
                                get_logger().info(
                                    "Terminating runner after trainer done (no validation)"
                                )
                                break
                        else:
                            raise Exception(
                                "Train worker {} abnormally terminated".format(
                                    package[1] - 1
                                )
                            )
                    elif pkg_mode == "valid_stopped":
                        raise Exception(
                            "Valid worker {} abnormally terminated".format(
                                package[1] - 1
                            )
                        )
                    elif pkg_mode == "test_stopped":
                        if package[1] == 0:
                            unfinished_workers -= 1
                            if unfinished_workers == 0:
                                get_logger().info(
                                    "Last tester finished. Terminating"
                                )
                                finalized = True
                                break
                        else:
                            raise RuntimeError(
                                "Test worker {} abnormally terminated".format(
                                    package[1] - 1
                                )
                            )
                    else:
                        get_logger().error(
                            f"Runner received invalid package tuple {package}"
                        )
            except queue.Empty as _:
                if all(
                    p.exitcode is not None
                    for p in itertools.chain(*self.processes.values())
                ):
                    break
    except KeyboardInterrupt:
        get_logger().info("KeyboardInterrupt. Terminating runner.")
    except Exception:
        get_logger().error("Encountered Exception. Terminating runner.")
        get_logger().exception(traceback.format_exc())
    finally:
        if finalized:
            get_logger().info("Done")
        if log_writer is not None:
            log_writer.close()
        self.close()

    return test_results
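# The json.dump above relies on NumpyJSONEncoder to serialize numpy scalars and
# arrays that commonly appear in test metrics. A minimal encoder with that
# behavior might look like the sketch below; the `NumpyJSONEncoderSketch` name
# marks it as illustrative, not necessarily the project's implementation.
import json

import numpy as np


class NumpyJSONEncoderSketch(json.JSONEncoder):
    """JSON encoder that converts numpy scalars and arrays to Python types."""

    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)


# e.g. json.dumps({"reward": np.float32(0.5)}, cls=NumpyJSONEncoderSketch)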
def make_tensorboard_summary(self):
    all_experiments = list(self.experiment_to_train_events_paths_map.keys())

    for experiment_name in all_experiments:
        summary_writer = SummaryWriter(
            os.path.join(self.tensorboard_output_summary_folder, experiment_name)
        )

        test_labels = (
            sorted(list(self.test_data[experiment_name].keys()))
            if len(self.test_data) > 0
            else []
        )
        for test_label in test_labels:
            # Chained replace maps both "valid" and "test" labels onto the
            # matching "train" label ("valid" -> "test" -> "train").
            train_label = test_label.replace("valid", "test").replace(
                "test", "train"
            )
            if train_label not in self.train_data[experiment_name]:
                print(
                    f"Missing matching 'train' label {train_label} for eval label {test_label}. Skipping"
                )
                continue
            train_data = self.train_data[experiment_name][train_label]
            test_data = self.test_data[experiment_name][test_label]
            scores, times, steps = self._eval_vs_train_time_steps(
                test_data, train_data
            )
            for score, t, step in zip(scores, times, steps):
                summary_writer.add_scalar(
                    test_label, score, global_step=step, walltime=t
                )

        valid_labels = sorted(
            [
                key
                for key in list(self.train_data[experiment_name].keys())
                if "valid" in key
            ]
        )
        for valid_label in valid_labels:
            train_label = valid_label.replace("valid", "train")
            assert (
                train_label in self.train_data[experiment_name]
            ), f"Missing matching 'train' label {train_label} for valid label {valid_label}"
            train_data = self.train_data[experiment_name][train_label]
            valid_data = self.train_data[experiment_name][valid_label]
            scores, times, steps = self._eval_vs_train_time_steps(
                valid_data, train_data
            )
            for score, t, step in zip(scores, times, steps):
                summary_writer.add_scalar(
                    valid_label, score, global_step=step, walltime=t
                )

        train_labels = sorted(
            [
                key
                for key in list(self.train_data[experiment_name].keys())
                if "train" in key
            ]
        )
        for train_label in train_labels:
            scores, times, steps = self._train_vs_time_steps(
                self.train_data[experiment_name][train_label]
            )
            for score, t, step in zip(scores, times, steps):
                summary_writer.add_scalar(
                    train_label, score, global_step=step, walltime=t
                )

        summary_writer.close()
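# `_eval_vs_train_time_steps` presumably aligns eval scalars with *training*
# steps: eval events carry only their wall-clock time, so each one is matched
# to the latest train event that precedes it. The sketch below implements one
# such alignment under the assumption that both inputs are lists of
# (walltime, step, value) tuples; the tuple layout and the function name are
# assumptions for illustration, not the actual helper.
from typing import List, Tuple


def eval_vs_train_time_steps_sketch(
    eval_data: List[Tuple[float, int, float]],
    train_data: List[Tuple[float, int, float]],
) -> Tuple[List[float], List[float], List[int]]:
    train_sorted = sorted(train_data)  # ascending by walltime
    scores: List[float] = []
    times: List[float] = []
    steps: List[int] = []
    for wall, _, value in sorted(eval_data):
        # Latest train event not after this eval event.
        preceding = [t for t in train_sorted if t[0] <= wall]
        if not preceding:
            continue  # eval event predates all train events; skip it
        scores.append(value)
        times.append(wall)
        steps.append(preceding[-1][1])
    return scores, times, steps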