def _log_and_record(self, elapsed_steps: int, elapsed_time: float, global_step: int):
    """Report training throughput (steps per second) to MLflow.

    Only the chief task logs, so the metric is recorded exactly once
    per cluster; the metric name is suffixed with the attempt number
    (``cluster.n_try()``) to separate retries.
    """
    if cluster.is_chief():
        throughput = elapsed_steps / elapsed_time
        mlflow.log_metric(
            f"steps_per_sec_{cluster.n_try()}", throughput, step=global_step
        )
def _handle_events(
        events: Dict[str, Dict[str, str]],
        n_try: int
) -> Tuple[str, metrics.Metrics, ContainerLogStatus]:
    """Turn per-task lifecycle events into a report, metrics and log status.

    Args:
        events: mapping ``task -> {stage: payload}`` where the stages seen
            here are ``init`` (socket address), ``stop`` (exception text,
            empty on success), ``logs`` (container log URL) and the
            ``container_*_time`` / ``train_eval_*_time`` timestamps
            (seconds, as strings).
        n_try: application attempt number (not used in the computation;
            part of the caller-facing interface).

    Returns:
        A tuple of:
        - a multi-line human-readable report (one line per task, then the
          aggregate training/evaluation times, then any exception details),
        - a ``metrics.Metrics`` with aggregate and per-node durations,
        - a ``ContainerLogStatus`` with each task's log URL and final status.
    """
    summary = []
    errors = []
    train_start = timedelta.max
    train_stop = timedelta.min
    eval_start = timedelta.max
    eval_stop = timedelta.min
    training_ok = True
    eval_ok = True
    container_duration: Dict[str, Optional[timedelta]] = {}
    train_eval_time_per_node: Dict[str, Optional[timedelta]] = {}
    log_urls: Dict[str, str] = {}
    statuses: Dict[str, str] = {}

    for task, stages in sorted(events.items()):
        # Infer the container's final state from which stages were recorded.
        if "stop" in stages:
            status = "FAILED" if stages["stop"] else "SUCCEEDED"
        elif stages:
            status = "KILLED"
        else:
            # No events at all: the container was never started.
            status = "REQUESTED"

        sock_addr = stages.get("init", "")
        exception = stages.get("stop", "")
        logs = stages.get("logs", "")
        log_urls[task] = logs
        statuses[task] = status

        container_duration[task] = None
        if "container_start_time" in stages and "container_stop_time" in stages:
            elapsed = (float(stages["container_stop_time"])
                       - float(stages["container_start_time"]))
            container_duration[task] = timedelta(seconds=elapsed)

        train_eval_time_per_node[task] = None
        task_type = cluster.get_task_type(task)
        timed = ("train_eval_start_time" in stages
                 and "train_eval_stop_time" in stages)
        if timed and not exception:
            start = timedelta(seconds=float(stages["train_eval_start_time"]))
            stop = timedelta(seconds=float(stages["train_eval_stop_time"]))
            train_eval_time_per_node[task] = stop - start
            # Fold this task's window into the cluster-wide min/max spans.
            if cluster.is_worker(task_type) or cluster.is_chief(task_type):
                train_start = min(train_start, start)
                train_stop = max(train_stop, stop)
            elif cluster.is_evaluator(task_type):
                eval_start = min(eval_start, start)
                eval_stop = max(eval_stop, stop)
        else:
            # Missing timing info (or a failure) invalidates the aggregate
            # duration for that task's role.
            if cluster.is_worker(task_type) or cluster.is_chief(task_type):
                training_ok = False
            elif cluster.is_evaluator(task_type):
                eval_ok = False

        summary.append(
            f"{task:>16} {sock_addr} {status} {logs}"
            f" Container duration: {container_duration[task]}"
            f" Training/evaluation duration : {train_eval_time_per_node[task]}"
        )
        if exception:
            errors.append(f"Exception in task {task}:")
            errors.append(exception)

    training_time = None
    if training_ok and train_start < timedelta.max:
        training_time = train_stop - train_start
    eval_time = None
    if eval_ok and eval_start < timedelta.max:
        eval_time = eval_stop - eval_start

    summary.append(f"Training time = {training_time}")
    summary.append(f"Evaluation time = {eval_time}")

    result_metrics = metrics.Metrics(
        training_time,
        eval_time,
        container_duration,
        train_eval_time_per_node
    )
    # Leading newline, then the summary; a blank separator line is inserted
    # only when there are exception details to append.
    report = (os.linesep
              + os.linesep.join(summary)
              + os.linesep * (1 + bool(errors))
              + os.linesep.join(errors))
    return report, result_metrics, ContainerLogStatus(log_urls, statuses)