def _handle_completion(self, cluster):
    result = Status.GOOD
    self._results = ResultsAggregator.list_results(self._output)
    if len(self._results) != self._config.get_num_jobs():
        finished_jobs = {x.name for x in self._results}
        all_jobs = {x.name for x in self._config.iter_jobs()}
        missing_jobs = sorted(all_jobs.difference(finished_jobs))
        logger.error(
            "Error in result totals. num_results=%s total_num_jobs=%s",
            len(self._results),
            self._config.get_num_jobs(),
        )
        logger.error(
            "These jobs did not finish: %s. Check for process crashes or HPC timeouts.",
            missing_jobs,
        )
        result = Status.ERROR
    else:
        missing_jobs = []

    self.write_results_summary(RESULTS_FILE, missing_jobs)
    self._log_error_log_messages(self._output)

    bytes_consumed = get_directory_size_bytes(self._output, recursive=False)
    event = StructuredLogEvent(
        source="submitter",
        category=EVENT_CATEGORY_RESOURCE_UTIL,
        name=EVENT_NAME_BYTES_CONSUMED,
        message="main output directory size",
        bytes_consumed=bytes_consumed,
    )
    log_event(event)

    event = StructuredLogEvent(
        source="submitter",
        category=EVENT_CATEGORY_RESOURCE_UTIL,
        name=EVENT_NAME_SUBMIT_COMPLETED,
        message="job submission completed",
        num_jobs=self.get_num_jobs(),
    )
    log_event(event)

    group = self._config.get_default_submission_group()
    if group.submitter_params.generate_reports:
        self.generate_reports(self._output, group.submitter_params.resource_monitor_type)
    cluster.mark_complete()

    if cluster.config.pipeline_stage_num is not None:
        # The pipeline directory must be the one above this one.
        pipeline_dir = os.path.dirname(self._output)
        next_stage = cluster.config.pipeline_stage_num + 1
        cmd = (
            f"jade pipeline submit-next-stage {pipeline_dir} "
            f"--stage-num={next_stage} "
            f"--return-code={result.value}"
        )
        run_command(cmd)

    return result
def run(self):
    assert self._submission_group is not None
    sing_params = self._submission_group.submitter_params.singularity_params
    if sing_params and sing_params.enabled:
        script = self._make_singularity_command()
    else:
        script = self._run_script

    job_id, result = self._mgr.submit(
        self._output,
        self._name,
        str(script),
        self._submission_group.name,
        dry_run=self._dry_run,
    )
    if result != Status.GOOD:
        # TODO: cancel or fail all jobs in the batch
        logger.error("Failed to submit name=%s", self._name)
        self._return_code = 1
        self._is_complete = True
        return Status.ERROR

    self._job_id = job_id
    event = StructuredLogEvent(
        source=self._name,
        category=EVENT_CATEGORY_HPC,
        name=EVENT_NAME_HPC_JOB_ASSIGNED,
        message="HPC job assigned",
        job_id=self._job_id,
    )
    log_event(event)
    logger.info("Assigned job_ID=%s name=%s", self._job_id, self._name)
    return Status.GOOD
def _complete(self):
    ret = self._pipe.returncode
    exec_time_s = time.time() - self._start_time

    job_filename = self._job.name
    illegal_chars = ("/", "\\", ":")
    for char in illegal_chars:
        job_filename = job_filename.replace(char, "-")

    status = "finished"
    output_dir = os.path.join(self._output, JOBS_OUTPUT_DIR, self._job.name)
    bytes_consumed = get_directory_size_bytes(output_dir)
    event = StructuredLogEvent(
        source=self._job.name,
        category=EVENT_CATEGORY_RESOURCE_UTIL,
        name=EVENT_NAME_BYTES_CONSUMED,
        message="job output directory size",
        bytes_consumed=bytes_consumed,
    )
    log_event(event)
    result = Result(self._job.name, ret, status, exec_time_s)
    ResultsAggregator.append(self._results_filename, result)

    logger.info(
        "Job %s completed return_code=%s exec_time_s=%s",
        self._job.name,
        ret,
        exec_time_s,
    )
def log_disk_stats(self):
    """Logs disk stats."""
    data = psutil.disk_io_counters()
    stats = {
        "elapsed_seconds": time.time() - self._last_disk_check_time,
    }
    for stat in self.DISK_STATS:
        stats[stat] = getattr(data, stat, 0) - getattr(self, stat, 0)
    stats["read MB/s"] = self._mb_per_sec(stats["read_bytes"], stats["elapsed_seconds"])
    stats["write MB/s"] = self._mb_per_sec(stats["write_bytes"], stats["elapsed_seconds"])
    stats["read IOPS"] = float(stats["read_count"]) / stats["elapsed_seconds"]
    stats["write IOPS"] = float(stats["write_count"]) / stats["elapsed_seconds"]
    self._update_disk_stats(data)
    log_event(
        StructuredLogEvent(
            source=self._name,
            category=EVENT_CATEGORY_RESOURCE_UTIL,
            name=EVENT_NAME_DISK_STATS,
            message="Node disk stats update",
            **stats,
        )
    )
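# The throughput fields above rely on a _mb_per_sec helper that is not shown in
# this section. The sketch below is only an assumption about its behavior (a byte
# delta divided by the elapsed interval, scaled to megabytes), not the actual
# implementation.
def _mb_per_sec(self, num_bytes, elapsed_seconds):
    # Guard against a zero or negative interval to avoid ZeroDivisionError.
    if elapsed_seconds <= 0:
        return 0.0
    return num_bytes / (1024 * 1024) / elapsed_seconds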
def _log_submission_event(self, submission_group, batch):
    event = StructuredLogEvent(
        source=submission_group.submitter_params.hpc_config.job_prefix,
        category=EVENT_CATEGORY_HPC,
        name=EVENT_NAME_HPC_SUBMIT,
        message="Submitted HPC batch",
        batch_size=batch.num_jobs,
        per_node_batch_size=submission_group.submitter_params.per_node_batch_size,
    )
    log_event(event)
def log_memory_stats(self):
    """Logs memory resource stats information."""
    mem_stats = psutil.virtual_memory()._asdict()
    log_event(
        StructuredLogEvent(
            source=self._name,
            category=EVENT_CATEGORY_RESOURCE_UTIL,
            name=EVENT_NAME_MEMORY_STATS,
            message="Node memory stats update",
            **mem_stats,
        )
    )
def log_cpu_stats(self):
    """Logs CPU resource stats information."""
    cpu_stats = self._monitor.get_cpu_stats()
    log_event(
        StructuredLogEvent(
            source=self.name,
            category=EVENT_CATEGORY_RESOURCE_UTIL,
            name=EVENT_NAME_CPU_STATS,
            message="Node CPU stats update",
            **cpu_stats,
        )
    )
def log_disk_stats(self):
    """Logs disk stats."""
    stats = self._monitor.get_disk_stats()
    log_event(
        StructuredLogEvent(
            source=self.name,
            category=EVENT_CATEGORY_RESOURCE_UTIL,
            name=EVENT_NAME_DISK_STATS,
            message="Node disk stats update",
            **stats,
        )
    )
def log_memory_stats(self):
    """Logs memory resource stats information."""
    mem_stats = self._monitor.get_memory_stats()
    log_event(
        StructuredLogEvent(
            source=self.name,
            category=EVENT_CATEGORY_RESOURCE_UTIL,
            name=EVENT_NAME_MEMORY_STATS,
            message="Node memory stats update",
            **mem_stats,
        )
    )
def log_network_stats(self):
    """Logs network resource stats information."""
    stats = self._monitor.get_network_stats()
    log_event(
        StructuredLogEvent(
            source=self.name,
            category=EVENT_CATEGORY_RESOURCE_UTIL,
            name=EVENT_NAME_NETWORK_STATS,
            message="Node net stats update",
            **stats,
        )
    )
def test_structured_event__create():
    """Test class initialization and methods."""
    event = StructuredLogEvent(
        source="job_1",
        category="ParameterError",
        name="test-error",
        message="Something happens",
        country="Canada",
        foo="foo info",
        bar="bar info",
    )
    assert "timestamp" in str(event)
    assert "source" in str(event)
    assert "foo" in event.data
    assert "bar" in event.data
def log_cpu_stats(self):
    """Logs CPU resource stats information."""
    cpu_stats = psutil.cpu_times_percent()._asdict()
    cpu_stats["cpu_percent"] = psutil.cpu_percent()
    log_event(
        StructuredLogEvent(
            source=self._name,
            category=EVENT_CATEGORY_RESOURCE_UTIL,
            name=EVENT_NAME_CPU_STATS,
            message="Node CPU stats update",
            **cpu_stats,
        )
    )
def run(self):
    job_id, result = self._mgr.submit(self._output, self._name, self._run_script)
    self._is_pending = True
    if result != Status.GOOD:
        raise ExecutionError(f"Failed to submit name={self._name}")

    self._job_id = job_id
    event = StructuredLogEvent(
        source=self._name,
        category=EVENT_CATEGORY_HPC,
        name=EVENT_NAME_HPC_JOB_ASSIGNED,
        message="HPC job assigned",
        job_id=self._job_id,
    )
    log_event(event)
    logger.info("Assigned job_ID=%s name=%s", self._job_id, self._name)
def find_error_log_messages(directory):
    """Parse output log files for error messages.

    Parameters
    ----------
    directory : str
        output directory

    Yields
    ------
    StructuredLogEvent

    """
    substrings = (
        "DUE TO TIME LIMIT",  # includes slurmstepd, but check this first
        "srun",
        "slurmstepd",
        "Traceback",
    )
    filenames = [os.path.join(directory, x) for x in os.listdir(directory) if x.endswith(".e")]
    if not filenames:
        return

    for line in fileinput.input(filenames):
        for substring in substrings:
            if substring in line:
                event = StructuredLogEvent(
                    source="submitter",
                    category=EVENT_CATEGORY_ERROR,
                    name=EVENT_NAME_ERROR_LOG,
                    message="Detected error message in log.",
                    error=substring,
                    filename=fileinput.filename(),
                    line_number=fileinput.lineno(),
                    text=line.strip(),
                )
                yield event
                # Only find one match in a single line.
                break
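# A minimal usage sketch, not part of the original module: iterate the generator
# above and forward each detected error to the structured event log. The "output"
# directory value is a hypothetical placeholder.
for error_event in find_error_log_messages("output"):
    log_event(error_event)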
def log_network_stats(self):
    """Logs network resource stats information."""
    data = psutil.net_io_counters()
    stats = {
        "elapsed_seconds": time.time() - self._last_net_check_time,
    }
    for stat in self.NET_STATS:
        stats[stat] = getattr(data, stat, 0) - getattr(self, stat, 0)
    stats["recv MB/s"] = self._mb_per_sec(stats["bytes_recv"], stats["elapsed_seconds"])
    stats["sent MB/s"] = self._mb_per_sec(stats["bytes_sent"], stats["elapsed_seconds"])
    self._update_net_stats(data)
    log_event(
        StructuredLogEvent(
            source=self._name,
            category=EVENT_CATEGORY_RESOURCE_UTIL,
            name=EVENT_NAME_NETWORK_STATS,
            message="Node net stats update",
            **stats,
        )
    )
def is_complete(self):
    status = self._mgr.check_status(job_id=self._job_id)
    if status != self._last_status:
        logger.info(
            "Submission %s %s changed status from %s to %s",
            self._name,
            self._job_id,
            self._last_status,
            status,
        )
        event = StructuredLogEvent(
            source=self._name,
            category=EVENT_CATEGORY_HPC,
            name=EVENT_NAME_HPC_JOB_STATE_CHANGE,
            message="HPC job state change",
            job_id=self._job_id,
            old_state=self._last_status.value,
            new_state=status.value,
        )
        log_event(event)
        self._last_status = status

    if status in (HpcJobStatus.COMPLETE, HpcJobStatus.NONE):
        self._is_pending = False

    return not self._is_pending
def _complete(self):
    self._return_code = self._pipe.returncode
    exec_time_s = time.time() - self._start_time
    if not self._is_manager_node:
        # This will happen on a multi-node job. Don't complete it multiple times.
        logger.info(
            "Job %s completed on non-manager node return_code=%s exec_time_s=%s",
            self._job.name,
            self._return_code,
            exec_time_s,
        )
        return

    status = JobCompletionStatus.FINISHED
    output_dir = self._output / JOBS_OUTPUT_DIR / self._job.name
    bytes_consumed = get_directory_size_bytes(output_dir)
    event = StructuredLogEvent(
        source=self._job.name,
        category=EVENT_CATEGORY_RESOURCE_UTIL,
        name=EVENT_NAME_BYTES_CONSUMED,
        message="job output directory size",
        bytes_consumed=bytes_consumed,
    )
    log_event(event)
    result = Result(
        self._job.name, self._return_code, status, exec_time_s, hpc_job_id=self._hpc_job_id
    )
    ResultsAggregator.append(self._output, result, batch_id=self._batch_id)

    logger.info(
        "Job %s completed return_code=%s exec_time_s=%s hpc_job_id=%s",
        self._job.name,
        self._return_code,
        exec_time_s,
        self._hpc_job_id,
    )
def run(
    self,
    output,
    queue_depth,
    per_node_batch_size,
    num_processes,
    poll_interval=60,
    try_add_blocked_jobs=False,
    verbose=False,
):
    """Run all jobs defined in the configuration on the HPC."""
    queue = JobQueue(queue_depth, poll_interval=poll_interval)
    jobs = list(self._config.iter_jobs())
    while jobs:
        self._update_completed_jobs(jobs)
        batch = _BatchJobs()
        jobs_to_pop = []
        num_blocked = 0
        for i, job in enumerate(jobs):
            if batch.is_job_blocked(job, try_add_blocked_jobs):
                num_blocked += 1
            else:
                batch.append(job)
                jobs_to_pop.append(i)
                if batch.num_jobs >= per_node_batch_size:
                    break

        if batch.num_jobs > 0:
            async_submitter = self._make_async_submitter(
                batch.serialize(),
                num_processes,
                output,
                verbose,
            )
            queue.submit(async_submitter)

            # It might be better to delay submission for a limited number
            # of rounds if there are blocked jobs and the batch isn't full.
            # We can look at these events on our runs to see how this
            # logic is working with our jobs.
            event = StructuredLogEvent(
                source=self._name,
                category=EVENT_CATEGORY_HPC,
                name=EVENT_NAME_HPC_SUBMIT,
                message="Submitted HPC batch",
                batch_size=batch.num_jobs,
                num_blocked=num_blocked,
                per_node_batch_size=per_node_batch_size,
            )
            log_event(event)

            for i in reversed(jobs_to_pop):
                jobs.pop(i)
        else:
            logger.debug("No jobs are ready for submission")

        logger.debug("num_submitted=%s num_blocked=%s", batch.num_jobs, num_blocked)
        if batch.num_jobs > 0 and not queue.is_full():
            # Keep submitting.
            continue

        # TODO: this will cause up to <queue_depth> slurm status commands
        # every poll. We could send one command, get all statuses, and
        # share it among the submitters.
        queue.process_queue()
        time.sleep(poll_interval)

    queue.wait()
def submit_jobs(
    self,
    name="job",
    per_node_batch_size=DEFAULTS["per_node_batch_size"],
    max_nodes=DEFAULTS["max_nodes"],
    force_local=False,
    verbose=False,
    poll_interval=DEFAULTS["poll_interval"],
    num_processes=None,
    previous_results=None,
    reports=True,
    try_add_blocked_jobs=False,
):
    """Submit simulations. Auto-detect whether the current system is an HPC
    and submit to its queue. Otherwise, run locally.

    Parameters
    ----------
    name : str
        Batch name, applies to HPC job submission only.
    per_node_batch_size : int
        Number of jobs to run on one node in one batch.
    max_nodes : int
        Max number of node submission requests to make in parallel.
    force_local : bool
        If on HPC, run jobs through subprocess as if local.
    verbose : bool
        Enable debug logging.
    poll_interval : int
        Interval in seconds on which to poll jobs.
    num_processes : int
        Number of processes to run in parallel; defaults to num CPUs.
    previous_results : list
        Results from a previous submission to include in the results file.
    reports : bool
        Generate reports after execution.
    try_add_blocked_jobs : bool
        Try to add blocked jobs to a batch.

    Returns
    -------
    Status

    """
    logger.info("Submit %s jobs for execution.", self._config.get_num_jobs())
    logger.info("JADE version %s", jade.version.__version__)
    registry = Registry()
    loggers = registry.list_loggers()
    logger.info("Registered modules for logging: %s", ", ".join(loggers))
    self._save_repository_info(registry)

    self._config.check_job_dependencies()

    self._hpc = HpcManager(self._hpc_config_file, self._output)
    result = Status.GOOD

    # If an events summary file exists, it is invalid.
    events_file = os.path.join(self._output, EVENTS_FILENAME)
    if os.path.exists(events_file):
        os.remove(events_file)

    start_time = time.time()
    if self._hpc.hpc_type == HpcType.LOCAL or force_local:
        runner = JobRunner(self._config_file, output=self._output)
        result = runner.run_jobs(verbose=verbose, num_processes=num_processes)
    else:
        self._submit_to_hpc(
            name,
            max_nodes,
            per_node_batch_size,
            verbose,
            poll_interval,
            num_processes,
            try_add_blocked_jobs,
        )
        results_summary = ResultsAggregatorSummary(self._results_dir)
        self._results = results_summary.get_results()
        if len(self._results) != self._config.get_num_jobs():
            logger.error(
                "Number of results doesn't match number of jobs: "
                "results=%s jobs=%s. Check for process crashes "
                "or HPC timeouts.",
                len(self._results),
                self._config.get_num_jobs(),
            )
            result = Status.ERROR
        if previous_results:
            self._results += previous_results
        self.write_results(RESULTS_FILE)
        results_summary.delete_files()
        shutil.rmtree(self._results_dir)

    self._log_error_log_messages(self._output)

    bytes_consumed = get_directory_size_bytes(self._output, recursive=False)
    event = StructuredLogEvent(
        source="submitter",
        category=EVENT_CATEGORY_RESOURCE_UTIL,
        name=EVENT_NAME_BYTES_CONSUMED,
        message="main output directory size",
        bytes_consumed=bytes_consumed,
    )
    log_event(event)

    event = StructuredLogEvent(
        source="submitter",
        category=EVENT_CATEGORY_RESOURCE_UTIL,
        name=EVENT_NAME_CONFIG_EXEC_SUMMARY,
        message="config execution summary",
        config_execution_time=time.time() - start_time,
        num_jobs=self.get_num_jobs(),
    )
    log_event(event)

    if reports:
        self.generate_reports(self._output)

    return result
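# A minimal usage sketch for the method above, not taken from the original code.
# The JobSubmitter class name and constructor arguments are assumptions made for
# illustration only; the config file path and output directory are hypothetical
# placeholders.
submitter = JobSubmitter("config.json", output="output")  # hypothetical construction
status = submitter.submit_jobs(per_node_batch_size=8, poll_interval=30, reports=True)
if status != Status.GOOD:
    logger.error("Job submission failed: %s", status)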
def submit_jobs(self, cluster, force_local=False):
    """Submit simulations. Auto-detect whether the current system is an HPC
    and submit to its queue. Otherwise, run locally.

    Parameters
    ----------
    cluster : Cluster
    force_local : bool
        If on HPC, run jobs through subprocess as if local.

    Returns
    -------
    Status

    """
    if self._is_new:
        logger.info("Submit %s jobs for execution.", self._config.get_num_jobs())
        logger.info("JADE version %s", jade.version.__version__)
        registry = Registry()
        loggers = registry.list_loggers()
        logger.info("Registered modules for logging: %s", ", ".join(loggers))
        self._save_repository_info(registry)
        ResultsAggregator.create(self._output)

        # If an events summary file exists, it is invalid.
        events_file = os.path.join(self._output, EVENTS_FILENAME)
        if os.path.exists(events_file):
            os.remove(events_file)

        event = StructuredLogEvent(
            source="submitter",
            category=EVENT_CATEGORY_RESOURCE_UTIL,
            name=EVENT_NAME_SUBMIT_COMPLETED,
            message="job submission started",
            num_jobs=self.get_num_jobs(),
        )
        log_event(event)

        os.environ["JADE_RUNTIME_OUTPUT"] = self._output
        if self._config.setup_command is not None:
            cmd = f"JADE_RUNTIME_OUTPUT={self._output} {self._config.setup_command}"
            logger.info("Running setup command: %s", cmd)
            check_run_command(self._config.setup_command)
    else:
        self._handle_submission_groups()

    result = Status.IN_PROGRESS
    group = self._config.get_default_submission_group()
    groups = make_submission_group_lookup(cluster.config.submission_groups)
    self._hpc = HpcManager(groups, self._output)

    if self._hpc.hpc_type == HpcType.LOCAL or force_local:
        runner = JobRunner(self._config_file, output=self._output)
        num_processes = group.submitter_params.num_processes
        verbose = group.submitter_params.verbose
        result = runner.run_jobs(verbose=verbose, num_processes=num_processes)
        agg = ResultsAggregator.load(self._output)
        agg.process_results()
        is_complete = True
    else:
        is_complete = self._submit_to_hpc(cluster)

    if is_complete:
        result = self._handle_completion(cluster)

    return result