Code example #1
    def _handle_completion(self, cluster):
        result = Status.GOOD
        self._results = ResultsAggregator.list_results(self._output)
        if len(self._results) != self._config.get_num_jobs():
            finished_jobs = {x.name for x in self._results}
            all_jobs = {x.name for x in self._config.iter_jobs()}
            missing_jobs = sorted(all_jobs.difference(finished_jobs))
            logger.error(
                "Error in result totals. num_results=%s total_num_jobs=%s",
                len(self._results),
                self._config.get_num_jobs(),
            )
            logger.error(
                "These jobs did not finish: %s. Check for process crashes or HPC timeouts.",
                missing_jobs,
            )
            result = Status.ERROR
        else:
            missing_jobs = []

        self.write_results_summary(RESULTS_FILE, missing_jobs)
        self._log_error_log_messages(self._output)

        bytes_consumed = get_directory_size_bytes(self._output,
                                                  recursive=False)
        event = StructuredLogEvent(
            source="submitter",
            category=EVENT_CATEGORY_RESOURCE_UTIL,
            name=EVENT_NAME_BYTES_CONSUMED,
            message="main output directory size",
            bytes_consumed=bytes_consumed,
        )
        log_event(event)

        event = StructuredLogEvent(
            source="submitter",
            category=EVENT_CATEGORY_RESOURCE_UTIL,
            name=EVENT_NAME_SUBMIT_COMPLETED,
            message="job submission completed",
            num_jobs=self.get_num_jobs(),
        )
        log_event(event)

        group = self._config.get_default_submission_group()
        if group.submitter_params.generate_reports:
            self.generate_reports(self._output,
                                  group.submitter_params.resource_monitor_type)
        cluster.mark_complete()

        if cluster.config.pipeline_stage_num is not None:
            # The pipeline directory must be the one above this one.
            pipeline_dir = os.path.dirname(self._output)
            next_stage = cluster.config.pipeline_stage_num + 1
            cmd = (f"jade pipeline submit-next-stage {pipeline_dir} "
                   f"--stage-num={next_stage} "
                   f"--return-code={result.value}")
            run_command(cmd)

        return result
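
Every example on this page follows the same pattern: construct a StructuredLogEvent with the fixed fields (source, category, name, message) plus arbitrary keyword data, then pass it to log_event. A minimal sketch of that pattern, assuming the typical jade imports (module paths may differ between the forks and versions excerpted here):

    from jade.events import StructuredLogEvent, EVENT_CATEGORY_RESOURCE_UTIL
    from jade.loggers import log_event

    event = StructuredLogEvent(
        source="my_component",            # who emitted the event
        category=EVENT_CATEGORY_RESOURCE_UTIL,
        name="example_event",             # illustrative name, not a jade constant
        message="human-readable summary",
        custom_field=42,                  # extra kwargs are stored in event.data
    )
    log_event(event)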
Code example #2
File: hpc_submitter.py Project: NREL/jade
    def run(self):
        assert self._submission_group is not None
        sing_params = self._submission_group.submitter_params.singularity_params
        if sing_params and sing_params.enabled:
            script = self._make_singularity_command()
        else:
            script = self._run_script
        job_id, result = self._mgr.submit(
            self._output,
            self._name,
            str(script),
            self._submission_group.name,
            dry_run=self._dry_run,
        )
        if result != Status.GOOD:
            # TODO: cancel or fail all jobs in the batch
            logger.error("Failed to submit name=%s", self._name)
            self._return_code = 1
            self._is_complete = True
            return Status.ERROR

        self._job_id = job_id
        event = StructuredLogEvent(
            source=self._name,
            category=EVENT_CATEGORY_HPC,
            name=EVENT_NAME_HPC_JOB_ASSIGNED,
            message="HPC job assigned",
            job_id=self._job_id,
        )
        log_event(event)
        logger.info("Assigned job_ID=%s name=%s", self._job_id, self._name)
        return Status.GOOD
Code example #3
File: dispatchable_job.py Project: daniel-thom/jade
    def _complete(self):
        ret = self._pipe.returncode
        exec_time_s = time.time() - self._start_time

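        # Sanitize the job name so it can safely be used in a filename.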
        job_filename = self._job.name
        illegal_chars = ("/", "\\", ":")
        for char in illegal_chars:
            job_filename = job_filename.replace(char, "-")

        status = "finished"
        output_dir = os.path.join(self._output, JOBS_OUTPUT_DIR,
                                  self._job.name)
        bytes_consumed = get_directory_size_bytes(output_dir)
        event = StructuredLogEvent(
            source=self._job.name,
            category=EVENT_CATEGORY_RESOURCE_UTIL,
            name=EVENT_NAME_BYTES_CONSUMED,
            message="job output directory size",
            bytes_consumed=bytes_consumed,
        )
        log_event(event)
        result = Result(self._job.name, ret, status, exec_time_s)
        ResultsAggregator.append(self._results_filename, result)

        logger.info("Job %s completed return_code=%s exec_time_s=%s",
                    self._job.name, ret, exec_time_s)
Code example #4
File: resource_monitor.py Project: daniel-thom/jade
    def log_disk_stats(self):
        """Logs disk stats."""
        data = psutil.disk_io_counters()
        stats = {
            "elapsed_seconds": time.time() - self._last_disk_check_time,
        }
        for stat in self.DISK_STATS:
            stats[stat] = getattr(data, stat, 0) - getattr(self, stat, 0)
        stats["read MB/s"] = self._mb_per_sec(stats["read_bytes"],
                                              stats["elapsed_seconds"])
        stats["write MB/s"] = self._mb_per_sec(stats["write_bytes"],
                                               stats["elapsed_seconds"])
        stats["read IOPS"] = float(
            stats["read_count"]) / stats["elapsed_seconds"]
        stats["write IOPS"] = float(
            stats["write_count"]) / stats["elapsed_seconds"]
        self._update_disk_stats(data)

        log_event(
            StructuredLogEvent(
                source=self._name,
                category=EVENT_CATEGORY_RESOURCE_UTIL,
                name=EVENT_NAME_DISK_STATS,
                message="Node disk stats update",
                **stats,
            ))
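
The per-interval numbers above are deltas: psutil's counters are cumulative, so the method subtracts the values saved on the previous check. The same rate computation as a self-contained sketch that samples twice instead of caching on self (the function name is mine, not jade's):

    import time

    import psutil

    def disk_rates(interval_s=1.0):
        """Sample cumulative disk I/O counters twice and return per-second rates."""
        before = psutil.disk_io_counters()
        time.sleep(interval_s)
        after = psutil.disk_io_counters()
        mb = 1024 * 1024
        return {
            "read MB/s": (after.read_bytes - before.read_bytes) / mb / interval_s,
            "write MB/s": (after.write_bytes - before.write_bytes) / mb / interval_s,
            "read IOPS": (after.read_count - before.read_count) / interval_s,
            "write IOPS": (after.write_count - before.write_count) / interval_s,
        }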
Code example #5
File: hpc_submitter.py Project: NREL/jade
 def _log_submission_event(self, submission_group, batch):
     event = StructuredLogEvent(
         source=submission_group.submitter_params.hpc_config.job_prefix,
         category=EVENT_CATEGORY_HPC,
         name=EVENT_NAME_HPC_SUBMIT,
         message="Submitted HPC batch",
         batch_size=batch.num_jobs,
         per_node_batch_size=submission_group.submitter_params.per_node_batch_size,
     )
     log_event(event)
Code example #6
File: resource_monitor.py Project: daniel-thom/jade
 def log_memory_stats(self):
     """Logs memory resource stats information."""
     mem_stats = psutil.virtual_memory()._asdict()
     log_event(
         StructuredLogEvent(
             source=self._name,
             category=EVENT_CATEGORY_RESOURCE_UTIL,
             name=EVENT_NAME_MEMORY_STATS,
             message="Node memory stats update",
             **mem_stats,
         ))
Code example #7
File: resource_monitor.py Project: jgu2/jade
 def log_cpu_stats(self):
     """Logs CPU resource stats information."""
     cpu_stats = self._monitor.get_cpu_stats()
     log_event(
         StructuredLogEvent(
             source=self.name,
             category=EVENT_CATEGORY_RESOURCE_UTIL,
             name=EVENT_NAME_CPU_STATS,
             message="Node CPU stats update",
             **cpu_stats,
         )
     )
Code example #8
File: resource_monitor.py Project: jgu2/jade
 def log_disk_stats(self):
     """Logs disk stats."""
     stats = self._monitor.get_disk_stats()
     log_event(
         StructuredLogEvent(
             source=self.name,
             category=EVENT_CATEGORY_RESOURCE_UTIL,
             name=EVENT_NAME_DISK_STATS,
             message="Node disk stats update",
             **stats,
         )
     )
Code example #9
File: resource_monitor.py Project: jgu2/jade
 def log_memory_stats(self):
     """Logs memory resource stats information."""
     mem_stats = self._monitor.get_memory_stats()
     log_event(
         StructuredLogEvent(
             source=self.name,
             category=EVENT_CATEGORY_RESOURCE_UTIL,
             name=EVENT_NAME_MEMORY_STATS,
             message="Node memory stats update",
             **mem_stats,
         )
     )
Code example #10
File: resource_monitor.py Project: jgu2/jade
 def log_network_stats(self):
     """Logs memory resource stats information."""
     stats = self._monitor.get_network_stats()
     log_event(
         StructuredLogEvent(
             source=self.name,
             category=EVENT_CATEGORY_RESOURCE_UTIL,
             name=EVENT_NAME_NETWORK_STATS,
             message="Node net stats update",
             **stats,
         )
     )
Code example #11
def test_structured_event__create():
    """Test class initialization and methods"""
    event = StructuredLogEvent(source="job_1",
                               category="ParameterError",
                               name="test-error",
                               message="Something happens",
                               country="Canada",
                               foo="foo info",
                               bar="bar info")
    assert "timestamp" in str(event)
    assert "source" in str(event)
    assert "foo" in event.data
    assert "bar" in event.data
Code example #12
File: resource_monitor.py Project: daniel-thom/jade
    def log_cpu_stats(self):
        """Logs CPU resource stats information."""
        cpu_stats = psutil.cpu_times_percent()._asdict()
        cpu_stats["cpu_percent"] = psutil.cpu_percent()

        log_event(
            StructuredLogEvent(
                source=self._name,
                category=EVENT_CATEGORY_RESOURCE_UTIL,
                name=EVENT_NAME_CPU_STATS,
                message="Node CPU stats update",
                **cpu_stats,
            ))
Code example #13
File: hpc_submitter.py Project: daniel-thom/jade
    def run(self):
        job_id, result = self._mgr.submit(self._output, self._name,
                                          self._run_script)
        self._is_pending = True
        if result != Status.GOOD:
            raise ExecutionError("Failed to submit name={self._name}")

        self._job_id = job_id
        event = StructuredLogEvent(
            source=self._name,
            category=EVENT_CATEGORY_HPC,
            name=EVENT_NAME_HPC_JOB_ASSIGNED,
            message="HPC job assigned",
            job_id=self._job_id,
        )
        log_event(event)
        logger.info("Assigned job_ID=%s name=%s", self._job_id, self._name)
Code example #14
    def find_error_log_messages(directory):
        """Parse output log files for error messages

        Parameters
        ----------
        directory : str
            output directory

        """
        substrings = (
            "DUE TO TIME LIMIT",  # includes slurmstepd, but check this first
            "srun",
            "slurmstepd",
            "Traceback",
        )

        filenames = [
            os.path.join(directory, x) for x in os.listdir(directory)
            if x.endswith(".e")
        ]

        if not filenames:
            return

        for line in fileinput.input(filenames):
            for substring in substrings:
                if substring in line:
                    event = StructuredLogEvent(
                        source="submitter",
                        category=EVENT_CATEGORY_ERROR,
                        name=EVENT_NAME_ERROR_LOG,
                        message="Detected error message in log.",
                        error=substring,
                        filename=fileinput.filename(),
                        line_number=fileinput.lineno(),
                        text=line.strip(),
                    )
                    yield event
                    # Only find one match in a single line.
                    break
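
Because find_error_log_messages is a generator, it does nothing until iterated. A minimal consumption sketch, mirroring how the submitter's _log_error_log_messages helper in example #1 would use it (the wrapper name here is illustrative):

    def log_error_messages(directory):
        """Scan the .e log files under directory and log each detected error."""
        for event in find_error_log_messages(directory):
            log_event(event)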
Code example #15
File: resource_monitor.py Project: daniel-thom/jade
    def log_network_stats(self):
        """Logs memory resource stats information."""
        data = psutil.net_io_counters()
        stats = {
            "elapsed_seconds": time.time() - self._last_net_check_time,
        }
        for stat in self.NET_STATS:
            stats[stat] = getattr(data, stat, 0) - getattr(self, stat, 0)
        stats["recv MB/s"] = self._mb_per_sec(stats["bytes_recv"],
                                              stats["elapsed_seconds"])
        stats["sent MB/s"] = self._mb_per_sec(stats["bytes_sent"],
                                              stats["elapsed_seconds"])
        self._update_net_stats(data)

        log_event(
            StructuredLogEvent(
                source=self._name,
                category=EVENT_CATEGORY_RESOURCE_UTIL,
                name=EVENT_NAME_NETWORK_STATS,
                message="Node net stats update",
                **stats,
            ))
Code example #16
File: hpc_submitter.py Project: daniel-thom/jade
    def is_complete(self):
        status = self._mgr.check_status(job_id=self._job_id)

        if status != self._last_status:
            logger.info("Submission %s %s changed status from %s to %s",
                        self._name, self._job_id, self._last_status, status)
            event = StructuredLogEvent(
                source=self._name,
                category=EVENT_CATEGORY_HPC,
                name=EVENT_NAME_HPC_JOB_STATE_CHANGE,
                message="HPC job state change",
                job_id=self._job_id,
                old_state=self._last_status.value,
                new_state=status.value,
            )
            log_event(event)
            self._last_status = status

        if status in (HpcJobStatus.COMPLETE, HpcJobStatus.NONE):
            self._is_pending = False

        return not self._is_pending
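
The method emits an event only on transitions by comparing the polled status against the cached one. That check, stripped to its core (an illustrative helper, not jade API):

    class StatusTracker:
        """Tracks the last seen status and reports transitions."""

        def __init__(self, initial):
            self._last_status = initial

        def changed(self, new_status):
            if new_status == self._last_status:
                return False
            self._last_status = new_status
            return True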
Code example #17
File: async_cli_command.py Project: NREL/jade
    def _complete(self):
        self._return_code = self._pipe.returncode
        exec_time_s = time.time() - self._start_time

        if not self._is_manager_node:
            # This will happen on a multi-node job. Don't complete it multiple times.
            logger.info(
                "Job %s completed on non-manager node return_code=%s exec_time_s=%s",
                self._job.name,
                self._return_code,
                exec_time_s,
            )
            return

        status = JobCompletionStatus.FINISHED
        output_dir = self._output / JOBS_OUTPUT_DIR / self._job.name
        bytes_consumed = get_directory_size_bytes(output_dir)
        event = StructuredLogEvent(
            source=self._job.name,
            category=EVENT_CATEGORY_RESOURCE_UTIL,
            name=EVENT_NAME_BYTES_CONSUMED,
            message="job output directory size",
            bytes_consumed=bytes_consumed,
        )
        log_event(event)
        result = Result(
            self._job.name, self._return_code, status, exec_time_s, hpc_job_id=self._hpc_job_id
        )
        ResultsAggregator.append(self._output, result, batch_id=self._batch_id)

        logger.info(
            "Job %s completed return_code=%s exec_time_s=%s hpc_job_id=%s",
            self._job.name,
            self._return_code,
            exec_time_s,
            self._hpc_job_id,
        )
Code example #18
File: hpc_submitter.py Project: daniel-thom/jade
    def run(self,
            output,
            queue_depth,
            per_node_batch_size,
            num_processes,
            poll_interval=60,
            try_add_blocked_jobs=False,
            verbose=False):
        """Run all jobs defined in the configuration on the HPC."""
        queue = JobQueue(queue_depth, poll_interval=poll_interval)
        jobs = list(self._config.iter_jobs())
        while jobs:
            self._update_completed_jobs(jobs)
            batch = _BatchJobs()
            jobs_to_pop = []
            num_blocked = 0
            for i, job in enumerate(jobs):
                if batch.is_job_blocked(job, try_add_blocked_jobs):
                    num_blocked += 1
                else:
                    batch.append(job)
                    jobs_to_pop.append(i)
                    if batch.num_jobs >= per_node_batch_size:
                        break

            if batch.num_jobs > 0:
                async_submitter = self._make_async_submitter(
                    batch.serialize(),
                    num_processes,
                    output,
                    verbose,
                )
                queue.submit(async_submitter)

                # It might be better to delay submission for a limited number
                # of rounds if there are blocked jobs and the batch isn't full.
                # We can look at these events on our runs to see how this
                # logic is working with our jobs.
                event = StructuredLogEvent(
                    source=self._name,
                    category=EVENT_CATEGORY_HPC,
                    name=EVENT_NAME_HPC_SUBMIT,
                    message="Submitted HPC batch",
                    batch_size=batch.num_jobs,
                    num_blocked=num_blocked,
                    per_node_batch_size=per_node_batch_size,
                )
                log_event(event)
                for i in reversed(jobs_to_pop):
                    jobs.pop(i)
            else:
                logger.debug("No jobs are ready for submission")

            logger.debug("num_submitted=%s num_blocked=%s", batch.num_jobs,
                         num_blocked)

            if batch.num_jobs > 0 and not queue.is_full():
                # Keep submitting.
                continue

            # TODO: this will cause up to <queue_depth> slurm status commands
            # every poll.  We could send one command, get all statuses, and
            # share it among the submitters.
            queue.process_queue()
            time.sleep(poll_interval)

        queue.wait()
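
The inner for-loop implements dependency-aware batching: walk the remaining jobs, count the blocked ones, and fill the batch until it reaches per_node_batch_size. The selection logic in isolation (a simplified, hypothetical helper; the real code pops indices so blocked jobs keep their position for later rounds):

    def take_batch(jobs, is_blocked, batch_size):
        """Split jobs into (batch, remaining), taking up to batch_size unblocked jobs."""
        batch, remaining = [], []
        for job in jobs:
            if len(batch) < batch_size and not is_blocked(job):
                batch.append(job)
            else:
                remaining.append(job)
        return batch, remaining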
Code example #19
File: job_submitter.py Project: daniel-thom/jade
    def submit_jobs(self,
                    name="job",
                    per_node_batch_size=DEFAULTS["per_node_batch_size"],
                    max_nodes=DEFAULTS["max_nodes"],
                    force_local=False,
                    verbose=False,
                    poll_interval=DEFAULTS["poll_interval"],
                    num_processes=None,
                    previous_results=None,
                    reports=True,
                    try_add_blocked_jobs=False):
        """Submit simulations. Auto-detect whether the current system is an HPC
        and submit to its queue. Otherwise, run locally.

        Parameters
        ----------
        name : str
            Batch name; applies to HPC job submission only.
        per_node_batch_size : int
            Number of jobs to run on one node in one batch.
        max_nodes : int
            Max number of node submission requests to make in parallel.
        force_local : bool
            If on HPC, run jobs through subprocess as if local.
        verbose : bool
            Enable debug logging.
        poll_interval : int
            Interval in seconds on which to poll jobs.
        num_processes : int
            Number of processes to run in parallel; defaults to num CPUs.
        previous_results : list
            Results from a previous run to include in the final summary.
        reports : bool
            Generate reports after execution.
        try_add_blocked_jobs : bool
            Add blocked jobs to a batch if they are blocked by jobs in the
            same batch.

        Returns
        -------
        Status

        """
        logger.info("Submit %s jobs for execution.",
                    self._config.get_num_jobs())
        logger.info("JADE version %s", jade.version.__version__)
        registry = Registry()
        loggers = registry.list_loggers()
        logger.info("Registered modules for logging: %s", ", ".join(loggers))
        self._save_repository_info(registry)

        self._config.check_job_dependencies()

        self._hpc = HpcManager(self._hpc_config_file, self._output)
        result = Status.GOOD

        # If an events summary file exists, it is invalid.
        events_file = os.path.join(self._output, EVENTS_FILENAME)
        if os.path.exists(events_file):
            os.remove(events_file)

        start_time = time.time()
        if self._hpc.hpc_type == HpcType.LOCAL or force_local:
            runner = JobRunner(self._config_file, output=self._output)
            result = runner.run_jobs(verbose=verbose,
                                     num_processes=num_processes)
        else:
            self._submit_to_hpc(name, max_nodes, per_node_batch_size, verbose,
                                poll_interval, num_processes,
                                try_add_blocked_jobs)

        results_summary = ResultsAggregatorSummary(self._results_dir)
        self._results = results_summary.get_results()
        if len(self._results) != self._config.get_num_jobs():
            logger.error(
                "Number of results doesn't match number of jobs: "
                "results=%s jobs=%s. Check for process crashes "
                "or HPC timeouts.", len(self._results),
                self._config.get_num_jobs())
            result = Status.ERROR

        if previous_results:
            self._results += previous_results

        self.write_results(RESULTS_FILE)
        results_summary.delete_files()
        shutil.rmtree(self._results_dir)

        self._log_error_log_messages(self._output)

        bytes_consumed = get_directory_size_bytes(self._output,
                                                  recursive=False)
        event = StructuredLogEvent(
            source="submitter",
            category=EVENT_CATEGORY_RESOURCE_UTIL,
            name=EVENT_NAME_BYTES_CONSUMED,
            message="main output directory size",
            bytes_consumed=bytes_consumed,
        )
        log_event(event)

        event = StructuredLogEvent(
            source="submitter",
            category=EVENT_CATEGORY_RESOURCE_UTIL,
            name=EVENT_NAME_CONFIG_EXEC_SUMMARY,
            message="config execution summary",
            config_execution_time=time.time() - start_time,
            num_jobs=self.get_num_jobs(),
        )
        log_event(event)

        if reports:
            self.generate_reports(self._output)

        return result
Code example #20
    def submit_jobs(self, cluster, force_local=False):
        """Submit simulations. Auto-detect whether the current system is an HPC
        and submit to its queue. Otherwise, run locally.

        Parameters
        ----------
        cluster : Cluster
        force_local : bool
            If on HPC, run jobs through subprocess as if local.

        Returns
        -------
        Status

        """
        if self._is_new:
            logger.info("Submit %s jobs for execution.",
                        self._config.get_num_jobs())
            logger.info("JADE version %s", jade.version.__version__)
            registry = Registry()
            loggers = registry.list_loggers()
            logger.info("Registered modules for logging: %s",
                        ", ".join(loggers))
            self._save_repository_info(registry)

            ResultsAggregator.create(self._output)

            # If an events summary file exists, it is invalid.
            events_file = os.path.join(self._output, EVENTS_FILENAME)
            if os.path.exists(events_file):
                os.remove(events_file)

            event = StructuredLogEvent(
                source="submitter",
                category=EVENT_CATEGORY_RESOURCE_UTIL,
                name=EVENT_NAME_SUBMIT_STARTED,
                message="job submission started",
                num_jobs=self.get_num_jobs(),
            )
            log_event(event)

            os.environ["JADE_RUNTIME_OUTPUT"] = self._output
            if self._config.setup_command is not None:
                cmd = f"JADE_RUNTIME_OUTPUT={self._output} {self._config.setup_command}"
                logger.info("Running setup command: %s", cmd)
                check_run_command(self._config.setup_command)
        else:
            self._handle_submission_groups()

        result = Status.IN_PROGRESS
        group = self._config.get_default_submission_group()
        groups = make_submission_group_lookup(cluster.config.submission_groups)
        self._hpc = HpcManager(groups, self._output)

        if self._hpc.hpc_type == HpcType.LOCAL or force_local:
            runner = JobRunner(self._config_file, output=self._output)
            num_processes = group.submitter_params.num_processes
            verbose = group.submitter_params.verbose
            result = runner.run_jobs(verbose=verbose,
                                     num_processes=num_processes)
            agg = ResultsAggregator.load(self._output)
            agg.process_results()
            is_complete = True
        else:
            is_complete = self._submit_to_hpc(cluster)

        if is_complete:
            result = self._handle_completion(cluster)

        return result
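
Because JADE_RUNTIME_OUTPUT is exported before the setup command runs, the setup script can locate the run's output directory through the environment. A sketch of what such a script might do (the script and the subdirectory name are hypothetical):

    import os
    from pathlib import Path

    # Hypothetical setup script invoked via config.setup_command.
    output = Path(os.environ["JADE_RUNTIME_OUTPUT"])
    (output / "setup-artifacts").mkdir(exist_ok=True)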