Esempio n. 1
0
    def check_status(self, name=None, job_id=None):
        """Ask squeue about one job and return its parsed status.

        Parameters
        ----------
        name : str | None
            Job name passed to ``squeue -n``. Used when it is not None.
        job_id : str | None
            Job ID passed to ``squeue -j``. Only used when name is None.

        Returns
        -------
        HpcJobInfo
            Built from the job ID, name, and state reported by squeue. If
            squeue printed nothing, the ID and name are empty strings and
            the status is HpcJobStatus.NONE.

        Raises
        ------
        ExecutionError
            Raised if the squeue command returns a non-zero code.

        """
        columns = ("jobid", "name", "state")
        format_spec = ",".join(columns)
        cmd = f'squeue -u {self.USER} --Format "{format_spec}" -h'
        if name is None and job_id is None:
            # Mutual exclusivity should be handled in HpcManager.
            assert False
        # A name filter wins over a job ID filter when both are supplied.
        cmd += f" -n {name}" if name is not None else f" -j {job_id}"

        captured = {}
        ret = run_command(cmd, captured)
        if ret != 0:
            logger.error("Failed to run squeue command=[%s] ret=%s err=%s",
                         cmd, ret, captured["stderr"])
            raise ExecutionError(f"squeue command failed: {ret}")

        stdout = captured["stdout"]
        logger.debug("squeue output:  [%s]", stdout)
        tokens = stdout.split()
        if not tokens:
            # No jobs are currently running.
            return HpcJobInfo("", "", HpcJobStatus.NONE)

        # Exactly one whitespace-separated token per requested column.
        assert len(tokens) == len(columns)
        reported_id = tokens[0]
        reported_name = tokens[1]
        status = self._STATUSES.get(tokens[2], HpcJobStatus.UNKNOWN)
        return HpcJobInfo(reported_id, reported_name, status)
Esempio n. 2
0
    def _submit_next_stage(self, stage_num, return_code=None):
        """Record the previous stage's return code, if given, and submit the next stage.

        Parameters
        ----------
        stage_num : int
            1-based number of the stage to submit. Must be 1 when return_code
            is None; otherwise must equal ``self.stage_num + 1``.
        return_code : int | None
            Return code stored on the stage preceding stage_num. None means
            nothing is recorded (the first submission).

        Raises
        ------
        InvalidParameter
            Raised if return_code is given and stage_num is not
            ``self.stage_num + 1``.
        ExecutionError
            Raised if JobSubmitter.run_submit_jobs returns a non-zero code.

        """
        # NOTE(review): self.stage_num is presumably a property mirroring
        # self._config.stage_num; this method reads both - confirm.
        if return_code is None:
            # Without a return code there is no previous stage to record.
            assert stage_num == 1, str(stage_num)
        else:
            if stage_num != self.stage_num + 1:
                raise InvalidParameter(
                    f"expected stage_num {self.stage_num + 1}, received {stage_num}"
                )

            # stage_num is 1-based, so the stage before it is at list index
            # stage_num - 2.
            self._config.stages[stage_num - 2].return_code = return_code
            self._config.stage_num += 1

        # The counter moving one past the number of stages means every stage
        # has been accounted for.
        if self._config.stage_num == len(self._config.stages) + 1:
            logger.info("Pipeline is complete")
            self._config.is_complete = True
            self._serialize()
            return

        logger.info("Start execution pipeline stage %s/%s", stage_num,
                    len(self._config.stages))

        # Serialize before the stage is configured and submitted.
        self._serialize()
        stage = self._config.stages[self.stage_num - 1]
        # Set the stage ID in the environment before auto-config and
        # submission run.
        os.environ["JADE_PIPELINE_STAGE_ID"] = str(self.stage_num)
        self._run_auto_config(stage)
        output = self.get_stage_output_path(self.path, self.stage_num)
        ret = JobSubmitter.run_submit_jobs(
            stage.config_file,
            output,
            stage.submitter_params,
            pipeline_stage_num=self.stage_num,
        )
        if ret != 0:
            raise ExecutionError(f"stage {self.stage_num} failed")
Esempio n. 3
0
def get_successful_result(results_file, job_name):
    """Look up a job's result and require that it succeeded.

    A result counts as successful when its return code is 0 and its status
    is "finished".

    Parameters
    ----------
    results_file : str
        Passed through to get_result.
    job_name : str
        Passed through to get_result.

    Returns
    -------
    object
        The result returned by get_result, unchanged.

    Raises
    ------
    InvalidParameter
        Raised if job_name is not found (presumably by get_result; the
        check is not performed in this function).
    ExecutionError
        Raised if the result was not successful.

    """
    job_result = get_result(results_file, job_name)
    succeeded = job_result.return_code == 0 and job_result.status == "finished"
    if succeeded:
        return job_result

    raise ExecutionError(f"result was not successful: {job_result}")
Esempio n. 4
0
def get_successful_result(results_file, job_name):
    """Look up a job's result and require that it reports success.

    Success is decided by the result's own ``is_successful()`` method.

    Parameters
    ----------
    results_file : str
        Passed through to get_result.
    job_name : str
        Passed through to get_result.

    Returns
    -------
    object
        The result returned by get_result, unchanged.

    Raises
    ------
    InvalidParameter
        Raised if job_name is not found (presumably by get_result; the
        check is not performed in this function).
    ExecutionError
        Raised if the result was not successful.

    """
    job_result = get_result(results_file, job_name)
    if job_result.is_successful():
        return job_result

    raise ExecutionError(f"result was not successful: {job_result}")
Esempio n. 5
0
    def _submit(self, verbose):
        """Run every stage in self._stages in order, recording status as it goes.

        For each stage this sets JADE_PIPELINE_STAGE_ID in the environment,
        appends a status entry, runs auto-config, runs the submit command,
        and stores the serialized result on that status entry. The status is
        dumped before and after the stage executes.

        Parameters
        ----------
        verbose : bool
            Forwarded to self._make_submit_cmd.

        Raises
        ------
        ExecutionError
            Raised if a stage's submit command returns a non-zero code.

        """
        for stage in self._stages:
            stage_id = self._cur_stage_id
            os.environ["JADE_PIPELINE_STAGE_ID"] = str(stage_id)
            output_dir = self.get_stage_output_path(self._output, stage_id)
            self._status_info["stages"].append({
                "stage_id": stage_id,
                "output_directory": output_dir,
            })
            self._dump_status()

            self._run_auto_config(stage)
            submit_cmd = self._make_submit_cmd(stage, verbose)

            started_at = time.time()
            ret = run_command(submit_cmd)
            finished_at = time.time()
            elapsed = finished_at - started_at

            # The result is recorded even when the stage failed, so the
            # dumped status includes the failing return code.
            stage_result = Result(str(self._cur_stage_id), ret, "finished",
                                  elapsed, finished_at)
            serialized = serialize_result(stage_result)
            self._status_info["stages"][-1]["result"] = serialized
            self._dump_status()

            if ret != 0:
                raise ExecutionError(f"stage {self._cur_stage_id} failed")
            self._cur_stage_id += 1

        logger.info("Finished execution pipeline")
Esempio n. 6
0
    def _run_auto_config(self, stage):
        """Regenerate a stage's config file and copy it to its final location.

        Any existing file at stage.config_file is removed first, so the
        existence check afterwards cannot be satisfied by an old file.
        On success stage.config_file is updated to the copied file's path.

        Parameters
        ----------
        stage : object
            Must provide ``config_file`` and ``auto_config_cmd`` attributes.

        Raises
        ------
        ExecutionError
            Raised if the auto-config command returns a non-zero code or does
            not produce the config file.

        """
        generated = stage.config_file
        if os.path.exists(generated):
            os.remove(generated)

        ret = run_command(stage.auto_config_cmd)
        if ret != 0:
            raise ExecutionError(
                f"Failed to auto-config stage {self.stage_num}: {ret}")

        produced = os.path.exists(generated)
        if not produced:
            raise ExecutionError(
                f"auto-config stage {self.stage_num} did not produce {generated}"
            )

        destination = self.get_stage_config_file_path(self._output,
                                                      self.stage_num)
        shutil.copyfile(generated, destination)
        stage.config_file = destination
Esempio n. 7
0
    def _submit_request(self, cmd, *args):
        """Issue a GET request and return the decoded JSON response.

        Parameters
        ----------
        cmd : str
            Base URL. A trailing "/" is appended if it is missing.
        *args : str
            Extra path segments, joined with "/" and appended to the URL.

        Returns
        -------
        object
            The value of ``response.json()``.

        Raises
        ------
        ExecutionError
            Raised if the response status code is not 200.

        """
        url = cmd if cmd.endswith("/") else cmd + "/"
        if args:
            url += "/".join(args)

        logger.info("Submitting %s", url)
        response = requests.get(url)
        if response.status_code == 200:
            return response.json()

        raise ExecutionError(f"{url} failed: status_code={response.status_code}")
Esempio n. 8
0
    def _run_auto_config(self, stage):
        """Regenerate a stage's config file and move it to its final location.

        Any existing file at stage["config_file"] is removed first, so the
        existence check afterwards cannot be satisfied by an old file.
        On success stage["config_file"] is updated to the moved file's path.

        Parameters
        ----------
        stage : dict
            Must contain the keys "config_file" and "auto_config_cmd".

        Raises
        ------
        ExecutionError
            Raised if the auto-config command returns a non-zero code or does
            not produce the config file.

        """
        generated = stage["config_file"]
        if os.path.exists(generated):
            os.remove(generated)

        ret = run_command(stage["auto_config_cmd"])
        if ret != 0:
            raise ExecutionError(
                f"Failed to auto-config stage {self._cur_stage_id}: {ret}")

        produced = os.path.exists(generated)
        if not produced:
            raise ExecutionError(
                f"auto-config stage {self._cur_stage_id} did not produce {generated}"
            )

        destination = self.get_stage_config_file_path(self._output,
                                                      self._cur_stage_id)
        shutil.move(generated, destination)
        stage["config_file"] = destination
Esempio n. 9
0
def check_run_command(*args, **kwargs):
    """Run a command via run_command and raise if it fails.

    All positional and keyword arguments are forwarded to run_command
    unchanged. Nothing is returned on success.

    Raises
    ------
    ExecutionError
        Raised if the command returns a non-zero return code.

    """
    return_code = run_command(*args, **kwargs)
    if return_code == 0:
        return

    raise ExecutionError(f"command returned error code: {return_code}")
Esempio n. 10
0
    def check_statuses(self):
        """Query squeue for all of the user's jobs and parse their states.

        Returns
        -------
        object
            Whatever self._get_statuses_from_output produces from the
            squeue stdout.

        Raises
        ------
        ExecutionError
            Raised if the squeue command still fails after the retries.

        """
        columns = ("jobid", "state")
        format_spec = ",".join(columns)
        cmd = f'squeue -u {self.USER} --Format "{format_spec}" -h'

        captured = {}
        # Transient failures could be costly. Retry for up to one minute.
        ret = run_command(cmd, captured, num_retries=6, retry_delay_s=10)
        if ret == 0:
            return self._get_statuses_from_output(captured["stdout"])

        logger.error("Failed to run squeue command=[%s] ret=%s err=%s",
                     cmd, ret, captured["stderr"])
        raise ExecutionError(f"squeue command failed: {ret}")
Esempio n. 11
0
    def _run_command(self, cmd):
        """Run a command from within self._path and return its stripped stdout.

        The process's working directory is switched to self._path for the
        duration of the command and restored afterwards, whether or not the
        command succeeds.

        Parameters
        ----------
        cmd : str
            Command passed to run_command.

        Returns
        -------
        str
            The command's stdout with surrounding whitespace removed.

        Raises
        ------
        ExecutionError
            Raised if the command returns a non-zero code.

        """
        previous_dir = os.getcwd()
        os.chdir(self._path)

        try:
            captured = {}
            ret = run_command(cmd, output=captured)
            if ret == 0:
                return captured["stdout"].strip()

            raise ExecutionError(
                f"[{cmd}] failed: {ret}: {captured['stderr']}")
        finally:
            # Always restore the caller's working directory.
            os.chdir(previous_dir)
Esempio n. 12
0
    def run(self):
        """Submit the job through the manager and record the assigned job ID.

        Marks the job as pending, stores the job ID returned by the manager,
        and emits a structured log event plus an info log line for the
        assignment.

        Raises
        ------
        ExecutionError
            Raised if the manager does not report Status.GOOD for the
            submission.

        """
        job_id, result = self._mgr.submit(self._output, self._name,
                                          self._run_script)
        # The pending flag is set before the result is checked, so it is
        # True even when the submission fails.
        self._is_pending = True
        if result != Status.GOOD:
            # This must be an f-string; otherwise the literal text
            # "{self._name}" is reported instead of the job name.
            raise ExecutionError(f"Failed to submit name={self._name}")

        self._job_id = job_id
        event = StructuredLogEvent(
            source=self._name,
            category=EVENT_CATEGORY_HPC,
            name=EVENT_NAME_HPC_JOB_ASSIGNED,
            message="HPC job assigned",
            job_id=self._job_id,
        )
        log_event(event)
        logger.info("Assigned job_ID=%s name=%s", self._job_id, self._name)
Esempio n. 13
0
    def check_status(self, name=None, job_id=None):
        """Ask squeue about one job, with retries, and return its parsed status.

        Parameters
        ----------
        name : str | None
            Job name passed to ``squeue -n``. Used when it is not None.
        job_id : str | None
            Job ID passed to ``squeue -j``. Only used when name is None.

        Returns
        -------
        HpcJobInfo
            Built from the job ID, name, and state reported by squeue. If
            squeue printed nothing, or failed with "Invalid job id
            specified" in stderr, the ID and name are empty strings and the
            status is HpcJobStatus.NONE.

        Raises
        ------
        ExecutionError
            Raised if the squeue command fails for any other reason.

        """
        columns = ("jobid", "name", "state")
        format_spec = ",".join(columns)
        cmd = f'squeue -u {self.USER} --Format "{format_spec}" -h'
        if name is None and job_id is None:
            # Mutual exclusivity should be handled in HpcManager.
            assert False
        # A name filter wins over a job ID filter when both are supplied.
        cmd += f" -n {name}" if name is not None else f" -j {job_id}"

        invalid_job_msg = "Invalid job id specified"
        captured = {}
        # Transient failures could be costly. Retry for up to one minute.
        ret = run_command(
            cmd,
            captured,
            num_retries=6,
            retry_delay_s=10,
            error_strings=[invalid_job_msg],
        )
        if ret != 0:
            # An unknown job ID is reported the same way as no job at all.
            if invalid_job_msg in captured["stderr"]:
                return HpcJobInfo("", "", HpcJobStatus.NONE)

            logger.error("Failed to run squeue command=[%s] ret=%s err=%s",
                         cmd, ret, captured["stderr"])
            raise ExecutionError(f"squeue command failed: {ret}")

        stdout = captured["stdout"]
        logger.debug("squeue output:  [%s]", stdout)
        tokens = stdout.split()
        if not tokens:
            # No jobs are currently running.
            return HpcJobInfo("", "", HpcJobStatus.NONE)

        # Exactly one whitespace-separated token per requested column.
        assert len(tokens) == len(columns)
        reported_id = tokens[0]
        reported_name = tokens[1]
        status = self._STATUSES.get(tokens[2], HpcJobStatus.UNKNOWN)
        return HpcJobInfo(reported_id, reported_name, status)
Esempio n. 14
0
    def get_successful_result(self, job_name):
        """Return the result for a job, requiring that it exists and succeeded.

        A result counts as successful when its return code is 0 and its
        status is "finished".

        Parameters
        ----------
        job_name : str
            Name passed to self.get_result.

        Returns
        -------
        object
            The result returned by self.get_result, unchanged.

        Raises
        ------
        InvalidParameter
            Raised if job_name is not found.
        ExecutionError
            Raised if the result was not successful.

        """
        found = self.get_result(job_name)
        if found is None:
            raise InvalidParameter(f"result not found {job_name}")

        succeeded = found.return_code == 0 and found.status == "finished"
        if not succeeded:
            raise ExecutionError(f"result wasn't successful: {found}")

        return found