Esempio n. 1
0
    def _execute_subprocess(self,
                            output_name,
                            script_path,
                            cwd,
                            env=None,
                            join_output=False):
        """
        Run a script as a local subprocess and optionally persist its output.

        The script is started with 'cwd' as its working directory and 'env'
        as its environment. When output is persisted, the files are written
        into 'cwd' and are named after the script's base name (without its
        extension); 'output_name' only acts as the on/off switch.

        :param output_name: If None, no output files are written; any other
            value enables writing of the '.out' (and possibly '.err') files.
        :param script_path: Path to the script to be executed.
        :param cwd: Path to the working directory for the process and for
            the output files.
        :param env: A dict containing a modified environment for execution.
        :param join_output: If True, append stderr to the '.out' file instead
            of writing a separate '.err' file.
        :returns: A SubmissionRecord carrying the status code, the process
            return code, and the process identifier.
        """
        script_stem = os.path.splitext(os.path.basename(script_path))[0]
        LOG.debug(
            f"script_path={script_path}, output_name={output_name}, new_output_name={script_stem}"
        )
        proc = start_process(script_path, shell=False, cwd=cwd, env=env)
        pid = proc.pid
        stdout_text, stderr_text = proc.communicate()
        exit_status = proc.wait()

        # Skipping the files entirely, or folding stderr into the '.out'
        # file, keeps the number of files (iNodes) created per step down.
        if output_name is not None:
            stdout_file = os.path.join(cwd, "{}.out".format(script_stem))

            with open(stdout_file, "a") as handle:
                handle.write(stdout_text)

                if join_output:
                    handle.write("\n####### stderr follows #######\n")
                    handle.write(stderr_text)

            if not join_output:
                stderr_file = os.path.join(cwd, "{}.err".format(script_stem))
                with open(stderr_file, "a") as handle:
                    handle.write(stderr_text)

        if exit_status != 0:
            # Failure: attach the captured stderr to the record.
            failure = SubmissionRecord(ReturnCode.ERROR, exit_status, pid)
            failure.add_info("stderr", str(stderr_text))
            return failure

        LOG.info("Execution returned status OK.")
        return SubmissionRecord(ReturnCode.OK, exit_status, pid)
Esempio n. 2
0
    def submit(self, step, path, cwd, job_map=None, env=None):
        """
        Submit a script to the Slurm scheduler.

        Builds an 'sbatch' command line (optionally targeting the reservation
        configured in the batch settings), runs it, and extracts the job
        identifier from the first run of digits in the command's stdout.

        :param step: The StudyStep instance this submission is based on.
        :param path: Local path to the script to be executed.
        :param cwd: Path to the current working directory.
        :param job_map: A dictionary mapping step names to their job
            identifiers.
        :param env: A dict containing a modified environment for execution.
        :returns: A SubmissionRecord holding the submission status, the
            return code of the submission command and, on success, the job
            identifier. An ERROR record is returned if the command fails or
            if no job identifier can be found in its output.
        """
        # Leading command is 'sbatch'
        cmd = ["sbatch"]
        # Check and see if we should be submitting into a reservation.
        if "reservation" in self._batch:
            if self._batch["reservation"]:
                cmd += ["--reservation", self._batch["reservation"]]

        # Append the script path and working directory.
        cmd += ["-D", cwd, path]
        cmd = " ".join(cmd)

        LOGGER.debug("cwd = %s", cwd)
        LOGGER.debug("Command to execute: %s", cmd)
        p = start_process(cmd, cwd=cwd, env=env)
        output, err = p.communicate()
        retcode = p.wait()

        # TODO: We need to check for dependencies here. The sbatch is where
        # dependent batch jobs are specified. If we're trying to launch
        # everything at once then that should happen here.

        if retcode != 0:
            LOGGER.warning("Submission returned an error (see next line).\n%s",
                           err)
            return SubmissionRecord(SubmissionCode.ERROR, retcode)

        LOGGER.info("Submission returned status OK.")
        # The job identifier is taken to be the first run of digits in the
        # output. Guard against output without any digits so that we report
        # a failed submission instead of raising AttributeError on None.
        jid_match = re.search(r"[0-9]+", output)
        if jid_match is None:
            LOGGER.error("Submission returned status OK, but no job "
                         "identifier was found in its output: %s", output)
            return SubmissionRecord(SubmissionCode.ERROR, retcode)

        return SubmissionRecord(SubmissionCode.OK, retcode, jid_match.group(0))
    def submit(self, step, path, cwd, job_map=None, env=None):
        """
        Run the step's script as a local process and record its output.

        The script is started with 'cwd' as its working directory and 'env'
        as its environment. Once it finishes, its stdout and stderr are
        written to '<step name>.<pid>.out' and '<step name>.<pid>.err'
        inside 'cwd'.

        :param step: An instance of a StudyStep.
        :param path: Path to the script to be executed.
        :param cwd: Path to the current working directory.
        :param job_map: A map of workflow step names to their job identifiers.
        :param env: A dict containing a modified environment for execution.
        :returns: A SubmissionRecord with the submission status, the return
            code of the process, and the process identifier.
        """
        LOGGER.debug("cwd = %s", cwd)
        LOGGER.debug("Script to execute: %s", path)
        proc = start_process(path, shell=False, cwd=cwd, env=env)
        pid = proc.pid
        stdout_text, stderr_text = proc.communicate()
        exit_status = proc.wait()

        # One file per captured stream, both keyed by step name and pid.
        file_stem = "{}.{}".format(step.name, pid)
        captured = (("out", stdout_text), ("err", stderr_text))
        for extension, text in captured:
            target = os.path.join(cwd, "{}.{}".format(file_stem, extension))
            with open(target, "w") as handle:
                handle.write(text)

        if exit_status != 0:
            LOGGER.warning("Execution returned an error: %s", str(stderr_text))
            failure = SubmissionRecord(SubmissionCode.ERROR, exit_status, pid)
            failure.add_info("stderr", str(stderr_text))
            return failure

        LOGGER.info("Execution returned status OK.")
        return SubmissionRecord(SubmissionCode.OK, exit_status, pid)
Esempio n. 4
0
    def cancel_jobs(self, joblist):
        """
        For the given job list, cancel each job.

        All identifiers are passed to a single 'scancel --quiet' invocation.

        :param joblist: A list of job identifiers to be cancelled.
        :returns: A CancellationRecord indicating if jobs were cancelled,
            along with the return code of the cancel command (0 when there
            was nothing to cancel).
        """
        # If we don't have any jobs to check, just return status OK.
        if not joblist:
            return CancellationRecord(CancelCode.OK, 0)

        cmd = "scancel --quiet {}".format(" ".join(joblist))
        p = start_process(cmd)
        output, err = p.communicate()
        retcode = p.wait()

        if retcode == 0:
            _record = CancellationRecord(CancelCode.OK, retcode)
        else:
            # Pass the return code so the '%s' placeholder is actually filled.
            LOGGER.error("Error code '%s' seen. Unexpected behavior "
                         "encountered.", retcode)
            _record = CancellationRecord(CancelCode.ERROR, retcode)

        return _record
Esempio n. 5
0
def run_study(args):
    """
    Run a Maestro study.

    Loads the YAML specification, resolves the output directory, sets up
    file logging, builds and configures the Study, stores the study and batch
    settings through the Conductor, and finally launches the conductor either
    in the foreground or detached via 'nohup'.

    :param args: Parsed command line arguments. The attributes read here are
        specification, out, autoyes, autono, debug_lvl, pargs, pgen, attempts,
        throttle, rlimit, usetmp, hashws, dry, sleeptime, and fg.
    :returns: The value of the completion status when run in the foreground;
        0 otherwise.
    :raises ArgumentError: If 'pargs' is given without 'pgen', or if the
        attempts, throttle, or restart limit values are out of range.

    The process exits with status 1 if the specification fails schema
    validation, and with status 0 if the user declines to overwrite an
    existing output path.
    """
    # Load the Specification
    try:
        spec = YAMLSpecification.load_specification(args.specification)
    except jsonschema.ValidationError as e:
        LOGGER.error(e.message)
        sys.exit(1)
    environment = spec.get_study_environment()
    steps = spec.get_study_steps()

    # Set up the output directory.
    out_dir = environment.remove("OUTPUT_PATH")
    if args.out:
        # If out is specified in the args, ignore OUTPUT_PATH.
        output_path = os.path.abspath(args.out)

        # If we are automatically launching, just set the input as yes.
        if os.path.exists(output_path):
            if args.autoyes:
                uinput = "y"
            elif args.autono:
                uinput = "n"
            else:
                uinput = six.moves.input(
                    "Output path already exists. Would you like to overwrite "
                    "it? [yn] ")

            if uinput.lower() in ACCEPTED_INPUT:
                print("Cleaning up existing out path...")
                shutil.rmtree(output_path)
            else:
                print("Opting to quit -- not cleaning up old out path.")
                sys.exit(0)

    else:
        if out_dir is None:
            # If we don't find OUTPUT_PATH in the environment, assume pwd.
            out_dir = os.path.abspath("./")
        else:
            # We just take the value from the environment.
            out_dir = os.path.abspath(out_dir.value)

        out_name = "{}_{}".format(spec.name.replace(" ", "_"),
                                  time.strftime("%Y%m%d-%H%M%S"))
        output_path = make_safe_path(out_dir, *[out_name])
    environment.add(Variable("OUTPUT_PATH", output_path))

    # Set up file logging
    create_parentdir(os.path.join(output_path, "logs"))
    log_path = os.path.join(output_path, "logs", "{}.log".format(spec.name))
    LOG_UTIL.add_file_handler(log_path, LFORMAT, args.debug_lvl)

    # Check for pargs without the matching pgen
    if args.pargs and not args.pgen:
        msg = "Cannot use the 'pargs' parameter without specifying a 'pgen'!"
        # No exception is being handled here, so log with error() rather
        # than exception() (which would append an empty traceback).
        LOGGER.error(msg)
        raise ArgumentError(msg)

    # Addition of the $(SPECROOT) to the environment.
    spec_root = os.path.split(args.specification)[0]
    spec_root = Variable("SPECROOT", os.path.abspath(spec_root))
    environment.add(spec_root)

    # Handle loading a custom ParameterGenerator if specified.
    if args.pgen:
        # 'pgen_args' has a default of an empty list, which should translate
        # to an empty dictionary.
        kwargs = create_dictionary(args.pargs)
        # Copy the Python file used to generate parameters.
        shutil.copy(args.pgen, output_path)

        # Add keywords and environment from the spec to pgen args.
        kwargs["OUTPUT_PATH"] = output_path
        kwargs["SPECROOT"] = spec_root

        # Load the parameter generator.
        parameters = load_parameter_generator(args.pgen, environment, kwargs)
    else:
        parameters = spec.get_parameters()

    # Setup the study.
    study = Study(spec.name,
                  spec.description,
                  studyenv=environment,
                  parameters=parameters,
                  steps=steps,
                  out_path=output_path)

    # Check if the submission attempts is greater than 0:
    if args.attempts < 1:
        _msg = "Submission attempts must be greater than 0. " \
               "'{}' provided.".format(args.attempts)
        LOGGER.error(_msg)
        raise ArgumentError(_msg)

    # Check if the throttle is zero or greater:
    if args.throttle < 0:
        _msg = "Submission throttle must be a value of zero or greater. " \
               "'{}' provided.".format(args.throttle)
        LOGGER.error(_msg)
        raise ArgumentError(_msg)

    # Check if the restart limit is zero or greater:
    if args.rlimit < 0:
        _msg = "Restart limit must be a value of zero or greater. " \
               "'{}' provided.".format(args.rlimit)
        LOGGER.error(_msg)
        raise ArgumentError(_msg)

    # Set up the study workspace and configure it for execution.
    study.setup_workspace()
    study.configure_study(throttle=args.throttle,
                          submission_attempts=args.attempts,
                          restart_limit=args.rlimit,
                          use_tmp=args.usetmp,
                          hash_ws=args.hashws,
                          dry_run=args.dry)
    study.setup_environment()

    if args.dry:
        # If performing a dry run, drive sleep time down to generate scripts.
        sleeptime = 1
    else:
        # else, use args to decide sleeptime
        sleeptime = args.sleeptime

    # Fall back to the local adapter when the spec gives no batch settings
    # or omits the batch type.
    batch = {"type": "local"}
    if spec.batch:
        batch = spec.batch
        if "type" not in batch:
            batch["type"] = "local"
    # Copy the spec to the output directory
    shutil.copy(args.specification, study.output_path)

    # Use the Conductor's classmethod to store the study.
    Conductor.store_study(study)
    Conductor.store_batch(study.output_path, batch)

    # If we are automatically launching, just set the input as yes.
    if args.autoyes or args.dry:
        uinput = "y"
    elif args.autono:
        uinput = "n"
    else:
        uinput = six.moves.input("Would you like to launch the study? [yn] ")

    if uinput.lower() in ACCEPTED_INPUT:
        if args.fg:
            # Launch in the foreground.
            LOGGER.info("Running Maestro Conductor in the foreground.")
            conductor = Conductor(study)
            conductor.initialize(batch, sleeptime)
            completion_status = conductor.monitor_study()
            conductor.cleanup()
            return completion_status.value
        else:
            # Launch manager with nohup
            log_path = make_safe_path(study.output_path,
                                      *["{}.txt".format(study.name)])

            cmd = [
                "nohup", "conductor", "-t",
                str(sleeptime), "-d",
                str(args.debug_lvl), study.output_path, ">", log_path, "2>&1"
            ]
            LOGGER.debug(" ".join(cmd))
            start_process(" ".join(cmd))

            print("Study launched successfully.")
    else:
        print("Study launch aborted.")

    return 0
Esempio n. 6
0
    def check_jobs(self, joblist):
        """
        For the given job list, query execution status.

        This method runs 'squeue -u $USER -t all' and parses the
        whitespace-separated rows of its output (skipping the first line) to
        find the state of each requested job.

        :param joblist: A list of job identifiers to be queried.
        :returns: The return code of the status query, and a dictionary of job
            identifiers to their status. Identifiers that do not show up in
            the squeue output keep a status of None.
        """
        # TODO: This method needs to be updated to use sacct.
        # squeue options:
        # -u = username to search queues for.
        # -t = list of job states to search for. 'all' for all states.
        cmd = "squeue -u $USER -t all"
        p = start_process(cmd)
        output, err = p.communicate()
        retcode = p.wait()

        status = {}
        for jobid in joblist:
            LOGGER.debug("Looking for jobid %s", jobid)
            status[jobid] = None

        if retcode == 0:
            # The squeue command output is split with the following indices
            # used for specific information:
            # 0 - Job Identifier
            # 1 - Queue
            # 2 - Job name
            # 3 - User
            # 4 - State [Passed to _state]
            # 5 - Current Execution Time
            # 6 - Assigned Node Count
            # 7 - Hostname and assigned node identifier list
            state_index = 4
            jobid_index = 0

            for job in output.split("\n")[1:]:
                LOGGER.debug("Job Entry: %s", job)
                job_split = re.split(r"\s+", job)
                # Leading whitespace (or a blank line) yields an empty first
                # field; drop it so the indices above line up.
                if job_split[0] == "":
                    LOGGER.debug("Removing blank entry from head of status.")
                    job_split = job_split[1:]

                LOGGER.debug("Entry split: %s", job_split)
                if not job_split:
                    LOGGER.debug("Continuing...")
                    continue

                if job_split[jobid_index] in status:
                    # Translate the raw state once and reuse it for both the
                    # log message and the result.
                    job_state = self._state(job_split[state_index])
                    LOGGER.debug("ID Found. %s -- %s", job_split[state_index],
                                 job_state)
                    status[job_split[jobid_index]] = job_state

            return JobStatusCode.OK, status
        elif retcode == 1:
            LOGGER.warning("User '%s' has no jobs executing. Returning.",
                           getpass.getuser())
            return JobStatusCode.NOJOBS, status
        else:
            # Pass the return code so the '%s' placeholder is actually filled.
            LOGGER.error("Error code '%s' seen. Unexpected behavior "
                         "encountered.", retcode)
            return JobStatusCode.ERROR, status
Esempio n. 7
0
def run_study(args):
    """
    Run a Maestro study.

    Loads the YAML specification, resolves the output directory, sets up
    logging, builds and configures the Study, stages it into an execution
    DAG, pickles the DAG, and finally either monitors the study in the
    foreground or launches the conductor detached via 'nohup'.

    :param args: Parsed command line arguments. The attributes read here are
        specification, out, autoyes, autono, pargs, pgen, attempts, throttle,
        rlimit, usetmp, hashws, dryrun, fg, sleeptime, and debug_lvl.
    :returns: The value of the completion status when run in the foreground;
        0 otherwise.
    :raises ArgumentError: If 'pargs' is given without 'pgen', or if the
        attempts, throttle, or restart limit values are out of range.
    :raises NotImplementedError: If a dry run is requested.

    The process exits with status 0 if the user declines to overwrite an
    existing output path.
    """
    # Load the Specification
    spec = YAMLSpecification.load_specification(args.specification)
    environment = spec.get_study_environment()
    steps = spec.get_study_steps()

    # Set up the output directory.
    out_dir = environment.remove("OUTPUT_PATH")
    if args.out:
        # If out is specified in the args, ignore OUTPUT_PATH.
        output_path = os.path.abspath(args.out)

        # If we are automatically launching, just set the input as yes.
        if os.path.exists(output_path):
            if args.autoyes:
                uinput = "y"
            elif args.autono:
                uinput = "n"
            else:
                uinput = six.moves.input(
                    "Output path already exists. Would you like to overwrite "
                    "it? [yn] ")

            if uinput.lower() in ACCEPTED_INPUT:
                print("Cleaning up existing out path...")
                shutil.rmtree(output_path)
            else:
                print("Opting to quit -- not cleaning up old out path.")
                sys.exit(0)

    else:
        if out_dir is None:
            # If we don't find OUTPUT_PATH in the environment, assume pwd.
            out_dir = os.path.abspath("./")
        else:
            # We just take the value from the environment.
            out_dir = os.path.abspath(out_dir.value)

        out_name = "{}_{}".format(spec.name.replace(" ", "_"),
                                  time.strftime("%Y%m%d-%H%M%S"))
        output_path = make_safe_path(out_dir, *[out_name])
    environment.add(Variable("OUTPUT_PATH", output_path))

    # Now that we know outpath, set up logging.
    setup_logging(args, output_path, spec.name.replace(" ", "_").lower())

    # Check for pargs without the matching pgen
    if args.pargs and not args.pgen:
        msg = "Cannot use the 'pargs' parameter without specifying a 'pgen'!"
        # No exception is being handled here, so log with error() rather
        # than exception() (which would append an empty traceback).
        LOGGER.error(msg)
        raise ArgumentError(msg)

    # Handle loading a custom ParameterGenerator if specified.
    if args.pgen:
        # 'pgen_args' has a default of an empty list, which should translate
        # to an empty dictionary.
        kwargs = create_dictionary(args.pargs)
        # Copy the Python file used to generate parameters.
        shutil.copy(args.pgen, output_path)
        parameters = load_parameter_generator(args.pgen, kwargs)
    else:
        parameters = spec.get_parameters()

    # Addition of the $(SPECROOT) to the environment.
    spec_root = os.path.split(args.specification)[0]
    spec_root = Variable("SPECROOT", os.path.abspath(spec_root))
    environment.add(spec_root)

    # Setup the study.
    study = Study(spec.name,
                  spec.description,
                  studyenv=environment,
                  parameters=parameters,
                  steps=steps,
                  out_path=output_path)

    # Check if the submission attempts is greater than 0:
    if args.attempts < 1:
        _msg = "Submission attempts must be greater than 0. " \
               "'{}' provided.".format(args.attempts)
        LOGGER.error(_msg)
        raise ArgumentError(_msg)

    # Check if the throttle is zero or greater:
    if args.throttle < 0:
        _msg = "Submission throttle must be a value of zero or greater. " \
               "'{}' provided.".format(args.throttle)
        LOGGER.error(_msg)
        raise ArgumentError(_msg)

    # Check if the restart limit is zero or greater:
    if args.rlimit < 0:
        _msg = "Restart limit must be a value of zero or greater. " \
               "'{}' provided.".format(args.rlimit)
        LOGGER.error(_msg)
        raise ArgumentError(_msg)

    # Set up the study workspace and configure it for execution.
    study.setup_workspace()
    study.setup_environment()
    study.configure_study(throttle=args.throttle,
                          submission_attempts=args.attempts,
                          restart_limit=args.rlimit,
                          use_tmp=args.usetmp,
                          hash_ws=args.hashws)

    # Stage the study.
    path, exec_dag = study.stage()
    # Write metadata
    study.store_metadata()

    # Fall back to the local adapter when the spec gives no batch settings
    # or omits the batch type.
    if not spec.batch:
        exec_dag.set_adapter({"type": "local"})
    else:
        if "type" not in spec.batch:
            spec.batch["type"] = "local"

        exec_dag.set_adapter(spec.batch)

    # Copy the spec to the output directory
    shutil.copy(args.specification, path)

    # Check for a dry run
    if args.dryrun:
        raise NotImplementedError("The 'dryrun' mode is in development.")

    # Pickle up the DAG
    pkl_path = make_safe_path(path, *["{}.pkl".format(study.name)])
    exec_dag.pickle(pkl_path)

    # If we are automatically launching, just set the input as yes.
    if args.autoyes:
        uinput = "y"
    elif args.autono:
        uinput = "n"
    else:
        uinput = six.moves.input("Would you like to launch the study? [yn] ")

    if uinput.lower() in ACCEPTED_INPUT:
        if args.fg:
            # Launch in the foreground.
            LOGGER.info("Running Maestro Conductor in the foreground.")
            cancel_path = os.path.join(path, ".cancel.lock")
            # capture the StudyStatus enum to return
            completion_status = monitor_study(exec_dag, pkl_path, cancel_path,
                                              args.sleeptime)
            return completion_status.value
        else:
            # Launch manager with nohup
            log_path = make_safe_path(study.output_path,
                                      *["{}.txt".format(exec_dag.name)])

            # Use the portable '> file 2>&1' form: '&>' is a bash extension
            # that a POSIX sh parses as "run in background, then truncate
            # file", which would leave the log empty.
            cmd = [
                "nohup", "conductor", "-t",
                str(args.sleeptime), "-d",
                str(args.debug_lvl), path, ">", log_path, "2>&1"
            ]
            LOGGER.debug(" ".join(cmd))
            start_process(" ".join(cmd))

            print("Study launched successfully.")
    else:
        print("Study launch aborted.")

    return 0
Esempio n. 8
0
    def acquire(self, substitutions=None):
        """
        Acquire the dependency specified by the GitDependency.

        After a connectivity check ('git ls-remote'), the remote repository
        at the instance's url is cloned into '<path>/<name>'. If a commit
        hash, a tag, and/or a branch is specified, each one that is set is
        then checked out in the cloned repository, in that order. Calling
        acquire on an already acquired dependency is a no-op.

        :param substitutions: List of Substitution objects that can be applied
            to the dependency's path and url before acquisition.
        :raises ValueError: If the dependency fails verification, if a
            checkout of the hash, tag, or branch fails, or if the dependency's
            path does not exist after cloning.
        :raises RuntimeError: If the connectivity check fails.
        :raises Exception: If the destination path already exists or the
            clone fails.
        """
        if self._is_acquired:
            return

        if not self._verify():
            error = "Ensure that all required fields (name, value, " \
                    "path), are populated and that value is a " \
                    "valid path."
            logger.error(error)
            raise ValueError(error)

        if substitutions:
            for substitution in substitutions:
                self.path = substitution.substitute(self.path)
                self.url = substitution.substitute(self.url)

        path = os.path.join(self.path, self.name)

        # Moved the path existence here because git doesn't actually return a
        # specific enough error code.
        if os.path.exists(path):
            msg = "Destination path '{}' already exists and is not an " \
                  "empty directory.".format(path)
            logger.error(msg)
            raise Exception(msg)

        logger.info("Checking for connectivity to '%s'", self.url)
        p = start_process(["git", "ls-remote", self.url], shell=False)
        retcode = p.wait()
        if retcode != 0:
            msg = "Connectivity check failed. Check that you have " \
                "permissions to the specified repository, that the URL is " \
                "correct, and that you have network connectivity. (url = {})" \
                .format(self.url)
            logger.error(msg)
            raise RuntimeError(msg)
        logger.info("Connectivity achieved!")

        logger.info("Cloning '%s' from '%s'...", self.name, self.url)
        clone = start_process(["git", "clone", self.url, path], shell=False)
        retcode = clone.wait()
        if retcode != 0:
            msg = "Failed to acquire GitDependency named '{}'. Check " \
              "that repository URL ({}) and repository local path ({}) " \
              "are valid.".format(self.name, self.url, path)
            logger.error(msg)
            raise Exception(msg)

        if self.hash:
            logger.info("Checking out SHA1 hash '%s'...", self.hash)
            chkout = start_process(["git", "checkout", self.hash],
                                   cwd=path, shell=False)
            retcode = chkout.wait()

            if retcode != 0:
                msg = "Unable to checkout SHA1 hash '{}' for the repository" \
                      " located at {}." \
                      .format(self.hash, self.url)
                logger.error(msg)
                raise ValueError(msg)

        if self.tag:
            logger.info("Checking out git tag '%s'...", self.tag)
            tag = "tags/{}".format(self.tag)
            chkout = start_process(["git", "checkout", tag],
                                   cwd=path, shell=False)

            retcode = chkout.wait()

            if retcode != 0:
                msg = "Unable to checkout tag '{}' for the repository" \
                      " located at {}".format(self.tag, self.url)
                logger.error(msg)
                raise ValueError(msg)

        if self.branch:
            logger.info("Checking out git branch '%s'...", self.branch)
            chkout = start_process(["git", "checkout", self.branch],
                                   cwd=path, shell=False)

            retcode = chkout.wait()

            if retcode != 0:
                # Report the branch that failed (not the tag).
                msg = "Unable to checkout branch '{}' for the repository" \
                      " located at {}".format(self.branch, self.url)
                logger.error(msg)
                raise ValueError(msg)

        if not os.path.exists(self.path):
            # Report the path that was actually checked; no exception is
            # being handled here, so log with error() rather than exception().
            error = "The specified path '{}' does not exist.".format(self.path)
            logger.error(error)
            raise ValueError(error)

        self._is_acquired = True