Example #1
def create_cmd_from_task(task):
    filename = os.path.basename(get_filename())

    prefix = get_setting("executable_prefix",
                         task=task,
                         default=[],
                         deprecated_keys=["cmd_prefix"])

    if isinstance(prefix, str):
        raise ValueError(
            "Your specified executable_prefix needs to be a list of strings, e.g. [strace]"
        )

    # copy so that the list returned by get_setting is not mutated in place below
    cmd = list(prefix)

    executable = get_setting("executable", task=task, default=[sys.executable])

    if isinstance(executable, str):
        raise ValueError(
            "Your specified executable needs to be a list of strings, e.g. [python3]"
        )

    cmd += executable
    cmd += [filename, "--batch-runner", "--task-id", task.task_id]

    return cmd
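
For illustration, a minimal sketch of what the returned command might look like; the task file name, prefix setting, and task id below are all made up:

import sys

# hypothetical result: executable_prefix=["strace"], default executable, task file my_tasks.py
cmd = ["strace", sys.executable, "my_tasks.py", "--batch-runner", "--task-id", "MyTask_a1b2c3d4e5"]
print(" ".join(cmd))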
Example #2
    def start_job(self):
        command = ["bsub", "-env all"]

        queue = get_setting("queue", task=self.task, default=False)
        if queue is not False:
            command += ["-q", queue]

        job_name = get_setting("job_name", task=self.task, default=False)
        if job_name is not False:
            command += ["-J", job_name]

        log_file_dir = get_log_file_dir(self.task)
        os.makedirs(log_file_dir, exist_ok=True)

        stdout_log_file = os.path.join(log_file_dir, "stdout")
        stderr_log_file = os.path.join(log_file_dir, "stderr")

        command += ["-eo", stderr_log_file, "-oo", stdout_log_file]

        executable_file = create_executable_wrapper(self.task)
        command.append(executable_file)

        output = subprocess.check_output(command)
        output = output.decode()

        # Output of the form Job <72065926> is submitted to default queue <s>.
        match = re.search(r"<[0-9]+>", output)
        if not match:
            raise RuntimeError("Batch submission failed with output " + output)

        self._batch_job_id = match.group(0)[1:-1]
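
For reference, a standalone sketch of the job-id extraction above, run against the sample output quoted in the comment (the job number is just the one from that comment):

import re

output = "Job <72065926> is submitted to default queue <s>.\n"
match = re.search(r"<[0-9]+>", output)
batch_job_id = match.group(0)[1:-1]  # strip the surrounding angle brackets
print(batch_job_id)  # 72065926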
Example #3
def create_executable_wrapper(task):
    """
    To incorporate all settings (environment, working paths, remote or local execution)
    we create an executable bash script which is called instead of the application
    and which will set up everything accordingly before doing the actual work.
    """
    shell = get_setting("shell", task=task, default="bash")
    executable_wrapper_content = [f"#!/bin/{shell}", "set -e"]

    # 1. First part is the folder we need to change into, if given
    working_dir = get_setting("working_dir",
                              task=task,
                              default=os.path.abspath(
                                  os.path.dirname(get_filename())))
    executable_wrapper_content.append(f"cd {working_dir}")

    executable_wrapper_content.append("echo 'Working in the folder:'; pwd")

    # 2. Second part of the executable wrapper, the environment.
    executable_wrapper_content.append("echo 'Setting up the environment'")
    # (a) If given, use the environment script
    env_setup_script = get_setting("env_script", task=task, default="")
    if env_setup_script:
        # The script will be called from the directory of the script. So we have to make sure the
        # env_script is reachable from there (not from where we are currently)
        if not os.path.isfile(map_folder(env_setup_script)):
            raise FileNotFoundError(
                f"Environment setup script {env_setup_script} does not exist.")
        executable_wrapper_content.append(f"source {env_setup_script}")

    # (b) Now override with any environment from the task or settings
    env_overrides = get_setting("env", task=task, default={})
    for key, value in env_overrides.items():
        # escape any embedded single quotes for the shell: ' becomes '\''
        value = value.replace("'", "'\\''")
        value = f"'{value}'"
        executable_wrapper_content.append(f"export {key}={value}")

    executable_wrapper_content.append("echo 'Current environment:'; env")

    # 3. Third part is to call the actual program
    command = " ".join(create_cmd_from_task(task))
    executable_wrapper_content.append("echo 'Will now execute the program'")
    executable_wrapper_content.append(f"exec {command}")

    # Now we can write the file
    executable_file_dir = get_task_file_dir(task)
    os.makedirs(executable_file_dir, exist_ok=True)

    executable_wrapper_path = os.path.join(executable_file_dir,
                                           "executable_wrapper.sh")

    with open(executable_wrapper_path, "w") as f:
        f.write("\n".join(executable_wrapper_content))

    # make wrapper executable
    st = os.stat(executable_wrapper_path)
    os.chmod(executable_wrapper_path, st.st_mode | stat.S_IEXEC)

    return executable_wrapper_path
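
A small standalone sketch of the single-quote escaping used for the export lines above; the variable name and value are made up:

# hypothetical environment variable with an embedded single quote
value = "it's a test"
# close the quoted string, insert an escaped quote, reopen it: ' becomes '\''
escaped = "'" + value.replace("'", "'\\''") + "'"
print(f"export MY_VAR={escaped}")  # export MY_VAR='it'\''s a test'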
Example #4
def get_unique_project_name(task):
    """
    Combine the ``gbasf2_project_name_prefix`` setting and the ``task_id`` hash
    to a unique project name.

    This is done to make sure that different instances of a task with different
    luigi parameters result in different gbasf2 project names. When redoing a
    task on the grid with identical parameters, rename the project name prefix
    to ensure that you get a new project.
    """
    try:
        gbasf2_project_name_prefix = get_setting("gbasf2_project_name_prefix",
                                                 task=task)
    except AttributeError as err:
        raise Exception(
            "Task can only be used with the gbasf2 batch process if it has ``gbasf2_project_name_prefix`` "
            + "as a luigi parameter, attribute or setting.") from err
    # luigi internally assigns a hash to a task by calling the builtin ``hash(task.task_id)``,
    # but that returns a signed integer. I prefer a hex string to get more information per character,
    # which is why I decided to use ``hashlib.md5``.
    task_id_hash = hashlib.md5(task.task_id.encode()).hexdigest()[0:10]
    gbasf2_project_name = gbasf2_project_name_prefix + task_id_hash
    max_project_name_length = 32
    assert len(gbasf2_project_name) <= max_project_name_length, (
        f"Maximum length of project name should be {max_project_name_length}, "
        f"but it has {len(gbasf2_project_name)} chars. "
        f"Please choose a gbasf2_project_name_prefix of less than "
        f"{max_project_name_length - len(task_id_hash)} characters, "
        f"since the unique task id hash takes {len(task_id_hash)} characters.")
    assert gbasf2_project_name.isalnum(), \
        "Only alphanumeric project names are officially supported by gbasf2"
    return gbasf2_project_name
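
The truncated hash used above can be reproduced in isolation; the task id string below is made up:

import hashlib

task_id = "MyTask_some_parameters_deadbeef42"  # hypothetical luigi task_id
task_id_hash = hashlib.md5(task_id.encode()).hexdigest()[0:10]
print(task_id_hash)  # 10-character hex string, stable for the same task_id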
Example #5
def get_gbasf2_env(gbasf2_install_directory=None):
    """
    Return the gbasf2 environment dict which can be used to run gbasf2 commands.

    :param gbasf2_install_directory: Directory into which gbasf2 has been
        installed.  When set to the default value ``None``, it looks for the
        value of the ``gbasf2_install_directory`` setting and when that is not
        set, it uses the default of most installation instructions, which is
        ``~/gbasf2KEK``.
    :return: Dictionary containing the environment that you get from sourcing the gbasf2 setup script.
    """
    if gbasf2_install_directory is None:
        gbasf2_install_directory = get_setting("gbasf2_install_directory",
                                               default="~/gbasf2KEK")
    gbasf2_setup_path = os.path.join(gbasf2_install_directory,
                                     "BelleDIRAC/gbasf2/tools/setup")
    if not os.path.isfile(os.path.expanduser(gbasf2_setup_path)):
        raise FileNotFoundError(
            f"Could not find gbasf2 setup files in ``{gbasf2_install_directory}``.\n"
            + "Make sure to that gbasf2 is installed at that location.")
    # complete bash command to set up the gbasf2 environment
    # pipe its output to /dev/null, because we only want the final command to print the ``env`` output
    gbasf2_setup_command_str = f"source {gbasf2_setup_path} > /dev/null"
    # command to execute the gbasf2 setup command in a fresh shell and output the produced environment
    echo_gbasf2_env_command = shlex.split(
        f"env -i bash -c '{gbasf2_setup_command_str} > /dev/null && env'")
    gbasf2_env_string = subprocess.run(echo_gbasf2_env_command,
                                       check=True,
                                       stdout=subprocess.PIPE,
                                       encoding="utf-8").stdout
    gbasf2_env = dict(
        line.split("=", 1) for line in gbasf2_env_string.splitlines())
    return gbasf2_env
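
The env -i bash -c 'source ... && env' pattern used above works for any setup script. A standalone sketch with a hypothetical script path; note that the split("=", 1) parsing assumes there are no multi-line environment values:

import shlex
import subprocess

setup_script = "~/my_env_setup.sh"  # hypothetical setup script
command = shlex.split(f"env -i bash -c 'source {setup_script} > /dev/null && env'")
env_string = subprocess.run(command, check=True, stdout=subprocess.PIPE, encoding="utf-8").stdout
env = dict(line.split("=", 1) for line in env_string.splitlines())
print(env.get("PATH"))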
Example #6
def create_output_file_name(task, base_filename, result_dir=None):
    serialized_parameters = get_serialized_parameters(task)

    if not result_dir:
        # Be sure to evaluate things relative to the currently executed file, not to where we are now
        result_dir = map_folder(
            get_setting("result_dir",
                        task=task,
                        default=".",
                        deprecated_keys=["result_path"]))

    for key, value in serialized_parameters.items():
        if isinstance(value, str) and "/" in value:
            warnings.warn(
                f"Value of parameter ``{key}`` contains forward slash \"/\". "
                "This will result in an additional subdirectory in the output path. "
                "Consider using a hashed parameter (e.g. ``b2luigi.Parameter(hashed=True)``)"
            )

    param_list = [
        f"{key}={value}" for key, value in serialized_parameters.items()
    ]
    output_path = os.path.join(result_dir, *param_list)

    return os.path.join(output_path, base_filename)
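
With hypothetical parameters, the directory layout produced above looks like this:

import os

result_dir = "results"  # hypothetical result_dir setting
serialized_parameters = {"energy": "10.58", "mode": "mumu"}  # hypothetical task parameters
param_list = [f"{key}={value}" for key, value in serialized_parameters.items()]
print(os.path.join(result_dir, *param_list, "output.root"))
# results/energy=10.58/mode=mumu/output.root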
Example #7
def _on_first_success_action(self):
    """
    Things to do after all jobs in the project have been successful, e.g. downloading the dataset and logs
    """
    self._download_logs()
    if get_setting("gbasf2_download_dataset", default=True, task=self.task):
        self._download_dataset()
Example #8
def wrapped_run_function(task):
    if get_setting("_dispatch_local_execution",
                   default=False,
                   deprecated_keys=["local_execution"]):
        create_output_dirs(task)
        run_function(task)
    else:
        run_task_remote(task)
Example #9
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        #: gbasf2 project name, must be property/attribute, e.g. a luigi parameter
        # Setting it via a setting.json file is not supported to make sure users set unique project names
        self.gbasf2_project_name = get_unique_project_name(self.task)

        #: Output file directory of the task to wrap with gbasf2, where we will
        # store the pickled basf2 path and the created steering file to execute
        # that path.
        task_file_dir = get_task_file_dir(self.task)
        os.makedirs(task_file_dir, exist_ok=True)
        #: file name in which the pickled basf2 path from ``self.task.create_path()`` will be stored
        self.pickle_file_path = os.path.join(task_file_dir,
                                             "serialized_basf2_path.pkl")
        #: file name for the steering file that executes the pickled path, which will be sent to the grid
        self.wrapper_file_path = os.path.join(task_file_dir,
                                              "steering_file_wrapper.py")

        self.log_file_dir = get_log_file_dir(self.task)
        os.makedirs(self.log_file_dir, exist_ok=True)

        self.dirac_user = get_dirac_user()
        #: Maximum number of times that each job in the project can be rescheduled until the project is declared as failed.
        self.max_retries = get_setting("gbasf2_max_retries",
                                       default=0,
                                       task=self.task)

        #: Store number of times each job had been rescheduled
        self.n_retries_by_job = Counter()

        #: Local storage for ``n_retries_by_job`` counter
        # so that it persists even if luigi process is killed and restarted.
        self.retries_file_path = os.path.join(self.log_file_dir,
                                              "n_retries_by_grid_job.json")
        if os.path.isfile(self.retries_file_path):
            with open(self.retries_file_path, "r") as retries_file:
                retries_from_file = json.load(retries_file)
                self.n_retries_by_job.update(retries_from_file)

        # Store dictionary with n_jobs_by_status in attribute to check if it changed,
        # useful for printing job status on change only
        self._n_jobs_by_status = ""

        # Store whether the project had already been successful in a variable because
        # there are actions we want to do only the first time that
        # ``get_job_status`` returns a success.
        self._project_had_been_successful = False
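
A minimal sketch of the retry-counter persistence set up above; the file name and job id are made up, and writing the counter back is only shown for illustration:

import json
import os
from collections import Counter

retries_file_path = "n_retries_by_grid_job.json"  # hypothetical path
n_retries_by_job = Counter()
if os.path.isfile(retries_file_path):
    with open(retries_file_path, "r") as retries_file:
        n_retries_by_job.update(json.load(retries_file))

n_retries_by_job["42"] += 1  # e.g. after rescheduling grid job "42"
with open(retries_file_path, "w") as retries_file:
    json.dump(dict(n_retries_by_job), retries_file)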
Example #10
def get_log_file_dir(task):
    if hasattr(task, 'get_log_file_dir'):
        log_file_dir = task.get_log_file_dir()
        return log_file_dir

    # Be sure to evaluate things relative to the currently executed file, not to where we are now
    base_log_file_dir = map_folder(
        get_setting("log_dir",
                    task=task,
                    default="logs",
                    deprecated_keys=["log_folder"]))
    log_file_dir = create_output_file_name(task,
                                           task.get_task_family() + "/",
                                           result_dir=base_log_file_dir)

    return log_file_dir
Example #11
def _create_wrapper_steering_file(self):
    """
    Create a steering file to send to the grid that executes the pickled
    basf2 path from ``self.task.create_path()``.
    """
    # read a jinja2 template for the steering file that should execute the pickled path
    template_file_path = os.path.join(
        self._file_dir, "templates/gbasf2_steering_file_wrapper.jinja2")
    with open(template_file_path, "r") as template_file:
        template = Template(template_file.read())
        # replace some variable values in the templates
        steering_file_stream = template.stream(
            pickle_file_path=os.path.basename(self.pickle_file_path),
            max_event=get_setting("max_event", default=0, task=self.task),
        )
        # write the template with the replacements to a new file which should be sent to the grid
        steering_file_stream.dump(self.wrapper_file_path)
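
The Template.stream(...).dump(...) pattern above, shown with a small inline template instead of the jinja2 file shipped with the package (template text and values are made up):

from jinja2 import Template

template = Template('pickle_file = "{{ pickle_file_path }}"\nmax_event = {{ max_event }}\n')
stream = template.stream(pickle_file_path="serialized_basf2_path.pkl", max_event=0)
stream.dump("steering_file_wrapper.py")  # writes the rendered steering file to disk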
Example #12
def create_output_file_name(task, base_filename, result_dir=None):
    serialized_parameters = get_serialized_parameters(task)

    if not result_dir:
        # Be sure to evaluate things relative to the currently executed file, not to where we are now
        result_dir = map_folder(
            get_setting("result_dir",
                        task=task,
                        default=".",
                        deprecated_keys=["result_path"]))

    param_list = [
        f"{key}={value}" for key, value in serialized_parameters.items()
    ]
    output_path = os.path.join(result_dir, *param_list)

    return os.path.join(output_path, base_filename)
Example #13
    def _create_task_process(self, task):
        batch_system = BatchSystems(get_setting("batch_system", default=BatchSystems.lsf, task=task))

        if batch_system == BatchSystems.lsf:
            process_class = LSFProcess
        elif batch_system == BatchSystems.htcondor:
            process_class = HTCondorProcess
        elif batch_system == BatchSystems.gbasf2:
            process_class = Gbasf2Process
        elif batch_system == BatchSystems.test:
            process_class = TestProcess
        elif batch_system == BatchSystems.local:
            create_output_dirs(task)
            return super()._create_task_process(task)
        else:
            raise NotImplementedError

        return process_class(task=task, scheduler=self._scheduler,
                             result_queue=self._task_result_queue, worker_timeout=self._config.timeout)
Example #14
    def _build_gbasf2_submit_command(self):
        """
        Function to create the gbasf2 submit command to pass to run_with_gbasf2
        from the task options and attributes.
        """
        gbasf2_release = get_setting("gbasf2_release",
                                     default=get_basf2_git_hash(),
                                     task=self.task)
        gbasf2_additional_files = get_setting("gbasf2_additional_files",
                                              default=[],
                                              task=self.task)
        assert not isinstance(
            gbasf2_additional_files, str
        ), "gbasf2_additional_files should be a list or tuple, not a string."
        gbasf2_input_sandbox_files = [os.path.basename(self.pickle_file_path)
                                      ] + gbasf2_additional_files
        gbasf2_command_str = (
            f"gbasf2 {self.wrapper_file_path} -f {' '.join(gbasf2_input_sandbox_files)} "
            + f"-p {self.gbasf2_project_name} -s {gbasf2_release} ")

        gbasf2_input_dataset = get_setting("gbasf2_input_dataset",
                                           default=False,
                                           task=self.task)
        if gbasf2_input_dataset is not False:
            gbasf2_command_str += f" -i {gbasf2_input_dataset} "

        gbasf2_n_repition_jobs = get_setting("gbasf2_n_repition_job",
                                             default=False,
                                             task=self.task)
        if gbasf2_n_repition_jobs is not False:
            gbasf2_command_str += f" --repetition {gbasf2_n_repition_jobs} "

        # now add some additional optional options to the gbasf2 job submission string

        # whether to submit the job without asking the user for confirmation (adds ``--force``)
        force_submission = get_setting("gbasf2_force_submission",
                                       default=True,
                                       task=self.task)
        if force_submission:
            gbasf2_command_str += " --force "

        # estimated cpu time per sub-job in minutes
        cpu_minutes = get_setting("gbasf2_cputime",
                                  default=False,
                                  task=self.task)
        if cpu_minutes is not False:
            gbasf2_command_str += f" --cputime {cpu_minutes} "

        # estimated number of processed events per second
        evtpersec = get_setting("gbasf2_evtpersec",
                                default=False,
                                task=self.task)
        if evtpersec is not False:
            gbasf2_command_str += f" --evtpersec {evtpersec} "

        # gbasf2 job priority
        priority = get_setting("gbasf2_priority",
                               default=False,
                               task=self.task)
        if priority is not False:
            assert 0 <= priority <= 10, "Priority should be integer between 0 and 10."
            gbasf2_command_str += f" --priority {priority} "

        # gbasf2 job type (e.g. User, Production, ...)
        jobtype = get_setting("gbasf2_jobtype", default=False, task=self.task)
        if jobtype is not False:
            gbasf2_command_str += f" --jobtype {jobtype} "

        # additional basf2 options to use on grid
        basf2opt = get_setting("gbasf2_basf2opt",
                               default=False,
                               task=self.task)
        if basf2opt is not False:
            gbasf2_command_str += f" --basf2opt='{basf2opt}' "

        # optional string of additional parameters to append to gbasf2 command
        gbasf2_additional_params = get_setting("gbasf2_additional_params",
                                               default=False,
                                               task=self.task)
        if gbasf2_additional_params is not False:
            gbasf2_command_str += f" {gbasf2_additional_params} "

        gbasf2_command = shlex.split(gbasf2_command_str)
        return gbasf2_command
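
The assembled string is finally split into an argument list with shlex.split, which also removes the quoting around --basf2opt. A quick sketch with made-up values:

import shlex

command_str = ("gbasf2 steering_file_wrapper.py -f serialized_basf2_path.pkl "
               "-p my_project_a1b2c3d4e5 -s release-05-02-00  --basf2opt='-n 100' ")
print(shlex.split(command_str))
# ['gbasf2', 'steering_file_wrapper.py', '-f', 'serialized_basf2_path.pkl',
#  '-p', 'my_project_a1b2c3d4e5', '-s', 'release-05-02-00', '--basf2opt=-n 100']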
Example #15
    def get_job_status(self):
        """
        Get overall status of the gbasf2 project.

        First obtain the status of all (sub-) jobs in a gbasf2 project, similar
        to ``gb2_job_status``, and return an overall project status, e.g. when
        all jobs are done, return ``JobStatus.successful`` to show that the
        gbasf2 project succeeded.

        The status of each individual job can be one of::

            [Submitting, Submitted, Received, Checking, Staging, Waiting, Matched, Rescheduled,
             Running, Stalled, Completing, Done, Completed, Failed, Deleted, Killed]

        (Taken from  https://github.com/DIRACGrid/DIRAC/blob/rel-v7r1/WorkloadManagementSystem/Client/JobStatus.py)

        """
        # If the project does not exist on the grid yet, we cannot query for the gbasf2 project status
        if not check_project_exists(self.gbasf2_project_name,
                                    dirac_user=self.dirac_user):
            raise RuntimeError(
                f"\nCould not find any jobs for project {self.gbasf2_project_name} on the grid.\n"
                +
                "Probably there was an error during the project submission when running the gbasf2 command.\n"
                +
                "Try if you can run the gbasf2 command used manually in a terminal with gbasf2 set up:\n"
                + " ".join(self._build_gbasf2_submit_command()))

        job_status_dict = get_gbasf2_project_job_status_dict(
            self.gbasf2_project_name, dirac_user=self.dirac_user)
        n_jobs_by_status = Counter()
        for _, job_info in job_status_dict.items():
            n_jobs_by_status[job_info["Status"]] += 1

        # print summary of jobs in project if setting is set and job status changed
        if (get_setting(
                "gbasf2_print_status_updates", default=True, task=self.task)
                and n_jobs_by_status != self._n_jobs_by_status):
            time_string = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            job_status_string = str(dict(sorted(
                n_jobs_by_status.items()))).strip("{}")
            print(
                f"Jobs in gbasf2 project \"{self.gbasf2_project_name}\" at {time_string}: {job_status_string}"
            )
        self._n_jobs_by_status = n_jobs_by_status

        n_jobs = len(job_status_dict)
        n_done = n_jobs_by_status["Done"]
        # note: "Killed" jobs also get moved to "Failed" after a while
        n_failed = n_jobs_by_status["Failed"]
        n_in_final_state = n_done + n_failed

        # The gbasf2 project is considered as failed if any of the jobs in it failed.
        # However, we first try to reschedule those jobs and only declare the project as failed
        # if the maximum number of retries for rescheduling has been reached.
        if n_failed > 0:
            self._on_failure_action()
            if self.max_retries > 0 and self._reschedule_failed_jobs():
                return JobStatus.running
            return JobStatus.aborted

        if n_in_final_state < n_jobs:
            return JobStatus.running

        # Require all jobs to be done for project success, any job failure results in a failed project
        if n_done == n_jobs:
            # download dataset only the first time that we return JobStatus.successful
            if not self._project_had_been_successful:
                self._on_first_success_action()
                self._project_had_been_successful = True
            return JobStatus.successful

        raise RuntimeError("Could not determine JobStatus")
Example #16
    def _create_htcondor_submit_file(self):
        submit_file_content = []

        # Specify where to write the log to
        log_file_dir = get_log_file_dir(self.task)
        os.makedirs(log_file_dir, exist_ok=True)

        stdout_log_file = os.path.abspath(os.path.join(log_file_dir, "stdout"))
        submit_file_content.append(f"output = {stdout_log_file}")

        stderr_log_file = os.path.abspath(os.path.join(log_file_dir, "stderr"))
        submit_file_content.append(f"error = {stderr_log_file}")

        job_log_file = os.path.abspath(os.path.join(log_file_dir, "job.log"))
        submit_file_content.append(f"log = {job_log_file}")

        # Specify the executable
        executable_file = create_executable_wrapper(self.task)
        submit_file_content.append(
            f"executable = {os.path.basename(executable_file)}")

        # Specify additional settings
        # copy so that the dict returned by get_setting is not mutated in place below
        general_settings = dict(get_setting("htcondor_settings", dict()))
        try:
            general_settings.update(self.task.htcondor_settings)
        except AttributeError:
            pass

        transfer_files = get_setting("transfer_files",
                                     task=self.task,
                                     default=[])
        if transfer_files:
            working_dir = get_setting("working_dir",
                                      task=self.task,
                                      default="")
            if not working_dir or working_dir != ".":
                raise ValueError(
                    "If using transfer_files, the working_dir must be explicitely set to '.'"
                )

            general_settings.setdefault("should_transfer_files", "YES")
            general_settings.setdefault("when_to_transfer_output", "ON_EXIT")

            transfer_files = set(transfer_files)

            for transfer_file in transfer_files:
                if os.path.abspath(transfer_file) != transfer_file:
                    raise ValueError(
                        "You should only give absolute file names in transfer_files! "
                        f"{os.path.abspath(transfer_file)} != {transfer_file}")

            env_setup_script = get_setting("env_script",
                                           task=self.task,
                                           default="")
            if env_setup_script:
                # TODO: make sure to call it relatively
                transfer_files.add(os.path.abspath(env_setup_script))

            general_settings.setdefault("transfer_input_files",
                                        ",".join(transfer_files))

        job_name = get_setting("job_name", task=self.task, default=False)
        if job_name is not False:
            general_settings.setdefault("JobBatchName", job_name)

        for key, item in general_settings.items():
            submit_file_content.append(f"{key} = {item}")

        # Finally also start the process
        submit_file_content.append("queue 1")

        # Now we can write the submit file
        output_path = get_task_file_dir(self.task)
        submit_file_path = os.path.join(output_path, "job.submit")

        os.makedirs(output_path, exist_ok=True)

        with open(submit_file_path, "w") as submit_file:
            submit_file.write("\n".join(submit_file_content))

        return submit_file_path