def start_job(self):
    command = ["bsub", "-env all"]

    queue = get_setting("queue", task=self.task, default=False)
    if queue is not False:
        command += ["-q", queue]

    job_name = get_setting("job_name", task=self.task, default=False)
    if job_name is not False:
        command += ["-J", job_name]

    log_file_dir = get_log_file_dir(self.task)
    os.makedirs(log_file_dir, exist_ok=True)

    stdout_log_file = os.path.join(log_file_dir, "stdout")
    stderr_log_file = os.path.join(log_file_dir, "stderr")
    command += ["-eo", stderr_log_file, "-oo", stdout_log_file]

    executable_file = create_executable_wrapper(self.task)
    command.append(executable_file)

    output = subprocess.check_output(command)
    output = output.decode()

    # Output of the form: Job <72065926> is submitted to default queue <s>.
    match = re.search(r"<[0-9]+>", output)
    if not match:
        raise RuntimeError("Batch submission failed with output " + output)

    self._batch_job_id = match.group(0)[1:-1]
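# Standalone sketch of the job-ID extraction used in ``start_job`` above.
# The sample confirmation line follows the format quoted in the comment there;
# the job ID and queue name are made up for illustration.
import re

sample_output = "Job <72065926> is submitted to default queue <s>.\n"
match = re.search(r"<[0-9]+>", sample_output)
assert match is not None
batch_job_id = match.group(0)[1:-1]  # strip the surrounding angle brackets
print(batch_job_id)  # -> "72065926"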
def start_job(self):
    log_file_dir = get_log_file_dir(self.task)
    os.makedirs(log_file_dir, exist_ok=True)

    stdout_log_file = os.path.join(log_file_dir, "stdout")
    stderr_log_file = os.path.join(log_file_dir, "stderr")

    executable_file = create_executable_wrapper(self.task)

    with open(stdout_log_file, "w") as stdout_file:
        with open(stderr_log_file, "w") as stderr_file:
            self._process = subprocess.Popen([executable_file], stdout=stdout_file, stderr=stderr_file)
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)

    #: gbasf2 project name, must be a property/attribute of the task, e.g. a luigi parameter.
    # Setting it via a settings.json file is not supported, to make sure users set unique project names.
    self.gbasf2_project_name = get_unique_project_name(self.task)

    #: Output file directory of the task to wrap with gbasf2, where we will
    # store the pickled basf2 path and the created steering file that executes that path.
    task_file_dir = get_task_file_dir(self.task)
    os.makedirs(task_file_dir, exist_ok=True)
    #: File name in which the pickled basf2 path from ``self.task.create_path()`` will be stored.
    self.pickle_file_path = os.path.join(task_file_dir, "serialized_basf2_path.pkl")
    #: File name of the steering file that executes the pickled path, which will be sent to the grid.
    self.wrapper_file_path = os.path.join(task_file_dir, "steering_file_wrapper.py")

    self.log_file_dir = get_log_file_dir(self.task)
    os.makedirs(self.log_file_dir, exist_ok=True)

    self.dirac_user = get_dirac_user()
    #: Maximum number of times that each job in the project can be rescheduled before the project is declared as failed.
    self.max_retries = get_setting("gbasf2_max_retries", default=0, task=self.task)

    #: Store the number of times each job has been rescheduled.
    self.n_retries_by_job = Counter()

    #: Local storage for the ``n_retries_by_job`` counter,
    # so that it persists even if the luigi process is killed and restarted.
    self.retries_file_path = os.path.join(self.log_file_dir, "n_retries_by_grid_job.json")
    if os.path.isfile(self.retries_file_path):
        with open(self.retries_file_path, "r") as retries_file:
            retries_from_file = json.load(retries_file)
        self.n_retries_by_job.update(retries_from_file)

    # Store the dictionary with n_jobs_by_status in an attribute to check whether it changed,
    # useful for printing the job status on change only.
    self._n_jobs_by_status = ""

    # Store whether the project has already been successful in a variable, because
    # there are actions we want to perform only the first time that
    # ``get_job_status`` returns a success.
    self._project_had_been_successful = False
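# Illustrative sketch of the persistence round trip for the retries counter set up
# in ``__init__`` above: a ``Counter`` is dumped to JSON and restored on restart.
# This is not the class's own code; the file path and job IDs are made up.
import json
from collections import Counter

retries_file_path = "/tmp/n_retries_by_grid_job.json"

n_retries_by_job = Counter({"job_0": 2, "job_1": 1})
with open(retries_file_path, "w") as retries_file:
    json.dump(n_retries_by_job, retries_file)  # Counter serializes like a plain dict

restored = Counter()
with open(retries_file_path, "r") as retries_file:
    restored.update(json.load(retries_file))
assert restored == n_retries_by_job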
def run_task_remote(task):
    """
    Run a given task "remotely", which means create an executable script
    and call it via a subprocess call.
    """
    log_file_dir = get_log_file_dir(task)
    os.makedirs(log_file_dir, exist_ok=True)

    stdout_log_file = os.path.join(log_file_dir, "stdout")
    stderr_log_file = os.path.join(log_file_dir, "stderr")

    executable_file = create_executable_wrapper(task)
    add_on_failure_function(task)

    with open(stdout_log_file, "w") as stdout_file:
        with open(stderr_log_file, "w") as stderr_file:
            return_code = subprocess.call([executable_file], stdout=stdout_file, stderr=stderr_file)

    if return_code:
        raise RuntimeError(f"Execution failed with return code {return_code}")
def _create_htcondor_submit_file(self):
    submit_file_content = []

    # Specify where to write the log to
    log_file_dir = get_log_file_dir(self.task)
    os.makedirs(log_file_dir, exist_ok=True)

    stdout_log_file = os.path.abspath(os.path.join(log_file_dir, "stdout"))
    submit_file_content.append(f"output = {stdout_log_file}")

    stderr_log_file = os.path.abspath(os.path.join(log_file_dir, "stderr"))
    submit_file_content.append(f"error = {stderr_log_file}")

    job_log_file = os.path.abspath(os.path.join(log_file_dir, "job.log"))
    submit_file_content.append(f"log = {job_log_file}")

    # Specify the executable
    executable_file = create_executable_wrapper(self.task)
    submit_file_content.append(f"executable = {os.path.basename(executable_file)}")

    # Specify additional settings
    general_settings = get_setting("htcondor_settings", dict())
    try:
        general_settings.update(self.task.htcondor_settings)
    except AttributeError:
        pass

    transfer_files = get_setting("transfer_files", task=self.task, default=[])
    if transfer_files:
        working_dir = get_setting("working_dir", task=self.task, default="")
        if not working_dir or working_dir != ".":
            raise ValueError("If using transfer_files, the working_dir must be explicitly set to '.'")

        general_settings.setdefault("should_transfer_files", "YES")
        general_settings.setdefault("when_to_transfer_output", "ON_EXIT")

        transfer_files = set(transfer_files)

        for transfer_file in transfer_files:
            if os.path.abspath(transfer_file) != transfer_file:
                raise ValueError("You should only give absolute file names in transfer_files! " +
                                 f"{os.path.abspath(transfer_file)} != {transfer_file}")

        env_setup_script = get_setting("env_script", task=self.task, default="")
        if env_setup_script:
            # TODO: make sure to call it relatively
            transfer_files.add(os.path.abspath(env_setup_script))

        general_settings.setdefault("transfer_input_files", ",".join(transfer_files))

    job_name = get_setting("job_name", task=self.task, default=False)
    if job_name is not False:
        general_settings.setdefault("JobBatchName", job_name)

    for key, item in general_settings.items():
        submit_file_content.append(f"{key} = {item}")

    # Finally also start the process
    submit_file_content.append("queue 1")

    # Now we can write the submit file
    output_path = get_task_file_dir(self.task)
    submit_file_path = os.path.join(output_path, "job.submit")
    os.makedirs(output_path, exist_ok=True)

    with open(submit_file_path, "w") as submit_file:
        submit_file.write("\n".join(submit_file_content))

    return submit_file_path
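# Hypothetical configuration sketch for the submit-file builder above, assuming the
# standard b2luigi settings mechanism (``b2luigi.set_setting``); the values and paths
# are made up. With settings like these, the generated job.submit would contain matching
# "output", "error", "log", "transfer_input_files" and "JobBatchName" lines plus "queue 1".
import b2luigi

b2luigi.set_setting("htcondor_settings", {"request_memory": "2 GB"})
b2luigi.set_setting("transfer_files", ["/abs/path/to/extra_input.root"])  # absolute paths only
b2luigi.set_setting("working_dir", ".")  # required when transfer_files is used
b2luigi.set_setting("env_script", "/abs/path/to/setup_env.sh")
b2luigi.set_setting("job_name", "my_analysis_job")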