def job_progress(self, iens):
    """Will return a detailed progress report for the job.

    The progress report is obtained by reading a file from the
    filesystem. That file is typically created by another process,
    running on another machine, and reading might fail due to NFS
    issues, simultaneous writes and so on. If loading valid json
    fails, the function will sleep 0.10 seconds and retry -
    eventually giving up and returning None. Also for jobs which
    have not yet started, the method will return None.

    When the method succeeds in reading the progress file from the
    file system, the return value will be an object with properties
    like this:

        progress.start_time
        progress.end_time
        progress.run_id
        progress.jobs = [
            (job1.name, job1.start_time, job1.end_time, job1.status, job1.error_msg),
            (job2.name, job2.start_time, job2.end_time, job2.status, job2.error_msg),
            ....
            (jobN.name, jobN.start_time, jobN.end_time, jobN.status, jobN.error_msg)
        ]
    """
    if iens not in self._run_args:
        raise KeyError("No such simulation: %s" % iens)

    run_arg = self._run_args[iens]
    try:
        # Will throw if the job is not yet submitted (it is in a limbo state).
        queue_index = run_arg.getQueueIndex()
    except ValueError:
        return None

    if self._queue_manager.isJobWaiting(queue_index):
        return None

    return ForwardModelStatus.load(run_arg.runpath)
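# A minimal polling sketch (an illustration, not part of the original
# API): `monitor` is assumed to be the object exposing job_progress(),
# and `iens` an existing realization index. Since job_progress() returns
# None both while the job is waiting and while the status file cannot be
# read, callers are expected to poll.
import time

def wait_for_progress(monitor, iens, poll_interval=2.0):
    """Poll job_progress() until a progress report appears."""
    while True:
        progress = monitor.job_progress(iens)
        if progress is not None:
            return progress
        time.sleep(poll_interval)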
def update_progress_for_index(self, iteration, idx, run_arg):
    if not self._run_context.is_active(idx):
        return
    try:
        # Will throw if the job is not yet submitted (it is in a limbo state).
        queue_index = run_arg.getQueueIndex()
    except ValueError:
        return

    status = None
    if self._job_queue:
        status = self._job_queue.getJobStatus(queue_index)

    # Avoid reading from disk for jobs in these states, since there is
    # no data anyway.
    if status in [
        JobStatusType.JOB_QUEUE_PENDING,
        JobStatusType.JOB_QUEUE_SUBMITTED,
        JobStatusType.JOB_QUEUE_WAITING,
    ]:
        return

    fms = self.realization_progress[iteration].get(run_arg.iens, None)
    # Don't load from file if the forward model is finished.
    if fms and BaseRunModel.is_forward_model_finished(fms[0]):
        jobs = self.realization_progress[iteration][run_arg.iens][0]
    else:
        fms = ForwardModelStatus.load(run_arg.runpath, num_retry=1)
        if not fms:
            return
        jobs = fms.jobs

    self.realization_progress[iteration][run_arg.iens] = jobs, status
def updateDetailedProgress(self):
    if not self._run_context:
        return

    iteration = self._run_context.get_iter()
    if iteration not in self.realization_progress:
        self.realization_progress[iteration] = {}

    for run_arg in self._run_context:
        if not run_arg:
            continue
        try:
            # Will throw if the job is not yet submitted (it is in a limbo state).
            queue_index = run_arg.getQueueIndex()
        except ValueError:
            continue

        fms = self.realization_progress[iteration].get(run_arg.iens, None)
        # Don't load from file if the forward model is finished.
        if fms and BaseRunModel.is_forward_model_finished(fms):
            continue

        fms = ForwardModelStatus.load(run_arg.runpath, num_retry=1)
        if not fms:
            continue

        jobs = fms.jobs
        self.realization_progress[iteration][run_arg.iens] = jobs
def test_complete(self):
    with TestAreaContext("json_from_forward_model_NO_DATA_ROOT"):
        with open("jobs.json", "w") as f:
            f.write(JSON_STRING_NO_DATA_ROOT)

        jobm = JobManager()
        jobm.complete()

        st = ForwardModelStatus.load(os.getcwd())
        self.assertTrue(isinstance(st.end_time, datetime.datetime))
        self.assertTrue(isinstance(st.start_time, datetime.datetime))
        self.assertTrue(st.end_time >= st.start_time)

        dt = datetime.datetime.now() - st.start_time
        self.assertTrue(dt.total_seconds() < 5)
def update_progress_for_index(self, iteration: int, idx, run_arg: RunArg) -> None:
    try:
        # Will throw if the job is not yet submitted (it is in a limbo state).
        queue_index = run_arg.getQueueIndex()
    except (ValueError, AttributeError):
        return

    status = None
    timed_out = False
    if self._job_queue:
        status = self._job_queue.getJobStatus(queue_index)
        timed_out = self._job_queue.did_job_time_out(queue_index)

    # Avoid reading from disk for jobs in these states, since there is
    # no data anyway. If timed out, never exit here, as that would
    # prevent propagation of the failure status.
    if (
        status in [
            JobStatusType.JOB_QUEUE_PENDING,
            JobStatusType.JOB_QUEUE_SUBMITTED,
            JobStatusType.JOB_QUEUE_WAITING,
        ]
        and not timed_out
    ):
        return

    fms = self.realization_progress[iteration].get(run_arg.iens, None)
    jobs = fms[0] if fms else None

    # Don't load from file if the forward model is finished.
    if not fms or not BaseRunModel.is_forward_model_finished(fms[0]):
        loaded = ForwardModelStatus.load(run_arg.runpath, num_retry=1)
        if not loaded and not timed_out:
            # If this idx timed out, returning here would prevent
            # non-successful jobs from being marked as failed (timed out).
            # So return only in the case where it did not time out.
            return
        if loaded:
            jobs = loaded.jobs

    if timed_out:
        for job in jobs:
            if job.status != "Success":
                job.error = "The run is cancelled due to reaching MAX_RUNTIME"
                job.status = "Failure"

    self.realization_progress[iteration][run_arg.iens] = jobs, status
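# Hedged illustration of the MAX_RUNTIME override above: once the queue
# reports a timeout, every job that has not reached "Success" is
# rewritten as a failure. FakeJob is a stand-in for the per-job status
# objects, not a class from the original code.
from types import SimpleNamespace as FakeJob

jobs = [FakeJob(status="Success", error=None),
        FakeJob(status="Running", error=None)]
for job in jobs:
    if job.status != "Success":
        job.error = "The run is cancelled due to reaching MAX_RUNTIME"
        job.status = "Failure"

assert jobs[0].status == "Success" and jobs[1].status == "Failure"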
def updateDetailedProgress(self):
    if not self._run_context:
        return

    iteration = self._run_context.get_iter()
    if iteration not in self.realization_progress:
        self.realization_progress[iteration] = {}

    for idx, run_arg in enumerate(self._run_context):
        if not self._run_context.is_active(idx):
            continue
        try:
            # Will throw if the job is not yet submitted (it is in a limbo state).
            queue_index = run_arg.getQueueIndex()
        except ValueError:
            continue

        status = None
        if self._job_queue:
            status = self._job_queue.getJobStatus(queue_index)

        if status in [
            JobStatusType.JOB_QUEUE_PENDING,
            JobStatusType.JOB_QUEUE_SUBMITTED,
            JobStatusType.JOB_QUEUE_WAITING,
        ]:
            continue

        fms = self.realization_progress[iteration].get(run_arg.iens, None)
        # Don't load from file if the forward model is finished.
        if fms and BaseRunModel.is_forward_model_finished(fms[0]):
            jobs = self.realization_progress[iteration][run_arg.iens][0]
        else:
            fms = ForwardModelStatus.load(run_arg.runpath, num_retry=1)
            if not fms:
                continue
            jobs = fms.jobs

        self.realization_progress[iteration][run_arg.iens] = jobs, status
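# Note on the bookkeeping above: realization_progress maps
# iteration -> {iens: (jobs, queue_status)}, where `jobs` is the job
# status list loaded from the runpath and `queue_status` is a
# JobStatusType value (or None when there is no job queue).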
class JobManager(object):
    LOG_file = "JOB_LOG"
    EXIT_file = "ERROR"
    STATUS_file = "STATUS"
    OK_file = "OK"
    DEFAULT_UMASK = 0
    sleep_time = 10  # Time to sleep before exiting the script - to let the disks sync up.

    def __init__(self, module_file="jobs.py", json_file="jobs.json",
                 error_url=None, log_url=None):
        self._job_map = {}
        self.simulation_id = ""
        self.ert_pid = ""
        self._log_url = log_url
        if log_url is None:
            self._log_url = error_url
        self._data_root = None
        self.global_environment = None
        self.global_update_path = None

        self.start_time = dt.now()
        if json_file is not None and os.path.isfile(json_file):
            self.job_status = ForwardModelStatus("????", self.start_time)
            self._loadJson(json_file)
            self.job_status.run_id = self.simulation_id
        else:
            raise IOError("'jobs.json' not found.")

        self.max_runtime = 0  # This option is currently sleeping
        self.short_sleep = 2  # Sleep between status checks
        self.node = socket.gethostname()
        pw_entry = pwd.getpwuid(os.getuid())
        self.user = pw_entry.pw_name

        os_info = _read_os_release()
        _, _, release, _, _ = os.uname()
        python_vs, _ = sys_version.split('\n')
        ecl_v = EclVersion()
        res_v = ResVersion()
        logged_fields = {
            "status": "init",
            "python_sys_path": list(map(pad_nonexisting, sys.path)),
            "pythonpath": list(map(pad_nonexisting,
                                   os.environ.get('PYTHONPATH', '').split(':'))),
            "res_version": res_v.versionString(),
            "ecl_version": ecl_v.versionString(),
            "LSB_ID": os_info.get('LSB_ID', ''),
            "LSB_VERSION_ID": os_info.get('LSB_VERSION_ID', ''),
            "python_version": python_vs,
            "kernel_version": release,
        }
        logged_fields.update({"jobs": list(self._job_map.values())})
        self.postMessage(extra_fields=logged_fields)

        cond_unlink("EXIT")
        cond_unlink(self.EXIT_file)
        cond_unlink(self.STATUS_file)
        cond_unlink(self.OK_file)
        self.initStatusFile()

        if self._data_root:
            os.environ["DATA_ROOT"] = self._data_root
        self.set_environment()
        self.update_path()
        self.information = logged_fields

    def complete(self):
        self.job_status.complete()

    def dump_status(self):
        self.job_status.dump(ForwardModelStatus.STATUS_FILE)

    def set_environment(self):
        if self.global_environment:
            data = self.global_environment
            for key in data.keys():
                os.environ[key] = data[key]

    def update_path(self):
        if self.global_update_path:
            data = self.global_update_path
            for key in data.keys():
                if os.environ.get(key):
                    os.environ[key] = data[key] + ':' + os.environ[key]
                else:
                    os.environ[key] = data[key]

    def data_root(self):
        return self._data_root

    def _loadJson(self, json_file_name):
        try:
            with open(json_file_name, "r") as json_file:
                jobs_data = json.load(json_file)
        except ValueError as e:
            raise IOError("Job Manager failed to load JSON-file: " + str(e))

        self._data_root = jobs_data.get("DATA_ROOT")

        umask = _jsonGet(jobs_data, "umask")
        os.umask(int(umask, 8))

        if "run_id" in jobs_data:
            self.simulation_id = _jsonGet(jobs_data, "run_id")
            os.environ["ERT_RUN_ID"] = self.simulation_id
        if "ert_pid" in jobs_data:
            self.ert_pid = _jsonGet(jobs_data, "ert_pid")
        if "global_environment" in jobs_data:
            self.global_environment = _jsonGet(jobs_data, "global_environment")
        if "global_update_path" in jobs_data:
            self.global_update_path = _jsonGet(jobs_data, "global_update_path")

        self.job_list = _jsonGet(jobs_data, "jobList")
        self._ensureCompatibleJobList()
        self._buildJobMap()

        for job in self.job_list:
            self.job_status.add_job(ForwardModelJobStatus(job.get("name")))

        # "Monkey-patch" the job dicts by attaching a status object, to
        # ensure compatibility with old versions.
        status_list = self.job_status.jobs
        for i in range(len(self.job_list)):
            self.job_list[i]["status"] = status_list[i]
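# A minimal jobs.json sketch, inferred from the keys _loadJson() reads
# above. "umask", "jobList" and the per-job fields shown are the ones the
# loader and _buildJobMap() touch; "run_id", "ert_pid", "DATA_ROOT",
# "global_environment" and "global_update_path" are optional. All values
# here are illustrative only.
import json

minimal_jobs = {
    "umask": "0022",      # parsed with int(umask, 8)
    "run_id": "run-0",    # optional; exported as ERT_RUN_ID
    "jobList": [
        {
            "name": "COPY_FILE",
            "executable": "/usr/bin/cp",
            "argList": ["input.txt", "output.txt"],
            "stdout": "copy_file.stdout",  # _buildJobMap() renames to copy_file.stdout.0
            "stderr": "copy_file.stderr",  # _buildJobMap() renames to copy_file.stderr.0
        }
    ],
}

with open("jobs.json", "w") as f:
    json.dump(minimal_jobs, f)

# JobManager() would now pick up jobs.json from the current directory.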
    def _ensureCompatibleJobList(self):
        for job in self.job_list:
            if "max_running_minutes" not in job.keys():
                job["max_running_minutes"] = None

    def _buildJobMap(self):
        self._job_map = {}
        for index, job in enumerate(self.job_list):
            self._job_map[job["name"]] = job
            if "stderr" in job:
                if job["stderr"]:
                    job["stderr"] = "%s.%d" % (job["stderr"], index)
            if "stdout" in job:
                if job["stdout"]:
                    job["stdout"] = "%s.%d" % (job["stdout"], index)

    def __contains__(self, key):
        return key in self._job_map

    def __len__(self):
        return len(self.job_list)

    def __repr__(self):
        st = self.start_time
        node = self.node
        us = self.user
        cnt = 'len=%d, start=%s, node=%s, user=%s'
        cnt = cnt % (len(self), st, node, us)
        return 'JobManager(%s)' % cnt

    def __getitem__(self, index):
        if isinstance(index, int):
            return self.job_list[index]
        else:
            return self._job_map[index]

    def initStatusFile(self):
        with open(self.STATUS_file, "a") as f:
            f.write("%-32s: %s/%s\n" % ("Current host", self.node, os.uname()[4]))
            if "LSF_JOBID" in os.environ:
                f.write("LSF JOBID: %s\n" % os.environ.get("LSF_JOBID"))
            else:
                f.write("LSF JOBID: not running LSF\n")

    def startStatus(self, job):
        with open(self.STATUS_file, "a") as f:
            now = time.localtime()
            f.write("%-32s: %02d:%02d:%02d .... " %
                    (job["name"], now.tm_hour, now.tm_min, now.tm_sec))

    def completeStatus(self, exit_status, error_msg, job=None):
        now = time.localtime()
        extra_fields = {
            "finished": True,
            "exit_status": exit_status,
            "status": "completeStatus"
        }
        with open(self.STATUS_file, "a") as f:
            if exit_status == 0:
                status = ""
            else:
                status = " EXIT: %d/%s" % (exit_status, error_msg)
                extra_fields.update({"error_msg": error_msg})
            f.write("%02d:%02d:%02d %s\n" %
                    (now.tm_hour, now.tm_min, now.tm_sec, status))

    def createOKFile(self):
        now = time.localtime()
        with open(self.OK_file, "w") as f:
            f.write("All jobs complete %02d:%02d:%02d \n" %
                    (now.tm_hour, now.tm_min, now.tm_sec))
        self.postMessage(extra_fields={"status": "OK"})
        time.sleep(self.sleep_time)  # Let the disks sync up

    def getStartTime(self):
        return self.start_time

    def getRuntime(self):
        rt = dt.now() - self.start_time
        return rt.total_seconds()

    def assertArgList(self, job):
        if "arg_types" in job:
            argTypes = job["arg_types"]
            argList = job.get("argList")
            for index, arg_type in enumerate(argTypes):
                if arg_type == "RUNTIME_FILE":
                    file_path = os.path.join(os.getcwd(), argList[index])
                    if not os.path.isfile(file_path):
                        raise TypeError(
                            "In job \"%s\": RUNTIME_FILE \"%s\" does not exist."
                            % (job["name"], argList[index]))
                if arg_type == "RUNTIME_INT":
                    try:
                        int(argList[index])
                    except ValueError:
                        raise ValueError(
                            "In job \"%s\": argument with index %d is of "
                            "incorrect type, should be integer."
                            % (job["name"], index))
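    # assertArgList() validates arguments against an optional "arg_types"
    # list. A hedged example of a job entry exercising both checks (the
    # names and values are illustrative):
    #
    #   job = {
    #       "name": "EXAMPLE_JOB",
    #       "executable": "/usr/bin/env",
    #       "argList": ["target.txt", "3"],
    #       "arg_types": ["RUNTIME_FILE", "RUNTIME_INT"],
    #   }
    #
    # With this entry, assertArgList(job) raises TypeError if "target.txt"
    # does not exist in the cwd, and ValueError if the second argument
    # could not be parsed as an integer.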
% (job["name"], index)) def execJob(self, job): executable = job.get('executable') assert_file_executable(executable) start_time = time.time() if job.get("stdin"): redirect_input(job["stdin"], 0) if job.get("stdout"): redirect_output(job["stdout"], 1, start_time) if job.get("stderr"): redirect_output(job["stderr"], 2, start_time) if job.get("environment"): env = job["environment"] for key in env.keys(): os.putenv(key, env[key]) self.assertArgList(job) argList = [executable] if job.get("argList"): argList += job["argList"] os.execvp(executable, argList) def jobProcess(self, job): executable = job.get('executable') assert_file_executable(executable) argList = [executable] if job.get("argList"): argList += job["argList"] if job.get("stdin"): stdin = open(job.get("stdin")) else: stdin = None if job.get("stderr"): stderr = open(job.get("stderr"), "w") else: stderr = None if job.get("stdout"): stdout = open(job.get("stdout"), "w") else: stdout = None P = subprocess.Popen(argList, stdin=stdin, stdout=stdout, stderr=stderr, env=job.get("environment")) return P def postMessage(self, job=None, extra_fields={}, url=None): if url is None: url = self._log_url if job: job_fields = { "ert_job": job["name"], "executable": job["executable"], "arg_list": " ".join(job["argList"]) } job_fields.update(extra_fields) extra_fields = job_fields payload = { "user": self.user, "cwd": os.getcwd(), "application": "ert", "subsystem": "ert_forward_model", "node": self.node, "start_time": self.start_time.isoformat(), "node_timestamp": dt.now().isoformat(), "simulation_id": self.simulation_id, "ert_pid": self.ert_pid } payload.update(extra_fields) try: if url is None: sys.stderr.write( '\nWARNING: LOG/ERROR URL NOT CONFIGURED.\n\n') sys.stderr.write(json.dumps(payload)) sys.stderr.write('\nAbove error log NOT submitted.') sys.stderr.flush() else: data = json.dumps(payload) #Disabling proxies proxies = { "http": None, "https": None, } res = requests.post( url, timeout=3, headers={"Content-Type": "application/json"}, data=data, proxies=proxies) # sys.stdout.write("Response status %s\n"%res.status_code) # sys.stdout.write("Request url %s\n"%res.url) # sys.stdout.write("Response headers %s\n"%res.headers) # sys.stdout.write("Response content %s\n"%res.content) # sys.stdout.write("Writing payload: %s\n"%payload) # sys.stdout.write("Writing data: %s\n"%data) except: pass def postError(self, job, error_msg): extra_fields = self.extract_stderr_stdout(job) extra_fields.update({"status": "error", "finished": True}) self.postMessage(job, extra_fields, url=self._log_url) def extract_stderr_stdout(self, job): extra_fields = {} if job.get("stderr"): if os.path.exists(job["stderr"]): with open(job["stderr"], "r") as errH: stderr = errH.read() extra_fields.update({"stderr": stderr}) if job.get("stdout"): if os.path.exists(job["stdout"]): with open(job["stdout"], "r") as outH: stdout = outH.read() extra_fields.update({"stdout": stdout}) return extra_fields def exit(self, job, exit_status, error_msg): self.dump_EXIT_file(job, error_msg) std_err_out = self.extract_stderr_stdout(job) std_err_out.update({ "status": "exit", "finished": True, "error_msg": error_msg, "exit_status": exit_status, "error": True }) self.postMessage(job=job, extra_fields=std_err_out) #Posts to new logstash pgid = os.getpgid(os.getpid()) os.killpg(pgid, signal.SIGKILL) def addLogLine(self, job): now = time.localtime() with open(self.LOG_file, "a") as f: args = " ".join(job["argList"]) f.write("%02d:%02d:%02d Calling: %s %s\n" % (now.tm_hour, now.tm_min, now.tm_sec, 
    def runJob(self, job):
        assert_file_executable(job.get('executable'))
        self.addLogLine(job)

        status = job["status"]
        status.start_time = dt.now()
        status.status = "Running"
        self.job_status.dump()

        exec_env = job.get("exec_env")
        if exec_env:
            with open("%s_exec_env.json" % job.get("name"), "w") as f:
                f.write(json.dumps(exec_env))

        pid = os.fork()
        exit_status, err_msg = 0, ''
        if pid == 0:
            # This code block should exec into the actual executable we are
            # running, and execution should not come back here. However - if
            # the code fails with an exception before actually reaching the
            # exec() call we suddenly have two Python processes running the
            # current code; one waiting for the exit status and one unrolling
            # an exception. The latter will incorrectly "steal" the
            # finalization of with statements. So - in the case of an exception
            # before the exec() call we call the hard exit: os._exit(1).
            try:
                self.execJob(job)
            except Exception as e:
                sys.stderr.write("Failed to exec:%s error:%s\n" %
                                 (job["name"], str(e)))
                os._exit(1)
        else:
            _, exit_status = os.waitpid(pid, 0)
            # The exit_status returned from os.waitpid encodes both the
            # exit status of the external application, and in case the job
            # was killed by a signal - the number of that signal.
            exit_status = os.WEXITSTATUS(exit_status)

        status.end_time = dt.now()

        if exit_status != 0:
            err_msg = "Executable: %s failed with exit code: %s" % (
                job.get('executable'), exit_status)
            status.status = "Failure"
            status.error = err_msg
        else:
            status.status = "Success"

        self.job_status.dump()
        return exit_status, err_msg

    @staticmethod
    def mountPoint(path):
        """Calls `mount`, finds the line corresponding to the given path and
        returns the (file_server, addr) part."""
        mount_stdout = subprocess.check_output(["mount"]).decode().strip().split('\n')
        for line in mount_stdout:
            tmp = line.split()
            if tmp[2] == path:
                # was '(rw,...,addr=...)' is now 'rw,...,addr=...'
                cnt = tmp[5][1:-1]
                d = dict([
                    tuple(x.split('=')) if '=' in x else (x, True)
                    for x in cnt.split(',')
                ])
                if 'addr' in d:
                    isilon_node = d['addr']
                elif 'mountaddr' in d:
                    isilon_node = d['mountaddr']

                server_tmp = tmp[0].split(":")
                if len(server_tmp) == 1:
                    file_server = "local"
                else:
                    file_server = server_tmp[0]

                return (file_server, isilon_node)
        return ('?', '?.?.?.?')

    # This file will be read by the job_queue_node_fscanf_EXIT() function
    # in job_queue.c. Be very careful with changes in output format.
    def dump_EXIT_file(self, job, error_msg):
        now = time.localtime()
        with open(self.EXIT_file, "a") as fileH:
            fileH.write("<error>\n")
            fileH.write(" <time>%02d:%02d:%02d</time>\n" %
                        (now.tm_hour, now.tm_min, now.tm_sec))
            fileH.write(" <job>%s</job>\n" % job["name"])
            fileH.write(" <reason>%s</reason>\n" % error_msg)
            stderr_file = None
            if job["stderr"]:
                if os.path.exists(job["stderr"]):
                    with open(job["stderr"], "r") as errH:
                        stderr = errH.read()
                    if stderr:
                        stderr_file = os.path.join(os.getcwd(), job["stderr"])
                    else:
                        stderr = "<Not written by:%s>\n" % job["name"]
                else:
                    stderr = "<stderr: Could not find file:%s>\n" % job["stderr"]
            else:
                stderr = "<stderr: Not redirected>\n"

            fileH.write(" <stderr>\n%s</stderr>\n" % stderr)
            if stderr_file:
                fileH.write(" <stderr_file>%s</stderr_file>\n" % stderr_file)
            fileH.write("</error>\n")

        # The exit file has been renamed from "EXIT" to "ERROR"; the old
        # "EXIT" file must be kept around until all old ert versions are
        # flushed out.
        shutil.copyfile(self.EXIT_file, "EXIT")
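    # The resulting ERROR file (also copied to "EXIT") has this shape;
    # the values are illustrative:
    #
    #   <error>
    #    <time>12:00:03</time>
    #    <job>COPY_FILE</job>
    #    <reason>Executable: /usr/bin/cp failed with exit code: 1</reason>
    #    <stderr>
    #   cp: cannot stat 'input.txt': No such file or directory
    #   </stderr>
    #    <stderr_file>/scratch/run-0/copy_file.stderr.0</stderr_file>
    #   </error>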