Example #1
    def __init__(self, module_file="jobs.py", json_file="jobs.json", error_url=None, log_url=None):
        self._job_map = {}
        self.simulation_id = ""
        self.ert_pid = ""
        self._log_url = log_url
        if log_url is None:
            self._log_url = error_url
        self._data_root = None
        self.global_environment = None
        self.global_update_path = None
        self.start_time = dt.now()
        if json_file is not None and os.path.isfile(json_file):
            self.job_status = ForwardModelStatus("????", self.start_time)
            self._loadJson(json_file)
            self.job_status.run_id = self.simulation_id
        else:
            raise IOError("'jobs.json' not found.")

        self.max_runtime = 0  # This option is currently sleeping
        self.short_sleep = 2  # Sleep between status checks
        self.node = socket.gethostname()
        pw_entry = pwd.getpwuid(os.getuid())
        self.user = pw_entry.pw_name
        os_info = _read_os_release()
        _, _, release, _, _ = os.uname()
        python_vs = sys_version.split('\n')[0]
        ecl_v = EclVersion()
        res_v = ResVersion()
        logged_fields = {"status": "init",
                         "python_sys_path": list(map(pad_nonexisting, sys.path)),
                         "pythonpath": list(map(pad_nonexisting, os.environ.get('PYTHONPATH', '').split(':'))),
                         "res_version": res_v.versionString(),
                         "ecl_version": ecl_v.versionString(),
                         "LSB_ID": os_info.get('LSB_ID', ''),
                         "LSB_VERSION_ID": os_info.get('LSB_VERSION_ID', ''),
                         "python_version": python_vs,
                         "kernel_version": release,
                         }
        logged_fields["jobs"] = list(self._job_map.values())
        self.postMessage(extra_fields=logged_fields)
        cond_unlink("EXIT")
        cond_unlink(self.EXIT_file)
        cond_unlink(self.STATUS_file)
        cond_unlink(self.OK_file)
        self.initStatusFile()
        if self._data_root:
            os.environ["DATA_ROOT"] = self._data_root
        self.set_environment()
        self.update_path()
        self.information = logged_fields
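
The constructor above relies on two helpers that are not part of this listing, pad_nonexisting and _read_os_release. Their real implementations are not shown here; a minimal sketch consistent with how they are used (flagging sys.path entries that no longer exist, and supplying the LSB_ID / LSB_VERSION_ID keys) might look like this:

import os

def pad_nonexisting(path, pad="-- "):
    # Assumed helper: flag entries that do not exist on disk so they
    # stand out in the logged sys.path / PYTHONPATH lists.
    return path if os.path.exists(path) else pad + path

def _read_os_release(pfx="LSB_"):
    # Assumed helper: parse KEY=value lines from /etc/os-release into a
    # dict, prefixing each key so that e.g. ID becomes LSB_ID.
    props = {}
    fname = "/etc/os-release"
    if not os.path.isfile(fname):
        return props
    with open(fname) as f:
        for line in f:
            if "=" in line:
                key, value = line.strip().split("=", 1)
                props[pfx + key] = value.strip().strip('"')
    return props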
Example #2
    def job_progress(self, iens):
        """Will return a detailed progress of the job.

        The progress report is obtained by reading a file from the filesystem,
        that file is typically created by another process running on another
        machine, so reading might fail due to NFS issues, simultaneous
        writes and so on. If loading valid JSON fails, the function will
        sleep 0.10 seconds and retry, eventually giving up and returning
        None. The method will also return None for jobs which have not
        yet started.

        When the method succeeds in reading the progress file from the file
        system, the return value will be an object with properties like this:

           progress.start_time
           progress.end_time
           progress.run_id
           progress.jobs = [ (job1.name, job1.start_time, job1.end_time, job1.status, job1.error_msg),
                             (job2.name, job2.start_time, job2.end_time, job2.status, job2.error_msg),
                             ....
                             (jobN.name, jobN.start_time, jobN.end_time, jobN.status, jobN.error_msg) ]

        """
        if iens not in self._run_args:
            raise KeyError("No such simulation: %s" % iens)

        run_arg = self._run_args[iens]
        try:
            # will throw if not yet submitted (is in a limbo state)
            queue_index = run_arg.getQueueIndex()
        except ValueError:
            return None
        if self._queue_manager.isJobWaiting(queue_index):
            return None

        return ForwardModelStatus.load(run_arg.runpath)
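
Since job_progress returns None both for unstarted jobs and for unreadable status files, callers should guard for it. A hypothetical usage sketch (the simulation_context object and realization index 0 are illustrative assumptions):

# Poll one realization and print a line per forward model job.
progress = simulation_context.job_progress(0)
if progress is None:
    print("realization 0: not started yet (or status file unreadable)")
else:
    print("run_id:", progress.run_id)
    for job in progress.jobs:
        print(job.name, job.status, job.start_time, job.end_time, job.error_msg)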
Example #3
    def update_progress_for_index(self, iteration, idx, run_arg):
        if not self._run_context.is_active(idx):
            return
        try:
            # will throw if not yet submitted (is in a limbo state)
            queue_index = run_arg.getQueueIndex()
        except ValueError:
            return

        status = None
        if self._job_queue:
            status = self._job_queue.getJobStatus(queue_index)

        # Avoids reading from disk for jobs in these states since there's no
        # data anyway
        if status in [
                JobStatusType.JOB_QUEUE_PENDING,
                JobStatusType.JOB_QUEUE_SUBMITTED,
                JobStatusType.JOB_QUEUE_WAITING,
        ]:
            return

        fms = self.realization_progress[iteration].get(run_arg.iens, None)

        # Don't reload from file if the forward model is already finished
        if fms and BaseRunModel.is_forward_model_finished(fms[0]):
            jobs = self.realization_progress[iteration][run_arg.iens][0]
        else:
            fms = ForwardModelStatus.load(run_arg.runpath, num_retry=1)
            if not fms:
                return

            jobs = fms.jobs
        self.realization_progress[iteration][run_arg.iens] = jobs, status
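
The caching above hinges on BaseRunModel.is_forward_model_finished, which is not part of this listing. A plausible sketch of that method, assuming "finished" means every job in the status list reports "Success" (the status strings used elsewhere in these examples):

    @staticmethod
    def is_forward_model_finished(progress):
        # Assumed semantics: the forward model is done once every job in
        # the status list reports "Success".
        return all(job.status == "Success" for job in progress)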
Example #4
    def updateDetailedProgress(self):
        if not self._run_context:
            return

        iteration = self._run_context.get_iter()
        if iteration not in self.realization_progress:
            self.realization_progress[iteration] = {}
        for run_arg in self._run_context:
            if not run_arg:
                continue
            try:
                # will throw if not yet submitted (is in a limbo state)
                queue_index = run_arg.getQueueIndex()
            except ValueError:
                continue

            fms = self.realization_progress[iteration].get(run_arg.iens, None)
            if fms and BaseRunModel.is_forward_model_finished(fms):
                continue

            fms = ForwardModelStatus.load(run_arg.runpath, num_retry=1)

            if not fms:
                continue

            jobs = fms.jobs
            self.realization_progress[iteration][run_arg.iens] = jobs
Example #5
    def test_complete(self):
        with TestAreaContext("json_from_forward_model_NO_DATA_ROOT"):
            with open("jobs.json", "w") as f:
                f.write(JSON_STRING_NO_DATA_ROOT)

            jobm = JobManager()
            jobm.complete()
            st = ForwardModelStatus.load(os.getcwd())
            self.assertTrue(isinstance(st.end_time, datetime.datetime))
            self.assertTrue(isinstance(st.start_time, datetime.datetime))
            self.assertTrue(st.end_time >= st.start_time)
            dt = datetime.datetime.now() - st.start_time
            self.assertTrue(dt.total_seconds() < 5)
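
The JSON_STRING_NO_DATA_ROOT constant is defined outside this listing. Based on the keys _loadJson() consumes (see Example #8), a minimal stand-in would only need "umask" and "jobList"; everything else is optional. A hypothetical version:

# Hypothetical stand-in: the smallest jobs.json that _loadJson() accepts,
# with DATA_ROOT deliberately omitted.
JSON_STRING_NO_DATA_ROOT = """
{
  "umask": "0002",
  "run_id": "test_run_id",
  "jobList": []
}
"""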
Example #6
    def update_progress_for_index(self, iteration: int, idx,
                                  run_arg: RunArg) -> None:
        try:
            # will throw if not yet submitted (is in a limbo state)
            queue_index = run_arg.getQueueIndex()
        except (ValueError, AttributeError):
            return

        status = None
        timed_out = False
        if self._job_queue:
            status = self._job_queue.getJobStatus(queue_index)
            timed_out = self._job_queue.did_job_time_out(queue_index)

        # Avoids reading from disk for jobs in these states since there's no
        # data anyway. If timed out, never exit here as that would prevent
        # propagation of the failure status.
        if (status in [
                JobStatusType.JOB_QUEUE_PENDING,
                JobStatusType.JOB_QUEUE_SUBMITTED,
                JobStatusType.JOB_QUEUE_WAITING,
        ] and not timed_out):
            return

        fms = self.realization_progress[iteration].get(run_arg.iens, None)
        jobs = fms[0] if fms else None

        # Don't reload from file if the forward model is already finished
        if not fms or not BaseRunModel.is_forward_model_finished(fms[0]):
            loaded = ForwardModelStatus.load(run_arg.runpath, num_retry=1)
            if not loaded and not timed_out:
                # If this idx timed out, returning here would prevent
                # non-successful jobs from being marked as failed (timed
                # out), so return only in the case where it did not time out.
                return

            if loaded:
                jobs = loaded.jobs

        # jobs may still be None if neither the cache nor the file had data.
        if timed_out and jobs:
            for job in jobs:
                if job.status != "Success":
                    job.error = "The run is cancelled due to reaching MAX_RUNTIME"
                    job.status = "Failure"
        self.realization_progress[iteration][run_arg.iens] = jobs, status
Example #7
    def updateDetailedProgress(self):
        if not self._run_context:
            return

        iteration = self._run_context.get_iter()
        if iteration not in self.realization_progress:
            self.realization_progress[iteration] = {}
        for idx, run_arg in enumerate(self._run_context):
            if not self._run_context.is_active(idx):
                continue
            try:
                # will throw if not yet submitted (is in a limbo state)
                queue_index = run_arg.getQueueIndex()
            except ValueError:
                continue

            status = None
            if self._job_queue:
                status = self._job_queue.getJobStatus(queue_index)

            if status in [
                    JobStatusType.JOB_QUEUE_PENDING,
                    JobStatusType.JOB_QUEUE_SUBMITTED,
                    JobStatusType.JOB_QUEUE_WAITING,
            ]:
                continue

            fms = self.realization_progress[iteration].get(run_arg.iens, None)

            # Don't reload from file if the forward model is already finished
            if fms and BaseRunModel.is_forward_model_finished(fms[0]):
                jobs = self.realization_progress[iteration][run_arg.iens][0]
            else:
                fms = ForwardModelStatus.load(run_arg.runpath, num_retry=1)
                if not fms:
                    continue

                jobs = fms.jobs
            self.realization_progress[iteration][run_arg.iens] = jobs, status
Example #8
class JobManager(object):
    LOG_file = "JOB_LOG"
    EXIT_file = "ERROR"
    STATUS_file = "STATUS"
    OK_file = "OK"

    DEFAULT_UMASK = 0
    sleep_time = 10  # Time to sleep before exiting the script - to let the disks sync up.

    def __init__(self,
                 module_file="jobs.py",
                 json_file="jobs.json",
                 error_url=None,
                 log_url=None):
        self._job_map = {}
        self.simulation_id = ""
        self.ert_pid = ""
        self._log_url = log_url
        if log_url is None:
            self._log_url = error_url
        self._data_root = None
        self.global_environment = None
        self.global_update_path = None
        self.start_time = dt.now()
        if json_file is not None and os.path.isfile(json_file):
            self.job_status = ForwardModelStatus("????", self.start_time)
            self._loadJson(json_file)
            self.job_status.run_id = self.simulation_id
        else:
            raise IOError("'jobs.json' not found.")

        self.max_runtime = 0  # This option is currently sleeping
        self.short_sleep = 2  # Sleep between status checks
        self.node = socket.gethostname()
        pw_entry = pwd.getpwuid(os.getuid())
        self.user = pw_entry.pw_name
        os_info = _read_os_release()
        _, _, release, _, _ = os.uname()
        python_vs = sys_version.split('\n')[0]
        ecl_v = EclVersion()
        res_v = ResVersion()
        logged_fields = {
            "status": "init",
            "python_sys_path": list(map(pad_nonexisting, sys.path)),
            "pythonpath": list(map(pad_nonexisting,
                                   os.environ.get('PYTHONPATH', '').split(':'))),
            "res_version": res_v.versionString(),
            "ecl_version": ecl_v.versionString(),
            "LSB_ID": os_info.get('LSB_ID', ''),
            "LSB_VERSION_ID": os_info.get('LSB_VERSION_ID', ''),
            "python_version": python_vs,
            "kernel_version": release,
        }
        logged_fields["jobs"] = list(self._job_map.values())
        self.postMessage(extra_fields=logged_fields)
        cond_unlink("EXIT")
        cond_unlink(self.EXIT_file)
        cond_unlink(self.STATUS_file)
        cond_unlink(self.OK_file)
        self.initStatusFile()
        if self._data_root:
            os.environ["DATA_ROOT"] = self._data_root
        self.set_environment()
        self.update_path()
        self.information = logged_fields

    def complete(self):
        self.job_status.complete()

    def dump_status(self):
        self.job_status.dump(ForwardModelStatus.STATUS_FILE)

    def set_environment(self):
        if self.global_environment:
            for key, value in self.global_environment.items():
                os.environ[key] = value

    def update_path(self):
        if self.global_update_path:
            for key, value in self.global_update_path.items():
                if os.environ.get(key):
                    os.environ[key] = value + ':' + os.environ[key]
                else:
                    os.environ[key] = value

    def data_root(self):
        return self._data_root

    def _loadJson(self, json_file_name):
        try:
            with open(json_file_name, "r") as json_file:
                jobs_data = json.load(json_file)
        except ValueError as e:
            raise IOError("Job Manager failed to load JSON-file." + str(e))

        self._data_root = jobs_data.get("DATA_ROOT")
        umask = _jsonGet(jobs_data, "umask")
        os.umask(int(umask, 8))
        if "run_id" in jobs_data:
            self.simulation_id = _jsonGet(jobs_data, "run_id")
            os.environ["ERT_RUN_ID"] = self.simulation_id
        if "ert_pid" in jobs_data:
            self.ert_pid = _jsonGet(jobs_data, "ert_pid")
        if "global_environment" in jobs_data:
            self.global_environment = _jsonGet(jobs_data, "global_environment")
        if "global_update_path" in jobs_data:
            self.global_update_path = _jsonGet(jobs_data, "global_update_path")
        self.job_list = _jsonGet(jobs_data, "jobList")
        self._ensureCompatibleJobList()
        self._buildJobMap()

        for job in self.job_list:
            self.job_status.add_job(ForwardModelJobStatus(job.get("name")))

        # "Monkey-patching" the job object by attaching a status object.
        status_list = self.job_status.jobs
        for i in range(len(self.job_list)):
            self.job_list[i]["status"] = status_list[i]

    # To ensure compatibility with old versions.
    def _ensureCompatibleJobList(self):
        for job in self.job_list:
            if not "max_running_minutes" in job.keys():
                job["max_running_minutes"] = None

    def _buildJobMap(self):
        self._job_map = {}
        for index, job in enumerate(self.job_list):
            self._job_map[job["name"]] = job
            if job.get("stderr"):
                job["stderr"] = "%s.%d" % (job["stderr"], index)

            if job.get("stdout"):
                job["stdout"] = "%s.%d" % (job["stdout"], index)

    def __contains__(self, key):
        return key in self._job_map

    def __len__(self):
        return len(self.job_list)

    def __repr__(self):
        st = self.start_time
        node = self.node
        us = self.user
        cnt = 'len=%d, start=%s, node=%s, user=%s'
        cnt = cnt % (len(self), st, node, us)
        return 'JobManager(%s)' % cnt

    def __getitem__(self, index):
        if isinstance(index, int):
            return self.job_list[index]
        else:
            return self._job_map[index]

    def initStatusFile(self):
        with open(self.STATUS_file, "a") as f:
            f.write("%-32s: %s/%s\n" %
                    ("Current host", self.node, os.uname()[4]))
            if "LSF_JOBID" in os.environ:
                f.write("LSF JOBID: %s\n" % os.environ.get("LSF_JOBID"))
            else:
                f.write("LSF JOBID: not running LSF\n")

    def startStatus(self, job):
        with open(self.STATUS_file, "a") as f:
            now = time.localtime()
            f.write("%-32s: %02d:%02d:%02d .... " %
                    (job["name"], now.tm_hour, now.tm_min, now.tm_sec))

    def completeStatus(self, exit_status, error_msg, job=None):
        now = time.localtime()
        extra_fields = {
            "finished": True,
            "exit_status": exit_status,
            "status": "completeStatus"
        }
        with open(self.STATUS_file, "a") as f:
            if exit_status == 0:
                status = ""
            else:
                status = " EXIT: %d/%s" % (exit_status, error_msg)
                extra_fields.update({"error_msg": error_msg})

            f.write("%02d:%02d:%02d  %s\n" %
                    (now.tm_hour, now.tm_min, now.tm_sec, status))

    def createOKFile(self):
        now = time.localtime()
        with open(self.OK_file, "w") as f:
            f.write("All jobs complete %02d:%02d:%02d \n" %
                    (now.tm_hour, now.tm_min, now.tm_sec))
        self.postMessage(extra_fields={"status": "OK"})
        time.sleep(self.sleep_time)  # Let the disks sync up

    def getStartTime(self):
        return self.start_time

    def getRuntime(self):
        rt = dt.now() - self.start_time
        return rt.total_seconds()

    def assertArgList(self, job):
        if "arg_types" in job:
            argTypes = job["arg_types"]
            argList = job.get("argList")
            for index, arg_type in enumerate(argTypes):
                if (arg_type == "RUNTIME_FILE"):
                    file_path = os.path.join(os.getcwd(), argList[index])
                    if not os.path.isfile(file_path):
                        raise TypeError(
                            "In job \"%s\": RUNTIME_FILE \"%s\" does not exist."
                            % (job["name"], argList[index]))
                if (arg_type == "RUNTIME_INT"):
                    try:
                        int(argList[index])
                    except ValueError:
                        raise ValueError(
                            "In job \"%s\": argument with index %d is of incorrect type, should be integer."
                            % (job["name"], index))

    def execJob(self, job):
        executable = job.get('executable')
        assert_file_executable(executable)

        start_time = time.time()
        if job.get("stdin"):
            redirect_input(job["stdin"], 0)

        if job.get("stdout"):
            redirect_output(job["stdout"], 1, start_time)

        if job.get("stderr"):
            redirect_output(job["stderr"], 2, start_time)

        if job.get("environment"):
            env = job["environment"]
            for key in env.keys():
                os.putenv(key, env[key])

        self.assertArgList(job)

        argList = [executable]
        if job.get("argList"):
            argList += job["argList"]

        os.execvp(executable, argList)

    def jobProcess(self, job):
        executable = job.get('executable')
        assert_file_executable(executable)

        argList = [executable]
        if job.get("argList"):
            argList += job["argList"]

        if job.get("stdin"):
            stdin = open(job.get("stdin"))
        else:
            stdin = None

        if job.get("stderr"):
            stderr = open(job.get("stderr"), "w")
        else:
            stderr = None

        if job.get("stdout"):
            stdout = open(job.get("stdout"), "w")
        else:
            stdout = None

        P = subprocess.Popen(argList,
                             stdin=stdin,
                             stdout=stdout,
                             stderr=stderr,
                             env=job.get("environment"))

        return P

    def postMessage(self, job=None, extra_fields=None, url=None):
        # Avoid a shared mutable default argument.
        if extra_fields is None:
            extra_fields = {}
        if url is None:
            url = self._log_url
        if job:
            job_fields = {
                "ert_job": job["name"],
                "executable": job["executable"],
                "arg_list": " ".join(job["argList"])
            }
            job_fields.update(extra_fields)
            extra_fields = job_fields

        payload = {
            "user": self.user,
            "cwd": os.getcwd(),
            "application": "ert",
            "subsystem": "ert_forward_model",
            "node": self.node,
            "start_time": self.start_time.isoformat(),
            "node_timestamp": dt.now().isoformat(),
            "simulation_id": self.simulation_id,
            "ert_pid": self.ert_pid
        }
        payload.update(extra_fields)
        try:
            if url is None:
                sys.stderr.write(
                    '\nWARNING: LOG/ERROR URL NOT CONFIGURED.\n\n')
                sys.stderr.write(json.dumps(payload))
                sys.stderr.write('\nAbove error log NOT submitted.')
                sys.stderr.flush()
            else:
                data = json.dumps(payload)
                # Disable proxies
                proxies = {
                    "http": None,
                    "https": None,
                }
                res = requests.post(
                    url,
                    timeout=3,
                    headers={"Content-Type": "application/json"},
                    data=data,
                    proxies=proxies)
                # sys.stdout.write("Response status %s\n"%res.status_code)
                # sys.stdout.write("Request url %s\n"%res.url)
                # sys.stdout.write("Response headers %s\n"%res.headers)
                # sys.stdout.write("Response content %s\n"%res.content)
                # sys.stdout.write("Writing payload: %s\n"%payload)
                # sys.stdout.write("Writing data: %s\n"%data)
        except Exception:
            # Logging must never break the forward model itself.
            pass

    def postError(self, job, error_msg):
        extra_fields = self.extract_stderr_stdout(job)
        extra_fields.update({"status": "error", "finished": True})
        self.postMessage(job, extra_fields, url=self._log_url)

    def extract_stderr_stdout(self, job):
        extra_fields = {}
        if job.get("stderr"):
            if os.path.exists(job["stderr"]):
                with open(job["stderr"], "r") as errH:
                    stderr = errH.read()
                    extra_fields.update({"stderr": stderr})
        if job.get("stdout"):
            if os.path.exists(job["stdout"]):
                with open(job["stdout"], "r") as outH:
                    stdout = outH.read()
                    extra_fields.update({"stdout": stdout})
        return extra_fields

    def exit(self, job, exit_status, error_msg):
        self.dump_EXIT_file(job, error_msg)
        std_err_out = self.extract_stderr_stdout(job)
        std_err_out.update({
            "status": "exit",
            "finished": True,
            "error_msg": error_msg,
            "exit_status": exit_status,
            "error": True
        })
        self.postMessage(job=job,
                         extra_fields=std_err_out)  # Posts to new logstash
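        # Kill the entire process group (not just this process) so that no
        # child processes survive the hard exit.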
        pgid = os.getpgid(os.getpid())
        os.killpg(pgid, signal.SIGKILL)

    def addLogLine(self, job):
        now = time.localtime()
        with open(self.LOG_file, "a") as f:
            args = " ".join(job["argList"])
            f.write("%02d:%02d:%02d  Calling: %s %s\n" %
                    (now.tm_hour, now.tm_min, now.tm_sec,
                     job.get('executable'), args))

    def runJob(self, job):
        assert_file_executable(job.get('executable'))
        self.addLogLine(job)

        status = job["status"]
        status.start_time = dt.now()
        status.status = "Running"
        self.job_status.dump()

        exec_env = job.get("exec_env")
        if exec_env:
            with open("%s_exec_env.json" % job.get("name"), "w") as f:
                f.write(json.dumps(exec_env))

        pid = os.fork()
        exit_status, err_msg = 0, ''
        if pid == 0:
            # This code block should exec into the actual executable we are
            # running, and execution should not come back here. However - if
            # the code fails with an exception before actually reaching the
            # exec() call we suddenly have two Python processes running the
            # current code; one waiting for the exit status and one unrolling
            # an exception. The latter will incorrectly "steal" the
            # finalization of with statements. So - in the case of an exception
            # before the exec() call we call the hard exit: os._exit(1).
            try:
                self.execJob(job)
            except Exception as e:
                sys.stderr.write("Failed to exec:%s error:%s\n" %
                                 (job["name"], str(e)))
                os._exit(1)
        else:
            _, exit_status = os.waitpid(pid, 0)
            # The exit_status returned from os.waitpid encodes
            # both the exit status of the external application,
            # and in case the job was killed by a signal - the
            # number of that signal.
            exit_status = os.WEXITSTATUS(exit_status)

        status.end_time = dt.now()

        if exit_status != 0:
            err_msg = "Executable: %s failed with exit code: %s" % (
                job.get('executable'), exit_status)

            status.status = "Failure"
            status.error = err_msg
        else:
            status.status = "Success"

        self.job_status.dump()
        return exit_status, err_msg

    @staticmethod
    def mountPoint(path):
        """Calls `mount`, finds line corresponding to given path, returns addr part."""
        mount_stdout = subprocess.check_output(["mount"]).decode().strip().split('\n')
        for line in mount_stdout:
            tmp = line.split()
            if tmp[2] == path:
                # tmp[5] was '(rw,...,addr=...)'; strip the parentheses.
                cnt = tmp[5][1:-1]
                d = dict([
                    tuple(x.split('=', 1)) if '=' in x else (x, True)
                    for x in cnt.split(',')
                ])
                if 'addr' in d:
                    isilon_node = d['addr']
                elif 'mountaddr' in d:
                    isilon_node = d['mountaddr']
                else:
                    isilon_node = '?.?.?.?'

                server_tmp = tmp[0].split(":")
                if len(server_tmp) == 1:
                    file_server = "local"
                else:
                    file_server = server_tmp[0]

                return (file_server, isilon_node)

        return ('?', '?.?.?.?')

    # This file will be read by the job_queue_node_fscanf_EXIT() function
    # in job_queue.c. Be very careful with changes in output format.
    def dump_EXIT_file(self, job, error_msg):
        now = time.localtime()
        with open(self.EXIT_file, "a") as fileH:
            fileH.write("<error>\n")
            fileH.write("  <time>%02d:%02d:%02d</time>\n" %
                        (now.tm_hour, now.tm_min, now.tm_sec))
            fileH.write("  <job>%s</job>\n" % job["name"])
            fileH.write("  <reason>%s</reason>\n" % error_msg)
            stderr_file = None
            if job["stderr"]:
                if os.path.exists(job["stderr"]):
                    with open(job["stderr"], "r") as errH:
                        stderr = errH.read()
                    if stderr:
                        stderr_file = os.path.join(os.getcwd(), job["stderr"])
                    else:
                        stderr = "<Not written by:%s>\n" % job["name"]
                else:
                    stderr = "<stderr: Could not find file:%s>\n" % job["stderr"]
            else:
                stderr = "<stderr: Not redirected>\n"

            fileH.write("  <stderr>\n%s</stderr>\n" % stderr)
            if stderr_file:
                fileH.write("  <stderr_file>%s</stderr_file>\n" % stderr_file)

            fileH.write("</error>\n")

        # Have renamed the exit file from "EXIT" to "ERROR";
        # must keep the old "EXIT" file around until all old ert versions
        # are flushed out.
        shutil.copyfile(self.EXIT_file, "EXIT")