Example #1
    def update(self):
        # What's done is done
        if self.status == 'JOB_TERMINATED':
            return

        # Updating works by opening the log file,
        # looping through it, and keeping only the last event,
        # which tells us the job's current state.
        # This is not very efficient, so an alternative
        # implementation would be welcome.
        jel = htcondor.JobEventLog(self._log)

        first = None
        try:
            for event in jel.events(stop_after=0):
                if not first:
                    first = event
                latest = event
            try:
                self.code = latest["ReturnValue"]
            except KeyError:
                self.code = "-"
            self.status = str(htcondor.JobEventType.values[latest.type])
            self.cluster = latest.cluster
            self.runtime = latest.timestamp - first.timestamp
        except OSError:
            self.code = "-"
            self.status = "NOPARSE"
            self.cluster = "-"
            self.runtime = -1
        finally:
            jel.close()
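
The comment in Example #1 asks for a more efficient alternative. The sketch below, assuming nothing beyond the htcondor bindings themselves, keeps a single JobEventLog reader open (as Examples #4, #11 and #14 do) so each update only consumes events appended since the previous call instead of re-reading the whole log; the class and attribute names are illustrative, not taken from the original code.

import htcondor

class IncrementalJobStatus:
    """Minimal sketch of a persistent-reader alternative to Example #1."""

    def __init__(self, log_path):
        # stop_after=0 makes the iterator return as soon as no new events are available
        self._events = htcondor.JobEventLog(log_path).events(stop_after=0)
        self.status = None
        self.code = "-"
        self.cluster = "-"
        self.runtime = -1
        self._first_timestamp = None

    def update(self):
        # only events written since the previous call are yielded here
        for event in self._events:
            if self._first_timestamp is None:
                self._first_timestamp = event.timestamp
            self.status = str(htcondor.JobEventType.values[event.type])
            self.cluster = event.cluster
            self.runtime = event.timestamp - self._first_timestamp
            self.code = event.get("ReturnValue", "-")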
Example #2
    def __init__(
        self,
        function,
        input_file,
        working_dir = None
    ):
        self.function = function
        self.input_file = Path(input_file)

        if working_dir is None:
            working_dir = Path.cwd()

        self.uid = uuid.uuid4()

        # ensure we have a Path even if a plain string was passed in
        self.working_dir = Path(working_dir)
        self.working_dir.mkdir(parents = True, exist_ok = True)
        self._event_log_path = self.working_dir / f'{self.uid}.log'
        self._event_log_path.touch(exist_ok = True)
        self._events = htcondor.JobEventLog(self._event_log_path.as_posix()).events(0)

        self._output_file_path = self.working_dir / f'{self.uid}.output'

        self._state = TaskState.Unsubmitted
        self._jobid = None

        TASKS.add(self)
Example #3
    def update(self):
        if self.status == 'JOB_TERMINATED':
            return
        
        # For updating, open the log file
        # and look at the status of the
        # last event.
        jel = htcondor.JobEventLog(self._log)

        first = None
        try:
            for event in jel.events(stop_after=0):
                if not first:
                    first = event
                latest = event
            try:
                self.code = latest['ReturnValue']
            except KeyError:
                self.code = '-'
            self.status = str(htcondor.JobEventType.values[latest.type])
            self.cluster = latest.cluster
            self.runtime = latest.timestamp - first.timestamp

        except OSError:
            self.code = '-'
            self.status = 'NOPARSE'
            self.cluster = '-'
            self.runtime = -1

        finally:
            jel.close()
Example #4
    def _update(self):
        logger.debug(f"triggered status update for handle {self._handle}")

        if self._events is None:
            logger.debug(
                f"looking for event log for handle {self._handle} at {self._event_log_path}"
            )
            self._events = htcondor.JobEventLog(
                self._event_log_path.as_posix()).events(0)
            logger.debug(
                f"initialized event log reader for handle {self._handle}, targeting {self._event_log_path}"
            )

        for event in self._events:
            if event.cluster != self._clusterid:
                continue

            new_status = JOB_EVENT_STATUS_TRANSITIONS.get(event.type, None)
            if new_status is not None:
                key = event.proc - self._offset

                # update counts
                old_status = self._data[key]
                self._counts[old_status] -= 1
                self._counts[new_status] += 1

                # set new status on individual job
                self._data[key] = new_status

        logger.debug(f"new status counts for {self._handle}: {self._counts}")
Example #5
    def wait_log(self, ulog):
        """ Wait for a job to finish """

        data = {}
        if StrictVersion(VERSION) < StrictVersion('8.7.10'):

            fp = open(ulog)
            events = htcondor.read_events(fp)
            while True:
                try:
                    r = next(events)
                except StopIteration:
                    log.debug("No Event but stopiter")
                    time.sleep(2.2)
                else:
                    self.process_event(r, data)
                    log.debug(data)
                    if self._is_terminal(data):
                        break
        else:
            for r in htcondor.JobEventLog(ulog).events(None):
                self.process_event(r, data)
                log.debug(data)
                if self._is_terminal(data):
                    break
        log.debug("all jobs terminal")
Example #6
    def test_correct_events_read(self, logfile):
        count = 0
        jel = htcondor.JobEventLog(logfile)
        for event in jel.events(stop_after=0):
            assert (compareEvent(event, count))
            count += 1

        assert (count == 39)
Example #7
    def test_submit_success(self, test_dir, submit_success):
        assert submit_success.stderr == "Job 1 was submitted."
        jel = htcondor.JobEventLog((test_dir / "helloworld.log").as_posix())
        # Wait for the job to finish by watching its event log
        for event in jel.events(stop_after=None):
            if event.type == htcondor.JobEventType.JOB_TERMINATED:
                break
        assert Path(test_dir / "helloworld.out").read_text() == "Hello, World!\n"
Example #8
    def test_enter_and_exit(self, logfile):
        with htcondor.JobEventLog(logfile) as jel:
            for i in range(0, 30):
                event = next(jel)
        try:
            event = next(jel)
            assert False
        except StopIteration:
            pass
Example #9
    def test_close(self, logfile):
        with htcondor.JobEventLog(logfile) as jel:
            e = next(jel)
            jel.close()
            try:
                e = next(jel)
                assert False
            except StopIteration:
                pass
Example #10
def write_and_read_back_event(event):
    with open('test_toe_exit_info.event', mode="w") as f:
        f.write(event)
        f.write('...\n')

    jel = htcondor.JobEventLog('test_toe_exit_info.event')
    os.unlink('test_toe_exit_info.event')
    for e in jel.events(stop_after=0):
        return (str(e))
Example #11
    def read_events(self):
        if self._event_reader is None:
            self._event_reader = htcondor.JobEventLog(
                self._event_log_path.as_posix()).events(0)

        for event in self._event_reader:
            if event.cluster != self._clusterid:
                continue

            self.events.append(event)
            yield event
Example #12
    def watch_events(self) -> None:
        if self.events is None:
            self.events = htcondor.JobEventLog(self.event_log.as_posix())

        for event in self.events:
            text = str(event).rstrip()
            click.secho(text,
                        err=True,
                        fg=JOB_EVENT_TO_COLOR.get(event.type, "white"))
            if event.type in BREAK_ON_JOB_EVENTS:
                break
Example #13
def equal_priority_execute_events(submit_equal_priority_jobs):
    """
    Simple approach to retrieving execute events. Open the job event log,
    iterate over the events in order and add all execute events to a list.
    """
    jel = htcondor.JobEventLog("scheduler_priority-equal.log")
    execute_events = []
    for event in jel.events(0):
        if event.type == htcondor.JobEventType.EXECUTE:
            execute_events.append(event)
    return execute_events
Example #14
    def read_events(self) -> Iterator[htcondor.JobEvent]:
        """Yield all un-read events in the event log."""
        if self._event_reader is None:
            self._event_reader = htcondor.JobEventLog(
                self._event_log_path.as_posix()).events(0)

        for event in self._event_reader:
            if event.cluster != self._clusterid:
                continue

            self.events.append(event)
            yield event
Example #15
    def watch_events(self) -> None:
        if self.events is None:
            self.events = htcondor.JobEventLog(self.event_log.as_posix())

        for event in self.events:
            text = str(event).rstrip()
            if event.type in (htcondor.JobEventType.JOB_HELD,
                              htcondor.JobEventType.JOB_TERMINATED):
                click.secho(text, err=True, fg="red")
            elif event.type is htcondor.JobEventType.JOB_ABORTED:
                click.secho(text, err=True, fg="white")
                break
            else:
                click.secho(text, err=True, fg="white")
Example #16
def parseCondorLog(cacheDoc):
    """
    do all real work and update checkpoints, nodes and nodemap dictionaries
    takes as input a cacheDoc dictionary with keys
      jobLogCheckpoint, fjrParseResCheckpoint, nodes, nodeMap
    and returns the same dictionary with updated information
    """

    jobLogCheckpoint = cacheDoc['jobLogCheckpoint']
    fjrParseResCheckpoint = cacheDoc['fjrParseResCheckpoint']
    nodes = cacheDoc['nodes']
    nodeMap = cacheDoc['nodeMap']
    if jobLogCheckpoint:
        # resume log parsing where we left
        with open((LOG_PARSING_POINTERS_DIR + jobLogCheckpoint), 'rb') as f:
            jel = pickle.load(f)
    else:
        # parse log from beginning
        jel = htcondor.JobEventLog('job_log')

    parseJobLog(jel, nodes, nodeMap)
    # save jel object in a pickle file made unique by a timestamp
    newJelPickleName = 'jel-%d.pkl' % int(time.time())
    if not os.path.exists(LOG_PARSING_POINTERS_DIR):
        os.mkdir(LOG_PARSING_POINTERS_DIR)
    with open((LOG_PARSING_POINTERS_DIR + newJelPickleName), 'wb') as f:
        pickle.dump(jel, f)
    newJobLogCheckpoint = newJelPickleName

    for fn in glob.glob("node_state*"):
        level = re.match(r'(\w+)(?:.(\w+))?', fn).group(2)
        with open(fn, 'r') as nodeState:
            parseNodeStateV2(nodeState, nodes, level)

    try:
        errorSummary, newFjrParseResCheckpoint = summarizeFjrParseResults(
            fjrParseResCheckpoint)
        if errorSummary and newFjrParseResCheckpoint:
            parseErrorReport(errorSummary, nodes)
    except IOError:
        logging.exception("error during error_summary file handling")

    # collect all cache info in a single dictionary and return it to the caller
    newCacheDoc = {}
    newCacheDoc['jobLogCheckpoint'] = newJobLogCheckpoint
    newCacheDoc['fjrParseResCheckpoint'] = newFjrParseResCheckpoint
    newCacheDoc['nodes'] = nodes
    newCacheDoc['nodeMap'] = nodeMap
    return newCacheDoc
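
Examples #16, #26 and #29 all checkpoint the JobEventLog reader by pickling it and unpickling it on the next run; judging by the version guards in Examples #20 and #25, this relies on the picklable readers available in HTCondor 8.9.3 and later. A stripped-down sketch of the same pattern, with hypothetical file names, could look like this:

import pickle
import htcondor

CHECKPOINT_PATH = "jel-checkpoint.pkl"  # hypothetical checkpoint file

def read_new_events(log_path="job_log"):
    """Return only the events appended since the previous invocation."""
    try:
        with open(CHECKPOINT_PATH, "rb") as f:
            jel = pickle.load(f)  # resume from the pickled reader's position
    except (FileNotFoundError, pickle.UnpicklingError):
        jel = htcondor.JobEventLog(log_path)  # first run: start at the beginning

    events = list(jel.events(stop_after=0))  # drain what is currently available

    with open(CHECKPOINT_PATH, "wb") as f:
        pickle.dump(jel, f)  # remember where we stopped for the next run
    return events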
Example #17
def successful_job_log(successful_condor, successful_beneficiary_job,
                       successful_victim_jobs, test_dir):
    bID = str(successful_beneficiary_job.job_ids[0])
    vIDs = [str(vID) for vID in successful_victim_jobs.job_ids]
    rv = successful_condor.run_command(["condor_now", bID] + vIDs)
    assert rv.returncode == 0

    assert successful_beneficiary_job.wait(
        condition=ClusterState.all_running,
        timeout=60,
        fail_condition=ClusterState.any_held,
    )

    # This seems like something I should be able to get from the cluster handle.
    return htcondor.JobEventLog((test_dir / "cmd_now-success.log").as_posix())
Example #18
    def _read_events(self, timeout=0):
        if self._event_reader is None:
            self._event_reader = htcondor.JobEventLog(
                self._event_log_path.as_posix()).events(timeout)

        for event in self._event_reader.events(timeout):
            # skip the late materialization submit event
            if event.proc == -1:
                continue

            job_id = JobID(event.cluster, event.proc)

            if event.type is htcondor.JobEventType.SUBMIT:
                self._jobid_to_taskid[job_id] = uuid.UUID(
                    classad.unquote(event["LogNotes"]))

            # this lookup is safe because the SUBMIT event always comes first
            task_id = self._jobid_to_taskid[job_id]
            task = self.executor.tasks[task_id]

            if event.type is htcondor.JobEventType.JOB_HELD:
                # TODO: turn this into an appropriate exception on the future
                raise Exception("job held")

            new_status = JOB_EVENT_STATUS_TRANSITIONS.get(event.type, None)

            if new_status is not None:
                if new_status is self._task_statuses[task_id]:
                    logger.warning(
                        f"{task} of executor {self.executor} tried to transition into the state it is already in ({new_status})"
                    )
                else:
                    self._task_statuses[task_id] = new_status

                    if new_status is TaskStatus.COMPLETED:
                        x = htio.load_objects(task.output_path)
                        status = next(x)
                        output = next(x)
                        print(f"{task} finished with {status}, {output}")

                        if status == "OK":
                            task.future.set_result(output)
                        elif status == "ERR":
                            task.future.set_exception(output)
                        else:
                            raise Exception(f"bad task status {status}")
Example #19
    def __init__(self, event_log_paths, batch_names):
        event_readers = {}
        for event_log_path in event_log_paths:
            try:
                reader = htcondor.JobEventLog(event_log_path).events(0)
                event_readers[event_log_path] = reader
            except (OSError, IOError) as e:
                warning(
                    "Could not open event log at {} for reading, so it will be ignored. Reason: {}"
                    .format(event_log_path, e))

        self.event_readers = event_readers
        self.state = collections.defaultdict(
            lambda: collections.defaultdict(dict))

        self.batch_names = batch_names

        self.cluster_id_to_cluster = {}
Example #20
    def _read_events(self):
        with self._event_reader_lock:  # no thread can be in here at the same time as another
            if self._event_reader is None:
                logger.debug(
                    f"Created event log reader for map {self.map.tag}")
                self._event_reader = htcondor.JobEventLog(
                    self._event_log_path.as_posix()).events(0)

            with utils.Timer() as timer:
                handled_events = self._handle_events()

            if handled_events > 0:
                logger.debug(
                    f"Processed {handled_events} events for map {self.map.tag} (took {timer.elapsed:.6f} seconds)"
                )

                self.map._local_data = None  # invalidate cache if any events were received

                if utils.BINDINGS_VERSION_INFO >= (8, 9, 3):
                    self.save()
Example #21
def job_log(jobs, condor, test_dir):
    bID = str(jobs.state.by_name[JobStatus.IDLE][0])
    vIDs = [str(vID) for vID in jobs.state.by_name[JobStatus.RUNNING]]

    rv = condor.run_command(["condor_now", "--flags", "1", bID, *vIDs])
    assert rv.returncode == 0

    # Consider converting this into a wait() for these jobs to go idle, and
    # then assert ordering about the eviction event in
    # jobs.event_log.events.
    jel = htcondor.JobEventLog(
        (test_dir / "condor_now_internals.log").as_posix())

    num_evicted = 0
    for e in jel.events(60):
        if e.type == htcondor.JobEventType.JOB_EVICTED and e.cluster == jobs.clusterid:
            num_evicted += 1
            if num_evicted == len(vIDs):
                break
    assert num_evicted == len(vIDs)

    return jel
Example #22
def main(argv):
    jel = htcondor.JobEventLog(argv[1])
    if not jel.isInitialized():
        print "Failed to find job event log {0}".format(argv[1])
        exit(1)

    if (str(jel) != str(iter(jel))):
        print("jel != iter(jel)")
        exit(2)

    if (str(jel) != str(jel.follow())):
        print("jel != jel.follow()")
        exit(3)

    if (str(jel) != str(jel.follow(100))):
        print("jel != jel.follow( 100 )")
        exit(3)

    for event in jel.follow(1000):
        print "Found event of type {0}".format(event.type)
        if event.type != htcondor.JobEventType.NONE:
            print "... for job {0}".format(event.Cluster)
Example #23
    def test_status_no_error_no_parallel(self):
        sched = CondorScheduler()

        log_name = 'completed_no_error_no_parallel.submit.nodes.log'

        jel = htcondor.JobEventLog(
            os.path.join(os.path.dirname(__file__), 'data', log_name))
        events = list(jel.events(stop_after=0))

        with TemporaryDirectory() as td:
            submit_dir = os.path.join(td, 'job', 'submit')
            os.makedirs(submit_dir)
            fn = os.path.join(submit_dir, log_name)

            details = {
                'working_directory': td,
                'submit_directory': 'job/submit'
            }

            def write_next_event():
                with open(fn, "a") as f:
                    f.write(str(events.pop(0)))
                    f.write('...\n')

            write_next_event()
            self.assertEqual(sched.status(None, details),
                             (JobStatus.QUEUED, "Job is queued"))

            for _ in range(7):
                write_next_event()
                self.assertEqual(sched.status(None, details),
                                 (JobStatus.RUNNING, "Job is running"))

            write_next_event()
            self.assertEqual(
                sched.status(None, details),
                (JobStatus.COMPLETED, "All job stages finished successfully"))
Example #24
    def test_status_error_short(self):
        sched = CondorScheduler()

        log_name = 'error_short.submit.nodes.log'

        jel = htcondor.JobEventLog(
            os.path.join(os.path.dirname(__file__), 'data', log_name))
        events = list(jel.events(stop_after=0))

        with TemporaryDirectory() as td:
            submit_dir = os.path.join(td, 'job', 'submit')
            os.makedirs(submit_dir)
            fn = os.path.join(submit_dir, log_name)

            details = {
                'working_directory': td,
                'submit_directory': 'job/submit'
            }

            def write_next_event():
                with open(fn, "a") as f:
                    f.write(str(events.pop(0)))
                    f.write('...\n')

            write_next_event()
            self.assertEqual(sched.status(None, details),
                             (JobStatus.QUEUED, "Job is queued"))

            write_next_event()
            self.assertEqual(sched.status(None, details),
                             (JobStatus.RUNNING, "Job is running"))

            write_next_event()
            self.assertEqual(
                sched.status(None, details),
                (JobStatus.ERROR, "Job terminated with return value 1"))
Example #25
    def _read_events(self):
        with self._event_reader_lock:  # no thread can be in here at the same time as another
            handled_events = False

            if self._event_reader is None:
                logger.debug(
                    f'Created event log reader for map {self.map.tag}')
                self._event_reader = htcondor.JobEventLog(
                    self._event_log_path.as_posix()).events(0)

            for event in self._event_reader:
                handled_events = True

                # skip the late materialization submit event
                if event.proc == -1:
                    continue

                if event.type is htcondor.JobEventType.SUBMIT:
                    self._jobid_to_component[(event.cluster,
                                              event.proc)] = int(
                                                  event['LogNotes'])

                # this lookup is safe because the SUBMIT event always comes first
                component = self._jobid_to_component[(event.cluster,
                                                      event.proc)]

                if event.type is htcondor.JobEventType.IMAGE_SIZE:
                    self._memory_usage[component] = max(
                        self._memory_usage[component],
                        int(event.get('MemoryUsage', 0)),
                    )
                elif event.type is htcondor.JobEventType.JOB_TERMINATED:
                    self._runtime[component] = parse_runtime(
                        event['RunRemoteUsage'])
                elif event.type is htcondor.JobEventType.JOB_RELEASED:
                    self._holds.pop(component, None)
                elif event.type is htcondor.JobEventType.JOB_HELD:
                    h = holds.ComponentHold(
                        code=int(event['HoldReasonCode']),
                        reason=event.get('HoldReason', 'UNKNOWN').strip(),
                    )
                    self._holds[component] = h

                new_status = JOB_EVENT_STATUS_TRANSITIONS.get(event.type, None)

                # the component has *terminated*, but did it error?
                if new_status is ComponentStatus.COMPLETED:
                    try:
                        exec_status = self.map._peek_status(component)
                    except exceptions.OutputNotFound:
                        logger.warning(
                            f'Output was not found for component {component} for map {self.map.tag}, marking as errored'
                        )
                        exec_status = 'ERR'

                    if exec_status == 'ERR':
                        new_status = ComponentStatus.ERRORED

                if new_status is not None:
                    if new_status is self._component_statuses[component]:
                        logger.warning(
                            f'Component {component} of map {self.map.tag} tried to transition into the state it is already in ({new_status})'
                        )
                    else:
                        # this log is commented out because it's very verbose;
                        # it might be helpful when debugging
                        # logger.debug(f'Component {component} of map {self.map.tag} changed state: {self._component_statuses[component]} -> {new_status}')
                        self._component_statuses[component] = new_status

            if handled_events:
                self.map._local_data = None  # invalidate cache if any events were received

                if utils.HTCONDOR_VERSION_INFO >= (8, 9, 3):
                    self.save()
Example #26
def storeNodesInfoInFile():
    """
    Open cache file and get the location until which the jobs_log was parsed last time
    :return: nothing
    """
    jobLogCheckpoint = None
    try:
        if os.path.exists(
                STATUS_CACHE_FILE) and os.stat(STATUS_CACHE_FILE).st_size > 0:
            logging.debug("cache file found, opening and reading")
            nodesStorage = open(STATUS_CACHE_FILE, "r")

            jobLogCheckpoint = nodesStorage.readline().strip()
            fjrParseResCheckpoint = int(nodesStorage.readline())
            nodes = ast.literal_eval(nodesStorage.readline())
            nodeMap = ast.literal_eval(nodesStorage.readline())
            nodesStorage.close()
        else:
            logging.debug("cache file not found, creating")
            jobLogCheckpoint = None
            fjrParseResCheckpoint = 0
            nodes = {}
            nodeMap = {}
    except Exception:
        logging.exception("error during status_cache handling")

    if jobLogCheckpoint:
        with open((LOG_PARSING_POINTERS_DIR + jobLogCheckpoint), 'rb') as f:
            jel = pickle.load(f)
    else:
        jel = htcondor.JobEventLog('job_log')
    #jobsLog = open("job_log", "r")
    #jobsLog.seek(jobLogCheckpoint)

    parseJobLog(jel, nodes, nodeMap)
    # save jel object in a pickle file made unique by a timestamp
    newJelPickleName = 'jel-%d.pkl' % int(time.time())
    if not os.path.exists(LOG_PARSING_POINTERS_DIR):
        os.mkdir(LOG_PARSING_POINTERS_DIR)
    with open((LOG_PARSING_POINTERS_DIR + newJelPickleName), 'wb') as f:
        pickle.dump(jel, f)
    newJobLogCheckpoint = newJelPickleName

    for fn in glob.glob("node_state*"):
        level = re.match(r'(\w+)(?:.(\w+))?', fn).group(2)
        with open(fn, 'r') as nodeState:
            parseNodeStateV2(nodeState, nodes, level)

    try:
        errorSummary, newFjrParseResCheckpoint = summarizeFjrParseResults(
            fjrParseResCheckpoint)
        if errorSummary and newFjrParseResCheckpoint:
            parseErrorReport(errorSummary, nodes)
    except IOError:
        logging.exception("error during error_summary file handling")

    # First write the new cache file under a temporary name, so that other processes
    # don't get an incomplete result. Then replace the old one with the new one.
    tempFilename = (STATUS_CACHE_FILE + ".%s") % os.getpid()

    nodesStorage = open(tempFilename, "w")
    nodesStorage.write(str(newJobLogCheckpoint) + "\n")
    nodesStorage.write(str(newFjrParseResCheckpoint) + "\n")
    nodesStorage.write(str(nodes) + "\n")
    nodesStorage.write(str(nodeMap) + "\n")
    nodesStorage.close()

    move(tempFilename, STATUS_CACHE_FILE)
Example #27
def finished_inline_jobid(inline_dag_job, dag_dir):
    jel = htcondor.JobEventLog(str(dag_dir / "inline.dag.nodes.log"))
    for event in jel.events(0):
        if event.type == htcondor.JobEventType.SUBMIT:
            return JobID.from_job_event(event)
    assert False
Example #28
    def status(self, job_id, details):
        """
        Get the status of a job by scheduler id

        :param job_id: The scheduler job id to check the status of
        :param details: The internal job details object
        :return: A tuple with JobStatus, additional info as a string. None if no job status could be obtained
        """

        p = Path(details['working_directory']) / details['submit_directory']

        print(f"Trying to get status of job with working directory {p}...")

        log_file = list(p.glob('*.submit.nodes.log'))

        if len(log_file) != 1:
            print(
                f"The number of .submit.nodes.log files was not 1 as expected, it was {len(log_file)}"
            )
            return None, None

        log_file = log_file[0]

        # Parse the log event log with condor and get a reverse chronological list of events
        jel = htcondor.JobEventLog(str(log_file))
        events = list(jel.events(stop_after=0))
        events.reverse()

        # Find the most recent submit event and parse the log notes to find which job stage the submit
        # is for
        submit_event = list(
            filter(lambda x: x.type == htcondor.JobEventType.SUBMIT,
                   events))[0]
        notes = submit_event['LogNotes']
        stage = list(
            filter(lambda x: x.startswith("DAG Node:"), notes.splitlines()))

        # There should be exactly one stage found, which is the name of the job dag for the submitted job
        if len(stage) != 1:
            print(
                "No DAG Node could be found for the most recent job submission"
            )
            return None, None

        stage = stage[0]

        # Get the most recent event and determine the job state
        event = events[0]

        if event.type == htcondor.JobEventType.SUBMIT:
            # The only time a job can be queued is when the most recent job that was submitted was the
            # generation stage, otherwise SUBMIT indicates the job is running
            if stage.endswith('_generation_arg_0'):
                return JobStatus.QUEUED, "Job is queued"
            else:
                return JobStatus.RUNNING, "Job is running"

        if event.type == htcondor.JobEventType.EXECUTE:
            # EXECUTE is self-explanatory.
            return JobStatus.RUNNING, "Job is running"

        if event.type == htcondor.JobEventType.JOB_TERMINATED:
            # Jobs that terminate normally and have a return value of 0 completed successfully, otherwise
            # some error has occurred
            if (event["TerminatedNormally"]):
                if event['ReturnValue'] != 0:
                    return JobStatus.ERROR, f"Job terminated with return value {event['ReturnValue']}"

                # Completion status can only be reported if the current job stage is plotting, otherwise
                # job should continue in running state
                if stage.endswith('_plot_arg_0'):
                    return JobStatus.COMPLETED, "All job stages finished successfully"
                else:
                    return JobStatus.RUNNING, "Job is running"
            else:
                # ???
                return JobStatus.ERROR, "Job terminated abnormally"

        # Bilby jobs may be evicted, which is ok. Bilby jobs which are evicted will resubmit via signal
        # and continue. Held/released jobs are also part of the internal eviction/resubmit process
        if event.type in [
                htcondor.JobEventType.JOB_EVICTED,
                htcondor.JobEventType.JOB_HELD,
                htcondor.JobEventType.JOB_RELEASED
        ]:
            return JobStatus.RUNNING, "Job is running"

        # If the job has been aborted, it's probably been cancelled - mark it as such
        if event.type == htcondor.JobEventType.JOB_ABORTED:
            return JobStatus.CANCELLED, "Job has been aborted"

        print(
            f"Unexpected job event {event.type}! for working directory {details['working_directory']}"
        )

        return None, None
Example #29
def storeNodesInfoInFile():
    """
    Open cache file and get the location until which the jobs_log was parsed last time
    returns: a dictionary with keys: jobLogCheckpoint, fjrParseResCheckpoint, nodes, nodeMap
    """
    jobLogCheckpoint = None
    if os.path.exists(
            STATUS_CACHE_FILE) and os.stat(STATUS_CACHE_FILE).st_size > 0:
        logging.debug("cache file found, opening")
        try:
            nodesStorage = open(STATUS_CACHE_FILE, "r")
            jobLogCheckpoint = nodesStorage.readline().strip()
            if jobLogCheckpoint.startswith('#'):
                logging.debug("cache file contains initial comments, skipping")
                # comment line indicates a place-holder file created at DAG bootstrap time
                jobLogCheckpoint = None
            else:
                logging.debug("reading cache file")
                fjrParseResCheckpoint = int(nodesStorage.readline())
                nodes = ast.literal_eval(nodesStorage.readline())
                nodeMap = ast.literal_eval(nodesStorage.readline())
                nodesStorage.close()
        except Exception:
            logging.exception("error during status_cache handling")
            jobLogCheckpoint = None

    if not jobLogCheckpoint:
        logging.debug("no usable cache file found, creating")
        fjrParseResCheckpoint = 0
        nodes = {}
        nodeMap = {}

    if jobLogCheckpoint:
        # resume log parsing where we left
        with open((LOG_PARSING_POINTERS_DIR + jobLogCheckpoint), 'rb') as f:
            jel = pickle.load(f)
    else:
        # parse log from beginning
        jel = htcondor.JobEventLog('job_log')
    #jobsLog = open("job_log", "r")
    #jobsLog.seek(jobLogCheckpoint)

    parseJobLog(jel, nodes, nodeMap)
    # save jel object in a pickle file made unique by a timestamp
    newJelPickleName = 'jel-%d.pkl' % int(time.time())
    if not os.path.exists(LOG_PARSING_POINTERS_DIR):
        os.mkdir(LOG_PARSING_POINTERS_DIR)
    with open((LOG_PARSING_POINTERS_DIR + newJelPickleName), 'wb') as f:
        pickle.dump(jel, f)
    newJobLogCheckpoint = newJelPickleName

    for fn in glob.glob("node_state*"):
        level = re.match(r'(\w+)(?:.(\w+))?', fn).group(2)
        with open(fn, 'r') as nodeState:
            parseNodeStateV2(nodeState, nodes, level)

    try:
        errorSummary, newFjrParseResCheckpoint = summarizeFjrParseResults(
            fjrParseResCheckpoint)
        if errorSummary and newFjrParseResCheckpoint:
            parseErrorReport(errorSummary, nodes)
    except IOError:
        logging.exception("error during error_summary file handling")

    # First write the new cache file under a temporary name, so that other processes
    # don't get an incomplete result. Then replace the old one with the new one.
    tempFilename = (STATUS_CACHE_FILE + ".%s") % os.getpid()

    nodesStorage = open(tempFilename, "w")
    nodesStorage.write(str(newJobLogCheckpoint) + "\n")
    nodesStorage.write(str(newFjrParseResCheckpoint) + "\n")
    nodesStorage.write(str(nodes) + "\n")
    nodesStorage.write(str(nodeMap) + "\n")
    nodesStorage.close()

    move(tempFilename, STATUS_CACHE_FILE)

    # collect all cache info in a single dictionary and return it to the caller
    cacheDoc = {}
    cacheDoc['jobLogCheckpoint'] = newJobLogCheckpoint
    cacheDoc['fjrParseResCheckpoint'] = newFjrParseResCheckpoint
    cacheDoc['nodes'] = nodes
    cacheDoc['nodeMap'] = nodeMap
    return cacheDoc
Example #30
def synthetic(logfile):
    new = ""
    jel = htcondor.JobEventLog(logfile)
    for event in jel.events(stop_after=0):
        new = new + str(event) + "...\n"
    return new