def parseJobLog(fp, nodes, node_map):
    """Replay the events of a job log into the per-node records held in ``nodes``.

    Each event yielded by ``HTCondorUtils.readEvents(fp)`` is dispatched on its
    ``MyType`` field and used to update, in place, the record of the node the
    event belongs to (state, submit/start/end times, wall durations, site
    history, memory and CPU history).  After all events are consumed, the
    ``WallDurations`` and ``SiteHistory`` lists of every node are padded so
    that they have the same length.

    Args:
        fp: Job-log handle, passed unchanged to ``HTCondorUtils.readEvents``.
        nodes: Dict mapping node name to its record dict; updated in place.
            A node first seen in a ``SubmitEvent`` gets a private deep copy of
            ``NODE_DEFAULTS``.
        node_map: Dict mapping ``(Cluster, Proc)`` to node name.  Filled on
            ``SubmitEvent`` and read by the other per-job handlers; updated in
            place so the caller can reuse it.

    Returns:
        None.

    Raises:
        KeyError: If a per-job event refers to a ``(Cluster, Proc)`` pair that
            is not in ``node_map``, or to a node missing from ``nodes``.
    """
    # Function-scope import: only needed to clone NODE_DEFAULTS below.
    import copy

    count = 0
    for event in HTCondorUtils.readEvents(fp):
        count += 1
        # The timestamp format carries no zone; time.mktime() treats it as local time.
        eventtime = time.mktime(time.strptime(event["EventTime"], "%Y-%m-%dT%H:%M:%S"))
        event_type = event["MyType"]

        if event_type == "SubmitEvent":
            m = node_name_re.match(event["LogNotes"])
            if m:
                node = m.groups()[0]
                proc = event["Cluster"], event["Proc"]
                # Every node must own its history lists.  Handing the shared
                # NODE_DEFAULTS object to setdefault() would make all nodes
                # append to the very same lists.
                if node not in nodes:
                    nodes[node] = copy.deepcopy(NODE_DEFAULTS)
                info = nodes[node]
                info["State"] = "idle"
                info["JobIds"].append("%d.%d" % proc)
                info["RecordedSite"] = False
                info["SubmitTimes"].append(eventtime)
                info["TotalUserCpuTimeHistory"].append(0)
                info["TotalSysCpuTimeHistory"].append(0)
                info["WallDurations"].append(0)
                info["ResidentSetSize"].append(0)
                info["Retries"] = len(info["SubmitTimes"]) - 1
                node_map[proc] = node

        elif event_type == "ExecuteEvent":
            node = node_map[event["Cluster"], event["Proc"]]
            nodes[node]["StartTimes"].append(eventtime)
            nodes[node]["State"] = "running"
            nodes[node]["RecordedSite"] = False

        elif event_type == "JobTerminatedEvent":
            node = node_map[event["Cluster"], event["Proc"]]
            nodes[node]["EndTimes"].append(eventtime)
            # The log may lack an ExecuteEvent for this job, leaving StartTimes
            # empty; fall back to a zero duration instead of raising IndexError.
            if nodes[node]["StartTimes"]:
                nodes[node]["WallDurations"][-1] = nodes[node]["EndTimes"][-1] - nodes[node]["StartTimes"][-1]
            else:
                nodes[node]["WallDurations"][-1] = 0
            insertCpu(event, nodes[node])
            if event["TerminatedNormally"] and event["ReturnValue"] == 0:
                nodes[node]["State"] = "transferring"
            else:
                nodes[node]["State"] = "cooloff"

        elif event_type == "PostScriptTerminatedEvent":
            m = node_name2_re.match(event["DAGNodeName"])
            if m:
                node = m.groups()[0]
                if event["TerminatedNormally"]:
                    if event["ReturnValue"] == 0:
                        nodes[node]["State"] = "finished"
                    elif event["ReturnValue"] == 2:
                        nodes[node]["State"] = "failed"
                    else:
                        nodes[node]["State"] = "cooloff"
                else:
                    nodes[node]["State"] = "cooloff"

        elif event_type in ("ShadowExceptionEvent", "JobReconnectFailedEvent", "JobEvictedEvent"):
            node = node_map[event["Cluster"], event["Proc"]]
            if nodes[node]["State"] != "idle":
                nodes[node]["EndTimes"].append(eventtime)
                if nodes[node]["WallDurations"] and nodes[node]["EndTimes"] and nodes[node]["StartTimes"]:
                    nodes[node]["WallDurations"][-1] = nodes[node]["EndTimes"][-1] - nodes[node]["StartTimes"][-1]
                nodes[node]["State"] = "idle"
                insertCpu(event, nodes[node])
                # Open a fresh slot in every per-attempt history list.
                nodes[node]["TotalUserCpuTimeHistory"].append(0)
                nodes[node]["TotalSysCpuTimeHistory"].append(0)
                nodes[node]["WallDurations"].append(0)
                nodes[node]["ResidentSetSize"].append(0)
                nodes[node]["SubmitTimes"].append(-1)
                nodes[node]["JobIds"].append(nodes[node]["JobIds"][-1])
                nodes[node]["Restarts"] += 1

        elif event_type == "JobAbortedEvent":
            node = node_map[event["Cluster"], event["Proc"]]
            state = nodes[node]["State"]
            if state == "idle" or state == "held":
                # The job never started: record placeholder start time / site.
                nodes[node]["StartTimes"].append(-1)
                if not nodes[node]["RecordedSite"]:
                    nodes[node]["SiteHistory"].append("Unknown")
            elif state == "running":
                # The job was aborted while executing: close the attempt so the
                # end time and final wall duration are not lost.
                nodes[node]["EndTimes"].append(eventtime)
                if nodes[node]["StartTimes"]:
                    nodes[node]["WallDurations"][-1] = nodes[node]["EndTimes"][-1] - nodes[node]["StartTimes"][-1]
            nodes[node]["State"] = "killed"
            insertCpu(event, nodes[node])

        elif event_type == "JobHeldEvent":
            node = node_map[event["Cluster"], event["Proc"]]
            if nodes[node]["State"] == "running":
                nodes[node]["EndTimes"].append(eventtime)
                if nodes[node]["WallDurations"] and nodes[node]["EndTimes"] and nodes[node]["StartTimes"]:
                    nodes[node]["WallDurations"][-1] = nodes[node]["EndTimes"][-1] - nodes[node]["StartTimes"][-1]
                insertCpu(event, nodes[node])
                nodes[node]["TotalUserCpuTimeHistory"].append(0)
                nodes[node]["TotalSysCpuTimeHistory"].append(0)
                nodes[node]["WallDurations"].append(0)
                nodes[node]["ResidentSetSize"].append(0)
                nodes[node]["SubmitTimes"].append(-1)
                nodes[node]["JobIds"].append(nodes[node]["JobIds"][-1])
                nodes[node]["Restarts"] += 1
            nodes[node]["State"] = "held"

        elif event_type == "JobReleaseEvent":
            node = node_map[event["Cluster"], event["Proc"]]
            nodes[node]["State"] = "idle"

        elif event_type == "JobAdInformationEvent":
            node = node_map[event["Cluster"], event["Proc"]]
            # Record the site once per attempt; values starting with "$$" are skipped.
            if (
                (not nodes[node]["RecordedSite"])
                and ("JOBGLIDEIN_CMSSite" in event)
                and not event["JOBGLIDEIN_CMSSite"].startswith("$$")
            ):
                nodes[node]["SiteHistory"].append(event["JOBGLIDEIN_CMSSite"])
                nodes[node]["RecordedSite"] = True
            insertCpu(event, nodes[node])

        elif event_type == "JobImageSizeEvent":
            node = node_map[event["Cluster"], event["Proc"]]
            nodes[node]["ResidentSetSize"][-1] = int(event["ResidentSetSize"])
            if nodes[node]["StartTimes"]:
                # Running estimate of the wall duration for the current attempt.
                nodes[node]["WallDurations"][-1] = eventtime - nodes[node]["StartTimes"][-1]
            insertCpu(event, nodes[node])

        elif event_type == "JobDisconnectedEvent" or event_type == "JobReconnectedEvent":
            # These events don't really affect the node status
            pass

        else:
            logging.warning("Unknown event type: %s", event_type)

    logging.debug("There were %d events in the job log.", count)

    # Pad WallDurations / SiteHistory so both lists end up with equal length.
    now = time.time()
    for node, info in nodes.items():
        if node == "DagStatus":
            # StartTimes and WallDurations are not present, though crab status2
            # uses this record to get the DagStatus.
            continue
        last_start = now
        if info["StartTimes"]:
            last_start = info["StartTimes"][-1]
        while len(info["WallDurations"]) < len(info["SiteHistory"]):
            info["WallDurations"].append(now - last_start)
        while len(info["WallDurations"]) > len(info["SiteHistory"]):
            info["SiteHistory"].append("Unknown")
def parseJobLog(fp, nodes, nodeMap):
    """Apply every event of a job log to the node records in ``nodes``.

    Events come from ``HTCondorUtils.readEvents(fp)``; each one is routed on
    its ``MyType`` value and mutates the record of the node it refers to.
    ``nodeMap`` is filled with ``(Cluster, Proc) -> node name`` entries on
    ``SubmitEvent`` and consulted by the other per-job events.  Once the log
    is exhausted, ``WallDurations`` and ``SiteHistory`` of each node (except
    the ``DagStatus`` entry) are padded to the same length.

    Both ``nodes`` and ``nodeMap`` are modified in place; nothing is returned.
    """

    def recordFor(evt):
        """Return the node record addressed by the event's (Cluster, Proc)."""
        return nodes[nodeMap[evt['Cluster'], evt['Proc']]]

    def closeAttempt(record, endTime):
        """Store the end time and, when the lists allow it, the wall duration."""
        record['EndTimes'].append(endTime)
        if record['WallDurations'] and record['EndTimes'] and record['StartTimes']:
            record['WallDurations'][-1] = record['EndTimes'][-1] - record['StartTimes'][-1]

    def openNewAttempt(record):
        """Append a fresh slot to every per-attempt list and count the restart."""
        for key in ('TotalUserCpuTimeHistory', 'TotalSysCpuTimeHistory',
                    'WallDurations', 'ResidentSetSize'):
            record[key].append(0)
        record['SubmitTimes'].append(-1)
        record['JobIds'].append(record['JobIds'][-1])
        record['Restarts'] += 1

    count = 0
    for event in HTCondorUtils.readEvents(fp):
        count += 1
        parsedTime = time.strptime(event['EventTime'], "%Y-%m-%dT%H:%M:%S")
        eventtime = time.mktime(parsedTime)
        kind = event['MyType']

        if kind == 'SubmitEvent':
            matched = nodeNameRe.match(event['LogNotes'])
            if matched:
                node = matched.groups()[0]
                proc = event['Cluster'], event['Proc']
                record = nodes.setdefault(node, copy.deepcopy(NODE_DEFAULTS))
                record['State'] = 'idle'
                record['JobIds'].append("%d.%d" % proc)
                record['RecordedSite'] = False
                record['SubmitTimes'].append(eventtime)
                for key in ('TotalUserCpuTimeHistory', 'TotalSysCpuTimeHistory',
                            'WallDurations', 'ResidentSetSize'):
                    record[key].append(0)
                record['Retries'] = len(record['SubmitTimes']) - 1
                nodeMap[proc] = node

        elif kind == 'ExecuteEvent':
            record = recordFor(event)
            record['StartTimes'].append(eventtime)
            record['State'] = 'running'
            record['RecordedSite'] = False

        elif kind == 'JobTerminatedEvent':
            record = recordFor(event)
            record['EndTimes'].append(eventtime)
            # at times HTCondor does not log the ExecuteEvent and there's no StartTime
            if record['StartTimes']:
                record['WallDurations'][-1] = record['EndTimes'][-1] - record['StartTimes'][-1]
            else:
                record['WallDurations'][-1] = 0
            insertCpu(event, record)
            succeeded = event['TerminatedNormally'] and event['ReturnValue'] == 0
            record['State'] = 'transferring' if succeeded else 'cooloff'

        elif kind == 'PostScriptTerminatedEvent':
            matched = nodeName2Re.match(event['DAGNodeName'])
            if matched:
                node = matched.groups()[0]
                if not event['TerminatedNormally']:
                    newState = 'cooloff'
                elif event['ReturnValue'] == 0:
                    newState = 'finished'
                elif event['ReturnValue'] == 2:
                    newState = 'failed'
                else:
                    newState = 'cooloff'
                nodes[node]['State'] = newState

        elif kind in ('ShadowExceptionEvent', 'JobReconnectFailedEvent', 'JobEvictedEvent'):
            record = recordFor(event)
            if record['State'] != 'idle':
                closeAttempt(record, eventtime)
                record['State'] = 'idle'
                insertCpu(event, record)
                openNewAttempt(record)

        elif kind == 'JobAbortedEvent':
            record = recordFor(event)
            stateBefore = record['State']
            if stateBefore in ('idle', 'held'):
                record['StartTimes'].append(-1)
                if not record['RecordedSite']:
                    record['SiteHistory'].append("Unknown")
            if stateBefore == 'running':
                record['EndTimes'].append(eventtime)
                # State can be 'running' only if an ExecuteEvent was found, so StartTimes must be defined
                record['WallDurations'][-1] = record['EndTimes'][-1] - record['StartTimes'][-1]
            record['State'] = 'killed'
            insertCpu(event, record)

        elif kind == 'JobHeldEvent':
            record = recordFor(event)
            if record['State'] == 'running':
                closeAttempt(record, eventtime)
                insertCpu(event, record)
                openNewAttempt(record)
            record['State'] = 'held'

        elif kind == 'JobReleaseEvent':
            recordFor(event)['State'] = 'idle'

        elif kind == 'JobAdInformationEvent':
            record = recordFor(event)
            if not record['RecordedSite'] and 'JOBGLIDEIN_CMSSite' in event:
                site = event['JOBGLIDEIN_CMSSite']
                if not site.startswith("$$"):
                    record['SiteHistory'].append(site)
                    record['RecordedSite'] = True
            insertCpu(event, record)

        elif kind == 'JobImageSizeEvent':
            record = recordFor(event)
            record['ResidentSetSize'][-1] = int(event['ResidentSetSize'])
            if record['StartTimes']:
                record['WallDurations'][-1] = eventtime - record['StartTimes'][-1]
            insertCpu(event, record)

        elif kind in ('JobDisconnectedEvent', 'JobReconnectedEvent'):
            # These events don't really affect the node status
            continue

        else:
            logging.warning("Unknown event type: %s", kind)

    logging.debug("There were %d events in the job log.", count)

    now = time.time()
    for name, record in nodes.items():
        if name == 'DagStatus':
            # StartTimes and WallDurations are not present, though crab status2
            # uses this record to get the DagStatus.
            continue
        lastStart = record['StartTimes'][-1] if record['StartTimes'] else now
        durations = record['WallDurations']
        sites = record['SiteHistory']
        shortfall = len(sites) - len(durations)
        if shortfall > 0:
            # More sites than durations: fill in durations measured up to now.
            durations.extend([now - lastStart] * shortfall)
        elif shortfall < 0:
            # More durations than sites: the missing sites are unknown.
            sites.extend(["Unknown"] * -shortfall)
def parseJobLog(fp, nodes, nodeMap):
    """Replay the events of a job log into the per-node records held in ``nodes``.

    Each event yielded by ``HTCondorUtils.readEvents(fp)`` is dispatched on its
    ``MyType`` field and used to update, in place, the record of the node the
    event belongs to.  After all events are consumed, ``WallDurations`` and
    ``SiteHistory`` of every node (except the ``DagStatus`` entry) are padded
    to the same length.

    Args:
        fp: Job-log handle, passed unchanged to ``HTCondorUtils.readEvents``.
        nodes: Dict mapping node name to its record dict; updated in place.
            A node first seen in a ``SubmitEvent`` gets a deep copy of
            ``NODE_DEFAULTS``.
        nodeMap: Dict mapping ``(Cluster, Proc)`` to node name.  Filled on
            ``SubmitEvent`` and read by the other per-job handlers.

    Returns:
        None.

    Raises:
        KeyError: If a per-job event refers to a ``(Cluster, Proc)`` pair that
            is not in ``nodeMap``, or to a node missing from ``nodes``.
    """
    count = 0
    for event in HTCondorUtils.readEvents(fp):
        count += 1
        # The timestamp format carries no zone; time.mktime() treats it as local time.
        eventtime = time.mktime(time.strptime(event['EventTime'], "%Y-%m-%dT%H:%M:%S"))
        eventType = event['MyType']

        if eventType == 'SubmitEvent':
            m = nodeNameRe.match(event['LogNotes'])
            if m:
                node = m.groups()[0]
                proc = event['Cluster'], event['Proc']
                # Only clone the defaults for nodes seen for the first time;
                # resubmissions reuse the existing record.
                if node not in nodes:
                    nodes[node] = copy.deepcopy(NODE_DEFAULTS)
                info = nodes[node]
                info['State'] = 'idle'
                info['JobIds'].append("%d.%d" % proc)
                info['RecordedSite'] = False
                info['SubmitTimes'].append(eventtime)
                info['TotalUserCpuTimeHistory'].append(0)
                info['TotalSysCpuTimeHistory'].append(0)
                info['WallDurations'].append(0)
                info['ResidentSetSize'].append(0)
                info['Retries'] = len(info['SubmitTimes']) - 1
                nodeMap[proc] = node

        elif eventType == 'ExecuteEvent':
            node = nodeMap[event['Cluster'], event['Proc']]
            nodes[node]['StartTimes'].append(eventtime)
            nodes[node]['State'] = 'running'
            nodes[node]['RecordedSite'] = False

        elif eventType == 'JobTerminatedEvent':
            node = nodeMap[event['Cluster'], event['Proc']]
            nodes[node]['EndTimes'].append(eventtime)
            # at times HTCondor does not log the ExecuteEvent and there's no StartTime
            if nodes[node]['StartTimes']:
                nodes[node]['WallDurations'][-1] = nodes[node]['EndTimes'][-1] - nodes[node]['StartTimes'][-1]
            else:
                nodes[node]['WallDurations'][-1] = 0
            insertCpu(event, nodes[node])
            if event['TerminatedNormally'] and event['ReturnValue'] == 0:
                nodes[node]['State'] = 'transferring'
            else:
                nodes[node]['State'] = 'cooloff'

        elif eventType == 'PostScriptTerminatedEvent':
            m = nodeName2Re.match(event['DAGNodeName'])
            if m:
                node = m.groups()[0]
                if event['TerminatedNormally']:
                    if event['ReturnValue'] == 0:
                        nodes[node]['State'] = 'finished'
                    elif event['ReturnValue'] == 2:
                        nodes[node]['State'] = 'failed'
                    else:
                        nodes[node]['State'] = 'cooloff'
                else:
                    nodes[node]['State'] = 'cooloff'

        elif eventType in ('ShadowExceptionEvent', 'JobReconnectFailedEvent', 'JobEvictedEvent'):
            node = nodeMap[event['Cluster'], event['Proc']]
            if nodes[node]['State'] != 'idle':
                nodes[node]['EndTimes'].append(eventtime)
                if nodes[node]['WallDurations'] and nodes[node]['EndTimes'] and nodes[node]['StartTimes']:
                    nodes[node]['WallDurations'][-1] = nodes[node]['EndTimes'][-1] - nodes[node]['StartTimes'][-1]
                nodes[node]['State'] = 'idle'
                insertCpu(event, nodes[node])
                # Open a fresh slot in every per-attempt history list.
                nodes[node]['TotalUserCpuTimeHistory'].append(0)
                nodes[node]['TotalSysCpuTimeHistory'].append(0)
                nodes[node]['WallDurations'].append(0)
                nodes[node]['ResidentSetSize'].append(0)
                nodes[node]['SubmitTimes'].append(-1)
                nodes[node]['JobIds'].append(nodes[node]['JobIds'][-1])
                nodes[node]['Restarts'] += 1

        elif eventType == 'JobAbortedEvent':
            node = nodeMap[event['Cluster'], event['Proc']]
            state = nodes[node]['State']
            if state == "idle" or state == "held":
                # The job never started: record placeholder start time / site.
                nodes[node]['StartTimes'].append(-1)
                if not nodes[node]['RecordedSite']:
                    nodes[node]['SiteHistory'].append("Unknown")
            elif state == 'running':
                # Aborted while executing: close the attempt so that the end
                # time and the final wall duration are recorded.
                nodes[node]['EndTimes'].append(eventtime)
                if nodes[node]['StartTimes']:
                    nodes[node]['WallDurations'][-1] = nodes[node]['EndTimes'][-1] - nodes[node]['StartTimes'][-1]
            nodes[node]['State'] = 'killed'
            insertCpu(event, nodes[node])

        elif eventType == 'JobHeldEvent':
            node = nodeMap[event['Cluster'], event['Proc']]
            if nodes[node]['State'] == 'running':
                nodes[node]['EndTimes'].append(eventtime)
                if nodes[node]['WallDurations'] and nodes[node]['EndTimes'] and nodes[node]['StartTimes']:
                    nodes[node]['WallDurations'][-1] = nodes[node]['EndTimes'][-1] - nodes[node]['StartTimes'][-1]
                insertCpu(event, nodes[node])
                nodes[node]['TotalUserCpuTimeHistory'].append(0)
                nodes[node]['TotalSysCpuTimeHistory'].append(0)
                nodes[node]['WallDurations'].append(0)
                nodes[node]['ResidentSetSize'].append(0)
                nodes[node]['SubmitTimes'].append(-1)
                nodes[node]['JobIds'].append(nodes[node]['JobIds'][-1])
                nodes[node]['Restarts'] += 1
            nodes[node]['State'] = 'held'

        elif eventType == 'JobReleaseEvent':
            node = nodeMap[event['Cluster'], event['Proc']]
            nodes[node]['State'] = 'idle'

        elif eventType == 'JobAdInformationEvent':
            node = nodeMap[event['Cluster'], event['Proc']]
            # Record the site once per attempt; values starting with "$$" are skipped.
            if (
                (not nodes[node]['RecordedSite'])
                and ('JOBGLIDEIN_CMSSite' in event)
                and not event['JOBGLIDEIN_CMSSite'].startswith("$$")
            ):
                nodes[node]['SiteHistory'].append(event['JOBGLIDEIN_CMSSite'])
                nodes[node]['RecordedSite'] = True
            insertCpu(event, nodes[node])

        elif eventType == 'JobImageSizeEvent':
            node = nodeMap[event['Cluster'], event['Proc']]
            nodes[node]['ResidentSetSize'][-1] = int(event['ResidentSetSize'])
            if nodes[node]['StartTimes']:
                # Running estimate of the wall duration for the current attempt.
                nodes[node]['WallDurations'][-1] = eventtime - nodes[node]['StartTimes'][-1]
            insertCpu(event, nodes[node])

        elif eventType == "JobDisconnectedEvent" or eventType == "JobReconnectedEvent":
            # These events don't really affect the node status
            pass

        else:
            logging.warning("Unknown event type: %s", eventType)

    logging.debug("There were %d events in the job log.", count)

    # Pad WallDurations / SiteHistory so both lists end up with equal length.
    now = time.time()
    for node, info in nodes.items():
        if node == 'DagStatus':
            # StartTimes and WallDurations are not present, though crab status2
            # uses this record to get the DagStatus.
            continue
        lastStart = now
        if info['StartTimes']:
            lastStart = info['StartTimes'][-1]
        while len(info['WallDurations']) < len(info['SiteHistory']):
            info['WallDurations'].append(now - lastStart)
        while len(info['WallDurations']) > len(info['SiteHistory']):
            info['SiteHistory'].append("Unknown")
def parseJobLog(self, fp, nodes):
    """Replay the events of a job log into the per-node records held in ``nodes``.

    Each event yielded by ``HTCondorUtils.readEvents(fp)`` is dispatched on its
    ``MyType`` field and used to update, in place, the record of the node the
    event belongs to.  A local ``(Cluster, Proc) -> node name`` map is built
    from the ``SubmitEvent`` entries and used to route the other per-job
    events.  After all events are consumed, ``WallDurations`` and
    ``SiteHistory`` of every node are padded to the same length.

    Reads ``self.node_name_re``, ``self.node_name2_re``, ``self.insertCpu``
    and ``self.logger``.

    Args:
        fp: Job-log handle, passed unchanged to ``HTCondorUtils.readEvents``.
        nodes: Dict mapping node name to its record dict; updated in place.

    Returns:
        None.

    Raises:
        KeyError: If a per-job event refers to a ``(Cluster, Proc)`` pair for
            which no matching ``SubmitEvent`` was seen in this log.
    """
    node_map = {}
    count = 0
    for event in HTCondorUtils.readEvents(fp):
        count += 1
        # The timestamp format carries no zone; time.mktime() treats it as local time.
        eventtime = time.mktime(time.strptime(event['EventTime'], "%Y-%m-%dT%H:%M:%S"))
        event_type = event['MyType']

        if event_type == 'SubmitEvent':
            m = self.node_name_re.match(event['LogNotes'])
            if m:
                node = m.groups()[0]
                proc = event['Cluster'], event['Proc']
                if node not in nodes:
                    # Fresh record with its own (unshared) history lists.
                    nodes[node] = {
                        'Retries': 0,
                        'Restarts': 0,
                        'SiteHistory': [],
                        'ResidentSetSize': [],
                        'SubmitTimes': [],
                        'StartTimes': [],
                        'EndTimes': [],
                        'TotalUserCpuTimeHistory': [],
                        'TotalSysCpuTimeHistory': [],
                        'WallDurations': [],
                        'JobIds': [],
                    }
                info = nodes[node]
                info['State'] = 'idle'
                info['JobIds'].append("%d.%d" % proc)
                info['RecordedSite'] = False
                info['SubmitTimes'].append(eventtime)
                info['TotalUserCpuTimeHistory'].append(0)
                info['TotalSysCpuTimeHistory'].append(0)
                info['WallDurations'].append(0)
                info['ResidentSetSize'].append(0)
                info['Retries'] = len(info['SubmitTimes']) - 1
                node_map[proc] = node

        elif event_type == 'ExecuteEvent':
            node = node_map[event['Cluster'], event['Proc']]
            nodes[node]['StartTimes'].append(eventtime)
            nodes[node]['State'] = 'running'
            nodes[node]['RecordedSite'] = False

        elif event_type == 'JobTerminatedEvent':
            node = node_map[event['Cluster'], event['Proc']]
            nodes[node]['EndTimes'].append(eventtime)
            # The log may lack an ExecuteEvent for this job, leaving StartTimes
            # empty; fall back to a zero duration instead of raising IndexError.
            if nodes[node]['StartTimes']:
                nodes[node]['WallDurations'][-1] = nodes[node]['EndTimes'][-1] - nodes[node]['StartTimes'][-1]
            else:
                nodes[node]['WallDurations'][-1] = 0
            self.insertCpu(event, nodes[node])
            if event['TerminatedNormally'] and event['ReturnValue'] == 0:
                nodes[node]['State'] = 'transferring'
            else:
                nodes[node]['State'] = 'cooloff'

        elif event_type == 'PostScriptTerminatedEvent':
            m = self.node_name2_re.match(event['DAGNodeName'])
            if m:
                node = m.groups()[0]
                if event['TerminatedNormally']:
                    if event['ReturnValue'] == 0:
                        nodes[node]['State'] = 'finished'
                    elif event['ReturnValue'] == 2:
                        nodes[node]['State'] = 'failed'
                    else:
                        nodes[node]['State'] = 'cooloff'
                else:
                    nodes[node]['State'] = 'cooloff'

        elif event_type in ('ShadowExceptionEvent', 'JobReconnectFailedEvent', 'JobEvictedEvent'):
            node = node_map[event['Cluster'], event['Proc']]
            if nodes[node]['State'] != 'idle':
                nodes[node]['EndTimes'].append(eventtime)
                if nodes[node]['WallDurations'] and nodes[node]['EndTimes'] and nodes[node]['StartTimes']:
                    nodes[node]['WallDurations'][-1] = nodes[node]['EndTimes'][-1] - nodes[node]['StartTimes'][-1]
                nodes[node]['State'] = 'idle'
                self.insertCpu(event, nodes[node])
                # Open a fresh slot in every per-attempt history list.
                nodes[node]['TotalUserCpuTimeHistory'].append(0)
                nodes[node]['TotalSysCpuTimeHistory'].append(0)
                nodes[node]['WallDurations'].append(0)
                nodes[node]['ResidentSetSize'].append(0)
                nodes[node]['SubmitTimes'].append(-1)
                nodes[node]['JobIds'].append(nodes[node]['JobIds'][-1])
                nodes[node]['Restarts'] += 1

        elif event_type == 'JobAbortedEvent':
            node = node_map[event['Cluster'], event['Proc']]
            if nodes[node]['State'] == "idle" or nodes[node]['State'] == "held":
                # The job never started: record placeholder start time / site.
                nodes[node]['StartTimes'].append(-1)
                if not nodes[node]['RecordedSite']:
                    nodes[node]['SiteHistory'].append("Unknown")
            nodes[node]['State'] = 'killed'
            self.insertCpu(event, nodes[node])

        elif event_type == 'JobHeldEvent':
            node = node_map[event['Cluster'], event['Proc']]
            if nodes[node]['State'] == 'running':
                nodes[node]['EndTimes'].append(eventtime)
                if nodes[node]['WallDurations'] and nodes[node]['EndTimes'] and nodes[node]['StartTimes']:
                    nodes[node]['WallDurations'][-1] = nodes[node]['EndTimes'][-1] - nodes[node]['StartTimes'][-1]
                self.insertCpu(event, nodes[node])
                nodes[node]['TotalUserCpuTimeHistory'].append(0)
                nodes[node]['TotalSysCpuTimeHistory'].append(0)
                nodes[node]['WallDurations'].append(0)
                nodes[node]['ResidentSetSize'].append(0)
                nodes[node]['SubmitTimes'].append(-1)
                nodes[node]['JobIds'].append(nodes[node]['JobIds'][-1])
                nodes[node]['Restarts'] += 1
            nodes[node]['State'] = 'held'

        elif event_type == 'JobReleaseEvent':
            node = node_map[event['Cluster'], event['Proc']]
            nodes[node]['State'] = 'idle'

        elif event_type == 'JobAdInformationEvent':
            node = node_map[event['Cluster'], event['Proc']]
            # Record the site once per attempt; values starting with "$$" are skipped.
            if (
                (not nodes[node]['RecordedSite'])
                and ('JOBGLIDEIN_CMSSite' in event)
                and not event['JOBGLIDEIN_CMSSite'].startswith("$$")
            ):
                nodes[node]['SiteHistory'].append(event['JOBGLIDEIN_CMSSite'])
                nodes[node]['RecordedSite'] = True
            self.insertCpu(event, nodes[node])

        elif event_type == 'JobImageSizeEvent':
            # Resolve the node from this event; relying on the ``node`` left
            # over from a previous event would update the wrong record.
            node = node_map[event['Cluster'], event['Proc']]
            nodes[node]['ResidentSetSize'][-1] = int(event['ResidentSetSize'])
            if nodes[node]['StartTimes']:
                # Running estimate of the wall duration for the current attempt.
                nodes[node]['WallDurations'][-1] = eventtime - nodes[node]['StartTimes'][-1]
            self.insertCpu(event, nodes[node])

        elif event_type == "JobDisconnectedEvent" or event_type == "JobReconnectedEvent":
            # These events don't really affect the node status
            pass

        else:
            self.logger.warning("Unknown event type: %s", event_type)

    self.logger.debug("There were %d events in the job log.", count)

    # Pad WallDurations / SiteHistory so both lists end up with equal length.
    now = time.time()
    for node, info in nodes.items():
        last_start = now
        if info['StartTimes']:
            last_start = info['StartTimes'][-1]
        while len(info['WallDurations']) < len(info['SiteHistory']):
            info['WallDurations'].append(now - last_start)
        while len(info['WallDurations']) > len(info['SiteHistory']):
            info['SiteHistory'].append("Unknown")
def parseJobLog(self, fp, nodes):
    """Replay the events of a job log into the per-node records held in ``nodes``.

    Each event yielded by ``HTCondorUtils.readEvents(fp)`` is dispatched on its
    ``MyType`` field and used to update, in place, the record of the node the
    event belongs to.  A local ``(Cluster, Proc) -> node name`` map is built
    from the ``SubmitEvent`` entries and used to route the other per-job
    events.  After all events are consumed, ``WallDurations`` and
    ``SiteHistory`` of every node are padded to the same length.

    Reads ``self.node_name_re``, ``self.node_name2_re``, ``self.insertCpu``
    and ``self.logger``.

    Args:
        fp: Job-log handle, passed unchanged to ``HTCondorUtils.readEvents``.
        nodes: Dict mapping node name to its record dict; updated in place.

    Returns:
        None.

    Raises:
        KeyError: If a per-job event refers to a ``(Cluster, Proc)`` pair for
            which no matching ``SubmitEvent`` was seen in this log.
    """
    node_map = {}
    count = 0
    for event in HTCondorUtils.readEvents(fp):
        count += 1
        # The timestamp format carries no zone; time.mktime() treats it as local time.
        eventtime = time.mktime(time.strptime(event['EventTime'], "%Y-%m-%dT%H:%M:%S"))
        event_type = event['MyType']

        if event_type == 'SubmitEvent':
            m = self.node_name_re.match(event['LogNotes'])
            if m:
                node = m.groups()[0]
                proc = event['Cluster'], event['Proc']
                if node not in nodes:
                    # Fresh record with its own (unshared) history lists.
                    nodes[node] = {
                        'Retries': 0,
                        'Restarts': 0,
                        'SiteHistory': [],
                        'ResidentSetSize': [],
                        'SubmitTimes': [],
                        'StartTimes': [],
                        'EndTimes': [],
                        'TotalUserCpuTimeHistory': [],
                        'TotalSysCpuTimeHistory': [],
                        'WallDurations': [],
                        'JobIds': [],
                    }
                info = nodes[node]
                info['State'] = 'idle'
                info['JobIds'].append("%d.%d" % proc)
                info['RecordedSite'] = False
                info['SubmitTimes'].append(eventtime)
                info['TotalUserCpuTimeHistory'].append(0)
                info['TotalSysCpuTimeHistory'].append(0)
                info['WallDurations'].append(0)
                info['ResidentSetSize'].append(0)
                info['Retries'] = len(info['SubmitTimes']) - 1
                node_map[proc] = node

        elif event_type == 'ExecuteEvent':
            node = node_map[event['Cluster'], event['Proc']]
            nodes[node]['StartTimes'].append(eventtime)
            nodes[node]['State'] = 'running'
            nodes[node]['RecordedSite'] = False

        elif event_type == 'JobTerminatedEvent':
            node = node_map[event['Cluster'], event['Proc']]
            nodes[node]['EndTimes'].append(eventtime)
            # The log may lack an ExecuteEvent for this job, leaving StartTimes
            # empty; fall back to a zero duration instead of raising IndexError.
            if nodes[node]['StartTimes']:
                nodes[node]['WallDurations'][-1] = nodes[node]['EndTimes'][-1] - nodes[node]['StartTimes'][-1]
            else:
                nodes[node]['WallDurations'][-1] = 0
            self.insertCpu(event, nodes[node])
            if event['TerminatedNormally'] and event['ReturnValue'] == 0:
                nodes[node]['State'] = 'transferring'
            else:
                nodes[node]['State'] = 'cooloff'

        elif event_type == 'PostScriptTerminatedEvent':
            m = self.node_name2_re.match(event['DAGNodeName'])
            if m:
                node = m.groups()[0]
                if event['TerminatedNormally']:
                    if event['ReturnValue'] == 0:
                        nodes[node]['State'] = 'finished'
                    elif event['ReturnValue'] == 2:
                        nodes[node]['State'] = 'failed'
                    else:
                        nodes[node]['State'] = 'cooloff'
                else:
                    nodes[node]['State'] = 'cooloff'

        elif event_type in ('ShadowExceptionEvent', 'JobReconnectFailedEvent', 'JobEvictedEvent'):
            node = node_map[event['Cluster'], event['Proc']]
            if nodes[node]['State'] != 'idle':
                nodes[node]['EndTimes'].append(eventtime)
                if nodes[node]['WallDurations'] and nodes[node]['EndTimes'] and nodes[node]['StartTimes']:
                    nodes[node]['WallDurations'][-1] = nodes[node]['EndTimes'][-1] - nodes[node]['StartTimes'][-1]
                nodes[node]['State'] = 'idle'
                self.insertCpu(event, nodes[node])
                # Open a fresh slot in every per-attempt history list.
                nodes[node]['TotalUserCpuTimeHistory'].append(0)
                nodes[node]['TotalSysCpuTimeHistory'].append(0)
                nodes[node]['WallDurations'].append(0)
                nodes[node]['ResidentSetSize'].append(0)
                nodes[node]['SubmitTimes'].append(-1)
                nodes[node]['JobIds'].append(nodes[node]['JobIds'][-1])
                nodes[node]['Restarts'] += 1

        elif event_type == 'JobAbortedEvent':
            node = node_map[event['Cluster'], event['Proc']]
            if nodes[node]['State'] == "idle" or nodes[node]['State'] == "held":
                # The job never started: record placeholder start time / site.
                nodes[node]['StartTimes'].append(-1)
                if not nodes[node]['RecordedSite']:
                    nodes[node]['SiteHistory'].append("Unknown")
            nodes[node]['State'] = 'killed'
            self.insertCpu(event, nodes[node])

        elif event_type == 'JobHeldEvent':
            node = node_map[event['Cluster'], event['Proc']]
            if nodes[node]['State'] == 'running':
                nodes[node]['EndTimes'].append(eventtime)
                if nodes[node]['WallDurations'] and nodes[node]['EndTimes'] and nodes[node]['StartTimes']:
                    nodes[node]['WallDurations'][-1] = nodes[node]['EndTimes'][-1] - nodes[node]['StartTimes'][-1]
                self.insertCpu(event, nodes[node])
                nodes[node]['TotalUserCpuTimeHistory'].append(0)
                nodes[node]['TotalSysCpuTimeHistory'].append(0)
                nodes[node]['WallDurations'].append(0)
                nodes[node]['ResidentSetSize'].append(0)
                nodes[node]['SubmitTimes'].append(-1)
                nodes[node]['JobIds'].append(nodes[node]['JobIds'][-1])
                nodes[node]['Restarts'] += 1
            nodes[node]['State'] = 'held'

        elif event_type == 'JobReleaseEvent':
            node = node_map[event['Cluster'], event['Proc']]
            nodes[node]['State'] = 'idle'

        elif event_type == 'JobAdInformationEvent':
            node = node_map[event['Cluster'], event['Proc']]
            # Record the site once per attempt; values starting with "$$" are skipped.
            if (
                (not nodes[node]['RecordedSite'])
                and ('JOBGLIDEIN_CMSSite' in event)
                and not event['JOBGLIDEIN_CMSSite'].startswith("$$")
            ):
                nodes[node]['SiteHistory'].append(event['JOBGLIDEIN_CMSSite'])
                nodes[node]['RecordedSite'] = True
            self.insertCpu(event, nodes[node])

        elif event_type == 'JobImageSizeEvent':
            # Resolve the node from this event; relying on the ``node`` left
            # over from a previous event would update the wrong record.
            node = node_map[event['Cluster'], event['Proc']]
            nodes[node]['ResidentSetSize'][-1] = int(event['ResidentSetSize'])
            if nodes[node]['StartTimes']:
                # Running estimate of the wall duration for the current attempt.
                nodes[node]['WallDurations'][-1] = eventtime - nodes[node]['StartTimes'][-1]
            self.insertCpu(event, nodes[node])

        elif event_type == "JobDisconnectedEvent" or event_type == "JobReconnectedEvent":
            # These events don't really affect the node status
            pass

        else:
            self.logger.warning("Unknown event type: %s", event_type)

    self.logger.debug("There were %d events in the job log.", count)

    # Pad WallDurations / SiteHistory so both lists end up with equal length.
    now = time.time()
    for node, info in nodes.items():
        last_start = now
        if info['StartTimes']:
            last_start = info['StartTimes'][-1]
        while len(info['WallDurations']) < len(info['SiteHistory']):
            info['WallDurations'].append(now - last_start)
        while len(info['WallDurations']) > len(info['SiteHistory']):
            info['SiteHistory'].append("Unknown")