    def extractOutputFilesFromMetadata(self, arcjobid):
        aj = self.dbarc.getArcJobInfo(arcjobid, columns=["JobID", "appjobid"])
        if not aj or 'JobID' not in aj or not aj['JobID']:
            self.log.error("failed to find arcjobid %s in database" % arcjobid)
            return {}

        jobid = aj['JobID']
        sessionid = jobid[jobid.rfind('/') + 1:]
        try:
            jobinfo = aCTPandaJob(filename=os.path.join(self.tmpdir, sessionid, 'heartbeat.json'))
            metadata = getattr(jobinfo, 'xml')  # travis doesn't like jobinfo.xml
        except Exception as x:
            self.log.error("%s: failed to extract metadata for arcjob %s: %s" % (aj['appjobid'], sessionid, x))
            return {}

        try:
            outputfiles = json.loads(metadata)
        except Exception as e:
            self.log.error("%s: failed to load output file info for arcjob %s: %s" % (aj['appjobid'], sessionid, str(e)))
            return {}

        surls = {}
        for attrs in outputfiles.values():
            try:
                size = attrs['fsize']
                adler32 = attrs['adler32']
                surl = attrs['surl']
                se = arc.URL(str(surl)).Host()
            except Exception as x:
                self.log.error('%s: %s' % (aj['appjobid'], x))
            else:
                checksum = "adler32:" + adler32
                if se not in surls:
                    surls[se] = []
                surls[se] += [{"surl": surl, "fsize": size, "checksum": checksum, "arcjobid": arcjobid}]

        return surls
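    # A minimal sketch of the data the method above assumes and produces (values are
    # illustrative only, not taken from a real job): the 'xml' attribute read from
    # heartbeat.json is expected to be a JSON object keyed by output file name, e.g.
    #   {"EVNT.01234._000001.pool.root.1": {"fsize": 1048576,
    #                                       "adler32": "0a1b2c3d",
    #                                       "surl": "srm://se.example.org/atlas/..."}}
    # and the returned dict groups those entries by storage element host:
    #   {"se.example.org": [{"surl": "srm://se.example.org/atlas/...", "fsize": 1048576,
    #                        "checksum": "adler32:0a1b2c3d", "arcjobid": 42}]}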
    def processFailed(self, arcjobs):
        """
        Process jobs for which the pilot failed (batch exit code is non-zero).
        """
        if not arcjobs:
            return

        self.log.info("processing %d failed jobs" % len(arcjobs))
        for aj in arcjobs:
            jobid = aj['JobID']
            if not jobid:
                # Job was not even submitted, there is no more information
                self.log.warning("%s: Job has not been submitted yet so no information to report", aj['appjobid'])
                continue

            sessionid = jobid[jobid.rfind('/') + 1:]
            date = aj['created'].strftime('%Y-%m-%d')
            outd = os.path.join(self.conf.get(['joblog', 'dir']), date, aj['siteName'])
            # Make sure the path to outd exists
            try:
                os.makedirs(outd, 0o755)
            except:
                pass

            # copy from tmp to outd. tmp dir will be cleaned in validator
            localdir = os.path.join(self.tmpdir, sessionid)
            gmlogerrors = os.path.join(localdir, "gmlog", "errors")
            arcjoblog = os.path.join(outd, "%s.log" % aj['appjobid'])
            if not os.path.exists(arcjoblog):
                try:
                    shutil.copy(gmlogerrors, arcjoblog)
                    os.chmod(arcjoblog, 0o644)
                except:
                    self.log.error("Failed to copy %s" % gmlogerrors)

            pilotlog = aj['stdout']
            if not pilotlog and os.path.exists(localdir):
                pilotlogs = [f for f in os.listdir(localdir)]
                for f in pilotlogs:
                    if '.log' in f:
                        pilotlog = f
            if pilotlog:
                try:
                    shutil.copy(os.path.join(localdir, pilotlog),
                                os.path.join(outd, '%s.out' % aj['appjobid']))
                    os.chmod(os.path.join(outd, '%s.out' % aj['appjobid']), 0o644)
                except Exception as e:
                    self.log.warning("%s: Failed to copy job output for %s: %s" % (aj['appjobid'], jobid, str(e)))

            try:
                smeta = json.loads(str(aj['metadata']))
            except:
                smeta = None

            # fill info for the final heartbeat
            pupdate = aCTPandaJob()
            pupdate.jobId = aj['appjobid']
            pupdate.state = 'failed'
            pupdate.siteName = aj['siteName']
            pupdate.computingElement = urlparse(aj['cluster']).hostname
            try:
                pupdate.schedulerID = smeta['schedulerid']
            except:
                pupdate.schedulerID = self.conf.get(['panda', 'schedulerid'])
            pupdate.pilotID = self.conf.get(["joblog", "urlprefix"]) + "/" + date + "/" + aj['siteName'] + '/' + aj['appjobid'] + ".out|Unknown|Unknown|Unknown|Unknown"
            if len(aj["ExecutionNode"]) > 255:
                pupdate.node = aj["ExecutionNode"][:254]
                self.log.warning("%s: Truncating wn hostname from %s to %s" % (aj['pandaid'], aj['ExecutionNode'], pupdate.node))
            else:
                pupdate.node = aj["ExecutionNode"]
            pupdate.pilotLog = self.createPilotLog(outd, aj['pandaid'])
            pupdate.cpuConsumptionTime = aj['UsedTotalCPUTime']
            pupdate.cpuConsumptionUnit = 'seconds'
            pupdate.cpuConversionFactor = 1
            pupdate.coreCount = aj['corecount'] or 1
            pupdate.pilotTiming = "0|0|%s|0" % aj['UsedTotalWallTime']
            pupdate.errorCode = 9000
            pupdate.errorDiag = aj['Error']
            # set start/endtime
            if aj['EndTime']:
                pupdate.startTime = self.getStartTime(aj['EndTime'], aj['UsedTotalWallTime']).strftime('%Y-%m-%d %H:%M:%S')
                pupdate.endTime = aj['EndTime'].strftime('%Y-%m-%d %H:%M:%S')
                # Sanity check for efficiency > 100%
                cputimepercore = pupdate.cpuConsumptionTime / pupdate.coreCount
                if aj['UsedTotalWallTime'] < cputimepercore:
                    self.log.warning('%s: Adjusting reported walltime %d to CPU time %d' % (aj['appjobid'], aj['UsedTotalWallTime'], cputimepercore))
                    pupdate.startTime = (aj['EndTime'] - datetime.timedelta(0, cputimepercore)).strftime('%Y-%m-%d %H:%M:%S')
            else:
                # Set walltime to cputime per core
                pupdate.startTime = self.getStartTime(datetime.datetime.utcnow(), aj['UsedTotalCPUTime'] / pupdate.coreCount).strftime('%Y-%m-%d %H:%M:%S')
                pupdate.endTime = datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')

            # save the heartbeat file to be used by aCTAutopilot panda update
            try:
                if smeta and smeta.get('harvesteraccesspoint'):
                    hbfile = os.path.join(smeta['harvesteraccesspoint'], 'jobReport.json')
                else:
                    hbfile = os.path.join(self.tmpdir, "heartbeats", str(aj['pandaid']) + ".json")
                pupdate.writeToFile(hbfile)
            except Exception as e:
                self.log.warning("%s: Failed to write file %s: %s" % (aj['appjobid'], hbfile, str(e)))
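    # Note on getStartTime used above: it is defined elsewhere in this class and is
    # not shown here. As an assumption (a sketch only, not the actual implementation),
    # it is expected to subtract the consumed walltime from the given end time:
    #   def getStartTime(self, endtime, walltime):
    #       return endtime - datetime.timedelta(seconds=walltime)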
    def copyFinishedFiles(self, arcjobid, extractmetadata):
        """
        - if extractmetadata: (normal arc jobs, not true pilot jobs)
           - store heartbeat file under tmp/heartbeats or under harvester access
             point if specified
        - copy .job.log file to jobs/date/pandaqueue/pandaid.out
        - copy gmlog errors to jobs/date/pandaqueue/pandaid.log
        """
        columns = ['JobID', 'appjobid', 'cluster', 'UsedTotalWallTime', 'arcjobs.EndTime',
                   'ExecutionNode', 'stdout', 'fairshare', 'pandajobs.created', 'metadata']
        select = "arcjobs.id=%d AND arcjobs.id=pandajobs.arcjobid" % arcjobid
        aj = self.dbarc.getArcJobsInfo(select, columns=columns, tables='arcjobs,pandajobs')
        if not aj or 'JobID' not in aj[0] or not aj[0]['JobID']:
            self.log.error('No JobID in arcjob %s: %s' % (str(arcjobid), str(aj)))
            return False
        aj = aj[0]
        jobid = aj['JobID']
        sessionid = jobid[jobid.rfind('/') + 1:]
        date = aj['created'].strftime('%Y-%m-%d')

        if extractmetadata:
            try:
                jobinfo = aCTPandaJob(filename=os.path.join(self.tmpdir, sessionid, 'heartbeat.json'))
            except Exception as x:
                self.log.error("%s: failed to load heartbeat file for arcjob %s: %s" % (aj['appjobid'], jobid, x))
                jobinfo = aCTPandaJob(jobinfo={'jobId': aj['appjobid'], 'state': 'finished'})

            # update heartbeat and dump to tmp/heartbeats
            jobinfo.computingElement = arc.URL(str(aj['cluster'])).Host()
            if hasattr(jobinfo, 'startTime') and hasattr(jobinfo, 'endTime'):
                # take values from the pilot
                jobinfo.startTime = datetime.datetime.utcfromtimestamp(jobinfo.startTime).strftime('%Y-%m-%d %H:%M:%S')
                jobinfo.endTime = datetime.datetime.utcfromtimestamp(jobinfo.endTime).strftime('%Y-%m-%d %H:%M:%S')
            else:
                # Use ARC values
                if aj['EndTime']:
                    # datetime cannot be serialised to json so use string (for harvester)
                    jobinfo.startTime = (aj['EndTime'] - datetime.timedelta(0, aj['UsedTotalWallTime'])).strftime('%Y-%m-%d %H:%M:%S')
                    jobinfo.endTime = aj['EndTime'].strftime('%Y-%m-%d %H:%M:%S')
                    # Sanity check for efficiency > 100%
                    cputimepercore = getattr(jobinfo, 'cpuConsumptionTime', 0) / getattr(jobinfo, 'coreCount', 1)
                    if aj['UsedTotalWallTime'] < cputimepercore:
                        self.log.warning('%s: Adjusting reported walltime %d to CPU time %d' % (aj['appjobid'], aj['UsedTotalWallTime'], cputimepercore))
                        jobinfo.startTime = (aj['EndTime'] - datetime.timedelta(0, cputimepercore)).strftime('%Y-%m-%d %H:%M:%S')
                else:
                    self.log.warning('%s: no endtime found' % aj['appjobid'])

            if len(aj["ExecutionNode"]) > 255:
                jobinfo.node = aj["ExecutionNode"][:254]
                self.log.warning("%s: Truncating wn hostname from %s to %s" % (aj['appjobid'], aj['ExecutionNode'], jobinfo.node))
            else:
                jobinfo.node = aj["ExecutionNode"]

            try:
                smeta = json.loads(aj['metadata'].decode())
            except:
                smeta = None

            if smeta and smeta.get('harvesteraccesspoint'):
                # de-serialise the metadata to json
                try:
                    jobinfo.metaData = json.loads(jobinfo.metaData)
                except Exception as e:
                    self.log.warning("%s: no metaData in pilot metadata: %s" % (aj['appjobid'], str(e)))
                jobinfo.writeToFile(os.path.join(smeta['harvesteraccesspoint'], 'jobReport.json'))
            else:
                jobinfo.writeToFile(os.path.join(self.tmpdir, "heartbeats", "%s.json" % aj['appjobid']))

        # copy to joblog dir files downloaded for the job: gmlog errors and pilot log
        outd = os.path.join(self.conf.get(['joblog', 'dir']), date, aj['fairshare'])
        try:
            os.makedirs(outd, 0o755)
        except:
            pass

        localdir = os.path.join(self.tmpdir, sessionid)
        gmlogerrors = os.path.join(localdir, "gmlog", "errors")
        arcjoblog = os.path.join(outd, "%s.log" % aj['appjobid'])
        if not os.path.exists(arcjoblog):
            try:
                shutil.move(gmlogerrors, arcjoblog)
                os.chmod(arcjoblog, 0o644)
            except:
                self.log.error("Failed to copy %s" % gmlogerrors)

        pilotlog = aj['stdout']
        if not pilotlog and os.path.exists(localdir):
            pilotlogs = [f for f in os.listdir(localdir)]
            for f in pilotlogs:
                if '.log' in f:
                    pilotlog = f
        if pilotlog:
            try:
                shutil.move(os.path.join(localdir, pilotlog),
                            os.path.join(outd, '%s.out' % aj['appjobid']))
                os.chmod(os.path.join(outd, '%s.out' % aj['appjobid']), 0o644)
            except Exception as e:
                self.log.error("Failed to copy file %s: %s" % (os.path.join(localdir, pilotlog), str(e)))
                return False

        return True
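    # For reference, derived from the code above (actual paths depend on the joblog
    # and tmp configuration): a finished job ends up with
    #   <joblog dir>/<YYYY-MM-DD>/<fairshare>/<appjobid>.log   (gmlog errors)
    #   <joblog dir>/<YYYY-MM-DD>/<fairshare>/<appjobid>.out   (pilot log)
    # plus a heartbeat JSON either in <tmpdir>/heartbeats/<appjobid>.json or, in
    # harvester mode, as jobReport.json under the harvester access point.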
class aCTValidator(aCTATLASProcess):
    '''
    Validate output files for finished jobs, cleanup output files for failed jobs.
    '''

    def __init__(self):
        aCTATLASProcess.__init__(self)

        # Use production role proxy for checking and removing files
        # Get DN from configured proxy file
        cred_type = arc.initializeCredentialsType(arc.initializeCredentialsType.SkipCredentials)
        uc = arc.UserConfig(cred_type)
        uc.ProxyPath(str(self.arcconf.get(['voms', 'proxypath'])))
        cred = arc.Credential(uc)
        dn = cred.GetIdentityName()

        actp = aCTProxy(self.log)
        # Beware hard-coded production role
        proxyfile = actp.path(dn, '/atlas/Role=production')
        if not proxyfile:
            raise Exception('Could not find proxy with production role in proxy table')
        self.log.info('set proxy path to %s' % proxyfile)

        self.uc = arc.UserConfig(cred_type)
        self.uc.ProxyPath(str(proxyfile))
        self.uc.UtilsDirPath(arc.UserConfig.ARCUSERDIRECTORY)

        # Possible file status
        self.ok = 0
        self.retry = 1
        self.failed = 2

    def _extractFromSmallFiles(self, aj, filename):
        jobid = aj['JobID']
        sessionid = jobid[jobid.rfind('/'):]
        localdir = str(self.arcconf.get(['tmp', 'dir'])) + sessionid
        smallfiles = tarfile.open(os.path.join(localdir, 'jobSmallFiles.tgz'))
        return smallfiles.extractfile(filename)

    def copyFinishedFiles(self, arcjobid, extractmetadata):
        """
        - if extractmetadata: (normal arc jobs, not true pilot jobs)
           - extract panda_node_struct.pickle from jobSmallFiles.tgz and store it
             under tmp/pickle
           - extract metadata-surl.xml and update the pickle. Store xml under tmp/xml
        - copy .job.log file to jobs/date/cluster/jobid
        - copy gmlog dir to jobs/date/cluster/jobid
        """
        columns = ['JobID', 'appjobid', 'cluster', 'UsedTotalWallTime', 'EndTime', 'ExecutionNode', 'stdout']
        aj = self.dbarc.getArcJobInfo(arcjobid, columns=columns)
        if not aj or 'JobID' not in aj or not aj['JobID']:
            self.log.error('No JobID in arcjob %s: %s' % (str(arcjobid), str(aj)))
            return False
        jobid = aj['JobID']
        sessionid = jobid[jobid.rfind('/') + 1:]
        date = time.strftime('%Y%m%d')
        cluster = arc.URL(str(jobid)).Host()

        if extractmetadata:
            try:
                pandapickle = self._extractFromSmallFiles(aj, "panda_node_struct.pickle")
            except Exception as x:
                self.log.error("%s: failed to extract pickle for arcjob %s: %s" % (aj['appjobid'], sessionid, x))
                pandapickle = None
            try:
                metadata = self._extractFromSmallFiles(aj, "metadata-surl.xml")
            except Exception as x:
                self.log.error("%s: failed to extract metadata-surl.xml for arcjob %s: %s" % (aj['appjobid'], sessionid, x))
                metadata = None

            # update pickle and dump to tmp/pickle
            if pandapickle:
                try:
                    jobinfo = aCTPandaJob(filehandle=pandapickle)
                except:
                    jobinfo = aCTPandaJob(jobinfo={'jobId': aj['appjobid'], 'state': 'finished'})
            else:
                jobinfo = aCTPandaJob(jobinfo={'jobId': aj['appjobid'], 'state': 'finished'})
            if metadata:
                jobinfo.xml = str(metadata.read())
            jobinfo.computingElement = cluster
            jobinfo.schedulerID = self.conf.get(['panda', 'schedulerid'])
            if aj['EndTime']:
                jobinfo.startTime = aj['EndTime'] - datetime.timedelta(0, aj['UsedTotalWallTime'])
                jobinfo.endTime = aj['EndTime']
            else:
                self.log.warning('%s: no endtime found' % aj['appjobid'])
            if len(aj["ExecutionNode"]) > 255:
                jobinfo.node = aj["ExecutionNode"][:254]
                self.log.warning("%s: Truncating wn hostname from %s to %s" % (aj['appjobid'], aj['ExecutionNode'], jobinfo.node))
            else:
                jobinfo.node = aj["ExecutionNode"]

            # Add url of logs
            if 'pilotID' in jobinfo.dictionary().keys() and jobinfo.pilotID:
                t = jobinfo.pilotID.split("|")
            else:
                t = ['Unknown'] * 5
            logurl = os.path.join(self.conf.get(["joblog", "urlprefix"]), date, cluster, sessionid)
            try:  # TODO catch and handle non-ascii
                jobinfo.pilotID = '|'.join([logurl] + t[1:])
            except:
                pass

            jobinfo.writeToFile(self.arcconf.get(['tmp', 'dir']) + "/pickle/" + aj['appjobid'] + ".pickle")
    def updateEvents(self, jobs):
        """
        Handle event service updates for finished jobs
        TOFIX for pilot2
        """
        tlist = []
        for j in jobs:
            eventrangestoupdate = []
            if j['actpandastatus'] == 'finished' \
               and 'plugin=arc' in self.sites[j['siteName']]['catchall'] \
               and re.search('eventService=True', j['pandajob']):

                # Check if we are running in harvester mode
                try:
                    smeta = json.loads(str(j['metadata']))
                    harvesteraccesspoint = smeta.get('harvesteraccesspoint')
                except:
                    harvesteraccesspoint = None

                if not harvesteraccesspoint and j['sendhb'] == 0:
                    continue

                if not j['eventranges'] or j['eventranges'] == '[]':
                    fname = os.path.join(self.tmpdir, "pickle", "%d.pickle" % j['pandaid'])
                    if not os.path.exists(fname):
                        # Jobs which were never submitted should have substatus pilot_noevents so they go to closed
                        # Assume only ARC sites (not condor) run NG-mode ES
                        if j['arcjobid'] == -1 or j['arcjobid'] is None:
                            substatus = 'pilot_noevents'
                            self.log.info('%s: Job did not run and has no eventranges to update, marking pilot_noevents' % j['pandaid'])
                        # Jobs which ran but produced no events have pilot_failed so they go to failed
                        else:
                            substatus = 'pilot_failed'
                            self.log.info('%s: Job ran but has no eventranges to update, marking failed' % j['pandaid'])
                        jobinfo = aCTPandaJob({'jobId': j['pandaid'], 'state': 'closed', 'jobSubStatus': substatus})
                        # Create the empty pickle so that heartbeat code below doesn't fail
                        if harvesteraccesspoint:
                            jobinfo.writeToFile(os.path.join(harvesteraccesspoint, 'jobReport.json'))
                        else:
                            jobinfo.writeToFile(fname)
                    continue

                # If zip is used we need to first send transferring heartbeat
                # with jobMetrics containing the zip file
                # In harvester mode harvester does this itself?
                if 'es_to_zip' in self.sites[j['siteName']]['catchall'] and not harvesteraccesspoint:
                    try:
                        # Load pickled information from pilot
                        fname = os.path.join(self.tmpdir, "pickle", "%d.pickle" % j['pandaid'])
                        jobinfo = aCTPandaJob(filename=fname)
                        jobmetrics = {'jobMetrics': getattr(jobinfo, 'jobMetrics', '')}
                        self.log.info('%s: Sending jobMetrics and transferring state: %s' % (j['pandaid'], jobmetrics))
                    except Exception as x:
                        self.log.error('%s: No pickle info found: %s' % (j['pandaid'], x))
                    else:
                        t = PandaThr(self.getPanda(j['siteName']).updateStatus, j['pandaid'], 'transferring', jobmetrics)
                        aCTUtils.RunThreadsSplit([t], self.nthreads)
                        # If update fails panda won't see the zip and events
                        # will be rescheduled to another job
                        if t.result is None or 'StatusCode' not in t.result:
                            # Strange response from panda
                            continue
                        if t.result['StatusCode'][0] == '60':
                            self.log.error('Failed to contact Panda, proxy may have expired')
                        elif t.result['StatusCode'][0] == '30':
                            self.log.error('Job was already killed')

                eventranges = j['eventranges']
                eventrangeslist = json.loads(eventranges)

                # Get object store ID used
                try:
                    objstoreID = self.sites[j['siteName']]['ddmoses']
                except:
                    self.log.warning('No ES object store defined for %s' % j['siteName'])
                    objstoreID = None

                for eventrange in eventrangeslist:
                    node = {}
                    node['eventRangeID'] = eventrange['eventRangeID']
                    try:
                        node['eventStatus'] = eventrange['status']
                    except:
                        node['eventStatus'] = j['actpandastatus']
                    node['objstoreID'] = objstoreID
                    eventrangestoupdate.append(node)

                self.log.info('%s: updating %i event ranges: %s' % (j['pandaid'], len(eventrangestoupdate), eventrangestoupdate))
                if harvesteraccesspoint:
                    self.log.info('%s: Dumping processed event ranges to %s' % (j['pandaid'], os.path.join(harvesteraccesspoint, 'worker_updateevents.json')))
                    harvesterdict = {j['pandaid']: eventrangestoupdate}
                    with open(os.path.join(harvesteraccesspoint, 'worker_updateevents.json'), 'w') as f:
                        json.dump(harvesterdict, f)
                else:
                    updatenode = {'eventRanges': json.dumps(eventrangestoupdate)}
                    t = PandaEventsThr(self.getPanda(j['siteName']).updateEventRanges, j['pandaid'], updatenode)
                    tlist.append(t)

        aCTUtils.RunThreadsSplit(tlist, self.nthreads)
        for t in tlist:
            # If update fails events will be rescheduled to another job
            if t.result is None or 'StatusCode' not in t.result:
                # Strange response from panda
                continue
            if t.result['StatusCode'][0] == '60':
                self.log.error('Failed to contact Panda, proxy may have expired')
            elif t.result['StatusCode'][0] == '30':
                self.log.warning('%s: Job was already killed' % j['pandaid'])
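    # Illustrative sketch of the event range update payload built above (example
    # values are made up): each node appended to eventrangestoupdate looks like
    #   {"eventRangeID": "1234567-1-2-3-4", "eventStatus": "finished", "objstoreID": 123}
    # and in harvester mode worker_updateevents.json contains these nodes keyed by
    # pandaid, e.g. {1234567: [ ...nodes... ]}.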
    def updatePandaFinishedPilot(self):
        """
        Final status update for completed jobs (finished or failed in athena)
        and cancelled jobs
        """
        jobs = self.dbpanda.getJobs("actpandastatus='finished' or actpandastatus='failed' or actpandastatus='cancelled' limit 1000")
        if not jobs:
            return

        self.log.info("Updating panda for %d finished jobs (%s)" % (len(jobs), ','.join([str(j['pandaid']) for j in jobs])))

        self.updateEvents(jobs)

        tlist = []
        for j in jobs:

            # If true pilot skip heartbeat and just update DB
            if not j['sendhb']:
                jd = {}
                jd['pandastatus'] = None
                jd['actpandastatus'] = 'done'
                if j['actpandastatus'] == 'failed':
                    jd['actpandastatus'] = 'donefailed'
                if j['actpandastatus'] == 'cancelled':
                    jd['actpandastatus'] = 'donecancelled'
                if not j['startTime']:
                    jd['startTime'] = datetime.datetime.utcnow()
                if not j['endTime']:
                    jd['endTime'] = datetime.datetime.utcnow()
                self.dbpanda.updateJob(j['pandaid'], jd)
                continue

            # Cancelled jobs have no heartbeat info
            if j['actpandastatus'] == 'cancelled':
                jobinfo = aCTPandaJob(jobinfo={'jobId': j['pandaid'], 'state': 'failed'})
                jobinfo.pilotErrorCode = 1144
                jobinfo.pilotErrorDiag = "This job was killed by panda server"
                jobinfo.startTime = j['startTime'] if j['startTime'] else datetime.datetime.utcnow()
                jobinfo.endTime = j['endTime'] if j['endTime'] else datetime.datetime.utcnow()
            else:
                try:
                    # Load heartbeat information from pilot
                    fname = os.path.join(self.tmpdir, "heartbeats", "%d.json" % j['pandaid'])
                    jobinfo = aCTPandaJob(filename=fname)
                except Exception as x:
                    self.log.error('%s: %s' % (j['pandaid'], x))
                    # Send some basic info back to panda
                    info = {'jobId': j['pandaid'], 'state': j['pandastatus']}
                    jobinfo = aCTPandaJob(jobinfo=info)
                    jobinfo.errorCode = 9000
                    jobinfo.errorDiag = 'Job failed for unknown reason'
                else:
                    os.remove(fname)

            self.log.debug('%s: final heartbeat: %s' % (j['pandaid'], jobinfo.dictionary()))
            t = PandaThr(self.getPanda(j['siteName']).updateStatus, j['pandaid'], j['pandastatus'], jobinfo.dictionary())
            tlist.append(t)

        aCTUtils.RunThreadsSplit(tlist, self.nthreads)

        for t in tlist:
            if t.result is None:
                continue
            if 'StatusCode' in t.result and t.result['StatusCode'] and t.result['StatusCode'][0] != '0':
                self.log.error('Error updating panda')
                continue
            jd = {}
            jd['pandastatus'] = None
            jd['actpandastatus'] = 'done'
            if t.status == 'failed':
                jd['actpandastatus'] = 'donefailed'
            if 'pilotErrorCode' in t.args and t.args['pilotErrorCode'] == 1144:
                jd['actpandastatus'] = 'donecancelled'
            jd['theartbeat'] = self.dbpanda.getTimeStamp()
            self.dbpanda.updateJob(t.id, jd)
            # Send done message to APFMon
            self.apfmon.updateJob(t.id, 'done' if jd['actpandastatus'] == 'done' else 'fault')

        self.log.info("Threads finished")

        # Clean inputfiles, pickle and eventranges
        for j in jobs:
            pandaid = j['pandaid']
            pandainputdir = os.path.join(self.tmpdir, 'inputfiles', str(pandaid))
            picklefile = os.path.join(self.tmpdir, 'pickle', str(pandaid) + ".pickle")
            eventrangesfile = os.path.join(self.tmpdir, 'eventranges', str(pandaid) + ".json")
            shutil.rmtree(pandainputdir, ignore_errors=True)
            # remove pickle
            if os.path.exists(picklefile):
                os.unlink(picklefile)
            # remove eventrangesfile
            if os.path.exists(eventrangesfile):
                os.unlink(eventrangesfile)